diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c 2004-03-11 03:55:20.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/block/as-iosched.c 2004-03-19 16:10:26.794665823 +0100 @@ -1498,20 +1498,13 @@ as_insert_request(request_queue_t *q, st struct as_data *ad = q->elevator.elevator_data; struct as_rq *arq = RQ_DATA(rq); - if (arq) { - if (arq->state != AS_RQ_PRESCHED) { - printk("arq->state: %d\n", arq->state); - WARN_ON(1); - } + if (arq) arq->state = AS_RQ_NEW; - } /* barriers must flush the reorder queue */ if (unlikely(rq->flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER) - && where == ELEVATOR_INSERT_SORT)) { - WARN_ON(1); + && where == ELEVATOR_INSERT_SORT)) where = ELEVATOR_INSERT_BACK; - } switch (where) { case ELEVATOR_INSERT_BACK: @@ -1526,6 +1519,8 @@ as_insert_request(request_queue_t *q, st break; case ELEVATOR_INSERT_FRONT: list_add(&rq->queuelist, ad->dispatch); + if (blk_fs_request(rq)) + ad->nr_dispatched++; as_antic_stop(ad); break; case ELEVATOR_INSERT_SORT: diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c 2004-03-19 15:13:57.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/block/ll_rw_blk.c 2004-03-19 16:10:26.794665823 +0100 @@ -255,6 +255,28 @@ void blk_queue_make_request(request_queu EXPORT_SYMBOL(blk_queue_make_request); /** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @flag: see below + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + **/ +void blk_queue_ordered(request_queue_t *q, int flag) +{ + if (flag) + set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + else + clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @dma_addr: bus address limit @@ -1895,6 +1917,43 @@ int blk_execute_rq(request_queue_t *q, s EXPORT_SYMBOL(blk_execute_rq); +/* + * the idea here is to insert a SYNC_CACHE scsi command, and let lower layers + * transform it if they have to. 
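(SYNC_CACHE is the SCSI SYNCHRONIZE CACHE command, opcode 0x35; that is what + * rq->cmd[0] is set to below, and what ide_transform_pc_req() in ide-io.c + * turns into an ATA cache flush.)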
two possible ways to fix this to work on + * dm/md: turns the last part of this into a queue ->issue_flush_fn() so + * drivers can implement, or + */ +int blkdev_issue_flush(struct block_device *bdev) +{ + request_queue_t *q; + struct request *rq; + int ret; + + if (bdev->bd_disk == NULL) + return -ENXIO; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + if (!q->request_fn) + return -EOPNOTSUPP; + + rq = blk_get_request(q, WRITE, __GFP_WAIT); + + memset(rq->cmd, 0, sizeof(rq->cmd)); + rq->cmd[0] = 0x35; + rq->flags |= REQ_BLOCK_PC | REQ_SOFTBARRIER; + rq->data = NULL; + rq->cmd_len = 12; + rq->timeout = 60 * HZ; + + ret = blk_execute_rq(q, bdev->bd_disk, rq); + blk_put_request(rq); + return ret; +} + +EXPORT_SYMBOL(blkdev_issue_flush); + void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) { int rw = rq_data_dir(rq); @@ -1973,6 +2032,8 @@ void __blk_put_request(request_queue_t * if (unlikely(!q)) return; + + WARN_ON(!req->ref_count); if (unlikely(--req->ref_count)) return; @@ -2148,7 +2209,7 @@ EXPORT_SYMBOL(__blk_attempt_remerge); static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra; + int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err; sector_t sector; sector = bio->bi_sector; @@ -2166,9 +2227,11 @@ static int __make_request(request_queue_ spin_lock_prefetch(q->queue_lock); - barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw); - - ra = bio->bi_rw & (1 << BIO_RW_AHEAD); + barrier = bio_barrier(bio); + if (barrier && !(q->queue_flags & (1 << QUEUE_FLAG_ORDERED))) { + err = -EOPNOTSUPP; + goto end_io; + } again: spin_lock_irq(q->queue_lock); @@ -2248,7 +2311,8 @@ get_rq: /* * READA bit set */ - if (ra) + err = -EWOULDBLOCK; + if (bio_rw_ahead(bio)) goto end_io; freereq = get_request_wait(q, rw); @@ -2259,10 +2323,9 @@ get_rq: req->flags |= REQ_CMD; /* - * inherit FAILFAST from bio and don't stack up - * retries for read ahead + * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) */ - if (ra || test_bit(BIO_RW_FAILFAST, &bio->bi_rw)) + if (bio_rw_ahead(bio) || bio_failfast(bio)) req->flags |= REQ_FAILFAST; /* @@ -2300,7 +2363,7 @@ out: return 0; end_io: - bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK); + bio_endio(bio, nr_sectors << 9, err); return 0; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c 2004-03-19 15:13:51.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/ide/ide-disk.c 2004-03-19 16:10:26.987645047 +0100 @@ -1361,6 +1361,7 @@ static int set_nowerr(ide_drive_t *drive static int write_cache (ide_drive_t *drive, int arg) { ide_task_t args; + int err; if (!(drive->id->cfs_enable_2 & 0x3000)) return 1; @@ -1371,7 +1372,10 @@ static int write_cache (ide_drive_t *dri args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; args.command_type = IDE_DRIVE_TASK_NO_DATA; args.handler = &task_no_data_intr; - (void) ide_raw_taskfile(drive, &args, NULL); + + err = ide_raw_taskfile(drive, &args, NULL); + if (err) + return err; drive->wcache = arg; return 0; @@ -1680,6 +1684,8 @@ static void idedisk_setup (ide_drive_t * if (drive->id->cfs_enable_2 & 0x3000) write_cache(drive, (id->cfs_enable_2 & 0x3000)); + blk_queue_ordered(drive->queue, 1); + #ifdef CONFIG_BLK_DEV_IDE_TCQ_DEFAULT if (drive->using_dma) __ide_dma_queued_on(drive); @@ -1728,10 +1734,14 @@ static ide_driver_t idedisk_driver = { static int 
idedisk_open(struct inode *inode, struct file *filp) { ide_drive_t *drive = inode->i_bdev->bd_disk->private_data; + u8 cf; + drive->usage++; - if (drive->removable && drive->usage == 1) { + if (drive->usage != 1) + return 0; + + if (drive->removable) { ide_task_t args; - u8 cf; memset(&args, 0, sizeof(ide_task_t)); args.tfRegister[IDE_COMMAND_OFFSET] = WIN_DOORLOCK; args.command_type = IDE_DRIVE_TASK_NO_DATA; @@ -1744,18 +1754,19 @@ static int idedisk_open(struct inode *in */ if (drive->doorlocking && ide_raw_taskfile(drive, &args, NULL)) drive->doorlocking = 0; - drive->wcache = 0; - /* Cache enabled ? */ - if (drive->id->csfo & 1) - drive->wcache = 1; - /* Cache command set available ? */ - if (drive->id->cfs_enable_1 & (1<<5)) - drive->wcache = 1; - /* ATA6 cache extended commands */ - cf = drive->id->command_set_2 >> 24; - if((cf & 0xC0) == 0x40 && (cf & 0x30) != 0) - drive->wcache = 1; } + + drive->wcache = 0; + /* Cache enabled ? */ + if (drive->id->csfo & 1) + drive->wcache = 1; + /* Cache command set available ? */ + if (drive->id->cfs_enable_1 & (1<<5)) + drive->wcache = 1; + /* ATA6 cache extended commands */ + cf = drive->id->command_set_2 >> 24; + if((cf & 0xC0) == 0x40 && (cf & 0x30) != 0) + drive->wcache = 1; return 0; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c --- /opt/kernel/linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c 2004-03-19 15:13:51.000000000 +0100 +++ linux-2.6.5-rc1-mm2/drivers/ide/ide-io.c 2004-03-19 16:10:26.990644724 +0100 @@ -54,30 +54,84 @@ #include #include -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. 
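+/* + * build a cache flush taskfile in @rq: WIN_FLUSH_CACHE_EXT when the drive + * advertises the extended flush (id->cfs_enable_2 & 0x2400), plain + * WIN_FLUSH_CACHE otherwise + */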
+static void ide_fill_flush_cmd(ide_drive_t *drive, struct request *rq) +{ + memset(drive->special_buf, 0, sizeof(drive->special_buf)); + + rq->flags &= ~REQ_BLOCK_PC; + rq->flags |= REQ_DRIVE_TASK | REQ_STARTED; + rq->buffer = drive->special_buf; + rq->buffer[0] = WIN_FLUSH_CACHE; + + if (drive->id->cfs_enable_2 & 0x2400) + rq->buffer[0] = WIN_FLUSH_CACHE_EXT; +} + +static int ide_transform_pc_req(ide_drive_t *drive, struct request *rq) +{ + if (rq->cmd[0] != 0x35) { + ide_end_request(drive, 0, 0); + return 1; + } + + if (!drive->wcache) { + ide_end_request(drive, 1, 0); + return 1; + } + + ide_fill_flush_cmd(drive, rq); + return 0; +} + +/* + * preempt pending requests, and store this cache flush for immediate + * execution */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +static struct request *ide_queue_flush_cmd(ide_drive_t *drive, + struct request *rq, int post) { - struct request *rq; - unsigned long flags; - int ret = 1; + struct request *flush_rq = &HWGROUP(drive)->wrq; - spin_lock_irqsave(&ide_lock, flags); - rq = HWGROUP(drive)->rq; + /* + * write cache disabled, just return barrier write immediately + */ + if (!drive->wcache) + return rq; - BUG_ON(!(rq->flags & REQ_STARTED)); + /* + * if last rq issued was the post-flush, we can skip the pre-flush + */ +#if 0 + if (drive->last_rq_flush) { + rq->flags |= REQ_BAR_PREFLUSH; + return rq; + } +#endif - if (!nr_sectors) - nr_sectors = rq->hard_cur_sectors; + ide_init_drive_cmd(flush_rq); + ide_fill_flush_cmd(drive, flush_rq); + + flush_rq->special = rq; + flush_rq->nr_sectors = rq->nr_sectors; + + if (!post) { + drive->doing_barrier = 1; + flush_rq->flags |= REQ_BAR_PREFLUSH; + blkdev_dequeue_request(rq); + } else + flush_rq->flags |= REQ_BAR_POSTFLUSH; + + __elv_add_request(drive->queue, flush_rq, ELEVATOR_INSERT_FRONT, 0); + HWGROUP(drive)->rq = NULL; + return flush_rq; +} + +static int __ide_end_request(ide_drive_t *drive, struct request *rq, + int uptodate, int nr_sectors) +{ + int ret = 1; + + BUG_ON(!(rq->flags & REQ_STARTED)); /* * if failfast is set on a request, override number of sectors and @@ -86,6 +140,9 @@ int ide_end_request (ide_drive_t *drive, if (blk_noretry_request(rq) && !uptodate) nr_sectors = rq->hard_nr_sectors; + if (!blk_fs_request(rq) && !uptodate && !rq->errors) + rq->errors = -EIO; + /* * decide whether to reenable DMA -- 3 is a random magic for now, * if we DMA timeout more than 3 times, just stay in PIO @@ -97,14 +154,54 @@ int ide_end_request (ide_drive_t *drive, if (!end_that_request_first(rq, uptodate, nr_sectors)) { add_disk_randomness(rq->rq_disk); - if (!blk_rq_tagged(rq)) - blkdev_dequeue_request(rq); - else + + if (blk_rq_tagged(rq)) blk_queue_end_tag(drive->queue, rq); - HWGROUP(drive)->rq = NULL; + + blkdev_dequeue_request(rq); end_that_request_last(rq); + HWGROUP(drive)->rq = NULL; ret = 0; } + + return ret; +} + +/** + * ide_end_request - complete an IDE I/O + * @drive: IDE device for the I/O + * @uptodate: + * @nr_sectors: number of sectors completed + * + * This is our end_request wrapper function. We complete the I/O + * update random number input and dequeue the request, which if + * it was tagged may be out of order. 
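(@uptodate follows the usual convention here: non-zero means the request + * completed successfully, zero means it failed - compare ide_transform_pc_req() + * above, which ends the flush request with 1 or 0.)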
+ */ + +int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +{ + struct request *rq; + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&ide_lock, flags); + rq = HWGROUP(drive)->rq; + + if (!nr_sectors) + nr_sectors = rq->hard_cur_sectors; + + if (!blk_barrier_rq(rq)) + ret = __ide_end_request(drive, rq, uptodate, nr_sectors); + else { + struct request *flush_rq = &HWGROUP(drive)->wrq; + + flush_rq->nr_sectors -= nr_sectors; + if (!flush_rq->nr_sectors) { + ide_queue_flush_cmd(drive, rq, 1); + ret = 0; + } + } + spin_unlock_irqrestore(&ide_lock, flags); return ret; } @@ -140,6 +237,97 @@ static void ide_complete_pm_request (ide spin_unlock_irqrestore(&ide_lock, flags); } +/* + * FIXME: probably move this somewhere else, name is bad too :) + */ +static sector_t ide_get_error_location(ide_drive_t *drive, char *args) +{ + u32 high, low; + u8 hcyl, lcyl, sect; + sector_t sector; + + high = 0; + hcyl = args[5]; + lcyl = args[4]; + sect = args[3]; + + if (drive->id->cfs_enable_2 & 0x2400) { + low = (hcyl << 16) | (lcyl << 8) | sect; + HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG); + high = ide_read_24(drive); + } else { + u8 cur = HWIF(drive)->INB(IDE_SELECT_REG); + if (cur & 0x40) + low = (hcyl << 16) | (lcyl << 8) | sect; + else { + low = hcyl * drive->head * drive->sect; + low += lcyl * drive->sect; + low += sect - 1; + } + } + + sector = ((sector_t) high << 24) | low; + return sector; +} + +static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, + int error) +{ + struct request *real_rq = rq->special; + int good_sectors, bad_sectors; + sector_t sector; + + if (!error) { + if (blk_barrier_postflush(rq)) { + /* + * this completes the barrier write + */ + __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); + drive->doing_barrier = 0; + drive->last_rq_flush = 1; + } else { + /* + * just indicate that we did the pre flush + */ + real_rq->flags |= REQ_BAR_PREFLUSH; + __elv_add_request(drive->queue, real_rq, ELEVATOR_INSERT_FRONT, 0); + } + +#ifdef IDE_DUMP_FLUSH_TIMINGS + printk("%s: %sflush took %lu jiffies\n", drive->name, blk_barrier_postflush(rq) ? "post" : "pre", jiffies - rq->timeout); +#endif + + /* + * all is fine, return + */ + return; + } + + /* + * bummer, flush failed. if it was the pre-flush, fail the barrier. 
+ * if it was the post-flush, complete the successful part of the request + * and fail the rest + */ + good_sectors = 0; + if (blk_barrier_postflush(rq)) { + sector = ide_get_error_location(drive, rq->buffer); + + if ((sector >= real_rq->hard_sector) && + (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) + good_sectors = sector - real_rq->hard_sector; + } else + sector = real_rq->hard_sector; + + bad_sectors = real_rq->hard_nr_sectors - good_sectors; + if (good_sectors) + __ide_end_request(drive, real_rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, real_rq, 0, bad_sectors); + + printk(KERN_ERR "%s: failed barrier write: sector=%Lx(good=%d/bad=%d)\n", drive->name, sector, good_sectors, bad_sectors); + blk_queue_ordered(drive->queue, 0); +} + /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -229,6 +417,10 @@ void ide_end_drive_cmd (ide_drive_t *dri spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); + + if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) + ide_complete_barrier(drive, rq, err); + HWGROUP(drive)->rq = NULL; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); @@ -610,6 +802,16 @@ ide_startstop_t start_request (ide_drive if (drive->suspend_reset) goto kill_rq; + /* + * basic transformation support for scsi -> ata commands + */ + if (blk_pc_request(rq)) { + if (drive->media != ide_disk) + goto kill_rq; + if (ide_transform_pc_req(drive, rq)) + return ide_stopped; + } + block = rq->sector; if (blk_fs_request(rq) && (drive->media == ide_disk || drive->media == ide_floppy)) { @@ -715,6 +917,15 @@ static inline ide_drive_t *choose_drive repeat: best = NULL; drive = hwgroup->drive; + + /* + * drive is doing pre-flush, ordered write, post-flush sequence. even + * though that is 3 requests, it must be seen as a single transaction. + * we must not preempt this drive until that is complete + */ + if (drive->doing_barrier) + return drive; + do { if ((!drive->sleep || time_after_eq(jiffies, drive->sleep)) && !elv_queue_empty(drive->queue)) { @@ -882,6 +1093,15 @@ queue_next: } /* + * if rq is a barrier write, issue pre cache flush if not + * already done + */ + if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) + rq = ide_queue_flush_cmd(drive, rq, 0); + + drive->last_rq_flush = 0; + + /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as * blk_stop_queue() doesn't prevent the elv_next_request() @@ -900,6 +1120,10 @@ queue_next: break; } + + /* + * we can only queue read-write requests, so let the drive + * queue drain before continuing with this command.
+ */ if (!rq->bio && ata_pending_commands(drive)) break; @@ -1305,6 +1529,7 @@ void ide_init_drive_cmd (struct request { memset(rq, 0, sizeof(*rq)); rq->flags = REQ_DRIVE_CMD; + rq->ref_count = 1; } EXPORT_SYMBOL(ide_init_drive_cmd); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/buffer.c linux-2.6.5-rc1-mm2/fs/buffer.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/buffer.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/buffer.c 2004-03-19 16:10:24.604901573 +0100 @@ -1589,6 +1589,7 @@ int try_to_release_page(struct page *pag return mapping->a_ops->releasepage(page, gfp_mask); return try_to_free_buffers(page); } +EXPORT_SYMBOL(try_to_release_page); /** * block_invalidatepage - invalidate part of all of a buffer-backed page @@ -2707,6 +2708,9 @@ int submit_bh(int rw, struct buffer_head if (rw == READ && buffer_dirty(bh)) buffer_error(); + if (buffer_ordered(bh) && (rw == WRITE)) + rw = WRITE_BARRIER; + /* Only clear out a write error when rewriting */ if (test_set_buffer_req(bh) && rw == WRITE) clear_buffer_write_io_error(bh); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/ext3/super.c linux-2.6.5-rc1-mm2/fs/ext3/super.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/ext3/super.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/ext3/super.c 2004-03-19 16:10:24.212943771 +0100 @@ -536,7 +536,8 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_noload, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_ignore, Opt_err, + Opt_ignore, Opt_barrier, + Opt_err, }; static match_table_t tokens = { @@ -575,6 +576,7 @@ static match_table_t tokens = { {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, + {Opt_barrier, "barrier=%u"}, {Opt_err, NULL} }; @@ -762,6 +764,14 @@ static int parse_options (char * options case Opt_abort: set_opt(sbi->s_mount_opt, ABORT); break; + case Opt_barrier: + if (match_int(&args[0], &option)) + return 0; + if (option) + set_opt(sbi->s_mount_opt, BARRIER); + else + clear_opt(sbi->s_mount_opt, BARRIER); + break; case Opt_ignore: break; default: @@ -1419,16 +1429,23 @@ out_fail: * initial mount, once the journal has been initialised but before we've * done any recovery; and again on any subsequent remount. */ -static void ext3_init_journal_params(struct ext3_sb_info *sbi, - journal_t *journal) +static void ext3_init_journal_params(struct super_block *sb, journal_t *journal) { + struct ext3_sb_info *sbi = EXT3_SB(sb); + if (sbi->s_commit_interval) journal->j_commit_interval = sbi->s_commit_interval; /* We could also set up an ext3-specific default for the commit * interval here, but for now we'll just fall back to the jbd * default. 
*/ -} + spin_lock(&journal->j_state_lock); + if (test_opt(sb, BARRIER)) + journal->j_flags |= JFS_BARRIER; + else + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); +} static journal_t *ext3_get_journal(struct super_block *sb, int journal_inum) { @@ -1465,7 +1482,7 @@ static journal_t *ext3_get_journal(struc iput(journal_inode); } journal->j_private = sb; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; } @@ -1550,7 +1567,7 @@ static journal_t *ext3_get_dev_journal(s goto out_journal; } EXT3_SB(sb)->journal_bdev = bdev; - ext3_init_journal_params(EXT3_SB(sb), journal); + ext3_init_journal_params(sb, journal); return journal; out_journal: journal_destroy(journal); @@ -1843,7 +1860,7 @@ int ext3_remount (struct super_block * s es = sbi->s_es; - ext3_init_journal_params(sbi, sbi->s_journal); + ext3_init_journal_params(sb, sbi->s_journal); if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/jbd/commit.c linux-2.6.5-rc1-mm2/fs/jbd/commit.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/jbd/commit.c 2004-03-11 03:55:44.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/jbd/commit.c 2004-03-19 16:10:24.488914060 +0100 @@ -636,6 +636,8 @@ wait_for_iobuf: { struct buffer_head *bh = jh2bh(descriptor); set_buffer_uptodate(bh); + if (journal->j_flags & JFS_BARRIER) + set_buffer_ordered(bh); sync_dirty_buffer(bh); if (unlikely(!buffer_uptodate(bh))) err = -EIO; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/Kconfig linux-2.6.5-rc1-mm2/fs/Kconfig --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/Kconfig 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/Kconfig 2004-03-19 16:10:24.604901573 +0100 @@ -244,6 +244,40 @@ config REISERFS_PROC_INFO Almost everyone but ReiserFS developers and people fine-tuning reiserfs or tracing problems should say N. +config REISERFS_FS_XATTR + bool "ReiserFS extended attributes" + depends on REISERFS_FS + help + Extended attributes are name:value pairs associated with inodes by + the kernel or by users (see the attr(5) manual page, or visit + for details). + + If unsure, say N. + +config REISERFS_FS_POSIX_ACL + bool "ReiserFS POSIX Access Control Lists" + depends on REISERFS_FS_XATTR + help + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + + To learn more about Access Control Lists, visit the Posix ACLs for + Linux website . + + If you don't know what Access Control Lists are, say N + +config REISERFS_FS_SECURITY + bool "ReiserFS Security Labels" + depends on REISERFS_FS_XATTR + help + Security labels support alternative access control models + implemented by security modules like SELinux. This option + enables an extended attribute handler for file security + labels in the ReiserFS filesystem. + + If you are not using a security module that requires using + extended attributes for file security labels, say N. + config JFS_FS tristate "JFS filesystem support" select NLS @@ -282,13 +316,13 @@ config JFS_STATISTICS to be made available to the user in the /proc/fs/jfs/ directory. config FS_POSIX_ACL -# Posix ACL utility routines (for now, only ext2/ext3/jfs) +# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs) # # NOTE: you can implement Posix ACLs without these helpers (XFS does). # Never use this symbol for ifdefs. 
# bool - depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL + depends on EXT2_FS_POSIX_ACL || EXT3_FS_POSIX_ACL || JFS_POSIX_ACL || REISERFS_FS_POSIX_ACL default y config XFS_FS diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c 2004-03-11 03:55:35.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/bitmap.c 2004-03-19 16:10:24.015964978 +0100 @@ -12,6 +12,7 @@ #include #include #include +#include #define PREALLOCATION_SIZE 9 @@ -281,7 +282,8 @@ static int scan_bitmap (struct reiserfs_ } static void _reiserfs_free_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) + struct inode *inode, b_blocknr_t block, + int for_unformatted) { struct super_block * s = th->t_super; struct reiserfs_super_block * rs; @@ -323,11 +325,13 @@ static void _reiserfs_free_block (struct set_sb_free_blocks( rs, sb_free_blocks(rs) + 1 ); journal_mark_dirty (th, s, sbh); - s->s_dirt = 1; + if (for_unformatted) + DQUOT_FREE_BLOCK_NODIRTY(inode, 1); } void reiserfs_free_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) + struct inode *inode, b_blocknr_t block, + int for_unformatted) { struct super_block * s = th->t_super; @@ -335,42 +339,46 @@ void reiserfs_free_block (struct reiserf RFALSE(is_reusable (s, block, 1) == 0, "vs-4071: can not free such block"); /* mark it before we clear it, just in case */ journal_mark_freed(th, s, block) ; - _reiserfs_free_block(th, block) ; + _reiserfs_free_block(th, inode, block, for_unformatted) ; } /* preallocated blocks don't need to be run through journal_mark_freed */ void reiserfs_free_prealloc_block (struct reiserfs_transaction_handle *th, - b_blocknr_t block) { + struct inode *inode, b_blocknr_t block) { RFALSE(!th->t_super, "vs-4060: trying to free block on nonexistent device"); RFALSE(is_reusable (th->t_super, block, 1) == 0, "vs-4070: can not free such block"); - _reiserfs_free_block(th, block) ; + _reiserfs_free_block(th, inode, block, 1) ; } static void __discard_prealloc (struct reiserfs_transaction_handle * th, struct reiserfs_inode_info *ei) { unsigned long save = ei->i_prealloc_block ; + int dirty = 0; + struct inode *inode = &ei->vfs_inode; #ifdef CONFIG_REISERFS_CHECK if (ei->i_prealloc_count < 0) reiserfs_warning("zam-4001:%s: inode has negative prealloc blocks count.\n", __FUNCTION__ ); #endif while (ei->i_prealloc_count > 0) { - reiserfs_free_prealloc_block(th,ei->i_prealloc_block); + reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); ei->i_prealloc_block++; ei->i_prealloc_count --; + dirty = 1; } + if (dirty) + reiserfs_update_sd(th, inode); ei->i_prealloc_block = save; list_del_init(&(ei->i_prealloc_list)); } /* FIXME: It should be inline function */ void reiserfs_discard_prealloc (struct reiserfs_transaction_handle *th, - struct inode * inode) + struct inode *inode) { struct reiserfs_inode_info *ei = REISERFS_I(inode); - if (ei->i_prealloc_count) { + if (ei->i_prealloc_count) __discard_prealloc(th, ei); - } } void reiserfs_discard_all_prealloc (struct reiserfs_transaction_handle *th) @@ -772,6 +780,24 @@ static inline int blocknrs_and_prealloc_ int nr_allocated = 0; determine_prealloc_size(hint); + if (!hint->formatted_node) { + int quota_ret; +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: allocating %d blocks id=%u\n", amount_needed, hint->inode->i_uid); +#endif + quota_ret = DQUOT_ALLOC_BLOCK_NODIRTY(hint->inode, amount_needed); + if (quota_ret) 
/* Quota exceeded? */ + return QUOTA_EXCEEDED; + if (hint->preallocate && hint->prealloc_size ) { +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: allocating (prealloc) %d blocks id=%u\n", hint->prealloc_size, hint->inode->i_uid); +#endif + quota_ret = DQUOT_PREALLOC_BLOCK_NODIRTY(hint->inode, hint->prealloc_size); + if (quota_ret) + hint->preallocate=hint->prealloc_size=0; + } + } + while((nr_allocated += allocate_without_wrapping_disk(hint, new_blocknrs + nr_allocated, start, finish, amount_needed - nr_allocated, hint->prealloc_size)) @@ -779,8 +805,14 @@ static inline int blocknrs_and_prealloc_ /* not all blocks were successfully allocated yet*/ if (second_pass) { /* it was a second pass; we must free all blocks */ + if (!hint->formatted_node) { +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: freeing (nospace) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + hint->prealloc_size - nr_allocated); /* Free not allocated blocks */ + } while (nr_allocated --) - reiserfs_free_block(hint->th, new_blocknrs[nr_allocated]); + reiserfs_free_block(hint->th, hint->inode, new_blocknrs[nr_allocated], !hint->formatted_node); return NO_DISK_SPACE; } else { /* refine search parameters for next pass */ @@ -789,7 +821,19 @@ static inline int blocknrs_and_prealloc_ start = 0; continue; } - } + } + if ( !hint->formatted_node && + amount_needed + hint->prealloc_size > + nr_allocated + REISERFS_I(hint->inode)->i_prealloc_count) { + /* Some of preallocation blocks were not allocated */ +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota: freeing (failed prealloc) %d blocks id=%u\n", amount_needed + hint->prealloc_size - nr_allocated - INODE_INFO(hint->inode)->i_prealloc_count, hint->inode->i_uid); +#endif + DQUOT_FREE_BLOCK_NODIRTY(hint->inode, amount_needed + + hint->prealloc_size - nr_allocated - + REISERFS_I(hint->inode)->i_prealloc_count); + } + return CARRY_ON; } @@ -858,7 +902,7 @@ int reiserfs_allocate_blocknrs(reiserfs_ if (ret != CARRY_ON) { while (amount_needed ++ < initial_amount_needed) { - reiserfs_free_block(hint->th, *(--new_blocknrs)); + reiserfs_free_block(hint->th, hint->inode, *(--new_blocknrs), 1); } } return ret; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c 2004-03-19 15:13:52.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/dir.c 2004-03-19 16:10:24.009965624 +0100 @@ -115,6 +115,17 @@ static int reiserfs_readdir (struct file /* too big to send back to VFS */ continue ; } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + filp->f_dentry == inode->i_sb->s_root && + REISERFS_SB(inode->i_sb)->priv_root && + REISERFS_SB(inode->i_sb)->priv_root->d_inode && + deh_objectid(deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) { + continue; + } + d_off = deh_offset (deh); filp->f_pos = d_off ; d_ino = deh_objectid (deh); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c 2004-03-11 03:55:23.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/do_balan.c 2004-03-19 16:10:24.008965731 +0100 @@ -30,34 +30,11 @@ struct tree_balance * cur_tb = NULL; /* is interrupting do_balance */ 
#endif -/* - * AKPM: The __mark_buffer_dirty() call here will not - * put the buffer on the dirty buffer LRU because we've just - * set BH_Dirty. That's a thinko in reiserfs. - * - * I'm reluctant to "fix" this bug because that would change - * behaviour. Using mark_buffer_dirty() here would make the - * buffer eligible for VM and periodic writeback, which may - * violate ordering constraints. I'll just leave the code - * as-is by removing the __mark_buffer_dirty call altogether. - * - * Chris says this code has "probably never been run" anyway. - * It is due to go away. - */ - inline void do_balance_mark_leaf_dirty (struct tree_balance * tb, struct buffer_head * bh, int flag) { - if (reiserfs_dont_log(tb->tb_sb)) { - if (!test_set_buffer_dirty(bh)) { -// __mark_buffer_dirty(bh) ; - tb->need_balance_dirty = 1; - } - } else { - int windex = push_journal_writer("do_balance") ; - journal_mark_dirty(tb->transaction_handle, tb->transaction_handle->t_super, bh) ; - pop_journal_writer(windex) ; - } + journal_mark_dirty(tb->transaction_handle, + tb->transaction_handle->t_super, bh) ; } #define do_balance_mark_internal_dirty do_balance_mark_leaf_dirty @@ -1257,7 +1234,7 @@ static void free_thrown(struct tree_bala if (buffer_dirty (tb->thrown[i])) printk ("free_thrown deals with dirty buffer %d\n", blocknr); brelse(tb->thrown[i]) ; /* incremented in store_thrown */ - reiserfs_free_block (tb->transaction_handle, blocknr); + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); } } } @@ -1270,10 +1247,6 @@ void reiserfs_invalidate_buffer (struct set_blkh_nr_item( blkh, 0 ); clear_buffer_dirty(bh); - /* reiserfs_free_block is no longer schedule safe - reiserfs_free_block (tb->transaction_handle, tb->tb_sb, bh->b_blocknr); - */ - store_thrown (tb, bh); } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/file.c linux-2.6.5-rc1-mm2/fs/reiserfs/file.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/file.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/file.c 2004-03-19 16:30:24.726645382 +0100 @@ -5,10 +5,15 @@ #include #include +#include +#include #include #include #include #include +#include +#include +#include /* ** We pack the tails of files on file close, not at the time they are written. @@ -29,7 +34,6 @@ static int reiserfs_file_release (struct { struct reiserfs_transaction_handle th ; - int windex ; if (!S_ISREG (inode->i_mode)) BUG (); @@ -59,9 +63,7 @@ static int reiserfs_file_release (struct appended (we append by unformatted node only) or its direct item(s) had to be converted, then it may have to be indirect2direct converted */ - windex = push_journal_writer("file_release") ; reiserfs_truncate_file(inode, 0) ; - pop_journal_writer(windex) ; } up (&inode->i_sem); reiserfs_write_unlock(inode->i_sb); @@ -86,63 +88,19 @@ static int reiserfs_sync_file( ) { struct inode * p_s_inode = p_s_dentry->d_inode; int n_err; - - reiserfs_write_lock(p_s_inode->i_sb); + int barrier_done; if (!S_ISREG(p_s_inode->i_mode)) BUG (); - n_err = sync_mapping_buffers(p_s_inode->i_mapping) ; - reiserfs_commit_for_inode(p_s_inode) ; + reiserfs_write_lock(p_s_inode->i_sb); + barrier_done = reiserfs_commit_for_inode(p_s_inode); reiserfs_write_unlock(p_s_inode->i_sb); + if (barrier_done != 1) + blkdev_issue_flush(p_s_inode->i_sb->s_bdev); return ( n_err < 0 ) ? 
-EIO : 0; } -static int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode ; - int error ; - reiserfs_write_lock(inode->i_sb); - if (attr->ia_valid & ATTR_SIZE) { - /* version 2 items will be caught by the s_maxbytes check - ** done for us in vmtruncate - */ - if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && - attr->ia_size > MAX_NON_LFS) { - error = -EFBIG ; - goto out; - } - /* fill in hole pointers in the expanding truncate case. */ - if (attr->ia_size > inode->i_size) { - error = generic_cont_expand(inode, attr->ia_size) ; - if (REISERFS_I(inode)->i_prealloc_count > 0) { - struct reiserfs_transaction_handle th ; - /* we're changing at most 2 bitmaps, inode + super */ - journal_begin(&th, inode->i_sb, 4) ; - reiserfs_discard_prealloc (&th, inode); - journal_end(&th, inode->i_sb, 4) ; - } - if (error) - goto out; - } - } - - if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || - ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && - (get_inode_sd_version (inode) == STAT_DATA_V1)) { - /* stat data of format v3.5 has 16 bit uid and gid */ - error = -EINVAL; - goto out; - } - - error = inode_change_ok(inode, attr) ; - if (!error) - inode_setattr(inode, attr) ; - -out: - reiserfs_write_unlock(inode->i_sb); - return error ; -} - /* I really do not want to play with memory shortage right now, so to simplify the code, we are not going to write more than this much pages at a time. This still should considerably improve performance compared to 4k @@ -153,6 +111,7 @@ out: Maps all unmapped but prepared pages from the list. Updates metadata with newly allocated blocknumbers as needed */ int reiserfs_allocate_blocks_for_region( + struct reiserfs_transaction_handle *th, struct inode *inode, /* Inode we work with */ loff_t pos, /* Writing position */ int num_pages, /* number of pages write going @@ -170,7 +129,6 @@ int reiserfs_allocate_blocks_for_region( struct cpu_key key; // cpu key of item that we are going to deal with struct item_head *ih; // pointer to item head that we are going to deal with struct buffer_head *bh; // Buffer head that contains items that we are going to deal with - struct reiserfs_transaction_handle th; // transaction handle for transaction we are going to create. __u32 * item; // pointer to item we are going to deal with INITIALIZE_PATH(path); // path to item, that we are going to deal with. b_blocknr_t allocated_blocks[blocks_to_allocate]; // Pointer to a place where allocated blocknumbers would be stored. Right now statically allocated, later that will change. @@ -197,7 +155,7 @@ int reiserfs_allocate_blocks_for_region( /* If we came here, it means we absolutely need to open a transaction, since we need to allocate some blocks */ reiserfs_write_lock(inode->i_sb); // Journaling stuff and we need that. - journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough + journal_begin(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); // Wish I know if this number enough reiserfs_update_inode_transaction(inode) ; /* Look for the in-tree position of our write, need path for block allocator */ @@ -209,14 +167,20 @@ int reiserfs_allocate_blocks_for_region( /* Allocate blocks */ /* First fill in "hint" structure for block allocator */ - hint.th = &th; // transaction handle. + hint.th = th; // transaction handle. hint.path = &path; // Path, so that block allocator can determine packing locality or whatever it needs to determine. 
hint.inode = inode; // Inode is needed by block allocator too. hint.search_start = 0; // We have no hint on where to search free blocks for block allocator. hint.key = key.on_disk_key; // on disk key of file. hint.block = inode->i_blocks>>(inode->i_sb->s_blocksize_bits-9); // Number of disk blocks this file occupies already. hint.formatted_node = 0; // We are allocating blocks for unformatted node. - hint.preallocate = 0; // We do not do any preallocation for now. + + /* only preallocate if this is a small write */ + if (blocks_to_allocate < + REISERFS_SB(inode->i_sb)->s_alloc_options.preallocsize) + hint.preallocate = 1; + else + hint.preallocate = 0; /* Call block allocator to allocate blocks */ res = reiserfs_allocate_blocknrs(&hint, allocated_blocks, blocks_to_allocate, blocks_to_allocate); @@ -225,7 +189,7 @@ int reiserfs_allocate_blocks_for_region( /* We flush the transaction in case of no space. This way some blocks might become free */ SB_JOURNAL(inode->i_sb)->j_must_wait = 1; - restart_transaction(&th, inode, &path); + restart_transaction(th, inode, &path); /* We might have scheduled, so search again */ res = search_for_position_by_key(inode->i_sb, &key, &path); @@ -280,7 +244,20 @@ int reiserfs_allocate_blocks_for_region( // position, and how many blocks it is going to cover (we need to // populate pointers to file blocks representing the hole with zeros) - hole_size = (pos + 1 - (le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key))+op_bytes_number(ih, inode->i_sb->s_blocksize))) >> inode->i_sb->s_blocksize_bits; + { + int item_offset = 1; + /* + * if ih is stat data, its offset is 0 and we don't want to + * add 1 to pos in the hole_size calculation + */ + if (is_statdata_le_ih(ih)) + item_offset = 0; + hole_size = (pos + item_offset - + (le_key_k_offset( get_inode_item_key_version(inode), + &(ih->ih_key)) + + op_bytes_number(ih, inode->i_sb->s_blocksize))) >> + inode->i_sb->s_blocksize_bits; + } if ( hole_size > 0 ) { int to_paste = min_t(__u64, hole_size, MAX_ITEM_LEN(inode->i_sb->s_blocksize)/UNFM_P_SIZE ); // How much data to insert first time. @@ -299,7 +276,7 @@ int reiserfs_allocate_blocks_for_region( /* Ok, there is existing indirect item already. Need to append it */ /* Calculate position past inserted item */ make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( &th, &path, &key, (char *)zeros, UNFM_P_SIZE*to_paste); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)zeros, UNFM_P_SIZE*to_paste); if ( res ) { kfree(zeros); goto error_exit_free_blocks; @@ -329,7 +306,7 @@ int reiserfs_allocate_blocks_for_region( kfree(zeros); goto error_exit_free_blocks; } - res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)zeros); + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)zeros); } else { reiserfs_panic(inode->i_sb, "green-9011: Unexpected key type %K\n", &key); } @@ -339,8 +316,8 @@ int reiserfs_allocate_blocks_for_region( } /* Now we want to check if transaction is too full, and if it is we restart it. This will also free the path. 
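(restart_transaction() quietly does nothing when the handle is nested; see the th->t_refcount check added in fs/reiserfs/inode.c below.)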
*/ - if (journal_transaction_should_end(&th, th.t_blocks_allocated)) - restart_transaction(&th, inode, &path); + if (journal_transaction_should_end(th, th->t_blocks_allocated)) + restart_transaction(th, inode, &path); /* Well, need to recalculate path and stuff */ set_cpu_key_k_offset( &key, cpu_key_k_offset(&key) + (to_paste << inode->i_blkbits)); @@ -371,7 +348,7 @@ retry: one. */ /* First if we are already modifying current item, log it */ if ( modifying_this_item ) { - journal_mark_dirty (&th, inode->i_sb, bh); + journal_mark_dirty (th, inode->i_sb, bh); modifying_this_item = 0; } /* Then set the key to look for a new indirect item (offset of old @@ -435,7 +412,7 @@ retry: if ( modifying_this_item ) { // We need to log last-accessed block, if it // was modified, but not logged yet. - journal_mark_dirty (&th, inode->i_sb, bh); + journal_mark_dirty (th, inode->i_sb, bh); } if ( curr_block < blocks_to_allocate ) { @@ -446,7 +423,7 @@ retry: // position. We do not need to recalculate path as it should // already point to correct place. make_cpu_key( &key, inode, le_key_k_offset( get_inode_item_key_version(inode), &(ih->ih_key)) + op_bytes_number(ih, inode->i_sb->s_blocksize), TYPE_INDIRECT, 3); - res = reiserfs_paste_into_item( &th, &path, &key, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); + res = reiserfs_paste_into_item( th, &path, &key, inode, (char *)(allocated_blocks+curr_block), UNFM_P_SIZE*(blocks_to_allocate-curr_block)); if ( res ) { goto error_exit_free_blocks; } @@ -477,29 +454,17 @@ retry: goto error_exit_free_blocks; } /* Insert item into the tree with the data as its body */ - res = reiserfs_insert_item( &th, &path, &key, &ins_ih, (char *)(allocated_blocks+curr_block)); + res = reiserfs_insert_item( th, &path, &key, &ins_ih, inode, (char *)(allocated_blocks+curr_block)); } else { reiserfs_panic(inode->i_sb, "green-9010: unexpected item type for key %K\n",&key); } } - /* Now the final thing, if we have grew the file, we must update it's size*/ - if ( pos + write_bytes > inode->i_size) { - inode->i_size = pos + write_bytes; // Set new size - /* If the file have grown so much that tail packing is no longer possible, reset - "need to pack" flag */ - if ( (have_large_tails (inode->i_sb) && inode->i_size > i_block_size (inode)*4) || - (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) ) - REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; - } - - /* Amount of on-disk blocks used by file have changed, update it */ - inode->i_blocks += blocks_to_allocate << (inode->i_blkbits - 9); - reiserfs_update_sd(&th, inode); // And update on-disk metadata - // finish all journal stuff now, We are not going to play with metadata - // anymore. + // the caller is responsible for closing the transaction + // unless we return an error, they are also responsible for logging + // the inode. 
+ // pathrelse(&path); - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); reiserfs_write_unlock(inode->i_sb); // go through all the pages/buffers and map the buffers to newly allocated @@ -530,6 +495,7 @@ retry: if ( !buffer_mapped(bh) ) { // Ok, unmapped buffer, need to map it map_bh( bh, inode->i_sb, le32_to_cpu(allocated_blocks[curr_block])); curr_block++; + set_buffer_new(bh); } } } @@ -543,10 +509,11 @@ error_exit_free_blocks: pathrelse(&path); // free blocks for( i = 0; i < blocks_to_allocate; i++ ) - reiserfs_free_block( &th, le32_to_cpu(allocated_blocks[i])); + reiserfs_free_block(th, inode, le32_to_cpu(allocated_blocks[i]), 1); error_exit: - journal_end(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); + reiserfs_update_sd(th, inode); // update any changes we made to blk count + journal_end(th, inode->i_sb, JOURNAL_PER_BALANCE_CNT * 3 + 1); reiserfs_write_unlock(inode->i_sb); return res; @@ -606,12 +573,63 @@ int reiserfs_copy_from_user_to_file_regi return page_fault?-EFAULT:0; } +/* taken fs/buffer.c:__block_commit_write */ +int reiserfs_commit_page(struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + unsigned block_start, block_end; + int partial = 0; + unsigned blocksize; + struct buffer_head *bh, *head; + unsigned long i_size_index = inode->i_size >> PAGE_CACHE_SHIFT; + int new; + + blocksize = 1 << inode->i_blkbits; + + for(bh = head = page_buffers(page), block_start = 0; + bh != head || !block_start; + block_start=block_end, bh = bh->b_this_page) + { + + new = buffer_new(bh); + clear_buffer_new(bh); + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (!buffer_uptodate(bh)) + partial = 1; + } else { + set_buffer_uptodate(bh); + if (!buffer_dirty(bh)) { + mark_buffer_dirty(bh); + /* do data=ordered on any page past the end + * of file and any buffer marked BH_New. + */ + if (reiserfs_data_ordered(inode->i_sb) && + (new || page->index >= i_size_index)) { + reiserfs_add_ordered_list(inode, bh); + } + } + } + } + + /* + * If this is a partial write which happened to make all buffers + * uptodate then we can optimize away a bogus readpage() for + * the next read(). Here we 'discover' whether the page went + * uptodate as a result of this (potentially partial) write. + */ + if (!partial) + SetPageUptodate(page); + return 0; +} /* Submit pages for write. This was separated from actual file copying because we might want to allocate block numbers in-between. This function assumes that caller will adjust file size to correct value. */ int reiserfs_submit_file_region_for_write( + struct reiserfs_transaction_handle *th, + struct inode *inode, loff_t pos, /* Writing position offset */ int num_pages, /* Number of pages to write */ int write_bytes, /* number of bytes to write */ @@ -622,12 +640,14 @@ int reiserfs_submit_file_region_for_writ int retval = 0; // Return value we are going to return. int i; // loop counter int offset; // Writing offset in page. + int orig_write_bytes = write_bytes; + int sd_update = 0; for ( i = 0, offset = (pos & (PAGE_CACHE_SIZE-1)); i < num_pages ; i++,offset=0) { int count = min_t(int,PAGE_CACHE_SIZE-offset,write_bytes); // How much of bytes to write to this page struct page *page=prepared_pages[i]; // Current page we process. - status = block_commit_write(page, offset, offset+count); + status = reiserfs_commit_page(inode, page, offset, offset+count); if ( status ) retval = status; // To not overcomplicate matters We are going to // submit all the pages even if there was error. 
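The hunks above and below rework the reiserfs write path so that a single transaction handle is threaded through allocation, copy and submission instead of each helper opening its own. Roughly, as a sketch of the new calling convention (simplified from reiserfs_file_write() below; error handling and the user-copy step omitted):

	struct reiserfs_transaction_handle th;
	th.t_trans_id = 0;	/* no transaction running yet */

	/* may journal_begin() and leave the transaction running */
	reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages,
			write_bytes, prepared_pages, blocks_to_allocate);

	/* marks the ordered/dirty buffers, updates i_size, then does
	 * journal_end() and clears th.t_trans_id */
	reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages,
			write_bytes, prepared_pages);

	if (th.t_trans_id) {	/* only true on error */
		journal_end(&th, th.t_super, th.t_blocks_allocated);
	}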
@@ -639,6 +659,41 @@ int reiserfs_submit_file_region_for_writ // to grab_cache_page page_cache_release(page); } + /* now that we've gotten all the ordered buffers marked dirty, + * we can safely update i_size and close any running transaction + */ + if ( pos + orig_write_bytes > inode->i_size) { + inode->i_size = pos + orig_write_bytes; // Set new size + /* If the file have grown so much that tail packing is no + * longer possible, reset "need to pack" flag */ + if ( (have_large_tails (inode->i_sb) && + inode->i_size > i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size > i_block_size(inode)) ) + REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ; + else if ( (have_large_tails (inode->i_sb) && + inode->i_size < i_block_size (inode)*4) || + (have_small_tails (inode->i_sb) && + inode->i_size < i_block_size(inode)) ) + REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; + + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + reiserfs_update_sd(th, inode); // And update on-disk metadata + reiserfs_write_unlock(inode->i_sb); + } else + inode->i_sb->s_op->dirty_inode(inode); + + sd_update = 1; + } + if (th->t_trans_id) { + reiserfs_write_lock(inode->i_sb); + if (!sd_update) + reiserfs_update_sd(th, inode); + journal_end(th, th->t_super, th->t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + } + th->t_trans_id = 0; return retval; } @@ -1006,19 +1061,18 @@ ssize_t reiserfs_file_write( struct file loff_t pos; // Current position in the file. size_t res; // return value of various functions that we call. struct inode *inode = file->f_dentry->d_inode; // Inode of the file that we are writing to. - struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; /* To simplify coding at this time, we store locked pages in array for now */ - if ( count <= PAGE_CACHE_SIZE ) - return generic_file_write(file, buf, count, ppos); + struct page * prepared_pages[REISERFS_WRITE_PAGES_AT_A_TIME]; + struct reiserfs_transaction_handle th; + th.t_trans_id = 0; - if ( file->f_flags & O_DIRECT) { // Direct IO needs some special threating. + if ( file->f_flags & O_DIRECT) { // Direct IO needs treatment int result, after_file_end = 0; if ( (*ppos + count >= inode->i_size) || (file->f_flags & O_APPEND) ) { /* If we are appending a file, we need to put this savelink in here. If we will crash while doing direct io, finish_unfinished will cut the garbage from the file end. */ - struct reiserfs_transaction_handle th; reiserfs_write_lock(inode->i_sb); journal_begin(&th, inode->i_sb, JOURNAL_PER_BALANCE_CNT ); reiserfs_update_inode_transaction(inode); @@ -1043,7 +1097,6 @@ ssize_t reiserfs_file_write( struct file return result; } - if ( unlikely((ssize_t) count < 0 )) return -EINVAL; @@ -1146,12 +1199,8 @@ ssize_t reiserfs_file_write( struct file if ( blocks_to_allocate > 0) {/*We only allocate blocks if we need to*/ /* Fill in all the possible holes and append the file if needed */ - res = reiserfs_allocate_blocks_for_region(inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); - } else if ( pos + write_bytes > inode->i_size ) { - /* File might have grown even though no new blocks were added */ - inode->i_size = pos + write_bytes; - inode->i_sb->s_op->dirty_inode(inode); - } + res = reiserfs_allocate_blocks_for_region(&th, inode, pos, num_pages, write_bytes, prepared_pages, blocks_to_allocate); + } /* well, we have allocated the blocks, so it is time to free the reservation we made earlier. 
*/ @@ -1173,7 +1222,8 @@ ssize_t reiserfs_file_write( struct file } /* Send the pages to disk and unlock them. */ - res = reiserfs_submit_file_region_for_write(pos, num_pages, write_bytes, prepared_pages); + res = reiserfs_submit_file_region_for_write(&th, inode, pos, num_pages, + write_bytes,prepared_pages); if ( res ) break; @@ -1184,10 +1234,17 @@ ssize_t reiserfs_file_write( struct file balance_dirty_pages_ratelimited(inode->i_mapping); } + /* this is only true on error */ + if (th.t_trans_id) { + reiserfs_write_lock(inode->i_sb); + journal_end(&th, th.t_super, th.t_blocks_allocated); + reiserfs_write_unlock(inode->i_sb); + } if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) res = generic_osync_inode(inode, file->f_mapping, OSYNC_METADATA|OSYNC_DATA); up(&inode->i_sem); + reiserfs_async_progress_wait(inode->i_sb); return (already_written != 0)?already_written:res; out: @@ -1219,6 +1276,11 @@ struct file_operations reiserfs_file_ope struct inode_operations reiserfs_file_inode_operations = { .truncate = reiserfs_vfs_truncate_file, .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c 2004-03-11 03:55:24.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/fix_node.c 2004-03-19 16:10:24.008965731 +0100 @@ -795,8 +795,9 @@ static int get_empty_nodes( else /* If we have enough already then there is nothing to do. */ return CARRY_ON; - if ( reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, - n_amount_needed) == NO_DISK_SPACE ) + /* No need to check quota - is not allocated for blocks used for formatted nodes */ + if (reiserfs_new_form_blocknrs (p_s_tb, a_n_blocknrs, + n_amount_needed) == NO_DISK_SPACE) return NO_DISK_SPACE; /* for each blocknumber we just got, get a buffer and stick it on FEB */ @@ -2106,9 +2107,9 @@ static void tb_buffer_sanity_check (stru {;} #endif -static void clear_all_dirty_bits(struct super_block *s, +static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh) { - reiserfs_prepare_for_journal(s, bh, 0) ; + return reiserfs_prepare_for_journal(s, bh, 0) ; } static int wait_tb_buffers_until_unlocked (struct tree_balance * p_s_tb) @@ -2137,11 +2138,11 @@ static int wait_tb_buffers_until_unlocke p_s_tb->tb_path->path_length - i); } #endif - clear_all_dirty_bits(p_s_tb->tb_sb, - PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ; - - if ( buffer_locked (PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i)) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, + PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i))) + { locked = PATH_OFFSET_PBUFFER (p_s_tb->tb_path, i); + } } } @@ -2151,22 +2152,19 @@ static int wait_tb_buffers_until_unlocke if ( p_s_tb->L[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->L[i], "L", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i]) ; - if ( buffer_locked (p_s_tb->L[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->L[i])) locked = p_s_tb->L[i]; } if ( !locked && p_s_tb->FL[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FL[i], "FL", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i]) ; - if ( buffer_locked (p_s_tb->FL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FL[i])) locked = p_s_tb->FL[i]; } if ( !locked && p_s_tb->CFL[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, 
p_s_tb->CFL[i], "CFL", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i]) ; - if ( buffer_locked (p_s_tb->CFL[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFL[i])) locked = p_s_tb->CFL[i]; } @@ -2176,23 +2174,20 @@ static int wait_tb_buffers_until_unlocke if ( p_s_tb->R[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->R[i], "R", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i]) ; - if ( buffer_locked (p_s_tb->R[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->R[i])) locked = p_s_tb->R[i]; } if ( !locked && p_s_tb->FR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->FR[i], "FR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i]) ; - if ( buffer_locked (p_s_tb->FR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FR[i])) locked = p_s_tb->FR[i]; } if ( !locked && p_s_tb->CFR[i] ) { tb_buffer_sanity_check (p_s_tb->tb_sb, p_s_tb->CFR[i], "CFR", i); - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i]) ; - if ( buffer_locked (p_s_tb->CFR[i]) ) + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->CFR[i])) locked = p_s_tb->CFR[i]; } } @@ -2207,10 +2202,8 @@ static int wait_tb_buffers_until_unlocke */ for ( i = 0; !locked && i < MAX_FEB_SIZE; i++ ) { if ( p_s_tb->FEB[i] ) { - clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i]) ; - if (buffer_locked(p_s_tb->FEB[i])) { + if (!clear_all_dirty_bits(p_s_tb->tb_sb, p_s_tb->FEB[i])) locked = p_s_tb->FEB[i] ; - } } } @@ -2280,7 +2273,6 @@ int fix_nodes (int n_op_mode, ** during wait_tb_buffers_run */ int wait_tb_buffers_run = 0 ; - int windex ; struct buffer_head * p_s_tbS0 = PATH_PLAST_BUFFER(p_s_tb->tb_path); ++ REISERFS_SB(p_s_tb -> tb_sb) -> s_fix_nodes; @@ -2407,10 +2399,7 @@ int fix_nodes (int n_op_mode, p_s_tb->insert_size[n_h + 1] = (DC_SIZE + KEY_SIZE) * (p_s_tb->blknum[n_h] - 1); } - - windex = push_journal_writer("fix_nodes") ; if ((n_ret_value = wait_tb_buffers_until_unlocked (p_s_tb)) == CARRY_ON) { - pop_journal_writer(windex) ; if (FILESYSTEM_CHANGED_TB(p_s_tb)) { wait_tb_buffers_run = 1 ; n_ret_value = REPEAT_SEARCH ; @@ -2420,7 +2409,6 @@ int fix_nodes (int n_op_mode, } } else { wait_tb_buffers_run = 1 ; - pop_journal_writer(windex) ; goto repeat; } @@ -2505,7 +2493,7 @@ void unfix_nodes (struct tree_balance * /* de-allocated block which was not used by balancing and bforget about buffer for it */ brelse (tb->FEB[i]); - reiserfs_free_block (tb->transaction_handle, blocknr); + reiserfs_free_block (tb->transaction_handle, NULL, blocknr, 0); } if (tb->used[i]) { /* release used as new nodes including a new root */ diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c 2004-03-11 03:55:28.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/ibalance.c 2004-03-19 16:10:24.009965624 +0100 @@ -633,7 +633,6 @@ static void balance_internal_when_delete /* use check_internal if new root is an internal node */ check_internal (new_root); /*&&&&&&&&&&&&&&&&&&&&&&*/ - tb->tb_sb->s_dirt = 1; /* do what is needed for buffer thrown from tree */ reiserfs_invalidate_buffer(tb, tbSh); @@ -951,7 +950,6 @@ int balance_internal (struct tree_balanc PUT_SB_ROOT_BLOCK( tb->tb_sb, tbSh->b_blocknr ); PUT_SB_TREE_HEIGHT( tb->tb_sb, SB_TREE_HEIGHT(tb->tb_sb) + 1 ); do_balance_mark_sb_dirty (tb, REISERFS_SB(tb->tb_sb)->s_sbh, 1); - tb->tb_sb->s_dirt = 1; } if ( tb->blknum[h] == 2 ) { diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 
linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/inode.c 2004-03-19 16:10:24.009965624 +0100 @@ -4,7 +4,10 @@ #include #include +#include #include +#include +#include #include #include #include @@ -13,6 +16,7 @@ #include #include #include +#include extern int reiserfs_default_io_size; /* default io size devuned in super.c */ @@ -22,29 +26,31 @@ extern int reiserfs_default_io_size; /* #define GET_BLOCK_NO_HOLE 2 /* return -ENOENT for file holes */ #define GET_BLOCK_READ_DIRECT 4 /* read the tail if indirect item not found */ #define GET_BLOCK_NO_ISEM 8 /* i_sem is not held, don't preallocate */ +#define GET_BLOCK_NO_DANGLE 16 /* don't leave any transactions running */ static int reiserfs_get_block (struct inode * inode, sector_t block, struct buffer_head * bh_result, int create); +static int reiserfs_commit_write(struct file *f, struct page *page, + unsigned from, unsigned to); void reiserfs_delete_inode (struct inode * inode) { int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2; - int windex ; struct reiserfs_transaction_handle th ; - reiserfs_write_lock(inode->i_sb); + DQUOT_FREE_INODE(inode); /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ down (&inode->i_sem); + reiserfs_delete_xattrs (inode); + journal_begin(&th, inode->i_sb, jbegin_count) ; reiserfs_update_inode_transaction(inode) ; - windex = push_journal_writer("delete_inode") ; reiserfs_delete_object (&th, inode); - pop_journal_writer(windex) ; journal_end(&th, inode->i_sb, jbegin_count) ; @@ -107,12 +113,6 @@ inline void make_le_item_head (struct it put_ih_entry_count( ih, entry_count ); } -static void add_to_flushlist(struct inode *inode, struct buffer_head *bh) { - struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; - - buffer_insert_list(&j->j_dirty_buffers_lock, bh, &j->j_dirty_buffers) ; -} - // // FIXME: we might cache recently accessed indirect item @@ -206,6 +206,10 @@ static int file_capable (struct inode * struct super_block *s = th->t_super ; int len = th->t_blocks_allocated ; + /* we cannot restart while nested */ + if (th->t_refcount > 1) { + return ; + } pathrelse(path) ; reiserfs_update_sd(th, inode) ; journal_end(th, s, len) ; @@ -437,7 +441,8 @@ static int reiserfs_get_blocks_direct_io reiserfs_get_block() */ bh_result->b_size = (1 << inode->i_blkbits); - ret = reiserfs_get_block(inode, iblock, bh_result, create) ; + ret = reiserfs_get_block(inode, iblock, bh_result, + create | GET_BLOCK_NO_DANGLE) ; /* don't allow direct io onto tail pages */ if (ret == 0 && buffer_mapped(bh_result) && bh_result->b_blocknr == 0) { @@ -510,15 +515,14 @@ static int convert_tail_for_hole(struct ** won't trigger a get_block in this case. 
*/ fix_tail_page_for_writing(tail_page) ; - retval = block_prepare_write(tail_page, tail_start, tail_end, - reiserfs_get_block) ; + retval = reiserfs_prepare_write(NULL, tail_page, tail_start, tail_end); if (retval) goto unlock ; /* tail conversion might change the data in the page */ flush_dcache_page(tail_page) ; - retval = generic_commit_write(NULL, tail_page, tail_start, tail_end) ; + retval = reiserfs_commit_write(NULL, tail_page, tail_start, tail_end) ; unlock: if (tail_page != hole_page) { @@ -557,8 +561,7 @@ int reiserfs_get_block (struct inode * i __u32 * item; int done; int fs_gen; - int windex ; - struct reiserfs_transaction_handle th ; + struct reiserfs_transaction_handle *th = NULL; /* space reserved in transaction batch: . 3 balancings in direct->indirect conversion . 1 block involved into reiserfs_update_sd() @@ -566,12 +569,11 @@ int reiserfs_get_block (struct inode * i can incur (much) more that 3 balancings. */ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3 + 1; int version; - int transaction_started = 0 ; + int dangle = 1; loff_t new_offset = (((loff_t)block) << inode->i_sb->s_blocksize_bits) + 1 ; /* bad.... */ reiserfs_write_lock(inode->i_sb); - th.t_trans_id = 0 ; version = get_inode_item_key_version (inode); if (block < 0) { @@ -595,6 +597,13 @@ int reiserfs_get_block (struct inode * i reiserfs_write_unlock(inode->i_sb); return ret; } + /* + * if we're already in a transaction, make sure to close + * any new transactions we start in this func + */ + if ((create & GET_BLOCK_NO_DANGLE) || + reiserfs_transaction_running(inode->i_sb)) + dangle = 0; /* If file is of such a size, that it might have a tail and tails are enabled ** we should mark it as possibly needing tail packing on close @@ -603,15 +612,17 @@ int reiserfs_get_block (struct inode * i (have_small_tails (inode->i_sb) && inode->i_size < i_block_size(inode)) ) REISERFS_I(inode)->i_flags |= i_pack_on_close_mask ; - windex = push_journal_writer("reiserfs_get_block") ; - /* set the key of the first byte in the 'block'-th block of file */ make_cpu_key (&key, inode, new_offset, TYPE_ANY, 3/*key length*/); if ((new_offset + inode->i_sb->s_blocksize - 1) > inode->i_size) { - journal_begin(&th, inode->i_sb, jbegin_count) ; +start_trans: + th = reiserfs_persistent_transaction(inode->i_sb, jbegin_count); + if (!th) { + retval = -ENOMEM; + goto failure; + } reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; } research: @@ -631,28 +642,29 @@ int reiserfs_get_block (struct inode * i if (allocation_needed (retval, allocated_block_nr, ih, item, pos_in_item)) { /* we have to allocate block for the unformatted node */ - if (!transaction_started) { + if (!th) { pathrelse(&path) ; - journal_begin(&th, inode->i_sb, jbegin_count) ; - reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; - goto research ; + goto start_trans; } - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, &path, create); + repeat = _allocate_block(th, block, inode, &allocated_block_nr, &path, create); - if (repeat == NO_DISK_SPACE) { + if (repeat == NO_DISK_SPACE || repeat == QUOTA_EXCEEDED) { /* restart the transaction to give the journal a chance to free ** some blocks. 
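The handle logic in reiserfs_get_block() above is easier to follow in isolation: a nested "begin" just bumps the refcount on the handle that is already running, and only the outermost "end" really commits. A toy userspace model of that refcounting (invented names, not the reiserfs API; reiserfs_persistent_transaction() does journal work this sketch leaves out):

#include <assert.h>
#include <stdio.h>

struct toy_handle {
    int refcount;
    int open;
};

static struct toy_handle running;   /* stands in for current->journal_info */

static struct toy_handle *toy_begin(void)
{
    if (running.open) {
        /* nested call: join the transaction already running */
        running.refcount++;
        return &running;
    }
    running.open = 1;
    running.refcount = 1;
    return &running;
}

static void toy_end(struct toy_handle *th)
{
    assert(th->refcount > 0);
    if (--th->refcount == 0) {
        /* only the outermost end really commits */
        th->open = 0;
        printf("commit\n");
    }
}

int main(void)
{
    struct toy_handle *outer = toy_begin();
    struct toy_handle *inner = toy_begin();  /* nests, no second trans */
    toy_end(inner);                          /* no commit yet */
    toy_end(outer);                          /* prints "commit" */
    return 0;
}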
releases the path, so we have to go back to ** research if we succeed on the second try */ - restart_transaction(&th, inode, &path) ; - repeat = _allocate_block(&th, block, inode, &allocated_block_nr, NULL, create); + SB_JOURNAL(inode->i_sb)->j_next_async_flush = 1; + restart_transaction(th, inode, &path) ; + repeat = _allocate_block(th, block, inode, &allocated_block_nr, NULL, create); - if (repeat != NO_DISK_SPACE) { + if (repeat != NO_DISK_SPACE && repeat != QUOTA_EXCEEDED) { goto research ; } - retval = -ENOSPC; + if (repeat == QUOTA_EXCEEDED) + retval = -EDQUOT; + else + retval = -ENOSPC; goto failure; } @@ -675,17 +687,17 @@ int reiserfs_get_block (struct inode * i goto research; } set_buffer_new(bh_result); + if (buffer_dirty(bh_result) && reiserfs_data_ordered(inode->i_sb)) + reiserfs_add_ordered_list(inode, bh_result); put_block_num(item, pos_in_item, allocated_block_nr) ; unfm_ptr = allocated_block_nr; - journal_mark_dirty (&th, inode->i_sb, bh); - inode->i_blocks += (inode->i_sb->s_blocksize / 512) ; - reiserfs_update_sd(&th, inode) ; + journal_mark_dirty (th, inode->i_sb, bh); + reiserfs_update_sd(th, inode) ; } set_block_dev_mapped(bh_result, unfm_ptr, inode); pathrelse (&path); - pop_journal_writer(windex) ; - if (transaction_started) - journal_end(&th, inode->i_sb, jbegin_count) ; + if (!dangle && th) + reiserfs_end_persistent_transaction(th); reiserfs_write_unlock(inode->i_sb); @@ -696,16 +708,9 @@ int reiserfs_get_block (struct inode * i return 0; } - if (!transaction_started) { - /* if we don't pathrelse, we could vs-3050 on the buffer if - ** someone is waiting for it (they can't finish until the buffer - ** is released, we can start a new transaction until they finish) - */ + if (!th) { pathrelse(&path) ; - journal_begin(&th, inode->i_sb, jbegin_count) ; - reiserfs_update_inode_transaction(inode) ; - transaction_started = 1 ; - goto research; + goto start_trans; } /* desired position is not found or is in the direct item. We have @@ -733,13 +738,11 @@ int reiserfs_get_block (struct inode * i set_cpu_key_k_offset (&tmp_key, 1); PATH_LAST_POSITION(&path) ++; - retval = reiserfs_insert_item (&th, &path, &tmp_key, &tmp_ih, (char *)&unp); + retval = reiserfs_insert_item (th, &path, &tmp_key, &tmp_ih, inode, (char *)&unp); if (retval) { - reiserfs_free_block (&th, allocated_block_nr); - goto failure; // retval == -ENOSPC or -EIO or -EEXIST + reiserfs_free_block (th, inode, allocated_block_nr, 1); + goto failure; // retval == -ENOSPC, -EDQUOT or -EIO or -EEXIST } - if (unp) - inode->i_blocks += inode->i_sb->s_blocksize / 512; //mark_tail_converted (inode); } else if (is_direct_le_ih (ih)) { /* direct item has to be converted */ @@ -759,8 +762,14 @@ int reiserfs_get_block (struct inode * i node. 
FIXME: this should also get into page cache */ pathrelse(&path) ; - journal_end(&th, inode->i_sb, jbegin_count) ; - transaction_started = 0 ; + /* + * ugly, but we can only end the transaction if + * we aren't nested + */ + if (th->t_refcount == 1) { + reiserfs_end_persistent_transaction(th); + th = NULL; + } retval = convert_tail_for_hole(inode, bh_result, tail_offset) ; if (retval) { @@ -768,18 +777,19 @@ int reiserfs_get_block (struct inode * i printk("clm-6004: convert tail failed inode %lu, error %d\n", inode->i_ino, retval) ; if (allocated_block_nr) { /* the bitmap, the super, and the stat data == 3 */ - journal_begin(&th, inode->i_sb, 3) ; - reiserfs_free_block (&th, allocated_block_nr); - transaction_started = 1 ; + if (!th) + th = reiserfs_persistent_transaction(inode->i_sb,3); + if (th) + reiserfs_free_block (th,inode,allocated_block_nr,1); } goto failure ; } goto research ; } - retval = direct2indirect (&th, inode, &path, unbh, tail_offset); + retval = direct2indirect (th, inode, &path, unbh, tail_offset); if (retval) { reiserfs_unmap_buffer(unbh); - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); goto failure; } /* it is important the set_buffer_uptodate is done after @@ -799,7 +809,7 @@ int reiserfs_get_block (struct inode * i /* we've converted the tail, so we must ** flush unbh before the transaction commits */ - add_to_flushlist(inode, unbh) ; + reiserfs_add_tail_list(inode, unbh) ; /* mark it dirty now to prevent commit_write from adding ** this buffer to the inode's dirty buffer list @@ -812,9 +822,6 @@ int reiserfs_get_block (struct inode * i */ mark_buffer_dirty(unbh) ; } - - //inode->i_blocks += inode->i_sb->s_blocksize / 512; - //mark_tail_converted (inode); } else { /* append indirect item with holes if needed, when appending pointer to 'block'-th block use block, which is already @@ -862,24 +869,21 @@ int reiserfs_get_block (struct inode * i only have space for one block */ blocks_needed=max_to_insert?max_to_insert:1; } - retval = reiserfs_paste_into_item (&th, &path, &tmp_key, (char *)un, UNFM_P_SIZE * blocks_needed); + retval = reiserfs_paste_into_item (th, &path, &tmp_key, inode, (char *)un, UNFM_P_SIZE * blocks_needed); if (blocks_needed != 1) kfree(un); if (retval) { - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); goto failure; } - if (done) { - inode->i_blocks += inode->i_sb->s_blocksize / 512; - } else { + if (!done) { /* We need to mark new file size in case this function will be interrupted/aborted later on. And we may do this only for holes. */ inode->i_size += inode->i_sb->s_blocksize * blocks_needed; } - //mark_tail_converted (inode); } if (done == 1) @@ -893,8 +897,8 @@ int reiserfs_get_block (struct inode * i ** release the path so that anybody waiting on the path before ** ending their transaction will be able to continue. */ - if (journal_transaction_should_end(&th, th.t_blocks_allocated)) { - restart_transaction(&th, inode, &path) ; + if (journal_transaction_should_end(th, th->t_blocks_allocated)) { + restart_transaction(th, inode, &path) ; } /* inserting indirect pointers for a hole can take a ** long time. 
reschedule if needed @@ -911,7 +915,7 @@ int reiserfs_get_block (struct inode * i "%K should not be found\n", &key); retval = -EEXIST; if (allocated_block_nr) - reiserfs_free_block (&th, allocated_block_nr); + reiserfs_free_block (th, inode, allocated_block_nr, 1); pathrelse(&path) ; goto failure; } @@ -925,11 +929,10 @@ int reiserfs_get_block (struct inode * i retval = 0; failure: - if (transaction_started) { - reiserfs_update_sd(&th, inode) ; - journal_end(&th, inode->i_sb, jbegin_count) ; + if (th && !dangle) { + reiserfs_update_sd(th, inode) ; + reiserfs_end_persistent_transaction(th); } - pop_journal_writer(windex) ; reiserfs_write_unlock(inode->i_sb); reiserfs_check_path(&path) ; return retval; @@ -942,6 +945,58 @@ reiserfs_readpages(struct file *file, st return mpage_readpages(mapping, pages, nr_pages, reiserfs_get_block); } +/* Compute real number of used bytes by file + * Following three functions can go away when we'll have enough space in stat item + */ +static int real_space_diff(struct inode *inode, int sd_size) +{ + int bytes; + loff_t blocksize = inode->i_sb->s_blocksize ; + + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) + return sd_size ; + + /* End of file is also in full block with indirect reference, so round + ** up to the next block. + ** + ** there is just no way to know if the tail is actually packed + ** on the file, so we have to assume it isn't. When we pack the + ** tail, we add 4 bytes to pretend there really is an unformatted + ** node pointer + */ + bytes = ((inode->i_size + (blocksize-1)) >> inode->i_sb->s_blocksize_bits) * UNFM_P_SIZE + sd_size; + return bytes ; +} + +static inline loff_t to_real_used_space(struct inode *inode, ulong blocks, + int sd_size) +{ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + return inode->i_size + (loff_t)(real_space_diff(inode, sd_size)) ; + } + return ((loff_t)real_space_diff(inode, sd_size)) + (((loff_t)blocks) << 9); +} + +/* Compute number of blocks used by file in ReiserFS counting */ +static inline ulong to_fake_used_blocks(struct inode *inode, int sd_size) +{ + loff_t bytes = inode_get_bytes(inode) ; + loff_t real_space = real_space_diff(inode, sd_size) ; + + /* keeps fsck and non-quota versions of reiserfs happy */ + if (S_ISLNK(inode->i_mode) || S_ISDIR(inode->i_mode)) { + bytes += (loff_t)511 ; + } + + /* files from before the quota patch might i_blocks such that + ** bytes < real_space. Deal with that here to prevent it from + ** going negative. + */ + if (bytes < real_space) + return 0 ; + return (bytes - real_space) >> 9; +} + // // BAD: new directories have stat data of new type and all other items // of old type. Version stored in the inode says about body items, so @@ -969,7 +1024,10 @@ static void init_inode (struct inode * i REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = NULL; + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); if (stat_data_v1 (ih)) { struct stat_data_v1 * sd = (struct stat_data_v1 *)B_I_PITEM (bh, ih); @@ -1004,6 +1062,14 @@ static void init_inode (struct inode * i rdev = sd_v1_rdev(sd); REISERFS_I(inode)->i_first_direct_byte = sd_v1_first_direct_byte(sd); + /* an early bug in the quota code can give us an odd number for the + ** block count. This is incorrect, fix it here. 
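The arithmetic in the three helpers above (real_space_diff, to_real_used_space, to_fake_used_blocks) is easier to check with concrete numbers. A standalone rerun of the same formulas for a 3-block regular file, 4k blocksize, v2 stat data (constants inlined for illustration only; the real values come from the reiserfs headers):

#include <stdio.h>

#define BLOCKSIZE   4096LL
#define UNFM_P_SIZE    4LL   /* bytes per unformatted node pointer */
#define SD_V2_SIZE    44LL   /* assumed v2 stat data size */

int main(void)
{
    long long i_size  = 10000;   /* needs 3 data blocks */
    long long nblocks = (i_size + BLOCKSIZE - 1) / BLOCKSIZE;

    /* overhead charged per file: one pointer per block + the stat data */
    long long diff = nblocks * UNFM_P_SIZE + SD_V2_SIZE;

    /* "real" used bytes = data blocks plus that overhead */
    long long bytes = nblocks * BLOCKSIZE + diff;

    /* the fake count stored in the stat data inverts it, in 512b units */
    printf("sd blocks: %lld\n", (bytes - diff) / 512);   /* prints 24 */
    return 0;
}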
+ */ + if (inode->i_blocks & 1) { + inode->i_blocks++ ; + } + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V1_SIZE)); /* nopack is initially zero for v1 objects. For v2 objects, nopack is initialised from sd_attrs */ REISERFS_I(inode)->i_flags &= ~i_nopack_mask; @@ -1036,6 +1102,8 @@ static void init_inode (struct inode * i set_inode_item_key_version (inode, KEY_FORMAT_3_6); REISERFS_I(inode)->i_first_direct_byte = 0; set_inode_sd_version (inode, STAT_DATA_V2); + inode_set_bytes(inode, to_real_used_space(inode, inode->i_blocks, + SD_V2_SIZE)); /* read persistent inode attributes from sd and initalise generic inode flags from them */ REISERFS_I(inode)->i_attrs = sd_v2_attrs( sd ); @@ -1051,7 +1119,7 @@ static void init_inode (struct inode * i inode->i_op = &reiserfs_dir_inode_operations; inode->i_fop = &reiserfs_dir_operations; } else if (S_ISLNK (inode->i_mode)) { - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; } else { inode->i_blocks = 0; @@ -1061,7 +1129,7 @@ static void init_inode (struct inode * i // update new stat data with inode fields -static void inode2sd (void * sd, struct inode * inode) +static void inode2sd (void * sd, struct inode * inode, loff_t size) { struct stat_data * sd_v2 = (struct stat_data *)sd; __u16 flags; @@ -1069,12 +1137,12 @@ static void inode2sd (void * sd, struct set_sd_v2_mode(sd_v2, inode->i_mode ); set_sd_v2_nlink(sd_v2, inode->i_nlink ); set_sd_v2_uid(sd_v2, inode->i_uid ); - set_sd_v2_size(sd_v2, inode->i_size ); + set_sd_v2_size(sd_v2, size ); set_sd_v2_gid(sd_v2, inode->i_gid ); set_sd_v2_mtime(sd_v2, inode->i_mtime.tv_sec ); set_sd_v2_atime(sd_v2, inode->i_atime.tv_sec ); set_sd_v2_ctime(sd_v2, inode->i_ctime.tv_sec ); - set_sd_v2_blocks(sd_v2, inode->i_blocks ); + set_sd_v2_blocks(sd_v2, to_fake_used_blocks(inode, SD_V2_SIZE)); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); else @@ -1086,7 +1154,7 @@ static void inode2sd (void * sd, struct // used to copy inode's fields to old stat data -static void inode2sd_v1 (void * sd, struct inode * inode) +static void inode2sd_v1 (void * sd, struct inode * inode, loff_t size) { struct stat_data_v1 * sd_v1 = (struct stat_data_v1 *)sd; @@ -1094,7 +1162,7 @@ static void inode2sd_v1 (void * sd, stru set_sd_v1_uid(sd_v1, inode->i_uid ); set_sd_v1_gid(sd_v1, inode->i_gid ); set_sd_v1_nlink(sd_v1, inode->i_nlink ); - set_sd_v1_size(sd_v1, inode->i_size ); + set_sd_v1_size(sd_v1, size ); set_sd_v1_atime(sd_v1, inode->i_atime.tv_sec ); set_sd_v1_ctime(sd_v1, inode->i_ctime.tv_sec ); set_sd_v1_mtime(sd_v1, inode->i_mtime.tv_sec ); @@ -1102,7 +1170,7 @@ static void inode2sd_v1 (void * sd, stru if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) set_sd_v1_rdev(sd_v1, new_encode_dev(inode->i_rdev)); else - set_sd_v1_blocks(sd_v1, inode->i_blocks ); + set_sd_v1_blocks(sd_v1, to_fake_used_blocks(inode, SD_V1_SIZE)); // Sigh. 
i_first_direct_byte is back set_sd_v1_first_direct_byte(sd_v1, REISERFS_I(inode)->i_first_direct_byte); @@ -1112,7 +1180,8 @@ static void inode2sd_v1 (void * sd, stru /* NOTE, you must prepare the buffer head before sending it here, ** and then log it after the call */ -static void update_stat_data (struct path * path, struct inode * inode) +static void update_stat_data (struct path * path, struct inode * inode, + loff_t size) { struct buffer_head * bh; struct item_head * ih; @@ -1126,17 +1195,17 @@ static void update_stat_data (struct pat if (stat_data_v1 (ih)) { // path points to old stat data - inode2sd_v1 (B_I_PITEM (bh, ih), inode); + inode2sd_v1 (B_I_PITEM (bh, ih), inode, size); } else { - inode2sd (B_I_PITEM (bh, ih), inode); + inode2sd (B_I_PITEM (bh, ih), inode, size); } return; } -void reiserfs_update_sd (struct reiserfs_transaction_handle *th, - struct inode * inode) +void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th, + struct inode * inode, loff_t size) { struct cpu_key key; INITIALIZE_PATH(path); @@ -1186,7 +1255,7 @@ void reiserfs_update_sd (struct reiserfs } break; } - update_stat_data (&path, inode); + update_stat_data (&path, inode, size); journal_mark_dirty(th, th->t_super, bh) ; pathrelse (&path); return; @@ -1469,6 +1538,7 @@ int reiserfs_sync_inode (struct reiserfs /* stat data of new object is inserted already, this inserts the item containing "." and ".." entries */ static int reiserfs_new_directory (struct reiserfs_transaction_handle *th, + struct inode *inode, struct item_head * ih, struct path * path, struct inode * dir) { @@ -1513,13 +1583,14 @@ static int reiserfs_new_directory (struc } /* insert item, that is empty directory item */ - return reiserfs_insert_item (th, path, &key, ih, body); + return reiserfs_insert_item (th, path, &key, ih, inode, body); } /* stat data of object has been inserted, this inserts the item containing the body of symlink */ static int reiserfs_new_symlink (struct reiserfs_transaction_handle *th, + struct inode *inode, /* Inode of symlink */ struct item_head * ih, struct path * path, const char * symname, int item_len) { @@ -1549,7 +1620,7 @@ static int reiserfs_new_symlink (struct } /* insert item, that is body of symlink */ - return reiserfs_insert_item (th, path, &key, ih, symname); + return reiserfs_insert_item (th, path, &key, ih, inode, symname); } @@ -1617,7 +1688,8 @@ int reiserfs_new_inode (struct reiserfs_ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_size = i_size; - inode->i_blocks = (inode->i_size + 511) >> 9; + inode->i_blocks = 0; + inode->i_bytes = 0; REISERFS_I(inode)->i_first_direct_byte = S_ISLNK(mode) ? 
1 : U32_MAX/*NO_BYTES_IN_DIRECT_ITEM*/; @@ -1626,10 +1698,13 @@ int reiserfs_new_inode (struct reiserfs_ REISERFS_I(inode)->i_prealloc_block = 0; REISERFS_I(inode)->i_prealloc_count = 0; REISERFS_I(inode)->i_trans_id = 0; - REISERFS_I(inode)->i_trans_index = 0; + REISERFS_I(inode)->i_jl = 0; REISERFS_I(inode)->i_attrs = REISERFS_I(dir)->i_attrs & REISERFS_INHERIT_MASK; sd_attrs_to_i_attrs( REISERFS_I(inode) -> i_attrs, inode ); + REISERFS_I(inode)->i_acl_access = NULL; + REISERFS_I(inode)->i_acl_default = NULL; + init_rwsem (&REISERFS_I(inode)->xattr_sem); if (old_format_only (sb)) make_le_item_head (&ih, 0, KEY_FORMAT_3_5, SD_OFFSET, TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); @@ -1659,9 +1734,9 @@ int reiserfs_new_inode (struct reiserfs_ err = -EINVAL; goto out_bad_inode; } - inode2sd_v1 (&sd, inode); + inode2sd_v1 (&sd, inode, inode->i_size); } else { - inode2sd (&sd, inode); + inode2sd (&sd, inode, inode->i_size); } // these do not go to on-disk stat data inode->i_ino = le32_to_cpu (ih.ih_key.k_objectid); @@ -1685,7 +1760,7 @@ int reiserfs_new_inode (struct reiserfs_ if (REISERFS_I(dir)->new_packing_locality) th->displace_new_blocks = 1; #endif - retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, (char *)(&sd)); + retval = reiserfs_insert_item (th, &path_to_key, &key, &ih, inode, (char *)(&sd)); if (retval) { err = retval; reiserfs_check_path(&path_to_key) ; @@ -1698,14 +1773,14 @@ int reiserfs_new_inode (struct reiserfs_ #endif if (S_ISDIR(mode)) { /* insert item with "." and ".." */ - retval = reiserfs_new_directory (th, &ih, &path_to_key, dir); + retval = reiserfs_new_directory (th, inode, &ih, &path_to_key, dir); } if (S_ISLNK(mode)) { /* insert body of symlink */ if (!old_format_only (sb)) i_size = ROUND_UP(i_size); - retval = reiserfs_new_symlink (th, &ih, &path_to_key, symname, i_size); + retval = reiserfs_new_symlink (th, inode, &ih, &path_to_key, symname, i_size); } if (retval) { err = retval; @@ -1714,6 +1789,19 @@ int reiserfs_new_inode (struct reiserfs_ goto out_inserted_sd; } + /* XXX CHECK THIS */ + if (reiserfs_posixacl (inode->i_sb)) { + retval = reiserfs_inherit_default_acl (dir, dentry, inode); + if (retval) { + err = retval; + reiserfs_check_path(&path_to_key) ; + journal_end(th, th->t_super, th->t_blocks_allocated); + goto out_inserted_sd; + } + } else if (inode->i_sb->s_flags & MS_POSIXACL) { + reiserfs_warning ("ACLs aren't enabled in the fs, but vfs thinks they are!\n"); + } + insert_inode_hash (inode); reiserfs_update_sd(th, inode); reiserfs_check_path(&path_to_key) ; @@ -1730,6 +1818,9 @@ out_bad_inode: /* dquot_drop must be done outside a transaction */ journal_end(th, th->t_super, th->t_blocks_allocated) ; + DQUOT_FREE_INODE(inode); + DQUOT_DROP(inode); + inode->i_flags |= S_NOQUOTA; make_bad_inode(inode); out_inserted_sd: @@ -1832,7 +1923,6 @@ unlock: */ void reiserfs_truncate_file(struct inode *p_s_inode, int update_timestamps) { struct reiserfs_transaction_handle th ; - int windex ; /* we want the offset for the first byte after the end of the file */ unsigned long offset = p_s_inode->i_size & (PAGE_CACHE_SIZE - 1) ; unsigned blocksize = p_s_inode->i_sb->s_blocksize ; @@ -1867,14 +1957,12 @@ void reiserfs_truncate_file(struct inode cut_from_item. 
1 is for update_sd */ journal_begin(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ; reiserfs_update_inode_transaction(p_s_inode) ; - windex = push_journal_writer("reiserfs_vfs_truncate_file") ; if (update_timestamps) /* we are doing real truncate: if the system crashes before the last transaction of truncating gets committed - on reboot the file either appears truncated properly or not truncated at all */ add_save_link (&th, p_s_inode, 1); reiserfs_do_truncate (&th, p_s_inode, page, update_timestamps) ; - pop_journal_writer(windex) ; journal_end(&th, p_s_inode->i_sb, JOURNAL_PER_BALANCE_CNT * 2 + 1 ) ; if (update_timestamps) @@ -2015,7 +2103,8 @@ out: /* this is where we fill in holes in the file. */ if (use_get_block) { retval = reiserfs_get_block(inode, block, bh_result, - GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM) ; + GET_BLOCK_CREATE | GET_BLOCK_NO_ISEM | + GET_BLOCK_NO_DANGLE); if (!retval) { if (!buffer_mapped(bh_result) || bh_result->b_blocknr == 0) { /* get_block failed to find a mapped unformatted node. */ @@ -2037,32 +2126,6 @@ out: return retval ; } -/* - * does the right thing for deciding when to lock a buffer and - * mark it for io during a writepage. make sure the buffer is - * dirty before sending it here though. - */ -static void lock_buffer_for_writepage(struct page *page, - struct writeback_control *wbc, - struct buffer_head *bh) -{ - if (wbc->sync_mode != WB_SYNC_NONE) { - lock_buffer(bh); - } else { - if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); - return; - } - } - if (test_clear_buffer_dirty(bh)) { - if (!buffer_uptodate(bh)) - buffer_error(); - mark_buffer_async_write(bh); - } else { - unlock_buffer(bh); - } -} - /* * mason@suse.com: updated in 2.5.54 to follow the same general io * start/recovery path as __block_write_full_page, along with special @@ -2110,29 +2173,52 @@ static int reiserfs_write_full_page(stru } bh = head ; block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits) ; + /* first map all the buffers, logging any direct items we find */ do { - get_bh(bh); - if (buffer_dirty(bh)) { - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - /* buffer mapped to an unformatted node */ - lock_buffer_for_writepage(page, wbc, bh); - } else { - /* not mapped yet, or it points to a direct item, search - * the btree for the mapping info, and log any direct - * items found - */ - if ((error = map_block_for_writepage(inode, bh, block))) { - goto fail ; - } - if (buffer_mapped(bh) && bh->b_blocknr != 0) { - lock_buffer_for_writepage(page, wbc, bh); - } + if (buffer_dirty(bh) && (!buffer_mapped(bh) || + (buffer_mapped(bh) && bh->b_blocknr == 0))) { + /* not mapped yet, or it points to a direct item, search + * the btree for the mapping info, and log any direct + * items found + */ + if ((error = map_block_for_writepage(inode, bh, block))) { + goto fail ; } } bh = bh->b_this_page; block++; } while(bh != head) ; + /* now go through and lock any dirty buffers on the page */ + do { + get_bh(bh); + if (!buffer_mapped(bh)) + continue; + if (buffer_mapped(bh) && bh->b_blocknr == 0) + continue; + + /* from this point on, we know the buffer is mapped to a + * real block and not a direct item + */ + if (wbc->sync_mode != WB_SYNC_NONE) { + lock_buffer(bh); + } else { + if (test_set_buffer_locked(bh)) { + __set_page_dirty_nobuffers(page); + continue; + } + } + if (test_clear_buffer_dirty(bh)) { + if (!buffer_uptodate(bh)) + buffer_error(); + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + + bh = bh->b_this_page; + } 
while((bh = bh->b_this_page) != head); + BUG_ON(PageWriteback(page)); set_page_writeback(page); unlock_page(page); @@ -2227,13 +2313,43 @@ static int reiserfs_writepage (struct pa return reiserfs_write_full_page(page, wbc) ; } - int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { struct inode *inode = page->mapping->host ; + int ret; + int old_ref = 0; + reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; - return block_prepare_write(page, from, to, reiserfs_get_block) ; + if (reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th; + th = (struct reiserfs_transaction_handle *)current->journal_info; + old_ref = th->t_refcount; + th->t_refcount++; + } + + ret = block_prepare_write(page, from, to, reiserfs_get_block) ; + if (ret && reiserfs_transaction_running(inode->i_sb)) { + struct reiserfs_transaction_handle *th = current->journal_info; + /* this gets a little ugly. If reiserfs_get_block returned an + * error and left a transacstion running, we've got to close it, + * and we've got to free handle if it was a persistent transaction. + * + * But, if we had nested into an existing transaction, we need + * to just drop the ref count on the handle. + * + * If old_ref == 0, the transaction is from reiserfs_get_block, + * and it was a persistent trans. Otherwise, it was nested above. + */ + if (th->t_refcount > old_ref) { + if (old_ref) + th->t_refcount--; + else + reiserfs_end_persistent_transaction(th); + } + } + return ret; + } @@ -2245,16 +2361,22 @@ static int reiserfs_commit_write(struct unsigned from, unsigned to) { struct inode *inode = page->mapping->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - int ret ; + int ret = 0; + int update_sd = 0; + struct reiserfs_transaction_handle *th = NULL; reiserfs_wait_on_write_block(inode->i_sb) ; + if (reiserfs_transaction_running(inode->i_sb)) { + th = current->journal_info; + } + reiserfs_commit_page(inode, page, from, to); /* generic_commit_write does this for us, but does not update the ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. 
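The cleanup rule in the reiserfs_prepare_write() error path above boils down to one comparison. A self-contained restatement of that decision (toy handle type, invented names; it only mirrors the refcount bookkeeping, not the journal):

#include <stdio.h>

struct toy_th {
    int refcount;
};

/* same decision the error path makes: if get_block grew the refcount,
 * either drop our nested reference or end the persistent handle
 */
static void cleanup(struct toy_th *th, int old_ref)
{
    if (th->refcount > old_ref) {
        if (old_ref)
            th->refcount--;     /* nested: just drop the ref */
        else
            printf("end persistent transaction\n");
    }
}

int main(void)
{
    struct toy_th nested = { .refcount = 2 };
    struct toy_th fresh  = { .refcount = 1 };

    cleanup(&nested, 1);   /* refcount back to 1, nothing ended */
    cleanup(&fresh, 0);    /* prints the end-transaction case */
    return 0;
}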
     if (pos > inode->i_size) {
-	struct reiserfs_transaction_handle th ;
+	struct reiserfs_transaction_handle myth ;
 	reiserfs_write_lock(inode->i_sb);
 	/* If the file has grown beyond the border where it
 	   can have a tail, unmark it as needing a tail
@@ -2263,16 +2385,22 @@ static int reiserfs_commit_write(struct
 	    (have_small_tails (inode->i_sb) && inode->i_size > i_block_size(inode)) )
 	    REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask ;

-	journal_begin(&th, inode->i_sb, 1) ;
+	journal_begin(&myth, inode->i_sb, 1) ;
 	reiserfs_update_inode_transaction(inode) ;
 	inode->i_size = pos ;
-	reiserfs_update_sd(&th, inode) ;
-	journal_end(&th, inode->i_sb, 1) ;
+	reiserfs_update_sd(&myth, inode) ;
+	update_sd = 1;
+	journal_end(&myth, inode->i_sb, 1) ;
+	reiserfs_write_unlock(inode->i_sb);
+    }
+    if (th) {
+	reiserfs_write_lock(inode->i_sb);
+	if (!update_sd)
+	    reiserfs_update_sd(th, inode) ;
+	reiserfs_end_persistent_transaction(th);
 	reiserfs_write_unlock(inode->i_sb);
     }

-    ret = generic_commit_write(f, page, from, to) ;
-
     /* we test for O_SYNC here so we can commit the transaction
     ** for any packed tails the file might have had
     */
@@ -2332,16 +2460,110 @@ void i_attrs_to_sd_attrs( struct inode *
 	}
     }

+/* decide if this buffer needs to stay around for data logging or ordered
+** write purposes
+*/
+static int invalidatepage_can_drop(struct inode *inode, struct buffer_head *bh)
+{
+    int ret = 1 ;
+    struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ;
+
+    spin_lock(&j->j_dirty_buffers_lock) ;
+    if (!buffer_mapped(bh)) {
+	goto free_jh;
+    }
+    /* the page is locked, and the only places that log a data buffer
+     * also lock the page.
+     */
+#if 0
+    if (reiserfs_file_data_log(inode)) {
+	/* very conservative, leave the buffer pinned if anyone might need it.
+	** this should be changed to drop the buffer if it is only in the
+	** current transaction
+	*/
+	if (buffer_journaled(bh) || buffer_journal_dirty(bh)) {
+	    ret = 0 ;
+	}
+    } else
+#endif
+    if (buffer_dirty(bh) || buffer_locked(bh)) {
+	struct reiserfs_journal_list *jl;
+	struct reiserfs_jh *jh = bh->b_private;
+
+	/* why is this safe?
+	 * reiserfs_setattr updates i_size in the on disk
+	 * stat data before allowing vmtruncate to be called.
+	 *
+	 * If buffer was put onto the ordered list for this
+	 * transaction, we know for sure either this transaction
+	 * or an older one already has updated i_size on disk,
+	 * and this ordered data won't be referenced in the file
+	 * if we crash.
+	 *
+	 * if the buffer was put onto the ordered list for an older
+	 * transaction, we need to leave it around
+	 */
+	if (jh && (jl = jh->jl) && jl != SB_JOURNAL(inode->i_sb)->j_current_jl)
+	    ret = 0;
+    }
+free_jh:
+    if (ret && bh->b_private) {
+	reiserfs_free_jh(bh);
+    }
+    spin_unlock(&j->j_dirty_buffers_lock) ;
+    return ret ;
+}
+
+/* clm -- taken from fs/buffer.c:block_invalidate_page */
+static int reiserfs_invalidatepage(struct page *page, unsigned long offset)
+{
+    struct buffer_head *head, *bh, *next;
+    struct inode *inode = page->mapping->host;
+    unsigned int curr_off = 0;
+    int ret = 1;
+
+    BUG_ON(!PageLocked(page));
+    if (!page_has_buffers(page))
+	goto out;
+
+    head = page_buffers(page);
+    bh = head;
+    do {
+	unsigned int next_off = curr_off + bh->b_size;
+	next = bh->b_this_page;
+
+	/*
+	 * is this block fully invalidated?
+ */ + if (offset <= curr_off) { + if (invalidatepage_can_drop(inode, bh)) + reiserfs_unmap_buffer(bh); + else + ret = 0; + } + curr_off = next_off; + bh = next; + } while (bh != head); + + /* + * We release buffers only if the entire page is being invalidated. + * The get_block cached value has been unconditionally invalidated, + * so real IO is not possible anymore. + */ + if (!offset && ret) + ret = try_to_release_page(page, 0); +out: + return ret; +} + /* * Returns 1 if the page's buffers were dropped. The page is locked. * * Takes j_dirty_buffers_lock to protect the b_assoc_buffers list_heads * in the buffers at page_buffers(page). * - * FIXME: Chris says the buffer list is not used with `mount -o notail', - * so in that case the fs can avoid the extra locking. Create a second - * address_space_operations with a NULL ->releasepage and install that - * into new address_spaces. + * even in -o notail mode, we can't be sure an old mount without -o notail + * didn't create files with tails. */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { @@ -2355,11 +2577,13 @@ static int reiserfs_releasepage(struct p head = page_buffers(page) ; bh = head ; do { - if (!buffer_dirty(bh) && !buffer_locked(bh)) { - list_del_init(&bh->b_assoc_buffers) ; - } else { + if (bh->b_private) { + if (!buffer_dirty(bh) && !buffer_locked(bh)) { + reiserfs_free_jh(bh); + } else { ret = 0 ; break ; + } } bh = bh->b_this_page ; } while (bh != head) ; @@ -2381,12 +2605,75 @@ static int reiserfs_direct_IO(int rw, st offset, nr_segs, reiserfs_get_blocks_direct_io, NULL); } +int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { + struct inode *inode = dentry->d_inode ; + int error ; + unsigned int ia_valid = attr->ia_valid; + reiserfs_write_lock(inode->i_sb); + if (attr->ia_valid & ATTR_SIZE) { + /* version 2 items will be caught by the s_maxbytes check + ** done for us in vmtruncate + */ + if (get_inode_item_key_version(inode) == KEY_FORMAT_3_5 && + attr->ia_size > MAX_NON_LFS) { + error = -EFBIG ; + goto out; + } + /* fill in hole pointers in the expanding truncate case. */ + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand(inode, attr->ia_size) ; + if (REISERFS_I(inode)->i_prealloc_count > 0) { + struct reiserfs_transaction_handle th ; + /* we're changing at most 2 bitmaps, inode + super */ + journal_begin(&th, inode->i_sb, 4) ; + reiserfs_discard_prealloc (&th, inode); + journal_end(&th, inode->i_sb, 4) ; + } + if (error) + goto out; + } + } + + if ((((attr->ia_valid & ATTR_UID) && (attr->ia_uid & ~0xffff)) || + ((attr->ia_valid & ATTR_GID) && (attr->ia_gid & ~0xffff))) && + (get_inode_sd_version (inode) == STAT_DATA_V1)) { + /* stat data of format v3.5 has 16 bit uid and gid */ + error = -EINVAL; + goto out; + } + + error = inode_change_ok(inode, attr) ; + if (!error) { + if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + error = reiserfs_chown_xattrs (inode, attr); + + if (!error) + error = DQUOT_TRANSFER(inode, attr) ? 
-EDQUOT : 0; + } + if (!error) + inode_setattr(inode, attr) ; + } + + + if (!error && reiserfs_posixacl (inode->i_sb)) { + if (attr->ia_valid & ATTR_MODE) + error = reiserfs_acl_chmod (inode); + } + +out: + reiserfs_write_unlock(inode->i_sb); + return error ; +} + + struct address_space_operations reiserfs_address_space_operations = { .writepage = reiserfs_writepage, .readpage = reiserfs_readpage, .readpages = reiserfs_readpages, .releasepage = reiserfs_releasepage, + .invalidatepage = reiserfs_invalidatepage, .sync_page = block_sync_page, .prepare_write = reiserfs_prepare_write, .commit_write = reiserfs_commit_write, diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/ioctl.c 2004-03-19 16:10:24.009965624 +0100 @@ -92,6 +92,7 @@ int reiserfs_unpack (struct inode * inod int retval = 0; int index ; struct page *page ; + struct address_space *mapping ; unsigned long write_from ; unsigned long blocksize = inode->i_sb->s_blocksize ; @@ -122,17 +123,19 @@ int reiserfs_unpack (struct inode * inod ** reiserfs_get_block to unpack the tail for us. */ index = inode->i_size >> PAGE_CACHE_SHIFT ; - page = grab_cache_page(inode->i_mapping, index) ; + mapping = inode->i_mapping ; + page = grab_cache_page(mapping, index) ; retval = -ENOMEM; if (!page) { goto out ; } - retval = reiserfs_prepare_write(NULL, page, write_from, blocksize) ; + retval = mapping->a_ops->prepare_write(NULL, page, write_from, write_from) ; if (retval) goto out_unlock ; /* conversion can change page contents, must flush */ flush_dcache_page(page) ; + retval = mapping->a_ops->commit_write(NULL, page, write_from, write_from) ; REISERFS_I(inode)->i_flags |= i_nopack_mask; out_unlock: diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c 2004-03-11 03:55:34.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/journal.c 2004-03-19 16:31:05.280277171 +0100 @@ -32,13 +32,6 @@ ** around too long. ** -- Note, if you call this as an immediate flush from ** from within kupdate, it will ignore the immediate flag -** -** The commit thread -- a writer process for async commits. It allows a -** a process to request a log flush on a task queue. -** the commit will happen once the commit thread wakes up. -** The benefit here is the writer (with whatever -** related locks it has) doesn't have to wait for the -** log blocks to hit disk if it doesn't want to. */ #include @@ -60,6 +53,15 @@ #include #include #include +#include +#include + + +/* gets a struct reiserfs_journal_list * from a list head */ +#define JOURNAL_LIST_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_list)) +#define JOURNAL_WORK_ENTRY(h) (list_entry((h), struct reiserfs_journal_list, \ + j_working_list)) /* the number of mounted filesystems. 
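JOURNAL_LIST_ENTRY/JOURNAL_WORK_ENTRY above are the stock list_entry() (container_of) trick: recover the containing struct from a pointer to its embedded list_head by subtracting the member offset. A self-contained version with a toy struct (not the real reiserfs_journal_list):

#include <stddef.h>
#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

struct toy_jl {
    unsigned long j_trans_id;
    struct list_head j_list;
};

/* same shape as JOURNAL_LIST_ENTRY: subtract the member offset */
#define TOY_LIST_ENTRY(h) \
    ((struct toy_jl *)((char *)(h) - offsetof(struct toy_jl, j_list)))

int main(void)
{
    struct toy_jl jl = { .j_trans_id = 42 };
    struct list_head *pos = &jl.j_list;

    printf("trans id %lu\n", TOY_LIST_ENTRY(pos)->j_trans_id);   /* 42 */
    return 0;
}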
This is used to decide when to ** start and kill the commit workqueue @@ -78,6 +80,12 @@ static struct workqueue_struct *commit_w #define BLOCK_FREED_HOLDER 3 /* this block was freed during this transaction, and can't be written */ #define BLOCK_NEEDS_FLUSH 4 /* used in flush_journal_list */ +#define BLOCK_DIRTIED 5 + + +/* journal list state bits */ +#define LIST_TOUCHED 1 +#define LIST_DIRTY 2 /* flags for do_journal_end */ #define FLUSH_ALL 1 /* flush commit and real blocks */ @@ -86,6 +94,9 @@ static struct workqueue_struct *commit_w /* state bits for the journal */ #define WRITERS_BLOCKED 1 /* set when new writers not allowed */ +#define WRITERS_QUEUED 2 /* set when log is full due to too many + * writers + */ static int do_journal_end(struct reiserfs_transaction_handle *,struct super_block *,unsigned long nblocks,int flags) ; static int flush_journal_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) ; @@ -94,6 +105,9 @@ static int can_dirty(struct reiserfs_jou static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks); static int release_journal_dev( struct super_block *super, struct reiserfs_journal *journal ); +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl); +static void flush_async_commits(void *p); static void init_journal_hash(struct super_block *p_s_sb) { memset(SB_JOURNAL(p_s_sb)->j_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; @@ -105,11 +119,19 @@ static void init_journal_hash(struct sup ** more details. */ static int reiserfs_clean_and_file_buffer(struct buffer_head *bh) { - if (bh) + if (bh) { clear_buffer_dirty(bh); + clear_bit(BH_JTest, &bh->b_state); + } return 0 ; } +static void disable_barrier(struct super_block *s) +{ + REISERFS_SB(s)->s_mount_opt &= ~(1 << REISERFS_BARRIER_FLUSH); + printk("reiserfs: disabling flush barriers on %s\n", reiserfs_bdevname(s)); +} + static struct reiserfs_bitmap_node * allocate_bitmap_node(struct super_block *p_s_sb) { struct reiserfs_bitmap_node *bn ; @@ -367,6 +389,7 @@ static void free_cnode(struct super_bloc static int clear_prepared_bits(struct buffer_head *bh) { clear_bit(BH_JPrepared, &bh->b_state) ; + clear_bit(BH_JRestore_dirty, &bh->b_state) ; return 0 ; } @@ -408,7 +431,6 @@ void reiserfs_check_lock_depth(char *cal #ifdef CONFIG_SMP if (current->lock_depth < 0) { printk("%s called without kernel lock held\n", caller) ; - show_reiserfs_locks() ; BUG() ; } #else @@ -444,52 +466,6 @@ static inline struct reiserfs_journal_cn return cn ; } -/* once upon a time, the journal would deadlock. a lot. Now, when -** CONFIG_REISERFS_CHECK is defined, anytime someone enters a -** transaction, it pushes itself into this ugly static list, and pops -** itself off before calling journal_end. I made a SysRq key to dump -** the list, and tell me what the writers are when I'm deadlocked. */ - - /* are you depending on the compiler - to optimize this function away - everywhere it is called? It is not - obvious how this works, but I - suppose debugging code need not be - clear. 
-Hans */ -static char *journal_writers[512] ; -int push_journal_writer(char *s) { -#ifdef CONFIG_REISERFS_CHECK - int i ; - for (i = 0 ; i < 512 ; i++) { - if (!journal_writers[i]) { - journal_writers[i] = s ; - return i ; - } - } - return -1 ; -#else - return 0 ; -#endif -} -int pop_journal_writer(int index) { -#ifdef CONFIG_REISERFS_CHECK - if (index >= 0) { - journal_writers[index] = NULL ; - } -#endif - return 0 ; -} - -int dump_journal_writers(void) { - int i ; - for (i = 0 ; i < 512 ; i++) { - if (journal_writers[i]) { - printk("%d: %s\n", i, journal_writers[i]) ; - } - } - return 0 ; -} - /* ** this actually means 'can this block be reallocated yet?'. If you set search_all, a block can only be allocated ** if it is not in the current transaction, was not freed by the current transaction, and has no chance of ever @@ -518,11 +494,6 @@ int reiserfs_in_journal(struct super_blo *next_zero_bit = 0 ; /* always start this at zero. */ - /* we aren't logging all blocks are safe for reuse */ - if (reiserfs_dont_log(p_s_sb)) { - return 0 ; - } - PROC_INFO_INC( p_s_sb, journal.in_journal ); /* If we aren't doing a search_all, this is a metablock, and it will be logged before use. ** if we crash before the transaction that freed it commits, this transaction won't @@ -550,6 +521,7 @@ int reiserfs_in_journal(struct super_blo /* is it in the current transaction. This should never happen */ if ((cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, bl))) { + BUG(); return 1; } @@ -574,18 +546,30 @@ inline void insert_journal_hash(struct r /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { - PROC_INFO_INC( p_s_sb, journal.lock_journal ); - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { - PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); - sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; - } - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 1) ; + PROC_INFO_INC( p_s_sb, journal.lock_journal ); + down(&SB_JOURNAL(p_s_sb)->j_lock); } /* unlock the current transaction */ inline static void unlock_journal(struct super_block *p_s_sb) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_wlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + up(&SB_JOURNAL(p_s_sb)->j_lock); +} + +static inline void get_journal_list(struct reiserfs_journal_list *jl) +{ + jl->j_refcount++; +} + +static inline void put_journal_list(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + if (jl->j_refcount < 1) { + printk("trans id %lu, refcount at %d\n", jl->j_trans_id, + jl->j_refcount); + BUG(); + } + if (--jl->j_refcount == 0) + reiserfs_kfree(jl, sizeof(struct reiserfs_journal_list), s); } /* @@ -603,6 +587,341 @@ static void cleanup_freed_for_journal_li jl->j_list_bitmap = NULL ; } +static int journal_list_still_alive(struct super_block *s, + unsigned long trans_id) +{ + struct list_head *entry = &SB_JOURNAL(s)->j_journal_list; + struct reiserfs_journal_list *jl; + + if (!list_empty(entry)) { + jl = JOURNAL_LIST_ENTRY(entry->next); + if (jl->j_trans_id <= trans_id) { + return 1; + } + } + return 0; +} + +static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { + char b[BDEVNAME_SIZE]; + + if (buffer_journaled(bh)) { + reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n", + bh->b_blocknr, bdevname(bh->b_bdev, b)) ; + } + if (uptodate) + set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void reiserfs_end_ordered_io(struct buffer_head *bh, int uptodate) { + if (uptodate) + 
set_buffer_uptodate(bh) ; + else + clear_buffer_uptodate(bh) ; + unlock_buffer(bh) ; + put_bh(bh) ; +} + +static void submit_logged_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_buffer_io_sync ; + mark_buffer_notjournal_new(bh) ; + clear_buffer_dirty(bh) ; + if (!test_and_clear_bit(BH_JTest, &bh->b_state)) + BUG(); + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static void submit_ordered_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + submit_bh(WRITE, bh) ; +} + +static int submit_barrier_buffer(struct buffer_head *bh) { + get_bh(bh) ; + bh->b_end_io = reiserfs_end_ordered_io; + clear_buffer_dirty(bh) ; + if (!buffer_uptodate(bh)) + BUG(); + return submit_bh(WRITE_BARRIER, bh) ; +} + +#define CHUNK_SIZE 32 +struct buffer_chunk { + struct buffer_head *bh[CHUNK_SIZE]; + int nr; +}; + +static void write_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_logged_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static void write_ordered_chunk(struct buffer_chunk *chunk) { + int i; + for (i = 0; i < chunk->nr ; i++) { + submit_ordered_buffer(chunk->bh[i]) ; + } + chunk->nr = 0; +} + +static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, + spinlock_t *lock, + void (fn)(struct buffer_chunk *)) +{ + int ret = 0; + if (chunk->nr >= CHUNK_SIZE) + BUG(); + chunk->bh[chunk->nr++] = bh; + if (chunk->nr >= CHUNK_SIZE) { + ret = 1; + if (lock) + spin_unlock(lock); + fn(chunk); + if (lock) + spin_lock(lock); + } + return ret; +} + + +atomic_t nr_reiserfs_jh = ATOMIC_INIT(0); +static struct reiserfs_jh *alloc_jh(void) { + struct reiserfs_jh *jh; + while(1) { + jh = kmalloc(sizeof(*jh), GFP_NOFS); + if (jh) { + atomic_inc(&nr_reiserfs_jh); + return jh; + } + yield(); + } +} + +/* + * we want to free the jh when the buffer has been written + * and waited on + */ +void reiserfs_free_jh(struct buffer_head *bh) { + struct reiserfs_jh *jh; + + jh = bh->b_private; + if (jh) { + bh->b_private = NULL; + jh->bh = NULL; + list_del_init(&jh->list); + kfree(jh); + if (atomic_read(&nr_reiserfs_jh) <= 0) + BUG(); + atomic_dec(&nr_reiserfs_jh); + put_bh(bh); + } +} + +static inline int __add_jh(struct reiserfs_journal *j, struct buffer_head *bh, + int tail) +{ + struct reiserfs_jh *jh; + + if (bh->b_private) { + spin_lock(&j->j_dirty_buffers_lock); + if (!bh->b_private) { + spin_unlock(&j->j_dirty_buffers_lock); + goto no_jh; + } + jh = bh->b_private; + list_del_init(&jh->list); + } else { +no_jh: + get_bh(bh); + jh = alloc_jh(); + spin_lock(&j->j_dirty_buffers_lock); + /* buffer must be locked for __add_jh, should be able to have + * two adds at the same time + */ + if (bh->b_private) + BUG(); + jh->bh = bh; + bh->b_private = jh; + } + jh->jl = j->j_current_jl; + if (tail) + list_add_tail(&jh->list, &jh->jl->j_tail_bh_list); + else { + list_add_tail(&jh->list, &jh->jl->j_bh_list); + } + spin_unlock(&j->j_dirty_buffers_lock); + return 0; +} + +int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 1); +} +int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh) { + return __add_jh(SB_JOURNAL(inode->i_sb), bh, 0); +} + +#define JH_ENTRY(l) list_entry((l), struct reiserfs_jh, list) +static int write_ordered_buffers(spinlock_t *lock, + struct reiserfs_journal *j, + struct reiserfs_journal_list *jl, + struct list_head *list) 
+{
+    struct buffer_head *bh;
+    struct reiserfs_jh *jh;
+    int ret = 0;
+    struct buffer_chunk chunk;
+    struct list_head tmp;
+    INIT_LIST_HEAD(&tmp);
+
+    chunk.nr = 0;
+    spin_lock(lock);
+    while(!list_empty(list)) {
+	jh = JH_ENTRY(list->next);
+	bh = jh->bh;
+	get_bh(bh);
+	if (test_set_buffer_locked(bh)) {
+	    if (!buffer_dirty(bh)) {
+		list_del_init(&jh->list);
+		list_add(&jh->list, &tmp);
+		goto loop_next;
+	    }
+	    spin_unlock(lock);
+	    if (chunk.nr)
+		write_ordered_chunk(&chunk);
+	    wait_on_buffer(bh);
+	    if (need_resched())
+		schedule();
+	    spin_lock(lock);
+	    goto loop_next;
+	}
+	if (buffer_dirty(bh)) {
+	    list_del_init(&jh->list);
+	    list_add(&jh->list, &tmp);
+	    add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
+	} else {
+	    reiserfs_free_jh(bh);
+	    unlock_buffer(bh);
+	}
+loop_next:
+	put_bh(bh);
+	if (chunk.nr == 0 && need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    if (chunk.nr) {
+	spin_unlock(lock);
+	write_ordered_chunk(&chunk);
+	spin_lock(lock);
+    }
+    while(!list_empty(&tmp)) {
+	jh = JH_ENTRY(tmp.prev);
+	bh = jh->bh;
+	get_bh(bh);
+	reiserfs_free_jh(bh);
+
+	if (buffer_locked(bh)) {
+	    spin_unlock(lock);
+	    wait_on_buffer(bh);
+	    spin_lock(lock);
+	}
+	if (!buffer_uptodate(bh))
+	    ret = -EIO;
+	put_bh(bh);
+	if (need_resched()) {
+	    spin_unlock(lock);
+	    schedule();
+	    spin_lock(lock);
+	}
+    }
+    spin_unlock(lock);
+    return ret;
+}
+
+static int flush_older_commits(struct super_block *s, struct reiserfs_journal_list *jl) {
+    struct reiserfs_journal_list *other_jl;
+    struct reiserfs_journal_list *first_jl;
+    struct list_head *entry;
+    unsigned long trans_id = jl->j_trans_id;
+    unsigned long other_trans_id;
+    unsigned long first_trans_id;
+
+find_first:
+    /*
+     * first we walk backwards to find the oldest uncommitted transaction
+     */
+    first_jl = jl;
+    entry = jl->j_list.prev;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	if (entry == &SB_JOURNAL(s)->j_journal_list ||
+	    atomic_read(&other_jl->j_older_commits_done))
+	    break;
+
+	first_jl = other_jl;
+	entry = other_jl->j_list.prev;
+    }
+
+    /* if we didn't find any older uncommitted transactions, return now */
+    if (first_jl == jl) {
+	return 0;
+    }
+
+    first_trans_id = first_jl->j_trans_id;
+
+    entry = &first_jl->j_list;
+    while(1) {
+	other_jl = JOURNAL_LIST_ENTRY(entry);
+	other_trans_id = other_jl->j_trans_id;
+
+	if (other_trans_id < trans_id) {
+	    if (atomic_read(&other_jl->j_commit_left) != 0) {
+		flush_commit_list(s, other_jl, 0);
+
+		/* list we were called with is gone, return */
+		if (!journal_list_still_alive(s, trans_id))
+		    return 1;
+
+		/* the one we just flushed is gone, this means all
+		 * older lists are also gone, so first_jl is no longer
+		 * valid either.  Go back to the beginning.
+		 */
+		if (!journal_list_still_alive(s, other_trans_id)) {
+		    goto find_first;
+		}
+	    }
+	    entry = entry->next;
+	    if (entry == &SB_JOURNAL(s)->j_journal_list)
+		return 0;
+	} else {
+	    return 0;
+	}
+    }
+    return 0;
+}
+int reiserfs_async_progress_wait(struct super_block *s) {
+    DEFINE_WAIT(wait);
+    struct reiserfs_journal *j = SB_JOURNAL(s);
+    if (atomic_read(&j->j_async_throttle))
+	blk_congestion_wait(WRITE, HZ/10);
+    return 0;
+}
+
 /*
 ** if this journal list still has commit blocks unflushed, send them to disk.
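The buffer_chunk pattern used by write_ordered_buffers() above batches up to CHUNK_SIZE submissions so the spinlock isn't held across each individual submit. A stripped-down model of just the batching (ints instead of buffer_heads, no locking; illustration only):

#include <stdio.h>

#define CHUNK_SIZE 32

struct chunk {
    int nr;
    int bufs[CHUNK_SIZE];
};

static void write_chunk(struct chunk *c)
{
    /* one batched submit instead of CHUNK_SIZE single ones */
    printf("submitting %d buffers\n", c->nr);
    c->nr = 0;
}

static void add_to_chunk(struct chunk *c, int buf)
{
    c->bufs[c->nr++] = buf;
    if (c->nr == CHUNK_SIZE)      /* flush once the batch is full */
        write_chunk(c);
}

int main(void)
{
    struct chunk c = { .nr = 0 };
    int i;

    for (i = 0; i < 70; i++)
        add_to_chunk(&c, i);
    if (c.nr)                     /* flush the partial final batch */
        write_chunk(&c);          /* prints 32, 32, then 6 */
    return 0;
}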
** @@ -611,13 +930,11 @@ static void cleanup_freed_for_journal_li ** */ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list *jl, int flushall) { - int i, count ; - int index = 0 ; + int i; int bn ; - int retry_count = 0 ; - int orig_commit_left = 0 ; struct buffer_head *tbh = NULL ; - struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; + int barrier = 0; reiserfs_check_lock_depth("flush_commit_list") ; @@ -628,133 +945,129 @@ static int flush_commit_list(struct supe /* before we can put our commit blocks on disk, we have to make sure everyone older than ** us is on disk too */ - if (jl->j_len <= 0) { - return 0 ; - } + if (jl->j_len <= 0) + BUG(); + if (trans_id == SB_JOURNAL(s)->j_trans_id) + BUG(); + + get_journal_list(jl); if (flushall) { - /* we _must_ make sure the transactions are committed in order. Start with the - ** index after this one, wrap all the way around - */ - index = (jl - SB_JOURNAL_LIST(s)) + 1 ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(s) + ( (index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl != jl && other_jl->j_len > 0 && other_jl->j_trans_id > 0 && - other_jl->j_trans_id <= jl->j_trans_id && (atomic_read(&(jl->j_older_commits_done)) == 0)) { - flush_commit_list(s, other_jl, 0) ; - } + if (flush_older_commits(s, jl) == 1) { + /* list disappeared during flush_older_commits. return */ + goto put_jl; } } - count = 0 ; - /* don't flush the commit list for the current transactoin */ - if (jl == ((SB_JOURNAL_LIST(s) + SB_JOURNAL_LIST_INDEX(s)))) { - return 0 ; - } - /* make sure nobody is trying to flush this one at the same time */ - if (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - if (flushall) { - atomic_set(&(jl->j_older_commits_done), 1) ; - } - return 0 ; - } - + down(&jl->j_commit_lock); + if (!journal_list_still_alive(s, trans_id)) { + up(&jl->j_commit_lock); + goto put_jl; + } + if (jl->j_trans_id == 0) + BUG(); + /* this commit is done, exit */ if (atomic_read(&(jl->j_commit_left)) <= 0) { if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - return 0 ; + up(&jl->j_commit_lock); + goto put_jl; } - /* keeps others from flushing while we are flushing */ - atomic_set(&(jl->j_commit_flushing), 1) ; - - if (jl->j_len > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-512: flush_commit_list: length is %lu, list number %d\n", jl->j_len, jl - SB_JOURNAL_LIST(s)) ; - return 0 ; - } - - orig_commit_left = atomic_read(&(jl->j_commit_left)) ; - - /* start by checking all the commit blocks in this transaction. - ** Add anyone not on disk into tbh. 
Stop checking once commit_left <= 1, because that means we - ** only have the commit block left - */ -retry: - count = 0 ; - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % SB_ONDISK_JOURNAL_SIZE(s); + if (!list_empty(&jl->j_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&SB_JOURNAL(s)->j_dirty_buffers_lock, + SB_JOURNAL(s), jl, &jl->j_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_bh_list)) + BUG(); + /* + * for the description block and all the log blocks, submit any buffers + * that haven't already reached the disk + */ + atomic_inc(&SB_JOURNAL(s)->j_async_throttle); + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % + SB_ONDISK_JOURNAL_SIZE(s); tbh = journal_find_get_block(s, bn) ; - -/* kill this sanity check */ -if (count > (orig_commit_left + 2)) { -reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_left(%d)!\n", count, orig_commit_left) ; -} - if (tbh) { - if (buffer_locked(tbh)) { /* wait on it, redo it just to make sure */ - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-584, buffer write failed\n") ; - } - } - if (buffer_dirty(tbh)) { - printk("journal-569: flush_commit_list, block already dirty!\n") ; - } else { - mark_buffer_dirty(tbh) ; - } - ll_rw_block(WRITE, 1, &tbh) ; - count++ ; - put_bh(tbh) ; /* once for our get_hash */ - } + if (buffer_dirty(tbh)) + ll_rw_block(WRITE, 1, &tbh) ; + put_bh(tbh) ; } + atomic_dec(&SB_JOURNAL(s)->j_async_throttle); - /* wait on everyone in tbh before writing commit block*/ - if (count > 0) { - for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && - i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ - bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; - tbh = journal_find_get_block(s, bn) ; - - wait_on_buffer(tbh) ; - if (!buffer_uptodate(tbh)) { - reiserfs_panic(s, "journal-601, buffer write failed\n") ; + /* wait on everything written so far before writing the commit + * if we are in barrier mode, send the commit down now + */ + barrier = reiserfs_barrier_flush(s); + if (barrier) { + int ret; + lock_buffer(jl->j_commit_bh); + ret = submit_barrier_buffer(jl->j_commit_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(jl->j_commit_bh); + disable_barrier(s); + barrier = 0; } - put_bh(tbh) ; /* once for our get_hash */ - bforget(tbh) ; /* once due to original getblk in do_journal_end */ - atomic_dec(&(jl->j_commit_left)) ; - } - } - - if (atomic_read(&(jl->j_commit_left)) != 1) { /* just the commit_bh left, flush it without calling getblk for everyone */ - if (retry_count < 2) { - printk("journal-582: flush_commit_list, not all log blocks on disk yet, trying again\n") ; - retry_count++ ; - goto retry; - } - reiserfs_panic(s, "journal-563: flush_commit_list: BAD, j_commit_left is %u, should be 1\n", - atomic_read(&(jl->j_commit_left))); } + for (i = 0 ; i < (jl->j_len + 1) ; i++) { + bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; + tbh = journal_find_get_block(s, bn) ; + wait_on_buffer(tbh) ; + // since we're using ll_rw_blk above, it might have skipped over + // a locked buffer. 
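The -EOPNOTSUPP handling above is a try-then-disable fallback for devices without flush/barrier support: attempt the barrier write once, and if the lower layers reject it, clear the feature and redo the write as an ordinary one, like disable_barrier() does. The same control flow in miniature (stub device, invented names):

#include <errno.h>
#include <stdio.h>

static int barriers_enabled = 1;

static int submit_barrier(int blocknr)
{
    return -EOPNOTSUPP;           /* stub: device has no barrier support */
}

static int submit_plain(int blocknr)
{
    printf("plain write of block %d\n", blocknr);
    return 0;
}

static int write_commit_block(int blocknr)
{
    if (barriers_enabled) {
        int ret = submit_barrier(blocknr);
        if (ret != -EOPNOTSUPP)
            return ret;
        barriers_enabled = 0;     /* one-shot disable, then fall back */
    }
    return submit_plain(blocknr);
}

int main(void)
{
    write_commit_block(123);      /* falls back, disables barriers */
    write_commit_block(124);      /* goes straight to the plain path */
    return 0;
}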
Double check here + // + if (buffer_dirty(tbh)) + sync_dirty_buffer(tbh); + if (!buffer_uptodate(tbh)) { + reiserfs_panic(s, "journal-601, buffer write failed\n") ; + } + put_bh(tbh) ; /* once for journal_find_get_block */ + put_bh(tbh) ; /* once due to original getblk in do_journal_end */ + atomic_dec(&(jl->j_commit_left)) ; + } + + if (atomic_read(&(jl->j_commit_left)) != 1) + BUG(); + + if (!barrier) { + if (buffer_dirty(jl->j_commit_bh)) + BUG(); + mark_buffer_dirty(jl->j_commit_bh) ; + sync_dirty_buffer(jl->j_commit_bh) ; + } else + wait_on_buffer(jl->j_commit_bh); - mark_buffer_dirty(jl->j_commit_bh) ; - sync_dirty_buffer(jl->j_commit_bh) ; if (!buffer_uptodate(jl->j_commit_bh)) { reiserfs_panic(s, "journal-615: buffer write failed\n") ; } - atomic_dec(&(jl->j_commit_left)) ; bforget(jl->j_commit_bh) ; + if (SB_JOURNAL(s)->j_last_commit_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_commit_id) != 1) { + reiserfs_warning("clm-2200: last commit %lu, current %lu\n", + SB_JOURNAL(s)->j_last_commit_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_commit_id = jl->j_trans_id; /* now, every commit block is on the disk. It is safe to allow blocks freed during this transaction to be reallocated */ cleanup_freed_for_journal_list(s, jl) ; + /* mark the metadata dirty */ + dirty_one_transaction(s, jl); + atomic_dec(&(jl->j_commit_left)) ; + if (flushall) { atomic_set(&(jl->j_older_commits_done), 1) ; } - atomic_set(&(jl->j_commit_flushing), 0) ; - wake_up(&(jl->j_commit_wait)) ; - - s->s_dirt = 1 ; + up(&jl->j_commit_lock); +put_jl: + put_journal_list(s, jl); + return 0 ; } @@ -829,8 +1142,22 @@ static int _update_journal_header_block( jh->j_last_flush_trans_id = cpu_to_le32(trans_id) ; jh->j_first_unflushed_offset = cpu_to_le32(offset) ; jh->j_mount_id = cpu_to_le32(SB_JOURNAL(p_s_sb)->j_mount_id) ; - set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; - sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + + if (reiserfs_barrier_flush(p_s_sb)) { + int ret; + lock_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + ret = submit_barrier_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + if (ret == -EOPNOTSUPP) { + set_buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh); + disable_barrier(p_s_sb); + goto sync; + } + wait_on_buffer(SB_JOURNAL(p_s_sb)->j_header_bh); + } else { +sync: + set_buffer_dirty(SB_JOURNAL(p_s_sb)->j_header_bh) ; + sync_dirty_buffer(SB_JOURNAL(p_s_sb)->j_header_bh) ; + } if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { printk( "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -851,45 +1178,35 @@ static int update_journal_header_block(s ** flush any and all journal lists older than you are ** can only be called from flush_journal_list */ -static int flush_older_journal_lists(struct super_block *p_s_sb, struct reiserfs_journal_list *jl, unsigned long trans_id) { - int i, index ; - struct reiserfs_journal_list *other_jl ; - - index = jl - SB_JOURNAL_LIST(p_s_sb) ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - other_jl = SB_JOURNAL_LIST(p_s_sb) + ((index + i) % JOURNAL_LIST_COUNT) ; - if (other_jl && other_jl->j_len > 0 && - other_jl->j_trans_id > 0 && - other_jl->j_trans_id < trans_id && - other_jl != jl) { - /* do not flush all */ - flush_journal_list(p_s_sb, other_jl, 0) ; +static int flush_older_journal_lists(struct super_block *p_s_sb, + struct reiserfs_journal_list *jl) +{ + struct list_head *entry; + struct reiserfs_journal_list *other_jl ; + unsigned long trans_id = jl->j_trans_id; + + /* we know we are the only ones flushing things, no extra 
race + * protection is required. + */ +restart: + entry = SB_JOURNAL(p_s_sb)->j_journal_list.next; + other_jl = JOURNAL_LIST_ENTRY(entry); + if (other_jl->j_trans_id < trans_id) { + /* do not flush all */ + flush_journal_list(p_s_sb, other_jl, 0) ; + + /* other_jl is now deleted from the list */ + goto restart; } - } - return 0 ; + return 0 ; } -static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - - if (buffer_journaled(bh)) { - reiserfs_warning("clm-2084: pinned buffer %lu:%s sent to disk\n", - bh->b_blocknr, bdevname(bh->b_bdev, b)) ; +static void del_from_work_list(struct super_block *s, + struct reiserfs_journal_list *jl) { + if (!list_empty(&jl->j_working_list)) { + list_del_init(&jl->j_working_list); + SB_JOURNAL(s)->j_num_work_lists--; } - if (uptodate) - set_buffer_uptodate(bh) ; - else - clear_buffer_uptodate(bh) ; - unlock_buffer(bh) ; - put_bh(bh) ; -} -static void submit_logged_buffer(struct buffer_head *bh) { - lock_buffer(bh) ; - get_bh(bh) ; - bh->b_end_io = reiserfs_end_buffer_io_sync ; - mark_buffer_notjournal_new(bh) ; - clear_buffer_dirty(bh) ; - submit_bh(WRITE, bh) ; } /* flush a journal list, both commit and real blocks @@ -912,29 +1229,26 @@ static int flush_journal_list(struct sup unsigned long j_len_saved = jl->j_len ; if (j_len_saved <= 0) { - return 0 ; + BUG(); } if (atomic_read(&SB_JOURNAL(s)->j_wcount) != 0) { reiserfs_warning("clm-2048: flush_journal_list called with wcount %d\n", atomic_read(&SB_JOURNAL(s)->j_wcount)) ; } - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; - } + if (jl->j_trans_id == 0) + BUG(); - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; + /* if flushall == 0, the lock is already held */ + if (flushall) { + down(&SB_JOURNAL(s)->j_flush_sem); + } else if (!down_trylock(&SB_JOURNAL(s)->j_flush_sem)) { + BUG(); + } count = 0 ; if (j_len_saved > SB_JOURNAL_TRANS_MAX(s)) { - reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, list number %d\n", j_len_saved, jl - SB_JOURNAL_LIST(s)) ; - atomic_dec(&(jl->j_flushing)) ; + reiserfs_panic(s, "journal-715: flush_journal_list, length is %lu, trans id %lu\n", j_len_saved, jl->j_trans_id); return 0 ; } @@ -949,6 +1263,9 @@ static int flush_journal_list(struct sup */ flush_commit_list(s, jl, 1) ; + if (!(jl->j_state & LIST_DIRTY)) + BUG(); + /* are we done now? */ if (atomic_read(&(jl->j_nonzerolen)) <= 0 && atomic_read(&(jl->j_commit_left)) <= 0) { @@ -984,13 +1301,13 @@ static int flush_journal_list(struct sup get_bh(saved_bh) ; if (buffer_journal_dirty(saved_bh)) { + if (!can_dirty(cn)) + BUG(); was_jwait = 1 ; - mark_buffer_notjournal_dirty(saved_bh) ; - /* undo the inc from journal_mark_dirty */ - put_bh(saved_bh) ; - } - if (can_dirty(cn)) { was_dirty = 1 ; + } else if (can_dirty(cn)) { + /* everything with !pjl && jwait should be writable */ + BUG(); } } @@ -998,7 +1315,8 @@ static int flush_journal_list(struct sup ** sure they are commited, and don't try writing it to disk */ if (pjl) { - flush_commit_list(s, pjl, 1) ; + if (atomic_read(&pjl->j_commit_left)) + flush_commit_list(s, pjl, 1) ; goto free_cnode ; } @@ -1017,22 +1335,17 @@ static int flush_journal_list(struct sup printk("journal-813: BAD! 
buffer %llu %cdirty %cjwait, not in a newer tranasction\n", (unsigned long long)saved_bh->b_blocknr, was_dirty ? ' ' : '!', was_jwait ? ' ' : '!') ; } - /* kupdate_one_transaction waits on the buffers it is writing, so we - ** should never see locked buffers here - */ - if (buffer_locked(saved_bh)) { - printk("clm-2083: locked buffer %llu in flush_journal_list\n", - (unsigned long long)saved_bh->b_blocknr) ; - wait_on_buffer(saved_bh) ; - if (!buffer_uptodate(saved_bh)) { - reiserfs_panic(s, "journal-923: buffer write failed\n") ; - } - } if (was_dirty) { /* we inc again because saved_bh gets decremented at free_cnode */ get_bh(saved_bh) ; set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(saved_bh) ; + lock_buffer(saved_bh); + if (cn->blocknr != saved_bh->b_blocknr) + BUG(); + if (buffer_dirty(saved_bh)) + submit_logged_buffer(saved_bh) ; + else + unlock_buffer(saved_bh); count++ ; } else { printk("clm-2082: Unable to flush buffer %llu in flush_journal_list\n", @@ -1063,6 +1376,14 @@ free_cnode: if (!buffer_uptodate(cn->bh)) { reiserfs_panic(s, "journal-949: buffer write failed\n") ; } + /* note, we must clear the JDirty_wait bit after the up to date + ** check, otherwise we race against our flushpage routine + */ + if (!test_and_clear_bit(BH_JDirty_wait, &cn->bh->b_state)) + BUG(); + + /* undo the inc from journal_mark_dirty */ + put_bh(cn->bh) ; brelse(cn->bh) ; } cn = cn->next ; @@ -1076,7 +1397,7 @@ flush_older_and_return: ** replayed after a crash */ if (flushall) { - flush_older_journal_lists(s, jl, jl->j_trans_id) ; + flush_older_journal_lists(s, jl); } /* before we can remove everything from the hash tables for this @@ -1091,181 +1412,224 @@ flush_older_and_return: update_journal_header_block(s, (jl->j_start + jl->j_len + 2) % SB_ONDISK_JOURNAL_SIZE(s), jl->j_trans_id) ; } remove_all_from_journal_list(s, jl, 0) ; + list_del(&jl->j_list); + SB_JOURNAL(s)->j_num_lists--; + del_from_work_list(s, jl); + + if (SB_JOURNAL(s)->j_last_flush_id != 0 && + (jl->j_trans_id - SB_JOURNAL(s)->j_last_flush_id) != 1) { + reiserfs_warning("clm-2201: last flush %lu, current %lu\n", + SB_JOURNAL(s)->j_last_flush_id, + jl->j_trans_id); + } + SB_JOURNAL(s)->j_last_flush_id = jl->j_trans_id; + + /* not strictly required since we are freeing the list, but it should + * help find code using dead lists later on + */ jl->j_len = 0 ; atomic_set(&(jl->j_nonzerolen), 0) ; jl->j_start = 0 ; jl->j_realblock = NULL ; jl->j_commit_bh = NULL ; jl->j_trans_id = 0 ; - atomic_dec(&(jl->j_flushing)) ; - wake_up(&(jl->j_flush_wait)) ; + jl->j_state = 0; + put_journal_list(s, jl); + if (flushall) + up(&SB_JOURNAL(s)->j_flush_sem); return 0 ; } - -static int kupdate_one_transaction(struct super_block *s, - struct reiserfs_journal_list *jl) +static int write_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl, + struct buffer_chunk *chunk) { - struct reiserfs_journal_list *pjl ; /* previous list for this cn */ - struct reiserfs_journal_cnode *cn, *walk_cn ; - b_blocknr_t blocknr ; - int run = 0 ; - int orig_trans_id = jl->j_trans_id ; - struct buffer_head *saved_bh ; + struct reiserfs_journal_cnode *cn; int ret = 0 ; - /* if someone is getting the commit list, we must wait for them */ - while (atomic_read(&(jl->j_commit_flushing))) { - sleep_on(&(jl->j_commit_wait)) ; - } - /* if someone is flushing this list, we must wait for them */ - while (atomic_read(&(jl->j_flushing))) { - sleep_on(&(jl->j_flush_wait)) ; + jl->j_state |= LIST_TOUCHED; + del_from_work_list(s, jl); + if (jl->j_len == 
0 || atomic_read(&jl->j_nonzerolen) == 0) { + return 0; } - /* was it flushed while we slept? */ - if (jl->j_len <= 0 || jl->j_trans_id != orig_trans_id) { - return 0 ; - } - - /* this list is now ours, we can change anything we want */ - atomic_set(&(jl->j_flushing), 1) ; -loop_start: cn = jl->j_realblock ; while(cn) { - saved_bh = NULL ; /* if the blocknr == 0, this has been cleared from the hash, ** skip it */ if (cn->blocknr == 0) { goto next ; } + if (cn->bh && can_dirty(cn) && buffer_dirty(cn->bh)) { + struct buffer_head *tmp_bh; + /* we can race against journal_mark_freed when we try + * to lock_buffer(cn->bh), so we have to inc the buffer + * count, and recheck things after locking + */ + tmp_bh = cn->bh; + get_bh(tmp_bh); + lock_buffer(tmp_bh); + if (cn->bh && can_dirty(cn) && buffer_dirty(tmp_bh)) { + if (!buffer_journal_dirty(tmp_bh) || + reiserfs_buffer_prepared(tmp_bh)) + BUG(); + add_to_chunk(chunk, tmp_bh, NULL, write_chunk); + ret++; + } else { + /* note, cn->bh might be null now */ + unlock_buffer(tmp_bh); + } + put_bh(tmp_bh); + } +next: + cn = cn->next ; + cond_resched(); + } + return ret ; +} + +/* used by flush_commit_list */ +static int dirty_one_transaction(struct super_block *s, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_journal_cnode *cn; + struct reiserfs_journal_list *pjl; + int ret = 0 ; + + jl->j_state |= LIST_DIRTY; + cn = jl->j_realblock ; + while(cn) { /* look for a more recent transaction that logged this ** buffer. Only the most recent transaction with a buffer in ** it is allowed to send that buffer to disk */ - pjl = find_newer_jl_for_cn(cn) ; - if (run == 0 && !pjl && cn->bh && buffer_journal_dirty(cn->bh) && - can_dirty(cn)) - { - if (!test_bit(BH_JPrepared, &cn->bh->b_state)) { - set_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - submit_logged_buffer(cn->bh) ; - } else { - /* someone else is using this buffer. We can't - ** send it to disk right now because they might - ** be changing/logging it. - */ - ret = 1 ; - } - } else if (test_bit(BLOCK_NEEDS_FLUSH, &cn->state)) { - clear_bit(BLOCK_NEEDS_FLUSH, &cn->state) ; - if (!pjl && cn->bh) { - wait_on_buffer(cn->bh) ; - } - /* check again, someone could have logged while we scheduled */ - pjl = find_newer_jl_for_cn(cn) ; - - /* before the JDirty_wait bit is set, the - ** buffer is added to the hash list. So, if we are - ** run in the middle of a do_journal_end, we will notice - ** if this buffer was logged and added from the latest - ** transaction. In this case, we don't want to decrement - ** b_count - */ - if (!pjl && cn->bh && buffer_journal_dirty(cn->bh)) { - blocknr = cn->blocknr ; - walk_cn = cn ; - saved_bh= cn->bh ; - /* update all older transactions to show this block - ** was flushed - */ - mark_buffer_notjournal_dirty(cn->bh) ; - while(walk_cn) { - if (walk_cn->bh && walk_cn->blocknr == blocknr && - walk_cn->sb == cn->sb) { - if (walk_cn->jlist) { - atomic_dec(&(walk_cn->jlist->j_nonzerolen)) ; - } - walk_cn->bh = NULL ; - } - walk_cn = walk_cn->hnext ; - } - if (atomic_read(&saved_bh->b_count) < 1) { - reiserfs_warning("clm-2081: bad count on %lu\n", - saved_bh->b_blocknr) ; - } - brelse(saved_bh) ; - } - } - /* - ** if the more recent transaction is committed to the log, - ** this buffer can be considered flushed. Decrement our - ** counters to reflect one less buffer that needs writing. - ** - ** note, this relies on all of the above code being - ** schedule free once pjl comes back non-null. 
- */ - if (pjl && cn->bh && atomic_read(&pjl->j_commit_left) == 0) { - atomic_dec(&cn->jlist->j_nonzerolen) ; - cn->bh = NULL ; + pjl = find_newer_jl_for_cn(cn) ; + if (!pjl && cn->blocknr && cn->bh && buffer_journal_dirty(cn->bh)) + { + if (!can_dirty(cn)) + BUG(); + /* if the buffer is prepared, it will either be logged + * or restored. If restored, we need to make sure + * it actually gets marked dirty + */ + mark_buffer_notjournal_new(cn->bh) ; + if (test_bit(BH_JPrepared, &cn->bh->b_state)) { + set_bit(BH_JRestore_dirty, &cn->bh->b_state); + } else { + set_bit(BH_JTest, &cn->bh->b_state); + mark_buffer_dirty(cn->bh); + } } -next: cn = cn->next ; } - /* the first run through the loop sends all the dirty buffers to - ** ll_rw_block. - ** the second run through the loop does all the accounting - */ - if (run++ == 0) { - goto loop_start ; - } - - atomic_set(&(jl->j_flushing), 0) ; - wake_up(&(jl->j_flush_wait)) ; return ret ; } -/* since we never give dirty buffers to bdflush/kupdate, we have to -** flush them ourselves. This runs through the journal lists, finds -** old metadata in need of flushing and sends it to disk. -** this does not end transactions, commit anything, or free -** cnodes. -** -** returns the highest transaction id that was flushed last time -*/ -static unsigned long reiserfs_journal_kupdate(struct super_block *s) { - struct reiserfs_journal_list *jl ; - int i ; - int start ; - time_t age ; - int ret = 0 ; - start = SB_JOURNAL_LIST_INDEX(s) ; +static int kupdate_transactions(struct super_block *s, + struct reiserfs_journal_list *jl, + struct reiserfs_journal_list **next_jl, + unsigned long *next_trans_id, + int num_blocks, + int num_trans) { + int ret = 0; + int written = 0 ; + int transactions_flushed = 0; + unsigned long orig_trans_id = jl->j_trans_id; + struct buffer_chunk chunk; + struct list_head *entry; + chunk.nr = 0; + + down(&SB_JOURNAL(s)->j_flush_sem); + if (!journal_list_still_alive(s, orig_trans_id)) { + goto done; + } + + /* we've got j_flush_sem held, nobody is going to delete any + * of these lists out from underneath us + */ + while((num_trans && transactions_flushed < num_trans) || + (!num_trans && written < num_blocks)) { + + if (jl->j_len == 0 || (jl->j_state & LIST_TOUCHED) || + atomic_read(&jl->j_commit_left)) + { + del_from_work_list(s, jl); + break; + } + ret = write_one_transaction(s, jl, &chunk); - /* safety check to prevent flush attempts during a mount */ - if (start < 0) { - return 0 ; - } - i = (start + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - jl = SB_JOURNAL_LIST(s) + i ; - age = get_seconds() - jl->j_timestamp ; - if (jl->j_len > 0 && // age >= (JOURNAL_MAX_COMMIT_AGE * 2) && - atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - - if (jl->j_trans_id == SB_JOURNAL(s)->j_trans_id) { - break ; - } - /* if ret was already 1, we want to preserve that */ - ret |= kupdate_one_transaction(s, jl) ; - } - if (atomic_read(&(jl->j_nonzerolen)) > 0) { - ret |= 1 ; + if (ret < 0) + goto done; + transactions_flushed++; + written += ret; + entry = jl->j_list.next; + + /* did we wrap? 
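** (reaching the list head again means every journal list has been scanned and there is nothing newer left to write back)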
*/
+    if (entry == &SB_JOURNAL(s)->j_journal_list) {
+      break;
     }
+    jl = JOURNAL_LIST_ENTRY(entry);
+
+    /* don't bother with older transactions */
+    if (jl->j_trans_id <= orig_trans_id)
+      break;
   }
+  if (chunk.nr) {
+    write_chunk(&chunk);
+  }
+
+done:
+  up(&SB_JOURNAL(s)->j_flush_sem);
+  return ret;
+}
+
+/* for o_sync and fsync heavy applications, they tend to use
+** all the journal list slots with tiny transactions.  These
+** trigger lots and lots of calls to update the header block, which
+** adds seeks and slows things down.
+**
+** This function tries to clear out a large chunk of the journal lists
+** at once, which makes everything faster since only the newest journal
+** list updates the header block
+*/
+static int flush_used_journal_lists(struct super_block *s,
+                                    struct reiserfs_journal_list *jl) {
+  unsigned long len = 0;
+  unsigned long cur_len;
+  int ret;
+  int i;
+  struct reiserfs_journal_list *tjl;
+  struct reiserfs_journal_list *flush_jl;
+  unsigned long trans_id;
+
+  flush_jl = tjl = jl;
+
+  /* flush for 256 transactions or 256 blocks, whichever comes first */
+  for(i = 0 ; i < 256 && len < 256 ; i++) {
+    if (atomic_read(&tjl->j_commit_left) ||
+        tjl->j_trans_id < jl->j_trans_id) {
+      break;
+    }
+    cur_len = atomic_read(&tjl->j_nonzerolen);
+    if (cur_len > 0) {
+      tjl->j_state &= ~LIST_TOUCHED;
+    }
+    len += cur_len;
+    flush_jl = tjl;
+    if (tjl->j_list.next == &SB_JOURNAL(s)->j_journal_list)
+      break;
+    tjl = JOURNAL_LIST_ENTRY(tjl->j_list.next);
+  }
+  /* try to find a group of blocks we can flush across all the
+  ** transactions, but only bother if we've actually spanned
+  ** across multiple lists
+  */
+  if (flush_jl != jl) {
+    ret = kupdate_transactions(s, jl, &tjl, &trans_id, len, i);
+  }
+  flush_journal_list(s, flush_jl, 1);
+  return 0;
 }
 
 /*
@@ -1309,6 +1673,10 @@ void remove_journal_hash(struct super_bl
 }
 
 static void free_journal_ram(struct super_block *p_s_sb) {
+  reiserfs_kfree(SB_JOURNAL(p_s_sb)->j_current_jl,
+                 sizeof(struct reiserfs_journal_list), p_s_sb);
+  SB_JOURNAL(p_s_sb)->j_num_lists--;
+
   vfree(SB_JOURNAL(p_s_sb)->j_cnode_free_orig) ;
   free_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap) ;
   free_bitmap_nodes(p_s_sb) ; /* must be after free_list_bitmaps */
@@ -1439,7 +1807,7 @@ static int journal_transaction_is_valid(
   }
   brelse(c_bh) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1006: found valid "
-                 "transaction start offset %lu, len %d id %d\n",
+                 "transaction start offset %llu, len %d id %d\n",
                  d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
                  get_desc_trans_len(desc), get_desc_trans_id(desc)) ;
   return 1 ;
@@ -1479,7 +1847,7 @@ static int journal_read_transaction(stru
   desc = (struct reiserfs_journal_desc *)d_bh->b_data ;
   trans_offset = d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) ;
   reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1037: "
-                 "journal_read_transaction, offset %lu, len %d mount_id %d\n",
+                 "journal_read_transaction, offset %llu, len %d mount_id %d\n",
                  d_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb),
                  get_desc_trans_len(desc), get_desc_mount_id(desc)) ;
   if (get_desc_trans_id(desc) < oldest_trans_id) {
@@ -1507,7 +1875,7 @@ static int journal_read_transaction(stru
   commit = (struct reiserfs_journal_commit *)c_bh->b_data ;
   if (journal_compare_desc_commit(p_s_sb, desc, commit)) {
     reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal_read_transaction, "
-                   "commit offset %ld had bad time %d or length %d\n",
+                   "commit offset %llu had bad time %d or length %d\n",
c_bh->b_blocknr - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), get_commit_trans_id(commit), get_commit_trans_len(commit)); brelse(c_bh) ; @@ -1675,7 +2043,7 @@ static int journal_read(struct super_blo printk("reiserfs: checking transaction log (%s) for (%s)\n", bdevname(SB_JOURNAL(p_s_sb)->j_dev_bd, b), reiserfs_bdevname(p_s_sb)); - start = get_seconds() ; + start = get_seconds(); /* step 1, read in the journal header block. Check the transaction it says ** is the first unflushed, and if that transaction is not valid, @@ -1735,7 +2103,7 @@ static int journal_read(struct super_blo oldest_start = d_bh->b_blocknr ; newest_mount_id = get_desc_mount_id(desc) ; reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1179: Setting " - "oldest_start to offset %lu, trans_id %lu\n", + "oldest_start to offset %llu, trans_id %lu\n", oldest_start - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; } else if (oldest_trans_id > get_desc_trans_id(desc)) { @@ -1763,7 +2131,7 @@ start_log_replay: cur_dblock = oldest_start ; if (oldest_trans_id) { reiserfs_debug(p_s_sb, REISERFS_DEBUG_CODE, "journal-1206: Starting replay " - "from offset %lu, trans_id %lu\n", + "from offset %llu, trans_id %lu\n", cur_dblock - SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb), oldest_trans_id) ; @@ -1817,70 +2185,28 @@ start_log_replay: return 0 ; } - -struct reiserfs_journal_commit_task { - struct super_block *p_s_sb ; - int jindex ; - int wake_on_finish ; /* if this is one, we wake the task_done queue, if it - ** is zero, we free the whole struct on finish - */ - struct reiserfs_journal_commit_task *self ; - struct work_struct work; -} ; - -static void reiserfs_journal_commit_task_func(void *__ct) { - struct reiserfs_journal_commit_task *ct = __ct; - struct reiserfs_journal_list *jl ; - - reiserfs_write_lock(ct->p_s_sb); - - jl = SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex ; - - flush_commit_list(ct->p_s_sb, SB_JOURNAL_LIST(ct->p_s_sb) + ct->jindex, 1) ; - - if (jl->j_len > 0 && atomic_read(&(jl->j_nonzerolen)) > 0 && - atomic_read(&(jl->j_commit_left)) == 0) { - kupdate_one_transaction(ct->p_s_sb, jl) ; - } - reiserfs_kfree(ct->self, sizeof(struct reiserfs_journal_commit_task), ct->p_s_sb) ; - reiserfs_write_unlock(ct->p_s_sb); -} - -static void setup_commit_task_arg(struct reiserfs_journal_commit_task *ct, - struct super_block *p_s_sb, - int jindex) { - if (!ct) { - reiserfs_panic(NULL, "journal-1360: setup_commit_task_arg called with NULL struct\n") ; - } - ct->p_s_sb = p_s_sb ; - ct->jindex = jindex ; - INIT_WORK(&ct->work, reiserfs_journal_commit_task_func, ct); - ct->self = ct ; -} - -static void commit_flush_async(struct super_block *p_s_sb, int jindex) { - struct reiserfs_journal_commit_task *ct ; - /* using GFP_NOFS, GFP_KERNEL could try to flush inodes, which will try - ** to start/join a transaction, which will deadlock - */ - ct = reiserfs_kmalloc(sizeof(struct reiserfs_journal_commit_task), GFP_NOFS, p_s_sb) ; - if (ct) { - setup_commit_task_arg(ct, p_s_sb, jindex) ; - queue_work(commit_wq, &ct->work) ; - } else { -#ifdef CONFIG_REISERFS_CHECK - reiserfs_warning("journal-1540: kmalloc failed, doing sync commit\n") ; -#endif - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ; - } +static struct reiserfs_journal_list *alloc_journal_list(struct super_block *s) +{ + struct reiserfs_journal_list *jl; +retry: + jl = reiserfs_kmalloc(sizeof(struct reiserfs_journal_list), GFP_NOFS, s); + if (!jl) { + yield(); + goto retry; + } + memset(jl, 0, sizeof(*jl)); + INIT_LIST_HEAD(&jl->j_list); + INIT_LIST_HEAD(&jl->j_working_list); 
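+  /* the two bh lists initialized below carry ordered-mode data buffers
+  ** tied to this transaction: flush_commit_list drains j_bh_list through
+  ** write_ordered_buffers() before any log block is written, and
+  ** j_tail_bh_list appears to serve tail conversion the same way.
+  */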
+ INIT_LIST_HEAD(&jl->j_tail_bh_list); + INIT_LIST_HEAD(&jl->j_bh_list); + sema_init(&jl->j_commit_lock, 1); + SB_JOURNAL(s)->j_num_lists++; + get_journal_list(jl); + return jl; } static void journal_list_init(struct super_block *p_s_sb) { - int i ; - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_commit_wait)) ; - init_waitqueue_head(&(SB_JOURNAL_LIST(p_s_sb)[i].j_flush_wait)) ; - } + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); } static int release_journal_dev( struct super_block *super, @@ -1971,6 +2297,7 @@ int journal_init(struct super_block *p_s struct reiserfs_super_block * rs; struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; + struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; journal = SB_JOURNAL(p_s_sb) = vmalloc(sizeof (struct reiserfs_journal)) ; @@ -1981,6 +2308,8 @@ int journal_init(struct super_block *p_s memset(journal, 0, sizeof(struct reiserfs_journal)) ; INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_bitmap_nodes) ; INIT_LIST_HEAD (&SB_JOURNAL(p_s_sb)->j_prealloc_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_working_list); + INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_journal_list); reiserfs_allocate_list_bitmaps(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_bitmap, SB_BMAP_NR(p_s_sb)) ; allocate_bitmap_nodes(p_s_sb) ; @@ -2088,14 +2417,9 @@ int journal_init(struct super_block *p_s brelse (bhjh); SB_JOURNAL(p_s_sb)->j_list_bitmap_index = 0 ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = -10000 ; /* make sure flush_old_commits does not try to flush a list while replay is on */ - - /* clear out the journal list array */ - memset(SB_JOURNAL_LIST(p_s_sb), 0, sizeof(struct reiserfs_journal_list) * JOURNAL_LIST_COUNT) ; journal_list_init(p_s_sb) ; memset(SB_JOURNAL(p_s_sb)->j_list_hash_table, 0, JOURNAL_HASH_SIZE * sizeof(struct reiserfs_journal_cnode *)) ; - memset(journal_writers, 0, sizeof(char *) * 512) ; /* debug code */ INIT_LIST_HEAD(&SB_JOURNAL(p_s_sb)->j_dirty_buffers) ; spin_lock_init(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock) ; @@ -2104,18 +2428,19 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; atomic_set(&(SB_JOURNAL(p_s_sb)->j_wcount), 0) ; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_async_throttle), 0) ; SB_JOURNAL(p_s_sb)->j_bcount = 0 ; SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; SB_JOURNAL(p_s_sb)->j_last = NULL ; SB_JOURNAL(p_s_sb)->j_first = NULL ; init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - init_waitqueue_head(&(SB_JOURNAL(p_s_sb)->j_wait)) ; + sema_init(&SB_JOURNAL(p_s_sb)->j_lock, 1); + sema_init(&SB_JOURNAL(p_s_sb)->j_flush_sem, 1); SB_JOURNAL(p_s_sb)->j_trans_id = 10 ; SB_JOURNAL(p_s_sb)->j_mount_id = 10 ; SB_JOURNAL(p_s_sb)->j_state = 0 ; atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_wlock), 0) ; SB_JOURNAL(p_s_sb)->j_cnode_free_list = allocate_cnodes(num_cnodes) ; SB_JOURNAL(p_s_sb)->j_cnode_free_orig = SB_JOURNAL(p_s_sb)->j_cnode_free_list ; SB_JOURNAL(p_s_sb)->j_cnode_free = SB_JOURNAL(p_s_sb)->j_cnode_free_list ? 
num_cnodes : 0 ; @@ -2123,8 +2448,9 @@ int journal_init(struct super_block *p_s SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; init_journal_hash(p_s_sb) ; - SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb)) ; - if (!(SB_JOURNAL_LIST(p_s_sb)[0].j_list_bitmap)) { + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + jl->j_list_bitmap = get_list_bitmap(p_s_sb, jl); + if (!jl->j_list_bitmap) { reiserfs_warning("journal-2005, get_list_bitmap failed for journal list 0\n") ; goto free_and_return; } @@ -2132,16 +2458,12 @@ int journal_init(struct super_block *p_s reiserfs_warning("Replay Failure, unable to mount\n") ; goto free_and_return; } - SB_JOURNAL_LIST_INDEX(p_s_sb) = 0 ; /* once the read is done, we can set this - where it belongs */ - - if (reiserfs_dont_log (p_s_sb)) - return 0; reiserfs_mounted_fs_count++ ; if (reiserfs_mounted_fs_count <= 1) commit_wq = create_workqueue("reiserfs"); + INIT_WORK(&journal->j_work, flush_async_commits, p_s_sb); return 0 ; free_and_return: free_journal_ram(p_s_sb); @@ -2155,7 +2477,8 @@ free_and_return: */ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, int new_alloc) { time_t now = get_seconds() ; - if (reiserfs_dont_log(th->t_super)) + /* cannot restart while nested */ + if (th->t_refcount > 1) return 0 ; if ( SB_JOURNAL(th->t_super)->j_must_wait > 0 || (SB_JOURNAL(th->t_super)->j_len_alloc + new_alloc) >= SB_JOURNAL_MAX_BATCH(th->t_super) || @@ -2193,6 +2516,35 @@ void reiserfs_wait_on_write_block(struct !test_bit(WRITERS_BLOCKED, &SB_JOURNAL(s)->j_state)) ; } +static void queue_log_writer(struct super_block *s) { + set_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state); + sleep_on(&SB_JOURNAL(s)->j_join_wait); +} + +static void wake_queued_writers(struct super_block *s) { + if (test_and_clear_bit(WRITERS_QUEUED, &SB_JOURNAL(s)->j_state)) + wake_up(&SB_JOURNAL(s)->j_join_wait); +} + +static void let_transaction_grow(struct super_block *sb, + unsigned long trans_id) +{ + unsigned long bcount = SB_JOURNAL(sb)->j_bcount; + while(1) { + yield(); + while ((atomic_read(&SB_JOURNAL(sb)->j_wcount) > 0 || + atomic_read(&SB_JOURNAL(sb)->j_jlock)) && + SB_JOURNAL(sb)->j_trans_id == trans_id) { + queue_log_writer(sb); + } + if (SB_JOURNAL(sb)->j_trans_id != trans_id) + break; + if (bcount == SB_JOURNAL(sb)->j_bcount) + break; + bcount = SB_JOURNAL(sb)->j_bcount; + } +} + /* join == true if you must join an existing transaction. 
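** (journal_join passes join == 1, journal_begin passes join == 0.  For
** code that may already be running inside a transaction, the new
** reiserfs_persistent_transaction() below wraps this up; roughly:
**
**   th = reiserfs_persistent_transaction(s, nblocks) ;
**   ...mark buffers dirty in the usual way...
**   reiserfs_end_persistent_transaction(th) ;
**
** handles are refcounted, so a nested begin reuses the running handle
** instead of starting a second transaction.)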
** join == false if you can deal with waiting for others to finish ** @@ -2202,92 +2554,170 @@ void reiserfs_wait_on_write_block(struct static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb,unsigned long nblocks,int join) { time_t now = get_seconds() ; int old_trans_id ; + struct reiserfs_journal *journal = SB_JOURNAL(p_s_sb); + struct reiserfs_transaction_handle myth; + int sched_count = 0; reiserfs_check_lock_depth("journal_begin") ; RFALSE( p_s_sb->s_flags & MS_RDONLY, "clm-2078: calling journal_begin on readonly FS") ; - if (reiserfs_dont_log(p_s_sb)) { - th->t_super = p_s_sb ; /* others will check this for the don't log flag */ - return 0 ; - } PROC_INFO_INC( p_s_sb, journal.journal_being ); + /* set here for journal_join */ + th->t_refcount = 1; + th->t_super = p_s_sb ; relock: lock_journal(p_s_sb) ; + journal->j_bcount++; - if (test_bit(WRITERS_BLOCKED, &SB_JOURNAL(p_s_sb)->j_state)) { + if (test_bit(WRITERS_BLOCKED, &journal->j_state)) { unlock_journal(p_s_sb) ; reiserfs_wait_on_write_block(p_s_sb) ; PROC_INFO_INC( p_s_sb, journal.journal_relock_writers ); goto relock ; } + now = get_seconds(); /* if there is no room in the journal OR ** if this transaction is too old, and we weren't called joinable, wait for it to finish before beginning ** we don't sleep if there aren't other writers */ - if ( (!join && SB_JOURNAL(p_s_sb)->j_must_wait > 0) || - ( !join && (SB_JOURNAL(p_s_sb)->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0 && SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || - (!join && atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) ) || - (!join && SB_JOURNAL(p_s_sb)->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + if ( (!join && journal->j_must_wait > 0) || + ( !join && (journal->j_len_alloc + nblocks + 2) >= SB_JOURNAL_MAX_BATCH(p_s_sb)) || + (!join && atomic_read(&journal->j_wcount) > 0 && journal->j_trans_start_time > 0 && + (now - journal->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) || + (!join && atomic_read(&journal->j_jlock)) || + (!join && journal->j_cnode_free < (SB_JOURNAL_TRANS_MAX(p_s_sb) * 3))) { + old_trans_id = journal->j_trans_id; unlock_journal(p_s_sb) ; /* allow others to finish this transaction */ - - /* if writer count is 0, we can just force this transaction to end, and start - ** a new one afterwards. 
- */ - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { - struct reiserfs_transaction_handle myth ; - journal_join(&myth, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&myth, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&myth, p_s_sb,1,COMMIT_NOW) ; + + if (!join && (journal->j_len_alloc + nblocks + 2) >= + SB_JOURNAL_MAX_BATCH(p_s_sb) && + ((journal->j_len + nblocks + 2) * 100) < (journal->j_len_alloc * 75)) + { + if (atomic_read(&journal->j_wcount) > 10) { + sched_count++; + queue_log_writer(p_s_sb); + goto relock; + } + } + /* don't mess with joining the transaction if all we have to do is + * wait for someone else to do a commit + */ + if (atomic_read(&journal->j_jlock)) { + while (journal->j_trans_id == old_trans_id && + atomic_read(&journal->j_jlock)) { + queue_log_writer(p_s_sb); + } + goto relock; + } + journal_join(&myth, p_s_sb, 1) ; + + /* someone might have ended the transaction while we joined */ + if (old_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { + do_journal_end(&myth, p_s_sb, 1, 0) ; } else { - /* but if the writer count isn't zero, we have to wait for the current writers to finish. - ** They won't batch on transaction end once we set j_jlock - */ - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; - old_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == old_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } + do_journal_end(&myth, p_s_sb, 1, COMMIT_NOW) ; } + PROC_INFO_INC( p_s_sb, journal.journal_relock_wcount ); goto relock ; } - - if (SB_JOURNAL(p_s_sb)->j_trans_start_time == 0) { /* we are the first writer, set trans_id */ - SB_JOURNAL(p_s_sb)->j_trans_start_time = now ; + /* we are the first writer, set trans_id */ + if (journal->j_trans_start_time == 0) { + journal->j_trans_start_time = get_seconds(); } - atomic_inc(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - SB_JOURNAL(p_s_sb)->j_len_alloc += nblocks ; + atomic_inc(&(journal->j_wcount)) ; + journal->j_len_alloc += nblocks ; th->t_blocks_logged = 0 ; th->t_blocks_allocated = nblocks ; - th->t_super = p_s_sb ; - th->t_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - th->t_caller = "Unknown" ; + th->t_trans_id = journal->j_trans_id ; unlock_journal(p_s_sb) ; - p_s_sb->s_dirt = 1; return 0 ; } +struct reiserfs_transaction_handle * +reiserfs_persistent_transaction(struct super_block *s, int nblocks) { + int ret ; + struct reiserfs_transaction_handle *th ; + + /* if we're nesting into an existing transaction. 
It will be
+    ** persistent on its own
+    */
+    if (reiserfs_transaction_running(s)) {
+      th = current->journal_info ;
+      th->t_refcount++ ;
+      if (th->t_refcount < 2) {
+        BUG() ;
+      }
+      return th ;
+    }
+    th = reiserfs_kmalloc(sizeof(struct reiserfs_transaction_handle), GFP_NOFS, s) ;
+    if (!th)
+      return NULL;
+    ret = journal_begin(th, s, nblocks) ;
+    if (ret) {
+      reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+      return NULL;
+    }
+    return th ;
+}
+
+int
+reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *th) {
+  struct super_block *s = th->t_super;
+  int ret;
+  ret = journal_end(th, th->t_super, th->t_blocks_allocated);
+  if (th->t_refcount == 0)
+    reiserfs_kfree(th, sizeof(struct reiserfs_transaction_handle), s) ;
+  return ret;
+}
 
 static int journal_join(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) {
+  struct reiserfs_transaction_handle *cur_th = current->journal_info;
+
+  /* this keeps do_journal_end from NULLing out the current->journal_info
+  ** pointer
+  */
+  th->t_handle_save = cur_th ;
+  if (cur_th && cur_th->t_refcount > 1) {
+    BUG() ;
+  }
   return do_journal_begin_r(th, p_s_sb, nblocks, 1) ;
 }
 
 int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) {
-  return do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
-}
+  struct reiserfs_transaction_handle *cur_th = current->journal_info ;
+  int ret ;
 
-/* not used at all */
-int journal_prepare(struct super_block * p_s_sb, struct buffer_head *bh) {
-  return 0 ;
+  th->t_handle_save = NULL ;
+  if (cur_th) {
+    /* we are nesting into the current transaction */
+    if (cur_th->t_super == p_s_sb) {
+      cur_th->t_refcount++ ;
+      memcpy(th, cur_th, sizeof(*th));
+      if (th->t_refcount <= 1)
+        printk("BAD: refcount <= 1, but journal_info != 0\n");
+      return 0;
+    } else {
+      /* we've ended up with a handle from a different filesystem.
+      ** save it and restore on journal_end.  This should never
+      ** really happen...
+      */
+      reiserfs_warning("clm-2100: nesting into a different FS\n") ;
+      th->t_handle_save = current->journal_info ;
+      current->journal_info = th;
+    }
+  } else {
+    current->journal_info = th;
+  }
+  ret = do_journal_begin_r(th, p_s_sb, nblocks, 0) ;
+  if (current->journal_info != th)
+    BUG() ;
+  return ret ;
 }
 
 /*
@@ -2305,18 +2735,14 @@ int journal_mark_dirty(struct reiserfs_t
   int prepared = 0 ;
 
   PROC_INFO_INC( p_s_sb, journal.mark_dirty );
-  if (reiserfs_dont_log(th->t_super)) {
-    mark_buffer_dirty(bh) ;
-    return 0 ;
-  }
-
   if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) {
     reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n",
                    th->t_trans_id, SB_JOURNAL(p_s_sb)->j_trans_id);
   }
-  p_s_sb->s_dirt = 1 ;
+  p_s_sb->s_dirt = 1;
 
   prepared = test_and_clear_bit(BH_JPrepared, &bh->b_state) ;
+  clear_bit(BH_JRestore_dirty, &bh->b_state);
   /* already in this transaction, we are done */
   if (buffer_journaled(bh)) {
     PROC_INFO_INC( p_s_sb, journal.mark_dirty_already );
@@ -2327,14 +2753,12 @@ int journal_mark_dirty(struct reiserfs_t
   ** a dirty or journal_dirty or locked buffer to be logged, as some changes
   ** could get to disk too early.  NOT GOOD.
   */
-  if (!prepared || buffer_locked(bh)) {
+  if (!prepared || buffer_locked(bh) || buffer_dirty(bh)) {
     printk("journal-1777: buffer %llu bad state %cPREPARED %cLOCKED %cDIRTY %cJDIRTY_WAIT\n",
            (unsigned long long)bh->b_blocknr, prepared ? ' ' : '!',
            buffer_locked(bh) ? ' ' : '!', buffer_dirty(bh) ? ' ' : '!',
           buffer_journal_dirty(bh) ?
' ' : '!') ; - show_reiserfs_locks() ; } - count_already_incd = clear_prepared_bits(bh) ; if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0) { printk("journal-1409: journal_mark_dirty returning because j_wcount was %d\n", atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount))) ; @@ -2353,14 +2777,6 @@ int journal_mark_dirty(struct reiserfs_t mark_buffer_notjournal_dirty(bh) ; } - if (buffer_dirty(bh)) { - clear_buffer_dirty(bh) ; - } - - if (buffer_journaled(bh)) { /* must double check after getting lock */ - goto done ; - } - if (SB_JOURNAL(p_s_sb)->j_len > SB_JOURNAL(p_s_sb)->j_len_alloc) { SB_JOURNAL(p_s_sb)->j_len_alloc = SB_JOURNAL(p_s_sb)->j_len + JOURNAL_PER_BALANCE_CNT ; } @@ -2400,29 +2816,31 @@ int journal_mark_dirty(struct reiserfs_t SB_JOURNAL(p_s_sb)->j_first = cn ; SB_JOURNAL(p_s_sb)->j_last = cn ; } -done: - return 0 ; -} - -/* -** if buffer already in current transaction, do a journal_mark_dirty -** otherwise, just mark it dirty and move on. Used for writes to meta blocks -** that don't need journaling -*/ -int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, struct buffer_head *bh) { - if (reiserfs_dont_log(th->t_super) || buffer_journaled(bh) || - buffer_journal_dirty(bh)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - if (get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_list_hash_table, bh->b_blocknr)) { - return journal_mark_dirty(th, p_s_sb, bh) ; - } - mark_buffer_dirty(bh) ; return 0 ; } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { - return do_journal_end(th, p_s_sb, nblocks, 0) ; + if (!current->journal_info && th->t_refcount > 1) + printk("REISER-NESTING: th NULL, refcount %d\n", th->t_refcount); + + th->t_refcount--; + if (th->t_refcount > 0) { + struct reiserfs_transaction_handle *cur_th = current->journal_info ; + + /* we aren't allowed to close a nested transaction on a different + ** filesystem from the one in the task struct + */ + if (cur_th->t_super != th->t_super) + BUG() ; + + if (th != cur_th) { + memcpy(current->journal_info, th, sizeof(*th)); + th->t_trans_id = 0; + } + return 0; + } else { + return do_journal_end(th, p_s_sb, nblocks, 0) ; + } } /* removes from the current transaction, relsing and descrementing any counters. @@ -2464,7 +2882,6 @@ static int remove_from_transaction(struc if (atomic_read(&(bh->b_count)) < 0) { printk("journal-1752: remove from trans, b_count < 0\n") ; } - if (!buffer_locked(bh)) reiserfs_clean_and_file_buffer(bh) ; ret = 1 ; } SB_JOURNAL(p_s_sb)->j_len-- ; @@ -2490,7 +2907,7 @@ static int can_dirty(struct reiserfs_jou int can_dirty = 1 ; /* first test hprev. These are all newer than cn, so any node here - ** with the name block number and dev means this node can't be sent + ** with the same block number and dev means this node can't be sent ** to disk right now. 
*/ while(cur && can_dirty) { @@ -2520,6 +2937,10 @@ static int can_dirty(struct reiserfs_jou */ int journal_end_sync(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + /* you can sync while nested, very, very bad */ + if (th->t_refcount > 1) { + BUG() ; + } if (SB_JOURNAL(p_s_sb)->j_len == 0) { reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; @@ -2527,88 +2948,62 @@ int journal_end_sync(struct reiserfs_tra return do_journal_end(th, p_s_sb, nblocks, COMMIT_NOW | WAIT) ; } -int show_reiserfs_locks(void) { - - dump_journal_writers() ; - return 0 ; -} - /* -** used to get memory back from async commits that are floating around -** and to reclaim any blocks deleted but unusable because their commits -** haven't hit disk yet. called from bitmap.c -** -** if it starts flushing things, it ors SCHEDULE_OCCURRED into repeat. -** note, this is just if schedule has a chance of occurring. I need to -** change flush_commit_lists to have a repeat parameter too. -** +** writeback the pending async commits to disk */ -void flush_async_commits(struct super_block *p_s_sb) { - int i ; - - for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) { - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } - } +static void flush_async_commits(void *p) { + struct super_block *p_s_sb = p; + struct reiserfs_journal_list *jl; + struct list_head *entry; + + lock_kernel(); + if (!list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + /* last entry is the youngest, commit it and you get everything */ + entry = SB_JOURNAL(p_s_sb)->j_journal_list.prev; + jl = JOURNAL_LIST_ENTRY(entry); + flush_commit_list(p_s_sb, jl, 1); + } + unlock_kernel(); + atomic_inc(&SB_JOURNAL(p_s_sb)->j_async_throttle); + filemap_fdatawrite(p_s_sb->s_bdev->bd_inode->i_mapping); + atomic_dec(&SB_JOURNAL(p_s_sb)->j_async_throttle); } /* ** flushes any old transactions to disk ** ends the current transaction if it is too old -** -** also calls flush_journal_list with old_only == 1, which allows me to reclaim -** memory and such from the journal lists whose real blocks are all on disk. -** -** called by sync_dev_journal from buffer.c */ -int flush_old_commits(struct super_block *p_s_sb, int immediate) { - int i ; - int count = 0; - int start ; - time_t now ; - struct reiserfs_transaction_handle th ; - - start = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - now = get_seconds() ; - - /* safety check so we don't flush while we are replaying the log during mount */ - if (SB_JOURNAL_LIST_INDEX(p_s_sb) < 0) { - return 0 ; - } - /* starting with oldest, loop until we get to the start */ - i = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - while(i != start) { - if (SB_JOURNAL_LIST(p_s_sb)[i].j_len > 0 && ((now - SB_JOURNAL_LIST(p_s_sb)[i].j_timestamp) > SB_JOURNAL_MAX_COMMIT_AGE(p_s_sb) || - immediate)) { - /* we have to check again to be sure the current transaction did not change */ - if (i != SB_JOURNAL_LIST_INDEX(p_s_sb)) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + i, 1) ; - } - } - i = (i + 1) % JOURNAL_LIST_COUNT ; - count++ ; - } - /* now, check the current transaction. 
If there are no writers, and it is too old, finish it, and - ** force the commit blocks to disk - */ - if (!immediate && atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && - SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && - SB_JOURNAL(p_s_sb)->j_len > 0 && - (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW) ; - } else if (immediate) { /* belongs above, but I wanted this to be very explicit as a special case. If they say to - flush, we must be sure old transactions hit the disk too. */ - journal_join(&th, p_s_sb, 1) ; - reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; - journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; - do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; - } - reiserfs_journal_kupdate(p_s_sb) ; - return 0 ; +int reiserfs_flush_old_commits(struct super_block *p_s_sb) { + time_t now ; + struct reiserfs_transaction_handle th ; + + now = get_seconds(); + /* safety check so we don't flush while we are replaying the log during + * mount + */ + if (list_empty(&SB_JOURNAL(p_s_sb)->j_journal_list)) { + return 0 ; + } + + /* check the current transaction. If there are no writers, and it is + * too old, finish it, and force the commit blocks to disk + */ + if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) <= 0 && + SB_JOURNAL(p_s_sb)->j_trans_start_time > 0 && + SB_JOURNAL(p_s_sb)->j_len > 0 && + (now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > + SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) + { + journal_join(&th, p_s_sb, 1) ; + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(&th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; + + /* we're only being called from kreiserfsd, it makes no sense to do + ** an async commit so that kreiserfsd can do it later + */ + do_journal_end(&th, p_s_sb,1, COMMIT_NOW | WAIT) ; + } + return p_s_sb->s_dirt; } /* @@ -2629,6 +3024,7 @@ static int check_journal_end(struct reis int flush = flags & FLUSH_ALL ; int commit_now = flags & COMMIT_NOW ; int wait_on_commit = flags & WAIT ; + struct reiserfs_journal_list *jl; if (th->t_trans_id != SB_JOURNAL(p_s_sb)->j_trans_id) { reiserfs_panic(th->t_super, "journal-1577: handle trans id %ld != current trans id %ld\n", @@ -2645,13 +3041,7 @@ static int check_journal_end(struct reis ** care of in this trans */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - int wcount = atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) ; - unlock_journal(p_s_sb) ; - if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) > 0 && wcount <= 0) { - atomic_dec(&(SB_JOURNAL(p_s_sb)->j_jlock)) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - return 0 ; + BUG(); } /* if wcount > 0, and we are called to with flush or commit_now, ** we wait on j_join_wait. 
We will wake up when the last writer has @@ -2661,24 +3051,37 @@ static int check_journal_end(struct reis */ if (atomic_read(&(SB_JOURNAL(p_s_sb)->j_wcount)) > 0) { if (flush || commit_now) { - int orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; + unsigned trans_id ; + + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + trans_id = jl->j_trans_id; + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; if (flush) { SB_JOURNAL(p_s_sb)->j_next_full_flush = 1 ; } unlock_journal(p_s_sb) ; + /* sleep while the current transaction is still j_jlocked */ - while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_jlock)) && - SB_JOURNAL(p_s_sb)->j_trans_id == th->t_trans_id) { - sleep_on(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - } - if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; + while(SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + if (atomic_read(&SB_JOURNAL(p_s_sb)->j_jlock)) { + queue_log_writer(p_s_sb); + } else { + lock_journal(p_s_sb); + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 1) ; + } + unlock_journal(p_s_sb); } } + if (SB_JOURNAL(p_s_sb)->j_trans_id == trans_id) { + BUG(); + } + if (commit_now && journal_list_still_alive(p_s_sb, trans_id) && + wait_on_commit) + { + flush_commit_list(p_s_sb, jl, 1) ; + } return 0 ; } unlock_journal(p_s_sb) ; @@ -2686,7 +3089,7 @@ static int check_journal_end(struct reis } /* deal with old transactions where we are the last writers */ - now = get_seconds() ; + now = get_seconds(); if ((now - SB_JOURNAL(p_s_sb)->j_trans_start_time) > SB_JOURNAL_MAX_TRANS_AGE(p_s_sb)) { commit_now = 1 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 1 ; @@ -2726,25 +3129,21 @@ int journal_mark_freed(struct reiserfs_t struct buffer_head *bh = NULL ; struct reiserfs_list_bitmap *jb = NULL ; int cleaned = 0 ; - - if (reiserfs_dont_log(th->t_super)) { - bh = sb_find_get_block(p_s_sb, blocknr) ; - if (bh && buffer_dirty (bh)) { - printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %d\n", bh->b_state, blocknr); - BUG (); - } - brelse (bh); - return 0 ; + + cn = get_journal_hash_dev(p_s_sb, SB_JOURNAL(p_s_sb)->j_hash_table, blocknr); + if (cn && cn->bh) { + bh = cn->bh ; + get_bh(bh) ; } - bh = sb_find_get_block(p_s_sb, blocknr) ; /* if it is journal new, we just remove it from this transaction */ if (bh && buffer_journal_new(bh)) { mark_buffer_notjournal_new(bh) ; clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; } else { /* set the bit for this block in the journal bitmap for this transaction */ - jb = SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap ; + jb = SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap; if (!jb) { reiserfs_panic(p_s_sb, "journal-1702: journal_mark_freed, journal_list_bitmap is NULL\n") ; } @@ -2754,6 +3153,7 @@ int journal_mark_freed(struct reiserfs_t if (bh) { clear_prepared_bits(bh) ; + reiserfs_clean_and_file_buffer(bh) ; } cleaned = remove_from_transaction(p_s_sb, blocknr, cleaned) ; @@ -2785,7 +3185,6 @@ int journal_mark_freed(struct reiserfs_t } if (bh) { - reiserfs_clean_and_file_buffer(bh) ; put_bh(bh) ; /* get_hash grabs the buffer */ if (atomic_read(&(bh->b_count)) < 0) { printk("journal-2165: bh->b_count < 0\n") ; @@ -2795,50 +3194,98 @@ int journal_mark_freed(struct reiserfs_t } void reiserfs_update_inode_transaction(struct inode *inode) { - - REISERFS_I(inode)->i_trans_index = 
SB_JOURNAL_LIST_INDEX(inode->i_sb); - + REISERFS_I(inode)->i_jl = SB_JOURNAL(inode->i_sb)->j_current_jl; REISERFS_I(inode)->i_trans_id = SB_JOURNAL(inode->i_sb)->j_trans_id ; } -static int reiserfs_inode_in_this_transaction(struct inode *inode) { - if (REISERFS_I(inode)->i_trans_id == SB_JOURNAL(inode->i_sb)->j_trans_id || - REISERFS_I(inode)->i_trans_id == 0) { - return 1; - } - return 0 ; +/* + * returns -1 on error, 0 if no commits/barriers were done and 1 + * if a transaction was actually committed and the barrier was done + */ +static int __commit_trans_jl(struct inode *inode, unsigned long id, + struct reiserfs_journal_list *jl) +{ + struct reiserfs_transaction_handle th ; + struct super_block *sb = inode->i_sb ; + int ret = 0; + + /* is it from the current transaction, or from an unknown transaction? */ + if (id == SB_JOURNAL(sb)->j_trans_id) { + jl = SB_JOURNAL(sb)->j_current_jl; + /* try to let other writers come in and grow this transaction */ + let_transaction_grow(sb, id); + if (SB_JOURNAL(sb)->j_trans_id != id) { + goto flush_commit_only; + } + + journal_begin(&th, sb, 1) ; + + /* someone might have ended this transaction while we joined */ + if (SB_JOURNAL(sb)->j_trans_id != id) { + reiserfs_prepare_for_journal(sb, SB_BUFFER_WITH_SB(sb), 1) ; + journal_mark_dirty(&th, sb, SB_BUFFER_WITH_SB(sb)) ; + journal_end(&th, sb, 1) ; + goto flush_commit_only; + } + + journal_end_sync(&th, sb, 1) ; + ret = 1; + + } else { + /* this gets tricky, we have to make sure the journal list in + * the inode still exists. We know the list is still around + * if we've got a larger transaction id than the oldest list + */ +flush_commit_only: + if (journal_list_still_alive(inode->i_sb, id)) { + /* + * we only set ret to 1 when we know for sure + * the barrier hasn't been started yet on the commit + * block. + */ + if (atomic_read(&jl->j_commit_left) > 1) + ret = 1; + flush_commit_list(sb, jl, 1) ; + } + } + /* otherwise the list is gone, and long since committed */ + return ret; } -void reiserfs_commit_for_inode(struct inode *inode) { - struct reiserfs_journal_list *jl ; - struct reiserfs_transaction_handle th ; - struct super_block *sb = inode->i_sb ; - - jl = SB_JOURNAL_LIST(sb) + REISERFS_I(inode)->i_trans_index ; - - /* is it from the current transaction, or from an unknown transaction? */ - if (reiserfs_inode_in_this_transaction(inode)) { - journal_join(&th, sb, 1) ; - reiserfs_update_inode_transaction(inode) ; - journal_end_sync(&th, sb, 1) ; - } else if (jl->j_trans_id == REISERFS_I(inode)->i_trans_id) { - flush_commit_list(sb, jl, 1) ; - } - /* if the transaction id does not match, this list is long since flushed - ** and we don't have to do anything here - */ +int reiserfs_commit_for_inode(struct inode *inode) { + unsigned long id = REISERFS_I(inode)->i_trans_id; + struct reiserfs_journal_list *jl = REISERFS_I(inode)->i_jl; + + /* for the whole inode, assume unset id means it was + * changed in the current transaction. 
More conservative + */ + if (!id || !jl) { + reiserfs_update_inode_transaction(inode) ; + id = REISERFS_I(inode)->i_trans_id; + /* jl will be updated in __commit_trans_jl */ + } + + return __commit_trans_jl(inode, id, jl); } void reiserfs_restore_prepared_buffer(struct super_block *p_s_sb, struct buffer_head *bh) { - PROC_INFO_INC( p_s_sb, journal.restore_prepared ); - if (reiserfs_dont_log (p_s_sb)) - return; - - if (!bh) { - return ; - } - clear_bit(BH_JPrepared, &bh->b_state) ; + PROC_INFO_INC( p_s_sb, journal.restore_prepared ); + if (!bh) { + return ; + } + if (test_and_clear_bit(BH_JRestore_dirty, &bh->b_state) && + buffer_journal_dirty(bh)) { + struct reiserfs_journal_cnode *cn; + cn = get_journal_hash_dev(p_s_sb, + SB_JOURNAL(p_s_sb)->j_list_hash_table, + bh->b_blocknr); + if (cn && can_dirty(cn)) { + set_bit(BH_JTest, &bh->b_state); + mark_buffer_dirty(bh); + } + } + clear_bit(BH_JPrepared, &bh->b_state) ; } extern struct tree_balance *cur_tb ; @@ -2849,29 +3296,39 @@ extern struct tree_balance *cur_tb ; ** wait on it. ** */ -void reiserfs_prepare_for_journal(struct super_block *p_s_sb, +int reiserfs_prepare_for_journal(struct super_block *p_s_sb, struct buffer_head *bh, int wait) { - int retry_count = 0 ; - PROC_INFO_INC( p_s_sb, journal.prepare ); - if (reiserfs_dont_log (p_s_sb)) - return; - while(!test_bit(BH_JPrepared, &bh->b_state) || - (wait && buffer_locked(bh))) { - if (buffer_journaled(bh)) { - set_bit(BH_JPrepared, &bh->b_state) ; - return ; - } - set_bit(BH_JPrepared, &bh->b_state) ; - if (wait) { - RFALSE( buffer_locked(bh) && cur_tb != NULL, - "waiting while do_balance was running\n") ; - wait_on_buffer(bh) ; + if (test_set_buffer_locked(bh)) { + if (!wait) + return 0; + lock_buffer(bh); + } + set_bit(BH_JPrepared, &bh->b_state); + if (test_clear_buffer_dirty(bh) && buffer_journal_dirty(bh)) { + clear_bit(BH_JTest, &bh->b_state); + set_bit(BH_JRestore_dirty, &bh->b_state); + } + unlock_buffer(bh); + return 1; +} + +static void flush_old_journal_lists(struct super_block *s) { + struct reiserfs_journal_list *jl; + struct list_head *entry; + time_t now = get_seconds(); + + while(!list_empty(&SB_JOURNAL(s)->j_journal_list)) { + entry = SB_JOURNAL(s)->j_journal_list.next; + jl = JOURNAL_LIST_ENTRY(entry); + /* this check should always be run, to send old lists to disk */ + if (jl->j_timestamp < (now - (JOURNAL_MAX_TRANS_AGE * 4))) { + flush_used_journal_lists(s, jl); + } else { + break; + } } - PROC_INFO_INC( p_s_sb, journal.prepare_retry ); - retry_count++ ; - } } /* @@ -2890,19 +3347,24 @@ static int do_journal_end(struct reiserf struct buffer_head *c_bh ; /* commit bh */ struct buffer_head *d_bh ; /* desc bh */ int cur_write_start = 0 ; /* start index of current log write */ - int cur_blocks_left = 0 ; /* number of journal blocks left to write */ int old_start ; int i ; - int jindex ; - int orig_jindex ; int flush = flags & FLUSH_ALL ; - int commit_now = flags & COMMIT_NOW ; int wait_on_commit = flags & WAIT ; - struct reiserfs_super_block *rs ; - int trans_half ; + struct reiserfs_journal_list *jl, *temp_jl; + struct list_head *entry, *safe; + unsigned long jindex; + unsigned long commit_trans_id; + int trans_half; - if (reiserfs_dont_log(th->t_super)) { - return 0 ; + if (th->t_refcount > 1) + BUG() ; + + current->journal_info = th->t_handle_save; + reiserfs_check_lock_depth("journal end"); + if (SB_JOURNAL(p_s_sb)->j_len == 0) { + reiserfs_prepare_for_journal(p_s_sb, SB_BUFFER_WITH_SB(p_s_sb), 1) ; + journal_mark_dirty(th, p_s_sb, SB_BUFFER_WITH_SB(p_s_sb)) ; } 
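+  /* note: an empty transaction would trip the BUG() in check_journal_end,
+  ** so the super block buffer is logged above as filler; every
+  ** journal_end then has at least one block to commit.
+  */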
lock_journal(p_s_sb) ; @@ -2911,24 +3373,25 @@ static int do_journal_end(struct reiserf flush = 1 ; } if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - flags |= COMMIT_NOW ; - commit_now = 1 ; + flags |= COMMIT_NOW | WAIT; + wait_on_commit = 1; } /* check_journal_end locks the journal, and unlocks if it does not return 1 ** it tells us if we should continue with the journal_end, or just return */ if (!check_journal_end(th, p_s_sb, nblocks, flags)) { - return 0 ; + p_s_sb->s_dirt = 1; + wake_queued_writers(p_s_sb); + reiserfs_async_progress_wait(p_s_sb); + goto out ; } /* check_journal_end might set these, check again */ if (SB_JOURNAL(p_s_sb)->j_next_full_flush) { flush = 1 ; } - if (SB_JOURNAL(p_s_sb)->j_next_async_flush) { - commit_now = 1 ; - } + /* ** j must wait means we have to flush the log blocks, and the real blocks for ** this transaction @@ -2938,14 +3401,16 @@ static int do_journal_end(struct reiserf } #ifdef REISERFS_PREALLOCATE + /* quota ops might need to nest, setup the journal_info pointer for them */ + current->journal_info = th ; reiserfs_discard_all_prealloc(th); /* it should not involve new blocks into * the transaction */ + current->journal_info = th->t_handle_save ; #endif - rs = SB_DISK_SUPER_BLOCK(p_s_sb) ; /* setup description block */ d_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + SB_JOURNAL(p_s_sb)->j_start) ; - set_buffer_uptodate(d_bh) ; + set_buffer_uptodate(d_bh); desc = (struct reiserfs_journal_desc *)(d_bh)->b_data ; memset(d_bh->b_data, 0, d_bh->b_size) ; memcpy(get_journal_desc_magic (d_bh), JOURNAL_DESC_MAGIC, 8) ; @@ -2960,28 +3425,33 @@ static int do_journal_end(struct reiserf set_buffer_uptodate(c_bh) ; /* init this journal list */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_older_commits_done), 0) ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_bh = c_bh ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_start = SB_JOURNAL(p_s_sb)->j_start ; - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len = SB_JOURNAL(p_s_sb)->j_len ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_nonzerolen), SB_JOURNAL(p_s_sb)->j_len) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_left), SB_JOURNAL(p_s_sb)->j_len + 2); - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = NULL ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ; - - /* which is faster, locking/unlocking at the start and end of the for - ** or locking once per iteration around the insert_journal_hash? - ** eitherway, we are write locking insert_journal_hash. The ENTIRE FOR - ** LOOP MUST not cause schedule to occur. 
- */ + jl = SB_JOURNAL(p_s_sb)->j_current_jl; + + /* we lock the commit before doing anything because + * we want to make sure nobody tries to run flush_commit_list until + * the new transaction is fully setup, and we've already flushed the + * ordered bh list + */ + down(&jl->j_commit_lock); + + /* save the transaction id in case we need to commit it later */ + commit_trans_id = jl->j_trans_id; + + atomic_set(&jl->j_older_commits_done, 0) ; + jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id ; + jl->j_timestamp = SB_JOURNAL(p_s_sb)->j_trans_start_time ; + jl->j_commit_bh = c_bh ; + jl->j_start = SB_JOURNAL(p_s_sb)->j_start ; + jl->j_len = SB_JOURNAL(p_s_sb)->j_len ; + atomic_set(&jl->j_nonzerolen, SB_JOURNAL(p_s_sb)->j_len) ; + atomic_set(&jl->j_commit_left, SB_JOURNAL(p_s_sb)->j_len + 2); + jl->j_realblock = NULL ; - /* for each real block, add it to the journal list hash, + /* The ENTIRE FOR LOOP MUST not cause schedule to occur. + ** for each real block, add it to the journal list hash, ** copy into real block index array in the commit or desc block */ - trans_half = journal_trans_half(p_s_sb->s_blocksize) ; + trans_half = journal_trans_half(p_s_sb->s_blocksize); for (i = 0, cn = SB_JOURNAL(p_s_sb)->j_first ; cn ; cn = cn->next, i++) { if (test_bit(BH_JDirty, &cn->bh->b_state) ) { jl_cn = get_cnode(p_s_sb) ; @@ -2989,7 +3459,7 @@ static int do_journal_end(struct reiserf reiserfs_panic(p_s_sb, "journal-1676, get_cnode returned NULL\n") ; } if (i == 0) { - SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_realblock = jl_cn ; + jl->j_realblock = jl_cn ; } jl_cn->prev = last_cn ; jl_cn->next = NULL ; @@ -3005,9 +3475,9 @@ static int do_journal_end(struct reiserf } jl_cn->blocknr = cn->bh->b_blocknr ; jl_cn->state = 0 ; - jl_cn->sb = p_s_sb ; + jl_cn->sb = p_s_sb; jl_cn->bh = cn->bh ; - jl_cn->jlist = SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb) ; + jl_cn->jlist = jl; insert_journal_hash(SB_JOURNAL(p_s_sb)->j_list_hash_table, jl_cn) ; if (i < trans_half) { desc->j_realblock[i] = cpu_to_le32(cn->bh->b_blocknr) ; @@ -3018,7 +3488,6 @@ static int do_journal_end(struct reiserf i-- ; } } - set_desc_trans_len(desc, SB_JOURNAL(p_s_sb)->j_len) ; set_desc_mount_id(desc, SB_JOURNAL(p_s_sb)->j_mount_id) ; set_desc_trans_id(desc, SB_JOURNAL(p_s_sb)->j_trans_id) ; @@ -3026,53 +3495,35 @@ static int do_journal_end(struct reiserf /* special check in case all buffers in the journal were marked for not logging */ if (SB_JOURNAL(p_s_sb)->j_len == 0) { - brelse(d_bh) ; - brelse(c_bh) ; - unlock_journal(p_s_sb) ; - printk("journal-2020: do_journal_end: BAD desc->j_len is ZERO\n") ; - atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ; - wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; - return 0 ; + BUG(); } + /* we're about to dirty all the log blocks, mark the description block + * dirty now too. Don't mark the commit block dirty until all the + * others are on disk + */ + mark_buffer_dirty(d_bh); + /* first data block is j_start + 1, so add one to cur_write_start wherever you use it */ cur_write_start = SB_JOURNAL(p_s_sb)->j_start ; - cur_blocks_left = SB_JOURNAL(p_s_sb)->j_len ; cn = SB_JOURNAL(p_s_sb)->j_first ; jindex = 1 ; /* start at one so we don't get the desc again */ - while(cur_blocks_left > 0) { + while(cn) { + clear_bit(BH_JNew, &(cn->bh->b_state)) ; /* copy all the real blocks into log area. 
dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; - set_buffer_uptodate(tmp_bh) ; + set_buffer_uptodate(tmp_bh); memcpy(tmp_bh->b_data, cn->bh->b_data, cn->bh->b_size) ; + mark_buffer_dirty(tmp_bh); jindex++ ; - } else { - /* JDirty cleared sometime during transaction. don't log this one */ - printk("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; - } - cn = cn->next ; - cur_blocks_left-- ; - } - - /* we are done with both the c_bh and d_bh, but - ** c_bh must be written after all other commit blocks, - ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. - */ - - /* now loop through and mark all buffers from this transaction as JDirty_wait - ** clear the JDirty bit, clear BH_JNew too. - ** if they weren't JDirty, they weren't logged, just relse them and move on - */ - cn = SB_JOURNAL(p_s_sb)->j_first ; - while(cn) { - clear_bit(BH_JNew, &(cn->bh->b_state)) ; - if (test_bit(BH_JDirty, &(cn->bh->b_state))) { set_bit(BH_JDirty_wait, &(cn->bh->b_state)) ; clear_bit(BH_JDirty, &(cn->bh->b_state)) ; } else { + /* JDirty cleared sometime during transaction. don't log this one */ + reiserfs_warning("journal-2048: do_journal_end: BAD, buffer in journal hash, but not JDirty!\n") ; brelse(cn->bh) ; } next = cn->next ; @@ -3080,30 +3531,17 @@ static int do_journal_end(struct reiserf cn = next ; } - /* unlock the journal list for committing and flushing */ - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 0) ; - atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 0) ; - - orig_jindex = SB_JOURNAL_LIST_INDEX(p_s_sb) ; - jindex = (SB_JOURNAL_LIST_INDEX(p_s_sb) + 1) % JOURNAL_LIST_COUNT ; - SB_JOURNAL_LIST_INDEX(p_s_sb) = jindex ; - - /* write any buffers that must hit disk before this commit is done */ - fsync_buffers_list(&(SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock), - &(SB_JOURNAL(p_s_sb)->j_dirty_buffers)) ; + /* we are done with both the c_bh and d_bh, but + ** c_bh must be written after all other commit blocks, + ** so we dirty/relse c_bh in flush_commit_list, with commit_left <= 1. 
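** Writing the commit block only after every other block of the
** transaction is on disk is what makes the log atomic: journal replay
** accepts a transaction only when its commit block matches the
** description block, so a crash before that final write simply makes
** the whole transaction disappear.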
+ */ - /* honor the flush and async wishes from the caller */ - if (flush) { - - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex , 1) ; - } else if (commit_now) { - if (wait_on_commit) { - flush_commit_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + orig_jindex, 1) ; - } else { - commit_flush_async(p_s_sb, orig_jindex) ; - } - } + SB_JOURNAL(p_s_sb)->j_current_jl = alloc_journal_list(p_s_sb); + + /* now it is safe to insert this transaction on the main list */ + list_add_tail(&jl->j_list, &SB_JOURNAL(p_s_sb)->j_journal_list); + list_add_tail(&jl->j_working_list, &SB_JOURNAL(p_s_sb)->j_working_list); + SB_JOURNAL(p_s_sb)->j_num_work_lists++; /* reset journal values for the next transaction */ old_start = SB_JOURNAL(p_s_sb)->j_start ; @@ -3115,57 +3553,108 @@ static int do_journal_end(struct reiserf SB_JOURNAL(p_s_sb)->j_len = 0 ; SB_JOURNAL(p_s_sb)->j_trans_start_time = 0 ; SB_JOURNAL(p_s_sb)->j_trans_id++ ; + SB_JOURNAL(p_s_sb)->j_current_jl->j_trans_id = SB_JOURNAL(p_s_sb)->j_trans_id; SB_JOURNAL(p_s_sb)->j_must_wait = 0 ; SB_JOURNAL(p_s_sb)->j_len_alloc = 0 ; SB_JOURNAL(p_s_sb)->j_next_full_flush = 0 ; SB_JOURNAL(p_s_sb)->j_next_async_flush = 0 ; init_journal_hash(p_s_sb) ; + // make sure reiserfs_add_jh sees the new current_jl before we + // write out the tails + smp_mb(); + + /* tail conversion targets have to hit the disk before we end the + * transaction. Otherwise a later transaction might repack the tail + * before this transaction commits, leaving the data block unflushed and + * clean, if we crash before the later transaction commits, the data block + * is lost. + */ + if (!list_empty(&jl->j_tail_bh_list)) { + unlock_kernel(); + write_ordered_buffers(&SB_JOURNAL(p_s_sb)->j_dirty_buffers_lock, + SB_JOURNAL(p_s_sb), jl, &jl->j_tail_bh_list); + lock_kernel(); + } + if (!list_empty(&jl->j_tail_bh_list)) + BUG(); + up(&jl->j_commit_lock); + + /* honor the flush wishes from the caller, simple commits can + ** be done outside the journal lock, they are done below + ** + ** if we don't flush the commit list right now, we put it into + ** the work queue so the people waiting on the async progress work + ** queue don't wait for this proc to flush journal lists and such. + */ + if (flush) { + flush_commit_list(p_s_sb, jl, 1) ; + flush_journal_list(p_s_sb, jl, 1) ; + } else + queue_work(commit_wq, &SB_JOURNAL(p_s_sb)->j_work); + + /* if the next transaction has any chance of wrapping, flush ** transactions that might get overwritten. If any journal lists are very ** old flush them as well. 
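** (as a worked example with assumed numbers: for an 8192-block on-disk
** journal, j_start == 8000 and a trans_max of 1024 give
** (8000 + 1024 + 1) % 8192 == 833, so any list starting in [8000,8191]
** or in the wrapped range [0,833] must be flushed first)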
 */
-  for (i = 0 ; i < JOURNAL_LIST_COUNT ; i++) {
-    jindex = i ;
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 && SB_JOURNAL(p_s_sb)->j_start <= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-      if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >= SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1) ;
-      }
-    } else if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-	       (SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) > SB_ONDISK_JOURNAL_SIZE(p_s_sb)) {
-      if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) % SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >=
-	  SB_JOURNAL_LIST(p_s_sb)[jindex].j_start) {
-	flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
+first_jl:
+    list_for_each_safe(entry, safe, &SB_JOURNAL(p_s_sb)->j_journal_list) {
+	temp_jl = JOURNAL_LIST_ENTRY(entry);
+	if (SB_JOURNAL(p_s_sb)->j_start <= temp_jl->j_start) {
+	    if ((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >=
+	        temp_jl->j_start)
+	    {
+		flush_used_journal_lists(p_s_sb, temp_jl);
+		goto first_jl;
+	    } else if ((SB_JOURNAL(p_s_sb)->j_start +
+			SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) <
+			SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+	    {
+		/* if we don't cross into the next transaction and we don't
+		 * wrap, there is no way we can overlap any later transactions
+		 * break now
+		 */
+		break;
+	    }
+	} else if ((SB_JOURNAL(p_s_sb)->j_start +
+		    SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) >
+		    SB_ONDISK_JOURNAL_SIZE(p_s_sb))
+	{
+	    if (((SB_JOURNAL(p_s_sb)->j_start + SB_JOURNAL_TRANS_MAX(p_s_sb) + 1) %
+		  SB_ONDISK_JOURNAL_SIZE(p_s_sb)) >= temp_jl->j_start)
+	    {
+		flush_used_journal_lists(p_s_sb, temp_jl);
+		goto first_jl;
+	    } else {
+		/* we don't overlap anything from our start to the end of the
+		 * log, and our wrapped portion doesn't overlap anything at
+		 * the start of the log. We can break
+		 */
+		break;
+	    }
-    }
-    /* this check should always be run, to send old lists to disk */
-    if (SB_JOURNAL_LIST(p_s_sb)[jindex].j_len > 0 &&
-	SB_JOURNAL_LIST(p_s_sb)[jindex].j_timestamp <
-	(get_seconds() - (SB_JOURNAL_MAX_TRANS_AGE(p_s_sb) * 4))) {
-      flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + jindex, 1 ) ;
 	}
 }
+    flush_old_journal_lists(p_s_sb);
-  /* if the next journal_list is still in use, flush it */
-  if (SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_len != 0) {
-    flush_journal_list(p_s_sb, SB_JOURNAL_LIST(p_s_sb) + SB_JOURNAL_LIST_INDEX(p_s_sb), 1) ;
-  }
-
-  /* we don't want anyone flushing the new transaction's list */
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_commit_flushing), 1) ;
-  atomic_set(&(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_flushing), 1) ;
-  SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL_LIST(p_s_sb) +
-						 SB_JOURNAL_LIST_INDEX(p_s_sb)) ;
+    SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap = get_list_bitmap(p_s_sb, SB_JOURNAL(p_s_sb)->j_current_jl) ;
-  if (!(SB_JOURNAL_LIST(p_s_sb)[SB_JOURNAL_LIST_INDEX(p_s_sb)].j_list_bitmap)) {
+    if (!(SB_JOURNAL(p_s_sb)->j_current_jl->j_list_bitmap)) {
     reiserfs_panic(p_s_sb, "journal-1996: do_journal_end, could not get a list bitmap\n") ;
   }
-  unlock_journal(p_s_sb) ;
+    atomic_set(&(SB_JOURNAL(p_s_sb)->j_jlock), 0) ;
+    unlock_journal(p_s_sb) ;	/* wake up any body waiting to join.
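** Note that the wait_on_commit case below must recheck
** journal_list_still_alive(): once the journal lock is dropped, the
** commit work queue may already have flushed and freed jl, which is
** why commit_trans_id was saved while j_commit_lock was still held.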
*/ + clear_bit(WRITERS_QUEUED, &SB_JOURNAL(p_s_sb)->j_state); wake_up(&(SB_JOURNAL(p_s_sb)->j_join_wait)) ; + + if (!flush && wait_on_commit && + journal_list_still_alive(p_s_sb, commit_trans_id)) { + flush_commit_list(p_s_sb, jl, 1) ; + } +out: + reiserfs_check_lock_depth("journal end2"); + th->t_trans_id = 0; return 0 ; } - - - diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile 2004-03-11 03:55:54.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/Makefile 2004-03-19 16:10:24.020964439 +0100 @@ -9,6 +9,18 @@ reiserfs-objs := bitmap.o do_balan.o nam hashes.o tail_conversion.o journal.o resize.o \ item_ops.o ioctl.o procfs.o +ifeq ($(CONFIG_REISERFS_FS_XATTR),y) +reiserfs-objs += xattr.o xattr_user.o xattr_trusted.o +endif + +ifeq ($(CONFIG_REISERFS_FS_SECURITY),y) +reiserfs-objs += xattr_security.o +endif + +ifeq ($(CONFIG_REISERFS_FS_POSIX_ACL),y) +reiserfs-objs += xattr_acl.o +endif + # gcc -O2 (the kernel default) is overaggressive on ppc32 when many inline # functions are used. This causes the compiler to advance the stack # pointer out of the available stack space, corrupting kernel space, diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c 2004-03-11 03:55:29.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/namei.c 2004-03-19 16:10:24.014965085 +0100 @@ -15,7 +15,10 @@ #include #include #include +#include +#include #include +#include #define INC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) { i->i_nlink++; if (i->i_nlink >= REISERFS_LINK_MAX) i->i_nlink=1; } #define DEC_DIR_INODE_NLINK(i) if (i->i_nlink != 1) i->i_nlink--; @@ -331,11 +334,24 @@ static struct dentry * reiserfs_lookup ( retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path_to_entry, &de); pathrelse (&path_to_entry); if (retval == NAME_FOUND) { + /* Hide the .reiserfs_priv directory */ + if (reiserfs_xattrs (dir->i_sb) && + !old_format_only(dir->i_sb) && + REISERFS_SB(dir->i_sb)->priv_root && + REISERFS_SB(dir->i_sb)->priv_root->d_inode && + de.de_objectid == le32_to_cpu (INODE_PKEY(REISERFS_SB(dir->i_sb)->priv_root->d_inode)->k_objectid)) { + return ERR_PTR (-EACCES); + } + inode = reiserfs_iget (dir->i_sb, (struct cpu_key *)&(de.de_dir_id)); if (!inode || IS_ERR(inode)) { reiserfs_write_unlock(dir->i_sb); return ERR_PTR(-EACCES); } + + /* Propogate the priv_object flag so we know we're in the priv tree */ + if (is_reiserfs_priv_object (dir)) + REISERFS_I(inode)->i_flags |= i_priv_object; } reiserfs_write_unlock(dir->i_sb); if ( retval == IO_ERROR ) { @@ -504,7 +520,7 @@ static int reiserfs_add_entry (struct re } /* perform the insertion of the entry that we have prepared */ - retval = reiserfs_paste_into_item (th, &path, &entry_key, buffer, paste_size); + retval = reiserfs_paste_into_item (th, &path, &entry_key, dir, buffer, paste_size); if (buffer != small_buf) reiserfs_kfree (buffer, buflen, dir->i_sb); if (retval) { @@ -513,7 +529,6 @@ static int reiserfs_add_entry (struct re } dir->i_size += paste_size; - dir->i_blocks = ((dir->i_size + 511) >> 9); dir->i_mtime = dir->i_ctime = CURRENT_TIME; if (!S_ISDIR (inode->i_mode) && visible) // reiserfs_mkdir or reiserfs_rename will do that by itself @@ -529,7 +544,9 @@ static int reiserfs_add_entry (struct re ** inserted into the tree yet. 
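** DQUOT_DROP() below gives back the inode charge taken by
** DQUOT_ALLOC_INODE() in new_inode_init(), and S_NOQUOTA keeps the
** final iput() from touching quota again for this half-built inode.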
*/ static int drop_new_inode(struct inode *inode) { + DQUOT_DROP(inode); make_bad_inode(inode) ; + inode->i_flags |= S_NOQUOTA; iput(inode) ; return 0 ; } @@ -555,6 +572,11 @@ static int new_inode_init(struct inode * } else { inode->i_gid = current->fsgid; } + DQUOT_INIT(inode); + if (DQUOT_ALLOC_INODE(inode)) { + drop_new_inode(inode); + return -EDQUOT; + } return 0 ; } @@ -565,6 +587,7 @@ static int reiserfs_create (struct inode struct inode * inode; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 2 ; struct reiserfs_transaction_handle th ; + int locked; if (!(inode = new_inode(dir->i_sb))) { return -ENOMEM ; @@ -573,10 +596,19 @@ static int reiserfs_create (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + journal_begin(&th, dir->i_sb, jbegin_count) ; - th.t_caller = "create" ; retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); + + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { goto out_failed; } @@ -612,6 +644,7 @@ static int reiserfs_mknod (struct inode struct inode * inode; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + int locked; if (!new_valid_dev(rdev)) return -EINVAL; @@ -623,15 +656,25 @@ static int reiserfs_mknod (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); + journal_begin(&th, dir->i_sb, jbegin_count) ; retval = reiserfs_new_inode (&th, dir, mode, 0, 0/*i_size*/, dentry, inode); + + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { goto out_failed; } - init_special_inode(inode, mode, rdev) ; + init_special_inode(inode, inode->i_mode, rdev) ; //FIXME: needed for block and char devices only reiserfs_update_sd (&th, inode); @@ -664,6 +707,7 @@ static int reiserfs_mkdir (struct inode struct inode * inode; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; + int locked; #ifdef DISPLACE_NEW_PACKING_LOCALITIES /* set flag that new packing locality created and new blocks for the content * of that directory are not displaced yet */ @@ -677,7 +721,11 @@ static int reiserfs_mkdir (struct inode if (retval) return retval; + locked = reiserfs_cache_default_acl (dir); + reiserfs_write_lock(dir->i_sb); + if (locked) + reiserfs_write_lock_xattrs (dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; /* inc the link count now, so another writer doesn't overflow it while @@ -689,6 +737,9 @@ static int reiserfs_mkdir (struct inode old_format_only (dir->i_sb) ? 
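/* old format dirs pack "." and ".." without alignment, so their empty size is smaller */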
EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE, dentry, inode); + if (locked) + reiserfs_write_unlock_xattrs (dir->i_sb); + if (retval) { dir->i_nlink-- ; goto out_failed; @@ -738,7 +789,6 @@ static int reiserfs_rmdir (struct inode { int retval; struct inode * inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count; INITIALIZE_PATH (path); @@ -750,7 +800,6 @@ static int reiserfs_rmdir (struct inode reiserfs_write_lock(dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_rmdir") ; de.de_gen_number_bit_string = 0; if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { @@ -793,13 +842,11 @@ static int reiserfs_rmdir (struct inode DEC_DIR_INODE_NLINK(dir) dir->i_size -= (DEH_SIZE + de.de_entrylen); - dir->i_blocks = ((dir->i_size + 511) >> 9); reiserfs_update_sd (&th, dir); /* prevent empty directory from getting lost */ add_save_link (&th, inode, 0/* not truncate */); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -810,7 +857,6 @@ static int reiserfs_rmdir (struct inode reiserfs_cut_from_item, or reiserfs_cut_from_item does not release path if operation was not complete */ pathrelse (&path); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return retval; @@ -822,7 +868,6 @@ static int reiserfs_unlink (struct inode struct inode * inode; struct reiserfs_dir_entry de; INITIALIZE_PATH (path); - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count; unsigned long savelink; @@ -835,7 +880,6 @@ static int reiserfs_unlink (struct inode reiserfs_write_lock(dir->i_sb); journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_unlink") ; de.de_gen_number_bit_string = 0; if ( (retval = reiserfs_find_entry (dir, dentry->d_name.name, dentry->d_name.len, &path, &de)) == NAME_NOT_FOUND) { @@ -880,7 +924,6 @@ static int reiserfs_unlink (struct inode reiserfs_update_sd (&th, inode); dir->i_size -= (de.de_entrylen + DEH_SIZE); - dir->i_blocks = ((dir->i_size + 511) >> 9); dir->i_ctime = dir->i_mtime = CURRENT_TIME; reiserfs_update_sd (&th, dir); @@ -888,7 +931,6 @@ static int reiserfs_unlink (struct inode /* prevent file from getting lost */ add_save_link (&th, inode, 0/* not truncate */); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -896,7 +938,6 @@ static int reiserfs_unlink (struct inode end_unlink: pathrelse (&path); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_check_path(&path) ; reiserfs_write_unlock(dir->i_sb); @@ -939,6 +980,8 @@ static int reiserfs_symlink (struct inod memcpy (name, symname, strlen (symname)); padd_item (name, item_len, strlen (symname)); + /* We would inherit the default ACL here, but symlinks don't get ACLs */ + journal_begin(&th, parent_dir->i_sb, jbegin_count) ; retval = reiserfs_new_inode (&th, parent_dir, mode, name, strlen (symname), @@ -951,7 +994,7 @@ static int reiserfs_symlink (struct inod reiserfs_update_inode_transaction(inode) ; reiserfs_update_inode_transaction(parent_dir) ; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &reiserfs_symlink_inode_operations; inode->i_mapping->a_ops = &reiserfs_address_space_operations; // must be sure this inode is written with this transaction @@ -979,7 +1022,6 @@ static int 
reiserfs_link (struct dentry { int retval; struct inode *inode = old_dentry->d_inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; @@ -997,7 +1039,6 @@ static int reiserfs_link (struct dentry inode->i_nlink++; journal_begin(&th, dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_link") ; /* create new entry */ retval = reiserfs_add_entry (&th, dir, dentry->d_name.name, dentry->d_name.len, @@ -1008,7 +1049,6 @@ static int reiserfs_link (struct dentry if (retval) { inode->i_nlink--; - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return retval; @@ -1019,7 +1059,6 @@ static int reiserfs_link (struct dentry atomic_inc(&inode->i_count) ; d_instantiate(dentry, inode); - pop_journal_writer(windex) ; journal_end(&th, dir->i_sb, jbegin_count) ; reiserfs_write_unlock(dir->i_sb); return 0; @@ -1083,7 +1122,6 @@ static int reiserfs_rename (struct inode struct item_head new_entry_ih, old_entry_ih, dot_dot_ih ; struct reiserfs_dir_entry old_de, new_de, dot_dot_de; struct inode * old_inode, * new_dentry_inode; - int windex ; struct reiserfs_transaction_handle th ; int jbegin_count ; umode_t old_inode_mode; @@ -1151,7 +1189,6 @@ static int reiserfs_rename (struct inode } journal_begin(&th, old_dir->i_sb, jbegin_count) ; - windex = push_journal_writer("reiserfs_rename") ; /* add new entry (or find the existing one) */ retval = reiserfs_add_entry (&th, new_dir, new_dentry->d_name.name, new_dentry->d_name.len, @@ -1162,7 +1199,6 @@ static int reiserfs_rename (struct inode "vs-7050: new entry is found, new inode == 0\n"); } } else if (retval) { - pop_journal_writer(windex) ; journal_end(&th, old_dir->i_sb, jbegin_count) ; reiserfs_write_unlock(old_dir->i_sb); return retval; @@ -1303,7 +1339,6 @@ static int reiserfs_rename (struct inode reiserfs_warning ("vs-7060: reiserfs_rename: couldn't not cut old name. Fsck later?\n"); old_dir->i_size -= DEH_SIZE + old_de.de_entrylen; - old_dir->i_blocks = ((old_dir->i_size + 511) >> 9); reiserfs_update_sd (&th, old_dir); reiserfs_update_sd (&th, new_dir); @@ -1315,14 +1350,11 @@ static int reiserfs_rename (struct inode reiserfs_update_sd (&th, new_dentry_inode); } - pop_journal_writer(windex) ; journal_end(&th, old_dir->i_sb, jbegin_count) ; reiserfs_write_unlock(old_dir->i_sb); return 0; } - - /* * directories can handle most operations... */ @@ -1337,5 +1369,28 @@ struct inode_operations reiserfs_dir_ino .rmdir = reiserfs_rmdir, .mknod = reiserfs_mknod, .rename = reiserfs_rename, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, }; +/* + * symlink operations.. 
same as page_symlink_inode_operations, with xattr + * stuff added + */ +struct inode_operations reiserfs_symlink_inode_operations = { + .readlink = page_readlink, + .follow_link = page_follow_link, + .setattr = reiserfs_setattr, + .setxattr = reiserfs_setxattr, + .getxattr = reiserfs_getxattr, + .listxattr = reiserfs_listxattr, + .removexattr = reiserfs_removexattr, + .permission = reiserfs_permission, + +}; + + diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/objectid.c 2004-03-19 16:10:24.009965624 +0100 @@ -86,7 +86,6 @@ __u32 reiserfs_get_unused_objectid (stru } journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; return unused_objectid; } @@ -105,8 +104,6 @@ void reiserfs_release_objectid (struct r reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; journal_mark_dirty(th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 1; - /* start at the beginning of the objectid map (i = 0) and go to the end of it (i = disk_sb->s_oid_cursize). Linear search is diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c 2004-03-11 03:55:34.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/prints.c 2004-03-19 16:10:24.014965085 +0100 @@ -333,7 +333,6 @@ extern struct tree_balance * cur_tb; void reiserfs_panic (struct super_block * sb, const char * fmt, ...) { - show_reiserfs_locks() ; do_reiserfs_warning(fmt); printk ( KERN_EMERG "%s", error_buf); BUG (); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c 2004-03-11 03:55:24.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/procfs.c 2004-03-19 16:10:24.008965731 +0100 @@ -87,7 +87,7 @@ static int show_super(struct seq_file *m struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" - "mount options: \t%s%s%s%s%s%s%s%s%s%s%s%s\n" + "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" "s_kmallocs: \t%i\n" "s_disk_reads: \t%i\n" @@ -131,7 +131,6 @@ static int show_super(struct seq_file *m reiserfs_test4( sb ) ? "TEST4 " : "", have_large_tails( sb ) ? "TAILS " : have_small_tails(sb)?"SMALL_TAILS ":"NO_TAILS ", replay_only( sb ) ? "REPLAY_ONLY " : "", - reiserfs_dont_log( sb ) ? "DONT_LOG " : "LOG ", convert_reiserfs( sb ) ? 
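/* mounted -o conv: 3.5 format being upgraded to 3.6 on the fly */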
"CONV " : "", atomic_read( &r -> s_generation_counter ), @@ -370,7 +369,6 @@ static int show_journal(struct seq_file "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%lu\n" "j_trans_start_time: \t%li\n" - "j_journal_list_index: \t%i\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" @@ -416,7 +414,6 @@ static int show_journal(struct seq_file JF( j_first_unflushed_offset ), JF( j_last_flush_trans_id ), JF( j_trans_start_time ), - JF( j_journal_list_index ), JF( j_list_bitmap_index ), JF( j_must_wait ), JF( j_next_full_flush ), diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/stree.c 2004-03-19 16:10:24.019964547 +0100 @@ -60,6 +60,7 @@ #include #include #include +#include /* Does the buffer contain a disk block which is in the tree. */ inline int B_IS_IN_TREE (const struct buffer_head * p_s_bh) @@ -71,9 +72,6 @@ inline int B_IS_IN_TREE (const struct bu return ( B_LEVEL (p_s_bh) != FREE_LEVEL ); } - - - inline void copy_short_key (void * to, const void * from) { memcpy (to, from, SHORT_KEY_SIZE); @@ -1125,8 +1123,7 @@ static char prepare_for_delete_or_cut( tmp = get_block_num(p_n_unfm_pointer,0); put_block_num(p_n_unfm_pointer, 0, 0); journal_mark_dirty (th, p_s_sb, p_s_bh); - inode->i_blocks -= p_s_sb->s_blocksize / 512; - reiserfs_free_block(th, tmp); + reiserfs_free_block(th, inode, tmp, 1); if ( item_moved (&s_ih, p_s_path) ) { need_research = 1; break ; @@ -1155,8 +1152,7 @@ static char prepare_for_delete_or_cut( } } - -/* Calculate bytes number which will be deleted or cutted in the balance. */ +/* Calculate number of bytes which will be deleted or cut during balance */ int calc_deleted_bytes_number( struct tree_balance * p_s_tb, char c_mode @@ -1167,14 +1163,14 @@ int calc_deleted_bytes_number( if ( is_statdata_le_ih (p_le_ih) ) return 0; + n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; if ( is_direntry_le_ih (p_le_ih) ) { // return EMPTY_DIR_SIZE; /* We delete emty directoris only. */ // we can't use EMPTY_DIR_SIZE, as old format dirs have a different // empty size. ick. FIXME, is this right? // - return ih_item_len(p_le_ih); + return n_del_size ; } - n_del_size = ( c_mode == M_DELETE ) ? ih_item_len(p_le_ih) : -p_s_tb->insert_size[0]; if ( is_indirect_le_ih (p_le_ih) ) n_del_size = (n_del_size/UNFM_P_SIZE)* @@ -1208,17 +1204,46 @@ void padd_item (char * item, int total_l item [--i] = 0; } +#ifdef REISERQUOTA_DEBUG +char key2type(struct key *ih) +{ + if (is_direntry_le_key(2, ih)) + return 'd'; + if (is_direct_le_key(2, ih)) + return 'D'; + if (is_indirect_le_key(2, ih)) + return 'i'; + if (is_statdata_le_key(2, ih)) + return 's'; + return 'u'; +} + +char head2type(struct item_head *ih) +{ + if (is_direntry_le_ih(ih)) + return 'd'; + if (is_direct_le_ih(ih)) + return 'D'; + if (is_indirect_le_ih(ih)) + return 'i'; + if (is_statdata_le_ih(ih)) + return 's'; + return 'u'; +} +#endif /* Delete object item. */ int reiserfs_delete_item (struct reiserfs_transaction_handle *th, struct path * p_s_path, /* Path to the deleted item. */ const struct cpu_key * p_s_item_key, /* Key to search for the deleted item. 
*/ - struct inode * p_s_inode,/* inode is here just to update i_blocks */ + struct inode * p_s_inode,/* inode is here just to update i_blocks and quotas */ struct buffer_head * p_s_un_bh) /* NULL or unformatted node pointer. */ { struct super_block * p_s_sb = p_s_inode->i_sb; struct tree_balance s_del_balance; struct item_head s_ih; + struct item_head *q_ih; + int quota_cut_bytes; int n_ret_value, n_del_size, n_removed; @@ -1268,6 +1293,22 @@ int reiserfs_delete_item (struct reiserf // reiserfs_delete_item returns item length when success n_ret_value = calc_deleted_bytes_number(&s_del_balance, M_DELETE); + q_ih = get_ih(p_s_path) ; + quota_cut_bytes = ih_item_len(q_ih) ; + + /* hack so the quota code doesn't have to guess if the file + ** has a tail. On tail insert, we allocate quota for 1 unformatted node. + ** We test the offset because the tail might have been + ** split into multiple items, and we only want to decrement for + ** the unfm node once + */ + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(q_ih)) { + if ((le_ih_k_offset(q_ih) & (p_s_sb->s_blocksize - 1)) == 1) { + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE; + } else { + quota_cut_bytes = 0 ; + } + } if ( p_s_un_bh ) { int off; @@ -1299,10 +1340,14 @@ int reiserfs_delete_item (struct reiserf B_I_PITEM(PATH_PLAST_BUFFER(p_s_path), &s_ih), n_ret_value); kunmap_atomic(data, KM_USER0); } - /* Perform balancing after all resources have been collected at once. */ do_balance(&s_del_balance, NULL, NULL, M_DELETE); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota delete_item(): freeing %u, id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, head2type(&s_ih)); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); + /* Return deleted body length */ return n_ret_value; } @@ -1327,14 +1372,16 @@ int reiserfs_delete_item (struct reiserf /* this deletes item which never gets split */ void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th, + struct inode *inode, struct key * key) { struct tree_balance tb; INITIALIZE_PATH (path); - int item_len; + int item_len = 0; int tb_init = 0 ; struct cpu_key cpu_key; int retval; + int quota_cut_bytes = 0; le_key2cpu_key (&cpu_key, key); @@ -1358,6 +1405,7 @@ void reiserfs_delete_solid_item (struct item_len = ih_item_len( PATH_PITEM_HEAD(&path) ); init_tb_struct (th, &tb, th->t_super, &path, - (IH_SIZE + item_len)); } + quota_cut_bytes = ih_item_len(PATH_PITEM_HEAD(&path)) ; retval = fix_nodes (M_DELETE, &tb, NULL, 0); if (retval == REPEAT_SEARCH) { @@ -1367,6 +1415,12 @@ void reiserfs_delete_solid_item (struct if (retval == CARRY_ON) { do_balance (&tb, 0, 0, M_DELETE); + if (inode) { /* Should we count quota for item? (we don't count quotas for save-links) */ +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota delete_solid_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, inode->i_uid, key2type(key)); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, quota_cut_bytes); + } break; } @@ -1399,7 +1453,7 @@ void reiserfs_delete_object (struct reis } /* USE_INODE_GENERATION_COUNTER */ #endif - reiserfs_delete_solid_item (th, INODE_PKEY (inode)); + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); } @@ -1486,12 +1540,14 @@ int reiserfs_cut_from_item (struct reise structure by using the init_tb_struct and fix_nodes functions. After that we can make tree balancing. */ struct tree_balance s_cut_balance; + struct item_head *p_le_ih; int n_cut_size = 0, /* Amount to be cut. 
*/ n_ret_value = CARRY_ON, n_removed = 0, /* Number of the removed unformatted nodes. */ n_is_inode_locked = 0; char c_mode; /* Mode of the balance. */ int retval2 = -1; + int quota_cut_bytes; init_tb_struct(th, &s_cut_balance, p_s_inode->i_sb, p_s_path, n_cut_size); @@ -1579,23 +1635,27 @@ int reiserfs_cut_from_item (struct reise RFALSE( c_mode == M_PASTE || c_mode == M_INSERT, "invalid mode"); /* Calculate number of bytes that need to be cut from the item. */ + quota_cut_bytes = ( c_mode == M_DELETE ) ? ih_item_len(get_ih(p_s_path)) : -s_cut_balance.insert_size[0]; if (retval2 == -1) n_ret_value = calc_deleted_bytes_number(&s_cut_balance, c_mode); else n_ret_value = retval2; - - if ( c_mode == M_DELETE ) { - struct item_head * p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); - - if ( is_direct_le_ih (p_le_ih) && (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { - /* we delete first part of tail which was stored in direct - item(s) */ + + + /* For direct items, we only change the quota when deleting the last + ** item. + */ + p_le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); + if (!S_ISLNK (p_s_inode->i_mode) && is_direct_le_ih(p_le_ih)) { + if (c_mode == M_DELETE && + (le_ih_k_offset (p_le_ih) & (p_s_sb->s_blocksize - 1)) == 1 ) { // FIXME: this is to keep 3.5 happy REISERFS_I(p_s_inode)->i_first_direct_byte = U32_MAX; - p_s_inode->i_blocks -= p_s_sb->s_blocksize / 512; + quota_cut_bytes = p_s_sb->s_blocksize + UNFM_P_SIZE ; + } else { + quota_cut_bytes = 0 ; } } - #ifdef CONFIG_REISERFS_CHECK if (n_is_inode_locked) { struct item_head * le_ih = PATH_PITEM_HEAD (s_cut_balance.tb_path); @@ -1630,10 +1690,13 @@ int reiserfs_cut_from_item (struct reise */ REISERFS_I(p_s_inode)->i_flags &= ~i_pack_on_close_mask ; } +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota cut_from_item(): freeing %u id=%u type=%c\n", quota_cut_bytes, p_s_inode->i_uid, '?'); +#endif + DQUOT_FREE_SPACE_NODIRTY(p_s_inode, quota_cut_bytes); return n_ret_value; } - static void truncate_directory (struct reiserfs_transaction_handle *th, struct inode * inode) { if (inode->i_nlink) @@ -1641,8 +1704,8 @@ static void truncate_directory (struct r set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), DOT_OFFSET); set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_DIRENTRY); - reiserfs_delete_solid_item (th, INODE_PKEY (inode)); - + reiserfs_delete_solid_item (th, inode, INODE_PKEY (inode)); + reiserfs_update_sd(th, inode) ; set_le_key_k_offset (KEY_FORMAT_3_5, INODE_PKEY (inode), SD_OFFSET); set_le_key_k_type (KEY_FORMAT_3_5, INODE_PKEY (inode), TYPE_STAT_DATA); } @@ -1809,18 +1872,37 @@ static void check_research_for_paste (st int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th, struct path * p_s_search_path, /* Path to the pasted item. */ const struct cpu_key * p_s_key, /* Key to search for the needed item.*/ + struct inode * inode, /* Inode item belongs to */ const char * p_c_body, /* Pointer to the bytes to paste. */ int n_pasted_size) /* Size of pasted bytes. 
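** (this many bytes are charged against the inode's quota with
** DQUOT_ALLOC_SPACE_NODIRTY before the tree is touched, and are
** released again on the error_out path if the paste fails)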
*/ { struct tree_balance s_paste_balance; int retval; + int fs_gen; + + fs_gen = get_generation(inode->i_sb) ; +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota paste_into_item(): allocating %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, n_pasted_size)) { + pathrelse(p_s_search_path); + return -EDQUOT; + } init_tb_struct(th, &s_paste_balance, th->t_super, p_s_search_path, n_pasted_size); #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_paste_balance.key = p_s_key->on_disk_key; #endif - - while ( (retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == REPEAT_SEARCH ) { + + /* DQUOT_* can schedule, must check before the fix_nodes */ + if (fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } + + while ((retval = fix_nodes(M_PASTE, &s_paste_balance, NULL, p_c_body)) == +REPEAT_SEARCH ) { +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC( th -> t_super, paste_into_item_restarted ); retval = search_for_position_by_key (th->t_super, p_s_key, p_s_search_path); @@ -1849,6 +1931,10 @@ int reiserfs_paste_into_item (struct rei error_out: /* this also releases the path */ unfix_nodes(&s_paste_balance); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota paste_into_item(): freeing %u id=%u type=%c\n", n_pasted_size, inode->i_uid, key2type(&(p_s_key->on_disk_key))); +#endif + DQUOT_FREE_SPACE_NODIRTY(inode, n_pasted_size); return retval ; } @@ -1858,23 +1944,45 @@ int reiserfs_insert_item(struct reiserfs struct path * p_s_path, /* Path to the inserteded item. */ const struct cpu_key * key, struct item_head * p_s_ih, /* Pointer to the item header to insert.*/ + struct inode * inode, const char * p_c_body) /* Pointer to the bytes to insert. */ { struct tree_balance s_ins_balance; int retval; + int fs_gen = 0 ; + int quota_bytes = 0 ; + if (inode) { /* Do we count quotas for item? */ + fs_gen = get_generation(inode->i_sb); + quota_bytes = ih_item_len(p_s_ih); + + /* hack so the quota code doesn't have to guess if the file has + ** a tail, links are always tails, so there's no guessing needed + */ + if (!S_ISLNK (inode->i_mode) && is_direct_le_ih(p_s_ih)) { + quota_bytes = inode->i_sb->s_blocksize + UNFM_P_SIZE ; + } +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota insert_item(): allocating %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + /* We can't dirty inode here. It would be immediately written but + * appropriate stat item isn't inserted yet... 
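* The _NODIRTY quota variants exist for exactly this case: charge the
* space without marking the inode dirty.  And since the quota calls may
* schedule, the pattern used here (same names as the code below) is:
*
*	fs_gen = get_generation(inode->i_sb);
*	if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes))
*		return -EDQUOT;
*	if (fs_changed(fs_gen, inode->i_sb))
*		goto search_again;	// research before fix_nodes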
*/ + if (DQUOT_ALLOC_SPACE_NODIRTY(inode, quota_bytes)) { + pathrelse(p_s_path); + return -EDQUOT; + } + } init_tb_struct(th, &s_ins_balance, th->t_super, p_s_path, IH_SIZE + ih_item_len(p_s_ih)); #ifdef DISPLACE_NEW_PACKING_LOCALITIES s_ins_balance.key = key->on_disk_key; #endif - - /* - if (p_c_body == 0) - n_zeros_num = ih_item_len(p_s_ih); - */ - // le_key2cpu_key (&key, &(p_s_ih->ih_key)); + /* DQUOT_* can schedule, must check to be sure calling fix_nodes is safe */ + if (inode && fs_changed(fs_gen, inode->i_sb)) { + goto search_again; + } while ( (retval = fix_nodes(M_INSERT, &s_ins_balance, p_s_ih, p_c_body)) == REPEAT_SEARCH) { +search_again: /* file system changed while we were in the fix_nodes */ PROC_INFO_INC( th -> t_super, insert_item_restarted ); retval = search_item (th->t_super, key, p_s_path); @@ -1889,7 +1997,7 @@ int reiserfs_insert_item(struct reiserfs goto error_out; } } - + /* make balancing after all resources will be collected at a time */ if ( retval == CARRY_ON ) { do_balance (&s_ins_balance, p_s_ih, p_c_body, M_INSERT); @@ -1900,6 +2008,11 @@ int reiserfs_insert_item(struct reiserfs error_out: /* also releases the path */ unfix_nodes(&s_ins_balance); +#ifdef REISERQUOTA_DEBUG + printk(KERN_DEBUG "reiserquota insert_item(): freeing %u id=%u type=%c\n", quota_bytes, inode->i_uid, head2type(p_s_ih)); +#endif + if (inode) + DQUOT_FREE_SPACE_NODIRTY(inode, quota_bytes) ; return retval; } diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/super.c linux-2.6.5-rc1-mm2/fs/reiserfs/super.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/super.c 2004-03-11 03:55:26.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/super.c 2004-03-19 16:30:24.719646136 +0100 @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -59,22 +61,26 @@ static int is_any_reiserfs_magic_string static int reiserfs_remount (struct super_block * s, int * flags, char * data); static int reiserfs_statfs (struct super_block * s, struct kstatfs * buf); -static void reiserfs_write_super (struct super_block * s) +static void reiserfs_sync_fs (struct super_block * s) { + if (!(s->s_flags & MS_RDONLY)) { + struct reiserfs_transaction_handle th; + reiserfs_write_lock(s); + journal_begin(&th, s, 1); + journal_end_sync(&th, s, 1); + reiserfs_flush_old_commits(s); + s->s_dirt = 0; + reiserfs_write_unlock(s); + } +} - int dirty = 0 ; - reiserfs_write_lock(s); - if (!(s->s_flags & MS_RDONLY)) { - dirty = flush_old_commits(s, 1) ; - } - s->s_dirt = dirty; - reiserfs_write_unlock(s); +static void reiserfs_write_super(struct super_block *s) +{ + reiserfs_sync_fs(s); } static void reiserfs_write_super_lockfs (struct super_block * s) { - - int dirty = 0 ; struct reiserfs_transaction_handle th ; reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -84,7 +90,7 @@ static void reiserfs_write_super_lockfs reiserfs_block_writes(&th) ; journal_end(&th, s, 1) ; } - s->s_dirt = dirty; + s->s_dirt = 0; reiserfs_write_unlock(s); } @@ -109,7 +115,7 @@ static void remove_save_link_only (struc /* we are going to do one balancing */ journal_begin (&th, s, JOURNAL_PER_BALANCE_CNT); - reiserfs_delete_solid_item (&th, key); + reiserfs_delete_solid_item (&th, NULL, key); if (oid_free) /* removals are protected by direct items */ reiserfs_release_objectid (&th, le32_to_cpu (key->k_objectid)); @@ -295,8 +301,8 @@ void add_save_link (struct reiserfs_tran /* body of "save" link */ link = INODE_PKEY (inode)->k_dir_id; - /* put "save" link inot tree */ - retval = 
reiserfs_insert_item (th, &path, &key, &ih, (char *)&link); + /* put "save" link inot tree, don't charge quota to anyone */ + retval = reiserfs_insert_item (th, &path, &key, &ih, NULL, (char *)&link); if (retval) { if (retval != -ENOSPC) reiserfs_warning ("vs-2120: add_save_link: insert_item returned %d\n", @@ -338,7 +344,8 @@ void remove_save_link (struct inode * in ( REISERFS_I(inode) -> i_flags & i_link_saved_truncate_mask ) ) || ( !truncate && ( REISERFS_I(inode) -> i_flags & i_link_saved_unlink_mask ) ) ) - reiserfs_delete_solid_item (&th, &key); + /* don't take quota bytes from anywhere */ + reiserfs_delete_solid_item (&th, NULL, &key); if (!truncate) { reiserfs_release_objectid (&th, inode->i_ino); REISERFS_I(inode) -> i_flags &= ~i_link_saved_unlink_mask; @@ -353,7 +360,17 @@ static void reiserfs_put_super (struct s { int i; struct reiserfs_transaction_handle th ; + + if (REISERFS_SB(s)->xattr_root) { + d_invalidate (REISERFS_SB(s)->xattr_root); + dput (REISERFS_SB(s)->xattr_root); + } + if (REISERFS_SB(s)->priv_root) { + d_invalidate (REISERFS_SB(s)->priv_root); + dput (REISERFS_SB(s)->priv_root); + } + /* change file system state to current state if it was mounted with read-write permissions */ if (!(s->s_flags & MS_RDONLY)) { journal_begin(&th, s, 10) ; @@ -418,6 +435,8 @@ static void init_once(void * foo, kmem_c SLAB_CTOR_CONSTRUCTOR) { INIT_LIST_HEAD(&ei->i_prealloc_list) ; inode_init_once(&ei->vfs_inode); + ei->i_acl_access = NULL; + ei->i_acl_default = NULL; } } @@ -458,6 +477,22 @@ static void reiserfs_dirty_inode (struct reiserfs_write_unlock(inode->i_sb); } +static void reiserfs_clear_inode (struct inode *inode) +{ + struct posix_acl *acl; + + acl = REISERFS_I(inode)->i_acl_access; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_access = NULL; + + acl = REISERFS_I(inode)->i_acl_default; + if (acl && !IS_ERR (acl)) + posix_acl_release (acl); + REISERFS_I(inode)->i_acl_default = NULL; +} + + struct super_operations reiserfs_sops = { .alloc_inode = reiserfs_alloc_inode, @@ -465,6 +500,7 @@ struct super_operations reiserfs_sops = .write_inode = reiserfs_write_inode, .dirty_inode = reiserfs_dirty_inode, .delete_inode = reiserfs_delete_inode, + .clear_inode = reiserfs_clear_inode, .put_super = reiserfs_put_super, .write_super = reiserfs_write_super, .write_super_lockfs = reiserfs_write_super_lockfs, @@ -506,6 +542,21 @@ typedef struct { applied BEFORE setmask */ } opt_desc_t; +/* possible values for -o data= */ +static const arg_desc_t logging_mode[] = { + {"ordered", 1<s_mount_opt &= ~((1 << REISERFS_DATA_LOG) | + (1 << REISERFS_DATA_ORDERED) | + (1 << REISERFS_DATA_WRITEBACK)); + REISERFS_SB(s)->s_mount_opt |= (1 << mode); +} + +static void handle_data_mode(struct super_block *s, unsigned long mount_options) +{ + if (mount_options & (1 << REISERFS_DATA_LOG)) { + if (!reiserfs_data_log(s)) { + switch_data_mode(s, REISERFS_DATA_LOG); + printk("reiserfs: switching to journaled data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_ORDERED)) { + if (!reiserfs_data_ordered(s)) { + switch_data_mode(s, REISERFS_DATA_ORDERED); + printk("reiserfs: switching to ordered data mode\n"); + } + } else if (mount_options & (1 << REISERFS_DATA_WRITEBACK)) { + if (!reiserfs_data_writeback(s)) { + switch_data_mode(s, REISERFS_DATA_WRITEBACK); + printk("reiserfs: switching to writeback data mode\n"); + } + } +} + +static void handle_barrier_mode(struct super_block *s, unsigned long bits) { + int flush = (1 << REISERFS_BARRIER_FLUSH); + int none = (1 
<< REISERFS_BARRIER_NONE); + int all_barrier = flush | none; + + if (bits & all_barrier) { + REISERFS_SB(s)->s_mount_opt &= ~all_barrier; + if (bits & flush) { + REISERFS_SB(s)->s_mount_opt |= flush; + printk("reiserfs: enabling write barrier flush mode\n"); + } else if (bits & none) { + REISERFS_SB(s)->s_mount_opt |= none; + printk("reiserfs: write barriers turned off\n"); + } + } +} + static void handle_attrs( struct super_block *s ) { struct reiserfs_super_block * rs; @@ -775,6 +880,10 @@ static int reiserfs_remount (struct supe safe_mask |= 1 << REISERFS_HASHED_RELOCATION; safe_mask |= 1 << REISERFS_TEST4; safe_mask |= 1 << REISERFS_ATTRS; + safe_mask |= 1 << REISERFS_XATTRS_USER; + safe_mask |= 1 << REISERFS_POSIXACL; + safe_mask |= 1 << REISERFS_BARRIER_FLUSH; + safe_mask |= 1 << REISERFS_BARRIER_NONE; /* Update the bitmask, taking care to keep * the bits we're not allowed to change here */ @@ -791,6 +900,7 @@ static int reiserfs_remount (struct supe } if (*mount_flags & MS_RDONLY) { + reiserfs_xattr_init (s, *mount_flags); /* remount read-only */ if (s->s_flags & MS_RDONLY) /* it is read-only already */ @@ -805,12 +915,15 @@ static int reiserfs_remount (struct supe reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1) ; set_sb_umount_state( rs, REISERFS_SB(s)->s_mount_state ); journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; } else { /* remount read-write */ - if (!(s->s_flags & MS_RDONLY)) + if (!(s->s_flags & MS_RDONLY)) { + reiserfs_xattr_init (s, *mount_flags); return 0; /* We are read-write already */ + } + handle_data_mode(s, mount_options); + handle_barrier_mode(s, mount_options); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs) ; s->s_flags &= ~MS_RDONLY ; /* now it is safe to call journal_begin */ journal_begin(&th, s, 10) ; @@ -822,15 +935,17 @@ static int reiserfs_remount (struct supe set_sb_umount_state( rs, REISERFS_ERROR_FS ); /* mark_buffer_dirty (SB_BUFFER_WITH_SB (s), 1); */ journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); - s->s_dirt = 0; REISERFS_SB(s)->s_mount_state = REISERFS_VALID_FS ; } /* this will force a full flush of all journal lists */ SB_JOURNAL(s)->j_must_wait = 1 ; journal_end(&th, s, 10) ; + s->s_dirt = 0; - if (!( *mount_flags & MS_RDONLY ) ) + if (!( *mount_flags & MS_RDONLY ) ) { finish_unfinished( s ); + reiserfs_xattr_init (s, *mount_flags); + } return 0; } @@ -1258,8 +1373,10 @@ static int reiserfs_fill_super (struct s REISERFS_SB(s)->s_alloc_options.bits = ( 1 << 5); /* If file grew past 4 blocks, start preallocation blocks for it. 
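** (the option is stored as n+1, so the preallocsize of 17 set below
** preallocates 16 blocks at a time, up from the old 8)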
*/ REISERFS_SB(s)->s_alloc_options.preallocmin = 4; - /* Preallocate by 8 blocks (9-1) at once */ - REISERFS_SB(s)->s_alloc_options.preallocsize = 9; + /* Preallocate by 16 blocks (17-1) at once */ + REISERFS_SB(s)->s_alloc_options.preallocsize = 17; + /* Initialize the rwsem for xattr dir */ + init_rwsem(&REISERFS_SB(s)->xattr_dir_sem); jdev_name = NULL; if (reiserfs_parse_options (s, (char *) data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age) == 0) { @@ -1303,6 +1420,24 @@ static int reiserfs_fill_super (struct s SPRINTK(silent, "reiserfs:warning: - it is slow mode for debugging.\n"); #endif + /* make data=ordered the default */ + if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) && + !reiserfs_data_writeback(s)) + { + REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED); + } + + if (reiserfs_data_log(s)) { + printk("reiserfs: using journaled data mode\n"); + } else if (reiserfs_data_ordered(s)) { + printk("reiserfs: using ordered data mode\n"); + } else { + printk("reiserfs: using writeback data mode\n"); + } + if (reiserfs_barrier_flush(s)) { + printk("reiserfs: using flush barriers\n"); + } + // set_device_ro(s->s_dev, 1) ; if( journal_init(s, jdev_name, old_format, commit_max_age) ) { SPRINTK(silent, "sh-2022: reiserfs_fill_super: unable to initialize journal space\n") ; @@ -1389,15 +1524,25 @@ static int reiserfs_fill_super (struct s journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB (s)); journal_end(&th, s, 1) ; - + + if (reiserfs_xattr_init (s, s->s_flags)) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } + /* look for files which were to be removed in previous session */ finish_unfinished (s); - - s->s_dirt = 0; } else { if ( old_format_only(s) && !silent) { reiserfs_warning("reiserfs: using 3.5.x disk format\n") ; } + + if (reiserfs_xattr_init (s, s->s_flags)) { + dput (s->s_root); + s->s_root = NULL; + goto error; + } } // mark hash in super block: it could be unset. 
overwrite should be ok set_sb_hash_function_code( rs, function2code(sbi->s_hash_function ) ); @@ -1465,6 +1610,9 @@ init_reiserfs_fs ( void ) return ret; } + if ((ret = reiserfs_xattr_register_handlers ())) + goto failed_reiserfs_xattr_register_handlers; + reiserfs_proc_info_global_init (); reiserfs_proc_register_global ("version", reiserfs_global_version_in_proc); @@ -1474,6 +1622,9 @@ init_reiserfs_fs ( void ) return 0; } + reiserfs_xattr_unregister_handlers (); + +failed_reiserfs_xattr_register_handlers: reiserfs_proc_unregister_global ("version"); reiserfs_proc_info_global_done (); destroy_inodecache (); @@ -1484,6 +1635,7 @@ init_reiserfs_fs ( void ) static void __exit exit_reiserfs_fs ( void ) { + reiserfs_xattr_unregister_handlers (); reiserfs_proc_unregister_global ("version"); reiserfs_proc_info_global_done (); unregister_filesystem (& reiserfs_fs_type); diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c 2004-03-11 03:55:56.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/tail_conversion.c 2004-03-19 16:10:24.020964439 +0100 @@ -66,11 +66,11 @@ int direct2indirect (struct reiserfs_tra set_ih_free_space (&ind_ih, 0); /* delete at nearest future */ put_ih_item_len( &ind_ih, UNFM_P_SIZE ); PATH_LAST_POSITION (path)++; - n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, + n_retval = reiserfs_insert_item (th, path, &end_key, &ind_ih, inode, (char *)&unfm_ptr); } else { /* Paste into last indirect item of an object. */ - n_retval = reiserfs_paste_into_item(th, path, &end_key, + n_retval = reiserfs_paste_into_item(th, path, &end_key, inode, (char *)&unfm_ptr, UNFM_P_SIZE); } if ( n_retval ) { @@ -143,16 +143,17 @@ void reiserfs_unmap_buffer(struct buffer if (buffer_journaled(bh) || buffer_journal_dirty(bh)) { BUG() ; } - clear_buffer_dirty(bh) ; lock_buffer(bh) ; + clear_buffer_dirty(bh) ; /* Remove the buffer from whatever list it belongs to. We are mostly interested in removing it from per-sb j_dirty_buffers list, to avoid BUG() on attempt to write not mapped buffer */ - if ( !list_empty(&bh->b_assoc_buffers) && bh->b_page) { + if ( (!list_empty(&bh->b_assoc_buffers) || bh->b_private) && bh->b_page) { struct inode *inode = bh->b_page->mapping->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); + reiserfs_free_jh(bh); spin_unlock(&j->j_dirty_buffers_lock); } clear_buffer_mapped(bh) ; @@ -275,7 +276,7 @@ int indirect2direct (struct reiserfs_tra set_cpu_key_k_type (&key, TYPE_DIRECT); key.key_length = 4; /* Insert tail as new direct item in the tree */ - if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, + if ( reiserfs_insert_item(th, p_s_path, &key, &s_ih, p_s_inode, tail ? tail : NULL) < 0 ) { /* No disk memory. So we can not convert last unformatted node to the direct item. In this case we used to adjust @@ -293,13 +294,15 @@ int indirect2direct (struct reiserfs_tra */ unmap_buffers(page, pos1) ; + /* make sure to get the i_blocks changes from reiserfs_insert_item */ + reiserfs_update_sd(th, p_s_inode); + // note: we have now the same as in above direct2indirect // conversion: there are two keys which have matching first three // key components. They only differ by the fouhth one. /* We have inserted new direct item and must remove last unformatted node. 
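** (the removal itself is the caller's job: *p_c_mode is set to M_CUT
** below, and the manual i_blocks adjustment is gone now that
** reiserfs_update_sd() and the quota code account for the space)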
*/ - p_s_inode->i_blocks += (p_s_sb->s_blocksize / 512); *p_c_mode = M_CUT; /* we store position of first direct item in the in-core inode */ diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_acl.c 2004-03-19 16:10:24.027963686 +0100 @@ -0,0 +1,563 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +xattr_set_acl(struct inode *inode, int type, const void *value, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) { + return PTR_ERR(acl); + } else if (acl) { + error = posix_acl_valid(acl); + if (error) + goto release_and_out; + } + } else + acl = NULL; + + error = reiserfs_set_acl (inode, type, acl); + +release_and_out: + posix_acl_release(acl); + return error; +} + + +static int +xattr_get_acl(struct inode *inode, int type, void *buffer, size_t size) +{ + struct posix_acl *acl; + int error; + + if (!reiserfs_posixacl(inode->i_sb)) + return -EOPNOTSUPP; + + acl = reiserfs_get_acl (inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + error = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return error; +} + + +/* + * Convert from filesystem to in-memory representation. + */ +static struct posix_acl * +posix_acl_from_disk(const void *value, size_t size) +{ + const char *end = (char *)value + size; + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(reiserfs_acl_header)) + return ERR_PTR(-EINVAL); + if (((reiserfs_acl_header *)value)->a_version != + cpu_to_le32(REISERFS_ACL_VERSION)) + return ERR_PTR(-EINVAL); + value = (char *)value + sizeof(reiserfs_acl_header); + count = reiserfs_acl_count(size); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n=0; n < count; n++) { + reiserfs_acl_entry *entry = + (reiserfs_acl_entry *)value; + if ((char *)value + sizeof(reiserfs_acl_entry_short) > end) + goto fail; + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + value = (char *)value + + sizeof(reiserfs_acl_entry_short); + acl->a_entries[n].e_id = ACL_UNDEFINED_ID; + break; + + case ACL_USER: + case ACL_GROUP: + value = (char *)value + sizeof(reiserfs_acl_entry); + if ((char *)value > end) + goto fail; + acl->a_entries[n].e_id = + le32_to_cpu(entry->e_id); + break; + + default: + goto fail; + } + } + if (value != end) + goto fail; + return acl; + +fail: + posix_acl_release(acl); + return ERR_PTR(-EINVAL); +} + +/* + * Convert from in-memory to filesystem representation. 
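* The disk format is a little-endian version header followed by one
* entry per ACE: ACL_USER and ACL_GROUP entries carry an e_id, while
* the USER_OBJ/GROUP_OBJ/MASK/OTHER tags use the short entry with the
* id omitted.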
+ */ +static void * +posix_acl_to_disk(const struct posix_acl *acl, size_t *size) +{ + reiserfs_acl_header *ext_acl; + char *e; + int n; + + *size = reiserfs_acl_size(acl->a_count); + ext_acl = (reiserfs_acl_header *)kmalloc(sizeof(reiserfs_acl_header) + + acl->a_count * sizeof(reiserfs_acl_entry), GFP_NOFS); + if (!ext_acl) + return ERR_PTR(-ENOMEM); + ext_acl->a_version = cpu_to_le32(REISERFS_ACL_VERSION); + e = (char *)ext_acl + sizeof(reiserfs_acl_header); + for (n=0; n < acl->a_count; n++) { + reiserfs_acl_entry *entry = (reiserfs_acl_entry *)e; + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + switch(acl->a_entries[n].e_tag) { + case ACL_USER: + case ACL_GROUP: + entry->e_id = + cpu_to_le32(acl->a_entries[n].e_id); + e += sizeof(reiserfs_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + e += sizeof(reiserfs_acl_entry_short); + break; + + default: + goto fail; + } + } + return (char *)ext_acl; + +fail: + kfree(ext_acl); + return ERR_PTR(-EINVAL); +} + +/* + * Inode operation get_posix_acl(). + * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +struct posix_acl * +reiserfs_get_acl(struct inode *inode, int type) +{ + char *name, *value; + struct posix_acl *acl, **p_acl; + size_t size; + int retval; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + break; + default: + return ERR_PTR (-EINVAL); + } + + if (IS_ERR (*p_acl)) { + if (PTR_ERR (*p_acl) == -ENODATA) + return NULL; + } else if (*p_acl != NULL) + return posix_acl_dup (*p_acl); + + size = reiserfs_xattr_get (inode, name, NULL, 0); + if ((int)size < 0) { + if (size == -ENODATA || size == -ENOSYS) { + *p_acl = ERR_PTR (-ENODATA); + return NULL; + } + return ERR_PTR (size); + } + + value = kmalloc (size, GFP_NOFS); + if (!value) + return ERR_PTR (-ENOMEM); + + retval = reiserfs_xattr_get(inode, name, value, size); + if (retval == -ENODATA || retval == -ENOSYS) { + /* This shouldn't actually happen as it should have + been caught above.. but just in case */ + acl = NULL; + *p_acl = ERR_PTR (-ENODATA); + } else if (retval < 0) { + acl = ERR_PTR(retval); + } else { + acl = posix_acl_from_disk(value, retval); + *p_acl = posix_acl_dup (acl); + } + + kfree(value); + return acl; +} + +/* + * Inode operation set_posix_acl(). + * + * inode->i_sem: down + * BKL held [before 2.5.x] + */ +int +reiserfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) +{ + char *name; + void *value = NULL; + struct posix_acl **p_acl; + size_t size; + int error; + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_ACL_ACCESS; + p_acl = &reiserfs_i->i_acl_access; + if (acl) { + mode_t mode = inode->i_mode; + error = posix_acl_equiv_mode (acl, &mode); + if (error < 0) + return error; + else { + inode->i_mode = mode; + if (error == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_ACL_DEFAULT; + p_acl = &reiserfs_i->i_acl_default; + if (!S_ISDIR (inode->i_mode)) + return acl ? 
-EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = posix_acl_to_disk(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + error = reiserfs_xattr_set(inode, name, value, size, 0); + } else { + error = reiserfs_xattr_del (inode, name); + if (error == -ENODATA) + error = 0; + } + + if (value) + kfree(value); + + if (!error) { + /* Release the old one */ + if (!IS_ERR (*p_acl) && *p_acl) + posix_acl_release (*p_acl); + + if (acl == NULL) + *p_acl = ERR_PTR (-ENODATA); + else + *p_acl = posix_acl_dup (acl); + } + + return error; +} + +/* dir->i_sem: down, + * inode is new and not released into the wild yet */ +int +reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode) +{ + struct posix_acl *acl; + int err = 0; + + /* ACLs only get applied to files and directories */ + if (S_ISLNK (inode->i_mode)) + return 0; + + /* ACLs can only be used on "new" objects, so if it's an old object + * there is nothing to inherit from */ + if (get_inode_sd_version (dir) == STAT_DATA_V1) + goto apply_umask; + + /* Don't apply ACLs to objects in the .reiserfs_priv tree.. This + * would be useless since permissions are ignored, and a pain because + * it introduces locking cycles */ + if (is_reiserfs_priv_object (dir)) { + REISERFS_I(inode)->i_flags |= i_priv_object; + goto apply_umask; + } + + acl = reiserfs_get_acl (dir, ACL_TYPE_DEFAULT); + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto apply_umask; + return PTR_ERR (acl); + } + + if (acl) { + struct posix_acl *acl_copy; + mode_t mode = inode->i_mode; + int need_acl; + + /* Copy the default ACL to the default ACL of a new directory */ + if (S_ISDIR (inode->i_mode)) { + err = reiserfs_set_acl (inode, ACL_TYPE_DEFAULT, acl); + if (err) + goto cleanup; + } + + /* Now we reconcile the new ACL and the mode, + potentially modifying both */ + acl_copy = posix_acl_clone (acl, GFP_NOFS); + if (!acl_copy) { + err = -ENOMEM; + goto cleanup; + } + + + need_acl = posix_acl_create_masq (acl_copy, &mode); + if (need_acl >= 0) { + if (mode != inode->i_mode) { + inode->i_mode = mode; + } + + /* If we need an ACL.. */ + if (need_acl > 0) { + err = reiserfs_set_acl (inode, ACL_TYPE_ACCESS, acl_copy); + if (err) + goto cleanup_copy; + } + } +cleanup_copy: + posix_acl_release (acl_copy); +cleanup: + posix_acl_release (acl); + } else { +apply_umask: + /* no ACL, apply umask */ + inode->i_mode &= ~current->fs->umask; + } + + return err; +} + +/* Looks up and caches the result of the default ACL. + * We do this so that we don't need to carry the xattr_sem into + * reiserfs_new_inode if we don't need to */ +int +reiserfs_cache_default_acl (struct inode *inode) +{ + int ret = 0; + if (reiserfs_posixacl (inode->i_sb) && + !is_reiserfs_priv_object (inode)) { + struct posix_acl *acl; + reiserfs_read_lock_xattr_i (inode); + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl (inode, ACL_TYPE_DEFAULT); + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + ret = acl ? 
1 : 0; + posix_acl_release (acl); + } + + return ret; +} + +int +reiserfs_acl_chmod (struct inode *inode) +{ + struct posix_acl *acl, *clone; + int error; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_posixacl(inode->i_sb)) + { + return 0; + } + + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (!acl) + return 0; + if (IS_ERR(acl)) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_NOFS); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + error = posix_acl_chmod_masq(clone, inode->i_mode); + if (!error) { + int lock = !has_xattr_dir (inode); + reiserfs_write_lock_xattr_i (inode); + if (lock) + reiserfs_write_lock_xattrs (inode->i_sb); + else + reiserfs_read_lock_xattrs (inode->i_sb); + error = reiserfs_set_acl(inode, ACL_TYPE_ACCESS, clone); + if (lock) + reiserfs_write_unlock_xattrs (inode->i_sb); + else + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_write_unlock_xattr_i (inode); + } + posix_acl_release(clone); + return error; +} + +static int +posix_acl_access_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int +posix_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int +posix_acl_access_del (struct inode *inode, const char *name) +{ + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_access; + if (strlen(name) != sizeof(XATTR_NAME_ACL_ACCESS)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_access_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_access_handler = { + prefix: XATTR_NAME_ACL_ACCESS, + get: posix_acl_access_get, + set: posix_acl_access_set, + del: posix_acl_access_del, + list: posix_acl_access_list, +}; + +static int +posix_acl_default_get (struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int +posix_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + return xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int +posix_acl_default_del (struct inode *inode, const char *name) +{ + struct reiserfs_inode_info *reiserfs_i = REISERFS_I(inode); + struct posix_acl **acl = &reiserfs_i->i_acl_default; + if (strlen(name) != sizeof(XATTR_NAME_ACL_DEFAULT)-1) + return -EINVAL; + if (!IS_ERR (*acl) && *acl) { + posix_acl_release (*acl); + *acl = ERR_PTR (-ENODATA); + } + + return 0; +} + +static int +posix_acl_default_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_posixacl (inode->i_sb)) + return 0; + if (out) 
+ memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler posix_acl_default_handler = { + prefix: XATTR_NAME_ACL_DEFAULT, + get: posix_acl_default_get, + set: posix_acl_default_set, + del: posix_acl_default_del, + list: posix_acl_default_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr.c 2004-03-19 16:10:24.023964116 +0100 @@ -0,0 +1,1440 @@ +/* + * linux/fs/reiserfs/xattr.c + * + * Copyright (c) 2002 by Jeff Mahoney, + * + */ + +/* + * In order to implement EA/ACLs in a clean, backwards compatible manner, + * they are implemented as files in a "private" directory. + * Each EA is in its own file, with the directory layout like so (/ is assumed + * to be relative to fs root). Inside the /.reiserfs_priv/xattrs directory, + * directories named using the capital-hex form of the objectid and + * generation number are used. Inside each directory are individual files + * named with the name of the extended attribute. + * + * So, for objectid 12648430, we could have: + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_access + * /.reiserfs_priv/xattrs/C0FFEE.0/system.posix_acl_default + * /.reiserfs_priv/xattrs/C0FFEE.0/user.Content-Type + * .. or similar. + * + * The file contents are the text of the EA. The size is known based on the + * stat data describing the file. + * + * In the case of system.posix_acl_access and system.posix_acl_default, since + * these are special cases for filesystem ACLs, they are interpreted by the + * kernel; in addition, they are negatively and positively cached and attached + * to the inode so that unnecessary lookups are avoided. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FL_READONLY 128 +#define FL_DIR_SEM_HELD 256 +#define PRIVROOT_NAME ".reiserfs_priv" +#define XAROOT_NAME "xattrs" + +static struct reiserfs_xattr_handler *find_xattr_handler_prefix (const char *prefix); + +static struct dentry * +create_xa_root (struct super_block *sb) +{ + struct dentry *privroot = dget (REISERFS_SB(sb)->priv_root); + struct dentry *xaroot; + + /* This needs to be created at mount-time */ + if (!privroot) + return ERR_PTR(-EOPNOTSUPP); + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + int err; + down (&privroot->d_inode->i_sem); + err = privroot->d_inode->i_op->mkdir (privroot->d_inode, xaroot, 0700); + up (&privroot->d_inode->i_sem); + + if (err) { + dput (xaroot); + dput (privroot); + return ERR_PTR (err); + } + REISERFS_SB(sb)->xattr_root = dget (xaroot); + } + +out: + dput (privroot); + return xaroot; +} + +/* This will return a dentry, or error, referring to the xa root directory. + * If the xa root doesn't exist yet, the dentry will be returned without + * an associated inode. This dentry can be used with ->mkdir to create + * the xa directory.
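The per-inode directory name is just the objectid and generation rendered in capital hex, which open_xa_dir() below builds with snprintf(). A standalone sketch reproducing the example from the comment above (12648430 == 0xC0FFEE, generation 0):

    #include <stdio.h>

    int main(void)
    {
        unsigned int objectid = 12648430;   /* 0xC0FFEE */
        unsigned int generation = 0;
        char namebuf[17];                   /* same size the kernel code uses */

        snprintf(namebuf, sizeof(namebuf), "%X.%X", objectid, generation);
        printf("/.reiserfs_priv/xattrs/%s/user.Content-Type\n", namebuf);
        return 0;   /* .../C0FFEE.0/user.Content-Type */
    }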
*/ +static struct dentry * +__get_xa_root (struct super_block *s) +{ + struct dentry *privroot = dget (REISERFS_SB(s)->priv_root); + struct dentry *xaroot = NULL; + + if (IS_ERR (privroot) || !privroot) + return privroot; + + xaroot = lookup_one_len (XAROOT_NAME, privroot, strlen (XAROOT_NAME)); + if (IS_ERR (xaroot)) { + goto out; + } else if (!xaroot->d_inode) { + dput (xaroot); + xaroot = NULL; + goto out; + } + + REISERFS_SB(s)->xattr_root = dget (xaroot); + +out: + dput (privroot); + return xaroot; +} + +/* Returns the dentry (or NULL) referring to the root of the extended + * attribute directory tree. If it has already been retrieved, it is used. + * Otherwise, we attempt to retrieve it from disk. It may also return + * a pointer-encoded error. + */ +static inline struct dentry * +get_xa_root (struct super_block *s) +{ + struct dentry *dentry = dget (REISERFS_SB(s)->xattr_root); + + if (!dentry) + dentry = __get_xa_root (s); + + return dentry; +} + +/* Opens the directory corresponding to the inode's extended attribute store. + * If flags allow, the tree to the directory may be created. If creation is + * prohibited, -ENODATA is returned. */ +static struct dentry * +open_xa_dir (const struct inode *inode, int flags) +{ + struct dentry *xaroot, *xadir; + char namebuf[17]; + + xaroot = get_xa_root (inode->i_sb); + if (IS_ERR (xaroot)) { + return xaroot; + } else if (!xaroot) { + if (flags == 0 || flags & XATTR_CREATE) { + xaroot = create_xa_root (inode->i_sb); + if (IS_ERR (xaroot)) + return xaroot; + } + if (!xaroot) + return ERR_PTR (-ENODATA); + } + + /* ok, we have xaroot open */ + + snprintf (namebuf, sizeof (namebuf), "%X.%X", + le32_to_cpu (INODE_PKEY (inode)->k_objectid), + inode->i_generation); + xadir = lookup_one_len (namebuf, xaroot, strlen (namebuf)); + if (IS_ERR (xadir)) { + dput (xaroot); + return xadir; + } + + if (!xadir->d_inode) { + int err; + if (flags == 0 || flags & XATTR_CREATE) { + /* Although there is nothing else trying to create this directory, + * another directory with the same hash may be created, so we need + * to protect against that */ + err = xaroot->d_inode->i_op->mkdir (xaroot->d_inode, xadir, 0700); + if (err) { + dput (xaroot); + dput (xadir); + return ERR_PTR (err); + } + } + if (!xadir->d_inode) { + dput (xaroot); + dput (xadir); + return ERR_PTR (-ENODATA); + } + /* Newly created object.. Need to mark it private */ + REISERFS_I(xadir->d_inode)->i_flags |= i_priv_object; + } + + dput (xaroot); + return xadir; +} + +/* Returns a dentry corresponding to a specific extended attribute file + * for the inode. If flags allow, the file is created. Otherwise, a + * valid or negative dentry, or an error is returned.
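The flag handling in get_xa_file_dentry(), which follows, reduces to a small decision table. A userspace model of just that logic (XATTR_CREATE and XATTR_REPLACE values mirrored from <linux/xattr.h>; FL_READONLY is this patch's private lookup-only flag):

    #include <stdio.h>
    #include <errno.h>

    #define XATTR_CREATE   0x1   /* fail if the attribute already exists */
    #define XATTR_REPLACE  0x2   /* fail if the attribute does not exist */
    #define FL_READONLY    128   /* patch-private: lookup only, never create */

    /* Returns 0 if the lookup outcome is acceptable, -errno otherwise;
     * *create is set when a missing backing file should be created. */
    static int resolve(int exists, int flags, int *create)
    {
        *create = 0;
        if (exists)
            return (flags & XATTR_CREATE) ? -EEXIST : 0;
        if (flags & (XATTR_REPLACE | FL_READONLY))
            return 0;   /* caller just sees a negative dentry */
        *create = 1;
        return 0;
    }

    int main(void)
    {
        int create;

        printf("exists + XATTR_CREATE -> %d\n",
               resolve(1, XATTR_CREATE, &create));        /* -EEXIST */
        printf("missing + no flags    -> %d, create=%d\n",
               resolve(0, 0, &create), create);           /* 0, 1 */
        return 0;
    }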
*/ +static struct dentry * +get_xa_file_dentry (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xadir, *xafile; + int err = 0; + + xadir = open_xa_dir (inode, flags); + if (IS_ERR (xadir)) { + return ERR_PTR (PTR_ERR (xadir)); + } else if (xadir && !xadir->d_inode) { + dput (xadir); + return ERR_PTR (-ENODATA); + } + + xafile = lookup_one_len (name, xadir, strlen (name)); + if (IS_ERR (xafile)) { + dput (xadir); + return ERR_PTR (PTR_ERR (xafile)); + } + + if (xafile->d_inode) { /* file exists */ + if (flags & XATTR_CREATE) { + err = -EEXIST; + dput (xafile); + goto out; + } + } else if (flags & XATTR_REPLACE || flags & FL_READONLY) { + goto out; + } else { + /* inode->i_sem is down, so nothing else can try to create + * the same xattr */ + err = xadir->d_inode->i_op->create (xadir->d_inode, xafile, + 0700|S_IFREG, NULL); + + if (err) { + dput (xafile); + goto out; + } + /* Newly created object.. Need to mark it private */ + REISERFS_I(xafile->d_inode)->i_flags |= i_priv_object; + } + +out: + dput (xadir); + if (err) + xafile = ERR_PTR (err); + return xafile; +} + + +/* Opens a file pointer to the attribute associated with inode */ +static struct file * +open_xa_file (const struct inode *inode, const char *name, int flags) +{ + struct dentry *xafile; + struct file *fp; + + xafile = get_xa_file_dentry (inode, name, flags); + if (IS_ERR (xafile)) + return ERR_PTR (PTR_ERR (xafile)); + else if (!xafile->d_inode) { + dput (xafile); + return ERR_PTR (-ENODATA); + } + + fp = dentry_open (xafile, NULL, O_RDWR); + /* dentry_open dputs the dentry if it fails */ + + return fp; +} + + +/* + * this is very similar to fs/reiserfs/dir.c:reiserfs_readdir, but + * we need to drop the path before calling the filldir struct. That + * would be a big performance hit to the non-xattr case, so I've copied + * the whole thing for now. --clm + * + * the big difference is that I go backwards through the directory, + * and don't mess with f->f_pos, but the idea is the same. Do some + * action on each and every entry in the directory. + * + * we're called with i_sem held, so there are no worries about the directory + * changing underneath us. 
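The walk below starts at max_reiserfs_offset() and moves backwards; every time the tree path is dropped for filldir, it resumes by re-searching for the last directory offset instead of trusting a saved position. A purely illustrative standalone sketch of that resume-by-key idea over a flat array:

    #include <stdio.h>

    /* entry "offsets", sorted; the real code searches a tree by key */
    static const int offsets[] = { 2, 5, 8, 11 };
    static const int noffsets = sizeof(offsets) / sizeof(offsets[0]);

    /* the re-search step: largest offset strictly below pos */
    static int search_below(int pos)
    {
        int i, best = -1;

        for (i = 0; i < noffsets; i++)
            if (offsets[i] < pos && offsets[i] > best)
                best = offsets[i];
        return best;
    }

    int main(void)
    {
        int pos = 1000;   /* start past any possible offset */
        int k;

        while ((k = search_below(pos)) >= 0) {
            printf("visit %d\n", k);   /* filldir runs with the path dropped */
            pos = k;                   /* resume strictly below this entry */
        }
        return 0;   /* visits 11, 8, 5, 2 */
    }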
+ */ +static int __xattr_readdir(struct file * filp, void * dirent, filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct cpu_key pos_key; /* key of current position in the directory (key of directory entry) */ + INITIALIZE_PATH (path_to_entry); + struct buffer_head * bh; + int entry_num; + struct item_head * ih, tmp_ih; + int search_res; + char * local_buf; + loff_t next_pos; + char small_buf[32] ; /* avoid kmalloc if we can */ + struct reiserfs_de_head *deh; + int d_reclen; + char * d_name; + off_t d_off; + ino_t d_ino; + struct reiserfs_dir_entry de; + + + /* form key for search the next directory entry using f_pos field of + file structure */ + next_pos = max_reiserfs_offset(inode); + + while (1) { +research: + if (next_pos <= DOT_DOT_OFFSET) + break; + make_cpu_key (&pos_key, inode, next_pos, TYPE_DIRENTRY, 3); + + search_res = search_by_entry_key(inode->i_sb, &pos_key, &path_to_entry, &de); + if (search_res == IO_ERROR) { + // FIXME: we could just skip part of directory which could + // not be read + pathrelse(&path_to_entry); + return -EIO; + } + + if (search_res == NAME_NOT_FOUND) + de.de_entry_num--; + + set_de_name_and_namelen(&de); + entry_num = de.de_entry_num; + deh = &(de.de_deh[entry_num]); + + bh = de.de_bh; + ih = de.de_ih; + + if (!is_direntry_le_ih(ih)) { + reiserfs_warning("not direntry %h\n", ih); + break; + } + copy_item_head(&tmp_ih, ih); + + /* we must have found item, that is item of this directory, */ + RFALSE( COMP_SHORT_KEYS (&(ih->ih_key), &pos_key), + "vs-9000: found item %h does not match to dir we readdir %K", + ih, &pos_key); + + if (deh_offset(deh) <= DOT_DOT_OFFSET) { + break; + } + + /* look for the previous entry in the directory */ + next_pos = deh_offset (deh) - 1; + + if (!de_visible (deh)) + /* it is hidden entry */ + continue; + + d_reclen = entry_length(bh, ih, entry_num); + d_name = B_I_DEH_ENTRY_FILE_NAME (bh, ih, deh); + d_off = deh_offset (deh); + d_ino = deh_objectid (deh); + + if (!d_name[d_reclen - 1]) + d_reclen = strlen (d_name); + + if (d_reclen > REISERFS_MAX_NAME(inode->i_sb->s_blocksize)){ + /* too big to send back to VFS */ + continue ; + } + + /* Ignore the .reiserfs_priv entry */ + if (reiserfs_xattrs (inode->i_sb) && + !old_format_only(inode->i_sb) && + deh_objectid (deh) == le32_to_cpu (INODE_PKEY(REISERFS_SB(inode->i_sb)->priv_root->d_inode)->k_objectid)) + continue; + + if (d_reclen <= 32) { + local_buf = small_buf ; + } else { + local_buf = reiserfs_kmalloc(d_reclen, GFP_NOFS, inode->i_sb) ; + if (!local_buf) { + pathrelse (&path_to_entry); + return -ENOMEM ; + } + if (item_moved (&tmp_ih, &path_to_entry)) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + + /* sigh, must retry. Do this same offset again */ + next_pos = d_off; + goto research; + } + } + + // Note, that we copy name to user space via temporary + // buffer (local_buf) because filldir will block if + // user space buffer is swapped out. At that time + // entry can move to somewhere else + memcpy (local_buf, d_name, d_reclen); + + /* the filldir function might need to start transactions, + * or do who knows what. 
Release the path now that we've + * copied all the important stuff out of the deh + */ + pathrelse (&path_to_entry); + + if (filldir (dirent, local_buf, d_reclen, d_off, d_ino, + DT_UNKNOWN) < 0) { + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + goto end; + } + if (local_buf != small_buf) { + reiserfs_kfree(local_buf, d_reclen, inode->i_sb) ; + } + } /* while */ + +end: + pathrelse (&path_to_entry); + return 0; +} + +/* + * this could be done with dedicated readdir ops for the xattr files, + * but I want to get something working asap + * this is stolen from vfs_readdir + * + */ +static +int xattr_readdir(struct file *file, filldir_t filler, void *buf) +{ + struct inode *inode = file->f_dentry->d_inode; + int res = -ENOTDIR; + if (!file->f_op || !file->f_op->readdir) + goto out; + down(&inode->i_sem); +// down(&inode->i_zombie); + res = -ENOENT; + if (!IS_DEADDIR(inode)) { + lock_kernel(); + res = __xattr_readdir(file, buf, filler); + unlock_kernel(); + } +// up(&inode->i_zombie); + up(&inode->i_sem); +out: + return res; +} + + +/* Internal operations on file data */ +static inline void +reiserfs_put_page(struct page *page) +{ + kunmap(page); + page_cache_release(page); +} + +static struct page * +reiserfs_get_page(struct inode *dir, unsigned long n) +{ + struct address_space *mapping = dir->i_mapping; + struct page *page; + /* We can deadlock if we try to free dentries, + and an unlink/rmdir has just occurred - GFP_NOFS avoids this */ + mapping->flags = (mapping->flags & ~__GFP_BITS_MASK) | GFP_NOFS; + page = read_cache_page (mapping, n, + (filler_t*)mapping->a_ops->readpage, NULL); + if (!IS_ERR(page)) { + wait_on_page_locked(page); + kmap(page); + if (!PageUptodate(page)) + goto fail; + + if (PageError(page)) + goto fail; + } + return page; + +fail: + reiserfs_put_page(page); + return ERR_PTR(-EIO); +} + +static inline __u32 +xattr_hash (const char *msg, int len) +{ + return csum_partial (msg, len, 0); +} + +/* Generic extended attribute operations that can be used by xa plugins */ + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_set (struct inode *inode, const char *name, const void *buffer, + size_t buffer_size, int flags) +{ + int err = 0; + struct file *fp; + struct page *page; + char *data; + struct address_space *mapping; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct inode *xinode; + struct iattr newattrs; + __u32 xahash = 0; + + if (IS_RDONLY (inode)) + return -EROFS; + + if (IS_IMMUTABLE (inode) || IS_APPEND (inode)) + return -EPERM; + + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + /* Empty xattrs are ok, they're just empty files, no hash */ + if (buffer && buffer_size) + xahash = xattr_hash (buffer, buffer_size); + +open_file: + fp = open_xa_file (inode, name, flags); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* we need to copy it off..
*/ + if (xinode->i_nlink > 1) { + fput(fp); + err = reiserfs_xattr_del (inode, name); + if (err < 0) + goto out; + /* We just killed the old one, we're not replacing anymore */ + if (flags & XATTR_REPLACE) + flags &= ~XATTR_REPLACE; + goto open_file; + } + + /* Resize it so we're ok to write there */ + newattrs.ia_size = buffer_size; + newattrs.ia_valid = ATTR_SIZE | ATTR_CTIME; + down (&xinode->i_sem); + err = notify_change(fp->f_dentry, &newattrs); + if (err) + goto out_filp; + + mapping = xinode->i_mapping; + while (buffer_pos < buffer_size || buffer_pos == 0) { + size_t chunk; + size_t skip = 0; + size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = buffer_size - buffer_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_filp; + } + + lock_page (page); + data = page_address (page); + + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + if (chunk + skip > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE - skip; + rxh = (struct reiserfs_xattr_header *)data; + rxh->h_magic = cpu_to_le32 (REISERFS_XATTR_MAGIC); + rxh->h_hash = cpu_to_le32 (xahash); + } + + err = mapping->a_ops->prepare_write (fp, page, page_offset, + page_offset + chunk + skip); + if (!err) { + if (buffer) + memcpy (data + skip, buffer + buffer_pos, chunk); + err = mapping->a_ops->commit_write (fp, page, page_offset, + page_offset + chunk + skip); + } + unlock_page (page); + reiserfs_put_page (page); + buffer_pos += chunk; + file_pos += chunk; + skip = 0; + if (err || buffer_size == 0 || !buffer) + break; + } + + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty (inode); + +out_filp: + up (&xinode->i_sem); + fput(fp); + +out: + return err; +} + +/* + * inode->i_sem: down + */ +int +reiserfs_xattr_get (const struct inode *inode, const char *name, void *buffer, + size_t buffer_size) +{ + ssize_t err = 0; + struct file *fp; + size_t isize; + size_t file_pos = 0; + size_t buffer_pos = 0; + struct page *page; + struct inode *xinode; + __u32 hash = 0; + + if (name == NULL) + return -EINVAL; + + /* We can't have xattrs attached to v1 items since they don't have + * generation numbers */ + if (get_inode_sd_version (inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + fp = open_xa_file (inode, name, FL_READONLY); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + goto out; + } + + xinode = fp->f_dentry->d_inode; + isize = xinode->i_size; + REISERFS_I(inode)->i_flags |= i_has_xattr_dir; + + /* Just return the size needed */ + if (buffer == NULL) { + err = isize - sizeof (struct reiserfs_xattr_header); + goto out_dput; + } + + if (buffer_size < isize - sizeof (struct reiserfs_xattr_header)) { + err = -ERANGE; + goto out_dput; + } + + while (file_pos < isize) { + size_t chunk; + char *data; + size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) + chunk = PAGE_CACHE_SIZE; + else + chunk = isize - file_pos; + + page = reiserfs_get_page (xinode, file_pos >> PAGE_CACHE_SHIFT); + if (IS_ERR (page)) { + err = PTR_ERR (page); + goto out_dput; + } + + lock_page (page); + data = page_address (page); + if (file_pos == 0) { + struct reiserfs_xattr_header *rxh = + (struct reiserfs_xattr_header *)data; + skip = file_pos = sizeof (struct reiserfs_xattr_header); + chunk -= skip; + /* Magic doesn't match up.. 
*/ + if (rxh->h_magic != cpu_to_le32 (REISERFS_XATTR_MAGIC)) { + unlock_page (page); + reiserfs_put_page (page); + reiserfs_warning ("reiserfs: Invalid magic for xattr (%s) " + "associated with %s %k\n", name, + reiserfs_bdevname (inode->i_sb), + INODE_PKEY (inode)); + err = -EIO; + goto out_dput; + } + hash = le32_to_cpu (rxh->h_hash); + } + memcpy (buffer + buffer_pos, data + skip, chunk); + unlock_page (page); + reiserfs_put_page (page); + file_pos += chunk; + buffer_pos += chunk; + skip = 0; + } + err = isize - sizeof (struct reiserfs_xattr_header); + + if (xattr_hash (buffer, isize - sizeof (struct reiserfs_xattr_header)) != hash) { + reiserfs_warning ("reiserfs: Invalid hash for xattr (%s) associated " + "with %s %k\n", name, + reiserfs_bdevname (inode->i_sb), INODE_PKEY (inode)); + err = -EIO; + } + +out_dput: + fput(fp); + +out: + return err; +} + +static int +__reiserfs_xattr_del (struct dentry *xadir, const char *name, int namelen) +{ + struct dentry *dentry; + struct inode *dir = xadir->d_inode; + int err = 0; + + dentry = lookup_one_len (name, xadir, namelen); + if (IS_ERR (dentry)) { + err = PTR_ERR (dentry); + goto out; + } else if (!dentry->d_inode) { + err = -ENODATA; + goto out_file; + } + + /* Skip directories.. */ + if (S_ISDIR (dentry->d_inode->i_mode)) + goto out_file; + + if (!is_reiserfs_priv_object (dentry->d_inode)) { + reiserfs_warning ("OID %08x [%.*s/%.*s] doesn't have priv flag set [parent is %sset].\n", + le32_to_cpu (INODE_PKEY (dentry->d_inode)->k_objectid), + xadir->d_name.len, xadir->d_name.name, namelen, name, + is_reiserfs_priv_object (xadir->d_inode) ? "" : "not "); + dput (dentry); + return -EIO; + } + + err = dir->i_op->unlink (dir, dentry); + if (!err) + d_delete (dentry); + +out_file: + dput (dentry); + +out: + return err; +} + + +int +reiserfs_xattr_del (struct inode *inode, const char *name) +{ + struct dentry *dir; + int err; + + if (IS_RDONLY (inode)) + return -EROFS; + + dir = open_xa_dir (inode, FL_READONLY); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } + + err = __reiserfs_xattr_del (dir, name, strlen (name)); + dput (dir); + +out: + return err; +} + +/* The following are side effects of other operations that aren't explicitly + * modifying extended attributes. This includes operations such as permissions + * or ownership changes, object deletions, etc. */ + +static int +reiserfs_delete_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct dentry *xadir = (struct dentry *)buf; + + return __reiserfs_xattr_del (xadir, name, namelen); + +} + +/* This is called w/ inode->i_sem downed */ +int +reiserfs_delete_xattrs (struct inode *inode) +{ + struct file *fp; + struct dentry *dir, *root; + int err = 0; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + return 0; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + err = xattr_readdir (fp, reiserfs_delete_xattrs_filler, dir); + if (err) { + unlock_kernel (); + goto out_dir; + } + + /* Leftovers besides . and .. 
-- that's not good. */ + if (dir->d_inode->i_nlink <= 2) { + root = get_xa_root (inode->i_sb); + reiserfs_write_lock_xattrs (inode->i_sb); + err = vfs_rmdir (root->d_inode, dir); + reiserfs_write_unlock_xattrs (inode->i_sb); + dput (root); + } else { + reiserfs_warning ("Couldn't remove all entries in directory\n"); + } + unlock_kernel (); + +out_dir: + fput(fp); + +out: + if (!err) + REISERFS_I(inode)->i_flags = REISERFS_I(inode)->i_flags & ~i_has_xattr_dir; + return err; +} + +struct reiserfs_chown_buf { + struct inode *inode; + struct dentry *xadir; + struct iattr *attrs; +}; + +/* XXX: If there is a better way to do this, I'd love to hear about it */ +static int +reiserfs_chown_xattrs_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_chown_buf *chown_buf = (struct reiserfs_chown_buf *)buf; + struct dentry *xafile, *xadir = chown_buf->xadir; + struct iattr *attrs = chown_buf->attrs; + int err = 0; + + xafile = lookup_one_len (name, xadir, namelen); + if (IS_ERR (xafile)) + return PTR_ERR (xafile); + else if (!xafile->d_inode) { + dput (xafile); + return -ENODATA; + } + + if (!S_ISDIR (xafile->d_inode->i_mode)) + err = notify_change (xafile, attrs); + dput (xafile); + + return err; +} + +int +reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_chown_buf buf; + unsigned int ia_valid = attrs->ia_valid; + + /* Skip out, an xattr has no xattrs associated with it */ + if (is_reiserfs_priv_object (inode) || + get_inode_sd_version (inode) == STAT_DATA_V1 || + !reiserfs_xattrs(inode->i_sb)) + { + return 0; + } + reiserfs_read_lock_xattrs (inode->i_sb); + dir = open_xa_dir (inode, FL_READONLY); + reiserfs_read_unlock_xattrs (inode->i_sb); + if (IS_ERR (dir)) { + if (PTR_ERR (dir) != -ENODATA) + err = PTR_ERR (dir); + goto out; + } else if (!dir->d_inode) { + dput (dir); + goto out; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + lock_kernel (); + + attrs->ia_valid &= (ATTR_UID | ATTR_GID | ATTR_CTIME); + buf.xadir = dir; + buf.attrs = attrs; + buf.inode = inode; + + err = xattr_readdir (fp, reiserfs_chown_xattrs_filler, &buf); + if (err) { + unlock_kernel (); + goto out_dir; + } + + err = notify_change (dir, attrs); + unlock_kernel (); + +out_dir: + fput(fp); + +out: + attrs->ia_valid = ia_valid; + return err; +} + + +/* Actual operations that are exported to VFS-land */ + +/* + * Inode operation getxattr() + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_getxattr (struct dentry *dentry, const char *name, void *buffer, + size_t size) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->get (dentry->d_inode, name, buffer, size); + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* + * Inode operation setxattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_setxattr (struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int err; + int lock; + + if (!xah 
|| !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EROFS; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + lock = !has_xattr_dir (dentry->d_inode); + if (lock) + reiserfs_write_lock_xattrs (dentry->d_sb); + else + reiserfs_read_lock_xattrs (dentry->d_sb); + err = xah->set (dentry->d_inode, name, value, size, flags); + if (lock) + reiserfs_write_unlock_xattrs (dentry->d_sb); + else + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* + * Inode operation removexattr() + * + * dentry->d_inode->i_sem down + */ +int +reiserfs_removexattr (struct dentry *dentry, const char *name) +{ + int err; + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + int lock; + + if (!xah || !reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + if (IS_RDONLY (dentry->d_inode)) + return -EROFS; + + if (IS_IMMUTABLE (dentry->d_inode) || IS_APPEND (dentry->d_inode)) + return -EPERM; + + reiserfs_write_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + + /* Deletion pre-operation */ + if (xah->del) { + err = xah->del (dentry->d_inode, name); + if (err) + goto out; + } + + err = reiserfs_xattr_del (dentry->d_inode, name); + + dentry->d_inode->i_ctime = CURRENT_TIME; + mark_inode_dirty (dentry->d_inode); + +out: + reiserfs_read_unlock_xattrs (dentry->d_sb); + reiserfs_write_unlock_xattr_i (dentry->d_inode); + return err; +} + + +/* This is what filldir will use: + * r_pos will always contain the amount of space required for the entire + * list. If r_pos becomes larger than r_size, we need more space and we + * return an error indicating this. If r_pos is less than r_size, then we've + * filled the buffer successfully and we return success */ +struct reiserfs_listxattr_buf { + int r_pos; + int r_size; + char *r_buf; + struct inode *r_inode; +}; + +static int +reiserfs_listxattr_filler (void *buf, const char *name, int namelen, + loff_t offset, ino_t ino, unsigned int d_type) +{ + struct reiserfs_listxattr_buf *b = (struct reiserfs_listxattr_buf *)buf; + int len = 0; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' 
|| namelen != 2))) { + struct reiserfs_xattr_handler *xah = find_xattr_handler_prefix (name); + if (!xah) return 0; /* Unsupported xattr name, skip it */ + + /* We call ->list() twice because the operation isn't required to just + * return the name back - we want to make sure we have enough space */ + len += xah->list (b->r_inode, name, namelen, NULL); + + if (len) { + if (b->r_pos + len + 1 <= b->r_size) { + char *p = b->r_buf + b->r_pos; + p += xah->list (b->r_inode, name, namelen, p); + *p++ = '\0'; + } + b->r_pos += len + 1; + } + } + + return 0; +} +/* + * Inode operation listxattr() + * + * Preliminary locking: we down dentry->d_inode->i_sem + */ +ssize_t +reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size) +{ + struct file *fp; + struct dentry *dir; + int err = 0; + struct reiserfs_listxattr_buf buf; + + if (!dentry->d_inode) + return -EINVAL; + + if (!reiserfs_xattrs(dentry->d_sb) || + get_inode_sd_version (dentry->d_inode) == STAT_DATA_V1) + return -EOPNOTSUPP; + + reiserfs_read_lock_xattr_i (dentry->d_inode); + reiserfs_read_lock_xattrs (dentry->d_sb); + dir = open_xa_dir (dentry->d_inode, FL_READONLY); + reiserfs_read_unlock_xattrs (dentry->d_sb); + if (IS_ERR (dir)) { + err = PTR_ERR (dir); + if (err == -ENODATA) + err = 0; /* Not an error if there aren't any xattrs */ + goto out; + } + + fp = dentry_open (dir, NULL, O_RDWR); + if (IS_ERR (fp)) { + err = PTR_ERR (fp); + /* dentry_open dputs the dentry if it fails */ + goto out; + } + + buf.r_buf = buffer; + buf.r_size = buffer ? size : 0; + buf.r_pos = 0; + buf.r_inode = dentry->d_inode; + + REISERFS_I(dentry->d_inode)->i_flags |= i_has_xattr_dir; + + err = xattr_readdir (fp, reiserfs_listxattr_filler, &buf); + if (err) + goto out_dir; + + if (buf.r_pos > buf.r_size && buffer != NULL) + err = -ERANGE; + else + err = buf.r_pos; + +out_dir: + fput(fp); + +out: + reiserfs_read_unlock_xattr_i (dentry->d_inode); + return err; +} + +/* This is the implementation for the xattr plugin infrastructure */ +static struct list_head xattr_handlers = LIST_HEAD_INIT (xattr_handlers); +static rwlock_t handler_lock = RW_LOCK_UNLOCKED; + +static struct reiserfs_xattr_handler * +find_xattr_handler_prefix (const char *prefix) +{ + struct reiserfs_xattr_handler *xah = NULL; + struct list_head *p; + + read_lock (&handler_lock); + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (strncmp (xah->prefix, prefix, strlen (xah->prefix)) == 0) + break; + xah = NULL; + } + + read_unlock (&handler_lock); + return xah; +} + +static void +__unregister_handlers (void) +{ + struct reiserfs_xattr_handler *xah; + struct list_head *p, *tmp; + + list_for_each_safe (p, tmp, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->exit) + xah->exit(); + + list_del_init (p); + } + INIT_LIST_HEAD (&xattr_handlers); +} + +int __init +reiserfs_xattr_register_handlers (void) +{ + int err = 0; + struct reiserfs_xattr_handler *xah; + struct list_head *p; + + write_lock (&handler_lock); + + /* If we're already initialized, nothing to do */ + if (!list_empty (&xattr_handlers)) { + write_unlock (&handler_lock); + return 0; + } + + /* Add the handlers */ + list_add_tail (&user_handler.handlers, &xattr_handlers); + list_add_tail (&trusted_handler.handlers, &xattr_handlers); +#ifdef CONFIG_REISERFS_FS_SECURITY + list_add_tail (&security_handler.handlers, &xattr_handlers); +#endif +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + list_add_tail (&posix_acl_access_handler.handlers, 
&xattr_handlers); + list_add_tail (&posix_acl_default_handler.handlers, &xattr_handlers); +#endif + + /* Run initializers, if available */ + list_for_each (p, &xattr_handlers) { + xah = list_entry (p, struct reiserfs_xattr_handler, handlers); + if (xah->init) { + err = xah->init (); + if (err) { + list_del_init (p); + break; + } + } + } + + /* Clean up other handlers, if any failed */ + if (err) + __unregister_handlers (); + + write_unlock (&handler_lock); + return err; +} + +void +reiserfs_xattr_unregister_handlers (void) +{ + write_lock (&handler_lock); + __unregister_handlers (); + write_unlock (&handler_lock); +} + +/* This will catch lookups from the fs root to .reiserfs_priv */ +static int +xattr_lookup_poison (struct dentry *dentry, struct qstr *q1, struct qstr *name) +{ + struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; + if (name->len == priv_root->d_name.len && + name->hash == priv_root->d_name.hash && + !memcmp (name->name, priv_root->d_name.name, name->len)) { + return -ENOENT; + } + return 0; +} + +static struct dentry_operations xattr_lookup_poison_ops = { + .d_compare = xattr_lookup_poison, +}; + + +/* We need to take a copy of the mount flags since things like + * MS_RDONLY don't get set until *after* we're called. + * mount_flags != mount_options */ +int +reiserfs_xattr_init (struct super_block *s, int mount_flags) +{ + int err = 0; + + /* We need generation numbers to ensure that the oid mapping is correct + * v3.5 filesystems don't have them. */ + if (!old_format_only (s)) { + set_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } else if (reiserfs_xattrs_optional (s)) { + /* Old format filesystem, but optional xattrs have been enabled + * at mount time. Error out. */ + reiserfs_warning ("reiserfs: xattrs/ACLs not supported on pre v3.6 " + "format filesystem. Failing mount.\n"); + err = -EOPNOTSUPP; + goto error; + } else { + /* Old format filesystem, but no optional xattrs have been enabled. This + * means we silently disable xattrs on the filesystem. */ + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* If we don't have the privroot located yet - go find it */ + if (reiserfs_xattrs (s) && !REISERFS_SB(s)->priv_root) { + struct dentry *dentry; + dentry = lookup_one_len (PRIVROOT_NAME, s->s_root, + strlen (PRIVROOT_NAME)); + if (!IS_ERR (dentry)) { + if (!(mount_flags & MS_RDONLY) && !dentry->d_inode) { + struct inode *inode = dentry->d_parent->d_inode; + down (&inode->i_sem); + err = inode->i_op->mkdir (inode, dentry, 0700); + up (&inode->i_sem); + if (err) { + dput (dentry); + dentry = NULL; + } + + if (dentry && dentry->d_inode) + reiserfs_warning ("reiserfs: Created %s on %s - reserved for " + "xattr storage.\n", PRIVROOT_NAME, + reiserfs_bdevname (inode->i_sb)); + } else if (!dentry->d_inode) { + dput (dentry); + dentry = NULL; + } + } else + err = PTR_ERR (dentry); + + if (!err && dentry) { + s->s_root->d_op = &xattr_lookup_poison_ops; + REISERFS_I(dentry->d_inode)->i_flags |= i_priv_object; + REISERFS_SB(s)->priv_root = dentry; + } else if (!(mount_flags & MS_RDONLY)) { /* xattrs are unavailable */ + /* If we're read-only it just means that the dir hasn't been + * created. Not an error -- just no xattrs on the fs. We'll + * check again if we go read-write */ + reiserfs_warning ("reiserfs: xattrs/ACLs enabled and couldn't " + "find/create .reiserfs_priv. 
Failing mount.\n"); + err = -EOPNOTSUPP; + } + } + +error: + /* This is only nonzero if there was an error initializing the xattr + * directory or if there is a condition where we don't support them. */ + if (err) { + clear_bit (REISERFS_XATTRS, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_XATTRS_USER, &(REISERFS_SB(s)->s_mount_opt)); + clear_bit (REISERFS_POSIXACL, &(REISERFS_SB(s)->s_mount_opt)); + } + + /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + s->s_flags = s->s_flags & ~MS_POSIXACL; + if (reiserfs_posixacl (s)) + s->s_flags |= MS_POSIXACL; + + return err; +} + +static int +__reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd, + int need_lock) +{ + umode_t mode = inode->i_mode; + + if (mask & MAY_WRITE) { + /* + * Nobody gets write access to a read-only fs. + */ + if (IS_RDONLY(inode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + return -EROFS; + + /* + * Nobody gets write access to an immutable file. + */ + if (IS_IMMUTABLE(inode)) + return -EACCES; + } + + /* We don't do permission checks on the internal objects. + * Permissions are determined by the "owning" object. */ + if (is_reiserfs_priv_object (inode)) + return 0; + + if (current->fsuid == inode->i_uid) { + mode >>= 6; +#ifdef CONFIG_REISERFS_FS_POSIX_ACL + } else if (reiserfs_posixacl(inode->i_sb) && + get_inode_sd_version (inode) != STAT_DATA_V1) { + struct posix_acl *acl; + + /* ACL can't contain additional permissions if + the ACL_MASK entry is 0 */ + if (!(mode & S_IRWXG)) + goto check_groups; + + reiserfs_read_lock_xattr_i (inode); + if (need_lock) + reiserfs_read_lock_xattrs (inode->i_sb); + acl = reiserfs_get_acl (inode, ACL_TYPE_ACCESS); + if (need_lock) + reiserfs_read_unlock_xattrs (inode->i_sb); + reiserfs_read_unlock_xattr_i (inode); + if (IS_ERR (acl)) { + if (PTR_ERR (acl) == -ENODATA) + goto check_groups; + return PTR_ERR (acl); + } + + if (acl) { + int err = posix_acl_permission (inode, acl, mask); + posix_acl_release (acl); + if (err == -EACCES) { + goto check_capabilities; + } + return err; + } else { + goto check_groups; + } +#endif + } else { +check_groups: + if (in_group_p(inode->i_gid)) + mode >>= 3; + } + + /* + * If the DACs are ok we don't need any capability check. + */ + if (((mode & mask & (MAY_READ|MAY_WRITE|MAY_EXEC)) == mask)) + return 0; + +check_capabilities: + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable if at least one exec bit is set. + */ + if (!(mask & MAY_EXEC) || + (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode)) + if (capable(CAP_DAC_OVERRIDE)) + return 0; + + /* + * Searching includes executable on directories, else just read. 
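Aside from the ACL branch, __reiserfs_permission() above is the classic owner/group/other check: shift the relevant mode triplet into the low three bits and compare it against the request mask. A standalone sketch of just that selection (MAY_* values as in the kernel; no ACLs or capability overrides):

    #include <stdio.h>
    #include <sys/stat.h>

    #define MAY_EXEC   1
    #define MAY_WRITE  2
    #define MAY_READ   4

    /* is_owner/in_group: fsuid matches i_uid / fsgid matches i_gid */
    static int basic_permission(mode_t mode, int is_owner, int in_group, int mask)
    {
        if (is_owner)
            mode >>= 6;
        else if (in_group)
            mode >>= 3;

        if ((mode & mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == mask)
            return 0;
        return -1;   /* the kernel would fall through to capability checks */
    }

    int main(void)
    {
        /* 0754: owner rwx, group r-x, other r-- */
        printf("owner write: %d\n", basic_permission(0754, 1, 0, MAY_WRITE));
        printf("group write: %d\n", basic_permission(0754, 0, 1, MAY_WRITE));
        return 0;   /* 0 and -1 */
    }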
+ */ + if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) + if (capable(CAP_DAC_READ_SEARCH)) + return 0; + + return -EACCES; +} + +int +reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 1); +} + +int +reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd) +{ + return __reiserfs_permission (inode, mask, nd, 0); +} diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_security.c 2004-03-19 16:10:24.038962502 +0100 @@ -0,0 +1,69 @@ +#include +#include +#include +#include +#include +#include +#include + +#define XATTR_SECURITY_PREFIX "security." + +static int +security_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +security_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +security_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) + return -EINVAL; + + if (is_reiserfs_priv_object(inode)) + return -EPERM; + + return 0; +} + +static int +security_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (is_reiserfs_priv_object(inode)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler security_handler = { + prefix: XATTR_SECURITY_PREFIX, + get: security_get, + set: security_set, + del: security_del, + list: security_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_trusted.c 2004-03-19 16:10:24.034962932 +0100 @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include + +#define XATTR_TRUSTED_PREFIX "trusted." 
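Each of these per-namespace files only supplies a handler; selection happens in find_xattr_handler_prefix() above, a strncmp() against each registered prefix. A userspace sketch of the same dispatch, with a static table standing in for the kernel's lock-protected list (table contents illustrative):

    #include <stdio.h>
    #include <string.h>

    struct handler {
        const char *prefix;
    };

    static const struct handler handlers[] = {
        { "user." }, { "trusted." }, { "security." },
        { "system.posix_acl_access" }, { "system.posix_acl_default" },
    };

    static const struct handler *find_handler(const char *name)
    {
        size_t i;

        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
            if (!strncmp(handlers[i].prefix, name, strlen(handlers[i].prefix)))
                return &handlers[i];
        return NULL;   /* unsupported namespace -> -EOPNOTSUPP in the kernel */
    }

    int main(void)
    {
        const struct handler *h = find_handler("user.Content-Type");

        printf("%s\n", h ? h->prefix : "(none)");   /* "user." */
        return 0;
    }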
+ +static int +trusted_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +trusted_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +trusted_del (struct inode *inode, const char *name) +{ + if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs (inode->i_sb)) + return -EOPNOTSUPP; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return -EPERM; + + return 0; +} + +static int +trusted_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + + if (!reiserfs_xattrs (inode->i_sb)) + return 0; + + if (!(capable(CAP_SYS_ADMIN) || is_reiserfs_priv_object(inode))) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + + +struct reiserfs_xattr_handler trusted_handler = { + prefix: XATTR_TRUSTED_PREFIX, + get: trusted_get, + set: trusted_set, + del: trusted_del, + list: trusted_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c --- /opt/kernel/linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c 1970-01-01 01:00:00.000000000 +0100 +++ linux-2.6.5-rc1-mm2/fs/reiserfs/xattr_user.c 2004-03-19 16:10:24.027963686 +0100 @@ -0,0 +1,99 @@ +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_REISERFS_FS_POSIX_ACL +# include +#endif + +#define XATTR_USER_PREFIX "user." 
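The strlen(name) < sizeof(PREFIX) guard that opens every handler is a compact way of rejecting a bare prefix: sizeof() counts the terminating NUL, so the name has to run at least one character past the prefix itself. A quick standalone check of that arithmetic:

    #include <stdio.h>
    #include <string.h>

    #define XATTR_USER_PREFIX "user."

    int main(void)
    {
        /* sizeof includes the NUL: 6 versus strlen's 5 */
        printf("sizeof=%zu strlen=%zu\n",
               sizeof(XATTR_USER_PREFIX), strlen(XATTR_USER_PREFIX));

        /* a bare "user." fails the guard, "user.x" passes it */
        printf("\"user.\"  accepted=%d\n",
               !(strlen("user.") < sizeof(XATTR_USER_PREFIX)));
        printf("\"user.x\" accepted=%d\n",
               !(strlen("user.x") < sizeof(XATTR_USER_PREFIX)));
        return 0;
    }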
+ +static int +user_get (struct inode *inode, const char *name, void *buffer, size_t size) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + error = reiserfs_permission_locked (inode, MAY_READ, NULL); + if (error) + return error; + + return reiserfs_xattr_get (inode, name, buffer, size); +} + +static int +user_set (struct inode *inode, const char *name, const void *buffer, + size_t size, int flags) +{ + + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return reiserfs_xattr_set (inode, name, buffer, size, flags); +} + +static int +user_del (struct inode *inode, const char *name) +{ + int error; + + if (strlen(name) < sizeof(XATTR_USER_PREFIX)) + return -EINVAL; + + if (!reiserfs_xattrs_user (inode->i_sb)) + return -EOPNOTSUPP; + + if (!S_ISREG (inode->i_mode) && + (!S_ISDIR (inode->i_mode) || inode->i_mode & S_ISVTX)) + return -EPERM; + + error = reiserfs_permission_locked (inode, MAY_WRITE, NULL); + if (error) + return error; + + return 0; +} + +static int +user_list (struct inode *inode, const char *name, int namelen, char *out) +{ + int len = namelen; + if (!reiserfs_xattrs_user (inode->i_sb)) + return 0; + + if (out) + memcpy (out, name, len); + + return len; +} + +struct reiserfs_xattr_handler user_handler = { + prefix: XATTR_USER_PREFIX, + get: user_get, + set: user_set, + del: user_del, + list: user_list, +}; diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/bio.h linux-2.6.5-rc1-mm2/include/linux/bio.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/bio.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/bio.h 2004-03-19 16:10:28.737456662 +0100 @@ -140,6 +140,8 @@ struct bio { #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9) #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio))) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) +#define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) +#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) /* diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/blkdev.h linux-2.6.5-rc1-mm2/include/linux/blkdev.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/blkdev.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/blkdev.h 2004-03-19 16:10:28.667464197 +0100 @@ -195,6 +195,8 @@ enum rq_flag_bits { __REQ_PM_SUSPEND, /* suspend request */ __REQ_PM_RESUME, /* resume request */ __REQ_PM_SHUTDOWN, /* shutdown request */ + __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ + __REQ_BAR_POSTFLUSH, /* barrier post-flush */ __REQ_NR_BITS, /* stops here */ }; @@ -220,6 +222,8 @@ enum rq_flag_bits { #define REQ_PM_SUSPEND (1 << __REQ_PM_SUSPEND) #define REQ_PM_RESUME (1 << __REQ_PM_RESUME) #define REQ_PM_SHUTDOWN (1 << __REQ_PM_SHUTDOWN) +#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) +#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -371,6 +375,7 @@ struct request_queue #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ #define 
QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ +#define QUEUE_FLAG_ORDERED 8 /* supports ordered writes */ #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -386,6 +391,10 @@ struct request_queue #define blk_pm_request(rq) \ ((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME)) +#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) +#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) +#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH) + #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) ((rq)->flags & 1) @@ -583,6 +592,7 @@ extern void blk_queue_prep_rq(request_qu extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *); extern void blk_queue_dma_alignment(request_queue_t *, int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +extern void blk_queue_ordered(request_queue_t *, int); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); @@ -610,6 +620,7 @@ extern long blk_congestion_wait(int rw, extern void blk_rq_bio_prep(request_queue_t *, struct request *, struct bio *); extern void blk_rq_prep_restart(struct request *); +extern int blkdev_issue_flush(struct block_device *); #define MAX_PHYS_SEGMENTS 128 #define MAX_HW_SEGMENTS 128 diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h linux-2.6.5-rc1-mm2/include/linux/buffer_head.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h 2004-03-11 03:55:21.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/buffer_head.h 2004-03-19 16:10:27.727565387 +0100 @@ -26,6 +26,7 @@ enum bh_state_bits { BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ + BH_Ordered, /* ordered write */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -117,7 +118,8 @@ BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) -BUFFER_FNS(Write_EIO,write_io_error) +BUFFER_FNS(Write_EIO, write_io_error) +BUFFER_FNS(Ordered, ordered) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) #define touch_buffer(bh) mark_page_accessed(bh->b_page) diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h 2004-03-11 03:55:33.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h 2004-03-19 16:10:28.716458922 +0100 @@ -324,6 +324,7 @@ struct ext3_inode { #define EXT3_MOUNT_NO_UID32 0x2000 /* Disable 32-bit UIDs */ #define EXT3_MOUNT_XATTR_USER 0x4000 /* Extended user attributes */ #define EXT3_MOUNT_POSIX_ACL 0x8000 /* POSIX Access Control Lists */ +#define EXT3_MOUNT_BARRIER 0x10000 /* Use block barriers */ /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h linux-2.6.5-rc1-mm2/include/linux/fs.h --- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h 2004-03-19 15:13:58.000000000 +0100 +++ linux-2.6.5-rc1-mm2/include/linux/fs.h 
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h linux-2.6.5-rc1-mm2/include/linux/buffer_head.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/buffer_head.h	2004-03-11 03:55:21.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/buffer_head.h	2004-03-19 16:10:27.727565387 +0100
@@ -26,6 +26,7 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
+	BH_Ordered,	/* ordered write */
 
 	BH_PrivateStart,/* not a state bit, but the first bit available
 			 * for private allocation by other entities
@@ -117,7 +118,8 @@ BUFFER_FNS(Async_Read, async_read)
 BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
-BUFFER_FNS(Write_EIO,write_io_error)
+BUFFER_FNS(Write_EIO, write_io_error)
+BUFFER_FNS(Ordered, ordered)
 
 #define bh_offset(bh)		((unsigned long)(bh)->b_data & ~PAGE_MASK)
 #define touch_buffer(bh)	mark_page_accessed(bh->b_page)
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h	2004-03-11 03:55:33.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/ext3_fs.h	2004-03-19 16:10:28.716458922 +0100
@@ -324,6 +324,7 @@ struct ext3_inode {
 #define EXT3_MOUNT_NO_UID32		0x2000  /* Disable 32-bit UIDs */
 #define EXT3_MOUNT_XATTR_USER		0x4000	/* Extended user attributes */
 #define EXT3_MOUNT_POSIX_ACL		0x8000	/* POSIX Access Control Lists */
+#define EXT3_MOUNT_BARRIER		0x10000	/* Use block barriers */
 
 /* Compatibility, for having both ext2_fs.h and ext3_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h linux-2.6.5-rc1-mm2/include/linux/fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/fs.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/fs.h	2004-03-19 16:10:28.669463982 +0100
@@ -85,6 +85,7 @@ extern int leases_enable, dir_notify_ena
 #define SPECIAL 4	/* For non-blockdevice requests in request queue */
 #define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
 #define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
+#define WRITE_BARRIER	((1 << BIO_RW) | (1 << BIO_RW_BARRIER))
 
 #define SEL_IN		1
 #define SEL_OUT		2
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ide.h linux-2.6.5-rc1-mm2/include/linux/ide.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/ide.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/ide.h	2004-03-19 16:10:28.682462582 +0100
@@ -732,6 +732,8 @@ typedef struct ide_drive_s {
 	u8	bios_head;	/* BIOS/fdisk/LILO number of heads */
 	u8	bios_sect;	/* BIOS/fdisk/LILO sectors per track */
 	u8	queue_depth;	/* max queue depth */
+	u8	doing_barrier;	/* state, 1=currently doing flush */
+	u8	last_rq_flush;	/* last rq was a flush */
 
 	unsigned int	bios_cyl;	/* BIOS/fdisk/LILO number of cyls */
 	unsigned int	cyl;		/* "real" number of cyls */
@@ -744,6 +746,7 @@ typedef struct ide_drive_s {
 	int	lun;			/* logical unit */
 	int	crc_count;		/* crc counter to reduce drive speed */
+	char	special_buf[8];		/* private command buffer */
 	struct list_head list;
 	struct device	gendev;
 	struct semaphore gendev_rel_sem;	/* to deal with device release() */
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/jbd.h linux-2.6.5-rc1-mm2/include/linux/jbd.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/jbd.h	2004-03-11 03:55:43.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/jbd.h	2004-03-19 16:10:28.739456446 +0100
@@ -825,6 +825,7 @@ struct journal_s
 #define JFS_ACK_ERR	0x004	/* The errno in the sb has been acked */
 #define JFS_FLUSHED	0x008	/* The journal superblock has been flushed */
 #define JFS_LOADED	0x010	/* The journal superblock has been loaded */
+#define JFS_BARRIER	0x020	/* Use IDE barriers */
 
 /*
  * Function declarations for the journaling transaction and buffer
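On the filesystem side, BH_Ordered plus WRITE_BARRIER give the journalling code a cheap commit-block protocol: tag the commit buffer ordered, write it synchronously, and fall back to a plain write if the queue lacks QUEUE_FLAG_ORDERED. A sketch of that pattern, assuming (as the buffer-layer half of this series implies) that submit_bh() upgrades an ordered WRITE to WRITE_BARRIER and that the rejection surfaces as -EOPNOTSUPP:

static int write_commit_block (journal_t *journal, struct buffer_head *bh)
{
	int ret;

	set_buffer_ordered (bh);	/* BH_Ordered from this patch */
	ret = sync_dirty_buffer (bh);

	if (ret == -EOPNOTSUPP) {
		/* device cannot do barriers: disable them for future
		 * commits and retry as an ordinary synchronous write */
		journal->j_flags &= ~JFS_BARRIER;
		clear_buffer_ordered (bh);
		set_buffer_uptodate (bh);
		set_buffer_dirty (bh);
		ret = sync_dirty_buffer (bh);
	}
	return ret;
}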
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_acl.h	2004-03-19 16:10:28.821447619 +0100
@@ -0,0 +1,91 @@
+#include
+#include
+#include
+
+#define REISERFS_ACL_VERSION	0x0001
+
+typedef struct {
+	__u16		e_tag;
+	__u16		e_perm;
+	__u32		e_id;
+} reiserfs_acl_entry;
+
+typedef struct {
+	__u16		e_tag;
+	__u16		e_perm;
+} reiserfs_acl_entry_short;
+
+typedef struct {
+	__u32		a_version;
+} reiserfs_acl_header;
+
+static inline size_t reiserfs_acl_size(int count)
+{
+	if (count <= 4) {
+		return sizeof(reiserfs_acl_header) +
+		       count * sizeof(reiserfs_acl_entry_short);
+	} else {
+		return sizeof(reiserfs_acl_header) +
+		       4 * sizeof(reiserfs_acl_entry_short) +
+		       (count - 4) * sizeof(reiserfs_acl_entry);
+	}
+}
+
+static inline int reiserfs_acl_count(size_t size)
+{
+	ssize_t s;
+	size -= sizeof(reiserfs_acl_header);
+	s = size - 4 * sizeof(reiserfs_acl_entry_short);
+	if (s < 0) {
+		if (size % sizeof(reiserfs_acl_entry_short))
+			return -1;
+		return size / sizeof(reiserfs_acl_entry_short);
+	} else {
+		if (s % sizeof(reiserfs_acl_entry))
+			return -1;
+		return s / sizeof(reiserfs_acl_entry) + 4;
+	}
+}
+
+#ifdef CONFIG_REISERFS_FS_POSIX_ACL
+struct posix_acl * reiserfs_get_acl(struct inode *inode, int type);
+int reiserfs_set_acl(struct inode *inode, int type,
+		     struct posix_acl *acl);
+int reiserfs_acl_chmod (struct inode *inode);
+int reiserfs_inherit_default_acl (struct inode *dir, struct dentry *dentry, struct inode *inode);
+int reiserfs_cache_default_acl (struct inode *dir);
+extern int reiserfs_xattr_posix_acl_init (void) __init;
+extern int reiserfs_xattr_posix_acl_exit (void);
+extern struct reiserfs_xattr_handler posix_acl_default_handler;
+extern struct reiserfs_xattr_handler posix_acl_access_handler;
+#else
+
+#define reiserfs_set_acl NULL
+#define reiserfs_get_acl NULL
+#define reiserfs_cache_default_acl(inode) 0
+
+static inline int
+reiserfs_xattr_posix_acl_init (void)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_xattr_posix_acl_exit (void)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_acl_chmod (struct inode *inode)
+{
+	return 0;
+}
+
+static inline int
+reiserfs_inherit_default_acl (const struct inode *dir, struct dentry *dentry, struct inode *inode)
+{
+	return 0;
+}
+
+#endif
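The two inline helpers are exact inverses. With the layout above and no structure padding, the header is 4 bytes, a short entry 4 bytes, and a full entry 8 bytes, so 3 entries encode to 16 bytes and 6 entries to 36, and reiserfs_acl_count() recovers the counts (returning -1 for a size that cannot be a valid encoding). A quick illustrative self-check:

static inline void reiserfs_acl_layout_check (void)
{
	/* count <= 4: size = 4 + 4*count;  count > 4: size = 20 + 8*(count-4) */
	BUG_ON (reiserfs_acl_count (reiserfs_acl_size (3)) != 3);	/* 16 bytes */
	BUG_ON (reiserfs_acl_count (reiserfs_acl_size (6)) != 6);	/* 36 bytes */
}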
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h	2004-03-19 15:13:58.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs.h	2004-03-19 16:30:24.730644952 +0100
@@ -268,6 +268,7 @@ int is_reiserfs_jr (struct reiserfs_supe
 #define NO_DISK_SPACE -3
 #define NO_BALANCING_NEEDED  (-4)
 #define NO_MORE_UNUSED_CONTIGUOUS_BLOCKS (-5)
+#define QUOTA_EXCEEDED -6
 
 typedef __u32 b_blocknr_t;
 typedef __u32 unp_t;
@@ -287,7 +288,7 @@ struct unfm_nodeinfo {
 #define STAT_DATA_V2 1
 
-static inline struct reiserfs_inode_info *REISERFS_I(struct inode *inode)
+static inline struct reiserfs_inode_info *REISERFS_I(const struct inode *inode)
 {
 	return container_of(inode, struct reiserfs_inode_info, vfs_inode);
 }
@@ -1238,7 +1239,6 @@ excessive effort to avoid disturbing the
 gods only know how we are going to SMP the code that uses them.  znodes
 are the way! */
-
 struct path {
     int path_length;	/* Length of the array above. */
     struct path_element path_elements[EXTENDED_MAX_HEIGHT];	/* Array of the path elements. */
@@ -1702,45 +1702,91 @@ struct reiserfs_journal_header {
 	(((block)<<(JBH_HASH_SHIFT - 6)) ^ ((block) >> 13) ^ ((block) << (JBH_HASH_SHIFT - 12))))
 #define journal_hash(t,sb,block) ((t)[_jhashfn((sb),(block)) & JBH_HASH_MASK])
 
-/* finds n'th buffer with 0 being the start of this commit.  Needs to go away, j_ap_blocks has changed
-** since I created this.  One chunk of code in journal.c needs changing before deleting it
-*/
-#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
-
 // We need these to make journal.c code more readable
 #define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
 
-void reiserfs_commit_for_inode(struct inode *) ;
+enum reiserfs_bh_state_bits {
+    BH_JDirty = BH_PrivateStart,
+    BH_JDirty_wait,
+    BH_JNew,
+    BH_JPrepared,
+    BH_JRestore_dirty,
+    BH_JTest,	// debugging only, will go away
+};
+
+/*
+** transaction handle which is passed around for all journal calls
+*/
+struct reiserfs_transaction_handle {
+	struct super_block *t_super ;	/* super for this FS when journal_begin was
+					   called; saves calls to reiserfs_get_super.
+					   also used by nested transactions to make
+					   sure they are nesting on the right FS
+					   _must_ be first in the handle
+					*/
+	int t_refcount;
+	int t_blocks_logged ;		/* number of blocks this writer has logged */
+	int t_blocks_allocated ;	/* number of blocks this writer allocated */
+	unsigned long t_trans_id ;	/* sanity check, equals the current trans id */
+	void *t_handle_save ;		/* save existing current->journal_info */
+	int displace_new_blocks:1;	/* if new block allocation occurs, that block
+					   should be displaced from others */
+} ;
+
+/* used to keep track of ordered and tail writes, attached to the buffer
+ * head through b_journal_head.
+ */
+struct reiserfs_jh {
+	struct reiserfs_journal_list *jl;
+	struct buffer_head *bh;
+	struct list_head list;
+};
+
+void reiserfs_free_jh(struct buffer_head *bh);
+int reiserfs_add_tail_list(struct inode *inode, struct buffer_head *bh);
+int reiserfs_add_ordered_list(struct inode *inode, struct buffer_head *bh);
+int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
+
+static inline int reiserfs_transaction_running(struct super_block *s) {
+    struct reiserfs_transaction_handle *th = current->journal_info ;
+    if (th && th->t_super == s)
+        return 1 ;
+    if (th && th->t_super == NULL)
+        BUG();
+    return 0 ;
+}
+
+int reiserfs_async_progress_wait(struct super_block *s);
+
+struct reiserfs_transaction_handle *
+reiserfs_persistent_transaction(struct super_block *, int count);
+int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
+int reiserfs_commit_page(struct inode *inode, struct page *page,
+		unsigned from, unsigned to);
+int reiserfs_flush_old_commits(struct super_block *);
+int reiserfs_commit_for_inode(struct inode *) ;
+int reiserfs_inode_needs_commit(struct inode *) ;
 void reiserfs_update_inode_transaction(struct inode *) ;
 void reiserfs_wait_on_write_block(struct super_block *s) ;
 void reiserfs_block_writes(struct reiserfs_transaction_handle *th) ;
 void reiserfs_allow_writes(struct super_block *s) ;
 void reiserfs_check_lock_depth(char *caller) ;
-void reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
+int reiserfs_prepare_for_journal(struct super_block *, struct buffer_head *bh, int wait) ;
 void reiserfs_restore_prepared_buffer(struct super_block *, struct buffer_head *bh) ;
 int journal_init(struct super_block *, const char * j_dev_name, int old_format, unsigned int) ;
 int journal_release(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_release_error(struct reiserfs_transaction_handle*, struct super_block *) ;
 int journal_end(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
 int journal_end_sync(struct reiserfs_transaction_handle *, struct super_block *, unsigned long) ;
-int journal_mark_dirty_nolog(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
 int journal_mark_freed(struct reiserfs_transaction_handle *, struct super_block *, b_blocknr_t blocknr) ;
-int push_journal_writer(char *w) ;
-int pop_journal_writer(int windex) ;
 int journal_transaction_should_end(struct reiserfs_transaction_handle *, int) ;
 int reiserfs_in_journal(struct super_block *p_s_sb, int bmap_nr, int bit_nr, int searchall, b_blocknr_t *next) ;
 int journal_begin(struct reiserfs_transaction_handle *, struct super_block *p_s_sb, unsigned long) ;
-void flush_async_commits(struct super_block *p_s_sb) ;
 int buffer_journaled(const struct buffer_head *bh) ;
 int mark_buffer_journal_new(struct buffer_head *bh) ;
-int reiserfs_add_page_to_flush_list(struct reiserfs_transaction_handle *,
-                                    struct inode *, struct buffer_head *) ;
-int reiserfs_remove_page_from_flush_list(struct reiserfs_transaction_handle *,
-                                         struct inode *) ;
-
 int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, int) ;
 
 /* why is this kerplunked right here? */
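The persistent handle pair above decouples a transaction's lifetime from the stack frame that opened it, which matters when a caller may or may not already be inside a transaction (t_refcount lets handles nest). A hedged sketch of the intended calling convention (helper body illustrative; note that reiserfs_prepare_for_journal() now returns int):

static int log_one_buffer (struct super_block *s, struct buffer_head *bh)
{
	struct reiserfs_transaction_handle *th;

	th = reiserfs_persistent_transaction (s, 1);	/* reserve 1 block */
	if (!th)
		return -ENOMEM;

	reiserfs_prepare_for_journal (s, bh, 1);
	journal_mark_dirty (th, s, bh);

	return reiserfs_end_persistent_transaction (th);
}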
@@ -1844,11 +1890,13 @@ void pathrelse_and_restore (struct super
 
 int reiserfs_insert_item (struct reiserfs_transaction_handle *th,
 			  struct path * path,
 			  const struct cpu_key * key,
-			  struct item_head * ih, const char * body);
+			  struct item_head * ih,
+			  struct inode *inode, const char * body);
 
 int reiserfs_paste_into_item (struct reiserfs_transaction_handle *th,
 			      struct path * path,
 			      const struct cpu_key * key,
+			      struct inode *inode,
 			      const char * body, int paste_size);
 
 int reiserfs_cut_from_item (struct reiserfs_transaction_handle *th,
@@ -1865,7 +1913,7 @@ int reiserfs_delete_item (struct reiserf
 			  struct buffer_head  * p_s_un_bh);
 
 void reiserfs_delete_solid_item (struct reiserfs_transaction_handle *th,
-				 struct key * key);
+				 struct inode *inode, struct key * key);
 void reiserfs_delete_object (struct reiserfs_transaction_handle *th, struct inode * p_s_inode);
 void reiserfs_do_truncate (struct reiserfs_transaction_handle *th, struct inode * p_s_inode, struct page *,
@@ -1910,11 +1958,22 @@ int reiserfs_new_inode (struct reiserfs_
 			struct inode * dir, int mode,
 			const char * symname, loff_t i_size,
 			struct dentry *dentry, struct inode *inode);
-int reiserfs_sync_inode (struct reiserfs_transaction_handle *th, struct inode * inode);
-void reiserfs_update_sd (struct reiserfs_transaction_handle *th, struct inode * inode);
+
+int reiserfs_sync_inode (struct reiserfs_transaction_handle *th,
+			 struct inode * inode);
+
+void reiserfs_update_sd_size (struct reiserfs_transaction_handle *th,
+			      struct inode * inode, loff_t size);
+
+static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th,
+				      struct inode *inode)
+{
+    reiserfs_update_sd_size(th, inode, inode->i_size) ;
+}
 
 void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode );
 void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs );
+int reiserfs_setattr(struct dentry *dentry, struct iattr *attr);
 
 /* namei.c */
 void set_de_name_and_namelen (struct reiserfs_dir_entry * de);
@@ -1965,6 +2024,7 @@ int reiserfs_global_version_in_proc( cha
 
 /* dir.c */
 extern struct inode_operations reiserfs_dir_inode_operations;
+extern struct inode_operations reiserfs_symlink_inode_operations;
 extern struct file_operations reiserfs_dir_operations;
 
 /* tail_conversion.c */
@@ -2082,7 +2142,7 @@ typedef struct __reiserfs_blocknr_hint r
 
 int reiserfs_parse_alloc_options (struct super_block *, char *);
 int is_reusable (struct super_block * s, b_blocknr_t block, int bit_value);
-void reiserfs_free_block (struct reiserfs_transaction_handle *th, b_blocknr_t);
+void reiserfs_free_block (struct reiserfs_transaction_handle *th, struct inode *, b_blocknr_t, int for_unformatted);
 int reiserfs_allocate_blocknrs(reiserfs_blocknr_hint_t *, b_blocknr_t * , int, int);
 extern inline int reiserfs_new_form_blocknrs (struct tree_balance * tb, b_blocknr_t *new_blocknrs, int amount_needed)
@@ -2183,6 +2243,9 @@ int reiserfs_unpack (struct inode * inod
 #define reiserfs_write_lock( sb ) lock_kernel()
 #define reiserfs_write_unlock( sb ) unlock_kernel()
 
+/* xattr stuff */
+#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
+
 #endif /* _LINUX_REISER_FS_H */
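Several tree-level entry points above (reiserfs_insert_item, reiserfs_paste_into_item, reiserfs_delete_solid_item, reiserfs_free_block) gain a struct inode argument, giving the balancing code an owner to charge, consistent with the new QUOTA_EXCEEDED error. Call sites update mechanically; an illustrative before/after for reiserfs_free_block (values hypothetical):

	/* before: no owner context available at the call site */
	reiserfs_free_block (th, block);

	/* after: pass the owning inode, and say whether the freed block
	 * held unformatted data rather than tree nodes */
	reiserfs_free_block (th, inode, block, 1 /* for_unformatted */);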
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h	2004-03-11 03:55:43.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_i.h	2004-03-19 16:10:28.740456339 +0100
@@ -3,6 +3,8 @@
 
 #include
 
+struct reiserfs_journal_list;
+
 /** bitmasks for i_flags field in reiserfs-specific part of inode */
 typedef enum {
     /** this says what format of key do all items (but stat data) of
@@ -20,7 +22,9 @@ typedef enum {
 	truncate or unlink. Safe link is used to avoid leakage of disk
 	space on crash with some files open, but unlinked. */
     i_link_saved_unlink_mask   =  0x0010,
-    i_link_saved_truncate_mask =  0x0020
+    i_link_saved_truncate_mask =  0x0020,
+    i_priv_object = 0x0080,
+    i_has_xattr_dir = 0x0100,
 } reiserfs_inode_flags;
 
@@ -48,7 +52,11 @@ struct reiserfs_inode_info {
     ** needs to be committed in order for this inode to be properly
     ** flushed */
     unsigned long i_trans_id ;
-    unsigned long i_trans_index ;
+    struct reiserfs_journal_list *i_jl;
+
+    struct posix_acl *i_acl_access;
+    struct posix_acl *i_acl_default;
+    struct rw_semaphore xattr_sem;
 
     struct inode vfs_inode;
 };
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h	2004-03-11 03:55:27.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_fs_sb.h	2004-03-19 16:30:24.728645167 +0100
@@ -6,6 +6,7 @@
 
 #ifdef __KERNEL__
 #include
+#include
 #endif
 
 typedef enum {
@@ -106,21 +107,6 @@ typedef enum {
 #define JOURNAL_MAX_CNODE   1500 /* max cnodes to allocate. */
 #define JOURNAL_HASH_SIZE 8192
 #define JOURNAL_NUM_BITMAPS 5 /* number of copies of the bitmaps to have floating.  Must be >= 2 */
-#define JOURNAL_LIST_COUNT 64
-
-/* these are bh_state bit flag offset numbers, for use in the buffer head */
-
-#define BH_JDirty       16      /* journal data needs to be written before buffer can be marked dirty */
-#define BH_JDirty_wait 18	/* commit is done, buffer marked dirty */
-#define BH_JNew 19		/* buffer allocated during this transaction, no need to write if freed during this trans too */
-
-/* ugly.  metadata blocks must be prepared before they can be logged.
-** prepared means unlocked and cleaned.  If the block is prepared, but not
-** logged for some reason, any bits cleared while preparing it must be
-** set again.
-*/
-#define BH_JPrepared 20		/* block has been prepared for the log */
-#define BH_JRestore_dirty 22	/* restore the dirty bit later */
 
 /* One of these for every block in every transaction
 ** Each one is in two hash tables.  First, a hash of the current transaction, and after journal_end, a
@@ -154,22 +140,6 @@ struct reiserfs_list_bitmap {
 } ;
 
 /*
-** transaction handle which is passed around for all journal calls
-*/
-struct reiserfs_transaction_handle {
-  /* ifdef it. -Hans */
-  char *t_caller ;              /* debugging use */
-  int t_blocks_logged ;         /* number of blocks this writer has logged */
-  int t_blocks_allocated ;      /* number of blocks this writer allocated */
-  unsigned long t_trans_id ;    /* sanity check, equals the current trans id */
-  struct super_block *t_super ; /* super for this FS when journal_begin was
-                                   called. saves calls to reiserfs_get_super */
-  int displace_new_blocks:1;    /* if new block allocation occurres, that block
-                                   should be displaced from others */
-
-} ;
-
-/*
 ** one of these for each transaction.  The most important part here is the j_realblock.
 ** this list of cnodes is used to hash all the blocks in all the commits, to mark all the
 ** real buffer heads dirty once all the commits hit the disk,
 ** and to make sure every real block in a transaction is on disk before allowing the log area
 ** to be overwritten */
 struct reiserfs_journal_list {
   unsigned long j_start ;
+  unsigned long j_state;
   unsigned long j_len ;
   atomic_t j_nonzerolen ;
   atomic_t j_commit_left ;
-  atomic_t j_flushing ;
-  atomic_t j_commit_flushing ;
   atomic_t j_older_commits_done ;      /* all commits older than this on disk*/
+  struct semaphore j_commit_lock;
   unsigned long j_trans_id ;
   time_t j_timestamp ;
   struct reiserfs_list_bitmap *j_list_bitmap ;
   struct buffer_head *j_commit_bh ;    /* commit buffer head */
   struct reiserfs_journal_cnode *j_realblock ;
   struct reiserfs_journal_cnode *j_freedlist ; /* list of buffers that were freed during this trans.  free each of these on flush */
-  wait_queue_head_t j_commit_wait ;    /* wait for all the commit blocks to be flushed */
-  wait_queue_head_t j_flush_wait ;     /* wait for all the real blocks to be flushed */
-} ;
+  /* time ordered list of all active transactions */
+  struct list_head j_list;
 
-struct reiserfs_page_list ; /* defined in reiserfs_fs.h */
+  /* time ordered list of all transactions we haven't tried to flush yet */
+  struct list_head j_working_list;
+
+  /* list of tail conversion targets in need of flush before commit */
+  struct list_head j_tail_bh_list;
+  /* list of data=ordered buffers in need of flush before commit */
+  struct list_head j_bh_list;
+  int j_refcount;
+} ;
 
 struct reiserfs_journal {
   struct buffer_head ** j_ap_blocks ; /* journal blocks on disk */
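The per-list j_commit_lock replaces the old j_flushing/j_commit_flushing atomics and their wait queues: instead of open-coded sleep and wakeup loops, a committer simply holds the semaphore for the duration. A hedged sketch of the resulting shape (helper name illustrative):

static void commit_one_list (struct reiserfs_journal_list *jl)
{
	down (&jl->j_commit_lock);	/* waits out any commit in flight */
	if (atomic_read (&jl->j_commit_left) > 0)
		write_and_wait_on_commit_block (jl);	/* illustrative */
	up (&jl->j_commit_lock);
}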
@@ -216,16 +193,11 @@ struct reiserfs_journal {
   unsigned long j_last_flush_trans_id ;    /* last fully flushed journal timestamp */
   struct buffer_head *j_header_bh ;
 
-  /* j_flush_pages must be flushed before the current transaction can
-  ** commit
-  */
-  struct reiserfs_page_list *j_flush_pages ;
   time_t j_trans_start_time ;         /* time this transaction started */
-  wait_queue_head_t j_wait ;          /* wait journal_end to finish I/O */
-  atomic_t j_wlock ;                  /* lock for j_wait */
+  struct semaphore j_lock;
+  struct semaphore j_flush_sem;
   wait_queue_head_t j_join_wait ;     /* wait for current transaction to finish before starting new one */
   atomic_t j_jlock ;                  /* lock for j_join_wait */
-  int j_journal_list_index ;	      /* journal list number of the current trans */
   int j_list_bitmap_index ;	      /* number of next list bitmap to use */
   int j_must_wait ;		      /* no more journal begins allowed. MUST sleep on j_join_wait */
   int j_next_full_flush ;             /* next journal_end will flush all journal list */
@@ -242,24 +214,43 @@ struct reiserfs_journal {
   struct reiserfs_journal_cnode *j_cnode_free_list ;
   struct reiserfs_journal_cnode *j_cnode_free_orig ;             /* orig pointer returned from vmalloc */
 
+  struct reiserfs_journal_list *j_current_jl;
   int j_free_bitmap_nodes ;
   int j_used_bitmap_nodes ;
+
+  int j_num_lists;      /* total number of active transactions */
+  int j_num_work_lists; /* number that need attention from kreiserfsd */
+
+  /* debugging to make sure things are flushed in order */
+  int j_last_flush_id;
+
+  /* debugging to make sure things are committed in order */
+  int j_last_commit_id;
+
   struct list_head j_bitmap_nodes ;
   struct list_head j_dirty_buffers ;
   spinlock_t j_dirty_buffers_lock ; /* protects j_dirty_buffers */
+
+  /* list of all active transactions */
+  struct list_head j_journal_list;
+  /* lists that haven't been touched by writeback attempts */
+  struct list_head j_working_list;
+
   struct reiserfs_list_bitmap j_list_bitmap[JOURNAL_NUM_BITMAPS] ;	/* array of bitmaps to record the deleted blocks */
-  struct reiserfs_journal_list j_journal_list[JOURNAL_LIST_COUNT] ;	/* array of all the journal lists */
   struct reiserfs_journal_cnode *j_hash_table[JOURNAL_HASH_SIZE] ;	/* hash table for real buffer heads in current trans */
   struct reiserfs_journal_cnode *j_list_hash_table[JOURNAL_HASH_SIZE] ; /* hash table for all the real buffer heads in all the transactions */
   struct list_head j_prealloc_list;     /* list of inodes which have preallocated blocks */
   unsigned long j_max_trans_size ;
   unsigned long j_max_batch_size ;
+
+  /* when flushing ordered buffers, throttle new ordered writers */
+  struct work_struct j_work;
+  atomic_t j_async_throttle;
 };
 
 #define JOURNAL_DESC_MAGIC "ReIsErLB" /* ick.  magic string to find desc blocks in the journal */
 
-
 typedef __u32 (*hashf_t) (const signed char *, int);
 
 struct reiserfs_bitmap_info
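j_work and j_async_throttle exist so that flushing data=ordered buffers can be pushed off to a worker while new ordered writers are held back; reiserfs_async_progress_wait(), declared earlier in this patch, would be the blocking side writers use. A sketch of the kick side, under that assumption:

static void kick_async_flush (struct reiserfs_journal *journal)
{
	atomic_inc (&journal->j_async_throttle);	/* writers back off */
	schedule_work (&journal->j_work);	/* worker flushes, then decrements */
}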
@@ -403,18 +394,22 @@ struct reiserfs_sb_info
     struct proc_dir_entry *procdir;
     int reserved_blocks; /* amount of blocks reserved for further allocations */
     spinlock_t bitmap_lock; /* this lock on now only used to protect reserved_blocks variable */
+    struct dentry *priv_root; /* root of /.reiserfs_priv */
+    struct dentry *xattr_root; /* root of /.reiserfs_priv/.xa */
+    struct rw_semaphore xattr_dir_sem;
+
 };
 
 /* Definitions of reiserfs on-disk properties: */
 #define REISERFS_3_5 0
 #define REISERFS_3_6 1
 
+enum reiserfs_mount_options {
 /* Mount options */
-#define REISERFS_LARGETAIL 0  /* large tails will be created in a session */
-#define REISERFS_SMALLTAIL 17 /* small (for files less than block size) tails will be created in a session */
-#define REPLAYONLY 3 /* replay journal and return 0. Use by fsck */
-#define REISERFS_NOLOG 4      /* -o nolog: turn journalling off */
-#define REISERFS_CONVERT 5    /* -o conv: causes conversion of old
+    REISERFS_LARGETAIL,  /* large tails will be created in a session */
+    REISERFS_SMALLTAIL,  /* small (for files less than block size) tails will be created in a session */
+    REPLAYONLY,          /* replay journal and return 0. Used by fsck */
+    REISERFS_CONVERT,    /* -o conv: causes conversion of old
                              format super block to the new
                              format. If not specified - old
                              partition will be dealt with in a
@@ -428,26 +423,34 @@ struct reiserfs_sb_info
 ** the existing hash on the FS, so if you have a tea hash disk, and mount
 ** with -o hash=rupasov, the mount will fail.
 */
-#define FORCE_TEA_HASH 6      /* try to force tea hash on mount */
-#define FORCE_RUPASOV_HASH 7  /* try to force rupasov hash on mount */
-#define FORCE_R5_HASH 8       /* try to force rupasov hash on mount */
-#define FORCE_HASH_DETECT 9   /* try to detect hash function on mount */
-
+    FORCE_TEA_HASH,      /* try to force tea hash on mount */
+    FORCE_RUPASOV_HASH,  /* try to force rupasov hash on mount */
+    FORCE_R5_HASH,       /* try to force r5 hash on mount */
+    FORCE_HASH_DETECT,   /* try to detect hash function on mount */
+
+    REISERFS_DATA_LOG,
+    REISERFS_DATA_ORDERED,
+    REISERFS_DATA_WRITEBACK,
 
 /* used for testing experimental features, makes benchmarking new
    features with and without more convenient, should never be used by
   users in any code shipped to users (ideally) */
 
-#define REISERFS_NO_BORDER 11
-#define REISERFS_NO_UNHASHED_RELOCATION 12
-#define REISERFS_HASHED_RELOCATION 13
-
-#define REISERFS_ATTRS 15
-
-#define REISERFS_TEST1 11
-#define REISERFS_TEST2 12
-#define REISERFS_TEST3 13
-#define REISERFS_TEST4 14
+    REISERFS_NO_BORDER,
+    REISERFS_NO_UNHASHED_RELOCATION,
+    REISERFS_HASHED_RELOCATION,
+    REISERFS_ATTRS,
+    REISERFS_XATTRS,
+    REISERFS_XATTRS_USER,
+    REISERFS_POSIXACL,
+    REISERFS_BARRIER_NONE,
+    REISERFS_BARRIER_FLUSH,
+
+    REISERFS_TEST1,
+    REISERFS_TEST2,
+    REISERFS_TEST3,
+    REISERFS_TEST4,
+};
 
 #define reiserfs_r5_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_R5_HASH))
 #define reiserfs_rupasov_hash(s) (REISERFS_SB(s)->s_mount_opt & (1 << FORCE_RUPASOV_HASH))
@@ -461,17 +464,21 @@ struct reiserfs_sb_info
 #define have_large_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_LARGETAIL))
 #define have_small_tails(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_SMALLTAIL))
 #define replay_only(s) (REISERFS_SB(s)->s_mount_opt & (1 << REPLAYONLY))
-#define reiserfs_dont_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_NOLOG))
 #define reiserfs_attrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_ATTRS))
 #define old_format_only(s) (REISERFS_SB(s)->s_properties & (1 << REISERFS_3_5))
 #define convert_reiserfs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_CONVERT))
-
+#define reiserfs_data_log(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_LOG))
+#define reiserfs_data_ordered(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_ORDERED))
+#define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
+#define reiserfs_xattrs(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS))
+#define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
+#define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
+#define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
+#define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
 
 void reiserfs_file_buffer (struct buffer_head * bh, int list);
 extern struct file_system_type reiserfs_fs_type;
-int journal_mark_dirty(struct reiserfs_transaction_handle *, struct super_block *, struct buffer_head *bh) ;
-int flush_old_commits(struct super_block *s, int) ;
-int show_reiserfs_locks(void) ;
 int reiserfs_resize(struct super_block *, unsigned long) ;
 
 #define CARRY_ON                0
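Turning the option numbers into an enum removes hand-assigned bit values that had already collided (REISERFS_NO_BORDER and REISERFS_TEST1 both claimed bit 11 above); s_mount_opt remains a plain bitmask. An illustrative fragment of option handling under the new names (the barrier= spellings are assumed from the rest of this series):

static void apply_barrier_option (struct super_block *s, const char *arg)
{
	if (!strcmp (arg, "none"))
		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_NONE);
	else if (!strcmp (arg, "flush"))
		REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
	/* later, reiserfs_barrier_flush(s) selects the flush-based commit */
}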
@@ -481,8 +488,6 @@ int reiserfs_resize(struct super_block *
 #define SB_BUFFER_WITH_SB(s) (REISERFS_SB(s)->s_sbh)
 #define SB_JOURNAL(s) (REISERFS_SB(s)->s_journal)
 #define SB_JOURNAL_1st_RESERVED_BLOCK(s) (SB_JOURNAL(s)->j_1st_reserved_block)
-#define SB_JOURNAL_LIST(s) (SB_JOURNAL(s)->j_journal_list)
-#define SB_JOURNAL_LIST_INDEX(s) (SB_JOURNAL(s)->j_journal_list_index)
 #define SB_JOURNAL_LEN_FREE(s) (SB_JOURNAL(s)->j_journal_len_free)
 #define SB_AP_BITMAP(s) (REISERFS_SB(s)->s_ap_bitmap)
diff -urNp -X /home/axboe/cdrom/exclude /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h
--- /opt/kernel/linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.5-rc1-mm2/include/linux/reiserfs_xattr.h	2004-03-19 16:10:28.814448373 +0100
@@ -0,0 +1,132 @@
+/*
+  File: linux/reiserfs_xattr.h
+*/
+
+#include
+#include
+#include
+
+/* Magic value in header */
+#define REISERFS_XATTR_MAGIC 0x52465841	/* "RFXA" */
+
+struct reiserfs_xattr_header {
+	__u32 h_magic;	/* magic number for identification */
+	__u32 h_hash;	/* hash of the value */
+};
+
+#ifdef __KERNEL__
+
+struct reiserfs_xattr_handler {
+	char *prefix;
+	int (*init)(void);
+	void (*exit)(void);
+	int (*get)(struct inode *inode, const char *name, void *buffer,
+		   size_t size);
+	int (*set)(struct inode *inode, const char *name, const void *buffer,
+		   size_t size, int flags);
+	int (*del)(struct inode *inode, const char *name);
+	int (*list)(struct inode *inode, const char *name, int namelen, char *out);
+	struct list_head handlers;
+};
+
+#ifdef CONFIG_REISERFS_FS_XATTR
+#define is_reiserfs_priv_object(inode) (REISERFS_I(inode)->i_flags & i_priv_object)
+#define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir)
+ssize_t reiserfs_getxattr (struct dentry *dentry, const char *name,
+			   void *buffer, size_t size);
+int reiserfs_setxattr (struct dentry *dentry, const char *name,
+		       const void *value, size_t size, int flags);
+ssize_t reiserfs_listxattr (struct dentry *dentry, char *buffer, size_t size);
+int reiserfs_removexattr (struct dentry *dentry, const char *name);
+int reiserfs_delete_xattrs (struct inode *inode);
+int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs);
+int reiserfs_xattr_init (struct super_block *sb, int mount_flags);
+int reiserfs_permission (struct inode *inode, int mask, struct nameidata *nd);
+int reiserfs_permission_locked (struct inode *inode, int mask, struct nameidata *nd);
+
+int reiserfs_xattr_del (struct inode *, const char *);
+int reiserfs_xattr_get (const struct inode *, const char *, void *, size_t);
+int reiserfs_xattr_set (struct inode *, const char *, const void *,
+			size_t, int);
+
+extern struct reiserfs_xattr_handler user_handler;
+extern struct reiserfs_xattr_handler trusted_handler;
+#ifdef CONFIG_REISERFS_FS_SECURITY
+extern struct reiserfs_xattr_handler security_handler;
+#endif
+
+int reiserfs_xattr_register_handlers (void) __init;
+void reiserfs_xattr_unregister_handlers (void);
+
+static inline void
+reiserfs_write_lock_xattrs(struct super_block *sb)
+{
+	down_write (&REISERFS_XATTR_DIR_SEM(sb));
+}
+static inline void
+reiserfs_write_unlock_xattrs(struct super_block *sb)
+{
+	up_write (&REISERFS_XATTR_DIR_SEM(sb));
+}
+static inline void
+reiserfs_read_lock_xattrs(struct super_block *sb)
+{
+	down_read (&REISERFS_XATTR_DIR_SEM(sb));
+}
+
+static inline void
+reiserfs_read_unlock_xattrs(struct super_block *sb)
+{
+	up_read (&REISERFS_XATTR_DIR_SEM(sb));
+}
+
+static inline void
+reiserfs_write_lock_xattr_i(struct inode *inode)
+{
+	down_write (&REISERFS_I(inode)->xattr_sem);
+}
+static inline void
+reiserfs_write_unlock_xattr_i(struct inode *inode)
+{
+	up_write (&REISERFS_I(inode)->xattr_sem);
+}
+static inline void
+reiserfs_read_lock_xattr_i(struct inode *inode)
+{
+	down_read (&REISERFS_I(inode)->xattr_sem);
+}
+
+static inline void
+reiserfs_read_unlock_xattr_i(struct inode *inode)
+{
+	up_read (&REISERFS_I(inode)->xattr_sem);
+}
+
+#else
+
+#define is_reiserfs_priv_object(inode) 0
+#define reiserfs_getxattr NULL
+#define reiserfs_setxattr NULL
+#define reiserfs_listxattr NULL
+#define reiserfs_removexattr NULL
+#define reiserfs_write_lock_xattrs(sb)
+#define reiserfs_write_unlock_xattrs(sb)
+#define reiserfs_read_lock_xattrs(sb)
+#define reiserfs_read_unlock_xattrs(sb)
+
+#define reiserfs_permission NULL
+
+#define reiserfs_xattr_register_handlers() 0
+#define reiserfs_xattr_unregister_handlers()
+
+static inline int reiserfs_delete_xattrs (struct inode *inode) { return 0; };
+static inline int reiserfs_chown_xattrs (struct inode *inode, struct iattr *attrs) { return 0; };
+static inline int reiserfs_xattr_init (struct super_block *sb, int mount_flags)
+{
+	sb->s_flags = (sb->s_flags & ~MS_POSIXACL); /* to be sure */
+	return 0;
+};
+#endif
+
+#endif /* __KERNEL__ */
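The two semaphore pairs protect different things: xattr_dir_sem serializes changes to the hidden /.reiserfs_priv/.xa directory tree per super block, while xattr_sem covers one inode's attribute bodies; whichever nesting order the xattr code in this series establishes has to be used consistently everywhere. An illustrative read-side caller, nesting the directory lock inside the per-inode lock:

static int get_xattr_locked (struct inode *inode, const char *name,
			     void *buf, size_t len)
{
	int err;

	reiserfs_read_lock_xattr_i (inode);
	reiserfs_read_lock_xattrs (inode->i_sb);
	err = reiserfs_xattr_get (inode, name, buf, len);
	reiserfs_read_unlock_xattrs (inode->i_sb);
	reiserfs_read_unlock_xattr_i (inode);
	return err;
}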