# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1496 -> 1.1497 # include/linux/buffer_head.h 1.44 -> 1.45 # fs/xfs/pagebuf/page_buf.c 1.68 -> 1.69 # drivers/block/ll_rw_blk.c 1.221 -> 1.222 # include/linux/blkdev.h 1.128 -> 1.129 # include/linux/fs.h 1.274 -> 1.275 # include/linux/ide.h 1.76 -> 1.77 # drivers/ide/ide-disk.c 1.62 -> 1.63 # drivers/md/raid1.c 1.73 -> 1.74 # fs/buffer.c 1.215 -> 1.216 # drivers/ide/ide-io.c 1.20 -> 1.21 # fs/jbd/commit.c 1.41 -> 1.42 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/11/28 axboe@smithers.home.kernel.dk 1.1497 # ide barrier support # -------------------------------------------- # diff -Nru a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c --- a/drivers/block/ll_rw_blk.c Fri Nov 28 10:33:10 2003 +++ b/drivers/block/ll_rw_blk.c Fri Nov 28 10:33:10 2003 @@ -245,6 +245,31 @@ EXPORT_SYMBOL(blk_queue_make_request); /** + * blk_queue_ordered - does this queue support ordered writes + * @q: the request queue + * @flag: see below + * + * Description: + * For journalled file systems, doing ordered writes on a commit + * block instead of explicitly doing wait_on_buffer (which is bad + * for performance) can be a big win. Block drivers supporting this + * feature should call this function and indicate so. + * + * With this in mind, the values are + * QUEUE_ORDERED_NONE: the default, doesn't support barrier + * QUEUE_ORDERED_FLUSH: supports barrier through cache flush + **/ +void blk_queue_ordered(request_queue_t *q, int flag) +{ + if (flag) + set_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); + else + clear_bit(QUEUE_FLAG_ORDERED, &q->queue_flags); +} + +EXPORT_SYMBOL(blk_queue_ordered); + +/** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device * @dma_addr: bus address limit @@ -1836,6 +1861,8 @@ if (unlikely(!q)) return; + + WARN_ON(!req->ref_count); if (unlikely(--req->ref_count)) return; @@ -2002,7 +2029,7 @@ static int __make_request(request_queue_t *q, struct bio *bio) { struct request *req, *freereq = NULL; - int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra; + int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, ra, err; sector_t sector; sector = bio->bi_sector; @@ -2020,7 +2047,11 @@ spin_lock_prefetch(q->queue_lock); - barrier = test_bit(BIO_RW_BARRIER, &bio->bi_rw); + barrier = bio_barrier(bio); + if (barrier && !test_bit(QUEUE_FLAG_ORDERED, &q->queue_flags)) { + err = -EOPNOTSUPP; + goto end_io; + } ra = bio->bi_rw & (1 << BIO_RW_AHEAD); @@ -2102,6 +2133,7 @@ /* * READA bit set */ + err = -EWOULDBLOCK; if (ra) goto end_io; @@ -2157,7 +2189,7 @@ return 0; end_io: - bio_endio(bio, nr_sectors << 9, -EWOULDBLOCK); + bio_endio(bio, nr_sectors << 9, err); return 0; } diff -Nru a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c --- a/drivers/ide/ide-disk.c Fri Nov 28 10:33:10 2003 +++ b/drivers/ide/ide-disk.c Fri Nov 28 10:33:10 2003 @@ -1371,6 +1371,7 @@ static int write_cache (ide_drive_t *drive, int arg) { ide_task_t args; + int err; if (!(drive->id->cfs_enable_2 & 0x3000)) return 1; @@ -1380,7 +1381,10 @@ SETFEATURES_EN_WCACHE : SETFEATURES_DIS_WCACHE; args.tfRegister[IDE_COMMAND_OFFSET] = WIN_SETFEATURES; args.command_type = ide_cmd_type_parser(&args); - (void) ide_raw_taskfile(drive, &args, NULL); + + err = ide_raw_taskfile(drive, &args, NULL); + if (err) + return err; drive->wcache = arg; return 0; @@ -1688,6 +1692,12 @@ if (drive->id->cfs_enable_2 & 0x3000) write_cache(drive, (id->cfs_enable_2 & 0x3000)); + drive->wcache = drive->id->cfs_enable_1 & 0x20; + if (drive->wcache) { + printk("%s: write cache enabled\n", drive->name); + blk_queue_ordered(drive->queue, 1); + } + #ifdef CONFIG_BLK_DEV_IDE_TCQ_DEFAULT if (drive->using_dma) HWIF(drive)->ide_dma_queued_on(drive); @@ -1755,7 +1765,7 @@ drive->wcache = 0; /* Cache enabled ? */ if (drive->id->csfo & 1) - drive->wcache = 1; + drive->wcache = 1; /* Cache command set available ? */ if (drive->id->cfs_enable_1 & (1<<5)) drive->wcache = 1; diff -Nru a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c --- a/drivers/ide/ide-io.c Fri Nov 28 10:33:10 2003 +++ b/drivers/ide/ide-io.c Fri Nov 28 10:33:10 2003 @@ -85,30 +85,63 @@ #endif /* DISK_RECOVERY_TIME */ } -/** - * ide_end_request - complete an IDE I/O - * @drive: IDE device for the I/O - * @uptodate: - * @nr_sectors: number of sectors completed - * - * This is our end_request wrapper function. We complete the I/O - * update random number input and dequeue the request, which if - * it was tagged may be out of order. +/* + * preempt pending requests, and store this cache flush for immediate + * execution */ - -int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +static struct request *ide_queue_flush_cmd(ide_drive_t *drive, + struct request *rq, int post) { - struct request *rq; - unsigned long flags; - int ret = 1; + struct request *flush_rq = &HWGROUP(drive)->wrq; - spin_lock_irqsave(&ide_lock, flags); - rq = HWGROUP(drive)->rq; + /* + * write cache disabled, just return barrier write immediately + */ + if (!drive->wcache) + return rq; - BUG_ON(!(rq->flags & REQ_STARTED)); + /* + * if last rq issued was the post-flush, we can skip the pre-flush + */ + if (drive->last_rq_flush) { + rq->flags |= REQ_BAR_PREFLUSH; + return rq; + } - if (!nr_sectors) - nr_sectors = rq->hard_cur_sectors; + blkdev_dequeue_request(rq); + + memset(drive->special_buf, 0, sizeof(drive->special_buf)); + + ide_init_drive_cmd(flush_rq); + + flush_rq->flags = REQ_DRIVE_TASK; + flush_rq->buffer = drive->special_buf; + flush_rq->special = rq; + flush_rq->buffer[0] = WIN_FLUSH_CACHE; + flush_rq->nr_sectors = rq->nr_sectors; + + if (drive->id->cfs_enable_2 & 0x2400) + flush_rq->buffer[0] = WIN_FLUSH_CACHE_EXT; + + if (!post) { + drive->doing_barrier = 1; + flush_rq->flags |= REQ_BAR_PREFLUSH; + } else + flush_rq->flags |= REQ_BAR_POSTFLUSH; + + flush_rq->flags |= REQ_STARTED; + flush_rq->timeout = jiffies; + list_add(&flush_rq->queuelist, &drive->queue->queue_head); + HWGROUP(drive)->rq = NULL; + return flush_rq; +} + +static int __ide_end_request(ide_drive_t *drive, struct request *rq, + int uptodate, int nr_sectors) +{ + int ret = 1; + + BUG_ON(!(rq->flags & REQ_STARTED)); /* * if failfast is set on a request, override number of sectors and @@ -128,14 +161,55 @@ if (!end_that_request_first(rq, uptodate, nr_sectors)) { add_disk_randomness(rq->rq_disk); - if (!blk_rq_tagged(rq)) - blkdev_dequeue_request(rq); - else + + if (blk_rq_tagged(rq)) blk_queue_end_tag(drive->queue, rq); - HWGROUP(drive)->rq = NULL; + else if (!blk_barrier_rq(rq)) + blkdev_dequeue_request(rq); + end_that_request_last(rq); + HWGROUP(drive)->rq = NULL; ret = 0; } + + return ret; +} + +/** + * ide_end_request - complete an IDE I/O + * @drive: IDE device for the I/O + * @uptodate: + * @nr_sectors: number of sectors completed + * + * This is our end_request wrapper function. We complete the I/O + * update random number input and dequeue the request, which if + * it was tagged may be out of order. + */ + +int ide_end_request (ide_drive_t *drive, int uptodate, int nr_sectors) +{ + struct request *rq; + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&ide_lock, flags); + rq = HWGROUP(drive)->rq; + + if (!nr_sectors) + nr_sectors = rq->hard_cur_sectors; + + if (!blk_barrier_rq(rq)) + ret = __ide_end_request(drive, rq, uptodate, nr_sectors); + else { + struct request *flush_rq = &HWGROUP(drive)->wrq; + + flush_rq->nr_sectors -= nr_sectors; + if (!flush_rq->nr_sectors) { + ide_queue_flush_cmd(drive, rq, 1); + ret = 0; + } + } + spin_unlock_irqrestore(&ide_lock, flags); return ret; } @@ -171,6 +245,97 @@ spin_unlock_irqrestore(&ide_lock, flags); } +/* + * FIXME: probably move this somewhere else, name is bad too :) + */ +static sector_t ide_get_error_location(ide_drive_t *drive, char *args) +{ + u32 high, low; + u8 hcyl, lcyl, sect; + sector_t sector; + + high = 0; + hcyl = args[5]; + lcyl = args[4]; + sect = args[3]; + + if (drive->id->cfs_enable_2 & 0x2400) { + low = (hcyl << 16) | (lcyl << 8) | sect; + HWIF(drive)->OUTB(drive->ctl|0x80, IDE_CONTROL_REG); + high = ide_read_24(drive); + } else { + u8 cur = HWIF(drive)->INB(IDE_SELECT_REG); + if (cur & 0x40) + low = (hcyl << 16) | (lcyl << 8) | sect; + else { + low = hcyl * drive->head * drive->sect; + low += lcyl * drive->sect; + low += sect - 1; + } + } + + sector = ((sector_t) high << 24) | low; + return sector; +} + +static void ide_complete_barrier(ide_drive_t *drive, struct request *rq, + int error) +{ + struct request *real_rq = rq->special; + int good_sectors, bad_sectors; + sector_t sector; + + if (!error) { + if (blk_barrier_postflush(rq)) { + /* + * this completes the barrier write + */ + __ide_end_request(drive, real_rq, 1, real_rq->hard_nr_sectors); + drive->doing_barrier = 0; + drive->last_rq_flush = 1; + } else { + /* + * just indicate that we did the pre flush + */ + real_rq->flags |= REQ_BAR_PREFLUSH; + __elv_add_request(drive->queue, real_rq, ELEVATOR_INSERT_FRONT, 0); + } + +#ifdef IDE_DUMP_FLUSH_TIMINGS + printk("%s: %sflush took %lu jiffies\n", drive->name, blk_barrier_postflush(rq) ? "post" : "pre", jiffies - rq->timeout); +#endif + + /* + * all is fine, return + */ + return; + } + + /* + * bummer, flush failed. if it was the pre-flush, fail the barrier. + * if it was the post-flush, complete the succesful part of the request + * and fail the rest + */ + good_sectors = 0; + if (blk_barrier_postflush(rq)) { + sector = ide_get_error_location(drive, rq->buffer); + + if ((sector >= real_rq->hard_sector) && + (sector < real_rq->hard_sector + real_rq->hard_nr_sectors)) + good_sectors = sector - real_rq->hard_sector; + } else + sector = real_rq->hard_sector; + + bad_sectors = real_rq->hard_nr_sectors - good_sectors; + if (good_sectors) + __ide_end_request(drive, real_rq, 1, good_sectors); + if (bad_sectors) + __ide_end_request(drive, real_rq, 0, bad_sectors); + + printk(KERN_ERR "%s: failed barrier write: sector=%Lx(good=%d/bad=%d)\n", drive->name, sector, good_sectors, bad_sectors); + blk_queue_ordered(drive->queue, 1); +} + /** * ide_end_drive_cmd - end an explicit drive command * @drive: command @@ -260,6 +425,10 @@ spin_lock_irqsave(&ide_lock, flags); blkdev_dequeue_request(rq); + + if (blk_barrier_preflush(rq) || blk_barrier_postflush(rq)) + ide_complete_barrier(drive, rq, err); + HWGROUP(drive)->rq = NULL; end_that_request_last(rq); spin_unlock_irqrestore(&ide_lock, flags); @@ -752,6 +921,15 @@ repeat: best = NULL; drive = hwgroup->drive; + + /* + * drive is doing pre-flush, ordered write, post-flush sequence. even + * though that is 3 requests, it must be seen as a single transaction. + * we must not preempt this drive until that is complete + */ + if (drive->doing_barrier) + return drive; + do { if ((!drive->sleep || time_after_eq(jiffies, drive->sleep)) && !elv_queue_empty(drive->queue)) { @@ -919,6 +1097,15 @@ } /* + * if rq is a barrier write, issue pre cache flush if not + * already done + */ + if (blk_barrier_rq(rq) && !blk_barrier_preflush(rq)) + rq = ide_queue_flush_cmd(drive, rq, 0); + + drive->last_rq_flush = 0; + + /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as * blk_stop_queue() doesn't prevent the elv_next_request() @@ -937,6 +1124,10 @@ break; } + /* + * we can only queue read-write requests, so let the drive + * queue drain before continuing with this command. + */ if (!rq->bio && ata_pending_commands(drive)) break; @@ -1344,6 +1535,7 @@ { memset(rq, 0, sizeof(*rq)); rq->flags = REQ_DRIVE_CMD; + rq->ref_count = 1; } EXPORT_SYMBOL(ide_init_drive_cmd); diff -Nru a/drivers/md/raid1.c b/drivers/md/raid1.c --- a/drivers/md/raid1.c Fri Nov 28 10:33:10 2003 +++ b/drivers/md/raid1.c Fri Nov 28 10:33:10 2003 @@ -879,7 +879,7 @@ conf = mddev_to_conf(mddev); bio = r1_bio->master_bio; switch(r1_bio->cmd) { - case SPECIAL: + case WRITESYNC: sync_request_write(mddev, r1_bio); break; case READ: @@ -989,7 +989,7 @@ r1_bio->mddev = mddev; r1_bio->sector = sector_nr; - r1_bio->cmd = SPECIAL; + r1_bio->cmd = WRITESYNC; r1_bio->read_disk = disk; bio = r1_bio->master_bio; diff -Nru a/fs/buffer.c b/fs/buffer.c --- a/fs/buffer.c Fri Nov 28 10:33:10 2003 +++ b/fs/buffer.c Fri Nov 28 10:33:10 2003 @@ -2665,6 +2665,9 @@ if (rw == READ && buffer_dirty(bh)) buffer_error(); + if (buffer_ordered(bh) && (rw == WRITE)) + rw = WRITESYNC; + /* Only clear out a write error when rewriting */ if (test_set_buffer_req(bh) && rw == WRITE) clear_buffer_write_io_error(bh); diff -Nru a/fs/jbd/commit.c b/fs/jbd/commit.c --- a/fs/jbd/commit.c Fri Nov 28 10:33:10 2003 +++ b/fs/jbd/commit.c Fri Nov 28 10:33:10 2003 @@ -621,7 +621,9 @@ { struct buffer_head *bh = jh2bh(descriptor); set_buffer_uptodate(bh); + set_buffer_ordered(bh); sync_dirty_buffer(bh); + clear_buffer_ordered(bh); if (unlikely(!buffer_uptodate(bh))) err = -EIO; put_bh(bh); /* One for getblk() */ diff -Nru a/fs/xfs/pagebuf/page_buf.c b/fs/xfs/pagebuf/page_buf.c --- a/fs/xfs/pagebuf/page_buf.c Fri Nov 28 10:33:10 2003 +++ b/fs/xfs/pagebuf/page_buf.c Fri Nov 28 10:33:10 2003 @@ -1403,12 +1403,12 @@ submit_io: if (likely(bio->bi_size)) { - if (pb->pb_flags & PBF_READ) { - submit_bio(READ, bio); - } else { - submit_bio(WRITE, bio); - } - + int cmd = WRITE; + if (pb->pb_flags & PBF_READ) + cmd = READ; + else if (pb->pb_flags & PBF_FLUSH) + cmd = WRITESYNC; + submit_bio(cmd, bio); if (size) goto next_chunk; } else { diff -Nru a/include/linux/blkdev.h b/include/linux/blkdev.h --- a/include/linux/blkdev.h Fri Nov 28 10:33:10 2003 +++ b/include/linux/blkdev.h Fri Nov 28 10:33:10 2003 @@ -198,6 +198,8 @@ __REQ_IDETAPE_READ, __REQ_IDETAPE_WRITE, __REQ_IDETAPE_READ_BUFFER, + __REQ_BAR_PREFLUSH, /* barrier pre-flush done */ + __REQ_BAR_POSTFLUSH, /* barrier post-flush */ __REQ_NR_BITS, /* stops here */ }; @@ -228,6 +230,8 @@ #define REQ_IDETAPE_READ (1 << __REQ_IDETAPE_READ) #define REQ_IDETAPE_WRITE (1 << __REQ_IDETAPE_WRITE) #define REQ_IDETAPE_READ_BUFFER (1 << __REQ_IDETAPE_READ_BUFFER) +#define REQ_BAR_PREFLUSH (1 << __REQ_BAR_PREFLUSH) +#define REQ_BAR_POSTFLUSH (1 << __REQ_BAR_POSTFLUSH) /* * State information carried for REQ_PM_SUSPEND and REQ_PM_RESUME @@ -377,6 +381,7 @@ #define QUEUE_FLAG_READFULL 3 /* write queue has been filled */ #define QUEUE_FLAG_WRITEFULL 4 /* read queue has been filled */ #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ +#define QUEUE_FLAG_ORDERED 6 /* supports ordered writes */ #define blk_queue_plugged(q) !list_empty(&(q)->plug_list) #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) @@ -389,6 +394,10 @@ #define blk_pm_request(rq) \ ((rq)->flags & (REQ_PM_SUSPEND | REQ_PM_RESUME)) +#define blk_barrier_rq(rq) ((rq)->flags & REQ_HARDBARRIER) +#define blk_barrier_preflush(rq) ((rq)->flags & REQ_BAR_PREFLUSH) +#define blk_barrier_postflush(rq) ((rq)->flags & REQ_BAR_POSTFLUSH) + #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) ((rq)->flags & 1) @@ -571,6 +580,7 @@ extern void blk_queue_merge_bvec(request_queue_t *, merge_bvec_fn *); extern void blk_queue_dma_alignment(request_queue_t *, int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); +extern void blk_queue_ordered(request_queue_t *, int); extern int blk_rq_map_sg(request_queue_t *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); diff -Nru a/include/linux/buffer_head.h b/include/linux/buffer_head.h --- a/include/linux/buffer_head.h Fri Nov 28 10:33:10 2003 +++ b/include/linux/buffer_head.h Fri Nov 28 10:33:10 2003 @@ -26,6 +26,7 @@ BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ + BH_Ordered, /* ordered write */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities @@ -117,7 +118,8 @@ BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) -BUFFER_FNS(Write_EIO,write_io_error) +BUFFER_FNS(Write_EIO, write_io_error) +BUFFER_FNS(Ordered, ordered) #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) #define touch_buffer(bh) mark_page_accessed(bh->b_page) diff -Nru a/include/linux/fs.h b/include/linux/fs.h --- a/include/linux/fs.h Fri Nov 28 10:33:10 2003 +++ b/include/linux/fs.h Fri Nov 28 10:33:10 2003 @@ -81,7 +81,7 @@ #define READ 0 #define WRITE 1 #define READA 2 /* read-ahead - don't block if no resources */ -#define SPECIAL 4 /* For non-blockdevice requests in request queue */ +#define WRITESYNC ((1 << BIO_RW) | (1 << BIO_RW_BARRIER)) #define SEL_IN 1 #define SEL_OUT 2 diff -Nru a/include/linux/ide.h b/include/linux/ide.h --- a/include/linux/ide.h Fri Nov 28 10:33:10 2003 +++ b/include/linux/ide.h Fri Nov 28 10:33:10 2003 @@ -757,6 +757,8 @@ u8 bios_head; /* BIOS/fdisk/LILO number of heads */ u8 bios_sect; /* BIOS/fdisk/LILO sectors per track */ u8 queue_depth; /* max queue depth */ + u8 doing_barrier; /* state, 1=currently doing flush */ + u8 last_rq_flush; /* last rq was a flush */ unsigned int bios_cyl; /* BIOS/fdisk/LILO number of cyls */ unsigned int cyl; /* "real" number of cyls */ @@ -771,6 +773,7 @@ int forced_lun; /* if hdxlun was given at boot */ int lun; /* logical unit */ int crc_count; /* crc counter to reduce drive speed */ + char special_buf[8]; /* private command buffer */ struct list_head list; struct device gendev; struct semaphore gendev_rel_sem; /* to deal with device release() */