diff -urN 2.2.15pre19/arch/sparc64/kernel/ioctl32.c elevator/arch/sparc64/kernel/ioctl32.c
--- 2.2.15pre19/arch/sparc64/kernel/ioctl32.c	Fri Apr 21 15:59:20 2000
+++ elevator/arch/sparc64/kernel/ioctl32.c	Tue Apr 25 18:51:22 2000
@@ -1996,6 +1996,8 @@
 	case HDIO_SET_NICE:
 	case BLKROSET:
 	case BLKROGET:
+	case BLKELVGET:
+	case BLKELVSET:
 
 	/* 0x02 -- Floppy ioctls */
 	case FDMSGON:
diff -urN 2.2.15pre19/drivers/block/ide.c elevator/drivers/block/ide.c
--- 2.2.15pre19/drivers/block/ide.c	Fri Apr 21 15:59:21 2000
+++ elevator/drivers/block/ide.c	Tue Apr 25 18:51:22 2000
@@ -2287,6 +2287,10 @@
 			drive->nice1 = (arg >> IDE_NICE_1) & 1;
 			return 0;
 
+		case BLKELVGET:
+		case BLKELVSET:
+			return blkelv_ioctl(inode->i_rdev, cmd, arg);
+
 		RO_IOCTLS(inode->i_rdev, arg);
 
 		default:
diff -urN 2.2.15pre19/drivers/block/ll_rw_blk.c elevator/drivers/block/ll_rw_blk.c
--- 2.2.15pre19/drivers/block/ll_rw_blk.c	Fri Apr 21 15:59:21 2000
+++ elevator/drivers/block/ll_rw_blk.c	Tue Apr 25 18:52:31 2000
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 1991, 1992 Linus Torvalds
  * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
+ * Elevator latency, (C) 2000 Andrea Arcangeli SuSE
  */
 
 /*
@@ -20,6 +21,7 @@
 #include
 #include
+#include
 #include
 #include
@@ -142,6 +144,17 @@
 	return &blk_dev[major].current_request;
 }
 
+static inline int get_request_latency(elevator_t * elevator, int rw)
+{
+	int latency;
+
+	latency = elevator->read_latency;
+	if (rw != READ)
+		latency = elevator->write_latency;
+
+	return latency;
+}
+
 /*
  * remove the plug and let it rip..
  */
@@ -291,6 +304,196 @@
 		printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n");
 }
 
+static int blkelvget_ioctl(elevator_t * elevator, blkelv_ioctl_arg_t * arg)
+{
+	int ret;
+	blkelv_ioctl_arg_t output;
+
+	output.queue_ID = elevator->queue_ID;
+	output.read_latency = elevator->read_latency;
+	output.write_latency = elevator->write_latency;
+	output.max_bomb_segments = elevator->max_bomb_segments;
+
+	ret = -EFAULT;
+	if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
+		goto out;
+	ret = 0;
+ out:
+	return ret;
+}
+
+static int blkelvset_ioctl(elevator_t * elevator, const blkelv_ioctl_arg_t * arg)
+{
+	blkelv_ioctl_arg_t input;
+	int ret;
+
+	ret = -EFAULT;
+	if (copy_from_user(&input, arg, sizeof(blkelv_ioctl_arg_t)))
+		goto out;
+
+	ret = -EINVAL;
+	if (input.read_latency < 0)
+		goto out;
+	if (input.write_latency < 0)
+		goto out;
+	if (input.max_bomb_segments <= 0)
+		goto out;
+
+	elevator->read_latency = input.read_latency;
+	elevator->write_latency = input.write_latency;
+	elevator->max_bomb_segments = input.max_bomb_segments;
+
+	ret = 0;
+ out:
+	return ret;
+}
+
+int blkelv_ioctl(kdev_t dev, unsigned long cmd, unsigned long arg)
+{
+	elevator_t * elevator = &blk_dev[MAJOR(dev)].elevator;
+	blkelv_ioctl_arg_t * __arg = (blkelv_ioctl_arg_t *) arg;
+
+	switch (cmd) {
+	case BLKELVGET:
+		return blkelvget_ioctl(elevator, __arg);
+	case BLKELVSET:
+		return blkelvset_ioctl(elevator, __arg);
+	}
+}
+
+static inline int seek_to_not_starving_chunk(struct request ** req, int * lat)
+{
+	struct request * tmp = *req;
+	int found = 0, pos = 0;
+	int last_pos = 0, __lat = *lat;
+
+	do {
+		if (tmp->elevator_latency <= 0)
+		{
+			*req = tmp;
+			found = 1;
+			last_pos = pos;
+			if (last_pos >= __lat)
+				break;
+		}
+		pos += tmp->nr_segments;
+	} while ((tmp = tmp->next));
+	*lat -= last_pos;
+
+	return found;
+}
+
+#define CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY	\
+	case IDE0_MAJOR:	/* same as HD_MAJOR */	\
+	case IDE1_MAJOR:				\
+	case FLOPPY_MAJOR:				\
+	case IDE2_MAJOR:				\
+	case IDE3_MAJOR:				\
+	case IDE4_MAJOR:				\
+	case IDE5_MAJOR:				\
+	case ACSI_MAJOR:				\
+	case MFM_ACORN_MAJOR:				\
+	case MDISK_MAJOR:				\
+	case DASD_MAJOR:
+#define CASE_COALESCE_ALSO_FIRST_REQUEST	\
+	case SCSI_DISK0_MAJOR:			\
+	case SCSI_DISK1_MAJOR:			\
+	case SCSI_DISK2_MAJOR:			\
+	case SCSI_DISK3_MAJOR:			\
+	case SCSI_DISK4_MAJOR:			\
+	case SCSI_DISK5_MAJOR:			\
+	case SCSI_DISK6_MAJOR:			\
+	case SCSI_DISK7_MAJOR:			\
+	case SCSI_CDROM_MAJOR:			\
+	case DAC960_MAJOR+0:			\
+	case DAC960_MAJOR+1:			\
+	case DAC960_MAJOR+2:			\
+	case DAC960_MAJOR+3:			\
+	case DAC960_MAJOR+4:			\
+	case DAC960_MAJOR+5:			\
+	case DAC960_MAJOR+6:			\
+	case DAC960_MAJOR+7:			\
+	case COMPAQ_SMART2_MAJOR+0:		\
+	case COMPAQ_SMART2_MAJOR+1:		\
+	case COMPAQ_SMART2_MAJOR+2:		\
+	case COMPAQ_SMART2_MAJOR+3:		\
+	case COMPAQ_SMART2_MAJOR+4:		\
+	case COMPAQ_SMART2_MAJOR+5:		\
+	case COMPAQ_SMART2_MAJOR+6:		\
+	case COMPAQ_SMART2_MAJOR+7:
+
+#define elevator_starve_rest_of_queue(req)			\
+do {								\
+	struct request * tmp = (req);				\
+	for ((tmp) = (tmp)->next; (tmp); (tmp) = (tmp)->next)	\
+		(tmp)->elevator_latency--;			\
+} while (0)
+
+static inline void elevator_queue(struct request * req,
+				  struct request * tmp,
+				  int latency,
+				  struct blk_dev_struct * dev,
+				  struct request ** queue_head)
+{
+	struct request * __tmp;
+	int starving, __latency;
+
+	starving = seek_to_not_starving_chunk(&tmp, &latency);
+	__tmp = tmp;
+	__latency = latency;
+
+	for (;; tmp = tmp->next)
+	{
+		if ((latency -= tmp->nr_segments) <= 0)
+		{
+			tmp = __tmp;
+			latency = __latency - tmp->nr_segments;
+
+			if (starving)
+				break;
+
+			switch (MAJOR(req->rq_dev))
+			{
+			CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY
+				if (tmp == dev->current_request)
+			default:
+					goto link;
+			CASE_COALESCE_ALSO_FIRST_REQUEST
+			}
+
+			latency += tmp->nr_segments;
+			req->next = tmp;
+			*queue_head = req;
+			goto after_link;
+		}
+
+		if (!tmp->next)
+			break;
+
+		{
+			const int after_current = IN_ORDER(tmp,req);
+			const int before_next = IN_ORDER(req,tmp->next);
+
+			if (!IN_ORDER(tmp,tmp->next)) {
+				if (after_current || before_next)
+					break;
+			} else {
+				if (after_current && before_next)
+					break;
+			}
+		}
+	}
+
+ link:
+	req->next = tmp->next;
+	tmp->next = req;
+
+ after_link:
+	req->elevator_latency = latency;
+
+	elevator_starve_rest_of_queue(req);
+}
+
 /*
  * add-request adds a request to the linked list.
  * It disables interrupts (aquires the request spinlock) so that it can muck
@@ -309,6 +512,7 @@
 	short disk_index;
 	unsigned long flags;
 	int queue_new_request = 0;
+	int latency;
 
 	switch (major) {
 		case DAC960_MAJOR+0:
@@ -333,7 +537,7 @@
 			break;
 	}
 
-	req->next = NULL;
+	latency = get_request_latency(&dev->elevator, req->cmd);
 
 	/*
 	 * We use the goto to reduce locking complexity
@@ -344,25 +548,14 @@
 	if (req->bh)
 		mark_buffer_clean(req->bh);
 	if (!(tmp = *current_request)) {
+		req->next = NULL;
+		req->elevator_latency = latency;
 		*current_request = req;
 		if (dev->current_request != &dev->plug)
 			queue_new_request = 1;
 		goto out;
 	}
-	for ( ; tmp->next ; tmp = tmp->next) {
-		const int after_current = IN_ORDER(tmp,req);
-		const int before_next = IN_ORDER(req,tmp->next);
-
-		if (!IN_ORDER(tmp,tmp->next)) {
-			if (after_current || before_next)
-				break;
-		} else {
-			if (after_current && before_next)
-				break;
-		}
-	}
-	req->next = tmp->next;
-	tmp->next = req;
+	elevator_queue(req, tmp, latency, dev, current_request);
 
 	/* for SCSI devices, call request_fn unconditionally */
 	if (scsi_blk_major(major) ||
@@ -399,6 +592,8 @@
 	total_segments--;
 	if (total_segments > max_segments)
 		return;
+	if (next->elevator_latency < req->elevator_latency)
+		req->elevator_latency = next->elevator_latency;
 	req->bhtail->b_reqnext = next->bh;
 	req->bhtail = next->bhtail;
 	req->nr_sectors += next->nr_sectors;
@@ -408,12 +603,28 @@
 	wake_up (&wait_for_request);
 }
 
+#define read_pendings(req)			\
+({						\
+	int __ret = 0;				\
+	struct request * tmp = (req);		\
+	do {					\
+		if (tmp->cmd == READ)		\
+		{				\
+			__ret = 1;		\
+			break;			\
+		}				\
+		tmp = tmp->next;		\
+	} while (tmp);				\
+	__ret;					\
+})
+
 void make_request(int major, int rw, struct buffer_head * bh)
 {
 	unsigned int sector, count;
-	struct request * req;
+	struct request * req, * prev;
 	int rw_ahead, max_req, max_sectors, max_segments;
 	unsigned long flags;
+	int latency, starving;
 
 	count = bh->b_size >> 9;
 	sector = bh->b_rsector;
@@ -490,6 +701,8 @@
 	max_sectors = get_max_sectors(bh->b_rdev);
 	max_segments = get_max_segments(bh->b_rdev);
 
+	latency = get_request_latency(&blk_dev[major].elevator, rw);
+
 	/*
 	 * Now we acquire the request spinlock, we have to be mega careful
 	 * not to schedule or do something nonatomic
@@ -502,17 +715,7 @@
 		    major != DDV_MAJOR && major != NBD_MAJOR)
 			plug_device(blk_dev + major); /* is atomic */
 	} else switch (major) {
-	     case IDE0_MAJOR:	/* same as HD_MAJOR */
-	     case IDE1_MAJOR:
-	     case FLOPPY_MAJOR:
-	     case IDE2_MAJOR:
-	     case IDE3_MAJOR:
-	     case IDE4_MAJOR:
-	     case IDE5_MAJOR:
-	     case ACSI_MAJOR:
-	     case MFM_ACORN_MAJOR:
-	     case MDISK_MAJOR:
-	     case DASD_MAJOR:
+	     CASE_COALESCE_BUT_FIRST_REQUEST_MAYBE_BUSY
 		/*
 		 * The scsi disk and cdrom drivers completely remove the request
 		 * from the queue when they start processing an entry.  For this
@@ -523,37 +726,20 @@
 		 * entry may be busy being processed and we thus can't change it.
 		 */
 		if (req == blk_dev[major].current_request)
-	     		req = req->next;
-		if (!req)
-			break;
+		{
+			if (!(req = req->next))
+				break;
+			latency -= req->nr_segments;
+		}
 		/* fall through */
+	     CASE_COALESCE_ALSO_FIRST_REQUEST
 
-	     case SCSI_DISK0_MAJOR:
-	     case SCSI_DISK1_MAJOR:
-	     case SCSI_DISK2_MAJOR:
-	     case SCSI_DISK3_MAJOR:
-	     case SCSI_DISK4_MAJOR:
-	     case SCSI_DISK5_MAJOR:
-	     case SCSI_DISK6_MAJOR:
-	     case SCSI_DISK7_MAJOR:
-	     case SCSI_CDROM_MAJOR:
-	     case DAC960_MAJOR+0:
-	     case DAC960_MAJOR+1:
-	     case DAC960_MAJOR+2:
-	     case DAC960_MAJOR+3:
-	     case DAC960_MAJOR+4:
-	     case DAC960_MAJOR+5:
-	     case DAC960_MAJOR+6:
-	     case DAC960_MAJOR+7:
-	     case COMPAQ_SMART2_MAJOR+0:
-	     case COMPAQ_SMART2_MAJOR+1:
-	     case COMPAQ_SMART2_MAJOR+2:
-	     case COMPAQ_SMART2_MAJOR+3:
-	     case COMPAQ_SMART2_MAJOR+4:
-	     case COMPAQ_SMART2_MAJOR+5:
-	     case COMPAQ_SMART2_MAJOR+6:
-	     case COMPAQ_SMART2_MAJOR+7:
+		/* avoid write-bombs to not hurt iteractiveness of reads */
+		if (rw != READ && read_pendings(req))
+			max_segments = blk_dev[major].elevator.max_bomb_segments;
+		starving = seek_to_not_starving_chunk(&req, &latency);
+		prev = NULL;
 
 		do {
 			if (req->sem)
 				continue;
@@ -565,24 +751,34 @@
 				continue;
 			/* Can we add it to the end of this request? */
 			if (req->sector + req->nr_sectors == sector) {
+				if (latency - req->nr_segments < 0)
+					break;
 				if (req->bhtail->b_data + req->bhtail->b_size
 				    != bh->b_data) {
 					if (req->nr_segments < max_segments)
 						req->nr_segments++;
-					else continue;
+					else break;
 				}
 				req->bhtail->b_reqnext = bh;
 				req->bhtail = bh;
 				req->nr_sectors += count;
+
+				/* latency stuff */
+				if ((latency -= req->nr_segments) < req->elevator_latency)
+					req->elevator_latency = latency;
+				elevator_starve_rest_of_queue(req);
+
 				/* Can we now merge this req with the next? */
 				attempt_merge(req, max_sectors, max_segments);
 				/* or to the beginning? */
 			} else if (req->sector - count == sector) {
+				if (!prev && starving)
+					break;
 				if (bh->b_data + bh->b_size
 				    != req->bh->b_data) {
 					if (req->nr_segments < max_segments)
 						req->nr_segments++;
-					else continue;
+					else break;
 				}
 				bh->b_reqnext = req->bh;
 				req->bh = bh;
@@ -590,6 +786,14 @@
 				req->current_nr_sectors = count;
 				req->sector = sector;
 				req->nr_sectors += count;
+
+				/* latency stuff */
+				if (latency < --req->elevator_latency)
+					req->elevator_latency = latency;
+				elevator_starve_rest_of_queue(req);
+
+				if (prev)
+					attempt_merge(prev, max_sectors, max_segments);
 			} else
 				continue;
 
@@ -597,7 +801,8 @@
 			spin_unlock_irqrestore(&io_request_lock,flags);
 			return;
 
-		} while ((req = req->next) != NULL);
+		} while (prev = req,
+			 (latency -= req->nr_segments) >= 0 && (req = req->next) != NULL);
 	}
 	/* find an unused request.
 	 */
@@ -623,7 +828,6 @@
 	req->sem = NULL;
 	req->bh = bh;
 	req->bhtail = bh;
-	req->next = NULL;
 	add_request(major+blk_dev,req);
 	return;
 
@@ -781,6 +985,7 @@
 {
 	struct request * req;
 	struct blk_dev_struct *dev;
+	static unsigned int queue_ID;
 
 	for (dev = blk_dev + MAX_BLKDEV; dev-- != blk_dev;) {
 		dev->request_fn = NULL;
@@ -792,12 +997,13 @@
 		dev->plug_tq.sync = 0;
 		dev->plug_tq.routine = &unplug_device;
 		dev->plug_tq.data = dev;
+		dev->elevator = ELEVATOR_DEFAULTS;
+		dev->elevator.queue_ID = queue_ID++;
 	}
 
 	req = all_requests + NR_REQUEST;
 	while (--req >= all_requests) {
 		req->rq_status = RQ_INACTIVE;
-		req->next = NULL;
 	}
 	memset(ro_bits,0,sizeof(ro_bits));
 	memset(max_readahead, 0, sizeof(max_readahead));
@@ -908,3 +1114,4 @@
 EXPORT_SYMBOL(io_request_lock);
 EXPORT_SYMBOL(end_that_request_first);
 EXPORT_SYMBOL(end_that_request_last);
+EXPORT_SYMBOL(blkelv_ioctl);
diff -urN 2.2.15pre19/drivers/scsi/sd_ioctl.c elevator/drivers/scsi/sd_ioctl.c
--- 2.2.15pre19/drivers/scsi/sd_ioctl.c	Mon Jan 17 16:44:40 2000
+++ elevator/drivers/scsi/sd_ioctl.c	Tue Apr 25 18:51:22 2000
@@ -113,6 +113,10 @@
 			return put_user(blksize_size[MAJOR(dev)][MINOR(dev)&0x0F],
 					(int *)arg);
 
+		case BLKELVGET:
+		case BLKELVSET:
+			return blkelv_ioctl(inode->i_rdev, cmd, arg);
+
 		RO_IOCTLS(dev, arg);
 
 		default:
diff -urN 2.2.15pre19/include/linux/blkdev.h elevator/include/linux/blkdev.h
--- 2.2.15pre19/include/linux/blkdev.h	Tue Apr  4 02:49:26 2000
+++ elevator/include/linux/blkdev.h	Tue Apr 25 18:51:22 2000
@@ -32,11 +32,39 @@
 	struct buffer_head * bh;
 	struct buffer_head * bhtail;
 	struct request * next;
+	int elevator_latency;
 };
 
 typedef void (request_fn_proc) (void);
 typedef struct request ** (queue_proc) (kdev_t dev);
 
+typedef struct elevator_s
+{
+	int read_latency;
+	int write_latency;
+	int max_bomb_segments;
+	unsigned int queue_ID;
+} elevator_t;
+
+#define ELEVATOR_DEFAULTS			\
+((elevator_t) {					\
+	128,		/* read_latency */	\
+	8192,		/* write_latency */	\
+	4,		/* max_bomb_segments */	\
+   })
+
+extern int blkelv_ioctl(kdev_t, unsigned long, unsigned long);
+
+typedef struct blkelv_ioctl_arg_s {
+	int queue_ID;
+	int read_latency;
+	int write_latency;
+	int max_bomb_segments;
+} blkelv_ioctl_arg_t;
+
+#define BLKELVGET   _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
+#define BLKELVSET   _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
+
 struct blk_dev_struct {
 	request_fn_proc		*request_fn;
 	/*
@@ -47,6 +75,8 @@
 	struct request *current_request;
 	struct request   plug;
 	struct tq_struct plug_tq;
+
+	elevator_t elevator;
 };
 
 struct sec_size {
diff -urN 2.2.15pre19/include/linux/fs.h elevator/include/linux/fs.h
--- 2.2.15pre19/include/linux/fs.h	Sun Apr  2 21:07:50 2000
+++ elevator/include/linux/fs.h	Tue Apr 25 18:51:22 2000
@@ -154,6 +154,10 @@
 #define BLKSECTSET _IO(0x12,102)/* set max sectors per request (ll_rw_blk.c) */
 #define BLKSECTGET _IO(0x12,103)/* get max sectors per request (ll_rw_blk.c) */
 #define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
+#if 0
+#define BLKELVGET  _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))/* elevator get */
+#define BLKELVSET  _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))/* elevator set */
+#endif
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP     _IO(0x00,1)	/* bmap access */
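
Usage note (not part of the patch): the new BLKELVGET/BLKELVSET ioctls expose the
per-queue latency bounds to userspace through a blkelv_ioctl_arg_t. Below is a
minimal sketch of one possible caller; the struct layout and ioctl encodings are
copied from the blkdev.h hunk above, while the program name, the default device
path and the halved write_latency are arbitrary illustrations.

/* elvtune-sketch.c -- query and retune the elevator of a block device. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

/* Mirrors the blkelv_ioctl_arg_t the patch adds to <linux/blkdev.h>. */
typedef struct blkelv_ioctl_arg_s {
	int queue_ID;
	int read_latency;
	int write_latency;
	int max_bomb_segments;
} blkelv_ioctl_arg_t;

/* Same encoding as the patch (note it passes sizeof() to _IOR/_IOW, so
 * we must do the same here for the ioctl numbers to match). */
#define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
#define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))

int main(int argc, char **argv)
{
	blkelv_ioctl_arg_t elv;
	const char *dev = argc > 1 ? argv[1] : "/dev/hda"; /* arbitrary default */
	int fd = open(dev, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BLKELVGET, &elv) < 0) {
		perror("BLKELVGET");
		return 1;
	}
	printf("queue %d: read_latency %d write_latency %d max_bomb_segments %d\n",
	       elv.queue_ID, elv.read_latency, elv.write_latency,
	       elv.max_bomb_segments);

	/* Example tune: halve the write starvation bound (blkelvset_ioctl
	 * rejects negative latencies and non-positive bomb sizes with -EINVAL). */
	elv.write_latency /= 2;
	if (ioctl(fd, BLKELVSET, &elv) < 0) {
		perror("BLKELVSET");
		return 1;
	}
	close(fd);
	return 0;
}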