diff -urN raid/drivers/block/raid1.c raid-reb/drivers/block/raid1.c --- raid/drivers/block/raid1.c Fri Dec 22 19:27:29 2000 +++ raid-reb/drivers/block/raid1.c Fri Dec 22 19:45:26 2000 @@ -12,6 +12,20 @@ * You should have received a copy of the GNU General Public License * (for example /usr/src/linux/COPYING); if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * History: + * ---------------------------------------------------------------------- + * + * Nov 5, 1999 - Added raid1_read_balance() + * Mika Kuoppala + * + * Dec 16, 1999 - raid1_kmalloc fixed + * Ilpo Ruotsalainen + * + * May 5, 2000 - Fixed bug in raid1_read_balance() + * where failed disk sometimes were part + * of selection process. + * Mika Kuoppala */ #include @@ -23,8 +37,9 @@ #define MD_DRIVER #define MD_PERSONALITY -#define MAX_LINEAR_SECTORS 128 - +#define MAX_LINEAR_SECTORS 128 +#define MAX_SINGLE_DISK_READS 1000 + #define MAX(a,b) ((a) > (b) ? (a) : (b)) #define MIN(a,b) ((a) < (b) ? (a) : (b)) @@ -40,7 +55,7 @@ * simply can not afford to fail an allocation because * there is no failure return path (eg. make_request()) */ - while (!(ptr = kmalloc (sizeof (raid1_conf_t), GFP_BUFFER))) { + while (!(ptr = kmalloc(size, GFP_BUFFER))) { printk ("raid1: out of memory, retrying...\n"); current->state = TASK_UNINTERRUPTIBLE; schedule_timeout(HZ/10); @@ -184,7 +199,105 @@ clear_bit(BH_Lock, &bh->b_state); make_request (MAJOR (bh->b_rdev), rw, bh); } + +/* + * This routine returns the disk from which the requested read should + * be done. It bookkeeps the last read position for every disk + * in array and when new read requests come, the disk which last + * position is nearest to the request, is chosen. + * TODO: now if there is 2 mirrors in same 2 devices, performance + * degrades dramatically because position is mirror, not device based. + * This should be changed to be device based. Also atomic sequential + * reads should be somehow balanced. 
+ * + * -- Mika + */ +static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh) +{ + int new_disk = conf->last_used; + const int sectors = bh->b_size >> 9; + const long this_sector = bh->b_blocknr * sectors; + unsigned long new_distance; + unsigned long current_distance; + int current_disk = new_disk; + + /* + * Never touch anything for sequential reads. This seems + * to be the fastest way on my testbed. This though would need + * some more testing on different setups. + */ + + /* + Check if it is sane at all to balance + */ + + if( conf->resync_mirrors ) + goto rb_out; + + if( conf->working_disks < 2) { + while( conf->mirrors[new_disk].operational == 0 ) + new_disk = conf->mirrors[new_disk].next; + + goto rb_out; + } + + if( this_sector == conf->mirrors[new_disk].current_position ) + goto rb_out; + + /* + * If reads have been done only on a single disk + * for a time, let's give another disk a chance. + * This is for kicking those idling disks so that + * they would find work near some hotspot. 
+ */ + + if(conf->sect_count >= MAX_SINGLE_DISK_READS) + { + conf->sect_count = 0; + + while( new_disk != conf->mirrors[new_disk].next ) { + if( (conf->mirrors[new_disk].write_only) || + (!conf->mirrors[new_disk].operational) ) + continue; + + new_disk = conf->mirrors[new_disk].next; + break; + } + + goto rb_out; + } + + current_distance = abs(this_sector - + conf->mirrors[current_disk].current_position); + + /* Find the disk which is closest */ + + while( conf->mirrors[current_disk].next != conf->last_used ) { + current_disk = conf->mirrors[current_disk].next; + + if( (conf->mirrors[current_disk].write_only) || + (!conf->mirrors[current_disk].operational) ) + continue; + + new_distance = abs(this_sector - + conf->mirrors[current_disk].current_position); + + if(new_distance < current_distance) { + conf->sect_count = 0; + current_distance = new_distance; + new_disk = current_disk; + } + } + + rb_out: + conf->mirrors[new_disk].current_position = this_sector + sectors; + + conf->last_used = new_disk; + conf->sect_count += sectors; + return new_disk; +} + static int raid1_make_request (mddev_t *mddev, int rw, struct buffer_head * bh) { @@ -192,7 +305,7 @@ struct buffer_head *mirror_bh[MD_SB_DISKS], *bh_req; struct raid1_bh * r1_bh; int disks = MD_SB_DISKS; - int i, sum_bhs = 0, switch_disks = 0, sectors, lowprio = 0; + int i, sum_bhs = 0, lowprio = 0; struct mirror_info *mirror; r1_bh = raid1_kmalloc (sizeof (struct raid1_bh)); @@ -236,35 +349,15 @@ r1_bh->cmd = rw; if (rw==READ) { - int last_used = conf->last_used; /* * read balancing logic: */ - mirror = conf->mirrors + last_used; - bh->b_rdev = mirror->dev; - sectors = bh->b_size >> 9; - if (bh->b_blocknr * sectors == conf->next_sect) { - conf->sect_count += sectors; - if (conf->sect_count >= mirror->sect_limit) - switch_disks = 1; - } else - switch_disks = 1; - conf->next_sect = (bh->b_blocknr + 1) * sectors; - /* - * Do not switch disks if full resync is in progress ... 
- */ - if (switch_disks && !conf->resync_mirrors) { - conf->sect_count = 0; - last_used = conf->last_used = mirror->next; - /* - * Do not switch to write-only disks ... - * reconstruction is in progress - */ - while (conf->mirrors[last_used].write_only) - conf->last_used = conf->mirrors[last_used].next; - } + mirror = conf->mirrors + raid1_read_balance( conf, bh ); + + bh->b_rdev = mirror->dev; + bh_req = &r1_bh->bh_req; memcpy(bh_req, bh, sizeof(*bh)); bh_req->b_end_io = raid1_end_request; @@ -974,6 +1067,7 @@ disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_LINEAR_SECTORS; + disk->current_position = 0; disk->operational = 0; disk->write_only = 0; disk->spare = 0; @@ -1005,6 +1099,7 @@ disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_LINEAR_SECTORS; + disk->current_position = 0; disk->operational = 1; disk->write_only = 0; disk->spare = 0; @@ -1019,6 +1114,7 @@ disk->raid_disk = disk_idx; disk->dev = rdev->dev; disk->sect_limit = MAX_LINEAR_SECTORS; + disk->current_position = 0; disk->operational = 0; disk->write_only = 0; disk->spare = 1; @@ -1235,3 +1331,5 @@ unregister_md_personality (RAID1); } #endif + + diff -urN raid/include/linux/raid/raid1.h raid-reb/include/linux/raid/raid1.h --- raid/include/linux/raid/raid1.h Fri Dec 22 19:27:29 2000 +++ raid-reb/include/linux/raid/raid1.h Fri Dec 22 19:41:52 2000 @@ -9,6 +9,7 @@ kdev_t dev; int next; int sect_limit; + int current_position; /* * State bits: