## Automatically generated incremental diff ## From: linux-2.4.21-bk1 ## To: linux-2.4.21-bk2 ## Robot: $Id: make-incremental-diff,v 1.11 2002/02/20 02:59:33 hpa Exp $ diff -urN linux-2.4.21-bk1/Documentation/Configure.help linux-2.4.21-bk2/Documentation/Configure.help --- linux-2.4.21-bk1/Documentation/Configure.help 2003-07-06 08:34:20.000000000 -0700 +++ linux-2.4.21-bk2/Documentation/Configure.help 2003-07-06 08:34:28.000000000 -0700 @@ -15930,6 +15930,30 @@ If unsure, say N. +Allow direct I/O on files in NFS +CONFIG_NFS_DIRECTIO + There are important applications whose performance or correctness + depends on uncached access to file data. Database clusters (multiple + copies of the same instance running on separate hosts) implement their + own cache coherency protocol that subsumes the NFS cache protocols. + Applications that process datasets considerably larger than the client's + memory do not always benefit from a local cache. A streaming video + server, for instance, has no need to cache the contents of a file. + + This option enables applications to perform direct I/O on files in NFS + file systems using the O_DIRECT open() flag. When O_DIRECT is set for + files, their data is not cached in the system's page cache. Direct + read and write operations are aligned to block boundaries. Data is + moved to and from user-level application buffers directly. + + Unless your program is designed to use O_DIRECT properly, you are much + better off allowing the NFS client to manage caching for you. Misusing + O_DIRECT can cause poor server performance or network storms. This + kernel build option defaults OFF to avoid exposing system administrators + unwittingly to a potentially hazardous feature. + + If unsure, say N. + Root file system on NFS CONFIG_ROOT_NFS If you want your Linux box to mount its whole root file system (the diff -urN linux-2.4.21-bk1/Makefile linux-2.4.21-bk2/Makefile --- linux-2.4.21-bk1/Makefile 2003-07-06 08:34:20.000000000 -0700 +++ linux-2.4.21-bk2/Makefile 2003-07-06 08:34:28.000000000 -0700 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 21 -EXTRAVERSION = -bk1 +EXTRAVERSION = -bk2 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -urN linux-2.4.21-bk1/drivers/block/ll_rw_blk.c linux-2.4.21-bk2/drivers/block/ll_rw_blk.c --- linux-2.4.21-bk1/drivers/block/ll_rw_blk.c 2003-06-13 07:51:32.000000000 -0700 +++ linux-2.4.21-bk2/drivers/block/ll_rw_blk.c 2003-07-06 08:34:33.000000000 -0700 @@ -176,11 +176,12 @@ { int count = q->nr_requests; - count -= __blk_cleanup_queue(&q->rq[READ]); - count -= __blk_cleanup_queue(&q->rq[WRITE]); + count -= __blk_cleanup_queue(&q->rq); if (count) printk("blk_cleanup_queue: leaked requests (%d)\n", count); + if (atomic_read(&q->nr_sectors)) + printk("blk_cleanup_queue: leaked sectors (%d)\n", atomic_read(&q->nr_sectors)); memset(q, 0, sizeof(*q)); } @@ -215,6 +216,24 @@ } /** + * blk_queue_throttle_sectors - indicates you will call sector throttling funcs + * @q: The queue which this applies to. + * @active: A flag indication if you want sector throttling on + * + * Description: + * The sector throttling code allows us to put a limit on the number of + * sectors pending io to the disk at a given time, sending @active nonzero + * indicates you will call blk_started_sectors and blk_finished_sectors in + * addition to calling blk_started_io and blk_finished_io in order to + * keep track of the number of sectors in flight. 
+ **/ + +void blk_queue_throttle_sectors(request_queue_t * q, int active) +{ + q->can_throttle = active; +} + +/** * blk_queue_make_request - define an alternate make_request function for a device * @q: the request queue for the device to be affected * @mfn: the alternate make_request function @@ -389,7 +408,7 @@ * * Returns the (new) number of requests which the queue has available. */ -int blk_grow_request_list(request_queue_t *q, int nr_requests) +int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors) { unsigned long flags; /* Several broken drivers assume that this function doesn't sleep, @@ -399,21 +418,34 @@ spin_lock_irqsave(&io_request_lock, flags); while (q->nr_requests < nr_requests) { struct request *rq; - int rw; rq = kmem_cache_alloc(request_cachep, SLAB_ATOMIC); if (rq == NULL) break; memset(rq, 0, sizeof(*rq)); rq->rq_status = RQ_INACTIVE; - rw = q->nr_requests & 1; - list_add(&rq->queue, &q->rq[rw].free); - q->rq[rw].count++; + list_add(&rq->queue, &q->rq.free); + q->rq.count++; + q->nr_requests++; } + + /* + * Wakeup waiters after both one quarter of the + * max-in-fligh queue and one quarter of the requests + * are available again. + */ + q->batch_requests = q->nr_requests / 4; if (q->batch_requests > 32) q->batch_requests = 32; + q->batch_sectors = max_queue_sectors / 4; + + q->max_queue_sectors = max_queue_sectors; + + BUG_ON(!q->batch_sectors); + atomic_set(&q->nr_sectors, 0); + spin_unlock_irqrestore(&io_request_lock, flags); return q->nr_requests; } @@ -422,23 +454,27 @@ { struct sysinfo si; int megs; /* Total memory, in megabytes */ - int nr_requests; - - INIT_LIST_HEAD(&q->rq[READ].free); - INIT_LIST_HEAD(&q->rq[WRITE].free); - q->rq[READ].count = 0; - q->rq[WRITE].count = 0; + int nr_requests, max_queue_sectors = MAX_QUEUE_SECTORS; + + INIT_LIST_HEAD(&q->rq.free); + q->rq.count = 0; q->nr_requests = 0; si_meminfo(&si); megs = si.totalram >> (20 - PAGE_SHIFT); - nr_requests = 128; - if (megs < 32) - nr_requests /= 2; - blk_grow_request_list(q, nr_requests); + nr_requests = MAX_NR_REQUESTS; + if (megs < 30) { + nr_requests /= 2; + max_queue_sectors /= 2; + } + /* notice early if anybody screwed the defaults */ + BUG_ON(!nr_requests); + BUG_ON(!max_queue_sectors); + + blk_grow_request_list(q, nr_requests, max_queue_sectors); + + init_waitqueue_head(&q->wait_for_requests); - init_waitqueue_head(&q->wait_for_requests[0]); - init_waitqueue_head(&q->wait_for_requests[1]); spin_lock_init(&q->queue_lock); } @@ -491,6 +527,8 @@ q->plug_tq.routine = &generic_unplug_device; q->plug_tq.data = q; q->plugged = 0; + q->can_throttle = 0; + /* * These booleans describe the queue properties. We set the * default (and most common) values here. Other drivers can @@ -511,9 +549,10 @@ static struct request *get_request(request_queue_t *q, int rw) { struct request *rq = NULL; - struct request_list *rl = q->rq + rw; + struct request_list *rl; - if (!list_empty(&rl->free)) { + rl = &q->rq; + if (!list_empty(&rl->free) && !blk_oversized_queue(q)) { rq = blkdev_free_rq(&rl->free); list_del(&rq->queue); rl->count--; @@ -522,34 +561,23 @@ rq->special = NULL; rq->q = q; } - return rq; } /* - * Here's the request allocation design: + * Here's the request allocation design, low latency version: * * 1: Blocking on request exhaustion is a key part of I/O throttling. * * 2: We want to be `fair' to all requesters. We must avoid starvation, and * attempt to ensure that all requesters sleep for a similar duration. 
Hence * no stealing requests when there are other processes waiting. - * - * 3: We also wish to support `batching' of requests. So when a process is - * woken, we want to allow it to allocate a decent number of requests - * before it blocks again, so they can be nicely merged (this only really - * matters if the process happens to be adding requests near the head of - * the queue). - * - * 4: We want to avoid scheduling storms. This isn't really important, because - * the system will be I/O bound anyway. But it's easy. - * - * There is tension between requirements 2 and 3. Once a task has woken, - * we don't want to allow it to sleep as soon as it takes its second request. - * But we don't want currently-running tasks to steal all the requests - * from the sleepers. We handle this with wakeup hysteresis around - * 0 .. batch_requests and with the assumption that request taking is much, - * much faster than request freeing. + * + * There used to be more here, attempting to allow a process to send in a + * number of requests once it has woken up. But, there's no way to + * tell if a process has just been woken up, or if it is a new process + * coming in to steal requests from the waiters. So, we give up and force + * everyone to wait fairly. * * So here's what we do: * @@ -561,28 +589,23 @@ * * When a process wants a new request: * - * b) If free_requests == 0, the requester sleeps in FIFO manner. - * - * b) If 0 < free_requests < batch_requests and there are waiters, - * we still take a request non-blockingly. This provides batching. - * - * c) If free_requests >= batch_requests, the caller is immediately - * granted a new request. + * b) If free_requests == 0, the requester sleeps in FIFO manner, and + * the queue full condition is set. The full condition is not + * cleared until there are no longer any waiters. Once the full + * condition is set, all new io must wait, hopefully for a very + * short period of time. * * When a request is released: * - * d) If free_requests < batch_requests, do nothing. - * - * f) If free_requests >= batch_requests, wake up a single waiter. + * c) If free_requests < batch_requests, do nothing. * - * The net effect is that when a process is woken at the batch_requests level, - * it will be able to take approximately (batch_requests) requests before - * blocking again (at the tail of the queue). - * - * This all assumes that the rate of taking requests is much, much higher - * than the rate of releasing them. Which is very true. + * d) If free_requests >= batch_requests, wake up a single waiter. * - * -akpm, Feb 2002. + * As each waiter gets a request, he wakes another waiter. We do this + * to prevent a race where an unplug might get run before a request makes + * it's way onto the queue. The result is a cascade of wakeups, so delaying + * the initial wakeup until we've got batch_requests available helps avoid + * wakeups where there aren't any requests available yet. 
*/ static struct request *__get_request_wait(request_queue_t *q, int rw) @@ -590,21 +613,37 @@ register struct request *rq; DECLARE_WAITQUEUE(wait, current); - add_wait_queue(&q->wait_for_requests[rw], &wait); + add_wait_queue_exclusive(&q->wait_for_requests, &wait); + do { set_current_state(TASK_UNINTERRUPTIBLE); - generic_unplug_device(q); - if (q->rq[rw].count == 0) - schedule(); spin_lock_irq(&io_request_lock); + if (blk_oversized_queue(q)) { + __generic_unplug_device(q); + spin_unlock_irq(&io_request_lock); + schedule(); + spin_lock_irq(&io_request_lock); + } rq = get_request(q, rw); spin_unlock_irq(&io_request_lock); } while (rq == NULL); - remove_wait_queue(&q->wait_for_requests[rw], &wait); + remove_wait_queue(&q->wait_for_requests, &wait); current->state = TASK_RUNNING; + return rq; } +static void get_request_wait_wakeup(request_queue_t *q, int rw) +{ + /* + * avoid losing an unplug if a second __get_request_wait did the + * generic_unplug_device while our __get_request_wait was running + * w/o the queue_lock held and w/ our request out of the queue. + */ + if (waitqueue_active(&q->wait_for_requests)) + wake_up(&q->wait_for_requests); +} + /* RO fail safe mechanism */ static long ro_bits[MAX_BLKDEV][8]; @@ -818,7 +857,6 @@ void blkdev_release_request(struct request *req) { request_queue_t *q = req->q; - int rw = req->cmd; req->rq_status = RQ_INACTIVE; req->q = NULL; @@ -828,9 +866,17 @@ * assume it has free buffers and check waiters */ if (q) { - list_add(&req->queue, &q->rq[rw].free); - if (++q->rq[rw].count >= q->batch_requests) - wake_up(&q->wait_for_requests[rw]); + int oversized_batch = 0; + + if (q->can_throttle) + oversized_batch = blk_oversized_queue_batch(q); + q->rq.count++; + list_add(&req->queue, &q->rq.free); + if (q->rq.count >= q->batch_requests && !oversized_batch) { + smp_mb(); + if (waitqueue_active(&q->wait_for_requests)) + wake_up(&q->wait_for_requests); + } } } @@ -908,6 +954,7 @@ struct list_head *head, *insert_here; int latency; elevator_t *elevator = &q->elevator; + int should_wake = 0; count = bh->b_size >> 9; sector = bh->b_rsector; @@ -948,7 +995,6 @@ */ max_sectors = get_max_sectors(bh->b_rdev); -again: req = NULL; head = &q->queue_head; /* @@ -957,7 +1003,9 @@ */ spin_lock_irq(&io_request_lock); +again: insert_here = head->prev; + if (list_empty(head)) { q->plug_device_fn(q, bh->b_rdev); /* is atomic */ goto get_rq; @@ -976,6 +1024,7 @@ req->bhtail = bh; req->nr_sectors = req->hard_nr_sectors += count; blk_started_io(count); + blk_started_sectors(req, count); drive_stat_acct(req->rq_dev, req->cmd, count, 0); req_new_io(req, 1, count); attempt_back_merge(q, req, max_sectors, max_segments); @@ -998,6 +1047,7 @@ req->sector = req->hard_sector = sector; req->nr_sectors = req->hard_nr_sectors += count; blk_started_io(count); + blk_started_sectors(req, count); drive_stat_acct(req->rq_dev, req->cmd, count, 0); req_new_io(req, 1, count); attempt_front_merge(q, head, req, max_sectors, max_segments); @@ -1030,7 +1080,7 @@ * See description above __get_request_wait() */ if (rw_ahead) { - if (q->rq[rw].count < q->batch_requests) { + if (q->rq.count < q->batch_requests || blk_oversized_queue_batch(q)) { spin_unlock_irq(&io_request_lock); goto end_io; } @@ -1042,6 +1092,9 @@ if (req == NULL) { spin_unlock_irq(&io_request_lock); freereq = __get_request_wait(q, rw); + head = &q->queue_head; + spin_lock_irq(&io_request_lock); + should_wake = 1; goto again; } } @@ -1064,10 +1117,13 @@ req->start_time = jiffies; req_new_io(req, 0, count); blk_started_io(count); + 
blk_started_sectors(req, count); add_request(q, req, insert_here); out: if (freereq) blkdev_release_request(freereq); + if (should_wake) + get_request_wait_wakeup(q, rw); spin_unlock_irq(&io_request_lock); return 0; end_io: @@ -1196,8 +1252,15 @@ bh->b_rdev = bh->b_dev; bh->b_rsector = bh->b_blocknr * count; + get_bh(bh); generic_make_request(rw, bh); + /* fix race condition with wait_on_buffer() */ + smp_mb(); /* spin_unlock may have inclusive semantics */ + if (waitqueue_active(&bh->b_wait)) + wake_up(&bh->b_wait); + + put_bh(bh); switch (rw) { case WRITE: kstat.pgpgout += count; @@ -1350,6 +1413,7 @@ if ((bh = req->bh) != NULL) { nsect = bh->b_size >> 9; blk_finished_io(nsect); + blk_finished_sectors(req, nsect); req->bh = bh->b_reqnext; bh->b_reqnext = NULL; bh->b_end_io(bh, uptodate); @@ -1509,6 +1573,7 @@ EXPORT_SYMBOL(blk_get_queue); EXPORT_SYMBOL(blk_cleanup_queue); EXPORT_SYMBOL(blk_queue_headactive); +EXPORT_SYMBOL(blk_queue_throttle_sectors); EXPORT_SYMBOL(blk_queue_make_request); EXPORT_SYMBOL(generic_make_request); EXPORT_SYMBOL(blkdev_release_request); diff -urN linux-2.4.21-bk1/drivers/ide/Makefile linux-2.4.21-bk2/drivers/ide/Makefile --- linux-2.4.21-bk1/drivers/ide/Makefile 2003-06-13 07:51:33.000000000 -0700 +++ linux-2.4.21-bk2/drivers/ide/Makefile 2003-07-06 08:34:34.000000000 -0700 @@ -8,7 +8,6 @@ # In the future, some of these should be built conditionally. # -O_TARGET := idedriver.o export-objs := ide-iops.o ide-taskfile.o ide-proc.o ide.o ide-probe.o ide-dma.o ide-lib.o setup-pci.o ide-io.o ide-disk.o @@ -29,24 +28,25 @@ # Core IDE code - must come before legacy -obj-$(CONFIG_BLK_DEV_IDE) += ide-probe.o ide-geometry.o ide-iops.o ide-taskfile.o ide.o ide-lib.o ide-io.o ide-default.o -obj-$(CONFIG_BLK_DEV_IDEDISK) += ide-disk.o -obj-$(CONFIG_BLK_DEV_IDECD) += ide-cd.o -obj-$(CONFIG_BLK_DEV_IDETAPE) += ide-tape.o -obj-$(CONFIG_BLK_DEV_IDEFLOPPY) += ide-floppy.o +ide-core-objs := ide-iops.o ide-taskfile.o ide.o ide-lib.o ide-io.o ide-default.o ide-proc.o +ide-detect-objs := ide-probe.o ide-geometry.o + ifeq ($(CONFIG_BLK_DEV_IDEPCI),y) -obj-$(CONFIG_BLK_DEV_IDE) += setup-pci.o +ide-core-objs += setup-pci.o endif ifeq ($(CONFIG_BLK_DEV_IDEDMA_PCI),y) -obj-$(CONFIG_BLK_DEV_IDE) += ide-dma.o +ide-core-objs += ide-dma.o endif -obj-$(CONFIG_BLK_DEV_ISAPNP) += ide-pnp.o +# Initialisation order: +# Core sets up +# Legacy drivers may register a callback +# Drivers are pre initialised +# Probe inits the drivers and driver callbacks +# Raid scans the devices -ifeq ($(CONFIG_BLK_DEV_IDE),y) -obj-$(CONFIG_PROC_FS) += ide-proc.o -endif +obj-$(CONFIG_BLK_DEV_IDE) += ide-core.o ifeq ($(CONFIG_BLK_DEV_IDE),y) obj-y += legacy/idedriver-legacy.o @@ -58,10 +58,28 @@ endif endif +obj-$(CONFIG_BLK_DEV_ISAPNP) += ide-pnp.o + +obj-$(CONFIG_BLK_DEV_IDEDISK) += ide-disk.o +obj-$(CONFIG_BLK_DEV_IDECD) += ide-cd.o +obj-$(CONFIG_BLK_DEV_IDETAPE) += ide-tape.o +obj-$(CONFIG_BLK_DEV_IDEFLOPPY) += ide-floppy.o + +obj-$(CONFIG_BLK_DEV_IDE) += ide-detect.o ifeq ($(CONFIG_BLK_DEV_IDE),y) # RAID must be last of all obj-y += raid/idedriver-raid.o endif +list-multi := ide-core.o ide-detect.o +O_TARGET := idedriver.o + include $(TOPDIR)/Rules.make + +ide-core.o: $(ide-core-objs) + $(LD) -r -o $@ $(ide-core-objs) + +ide-detect.o: $(ide-detect-objs) + $(LD) -r -o $@ $(ide-detect-objs) + diff -urN linux-2.4.21-bk1/drivers/ide/ide-probe.c linux-2.4.21-bk2/drivers/ide/ide-probe.c --- linux-2.4.21-bk1/drivers/ide/ide-probe.c 2003-06-13 07:51:33.000000000 -0700 +++ linux-2.4.21-bk2/drivers/ide/ide-probe.c 
2003-07-06 08:34:34.000000000 -0700 @@ -971,6 +971,7 @@ q->queuedata = HWGROUP(drive); blk_init_queue(q, do_ide_request); + blk_queue_throttle_sectors(q, 1); } #undef __IRQ_HELL_SPIN diff -urN linux-2.4.21-bk1/drivers/ide/pci/cmd640.c linux-2.4.21-bk2/drivers/ide/pci/cmd640.c --- linux-2.4.21-bk1/drivers/ide/pci/cmd640.c 2003-06-13 07:51:33.000000000 -0700 +++ linux-2.4.21-bk2/drivers/ide/pci/cmd640.c 2003-07-06 08:34:34.000000000 -0700 @@ -102,6 +102,7 @@ #define CMD640_PREFETCH_MASKS 1 #include +#include #include #include #include @@ -120,7 +121,8 @@ /* * This flag is set in ide.c by the parameter: ide0=cmd640_vlb */ -int cmd640_vlb = 0; + +static int cmd640_vlb = 0; /* * CMD640 specific registers definition. @@ -716,7 +718,7 @@ /* * Probe for a cmd640 chipset, and initialize it if found. Called from ide.c */ -int __init ide_probe_for_cmd640x (void) +static void __init ide_probe_for_cmd640x (void) { #ifdef CONFIG_BLK_DEV_CMD640_ENHANCED int second_port_toggled = 0; @@ -731,13 +733,13 @@ } else { cmd640_vlb = 0; /* Find out what kind of PCI probing is supported otherwise - Justin Gibbs will sulk.. */ + we break some Adaptec cards... */ if (pci_conf1() && probe_for_cmd640_pci1()) bus_type = "PCI (type1)"; else if (pci_conf2() && probe_for_cmd640_pci2()) bus_type = "PCI (type2)"; else - return 0; + return; } /* * Undocumented magic (there is no 0x5b reg in specs) @@ -745,7 +747,7 @@ put_cmd640_reg(0x5b, 0xbd); if (get_cmd640_reg(0x5b) != 0xbd) { printk(KERN_ERR "ide: cmd640 init failed: wrong value in reg 0x5b\n"); - return 0; + return; } put_cmd640_reg(0x5b, 0); @@ -760,7 +762,7 @@ cmd640_chip_version = cfr & CFR_DEVREV; if (cmd640_chip_version == 0) { printk ("ide: bad cmd640 revision: %d\n", cmd640_chip_version); - return 0; + return; } /* @@ -874,6 +876,28 @@ #ifdef CMD640_DUMP_REGS CMD640_DUMP_REGS; #endif - return 1; + return; +} + +static int __init cmd640_init(void) +{ + ide_register_driver(ide_probe_for_cmd640x); + return 0; +} + +/* + * Called by the IDE core when compiled in and cmd640=vlb is + * selected. 
+ */ +void init_cmd640_vlb(void) +{ + cmd640_vlb = 1; } +module_init(cmd640_init); + +MODULE_AUTHOR("See Source"); +MODULE_DESCRIPTION("IDE support for CMD640 controller"); +MODULE_PARM(cmd640_vlb, "i"); +MODULE_PARM_DESC(cmd640_vlb, "Set to enable scanning for VLB controllers"); +MODULE_LICENSE("GPL"); diff -urN linux-2.4.21-bk1/drivers/net/8139too.c linux-2.4.21-bk2/drivers/net/8139too.c --- linux-2.4.21-bk1/drivers/net/8139too.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/8139too.c 2003-07-06 08:34:35.000000000 -0700 @@ -2074,7 +2074,7 @@ RTL_W16 (IntrStatus, ackstat); DPRINTK ("%s: interrupt status=%#4.4x ackstat=%#4.4x new intstat=%#4.4x.\n", - dev->name, ackstat, status, RTL_R16 (IntrStatus)); + dev->name, status, ackstat, RTL_R16 (IntrStatus)); if (netif_running (dev) && (status & RxAckBits)) rtl8139_rx_interrupt (dev, tp, ioaddr); diff -urN linux-2.4.21-bk1/drivers/net/acenic.c linux-2.4.21-bk2/drivers/net/acenic.c --- linux-2.4.21-bk1/drivers/net/acenic.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/acenic.c 2003-07-06 08:34:35.000000000 -0700 @@ -3019,9 +3019,6 @@ return 0; case ETHTOOL_SSET: - if(!capable(CAP_NET_ADMIN)) - return -EPERM; - link = readl(®s->GigLnkState); if (link & LNK_1000MB) speed = SPEED_1000; diff -urN linux-2.4.21-bk1/drivers/net/bonding/bond_3ad.c linux-2.4.21-bk2/drivers/net/bonding/bond_3ad.c --- linux-2.4.21-bk1/drivers/net/bonding/bond_3ad.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/bonding/bond_3ad.c 2003-07-06 08:34:35.000000000 -0700 @@ -37,6 +37,16 @@ * 2003/05/01 - Shmulik Hen * - Renamed bond_3ad_link_status_changed() to * bond_3ad_handle_link_change() for compatibility with TLB. + * + * 2003/05/20 - Amir Noam + * - Fix long fail over time when releasing last slave of an active + * aggregator - send LACPDU on unbind of slave to tell partner this + * port is no longer aggregatable. + * + * 2003/06/25 - Tsippy Mendelson + * - Send LACPDU as highest priority packet to further fix the above + * problem on very high Tx traffic load where packets may get dropped + * by the slave. 
*/ #include @@ -45,6 +55,7 @@ #include #include #include +#include #include "bonding.h" #include "bond_3ad.h" @@ -905,6 +916,7 @@ skb->mac.raw = skb->data; skb->nh.raw = skb->data + ETH_HLEN; skb->protocol = PKT_TYPE_LACPDU; + skb->priority = TC_PRIO_CONTROL; lacpdu_header = (struct lacpdu_header *)skb_put(skb, length); diff -urN linux-2.4.21-bk1/drivers/net/bonding/bond_3ad.h linux-2.4.21-bk2/drivers/net/bonding/bond_3ad.h --- linux-2.4.21-bk1/drivers/net/bonding/bond_3ad.h 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/bonding/bond_3ad.h 2003-07-06 08:34:35.000000000 -0700 @@ -165,7 +165,7 @@ // = 0x02 (marker response information) u8 marker_length; // = 0x16 u16 requester_port; // The number assigned to the port by the requester - struct mac_addr requester_system; // The requester’s system id + struct mac_addr requester_system; // The requester's system id u32 requester_transaction_id; // The transaction id allocated by the requester, u16 pad; // = 0 u8 tlv_type_terminator; // = 0x00 diff -urN linux-2.4.21-bk1/drivers/net/bonding/bond_alb.c linux-2.4.21-bk2/drivers/net/bonding/bond_alb.c --- linux-2.4.21-bk1/drivers/net/bonding/bond_alb.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/bonding/bond_alb.c 2003-07-06 08:34:35.000000000 -0700 @@ -17,6 +17,13 @@ * * The full GNU General Public License is included in this distribution in the * file called LICENSE. + * + * + * Changes: + * + * 2003/06/25 - Shmulik Hen + * - Fixed signed/unsigned calculation errors that caused load sharing + * to collapse to one slave under very heavy UDP Tx stress. */ #include @@ -246,7 +253,7 @@ { struct slave *slave; struct slave *least_loaded; - u32 curr_gap, max_gap; + s64 curr_gap, max_gap; /* Find the first enabled slave */ slave = bond_get_first_slave(bond); @@ -262,15 +269,15 @@ } least_loaded = slave; - max_gap = (slave->speed * 1000000) - - (SLAVE_TLB_INFO(slave).load * 8); + max_gap = (s64)(slave->speed * 1000000) - + (s64)(SLAVE_TLB_INFO(slave).load * 8); /* Find the slave with the largest gap */ slave = bond_get_next_slave(bond, slave); while (slave) { if (SLAVE_IS_OK(slave)) { - curr_gap = (slave->speed * 1000000) - - (SLAVE_TLB_INFO(slave).load * 8); + curr_gap = (s64)(slave->speed * 1000000) - + (s64)(SLAVE_TLB_INFO(slave).load * 8); if (max_gap < curr_gap) { least_loaded = slave; max_gap = curr_gap; diff -urN linux-2.4.21-bk1/drivers/net/bonding/bond_main.c linux-2.4.21-bk2/drivers/net/bonding/bond_main.c --- linux-2.4.21-bk1/drivers/net/bonding/bond_main.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/bonding/bond_main.c 2003-07-06 08:34:35.000000000 -0700 @@ -385,6 +385,9 @@ * - In conjunction with fix for ifenslave -c, in * bond_change_active(), changing to the already active slave * is no longer an error (it successfully does nothing). + * + * 2003/06/30 - Amir Noam + * - Fixed bond_change_active() for ALB/TLB modes. 
*/ #include @@ -429,8 +432,8 @@ #include "bond_3ad.h" #include "bond_alb.h" -#define DRV_VERSION "2.2.11" -#define DRV_RELDATE "May 29, 2003" +#define DRV_VERSION "2.2.14" +#define DRV_RELDATE "June 30, 2003" #define DRV_NAME "bonding" #define DRV_DESCRIPTION "Ethernet Channel Bonding Driver" @@ -1761,8 +1764,11 @@ (oldactive != NULL)&& (newactive->link == BOND_LINK_UP)&& IS_UP(newactive->dev)) { - bond_set_slave_inactive_flags(oldactive); - bond_set_slave_active_flags(newactive); + if (bond_mode == BOND_MODE_ACTIVEBACKUP) { + bond_set_slave_inactive_flags(oldactive); + bond_set_slave_active_flags(newactive); + } + bond_mc_update(bond, newactive, oldactive); bond_assign_current_slave(bond, newactive); printk("%s : activate %s(old : %s)\n", diff -urN linux-2.4.21-bk1/drivers/net/e100/e100_main.c linux-2.4.21-bk2/drivers/net/e100/e100_main.c --- linux-2.4.21-bk1/drivers/net/e100/e100_main.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/e100/e100_main.c 2003-07-06 08:34:35.000000000 -0700 @@ -3422,10 +3422,6 @@ int ethtool_new_speed_duplex; struct ethtool_cmd ecmd; - if (!capable(CAP_NET_ADMIN)) { - return -EPERM; - } - bdp = dev->priv; if (copy_from_user(&ecmd, ifr->ifr_data, sizeof (ecmd))) { return -EFAULT; @@ -3543,8 +3539,6 @@ void *addr = ifr->ifr_data; u16 mdi_reg; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; bdp = dev->priv; if(copy_from_user(®s, addr, sizeof(regs))) @@ -3572,9 +3566,6 @@ { struct e100_private *bdp; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - bdp = dev->priv; if ((bdp->speed_duplex_caps & SUPPORTED_Autoneg) && @@ -3630,9 +3621,6 @@ void *ptr; u8 *eeprom_data_bytes = (u8 *)eeprom_data; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - bdp = dev->priv; if (copy_from_user(&ecmd, ifr->ifr_data, sizeof (ecmd))) @@ -3910,9 +3898,6 @@ struct ethtool_wolinfo wolinfo; int res = 0; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - bdp = dev->priv; if (copy_from_user(&wolinfo, ifr->ifr_data, sizeof (wolinfo))) { diff -urN linux-2.4.21-bk1/drivers/net/e1000/e1000_ethtool.c linux-2.4.21-bk2/drivers/net/e1000/e1000_ethtool.c --- linux-2.4.21-bk1/drivers/net/e1000/e1000_ethtool.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/e1000/e1000_ethtool.c 2003-07-06 08:34:35.000000000 -0700 @@ -1289,8 +1289,6 @@ } case ETHTOOL_SSET: { struct ethtool_cmd ecmd; - if(!capable(CAP_NET_ADMIN)) - return -EPERM; if(copy_from_user(&ecmd, addr, sizeof(ecmd))) return -EFAULT; return e1000_ethtool_sset(adapter, &ecmd); @@ -1363,8 +1361,6 @@ return 0; } case ETHTOOL_NWAY_RST: { - if(!capable(CAP_NET_ADMIN)) - return -EPERM; if(netif_running(netdev)) { e1000_down(adapter); e1000_up(adapter); @@ -1393,8 +1389,6 @@ } case ETHTOOL_SWOL: { struct ethtool_wolinfo wol; - if(!capable(CAP_NET_ADMIN)) - return -EPERM; if(copy_from_user(&wol, addr, sizeof(wol)) != 0) return -EFAULT; return e1000_ethtool_swol(adapter, &wol); @@ -1436,9 +1430,6 @@ case ETHTOOL_SEEPROM: { struct ethtool_eeprom eeprom; - if(!capable(CAP_NET_ADMIN)) - return -EPERM; - if(copy_from_user(&eeprom, addr, sizeof(eeprom))) return -EFAULT; @@ -1470,9 +1461,6 @@ } test = { {ETHTOOL_TEST} }; int err; - if(!capable(CAP_NET_ADMIN)) - return -EPERM; - if(copy_from_user(&test.eth_test, addr, sizeof(test.eth_test))) return -EFAULT; diff -urN linux-2.4.21-bk1/drivers/net/ioc3-eth.c linux-2.4.21-bk2/drivers/net/ioc3-eth.c --- linux-2.4.21-bk1/drivers/net/ioc3-eth.c 2002-08-02 17:39:44.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/ioc3-eth.c 2003-07-06 08:34:35.000000000 -0700 @@ -1841,9 +1841,6 
@@ return -EFAULT; return 0; } else if (ecmd.cmd == ETHTOOL_SSET) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* Verify the settings we care about. */ if (ecmd.autoneg != AUTONEG_ENABLE && ecmd.autoneg != AUTONEG_DISABLE) diff -urN linux-2.4.21-bk1/drivers/net/sungem.c linux-2.4.21-bk2/drivers/net/sungem.c --- linux-2.4.21-bk1/drivers/net/sungem.c 2002-08-02 17:39:44.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/sungem.c 2003-07-06 08:34:35.000000000 -0700 @@ -2616,9 +2616,6 @@ return 0; case ETHTOOL_SSET: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* Verify the settings we care about. */ if (ecmd.autoneg != AUTONEG_ENABLE && ecmd.autoneg != AUTONEG_DISABLE) diff -urN linux-2.4.21-bk1/drivers/net/sunhme.c linux-2.4.21-bk2/drivers/net/sunhme.c --- linux-2.4.21-bk1/drivers/net/sunhme.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/sunhme.c 2003-07-06 08:34:36.000000000 -0700 @@ -2480,9 +2480,6 @@ return -EFAULT; return 0; } else if (ecmd.cmd == ETHTOOL_SSET) { - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - /* Verify the settings we care about. */ if (ecmd.autoneg != AUTONEG_ENABLE && ecmd.autoneg != AUTONEG_DISABLE) diff -urN linux-2.4.21-bk1/drivers/net/typhoon.c linux-2.4.21-bk2/drivers/net/typhoon.c --- linux-2.4.21-bk1/drivers/net/typhoon.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/typhoon.c 2003-07-06 08:34:36.000000000 -0700 @@ -40,7 +40,7 @@ /* Set the copy breakpoint for the copy-only-tiny-frames scheme. * Setting to > 1518 effectively disables this feature. */ -static int rx_copybreak = 0; +static int rx_copybreak = 200; /* end user-configurable values */ @@ -85,8 +85,8 @@ #define PKT_BUF_SZ 1536 #define DRV_MODULE_NAME "typhoon" -#define DRV_MODULE_VERSION "1.0" -#define DRV_MODULE_RELDATE "03/02/14" +#define DRV_MODULE_VERSION "1.4.1" +#define DRV_MODULE_RELDATE "03/06/26" #define PFX DRV_MODULE_NAME ": " #define ERR_PFX KERN_ERR PFX @@ -150,7 +150,7 @@ #define TYPHOON_CRYPTO_DES 1 #define TYPHOON_CRYPTO_3DES 2 #define TYPHOON_CRYPTO_VARIABLE 4 -#define TYPHOON_FIBER 5 +#define TYPHOON_FIBER 8 enum typhoon_cards { TYPHOON_TX = 0, TYPHOON_TX95, TYPHOON_TX97, TYPHOON_SVR, @@ -1798,7 +1798,7 @@ u32 intr_status; intr_status = readl(ioaddr + TYPHOON_REG_INTR_STATUS); - if(!intr_status) + if(!(intr_status & TYPHOON_INTR_HOST_INT)) return; writel(intr_status, ioaddr + TYPHOON_REG_INTR_STATUS); diff -urN linux-2.4.21-bk1/drivers/net/via-rhine.c linux-2.4.21-bk2/drivers/net/via-rhine.c --- linux-2.4.21-bk1/drivers/net/via-rhine.c 2003-07-06 08:34:23.000000000 -0700 +++ linux-2.4.21-bk2/drivers/net/via-rhine.c 2003-07-06 08:34:36.000000000 -0700 @@ -2,6 +2,8 @@ /* Written 1998-2001 by Donald Becker. + Current Maintainer: Roger Luethi + This software may be used and distributed according to the terms of the GNU General Public License (GPL), incorporated herein by reference. Drivers based on or derived from this code fall under the GPL and must @@ -9,8 +11,9 @@ a complete program and may only be used when the entire operating system is licensed under the GPL. - This driver is designed for the VIA VT86C100A Rhine-I. - It also works with the 6102 Rhine-II, and 6105/6105M Rhine-III. + This driver is designed for the VIA VT86C100A Rhine-I. + It also works with the Rhine-II (6102) and Rhine-III (6105/6105L/6105LOM + and management NIC 6105M). 
The author may be reached as becker@scyld.com, or C/O Scyld Computing Corporation @@ -115,11 +118,15 @@ - Force flushing for PCI posted writes - More reset code changes + LK1.1.18 (Roger Luethi) + - No filtering multicast in promisc mode (Edward Peng) + - Fix for Rhine-I Tx timeouts + */ #define DRV_NAME "via-rhine" -#define DRV_VERSION "1.1.17" -#define DRV_RELDATE "March-1-2003" +#define DRV_VERSION "1.1.18" +#define DRV_RELDATE "July-4-2003" /* A few user-configurable values. @@ -139,7 +146,7 @@ Both 'options[]' and 'full_duplex[]' should exist for driver interoperability. The media type is usually passed in 'options[]'. - The default is autonegotation for speed and duplex. + The default is autonegotiation for speed and duplex. This should rarely be overridden. Use option values 0x10/0x20 for 10Mbps, 0x100,0x200 for 100Mbps. Use option values 0x10 and 0x100 for forcing half duplex fixed speed. @@ -386,17 +393,17 @@ { "VIA VT6102 Rhine-II", RHINE_IOTYPE, 256, CanHaveMII | HasWOL }, { "VIA VT6105 Rhine-III", RHINE_IOTYPE, 256, - CanHaveMII | HasWOL }, + CanHaveMII | HasWOL }, { "VIA VT6105M Rhine-III", RHINE_IOTYPE, 256, - CanHaveMII | HasWOL }, + CanHaveMII | HasWOL }, }; static struct pci_device_id via_rhine_pci_tbl[] __devinitdata = { {0x1106, 0x3043, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT86C100A}, {0x1106, 0x3065, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT6102}, - {0x1106, 0x3106, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT6105}, - {0x1106, 0x3053, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT6105M}, + {0x1106, 0x3106, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT6105}, /* 6105{,L,LOM} */ + {0x1106, 0x3053, PCI_ANY_ID, PCI_ANY_ID, 0, 0, VT6105M}, {0,} /* terminate list */ }; MODULE_DEVICE_TABLE(pci, via_rhine_pci_tbl); @@ -441,7 +448,7 @@ IntrRxWakeUp=0x8000, IntrNormalSummary=0x0003, IntrAbnormalSummary=0xC260, IntrTxDescRace=0x080000, /* mapped from IntrStatus2 */ - IntrTxErrSummary=0x082210, + IntrTxErrSummary=0x082218, }; /* The Rx and Tx buffer descriptors. */ @@ -1264,7 +1271,7 @@ if (skb->len < ETH_ZLEN) { skb = skb_padto(skb, ETH_ZLEN); - if(skb == NULL) + if (skb == NULL) return 0; } @@ -1650,11 +1657,18 @@ printk(KERN_INFO "%s: Tx descriptor write-back race.\n", dev->name); } - if (intr_status & ( IntrTxAborted | IntrTxUnderrun | IntrTxDescRace )) + if ((intr_status & IntrTxError) && ~( IntrTxAborted | IntrTxUnderrun | + IntrTxDescRace )) { + if (debug > 2) + printk(KERN_INFO "%s: Unspecified error.\n", + dev->name); + } + if (intr_status & ( IntrTxAborted | IntrTxUnderrun | IntrTxDescRace | + IntrTxError )) via_rhine_restart_tx(dev); if (intr_status & ~( IntrLinkChange | IntrStatsMax | IntrTxUnderrun | - IntrTxError | IntrTxAborted | IntrNormalSummary | + IntrTxError | IntrTxAborted | IntrNormalSummary | IntrTxDescRace )) { if (debug > 1) printk(KERN_ERR "%s: Something Wicked happened! 
%8.8x.\n", diff -urN linux-2.4.21-bk1/drivers/scsi/scsi.c linux-2.4.21-bk2/drivers/scsi/scsi.c --- linux-2.4.21-bk1/drivers/scsi/scsi.c 2002-11-28 15:53:14.000000000 -0800 +++ linux-2.4.21-bk2/drivers/scsi/scsi.c 2003-07-06 08:34:37.000000000 -0700 @@ -197,6 +197,7 @@ blk_init_queue(q, scsi_request_fn); blk_queue_headactive(q, 0); + blk_queue_throttle_sectors(q, 1); q->queuedata = (void *) SDpnt; } diff -urN linux-2.4.21-bk1/drivers/scsi/scsi_lib.c linux-2.4.21-bk2/drivers/scsi/scsi_lib.c --- linux-2.4.21-bk1/drivers/scsi/scsi_lib.c 2003-06-13 07:51:36.000000000 -0700 +++ linux-2.4.21-bk2/drivers/scsi/scsi_lib.c 2003-07-06 08:34:37.000000000 -0700 @@ -378,6 +378,7 @@ if ((bh = req->bh) != NULL) { nsect = bh->b_size >> 9; blk_finished_io(nsect); + blk_finished_sectors(req, nsect); req->bh = bh->b_reqnext; bh->b_reqnext = NULL; sectors -= nsect; diff -urN linux-2.4.21-bk1/drivers/sound/i810_audio.c linux-2.4.21-bk2/drivers/sound/i810_audio.c --- linux-2.4.21-bk1/drivers/sound/i810_audio.c 2003-07-06 08:34:24.000000000 -0700 +++ linux-2.4.21-bk2/drivers/sound/i810_audio.c 2003-07-06 08:34:38.000000000 -0700 @@ -118,6 +118,9 @@ #ifndef PCI_DEVICE_ID_INTEL_ICH4 #define PCI_DEVICE_ID_INTEL_ICH4 0x24c5 #endif +#ifndef PCI_DEVICE_ID_INTEL_ICH5 +#define PCI_DEVICE_ID_INTEL_ICH5 0x24d5 +#endif #ifndef PCI_DEVICE_ID_INTEL_440MX #define PCI_DEVICE_ID_INTEL_440MX 0x7195 #endif @@ -273,6 +276,7 @@ INTELICH2, INTELICH3, INTELICH4, + INTELICH5, SI7012, NVIDIA_NFORCE, AMD768, @@ -286,6 +290,7 @@ "Intel ICH2", "Intel ICH3", "Intel ICH4", + "Intel ICH5", "SiS 7012", "NVIDIA nForce Audio", "AMD 768", @@ -304,7 +309,8 @@ { 1, 0x0000 }, /* INTEL440MX */ { 1, 0x0000 }, /* INTELICH2 */ { 2, 0x0000 }, /* INTELICH3 */ - { 3, 0x0003 }, /* INTELICH4 */ + { 3, 0x0003 }, /* INTELICH4 */ + { 3, 0x0003 }, /* INTELICH5 */ /*@FIXME to be verified*/ { 2, 0x0000 }, /* SI7012 */ /*@FIXME to be verified*/ { 2, 0x0000 }, /* NVIDIA_NFORCE */ /*@FIXME to be verified*/ { 2, 0x0000 }, /* AMD768 */ @@ -324,6 +330,8 @@ PCI_ANY_ID, PCI_ANY_ID, 0, 0, INTELICH3}, {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, INTELICH4}, + {PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH5, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, INTELICH5}, {PCI_VENDOR_ID_SI, PCI_DEVICE_ID_SI_7012, PCI_ANY_ID, PCI_ANY_ID, 0, 0, SI7012}, {PCI_VENDOR_ID_NVIDIA, PCI_DEVICE_ID_NVIDIA_MCP1_AUDIO, diff -urN linux-2.4.21-bk1/fs/Config.in linux-2.4.21-bk2/fs/Config.in --- linux-2.4.21-bk1/fs/Config.in 2003-07-06 08:34:24.000000000 -0700 +++ linux-2.4.21-bk2/fs/Config.in 2003-07-06 08:34:39.000000000 -0700 @@ -104,6 +104,7 @@ dep_tristate 'InterMezzo file system support (replicating fs) (EXPERIMENTAL)' CONFIG_INTERMEZZO_FS $CONFIG_INET $CONFIG_EXPERIMENTAL dep_tristate 'NFS file system support' CONFIG_NFS_FS $CONFIG_INET dep_mbool ' Provide NFSv3 client support' CONFIG_NFS_V3 $CONFIG_NFS_FS + dep_mbool ' Allow direct I/O on NFS files (EXPERIMENTAL)' CONFIG_NFS_DIRECTIO $CONFIG_NFS_FS $CONFIG_EXPERIMENTAL dep_bool ' Root file system on NFS' CONFIG_ROOT_NFS $CONFIG_NFS_FS $CONFIG_IP_PNP dep_tristate 'NFS server support' CONFIG_NFSD $CONFIG_INET diff -urN linux-2.4.21-bk1/fs/block_dev.c linux-2.4.21-bk2/fs/block_dev.c --- linux-2.4.21-bk1/fs/block_dev.c 2003-06-13 07:51:37.000000000 -0700 +++ linux-2.4.21-bk2/fs/block_dev.c 2003-07-06 08:34:39.000000000 -0700 @@ -131,8 +131,9 @@ return 0; } -static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int blkdev_direct_IO(int rw, struct file * 
filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { + struct inode * inode = filp->f_dentry->d_inode->i_mapping->host; return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, blkdev_get_block); } diff -urN linux-2.4.21-bk1/fs/buffer.c linux-2.4.21-bk2/fs/buffer.c --- linux-2.4.21-bk1/fs/buffer.c 2003-07-06 08:34:24.000000000 -0700 +++ linux-2.4.21-bk2/fs/buffer.c 2003-07-06 08:34:39.000000000 -0700 @@ -153,10 +153,23 @@ get_bh(bh); add_wait_queue(&bh->b_wait, &wait); do { - run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!buffer_locked(bh)) break; + /* + * We must read tq_disk in TQ_ACTIVE after the + * add_wait_queue effect is visible to other cpus. + * We could unplug some line above it wouldn't matter + * but we can't do that right after add_wait_queue + * without an smp_mb() in between because spin_unlock + * has inclusive semantics. + * Doing it here is the most efficient place so we + * don't do a suprious unplug if we get a racy + * wakeup that make buffer_locked to return 0, and + * doing it here avoids an explicit smp_mb() we + * rely on the implicit one in set_task_state. + */ + run_task_queue(&tq_disk); schedule(); } while (buffer_locked(bh)); tsk->state = TASK_RUNNING; @@ -1516,6 +1529,9 @@ /* Done - end_buffer_io_async will unlock */ SetPageUptodate(page); + + wakeup_page_waiters(page); + return 0; out: @@ -1547,6 +1563,7 @@ } while (bh != head); if (need_unlock) UnlockPage(page); + wakeup_page_waiters(page); return err; } @@ -1774,6 +1791,8 @@ else submit_bh(READ, bh); } + + wakeup_page_waiters(page); return 0; } @@ -2400,6 +2419,7 @@ submit_bh(rw, bh); bh = next; } while (bh != head); + wakeup_page_waiters(page); return 0; } diff -urN linux-2.4.21-bk1/fs/ext2/inode.c linux-2.4.21-bk2/fs/ext2/inode.c --- linux-2.4.21-bk1/fs/ext2/inode.c 2003-06-13 07:51:37.000000000 -0700 +++ linux-2.4.21-bk2/fs/ext2/inode.c 2003-07-06 08:34:39.000000000 -0700 @@ -592,8 +592,9 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } -static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static int ext2_direct_IO(int rw, struct file * filp, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) { + struct inode * inode = filp->f_dentry->d_inode->i_mapping->host; return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); } struct address_space_operations ext2_aops = { diff -urN linux-2.4.21-bk1/fs/nfs/Makefile linux-2.4.21-bk2/fs/nfs/Makefile --- linux-2.4.21-bk1/fs/nfs/Makefile 2001-11-09 14:28:15.000000000 -0800 +++ linux-2.4.21-bk2/fs/nfs/Makefile 2003-07-06 08:34:39.000000000 -0700 @@ -14,6 +14,7 @@ obj-$(CONFIG_ROOT_NFS) += nfsroot.o mount_clnt.o obj-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o +obj-$(CONFIG_NFS_DIRECTIO) += direct.o obj-m := $(O_TARGET) diff -urN linux-2.4.21-bk1/fs/nfs/file.c linux-2.4.21-bk2/fs/nfs/file.c --- linux-2.4.21-bk1/fs/nfs/file.c 2003-06-13 07:51:37.000000000 -0700 +++ linux-2.4.21-bk2/fs/nfs/file.c 2003-07-06 08:34:39.000000000 -0700 @@ -16,6 +16,7 @@ * nfs regular file handling functions */ +#include #include #include #include @@ -200,6 +201,9 @@ sync_page: nfs_sync_page, writepage: nfs_writepage, prepare_write: nfs_prepare_write, +#ifdef CONFIG_NFS_DIRECTIO + direct_IO: nfs_direct_IO, +#endif commit_write: nfs_commit_write }; diff -urN linux-2.4.21-bk1/fs/nfs/write.c linux-2.4.21-bk2/fs/nfs/write.c --- linux-2.4.21-bk1/fs/nfs/write.c 2003-07-06 08:34:24.000000000 -0700 +++ linux-2.4.21-bk2/fs/nfs/write.c 2003-07-06 
08:34:39.000000000 -0700 @@ -123,23 +123,6 @@ } /* - * This function will be used to simulate weak cache consistency - * under NFSv2 when the NFSv3 attribute patch is included. - * For the moment, we just call nfs_refresh_inode(). - */ -static __inline__ int -nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) -{ - if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { - fattr->pre_size = NFS_CACHE_ISIZE(inode); - fattr->pre_mtime = NFS_CACHE_MTIME(inode); - fattr->pre_ctime = NFS_CACHE_CTIME(inode); - fattr->valid |= NFS_ATTR_WCC; - } - return nfs_refresh_inode(inode, fattr); -} - -/* * Write a page synchronously. * Offset is the data offset within the page. */ diff -urN linux-2.4.21-bk1/fs/reiserfs/inode.c linux-2.4.21-bk2/fs/reiserfs/inode.c --- linux-2.4.21-bk1/fs/reiserfs/inode.c 2003-07-06 08:34:24.000000000 -0700 +++ linux-2.4.21-bk2/fs/reiserfs/inode.c 2003-07-06 08:34:39.000000000 -0700 @@ -2080,6 +2080,7 @@ */ if (nr) { submit_bh_for_writepage(arr, nr) ; + wakeup_page_waiters(page); } else { UnlockPage(page) ; } diff -urN linux-2.4.21-bk1/include/linux/blkdev.h linux-2.4.21-bk2/include/linux/blkdev.h --- linux-2.4.21-bk1/include/linux/blkdev.h 2003-06-13 07:51:38.000000000 -0700 +++ linux-2.4.21-bk2/include/linux/blkdev.h 2003-07-06 08:34:40.000000000 -0700 @@ -64,12 +64,6 @@ typedef void (plug_device_fn) (request_queue_t *q, kdev_t device); typedef void (unplug_device_fn) (void *q); -/* - * Default nr free requests per queue, ll_rw_blk will scale it down - * according to available RAM at init time - */ -#define QUEUE_NR_REQUESTS 8192 - struct request_list { unsigned int count; struct list_head free; @@ -80,7 +74,7 @@ /* * the queue request freelist, one for reads and one for writes */ - struct request_list rq[2]; + struct request_list rq; /* * The total number of requests on each queue @@ -93,6 +87,21 @@ int batch_requests; /* + * The total number of 512byte blocks on each queue + */ + atomic_t nr_sectors; + + /* + * Batching threshold for sleep/wakeup decisions + */ + int batch_sectors; + + /* + * The max number of 512byte blocks on each queue + */ + int max_queue_sectors; + + /* * Together with queue_head for cacheline sharing */ struct list_head queue_head; @@ -118,13 +127,21 @@ /* * Boolean that indicates whether this queue is plugged or not. */ - char plugged; + int plugged:1; /* * Boolean that indicates whether current_request is active or * not. */ - char head_active; + int head_active:1; + + /* + * Boolean that indicates you will use blk_started_sectors + * and blk_finished_sectors in addition to blk_started_io + * and blk_finished_io. 
It enables the throttling code to + * help keep the sectors in flight to a reasonable value + */ + int can_throttle:1; unsigned long bounce_pfn; @@ -137,7 +154,7 @@ /* * Tasks wait here for free read and write requests */ - wait_queue_head_t wait_for_requests[2]; + wait_queue_head_t wait_for_requests; }; #define blk_queue_plugged(q) (q)->plugged @@ -221,10 +238,11 @@ /* * Access functions for manipulating queue properties */ -extern int blk_grow_request_list(request_queue_t *q, int nr_requests); +extern int blk_grow_request_list(request_queue_t *q, int nr_requests, int max_queue_sectors); extern void blk_init_queue(request_queue_t *, request_fn_proc *); extern void blk_cleanup_queue(request_queue_t *); extern void blk_queue_headactive(request_queue_t *, int); +extern void blk_queue_throttle_sectors(request_queue_t *, int); extern void blk_queue_make_request(request_queue_t *, make_request_fn *); extern void generic_unplug_device(void *); extern inline int blk_seg_merge_ok(struct buffer_head *, struct buffer_head *); @@ -243,6 +261,8 @@ #define MAX_SEGMENTS 128 #define MAX_SECTORS 255 +#define MAX_QUEUE_SECTORS (4 << (20 - 9)) /* 4 mbytes when full sized */ +#define MAX_NR_REQUESTS 1024 /* 1024k when in 512 units, normally min is 1M in 1k units */ #define PageAlignSize(size) (((size) + PAGE_SIZE -1) & PAGE_MASK) @@ -268,9 +288,51 @@ return retval; } +static inline int blk_oversized_queue(request_queue_t * q) +{ + if (q->can_throttle) + return atomic_read(&q->nr_sectors) > q->max_queue_sectors; + return q->rq.count == 0; +} + +static inline int blk_oversized_queue_batch(request_queue_t * q) +{ + return atomic_read(&q->nr_sectors) > q->max_queue_sectors - q->batch_sectors; +} + #define blk_finished_io(nsects) do { } while (0) #define blk_started_io(nsects) do { } while (0) +static inline void blk_started_sectors(struct request *rq, int count) +{ + request_queue_t *q = rq->q; + if (q && q->can_throttle) { + atomic_add(count, &q->nr_sectors); + if (atomic_read(&q->nr_sectors) < 0) { + printk("nr_sectors is %d\n", atomic_read(&q->nr_sectors)); + BUG(); + } + } +} + +static inline void blk_finished_sectors(struct request *rq, int count) +{ + request_queue_t *q = rq->q; + if (q && q->can_throttle) { + atomic_sub(count, &q->nr_sectors); + + smp_mb(); + if (q->rq.count >= q->batch_requests && !blk_oversized_queue_batch(q)) { + if (waitqueue_active(&q->wait_for_requests)) + wake_up(&q->wait_for_requests); + } + if (atomic_read(&q->nr_sectors) < 0) { + printk("nr_sectors is %d\n", atomic_read(&q->nr_sectors)); + BUG(); + } + } +} + static inline unsigned int blksize_bits(unsigned int size) { unsigned int bits = 8; diff -urN linux-2.4.21-bk1/include/linux/elevator.h linux-2.4.21-bk2/include/linux/elevator.h --- linux-2.4.21-bk1/include/linux/elevator.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.21-bk2/include/linux/elevator.h 2003-07-06 08:34:40.000000000 -0700 @@ -80,7 +80,7 @@ return latency; } -#define ELV_LINUS_SEEK_COST 16 +#define ELV_LINUS_SEEK_COST 1 #define ELEVATOR_NOOP \ ((elevator_t) { \ @@ -93,8 +93,8 @@ #define ELEVATOR_LINUS \ ((elevator_t) { \ - 2048, /* read passovers */ \ - 8192, /* write passovers */ \ + 128, /* read passovers */ \ + 512, /* write passovers */ \ \ elevator_linus_merge, /* elevator_merge_fn */ \ elevator_linus_merge_req, /* elevator_merge_req_fn */ \ diff -urN linux-2.4.21-bk1/include/linux/fs.h linux-2.4.21-bk2/include/linux/fs.h --- linux-2.4.21-bk1/include/linux/fs.h 2003-07-06 08:34:25.000000000 -0700 +++ linux-2.4.21-bk2/include/linux/fs.h 2003-07-06 
08:34:40.000000000 -0700 @@ -395,7 +395,7 @@ int (*flushpage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); #define KERNEL_HAS_O_DIRECT /* this is for modules out of the kernel */ - int (*direct_IO)(int, struct inode *, struct kiobuf *, unsigned long, int); + int (*direct_IO)(int, struct file *, struct kiobuf *, unsigned long, int); void (*removepage)(struct page *); /* called when page gets removed from the inode */ }; diff -urN linux-2.4.21-bk1/include/linux/nfs_fs.h linux-2.4.21-bk2/include/linux/nfs_fs.h --- linux-2.4.21-bk1/include/linux/nfs_fs.h 2003-06-13 07:51:38.000000000 -0700 +++ linux-2.4.21-bk2/include/linux/nfs_fs.h 2003-07-06 08:34:40.000000000 -0700 @@ -274,6 +274,11 @@ #define NFS_TestClearPageSync(page) test_and_clear_bit(PG_fs_1, &(page)->flags) /* + * linux/fs/nfs/direct.c + */ +extern int nfs_direct_IO(int, struct file *, struct kiobuf *, unsigned long, int); + +/* * linux/fs/mount_clnt.c * (Used only by nfsroot module) */ @@ -302,6 +307,23 @@ return __nfs_refresh_inode(inode,fattr); } +/* + * This function will be used to simulate weak cache consistency + * under NFSv2 when the NFSv3 attribute patch is included. + * For the moment, we just call nfs_refresh_inode(). + */ +static __inline__ int +nfs_write_attributes(struct inode *inode, struct nfs_fattr *fattr) +{ + if ((fattr->valid & NFS_ATTR_FATTR) && !(fattr->valid & NFS_ATTR_WCC)) { + fattr->pre_size = NFS_CACHE_ISIZE(inode); + fattr->pre_mtime = NFS_CACHE_MTIME(inode); + fattr->pre_ctime = NFS_CACHE_CTIME(inode); + fattr->valid |= NFS_ATTR_WCC; + } + return nfs_refresh_inode(inode, fattr); +} + static inline loff_t nfs_size_to_loff_t(__u64 size) { diff -urN linux-2.4.21-bk1/include/linux/nfs_xdr.h linux-2.4.21-bk2/include/linux/nfs_xdr.h --- linux-2.4.21-bk1/include/linux/nfs_xdr.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.21-bk2/include/linux/nfs_xdr.h 2003-07-06 08:34:40.000000000 -0700 @@ -59,7 +59,7 @@ /* Arguments to the read call. * Note that NFS_READ_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h */ -#define NFS_READ_MAXIOV 8 +#define NFS_READ_MAXIOV (9) struct nfs_readargs { struct nfs_fh * fh; @@ -78,7 +78,7 @@ /* Arguments to the write call. * Note that NFS_WRITE_MAXIOV must be <= (MAX_IOVEC-2) from sunrpc/xprt.h */ -#define NFS_WRITE_MAXIOV 8 +#define NFS_WRITE_MAXIOV (9) struct nfs_writeargs { struct nfs_fh * fh; __u64 offset; diff -urN linux-2.4.21-bk1/include/linux/pagemap.h linux-2.4.21-bk2/include/linux/pagemap.h --- linux-2.4.21-bk1/include/linux/pagemap.h 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.21-bk2/include/linux/pagemap.h 2003-07-06 08:34:40.000000000 -0700 @@ -97,6 +97,8 @@ ___wait_on_page(page); } +extern void FASTCALL(wakeup_page_waiters(struct page * page)); + /* * Returns locked page at given index in given cache, creating it if needed. */ diff -urN linux-2.4.21-bk1/include/linux/pci.h linux-2.4.21-bk2/include/linux/pci.h --- linux-2.4.21-bk1/include/linux/pci.h 2003-06-13 07:51:39.000000000 -0700 +++ linux-2.4.21-bk2/include/linux/pci.h 2003-07-06 08:34:40.000000000 -0700 @@ -773,6 +773,11 @@ pdev->driver_data = data; } +static inline char *pci_name(struct pci_dev *pdev) +{ + return pdev->slot_name; +} + /* * The world is not perfect and supplies us with broken PCI devices. 
* For at least a part of these bugs we need a work-around, so both diff -urN linux-2.4.21-bk1/ipc/sem.c linux-2.4.21-bk2/ipc/sem.c --- linux-2.4.21-bk1/ipc/sem.c 2002-11-28 15:53:15.000000000 -0800 +++ linux-2.4.21-bk2/ipc/sem.c 2003-07-06 08:34:40.000000000 -0700 @@ -62,6 +62,7 @@ #include #include #include +#include #include #include "util.h" @@ -251,39 +252,38 @@ for (sop = sops; sop < sops + nsops; sop++) { curr = sma->sem_base + sop->sem_num; sem_op = sop->sem_op; - - if (!sem_op && curr->semval) + result = curr->semval; + + if (!sem_op && result) goto would_block; - curr->sempid = (curr->sempid << 16) | pid; - curr->semval += sem_op; - if (sop->sem_flg & SEM_UNDO) - { + result += sem_op; + if (result < 0) + goto would_block; + if (result > SEMVMX) + goto out_of_range; + if (sop->sem_flg & SEM_UNDO) { int undo = un->semadj[sop->sem_num] - sem_op; /* * Exceeding the undo range is an error. */ if (undo < (-SEMAEM - 1) || undo > SEMAEM) - { - /* Don't undo the undo */ - sop->sem_flg &= ~SEM_UNDO; goto out_of_range; - } - un->semadj[sop->sem_num] = undo; } - if (curr->semval < 0) - goto would_block; - if (curr->semval > SEMVMX) - goto out_of_range; + curr->semval = result; } - if (do_undo) - { - sop--; + if (do_undo) { result = 0; goto undo; } - + sop--; + while (sop >= sops) { + sma->sem_base[sop->sem_num].sempid = pid; + if (sop->sem_flg & SEM_UNDO) + un->semadj[sop->sem_num] -= sop->sem_op; + sop--; + } sma->sem_otime = CURRENT_TIME; return 0; @@ -298,13 +298,9 @@ result = 1; undo: + sop--; while (sop >= sops) { - curr = sma->sem_base + sop->sem_num; - curr->semval -= sop->sem_op; - curr->sempid >>= 16; - - if (sop->sem_flg & SEM_UNDO) - un->semadj[sop->sem_num] += sop->sem_op; + sma->sem_base[sop->sem_num].semval -= sop->sem_op; sop--; } @@ -624,7 +620,7 @@ err = curr->semval; goto out_unlock; case GETPID: - err = curr->sempid & 0xffff; + err = curr->sempid; goto out_unlock; case GETNCNT: err = count_semncnt(sma,semnum); @@ -839,6 +835,12 @@ asmlinkage long sys_semop (int semid, struct sembuf *tsops, unsigned nsops) { + return sys_semtimedop(semid, tsops, nsops, NULL); +} + +asmlinkage long sys_semtimedop (int semid, struct sembuf *tsops, + unsigned nsops, const struct timespec *timeout) +{ int error = -EINVAL; struct sem_array *sma; struct sembuf fast_sops[SEMOPM_FAST]; @@ -846,6 +848,7 @@ struct sem_undo *un; int undos = 0, decrease = 0, alter = 0; struct sem_queue queue; + unsigned long jiffies_left = 0; if (nsops < 1 || semid < 0) return -EINVAL; @@ -860,6 +863,19 @@ error=-EFAULT; goto out_free; } + if (timeout) { + struct timespec _timeout; + if (copy_from_user(&_timeout, timeout, sizeof(*timeout))) { + error = -EFAULT; + goto out_free; + } + if (_timeout.tv_sec < 0 || _timeout.tv_nsec < 0 || + _timeout.tv_nsec >= 1000000000L) { + error = -EINVAL; + goto out_free; + } + jiffies_left = timespec_to_jiffies(&_timeout); + } sma = sem_lock(semid); error=-EINVAL; if(sma==NULL) @@ -932,7 +948,10 @@ current->state = TASK_INTERRUPTIBLE; sem_unlock(semid); - schedule(); + if (timeout) + jiffies_left = schedule_timeout(jiffies_left); + else + schedule(); tmp = sem_lock(semid); if(tmp==NULL) { @@ -957,6 +976,8 @@ break; } else { error = queue.status; + if (error == -EINTR && timeout && jiffies_left == 0) + error = -EAGAIN; if (queue.prev) /* got Interrupt */ break; /* Everything done by update_queue */ diff -urN linux-2.4.21-bk1/kernel/ksyms.c linux-2.4.21-bk2/kernel/ksyms.c --- linux-2.4.21-bk1/kernel/ksyms.c 2003-07-06 08:34:26.000000000 -0700 +++ linux-2.4.21-bk2/kernel/ksyms.c 
2003-07-06 08:34:40.000000000 -0700 @@ -296,6 +296,7 @@ EXPORT_SYMBOL(filemap_fdatawait); EXPORT_SYMBOL(lock_page); EXPORT_SYMBOL(unlock_page); +EXPORT_SYMBOL(wakeup_page_waiters); /* device registration */ EXPORT_SYMBOL(register_chrdev); diff -urN linux-2.4.21-bk1/mm/filemap.c linux-2.4.21-bk2/mm/filemap.c --- linux-2.4.21-bk1/mm/filemap.c 2003-07-06 08:34:26.000000000 -0700 +++ linux-2.4.21-bk2/mm/filemap.c 2003-07-06 08:34:40.000000000 -0700 @@ -810,6 +810,20 @@ return &wait[hash]; } +/* + * This must be called after every submit_bh with end_io + * callbacks that would result into the blkdev layer waking + * up the page after a queue unplug. + */ +void wakeup_page_waiters(struct page * page) +{ + wait_queue_head_t * head; + + head = page_waitqueue(page); + if (waitqueue_active(head)) + wake_up(head); +} + /* * Wait for a page to get unlocked. * @@ -1607,7 +1621,7 @@ if (retval) break; - retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize); + retval = mapping->a_ops->direct_IO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize); if (rw == READ && retval > 0) mark_dirty_kiobuf(iobuf, retval);
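
A few illustrative sketches for the changes carried by the diff above follow. First, the new CONFIG_NFS_DIRECTIO help text describes the O_DIRECT access pattern without showing it. Below is a minimal userspace sketch of that pattern; the helper name and the 4096-byte alignment are illustrative assumptions (the real requirement is the filesystem block size).

#define _GNU_SOURCE             /* for O_DIRECT */
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

/* Hypothetical helper: read len bytes, bypassing the page cache.
 * len is assumed to already be block aligned. */
int read_uncached(const char *path, void **bufp, size_t len)
{
        int fd;
        ssize_t ret;

        fd = open(path, O_RDONLY | O_DIRECT);
        if (fd < 0)
                return -1;

        /* O_DIRECT requires block-aligned buffers, offsets and lengths */
        if (posix_memalign(bufp, 4096, len)) {
                close(fd);
                return -1;
        }
        ret = read(fd, *bufp, len);     /* DMA straight into *bufp */
        close(fd);
        return ret < 0 ? -1 : (int)ret;
}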
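
For concreteness, here is what the defaults in the new blk_grow_request_list() and blk_init_queue() code above work out to on a machine with at least 30MB of RAM:

    nr_requests       = MAX_NR_REQUESTS    = 1024
    max_queue_sectors = MAX_QUEUE_SECTORS  = 4 << (20 - 9) = 8192
                        (8192 * 512 bytes  = 4MB in flight)
    batch_requests    = min(1024 / 4, 32)  = 32
    batch_sectors     = 8192 / 4           = 2048   (1MB)

blk_oversized_queue() reports the queue full once nr_sectors exceeds 8192; blk_oversized_queue_batch() then keeps waiters asleep until nr_sectors has dropped to 8192 - 2048 = 6144 or below. That quarter-queue gap is the hysteresis that avoids wakeup storms. Machines with less than 30MB halve both nr_requests and max_queue_sectors.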
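
The ide-probe.c and scsi.c hunks above show the entire driver-side contract for the new throttling: one call at queue setup. A hypothetical driver opts in the same way (mydev_request is assumed); the generic layer then does the per-request accounting through blk_started_sectors()/blk_finished_sectors().

#include <linux/blkdev.h>

static void mydev_init_queue(request_queue_t *q, request_fn_proc *fn)
{
        blk_init_queue(q, fn);
        /*
         * Promise that sector accounting is kept for this queue.
         * Without this, blk_oversized_queue() falls back to plain
         * request-count exhaustion (q->rq.count == 0).
         */
        blk_queue_throttle_sectors(q, 1);
}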
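
Several hunks above (submit_bh(), blkdev_release_request(), blk_finished_sectors(), wait_on_buffer()) repeat the same lost-wakeup idiom. Stripped to a skeleton with stand-in names (condition, wq), the ordering they rely on is:

#include <linux/sched.h>
#include <linux/wait.h>
#include <asm/system.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);
static volatile int condition;

static void waiter(void)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue(&wq, &wait);             /* publish ourselves first */
        set_current_state(TASK_UNINTERRUPTIBLE);/* implies a barrier...    */
        if (!condition)                         /* ...then sample the state */
                schedule();
        remove_wait_queue(&wq, &wait);
        current->state = TASK_RUNNING;
}

static void waker(void)
{
        condition = 1;          /* update the state first */
        smp_mb();               /* spin_unlock has only inclusive semantics,
                                   so it is not enough on its own */
        if (waitqueue_active(&wq))
                wake_up(&wq);
}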
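
The bond_alb.c change above is easy to misread as cosmetic; a small worked example of why the u32 arithmetic collapsed load sharing (units are inferred from the driver: speed in Mbit/s, load roughly bytes/s):

#include <linux/types.h>

/* gap computed as in the least-loaded-slave selection above */
static s64 tx_gap(u32 speed, u32 load)
{
        /* Old u32 code: whenever a slave was oversubscribed
         * (load * 8 > speed * 1000000), e.g. speed = 100 and
         * load = 13000000, the subtraction wrapped to ~4.29e9,
         * so the busiest slave looked least loaded and kept
         * winning the selection. */
        return (s64)(speed * 1000000) - (s64)(load * 8);
}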
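
Finally, the new sys_semtimedop() in ipc/sem.c above gives SysV semaphore operations a bounded wait, returning -EAGAIN when the timeout expires. A userspace sketch, assuming a libc that exposes the semtimedop() wrapper:

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/sem.h>
#include <time.h>
#include <errno.h>

/* Try to decrement semaphore 0; give up after half a second. */
int lock_with_timeout(int semid)
{
        struct sembuf op = { 0, -1, 0 };        /* sem_num, sem_op, sem_flg */
        struct timespec ts = { 0, 500 * 1000 * 1000 };

        if (semtimedop(semid, &op, 1, &ts) == 0)
                return 1;                        /* got it */
        return errno == EAGAIN ? 0 : -1;         /* timed out vs. error */
}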