diff -urN 2.3.46pre1/CREDITS 2.3.46pre1aa1/CREDITS --- 2.3.46pre1/CREDITS Tue Feb 15 03:06:47 2000 +++ 2.3.46pre1aa1/CREDITS Wed Feb 16 00:28:24 2000 @@ -1519,6 +1519,13 @@ D: XF86_8514 D: cfdisk (curses based disk partitioning program) +N: Heinz Mauelshagen +E: mge@EZ-Darmstadt.Telekom.de +D: Logical Volume Manager +S: Bartningstr. 12 +S: 64289 Darmstadt +S: Germany + N: Mike McLagan E: mike.mclagan@linux.org W: http://www.invlogic.com/~mmclagan diff -urN 2.3.46pre1/Documentation/Configure.help 2.3.46pre1aa1/Documentation/Configure.help --- 2.3.46pre1/Documentation/Configure.help Fri Feb 11 00:05:31 2000 +++ 2.3.46pre1aa1/Documentation/Configure.help Wed Feb 16 00:28:24 2000 @@ -1260,6 +1260,30 @@ called on26.o. You must also have a high-level driver for the type of device that you want to support. +Logical Volume Manager (LVM) support +CONFIG_BLK_DEV_LVM + This driver lets you combine several hard disks, hard disk partitions, + multiple devices or even loop devices (for evaluation purposes) into + a volume group. Imagine a volume group as a kind of virtual disk. + Logical volumes, which can be thought of as virtual partitions, + can be created in the volume group. You can resize volume groups and + logical volumes after creation to match new capacity needs. + Logical volumes are accessed as block devices named + /dev/VolumeGroupName/LogicalVolumeName. + + For details see /usr/src/linux/Documentation/LVM-HOWTO. + + To get the newest software see <http://linux.msede.com/lvm>. + +Logical Volume Manager proc filesystem information +CONFIG_LVM_PROC_FS + If you say Y here, you can access overall Logical Volume Manager, + Volume Group, Logical Volume and Physical Volume information in /proc/lvm. + + To use this option, make sure that "proc filesystem support" + (CONFIG_PROC_FS) is enabled too. + + Multiple devices driver support CONFIG_BLK_DEV_MD This driver lets you combine several hard disk partitions into one diff -urN 2.3.46pre1/Documentation/LVM-HOWTO 2.3.46pre1aa1/Documentation/LVM-HOWTO --- 2.3.46pre1/Documentation/LVM-HOWTO Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/Documentation/LVM-HOWTO Wed Feb 16 00:28:24 2000 @@ -0,0 +1,118 @@ +Heinz Mauelshagen's LVM (Logical Volume Manager) howto. 01/28/1999 + + +Abstract: +--------- +The LVM adds virtual disk and virtual partition functionality +to the Linux operating system. + +It achieves this by adding an additional layer between the physical peripherals +and the i/o interface in the kernel. + +This allows the concatenation of several disk partitions or whole disks +(so-called physical volumes or PVs) or even multiple devices +to form a storage pool (a so-called Volume Group or VG) with +allocation units called physical extents (PEs). +You can think of the volume group as a virtual disk. +Please see the scenario below. + +Some or all PEs of this VG can then be allocated to so-called Logical Volumes +or LVs in units called logical extents or LEs. +Each LE is mapped to a corresponding PE. +LEs and PEs are equal in size. +Logical volumes are a kind of virtual partition. + + +The LVs can be used through device special files named +/dev/VolumeGroupName/LogicalVolumeName, similar to the familiar +/dev/sd[a-z]* or /dev/hd[a-z]*. + +But going beyond this, you are able to extend or reduce +VGs _AND_ LVs at runtime! + +So... +If for example the capacity of an LV gets too small and your VG containing +this LV is full, you could add another PV to that VG and simply extend +the LV afterwards.
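As a minimal sketch of that scenario (assuming the vgextend tool from the LVM userspace distribution; /dev/sdf1 and the sizes are example values only):

    pvcreate /dev/sdf1                      # prepare the new physical volume
    vgextend test_vg /dev/sdf1              # add it to the full volume group
    lvextend -L+500 /dev/test_vg/test_lv    # now the LV can grow by 500MB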
+If you reduce or delete an LV you can use the freed capacity for different +LVs in the same VG. + + +The above scenario looks like this: + + /------------------------------------------\ + | /--PV1---\ VG 1 /--PVn---\ | + | |-VGDA---| |-VGDA---| | + | |PE1PE2..| |PE1PE2..| | + | | | ...... | | | + | | | | | | + | | /-----------------------\ | | + | | \-------LV 1------------/ | | + | | ..PEn| | ..PEn| | + | \--------/ \--------/ | + \------------------------------------------/ + +PV 1 could be /dev/sdc1 sized 3GB +PV n could be /dev/sde1 sized 4GB +VG 1 could be test_vg +LV 1 could be /dev/test_vg/test_lv +VGDA is the volume group descriptor area holding the LVM metadata +PE1 up to PEn is the number of physical extents on each disk (partition) + + +For installation steps see INSTALL; use insmod(1)/modprobe(1) or kmod/kerneld(8) +to load the logical volume manager module if you did not build it +into the kernel. + + +Configuration steps for setting up the above scenario: + +1. Set the partition system id to 0xFE on /dev/sdc1 and /dev/sde1. + +2. do a "pvcreate /dev/sd[ce]1" + For testing purposes you can use more than one partition on a disk. + You should not do so in real use, because in the case of + a striped LV over one disk you'll have a performance breakdown. + +3. do a "vgcreate test_vg /dev/sd[ce]1" to create the new VG named "test_vg" + which has the total capacity of both partitions. + vgcreate also activates the new volume group (it transfers the metadata + into the LVM driver in the kernel) so that LVs can be created in the next step. + +4. do a "lvcreate -L1500 -ntest_lv test_vg" to get a 1500MB linear LV named + "test_lv" and its block device special file "/dev/test_vg/test_lv". + + Or do a "lvcreate -i2 -I4 -l1500 -nanother_test_lv test_vg" to get a 1500 LE + large logical volume with 2 stripes and a stripe size of 4 KB. + +5. For example, generate a filesystem in one LV with + "mke2fs /dev/test_vg/test_lv" and mount it. + +6. extend /dev/test_vg/test_lv to 1600MB with relative size by + "lvextend -L+100 /dev/test_vg/test_lv" + or with absolute size by + "lvextend -L1600 /dev/test_vg/test_lv" + +7. reduce /dev/test_vg/test_lv to 900 logical extents with relative extents by + "lvreduce -l-700 /dev/test_vg/test_lv" + or with absolute extents by + "lvreduce -l900 /dev/test_vg/test_lv" + +8. rename a VG by deactivating it with + "vgchange -an test_vg" # only VGs with _no_ open LVs can be deactivated! + "vgrename test_vg whatever" + and reactivate it again by + "vgchange -ay whatever" + +9. rename an LV after closing it by + "lvchange -an /dev/whatever/test_lv" # only closed LVs can be deactivated + "lvrename /dev/whatever/test_lv /dev/whatever/whatvolume" + or by + "lvrename whatever test_lv whatvolume" + and reactivate it again by + "lvchange -ay /dev/whatever/whatvolume" + +10. if you own Ted Ts'o's resize2fs program, you are able to resize the + ext2 type filesystems contained in logical volumes without destroying + the data by + "e2fsadm -L+100 /dev/test_vg/another_test_lv"
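The result of the steps above can be sanity-checked with the display tools that ship with the same LVM userspace suite (a sketch, using the names from the scenario):

    pvdisplay /dev/sdc1                # PV size and PE allocation
    vgdisplay test_vg                  # VG capacity, PE size, free PEs
    lvdisplay /dev/test_vg/test_lv     # LV size, status and stripe setup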
diff -urN 2.3.46pre1/MAINTAINERS 2.3.46pre1aa1/MAINTAINERS --- 2.3.46pre1/MAINTAINERS Sat Feb 12 21:03:23 2000 +++ 2.3.46pre1aa1/MAINTAINERS Wed Feb 16 00:28:24 2000 @@ -578,6 +578,13 @@ W: http://people.redhat.com/zab/maestro/ S: Supported +LOGICAL VOLUME MANAGER +P: Heinz Mauelshagen +M: linux-LVM@EZ-Darmstadt.Telekom.de +L: linux-LVM@msede.com +W: http://linux.msede.com/lvm +S: Maintained + M68K P: Jes Sorensen M: Jes.Sorensen@cern.ch diff -urN 2.3.46pre1/Makefile 2.3.46pre1aa1/Makefile --- 2.3.46pre1/Makefile Tue Feb 15 03:07:27 2000 +++ 2.3.46pre1aa1/Makefile Wed Feb 16 00:28:25 2000 @@ -330,13 +330,15 @@ echo \#define LINUX_COMPILE_DOMAIN ; \ fi >> .ver @echo \#define LINUX_COMPILER \"`$(CC) $(CFLAGS) -v 2>&1 | tail -1`\" >> .ver - @mv -f .ver $@ + @cp .ver $@ + @rm .ver include/linux/version.h: ./Makefile @echo \#define UTS_RELEASE \"$(KERNELRELEASE)\" > .ver @echo \#define LINUX_VERSION_CODE `expr $(VERSION) \\* 65536 + $(PATCHLEVEL) \\* 256 + $(SUBLEVEL)` >> .ver @echo '#define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c))' >>.ver - @mv -f .ver $@ + @cp .ver $@ + @rm .ver init/version.o: init/version.c include/linux/compile.h include/config/MARKER $(CC) $(CFLAGS) -DUTS_MACHINE='"$(ARCH)"' -c -o init/version.o init/version.c diff -urN 2.3.46pre1/arch/alpha/kernel/Makefile 2.3.46pre1aa1/arch/alpha/kernel/Makefile --- 2.3.46pre1/arch/alpha/kernel/Makefile Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/alpha/kernel/Makefile Wed Feb 16 00:28:24 2000 @@ -14,7 +14,7 @@ O_TARGET := kernel.o O_OBJS := entry.o traps.o process.o osf_sys.o irq.o signal.o setup.o \ - ptrace.o time.o semaphore.o + ptrace.o time.o semaphore.o i8259.o rtc_irq.o OX_OBJS := alpha_ksyms.o diff -urN 2.3.46pre1/arch/alpha/kernel/alpha_ksyms.c 2.3.46pre1aa1/arch/alpha/kernel/alpha_ksyms.c --- 2.3.46pre1/arch/alpha/kernel/alpha_ksyms.c Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/alpha/kernel/alpha_ksyms.c Wed Feb 16 00:28:24 2000 @@ -36,6 +36,7 @@ extern struct hwrpb_struct *hwrpb; extern void dump_thread(struct pt_regs *, struct user *); extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); +extern spinlock_t kernel_flag; /* these are C runtime functions with special calling conventions: */ extern void __divl (void); @@ -158,13 +159,16 @@ */ #ifdef __SMP__ +EXPORT_SYMBOL(kernel_flag); EXPORT_SYMBOL(synchronize_irq); EXPORT_SYMBOL(flush_tlb_all); EXPORT_SYMBOL(flush_tlb_mm); EXPORT_SYMBOL(flush_tlb_page); EXPORT_SYMBOL(flush_tlb_range); +EXPORT_SYMBOL(smp_imb); EXPORT_SYMBOL(cpu_data); EXPORT_SYMBOL(__cpu_number_map); +EXPORT_SYMBOL(smp_num_cpus); EXPORT_SYMBOL(global_irq_holder); EXPORT_SYMBOL(__global_cli); EXPORT_SYMBOL(__global_sti); diff -urN 2.3.46pre1/arch/alpha/kernel/i8259.c 2.3.46pre1aa1/arch/alpha/kernel/i8259.c --- 2.3.46pre1/arch/alpha/kernel/i8259.c Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/arch/alpha/kernel/i8259.c Wed Feb 16 00:28:24 2000 @@ -0,0 +1,123 @@ +/* started hacking from linux-2.3.30pre6/arch/i386/kernel/i8259.c */ + +#include +#include +#include +#include +#include + +#include +#include + +/* + * This is the 'legacy' 8259A Programmable Interrupt Controller, + * present in the majority of PC/AT boxes.
+ */ + +static void enable_8259A_irq(unsigned int irq); +static void disable_8259A_irq(unsigned int irq); + +/* shutdown is same as "disable" */ +#define end_8259A_irq enable_8259A_irq +#define shutdown_8259A_irq disable_8259A_irq + +static void mask_and_ack_8259A(unsigned int); + +static unsigned int startup_8259A_irq(unsigned int irq) +{ + enable_8259A_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type i8259A_irq_type = { + "XT-PIC", + startup_8259A_irq, + shutdown_8259A_irq, + enable_8259A_irq, + disable_8259A_irq, + mask_and_ack_8259A, + end_8259A_irq +}; + +/* + * 8259A PIC functions to handle ISA devices: + */ + +/* + * This contains the irq mask for both 8259A irq controllers, + */ +static unsigned int cached_irq_mask = 0xffff; + +#define __byte(x,y) (((unsigned char *)&(y))[x]) +#define cached_21 (__byte(0,cached_irq_mask)) +#define cached_A1 (__byte(1,cached_irq_mask)) + +/* + * These have to be protected by the irq controller spinlock + * before being called. + */ +static void disable_8259A_irq(unsigned int irq) +{ + unsigned int mask = 1 << irq; + cached_irq_mask |= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); +} + +static void enable_8259A_irq(unsigned int irq) +{ + unsigned int mask = ~(1 << irq); + cached_irq_mask &= mask; + if (irq & 8) + outb(cached_A1,0xA1); + else + outb(cached_21,0x21); +} + +static void mask_and_ack_8259A(unsigned int irq) +{ + disable_8259A_irq(irq); + + /* Ack the interrupt making it the lowest priority */ + /* First the slave .. */ + if (irq > 7) { + outb(0xE0 | (irq - 8), 0xa0); + irq = 2; + } + /* .. then the master */ + outb(0xE0 | irq, 0x20); +} + +static void init_8259A(void) +{ + outb(0xff, 0x21); /* mask all of 8259A-1 */ + outb(0xff, 0xA1); /* mask all of 8259A-2 */ +} + +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { no_action, 0, 0, "cascade", NULL, NULL}; + +void __init +init_ISA_irqs (void) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) { + if (i == RTC_IRQ) + continue; + if (i >= 16) + break; + irq_desc[i].status = IRQ_DISABLED; + /* + * 16 old-style INTA-cycle interrupts: + */ + irq_desc[i].handler = &i8259A_irq_type; + } + + init_8259A(); + setup_irq(2, &irq2); +} diff -urN 2.3.46pre1/arch/alpha/kernel/irq.c 2.3.46pre1aa1/arch/alpha/kernel/irq.c --- 2.3.46pre1/arch/alpha/kernel/irq.c Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/alpha/kernel/irq.c Wed Feb 16 00:28:24 2000 @@ -39,6 +39,7 @@ #ifndef __SMP__ int __local_irq_count; int __local_bh_count; +unsigned long __irq_attempt[NR_IRQS]; #endif #if NR_IRQS > 128 @@ -57,12 +58,6 @@ /* - * Shadow-copy of masked interrupts. - */ - -unsigned long _alpha_irq_masks[2] = { ~0UL, ~0UL }; - -/* * The ack_irq routine used by 80% of the systems. */ @@ -135,7 +130,7 @@ return; } } - handle_irq(j, j, regs); + handle_irq(j, regs); #else unsigned long pic; @@ -169,77 +164,201 @@ void srm_device_interrupt(unsigned long vector, struct pt_regs * regs) { - int irq, ack; + int irq; - ack = irq = (vector - 0x800) >> 4; - handle_irq(irq, ack, regs); + irq = (vector - 0x800) >> 4; + handle_irq(irq, regs); } /* + * Special irq handlers. + */ + +void no_action(int cpl, void *dev_id, struct pt_regs *regs) { } + +/* * Initial irq handlers. */ -static struct irqaction timer_irq = { NULL, 0, 0, NULL, NULL, NULL}; -spinlock_t irq_controller_lock = SPIN_LOCK_UNLOCKED; -irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { [0 ... 
NR_IRQS-1] = {0,} }; +static void enable_none(unsigned int irq) { } +static unsigned int startup_none(unsigned int irq) { return 0; } +static void disable_none(unsigned int irq) { } +static void ack_none(unsigned int irq) +{ + printk("unexpected IRQ trap at vector %02x\n", irq); +} + +/* startup is the same as "enable", shutdown is same as "disable" */ +#define shutdown_none disable_none +#define end_none enable_none + +struct hw_interrupt_type no_irq_type = { + "none", + startup_none, + shutdown_none, + enable_none, + disable_none, + ack_none, + end_none +}; +spinlock_t irq_controller_lock = SPIN_LOCK_UNLOCKED; +irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = + { [0 ... NR_IRQS-1] = { 0, &no_irq_type, }}; -static inline void -mask_irq(unsigned long irq) +int handle_IRQ_event(unsigned int irq, struct pt_regs * regs, struct irqaction * action) { - set_bit(irq, _alpha_irq_masks); - alpha_mv.update_irq_hw(irq, alpha_irq_mask, 0); -} + int status; + int cpu = smp_processor_id(); -static inline void -unmask_irq(unsigned long irq) -{ - clear_bit(irq, _alpha_irq_masks); - alpha_mv.update_irq_hw(irq, alpha_irq_mask, 1); + kstat.irqs[cpu][irq]++; + irq_enter(cpu, irq); + + status = 1; /* Force the "do bottom halves" bit */ + + do { + if (!(action->flags & SA_INTERRUPT)) + __sti(); + else + __cli(); + + status |= action->flags; + action->handler(irq, action->dev_id, regs); + action = action->next; + } while (action); + if (status & SA_SAMPLE_RANDOM) + add_interrupt_randomness(irq); + __cli(); + + irq_exit(cpu, irq); + + return status; } +/* + * Generic enable/disable code: this just calls + * down into the PIC-specific version for the actual + * hardware disable after having gotten the irq + * controller lock. + */ void -disable_irq_nosync(unsigned int irq_nr) +disable_irq_nosync(unsigned int irq) { unsigned long flags; - save_and_cli(flags); - mask_irq(irq_nr); - restore_flags(flags); + spin_lock_irqsave(&irq_controller_lock, flags); + if (!irq_desc[irq].depth++) { + irq_desc[irq].status |= IRQ_DISABLED; + irq_desc[irq].handler->disable(irq); + } + spin_unlock_irqrestore(&irq_controller_lock, flags); } +/* + * Synchronous version of the above, making sure the IRQ is + * no longer running on any other IRQ.. + */ void -disable_irq(unsigned int irq_nr) +disable_irq(unsigned int irq) { - /* This works non-SMP, and SMP until we write code to distribute - interrupts to more that cpu 0. */ - disable_irq_nosync(irq_nr); + disable_irq_nosync(irq); + + if (!local_irq_count(smp_processor_id())) { + do { + barrier(); + } while (irq_desc[irq].status & IRQ_INPROGRESS); + } } void -enable_irq(unsigned int irq_nr) +enable_irq(unsigned int irq) { unsigned long flags; - save_and_cli(flags); - unmask_irq(irq_nr); - restore_flags(flags); + spin_lock_irqsave(&irq_controller_lock, flags); + switch (irq_desc[irq].depth) { + case 1: { + unsigned int status = irq_desc[irq].status & ~IRQ_DISABLED; + irq_desc[irq].status = status; + if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { + irq_desc[irq].status = status | IRQ_REPLAY; + hw_resend_irq(irq_desc[irq].handler,irq); /* noop */ + } + irq_desc[irq].handler->enable(irq); + /* fall-through */ + } + default: + irq_desc[irq].depth--; + break; + case 0: + printk("enable_irq() unbalanced from %p\n", + __builtin_return_address(0)); + } + spin_unlock_irqrestore(&irq_controller_lock, flags); } int -check_irq(unsigned int irq) +setup_irq(unsigned int irq, struct irqaction * new) { - return irq_desc[irq].action ? 
-EBUSY : 0; + int shared = 0; + struct irqaction *old, **p; + unsigned long flags; + + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & SA_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block. + * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * The following block of code has to be executed atomically + */ + spin_lock_irqsave(&irq_controller_lock,flags); + p = &irq_desc[irq].action; + if ((old = *p) != NULL) { + /* Can't share interrupts unless both agree to */ + if (!(old->flags & new->flags & SA_SHIRQ)) { + spin_unlock_irqrestore(&irq_controller_lock,flags); + return -EBUSY; + } + + /* add new interrupt at end of irq queue */ + do { + p = &old->next; + old = *p; + } while (old); + shared = 1; + } + + *p = new; + + if (!shared) { + irq_desc[irq].depth = 0; + irq_desc[irq].status &= ~IRQ_DISABLED; + irq_desc[irq].handler->startup(irq); + } + spin_unlock_irqrestore(&irq_controller_lock,flags); + return 0; } int request_irq(unsigned int irq, void (*handler)(int, void *, struct pt_regs *), unsigned long irqflags, const char * devname, void *dev_id) { - int shared = 0; - struct irqaction * action, **p; - unsigned long flags; + int retval; + struct irqaction * action; if (irq >= ACTUAL_NR_IRQS) return -EINVAL; @@ -248,36 +367,25 @@ if (!handler) return -EINVAL; - p = &irq_desc[irq].action; - action = *p; - if (action) { - /* Can't share interrupts unless both agree to */ - if (!(action->flags & irqflags & SA_SHIRQ)) - return -EBUSY; - - /* Can't share interrupts unless both are same type */ - if ((action->flags ^ irqflags) & SA_INTERRUPT) - return -EBUSY; - - /* Add new interrupt at end of irq queue */ - do { - p = &action->next; - action = *p; - } while (action); - shared = 1; +#if 1 + /* + * Sanity-check: shared interrupts should REALLY pass in + * a real dev-ID, otherwise we'll have trouble later trying + * to figure out which interrupt is which (messes up the + * interrupt freeing logic etc). 
+ */ + if (irqflags & SA_SHIRQ) { + if (!dev_id) + printk("Bad boy: %s (at %p) called us without a dev_id!\n", + devname, __builtin_return_address(0)); } +#endif - action = &timer_irq; - if (irq != TIMER_IRQ) { - action = (struct irqaction *) + action = (struct irqaction *) kmalloc(sizeof(struct irqaction), GFP_KERNEL); - } if (!action) return -ENOMEM; - if (irqflags & SA_SAMPLE_RANDOM) - rand_initialize_irq(irq); - action->handler = handler; action->flags = irqflags; action->mask = 0; @@ -285,20 +393,16 @@ action->next = NULL; action->dev_id = dev_id; - save_and_cli(flags); - *p = action; - - if (!shared) - unmask_irq(irq); - - restore_flags(flags); - return 0; + retval = setup_irq(irq, action); + if (retval) + kfree(action); + return retval; } - + void free_irq(unsigned int irq, void *dev_id) { - struct irqaction * action, **p; + struct irqaction **p; unsigned long flags; if (irq >= ACTUAL_NR_IRQS) { @@ -309,25 +413,39 @@ printk("Trying to free reserved IRQ %d\n", irq); return; } - for (p = &irq_desc[irq].action; (action = *p) != NULL; p = &action->next) { - if (action->dev_id != dev_id) - continue; + spin_lock_irqsave(&irq_controller_lock,flags); + p = &irq_desc[irq].action; + for (;;) { + struct irqaction * action = *p; + if (action) { + struct irqaction **pp = p; + p = &action->next; + if (action->dev_id != dev_id) + continue; - /* Found it - now free it */ - save_and_cli(flags); - *p = action->next; - if (!irq_desc[irq].action) - mask_irq(irq); - restore_flags(flags); - kfree(action); + /* Found it - now remove it from the list of entries */ + *pp = action->next; + if (!irq_desc[irq].action) { + irq_desc[irq].status |= IRQ_DISABLED; + irq_desc[irq].handler->shutdown(irq); + } + spin_unlock_irqrestore(&irq_controller_lock,flags); + + /* Wait to make sure it's not being used on another CPU */ + while (irq_desc[irq].status & IRQ_INPROGRESS) + barrier(); + kfree(action); + return; + } + printk("Trying to free free IRQ%d\n",irq); + spin_unlock_irqrestore(&irq_controller_lock,flags); return; } - printk("Trying to free free IRQ%d\n",irq); } int get_irq_list(char *buf) { - int i; + int i, j; struct irqaction * action; char *p = buf; @@ -335,6 +453,8 @@ p += sprintf(p, " "); for (i = 0; i < smp_num_cpus; i++) p += sprintf(p, "CPU%d ", i); + for (i = 0; i < smp_num_cpus; i++) + p += sprintf(p, "TRY%d ", i); *p++ = '\n'; #endif @@ -346,13 +466,14 @@ #ifndef __SMP__ p += sprintf(p, "%10u ", kstat_irqs(i)); #else - { - int j; - for (j = 0; j < smp_num_cpus; j++) - p += sprintf(p, "%10u ", - kstat.irqs[cpu_logical_map(j)][i]); - } + for (j = 0; j < smp_num_cpus; j++) + p += sprintf(p, "%10u ", + kstat.irqs[cpu_logical_map(j)][i]); + for (j = 0; j < smp_num_cpus; j++) + p += sprintf(p, "%10lu ", + irq_attempt(cpu_logical_map(j), i)); #endif + p += sprintf(p, " %14s", irq_desc[i].handler->typename); p += sprintf(p, " %c%s", (action->flags & SA_INTERRUPT)?'+':' ', action->name); @@ -364,6 +485,13 @@ } *p++ = '\n'; } +#if CONFIG_SMP + p += sprintf(p, "LOC: "); + for (j = 0; j < smp_num_cpus; j++) + p += sprintf(p, "%10lu ", + cpu_data[cpu_logical_map(j)].smp_local_irq_count); + p += sprintf(p, "\n"); +#endif return p - buf; } @@ -605,139 +733,157 @@ } #endif /* __SMP__ */ -static void -unexpected_irq(int irq, struct pt_regs * regs) -{ -#if 0 -#if 1 - printk("device_interrupt: unexpected interrupt %d\n", irq); -#else - struct irqaction *action; - int i; - - printk("IO device interrupt, irq = %d\n", irq); - printk("PC = %016lx PS=%04lx\n", regs->pc, regs->ps); - printk("Expecting: "); - for (i = 0; i < 
ACTUAL_NR_IRQS; i++) - if ((action = irq_desc[i].action)) - while (action->handler) { - printk("[%s:%d] ", action->name, i); - action = action->next; - } - printk("\n"); -#endif -#endif - -#if defined(CONFIG_ALPHA_JENSEN) - /* ??? Is all this just debugging, or are the inb's and outb's - necessary to make things work? */ - printk("64=%02x, 60=%02x, 3fa=%02x 2fa=%02x\n", - inb(0x64), inb(0x60), inb(0x3fa), inb(0x2fa)); - outb(0x0c, 0x3fc); - outb(0x0c, 0x2fc); - outb(0,0x61); - outb(0,0x461); -#endif -} - +/* + * do_IRQ handles all normal device IRQ's (the special + * SMP cross-CPU interrupts have their own specific + * handlers). + */ void -handle_irq(int irq, int ack, struct pt_regs * regs) -{ - struct irqaction * action; +handle_irq(int irq, struct pt_regs * regs) +{ + /* + * We ack quickly, we don't want the irq controller + * thinking we're snobs just because some other CPU has + * disabled global interrupts (we have already done the + * INT_ACK cycles, it's too late to try to pretend to the + * controller that we aren't taking the interrupt). + * + * 0 return value means that this irq is already being + * handled by some other CPU. (or is disabled) + */ int cpu = smp_processor_id(); + irq_desc_t *desc; + struct irqaction * action; + unsigned int status; if ((unsigned) irq > ACTUAL_NR_IRQS) { printk("device_interrupt: illegal interrupt %d\n", irq); return; } -#if 0 - /* A useful bit of code to find out if an interrupt is going wild. */ - { - static unsigned int last_msg, last_cc; - static int last_irq, count; - unsigned int cc; - - __asm __volatile("rpcc %0" : "=r"(cc)); - ++count; - if (cc - last_msg > 150000000 || irq != last_irq) { - printk("handle_irq: irq %d count %d cc %u @ %p\n", - irq, count, cc-last_cc, regs->pc); - count = 0; - last_msg = cc; - last_irq = irq; - } - last_cc = cc; + irq_attempt(cpu, irq)++; + desc = irq_desc + irq; + spin_lock_irq(&irq_controller_lock); /* mask also the RTC */ + desc->handler->ack(irq); + /* + REPLAY is when Linux resends an IRQ that was dropped earlier + WAITING is used by probe to mark irqs that are being tested + */ + status = desc->status & ~(IRQ_REPLAY | IRQ_WAITING); + status |= IRQ_PENDING; /* we _want_ to handle it */ + + /* + * If the IRQ is disabled for whatever reason, we cannot + * use the action we have. + */ + action = NULL; + if (!(status & (IRQ_DISABLED | IRQ_INPROGRESS))) { + action = desc->action; + status &= ~IRQ_PENDING; /* we commit to handling */ + status |= IRQ_INPROGRESS; /* we are handling it */ } -#endif + desc->status = status; + spin_unlock(&irq_controller_lock); - irq_enter(cpu, irq); - kstat.irqs[cpu][irq] += 1; - action = irq_desc[irq].action; + /* + * If there is no IRQ handler or it was disabled, exit early. + Since we set PENDING, if another processor is handling + a different instance of this same irq, the other processor + will take care of it. + */ + if (!action) + return; /* - * For normal interrupts, we mask it out, and then ACK it. - * This way another (more timing-critical) interrupt can - * come through while we're doing this one. - * - * Note! An irq without a handler gets masked and acked, but - * never unmasked. The autoirq stuff depends on this (it looks - * at the masks before and after doing the probing). 
- */ - if (ack >= 0) { - mask_irq(ack); - alpha_mv.ack_irq(ack); - } - if (action) { - if (action->flags & SA_SAMPLE_RANDOM) - add_interrupt_randomness(irq); - do { - action->handler(irq, action->dev_id, regs); - action = action->next; - } while (action); - if (ack >= 0) - unmask_irq(ack); - } else { - unexpected_irq(irq, regs); + * Edge triggered interrupts need to remember + * pending events. + * This applies to any hw interrupts that allow a second + * instance of the same irq to arrive while we are in do_IRQ + * or in the handler. But the code here only handles the _second_ + * instance of the irq, not the third or fourth. So it is mostly + * useful for irq hardware that does not mask cleanly in an + * SMP environment. + */ + for (;;) { + handle_IRQ_event(irq, regs, action); + spin_lock(&irq_controller_lock); + + if (!(desc->status & IRQ_PENDING) + || (desc->status & IRQ_LEVEL)) + break; + desc->status &= ~IRQ_PENDING; + spin_unlock(&irq_controller_lock); } - irq_exit(cpu, irq); + desc->status &= ~IRQ_INPROGRESS; + if (!(desc->status & IRQ_DISABLED)) + desc->handler->end(irq); + spin_unlock(&irq_controller_lock); } - /* - * Start listening for interrupts.. + * IRQ autodetection code.. + * + * This depends on the fact that any interrupt that + * comes in on to an unassigned handler will get stuck + * with "IRQ_WAITING" cleared and the interrupt + * disabled. */ - unsigned long probe_irq_on(void) { - struct irqaction * action; - unsigned long irqs = 0; - unsigned long delay; unsigned int i; + unsigned long delay; - /* Handle only the first 64 IRQs here. This is enough for - [E]ISA, which is the only thing that needs probing anyway. */ - for (i = (ACTUAL_NR_IRQS - 1) & 63; i > 0; i--) { - if (!(PROBE_MASK & (1UL << i))) { - continue; - } - action = irq_desc[i].action; - if (!action) { - enable_irq(i); - irqs |= (1UL << i); + /* Something may have generated an irq long ago and we want to + flush such a longstanding irq before considering it as spurious. */ + spin_lock_irq(&irq_controller_lock); + for (i = NR_IRQS-1; i > 0; i--) + if (!irq_desc[i].action) + irq_desc[i].handler->startup(i); + spin_unlock_irq(&irq_controller_lock); + + /* Wait for longstanding interrupts to trigger. */ + for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) + /* about 20ms delay */ synchronize_irq(); + + /* enable any unassigned irqs (we must startup again here because + if a longstanding irq happened in the previous stage, it may have + masked itself). */ + spin_lock_irq(&irq_controller_lock); + for (i = NR_IRQS-1; i > 0; i--) { + if (!irq_desc[i].action) { + irq_desc[i].status |= IRQ_AUTODETECT | IRQ_WAITING; + if(irq_desc[i].handler->startup(i)) + irq_desc[i].status |= IRQ_PENDING; + } + } + spin_unlock_irq(&irq_controller_lock); + + /* + * Wait for spurious interrupts to trigger + */ + for (delay = jiffies + HZ/10; time_after(delay, jiffies); ) + /* about 100ms delay */ synchronize_irq(); + + /* + * Now filter out any obviously spurious interrupts + */ + spin_lock_irq(&irq_controller_lock); + for (i=0; i<NR_IRQS; i++) { + unsigned int status = irq_desc[i].status; + + if (!(status & IRQ_AUTODETECT)) + continue; + + /* It triggered already - consider it spurious. */ + if (!(status & IRQ_WAITING)) { + irq_desc[i].status = status & ~IRQ_AUTODETECT; + irq_desc[i].handler->shutdown(i); + } + } + spin_unlock_irq(&irq_controller_lock); + + return 0x12345678; } /* @@ -747,19 +893,35 @@ */ int -probe_irq_off(unsigned long irqs) +probe_irq_off(unsigned long unused) { - int i; - - /* Handle only the first 64 IRQs here. This is enough for - [E]ISA, which is the only thing that needs probing anyway.
*/ - irqs &= alpha_irq_mask; - if (!irqs) - return 0; - i = ffz(~irqs); - if (irqs != (1UL << i)) - i = -i; - return i; + int i, irq_found, nr_irqs; + + if (unused != 0x12345678) + printk("Bad IRQ probe from %lx\n", (&unused)[-1]); + + nr_irqs = 0; + irq_found = 0; + spin_lock_irq(&irq_controller_lock); + for (i=0; i<NR_IRQS; i++) { + unsigned int status = irq_desc[i].status; + + if (!(status & IRQ_AUTODETECT)) + continue; + + if (!(status & IRQ_WAITING)) { + if (!nr_irqs) + irq_found = i; + nr_irqs++; + } + irq_desc[i].status = status & ~IRQ_AUTODETECT; + irq_desc[i].handler->shutdown(i); + } + spin_unlock_irq(&irq_controller_lock); + + if (nr_irqs > 1) + irq_found = -irq_found; + return irq_found; } @@ -782,7 +944,12 @@ #endif break; case 1: - handle_irq(RTC_IRQ, -1, &regs); +#ifdef __SMP__ + cpu_data[smp_processor_id()].smp_local_irq_count++; + smp_percpu_timer_interrupt(&regs); + if (smp_processor_id() == smp_boot_cpuid) +#endif + handle_irq(RTC_IRQ, &regs); return; case 2: alpha_mv.machine_check(vector, la_ptr, &regs); diff -urN 2.3.46pre1/arch/alpha/kernel/process.c 2.3.46pre1aa1/arch/alpha/kernel/process.c --- 2.3.46pre1/arch/alpha/kernel/process.c Wed Dec 8 00:05:25 1999 +++ 2.3.46pre1aa1/arch/alpha/kernel/process.c Wed Feb 16 00:28:24 2000 @@ -30,9 +30,11 @@ #include #include +#if 0 #ifdef CONFIG_RTC #include #endif +#endif #include #include @@ -139,9 +141,11 @@ #endif } +#if 0 #ifdef CONFIG_RTC /* Reset rtc to defaults. */ rtc_kill_pit(); +#endif #endif if (alpha_mv.kill_arch) diff -urN 2.3.46pre1/arch/alpha/kernel/rtc_irq.c 2.3.46pre1aa1/arch/alpha/kernel/rtc_irq.c --- 2.3.46pre1/arch/alpha/kernel/rtc_irq.c Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/arch/alpha/kernel/rtc_irq.c Wed Feb 16 00:28:24 2000 @@ -0,0 +1,26 @@ +/* RTC irq callbacks, 1999 Andrea Arcangeli */ + +#include +#include +#include + +static void enable_rtc(unsigned int irq) { } +static unsigned int startup_rtc(unsigned int irq) { return 0; } +#define shutdown_rtc enable_rtc +#define end_rtc enable_rtc +#define ack_rtc enable_rtc +#define disable_rtc enable_rtc + +void __init +init_RTC_irq(void) +{ + static struct hw_interrupt_type rtc_irq_type = { "RTC", + startup_rtc, + shutdown_rtc, + enable_rtc, + disable_rtc, + ack_rtc, + end_rtc }; + irq_desc[RTC_IRQ].status = IRQ_DISABLED; + irq_desc[RTC_IRQ].handler = &rtc_irq_type; +} diff -urN 2.3.46pre1/arch/alpha/kernel/setup.c 2.3.46pre1aa1/arch/alpha/kernel/setup.c --- 2.3.46pre1/arch/alpha/kernel/setup.c Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/alpha/kernel/setup.c Wed Feb 16 00:28:24 2000 @@ -30,9 +30,11 @@ #include #include +#if 0 #ifdef CONFIG_RTC #include #endif +#endif #ifdef CONFIG_BLK_DEV_INITRD #include #endif @@ -453,6 +455,7 @@ /* Reserve standard resources. */ reserve_std_resources(); +#if 0 /* Initialize the timers. */ /* ??? There is some circumstantial evidence that this needs to be done now rather than later in time_init, which would @@ -461,6 +464,7 @@ rtc_init_pit(); #else alpha_mv.init_pit(); +#endif #endif /* diff -urN 2.3.46pre1/arch/alpha/kernel/signal.c 2.3.46pre1aa1/arch/alpha/kernel/signal.c --- 2.3.46pre1/arch/alpha/kernel/signal.c Wed Nov 24 18:22:03 1999 +++ 2.3.46pre1aa1/arch/alpha/kernel/signal.c Wed Feb 16 00:28:25 2000 @@ -437,6 +437,8 @@ err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); } + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -499,6 +501,8 @@ err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, sw, set->sig[0], oldsp); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace.
*/ diff -urN 2.3.46pre1/arch/alpha/kernel/smp.c 2.3.46pre1aa1/arch/alpha/kernel/smp.c --- 2.3.46pre1/arch/alpha/kernel/smp.c Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/alpha/kernel/smp.c Wed Feb 16 00:28:24 2000 @@ -62,6 +62,7 @@ static unsigned long smp_secondary_alive; unsigned long cpu_present_mask; /* Which cpus ids came online. */ +static unsigned long __cpu_present_mask __initdata = 0; /* cpu reported in the hwrpb */ static int max_cpus = -1; /* Command-line limitation. */ int smp_boot_cpuid; /* Which processor we booted from. */ @@ -506,7 +507,7 @@ if ((cpu->flags & 0x1cc) == 0x1cc) { smp_num_probed++; /* Assume here that "whami" == index */ - cpu_present_mask |= (1L << i); + __cpu_present_mask |= (1L << i); cpu->pal_revision = boot_cpu_palrev; } @@ -517,11 +518,12 @@ } } else { smp_num_probed = 1; - cpu_present_mask = (1L << smp_boot_cpuid); + __cpu_present_mask = (1L << smp_boot_cpuid); } + cpu_present_mask = 1L << smp_boot_cpuid; printk(KERN_INFO "SMP: %d CPUs probed -- cpu_present_mask = %lx\n", - smp_num_probed, cpu_present_mask); + smp_num_probed, __cpu_present_mask); } /* @@ -565,12 +567,13 @@ if (i == smp_boot_cpuid) continue; - if (((cpu_present_mask >> i) & 1) == 0) + if (((__cpu_present_mask >> i) & 1) == 0) continue; if (smp_boot_one_cpu(i, cpu_count)) continue; + cpu_present_mask |= 1L << i; cpu_count++; } @@ -865,6 +868,22 @@ } return 0; +} + +static void +ipi_imb(void) +{ + imb(); +} + +void +smp_imb(void) +{ + /* Must wait for the other processors to flush their icache before continuing. */ + if (smp_call_function(ipi_imb, NULL, 1, 1)) + printk(KERN_CRIT "smp_imb: timed out\n"); + + imb(); } static void diff -urN 2.3.46pre1/arch/alpha/kernel/sys_dp264.c 2.3.46pre1aa1/arch/alpha/kernel/sys_dp264.c --- 2.3.46pre1/arch/alpha/kernel/sys_dp264.c Wed Dec 8 00:05:25 1999 +++ 2.3.46pre1aa1/arch/alpha/kernel/sys_dp264.c Wed Feb 16 00:28:24 2000 @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -36,60 +37,158 @@ * HACK ALERT! only the boot cpu is used for interrupts.
*/ +static void enable_tsunami_irq(unsigned int irq); +static void disable_tsunami_irq(unsigned int irq); +static void enable_clipper_irq(unsigned int irq); +static void disable_clipper_irq(unsigned int irq); + +#define end_tsunami_irq enable_tsunami_irq +#define shutdown_tsunami_irq disable_tsunami_irq +#define mask_and_ack_tsunami_irq disable_tsunami_irq + +#define end_clipper_irq enable_clipper_irq +#define shutdown_clipper_irq disable_clipper_irq +#define mask_and_ack_clipper_irq disable_clipper_irq + + +static unsigned int +startup_tsunami_irq(unsigned int irq) +{ + enable_tsunami_irq(irq); + return 0; /* never anything pending */ +} + +static unsigned int +startup_clipper_irq(unsigned int irq) +{ + enable_clipper_irq(irq); + return 0; /* never anything pending */ +} + +static struct hw_interrupt_type tsunami_irq_type = { + "TSUNAMI", + startup_tsunami_irq, + shutdown_tsunami_irq, + enable_tsunami_irq, + disable_tsunami_irq, + mask_and_ack_tsunami_irq, + end_tsunami_irq +}; + +static struct hw_interrupt_type clipper_irq_type = { + "CLIPPER", + startup_clipper_irq, + shutdown_clipper_irq, + enable_clipper_irq, + disable_clipper_irq, + mask_and_ack_clipper_irq, + end_clipper_irq +}; + +static unsigned long cached_irq_mask = ~0UL; + +#define TSUNAMI_SET_IRQ_MASK(cpu, value) \ +do { \ + volatile unsigned long *csr; \ + \ + csr = &TSUNAMI_cchip->dim##cpu##.csr; \ + *csr = (value); \ + mb(); \ + *csr; \ +} while(0) + +static inline void +do_flush_irq_mask(unsigned long value) +{ + switch (TSUNAMI_bootcpu) + { + case 0: + TSUNAMI_SET_IRQ_MASK(0, value); + break; + case 1: + TSUNAMI_SET_IRQ_MASK(1, value); + break; + case 2: + TSUNAMI_SET_IRQ_MASK(2, value); + break; + case 3: + TSUNAMI_SET_IRQ_MASK(3, value); + break; + } +} + +#ifdef CONFIG_SMP +static inline void +do_flush_smp_irq_mask(unsigned long value) +{ + extern unsigned long cpu_present_mask; + unsigned long other_cpus = cpu_present_mask & ~(1L << TSUNAMI_bootcpu); + + if (other_cpus & 1) + TSUNAMI_SET_IRQ_MASK(0, value); + if (other_cpus & 2) + TSUNAMI_SET_IRQ_MASK(1, value); + if (other_cpus & 4) + TSUNAMI_SET_IRQ_MASK(2, value); + if (other_cpus & 8) + TSUNAMI_SET_IRQ_MASK(3, value); +} +#endif + static void -dp264_update_irq_hw(unsigned long irq, unsigned long mask, int unmask_p) +dp264_flush_irq_mask(unsigned long mask) { - volatile unsigned long *csr; + unsigned long value; - if (TSUNAMI_bootcpu < 2) { - if (!TSUNAMI_bootcpu) - csr = &TSUNAMI_cchip->dim0.csr; - else - csr = &TSUNAMI_cchip->dim1.csr; - } else { - if (TSUNAMI_bootcpu == 2) - csr = &TSUNAMI_cchip->dim2.csr; - else - csr = &TSUNAMI_cchip->dim3.csr; - } +#ifdef CONFIG_SMP + value = ~mask; + do_flush_smp_irq_mask(value); +#endif - *csr = ~mask; - mb(); - *csr; - - if (irq < 16) { - if (irq >= 8) - outb(mask >> 8, 0xA1); /* ISA PIC2 */ - else - outb(mask, 0x21); /* ISA PIC1 */ - } + value = ~mask | (1UL << 55) | 0xffff; /* isa irqs always enabled */ + do_flush_irq_mask(value); } static void -clipper_update_irq_hw(unsigned long irq, unsigned long mask, int unmask_p) +enable_tsunami_irq(unsigned int irq) { - if (irq >= 16) { - volatile unsigned long *csr; + cached_irq_mask &= ~(1UL << irq); + dp264_flush_irq_mask(cached_irq_mask); +} - if (TSUNAMI_bootcpu < 2) - if (!TSUNAMI_bootcpu) - csr = &TSUNAMI_cchip->dim0.csr; - else - csr = &TSUNAMI_cchip->dim1.csr; - else - if (TSUNAMI_bootcpu == 2) - csr = &TSUNAMI_cchip->dim2.csr; - else - csr = &TSUNAMI_cchip->dim3.csr; - - *csr = (~mask >> 16) | (1UL << 55); /* master ISA enable */ - mb(); - *csr; - } - else if (irq >= 8) 
- outb(mask >> 8, 0xA1); /* ISA PIC2 */ - else - outb(mask, 0x21); /* ISA PIC1 */ +static void +disable_tsunami_irq(unsigned int irq) +{ + cached_irq_mask |= 1UL << irq; + dp264_flush_irq_mask(cached_irq_mask); +} + +static void +clipper_flush_irq_mask(unsigned long mask) +{ + unsigned long value; + +#ifdef CONFIG_SMP + value = ~mask >> 16; + do_flush_smp_irq_mask(value); +#endif + + value = (~mask >> 16) | (1UL << 55); /* master ISA enable */ + do_flush_irq_mask(value); +} + +static void +enable_clipper_irq(unsigned int irq) +{ + cached_irq_mask &= ~(1UL << irq); + clipper_flush_irq_mask(cached_irq_mask); +} + +static void +disable_clipper_irq(unsigned int irq) +{ + cached_irq_mask |= 1UL << irq; + clipper_flush_irq_mask(cached_irq_mask); } static void @@ -126,9 +225,9 @@ static void dp264_srm_device_interrupt(unsigned long vector, struct pt_regs * regs) { - int irq, ack; + int irq; - ack = irq = (vector - 0x800) >> 4; + irq = (vector - 0x800) >> 4; /* * The SRM console reports PCI interrupts with a vector calculated by: @@ -142,17 +241,17 @@ * so we don't count them. */ if (irq >= 32) - ack = irq = irq - 16; + irq -= 16; - handle_irq(irq, ack, regs); + handle_irq(irq, regs); } static void clipper_srm_device_interrupt(unsigned long vector, struct pt_regs * regs) { - int irq, ack; + int irq; - ack = irq = (vector - 0x800) >> 4; + irq = (vector - 0x800) >> 4; /* * The SRM console reports PCI interrupts with a vector calculated by: @@ -166,7 +265,22 @@ * * Eg IRQ 24 is DRIR bit 8, etc, etc */ - handle_irq(irq, ack, regs); + handle_irq(irq, regs); +} + +static void __init +init_TSUNAMI_irqs(struct hw_interrupt_type * ops) +{ + int i; + + for (i = 0; i < NR_IRQS; i++) { + if (i == RTC_IRQ) + continue; + if (i < 16) + continue; + irq_desc[i].status = IRQ_DISABLED | IRQ_LEVEL; + irq_desc[i].handler = ops; + } } static void __init @@ -180,10 +294,11 @@ if (alpha_using_srm) alpha_mv.device_interrupt = dp264_srm_device_interrupt; - dp264_update_irq_hw(16, alpha_irq_mask, 0); + init_ISA_irqs(); + init_RTC_irq(); + init_TSUNAMI_irqs(&tsunami_irq_type); - enable_irq(55); /* Enable ISA interrupt controller. */ - enable_irq(2); + dp264_flush_irq_mask(~0UL); } static void __init @@ -197,10 +312,11 @@ if (alpha_using_srm) alpha_mv.device_interrupt = clipper_srm_device_interrupt; - clipper_update_irq_hw(16, alpha_irq_mask, 0); + init_ISA_irqs(); + init_RTC_irq(); + init_TSUNAMI_irqs(&clipper_irq_type); - enable_irq(55); /* Enable ISA interrupt controller. 
*/ - enable_irq(2); + clipper_flush_irq_mask(~0UL); } @@ -431,9 +547,6 @@ min_mem_address: DEFAULT_MEM_BASE, nr_irqs: 64, - irq_probe_mask: TSUNAMI_PROBE_MASK, - update_irq_hw: dp264_update_irq_hw, - ack_irq: common_ack_irq, device_interrupt: dp264_device_interrupt, init_arch: tsunami_init_arch, @@ -458,9 +571,6 @@ min_mem_address: DEFAULT_MEM_BASE, nr_irqs: 64, - irq_probe_mask: TSUNAMI_PROBE_MASK, - update_irq_hw: dp264_update_irq_hw, - ack_irq: common_ack_irq, device_interrupt: dp264_device_interrupt, init_arch: tsunami_init_arch, @@ -484,9 +594,6 @@ min_mem_address: DEFAULT_MEM_BASE, nr_irqs: 64, - irq_probe_mask: TSUNAMI_PROBE_MASK, - update_irq_hw: dp264_update_irq_hw, - ack_irq: common_ack_irq, device_interrupt: dp264_device_interrupt, init_arch: tsunami_init_arch, @@ -510,9 +617,6 @@ min_mem_address: DEFAULT_MEM_BASE, nr_irqs: 64, - irq_probe_mask: TSUNAMI_PROBE_MASK, - update_irq_hw: clipper_update_irq_hw, - ack_irq: common_ack_irq, device_interrupt: dp264_device_interrupt, init_arch: tsunami_init_arch, diff -urN 2.3.46pre1/arch/alpha/kernel/sys_sx164.c 2.3.46pre1aa1/arch/alpha/kernel/sys_sx164.c --- 2.3.46pre1/arch/alpha/kernel/sys_sx164.c Wed Dec 8 00:05:25 1999 +++ 2.3.46pre1aa1/arch/alpha/kernel/sys_sx164.c Wed Feb 16 00:28:24 2000 @@ -14,6 +14,8 @@ #include #include #include +#include +#include #include #include @@ -26,47 +28,83 @@ #include #include "proto.h" -#include #include "pci_impl.h" #include "machvec_impl.h" +/* Note invert on MASK bits. */ +static unsigned long cached_irq_mask; + +static inline void +sx164_change_irq_mask(unsigned long mask) +{ + *(vulp)PYXIS_INT_MASK = mask; + mb(); + *(vulp)PYXIS_INT_MASK; +} + +static inline void +sx164_enable_irq(unsigned int irq) +{ + sx164_change_irq_mask(cached_irq_mask |= 1UL << (irq - 16)); +} + static void -sx164_update_irq_hw(unsigned long irq, unsigned long mask, int unmask_p) +sx164_disable_irq(unsigned int irq) { - if (irq >= 16) { - /* Make CERTAIN none of the bogus ints get enabled */ - *(vulp)PYXIS_INT_MASK = - ~((long)mask >> 16) & ~0x000000000000003bUL; - mb(); - /* ... and read it back to make sure it got written. 
*/ - *(vulp)PYXIS_INT_MASK; - } - else if (irq >= 8) - outb(mask >> 8, 0xA1); /* ISA PIC2 */ - else - outb(mask, 0x21); /* ISA PIC1 */ + sx164_change_irq_mask(cached_irq_mask &= ~(1UL << (irq - 16))); +} + +static unsigned int +sx164_startup_irq(unsigned int irq) +{ + sx164_enable_irq(irq); + return 0; +} + +static inline void +sx164_srm_enable_irq(unsigned int irq) +{ + cserve_ena(irq - 16); } static void -sx164_srm_update_irq_hw(unsigned long irq, unsigned long mask, int unmask_p) +sx164_srm_disable_irq(unsigned int irq) { - if (irq >= 16) { - if (unmask_p) - cserve_ena(irq - 16); - else - cserve_dis(irq - 16); - } - else if (irq >= 8) - outb(mask >> 8, 0xA1); /* ISA PIC2 */ - else - outb(mask, 0x21); /* ISA PIC1 */ + cserve_dis(irq - 16); } +static unsigned int +sx164_srm_startup_irq(unsigned int irq) +{ + sx164_srm_enable_irq(irq); + return 0; +} + +static struct hw_interrupt_type sx164_irq_type = { + typename: "SX164", + startup: sx164_startup_irq, + shutdown: sx164_disable_irq, + enable: sx164_enable_irq, + disable: sx164_disable_irq, + ack: sx164_disable_irq, + end: sx164_enable_irq, +}; + +static struct hw_interrupt_type sx164_srm_irq_type = { + typename: "SX164-SRM", + startup: sx164_srm_startup_irq, + shutdown: sx164_srm_disable_irq, + enable: sx164_srm_enable_irq, + disable: sx164_srm_disable_irq, + ack: sx164_srm_disable_irq, + end: sx164_srm_enable_irq, +}; + static void sx164_device_interrupt(unsigned long vector, struct pt_regs *regs) { - unsigned long pld, tmp; + unsigned long pld; unsigned int i; /* Read the interrupt summary register of PYXIS */ @@ -93,35 +131,48 @@ continue; } else { /* if not timer int */ - handle_irq(16 + i, 16 + i, regs); + handle_irq(16 + i, regs); } - *(vulp)PYXIS_INT_REQ = 1UL << i; mb(); - tmp = *(vulp)PYXIS_INT_REQ; + + *(vulp)PYXIS_INT_REQ = 1UL << i; + mb(); + *(vulp)PYXIS_INT_REQ; } } static void sx164_init_irq(void) { + struct hw_interrupt_type *ops; + long i; + outb(0, DMA1_RESET_REG); outb(0, DMA2_RESET_REG); outb(DMA_MODE_CASCADE, DMA2_MODE_REG); outb(0, DMA2_MASK_REG); + init_ISA_irqs(); + init_RTC_irq(); + if (alpha_using_srm) { - alpha_mv.update_irq_hw = sx164_srm_update_irq_hw; alpha_mv.device_interrupt = srm_device_interrupt; + ops = &sx164_srm_irq_type; } else { - /* Note invert on MASK bits. */ - *(vulp)PYXIS_INT_MASK = ~((long)alpha_irq_mask >> 16); - mb(); - *(vulp)PYXIS_INT_MASK; + sx164_change_irq_mask(0); + ops = &sx164_irq_type; + } + + for (i = 16; i < 40; ++i) { + /* Make CERTAIN none of the bogus ints get enabled. */ + if ((0x3b0000 >> i) & 1) + continue; + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].handler = ops; } - enable_irq(16 + 6); /* enable timer */ - enable_irq(16 + 7); /* enable ISA PIC cascade */ - enable_irq(2); /* enable cascade */ + ops->startup(16 + 6); /* enable timer */ + ops->startup(16 + 7); /* enable ISA PIC cascade */ } /* @@ -202,9 +253,6 @@ min_mem_address: DEFAULT_MEM_BASE, nr_irqs: 40, - irq_probe_mask: _PROBE_MASK(40), - update_irq_hw: sx164_update_irq_hw, - ack_irq: common_ack_irq, device_interrupt: sx164_device_interrupt, init_arch: pyxis_init_arch, diff -urN 2.3.46pre1/arch/alpha/kernel/time.c 2.3.46pre1aa1/arch/alpha/kernel/time.c --- 2.3.46pre1/arch/alpha/kernel/time.c Wed Dec 8 00:05:25 1999 +++ 2.3.46pre1aa1/arch/alpha/kernel/time.c Wed Feb 16 00:28:24 2000 @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include @@ -88,13 +90,7 @@ __u32 now; long nticks; -#ifdef __SMP__ - /* When SMP, do this for *all* CPUs, but only do the rest for - the boot CPU. 
*/ - smp_percpu_timer_interrupt(regs); - if (smp_processor_id() != smp_boot_cpuid) - return; -#else +#ifndef __SMP__ /* Not SMP, do kernel PC profiling here. */ if (!user_mode(regs)) alpha_do_profile(regs->pc); @@ -167,6 +163,7 @@ )*60 + sec; /* finally seconds */ } +#if 0 /* * Initialize Programmable Interval Timers with standard values. Some * drivers depend on them being initialized (e.g., joystick driver). @@ -213,6 +210,7 @@ sti(); } #endif +#endif void common_init_pit (void) @@ -248,10 +246,15 @@ void time_init(void) { - void (*irq_handler)(int, void *, struct pt_regs *); unsigned int year, mon, day, hour, min, sec, cc1, cc2; unsigned long cycle_freq, one_percent; long diff; + static struct irqaction timer_irqaction = { timer_interrupt, + SA_INTERRUPT, 0, "timer", + NULL, NULL}; + + /* Startup the timer source. */ + alpha_mv.init_pit(); /* * The Linux interpretation of the CMOS clock register contents: @@ -337,9 +340,7 @@ state.partial_tick = 0L; /* setup timer */ - irq_handler = timer_interrupt; - if (request_irq(TIMER_IRQ, irq_handler, 0, "timer", NULL)) - panic("Could not allocate timer IRQ!"); + setup_irq(TIMER_IRQ, &timer_irqaction); } /* diff -urN 2.3.46pre1/arch/alpha/mm/fault.c 2.3.46pre1aa1/arch/alpha/mm/fault.c --- 2.3.46pre1/arch/alpha/mm/fault.c Wed Nov 24 18:22:03 1999 +++ 2.3.46pre1aa1/arch/alpha/mm/fault.c Wed Feb 16 00:28:25 2000 @@ -130,13 +130,13 @@ * make sure we exit gracefully rather than endlessly redo * the fault. */ +survive: fault = handle_mm_fault(current, vma, address, cause > 0); - up(&mm->mmap_sem); - if (fault < 0) goto out_of_memory; if (fault == 0) goto do_sigbus; + up(&mm->mmap_sem); return; @@ -177,13 +177,23 @@ * us unable to handle the page fault gracefully. */ out_of_memory: - printk(KERN_ALERT "VM: killing process %s(%d)\n", - current->comm, current->pid); - if (!user_mode(regs)) - goto no_context; - do_exit(SIGKILL); + if (current->pid == 1) + { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + up(&mm->mmap_sem); + if (user_mode(regs)) + { + printk(KERN_ALERT "VM: killing process %s(%d)\n", + current->comm, current->pid); + do_exit(SIGKILL); + } + goto no_context; do_sigbus: + up(&mm->mmap_sem); /* * Send a sigbus, regardless of whether we were in kernel * or user mode. diff -urN 2.3.46pre1/arch/i386/kernel/irq.c 2.3.46pre1aa1/arch/i386/kernel/irq.c --- 2.3.46pre1/arch/i386/kernel/irq.c Fri Feb 11 00:05:32 2000 +++ 2.3.46pre1aa1/arch/i386/kernel/irq.c Wed Feb 16 00:28:25 2000 @@ -679,8 +679,24 @@ unsigned long delay; unsigned long val; + /* + * something may have generated an irq long ago and we want to + * flush such a longstanding irq before considering it as spurious. + */ + spin_lock_irq(&irq_controller_lock); + for (i = NR_IRQS-1; i > 0; i--) + if (!irq_desc[i].action) + irq_desc[i].handler->startup(i); + spin_unlock_irq(&irq_controller_lock); + + /* Wait for longstanding interrupts to trigger. */ + for (delay = jiffies + HZ/50; time_after(delay, jiffies); ) + /* about 20ms delay */ synchronize_irq(); + /* - * first, enable any unassigned irqs + * enable any unassigned irqs + * (we must startup again here because if a longstanding irq + * happened in the previous stage, it may have masked itself) */ spin_lock_irq(&irq_controller_lock); for (i = NR_IRQS-1; i > 0; i--) { diff -urN 2.3.46pre1/arch/i386/kernel/signal.c 2.3.46pre1aa1/arch/i386/kernel/signal.c --- 2.3.46pre1/arch/i386/kernel/signal.c Sun Jan 30 15:43:34 2000 +++ 2.3.46pre1aa1/arch/i386/kernel/signal.c Wed Feb 16 00:28:25 2000 @@ -419,13 +419,19 @@ ? 
current->exec_domain->signal_invmap[sig] : sig), &frame->sig); + if (err) + goto give_sigsegv; err |= setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); + if (err) + goto give_sigsegv; if (_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); } + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ @@ -486,6 +492,8 @@ err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= __copy_to_user(&frame->info, info, sizeof(*info)); + if (err) + goto give_sigsegv; /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); @@ -497,6 +505,8 @@ err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + if (err) + goto give_sigsegv; /* Set up to return from userspace. If provided, use a stub already in userspace. */ diff -urN 2.3.46pre1/arch/i386/mm/fault.c 2.3.46pre1aa1/arch/i386/mm/fault.c --- 2.3.46pre1/arch/i386/mm/fault.c Sun Jan 30 15:43:27 2000 +++ 2.3.46pre1aa1/arch/i386/mm/fault.c Wed Feb 16 00:28:25 2000 @@ -32,6 +32,7 @@ { struct vm_area_struct * vma; unsigned long start = (unsigned long) addr; + int fault; if (!size) return 1; @@ -51,8 +52,12 @@ start &= PAGE_MASK; for (;;) { - if (handle_mm_fault(current, vma, start, 1) <= 0) - goto bad_area; +survive: + fault = handle_mm_fault(current, vma, start, 1); + if (!fault) + goto do_sigbus; + if (fault < 0) + goto out_of_memory; if (!size) break; size--; @@ -75,6 +80,19 @@ bad_area: return 0; + +do_sigbus: + force_sig(SIGBUS, current); + goto bad_area; + +out_of_memory: + if (current->pid == 1) + { + current->policy |= SCHED_YIELD; + schedule(); + goto survive; + } + goto bad_area; } static void __init handle_wp_test (void) @@ -192,6 +210,7 @@ * make sure we exit gracefully rather than endlessly redo * the fault. */ +survive: { int fault = handle_mm_fault(tsk, vma, address, write); if (fault < 0) @@ -288,10 +307,39 @@ * us unable to handle the page fault gracefully. */ out_of_memory: + if (tsk->pid == 1) + { + tsk->policy |= SCHED_YIELD; + schedule(); + goto survive; + } up(&mm->mmap_sem); - printk("VM: killing process %s\n", tsk->comm); if (error_code & 4) - do_exit(SIGKILL); + { + if (tsk->oom_kill_try++ > 10 || + !((regs->eflags >> 12) & 3)) + { + printk(KERN_ALERT "VM: killing process %s\n", + tsk->comm); + do_exit(SIGKILL); + } + else + { + /* + * The task is running with privileges and so we + * trust it and give it a chance to die gracefully.
+ */ + printk(KERN_ALERT "VM: terminating process %s\n", + tsk->comm); + force_sig(SIGTERM, current); + if (tsk->oom_kill_try > 1) + { + tsk->policy |= SCHED_YIELD; + schedule(); + } + return; + } + } goto no_context; do_sigbus: diff -urN 2.3.46pre1/arch/m68k/atari/stram.c 2.3.46pre1aa1/arch/m68k/atari/stram.c --- 2.3.46pre1/arch/m68k/atari/stram.c Sun Jan 30 15:43:34 2000 +++ 2.3.46pre1aa1/arch/m68k/atari/stram.c Wed Feb 16 00:28:24 2000 @@ -1168,7 +1168,7 @@ { unsigned long start, len; - while( CURRENT ) { + while( !QUEUE_EMPTY ) { if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) panic("stram: request list destroyed"); if (CURRENT->bh) { diff -urN 2.3.46pre1/drivers/acorn/block/fd1772.c 2.3.46pre1aa1/drivers/acorn/block/fd1772.c --- 2.3.46pre1/drivers/acorn/block/fd1772.c Thu Jan 13 05:17:19 2000 +++ 2.3.46pre1aa1/drivers/acorn/block/fd1772.c Wed Feb 16 00:28:24 2000 @@ -591,7 +591,7 @@ { printk("FDC1772: fd_error\n"); /*panic("fd1772: fd_error"); *//* DAG tmp */ - if (!CURRENT) + if (QUEUE_EMPTY) return; CURRENT->errors++; if (CURRENT->errors >= MAX_ERRORS) { @@ -1230,14 +1230,14 @@ DPRINT(("redo_fd_request: CURRENT=%08lx CURRENT->rq_dev=%04x CURRENT->sector=%ld\n", (unsigned long) CURRENT, CURRENT ? CURRENT->rq_dev : 0, - CURRENT ? CURRENT->sector : 0)); + !QUEUE_EMPTY ? CURRENT->sector : 0)); - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE) + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE) goto the_end; repeat: - if (!CURRENT) + if (QUEUE_EMPTY) goto the_end; if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) diff -urN 2.3.46pre1/drivers/acorn/block/mfmhd.c 2.3.46pre1aa1/drivers/acorn/block/mfmhd.c --- 2.3.46pre1/drivers/acorn/block/mfmhd.c Sun Jan 30 15:43:27 2000 +++ 2.3.46pre1aa1/drivers/acorn/block/mfmhd.c Wed Feb 16 00:28:24 2000 @@ -758,7 +758,7 @@ /* No - its the end of the line */ /* end_request's should have happened at the end of sector DMAs */ /* Turns Drive LEDs off - may slow it down? 
*/ - if (!CURRENT) + if (QUEUE_EMPTY) issue_command(CMD_CKV, block, 2); Busy = 0; @@ -891,7 +891,7 @@ { DBG("mfm_request CURRENT=%p Busy=%d\n", CURRENT, Busy); - if (!CURRENT) { + if (QUEUE_EMPTY) { DBG("mfm_request: Exited due to NULL Current 1\n"); return; } @@ -918,7 +918,7 @@ DBG("mfm_request: before INIT_REQUEST\n"); - if (!CURRENT) { + if (QUEUE_EMPTY) { printk("mfm_request: Exiting due to !CURRENT (pre)\n"); CLEAR_INTR; Busy = 0; diff -urN 2.3.46pre1/drivers/block/Config.in 2.3.46pre1aa1/drivers/block/Config.in --- 2.3.46pre1/drivers/block/Config.in Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/block/Config.in Wed Feb 16 00:28:24 2000 @@ -198,6 +198,10 @@ comment 'Additional Block Devices' +tristate 'Logical volume manager (LVM) support' CONFIG_BLK_DEV_LVM N +if [ "$CONFIG_BLK_DEV_LVM" != "n" ]; then + bool ' LVM information in proc filesystem' CONFIG_LVM_PROC_FS Y +fi tristate 'Loopback device support' CONFIG_BLK_DEV_LOOP if [ "$CONFIG_NET" = "y" ]; then tristate 'Network block device support' CONFIG_BLK_DEV_NBD diff -urN 2.3.46pre1/drivers/block/DAC960.c 2.3.46pre1aa1/drivers/block/DAC960.c --- 2.3.46pre1/drivers/block/DAC960.c Sun Jan 30 15:43:37 2000 +++ 2.3.46pre1aa1/drivers/block/DAC960.c Wed Feb 16 00:28:24 2000 @@ -1010,16 +1010,19 @@ static int DAC_merge_fn(request_queue_t *q, struct request *req, - struct buffer_head *bh) + struct buffer_head *bh, int __max_segments) { int max_segments; DAC960_Controller_T * Controller = q->queuedata; max_segments = Controller->MaxSegmentsPerRequest[MINOR(req->rq_dev)]; + if (__max_segments < max_segments) + max_segments = __max_segments; if (req->bhtail->b_data + req->bhtail->b_size != bh->b_data) { if (req->nr_segments < max_segments) { req->nr_segments++; + q->nr_segments++; return 1; } return 0; @@ -1030,16 +1033,22 @@ static int DAC_merge_requests_fn(request_queue_t *q, struct request *req, - struct request *next) + struct request *next, + int __max_segments) { int max_segments; DAC960_Controller_T * Controller = q->queuedata; int total_segments = req->nr_segments + next->nr_segments; max_segments = Controller->MaxSegmentsPerRequest[MINOR(req->rq_dev)]; + if (__max_segments < max_segments) + max_segments = __max_segments; if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) + { total_segments--; + q->nr_segments--; + } if (total_segments > max_segments) return 0; @@ -1156,7 +1165,6 @@ blk_size[MajorNumber] = NULL; blksize_size[MajorNumber] = NULL; max_sectors[MajorNumber] = NULL; - max_segments[MajorNumber] = NULL; /* Remove the Generic Disk Information structure from the list. 
@@ -1305,15 +1313,17 @@ static boolean DAC960_ProcessRequest(DAC960_Controller_T *Controller, boolean WaitForCommand) { - IO_Request_T **RequestQueuePointer = - &blk_dev[DAC960_MAJOR + Controller->ControllerNumber].request_queue.current_request; + struct list_head * queue_head; IO_Request_T *Request; DAC960_Command_T *Command; char *RequestBuffer; + + queue_head = &blk_dev[DAC960_MAJOR + Controller->ControllerNumber].request_queue.queue_head; while (true) { - Request = *RequestQueuePointer; - if (Request == NULL || Request->rq_status == RQ_INACTIVE) return false; + if (list_empty(queue_head)) return false; + Request = blkdev_entry_next_request(queue_head); + if (Request->rq_status == RQ_INACTIVE) return false; Command = DAC960_AllocateCommand(Controller); if (Command != NULL) break; if (!WaitForCommand) return false; @@ -1335,7 +1345,7 @@ Command->BufferHeader = Request->bh; RequestBuffer = Request->buffer; Request->rq_status = RQ_INACTIVE; - *RequestQueuePointer = Request->next; + blkdev_dequeue_request(Request); wake_up(&wait_for_request); if (Command->SegmentCount == 1) { diff -urN 2.3.46pre1/drivers/block/Makefile 2.3.46pre1aa1/drivers/block/Makefile --- 2.3.46pre1/drivers/block/Makefile Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/block/Makefile Wed Feb 16 00:28:24 2000 @@ -326,6 +326,14 @@ endif endif +ifeq ($(CONFIG_BLK_DEV_LVM),y) +L_OBJS += lvm.o lvm-snap.o +else + ifeq ($(CONFIG_BLK_DEV_LVM),m) + M_OBJS += lvm-mod.o + endif +endif + ifeq ($(CONFIG_BLK_DEV_MD),y) LX_OBJS += md.o @@ -407,3 +415,6 @@ ide-probe-mod.o: ide-probe.o ide-geometry.o $(LD) $(LD_RFLAG) -r -o $@ ide-probe.o ide-geometry.o + +lvm-mod.o: lvm.o lvm-snap.o + $(LD) -r -o $@ lvm.o lvm-snap.o diff -urN 2.3.46pre1/drivers/block/README.lvm 2.3.46pre1aa1/drivers/block/README.lvm --- 2.3.46pre1/drivers/block/README.lvm Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/drivers/block/README.lvm Wed Feb 16 00:28:24 2000 @@ -0,0 +1,8 @@ + +This is the Logical Volume Manager driver for Linux. + +Tools and a library for managing logical volumes can be found +at . + +There you can also obtain current driver versions. + diff -urN 2.3.46pre1/drivers/block/acsi.c 2.3.46pre1aa1/drivers/block/acsi.c --- 2.3.46pre1/drivers/block/acsi.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/acsi.c Wed Feb 16 00:28:24 2000 @@ -769,7 +769,7 @@ static void bad_rw_intr( void ) { - if (!CURRENT) + if (QUEUE_EMPTY) return; if (++CURRENT->errors >= MAX_ERRORS) @@ -843,7 +843,7 @@ DEVICE_INTR = NULL; printk( KERN_ERR "ACSI timeout\n" ); - if (!CURRENT) return; + if (QUEUE_EMPTY) return; if (++CURRENT->errors >= MAX_ERRORS) { #ifdef DEBUG printk( KERN_ERR "ACSI: too many errors.\n" ); #endif @@ -953,7 +953,7 @@ unsigned long pbuffer; struct buffer_head *bh; - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE) { + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE) { if (!DEVICE_INTR) { ENABLE_IRQ(); stdma_release(); @@ -969,7 +969,7 @@ /* Another check here: An interrupt or timer event could have * happened since the last check!
*/ - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE) { + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE) { if (!DEVICE_INTR) { ENABLE_IRQ(); stdma_release(); @@ -979,7 +979,7 @@ if (DEVICE_INTR) return; - if (!CURRENT) { + if (QUEUE_EMPTY) { CLEAR_INTR; ENABLE_IRQ(); stdma_release(); diff -urN 2.3.46pre1/drivers/block/amiflop.c 2.3.46pre1aa1/drivers/block/amiflop.c --- 2.3.46pre1/drivers/block/amiflop.c Sun Jan 30 15:43:37 2000 +++ 2.3.46pre1aa1/drivers/block/amiflop.c Wed Feb 16 00:28:24 2000 @@ -1385,12 +1385,12 @@ char *data; unsigned long flags; - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE){ + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE){ return; } repeat: - if (!CURRENT) { + if (QUEUE_EMPTY) { /* Nothing left to do */ return; } diff -urN 2.3.46pre1/drivers/block/ataflop.c 2.3.46pre1aa1/drivers/block/ataflop.c --- 2.3.46pre1/drivers/block/ataflop.c Thu Jan 13 05:17:19 2000 +++ 2.3.46pre1aa1/drivers/block/ataflop.c Wed Feb 16 00:28:24 2000 @@ -624,7 +624,7 @@ return; } - if (!CURRENT) return; + if (QUEUE_EMPTY) return; CURRENT->errors++; if (CURRENT->errors >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); @@ -1450,18 +1450,18 @@ int device, drive, type; DPRINT(("redo_fd_request: CURRENT=%08lx CURRENT->dev=%04x CURRENT->sector=%ld\n", - (unsigned long)CURRENT, CURRENT ? CURRENT->rq_dev : 0, - CURRENT ? CURRENT->sector : 0 )); + (unsigned long)CURRENT, !QUEUE_EMPTY ? CURRENT->rq_dev : 0, + !QUEUE_EMPTY ? CURRENT->sector : 0 )); IsFormatting = 0; - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE){ + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE){ return; } repeat: - if (!CURRENT) + if (QUEUE_EMPTY) goto the_end; if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) diff -urN 2.3.46pre1/drivers/block/cpqarray.c 2.3.46pre1aa1/drivers/block/cpqarray.c --- 2.3.46pre1/drivers/block/cpqarray.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/cpqarray.c Wed Feb 16 00:28:24 2000 @@ -880,14 +880,16 @@ cmdlist_t *c; int seg, sect; char *lastdataend; - request_queue_t * q; + struct list_head * queue_head; struct buffer_head *bh; struct request *creq; - q = &blk_dev[MAJOR_NR+ctlr].request_queue; + queue_head = &blk_dev[MAJOR_NR+ctlr].request_queue.queue_head; - creq = q->current_request; - if (creq == NULL || creq->rq_status == RQ_INACTIVE) + if (list_empty(queue_head)) + goto doreq_done; + creq = blkdev_entry_next_request(queue_head); + if (creq->rq_status == RQ_INACTIVE) goto doreq_done; if (ctlr != MAJOR(creq->rq_dev)-MAJOR_NR || @@ -961,10 +963,9 @@ bh->b_reqnext = NULL; DBGPX( printk("More to do on same request %p\n", creq); ); } else { -DBGPX( printk("Done with %p, queueing %p\n", creq, creq->next); ); - creq->rq_status = RQ_INACTIVE; - q->current_request = creq->next; - wake_up(&wait_for_request); +DBGPX( printk("Done with %p\n", creq); ); + blkdev_dequeue_request(creq); + end_that_request_last(creq); } c->req.hdr.cmd = (creq->cmd == READ) ? 
IDA_READ : IDA_WRITE; diff -urN 2.3.46pre1/drivers/block/floppy.c 2.3.46pre1aa1/drivers/block/floppy.c --- 2.3.46pre1/drivers/block/floppy.c Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/block/floppy.c Wed Feb 16 00:28:24 2000 @@ -2274,7 +2274,7 @@ probing = 0; reschedule_timeout(MAXTIMEOUT, "request done %d", uptodate); - if (!CURRENT){ + if (QUEUE_EMPTY){ DPRINT("request list destroyed in floppy request done\n"); return; } @@ -2288,14 +2288,14 @@ DRS->maxtrack = 1; /* unlock chained buffers */ - while (current_count_sectors && CURRENT && + while (current_count_sectors && !QUEUE_EMPTY && current_count_sectors >= CURRENT->current_nr_sectors){ current_count_sectors -= CURRENT->current_nr_sectors; CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; end_request(1); } - if (current_count_sectors && CURRENT){ + if (current_count_sectors && !QUEUE_EMPTY){ /* "unlock" last subsector */ CURRENT->buffer += current_count_sectors <<9; CURRENT->current_nr_sectors -= current_count_sectors; @@ -2304,7 +2304,7 @@ return; } - if (current_count_sectors && !CURRENT) + if (current_count_sectors && QUEUE_EMPTY) DPRINT("request list destroyed in floppy request done\n"); } else { @@ -2867,14 +2867,14 @@ if (current_drive < N_DRIVE) floppy_off(current_drive); - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE){ + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE){ CLEAR_INTR; unlock_fdc(); return; } while(1){ - if (!CURRENT) { + if (QUEUE_EMPTY) { CLEAR_INTR; unlock_fdc(); return; diff -urN 2.3.46pre1/drivers/block/hd.c 2.3.46pre1aa1/drivers/block/hd.c --- 2.3.46pre1/drivers/block/hd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/hd.c Wed Feb 16 00:28:24 2000 @@ -145,7 +145,7 @@ unsigned long flags; char devc; - devc = CURRENT ? 'a' + DEVICE_NR(CURRENT->rq_dev) : '?'; + devc = !QUEUE_EMPTY ? 
'a' + DEVICE_NR(CURRENT->rq_dev) : '?'; save_flags (flags); sti(); #ifdef VERBOSE_ERRORS @@ -174,7 +174,7 @@ if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) { printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL), inb(HD_CURRENT) & 0xf, inb(HD_SECTOR)); - if (CURRENT) + if (!QUEUE_EMPTY) printk(", sector=%ld", CURRENT->sector); } printk("\n"); @@ -351,7 +351,7 @@ { int dev; - if (!CURRENT) + if (QUEUE_EMPTY) return; dev = DEVICE_NR(CURRENT->rq_dev); if (++CURRENT->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { @@ -414,7 +414,7 @@ #if (HD_DELAY > 0) last_req = read_timer(); #endif - if (CURRENT) + if (!QUEUE_EMPTY) hd_request(); return; } @@ -475,7 +475,7 @@ unsigned int dev; DEVICE_INTR = NULL; - if (!CURRENT) + if (QUEUE_EMPTY) return; disable_irq(HD_IRQ); sti(); @@ -522,7 +522,7 @@ { unsigned int dev, block, nsect, sec, track, head, cyl; - if (CURRENT && CURRENT->rq_status == RQ_INACTIVE) return; + if (!QUEUE_EMPTY && CURRENT->rq_status == RQ_INACTIVE) return; if (DEVICE_INTR) return; repeat: diff -urN 2.3.46pre1/drivers/block/ide.c 2.3.46pre1aa1/drivers/block/ide.c --- 2.3.46pre1/drivers/block/ide.c Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/block/ide.c Wed Feb 16 00:28:25 2000 @@ -501,8 +501,7 @@ if (!end_that_request_first(rq, uptodate, hwgroup->drive->name)) { add_blkdev_randomness(MAJOR(rq->rq_dev)); - hwgroup->drive->queue.current_request = rq->next; - blk_dev[MAJOR(rq->rq_dev)].request_queue.current_request = NULL; + blkdev_dequeue_request(rq); hwgroup->rq = NULL; end_that_request_last(rq); } @@ -772,8 +771,7 @@ } } spin_lock_irqsave(&io_request_lock, flags); - drive->queue.current_request = rq->next; - blk_dev[MAJOR(rq->rq_dev)].request_queue.current_request = NULL; + blkdev_dequeue_request(rq); HWGROUP(drive)->rq = NULL; rq->rq_status = RQ_INACTIVE; spin_unlock_irqrestore(&io_request_lock, flags); @@ -1076,7 +1074,7 @@ { ide_startstop_t startstop; unsigned long block, blockend; - struct request *rq = drive->queue.current_request; + struct request *rq = blkdev_entry_next_request(&drive->queue.queue_head); unsigned int minor = MINOR(rq->rq_dev), unit = minor >> PARTN_BITS; ide_hwif_t *hwif = HWIF(drive); @@ -1159,7 +1157,7 @@ best = NULL; drive = hwgroup->drive; do { - if (drive->queue.current_request && (!drive->sleep || 0 <= (signed long)(jiffies - drive->sleep))) { + if (!list_empty(&drive->queue.queue_head) && (!drive->sleep || 0 <= (signed long)(jiffies - drive->sleep))) { if (!best || (drive->sleep && (!best->sleep || 0 < (signed long)(best->sleep - drive->sleep))) || (!best->sleep && 0 < (signed long)(WAKEUP(best) - WAKEUP(drive)))) @@ -1247,8 +1245,6 @@ drive = hwgroup->drive; do { bdev = &blk_dev[HWIF(drive)->major]; - if( !bdev->request_queue.plugged ) - bdev->request_queue.current_request = NULL; /* (broken since patch-2.1.15) */ if (drive->sleep && (!sleep || 0 < (signed long)(sleep - drive->sleep))) sleep = drive->sleep; } while ((drive = drive->next) != hwgroup->drive); @@ -1288,7 +1284,7 @@ bdev = &blk_dev[hwif->major]; if ( bdev->request_queue.plugged ) /* FIXME: paranoia */ printk("%s: Huh? nuking plugged queue\n", drive->name); - bdev->request_queue.current_request = hwgroup->rq = drive->queue.current_request; + hwgroup->rq = blkdev_entry_next_request(&drive->queue.queue_head); /* * Some systems have trouble with IDE IRQs arriving while * the driver is still setting things up. 
So, here we disable @@ -1670,7 +1666,7 @@ rq->sem = NULL; rq->bh = NULL; rq->bhtail = NULL; - rq->next = NULL; + rq->q = NULL; } /* @@ -1703,7 +1699,7 @@ unsigned long flags; ide_hwgroup_t *hwgroup = HWGROUP(drive); unsigned int major = HWIF(drive)->major; - struct request *cur_rq; + struct list_head * queue_head; DECLARE_MUTEX_LOCKED(sem); #ifdef CONFIG_BLK_DEV_PDC4030 @@ -1716,20 +1712,17 @@ if (action == ide_wait) rq->sem = &sem; spin_lock_irqsave(&io_request_lock, flags); - cur_rq = drive->queue.current_request; - if (cur_rq == NULL || action == ide_preempt) { - rq->next = cur_rq; - drive->queue.current_request = rq; + queue_head = &drive->queue.queue_head; + if (list_empty(queue_head) || action == ide_preempt) { if (action == ide_preempt) hwgroup->rq = NULL; } else { if (action == ide_wait || action == ide_end) { - while (cur_rq->next != NULL) /* find end of list */ - cur_rq = cur_rq->next; - } - rq->next = cur_rq->next; - cur_rq->next = rq; + queue_head = queue_head->prev; + } else + queue_head = queue_head->next; } + list_add(&rq->queue, queue_head); ide_do_request(hwgroup, 0); spin_unlock_irqrestore(&io_request_lock, flags); if (action == ide_wait) { diff -urN 2.3.46pre1/drivers/block/ll_rw_blk.c 2.3.46pre1aa1/drivers/block/ll_rw_blk.c --- 2.3.46pre1/drivers/block/ll_rw_blk.c Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/block/ll_rw_blk.c Wed Feb 16 00:28:25 2000 @@ -3,6 +3,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics + * Elevator latency, (C) 2000 Andrea Arcangeli SuSE */ /* @@ -27,6 +28,8 @@ #include +#define DEBUG_ELEVATOR + /* * MAC Floppy IWM hooks */ @@ -147,6 +150,18 @@ return ret; } +static inline int get_request_latency(elevator_t * elevator, int rw) +{ + int latency; + + if (rw != READ) + latency = elevator->write_latency; + else + latency = elevator->read_latency; + + return latency; +} + void blk_cleanup_queue(request_queue_t * q) { memset(q, 0, sizeof(*q)); @@ -168,11 +183,12 @@ } static int ll_merge_fn(request_queue_t *q, struct request *req, - struct buffer_head *bh) + struct buffer_head *bh, int max_segments) { if (req->bhtail->b_data + req->bhtail->b_size != bh->b_data) { - if (req->nr_segments < MAX_SEGMENTS) { + if (req->nr_segments < max_segments) { req->nr_segments++; + q->nr_segments++; return 1; } return 0; @@ -181,14 +197,17 @@ } static int ll_merge_requests_fn(request_queue_t *q, struct request *req, - struct request *next) + struct request *next, int max_segments) { int total_segments = req->nr_segments + next->nr_segments; if (req->bhtail->b_data + req->bhtail->b_size == next->bh->b_data) + { total_segments--; + q->nr_segments--; + } - if (total_segments > MAX_SEGMENTS) + if (total_segments > max_segments) return 0; req->nr_segments = total_segments; @@ -197,8 +216,9 @@ void blk_init_queue(request_queue_t * q, request_fn_proc * rfn) { + INIT_LIST_HEAD(&q->queue_head); + q->elevator = ELEVATOR_DEFAULTS; q->request_fn = rfn; - q->current_request = NULL; q->merge_fn = ll_merge_fn; q->merge_requests_fn = ll_merge_requests_fn; q->make_request_fn = NULL; @@ -230,13 +250,15 @@ spin_unlock_irq(&io_request_lock); BUG(); } - if (q->current_request) + if (!list_empty(&q->queue_head)) return; q->plugged = 1; queue_task(&q->plug_tq, &tq_disk); } +void plug_device_noop(request_queue_t *q, kdev_t dev) { } + /* * remove the plug and let it rip.. */
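The elevator added here bounds starvation with per-queue latencies: get_request_latency() above picks read_latency or write_latency depending on the direction, and every request is stamped with a sequence deadline derived from it (elevator_sequence(), next hunk). A deadline is consumed in units of queue segments as later I/O passes the request. A rough standalone sketch of just that arithmetic; the field names follow the patch, the numeric values are invented for illustration:

    /* Sketch of the deadline arithmetic only (plain C, values invented).
     * A request stamped with deadline = sequence + latency may be passed
     * by at most "latency" worth of queue segments before it has to be
     * served; the per-queue sequence counter advances as I/O is queued. */
    #include <stdio.h>

    typedef struct {
            int sequence;           /* advances while requests are queued */
            int read_latency;       /* how far a READ may be bypassed */
            int write_latency;      /* same bound for WRITEs */
    } elevator_t;

    #define READ  0
    #define WRITE 1

    static int request_latency(elevator_t *e, int rw)
    {
            return rw == READ ? e->read_latency : e->write_latency;
    }

    int main(void)
    {
            elevator_t e = { 1000, 128, 8192 };     /* invented numbers */
            int deadline = e.sequence + request_latency(&e, READ);

            /* once e.sequence catches up with the deadline, the request
             * is "starving" and nothing may be placed in front of it */
            printf("READ queued at %d must run by sequence %d\n",
                   e.sequence, deadline);
            return 0;
    }

Giving reads a much smaller latency than writes is what keeps interactive reads responsive under a heavy write load.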
@@ -248,7 +270,7 @@ spin_lock_irqsave(&io_request_lock,flags); if (q->plugged) { q->plugged = 0; - if (q->current_request) + if (!list_empty(&q->queue_head)) (q->request_fn)(q); } spin_unlock_irqrestore(&io_request_lock,flags); @@ -298,7 +320,8 @@ add_wait_queue(&wait_for_request, &wait); for (;;) { - current->state = TASK_UNINTERRUPTIBLE; + /* FIFO wake-one wakeup will handle starvation even better */ + __set_current_state(TASK_UNINTERRUPTIBLE|TASK_EXCLUSIVE); spin_lock_irqsave(&io_request_lock,flags); req = get_request(n, dev); spin_unlock_irqrestore(&io_request_lock,flags); @@ -388,6 +411,109 @@ printk(KERN_ERR "drive_stat_acct: cmd not R/W?\n"); } +/* elevator */ + +static inline struct list_head * seek_to_not_starving_chunk(request_queue_t * q, + int * lat, int * starving) +{ + int sequence = q->elevator.sequence; + struct list_head * entry = q->queue_head.prev; + int pos = 0; + + do { + struct request * req = blkdev_entry_to_request(entry); + if (time_before_eq(req->elevator_sequence, sequence)) + { + *lat -= q->nr_segments - pos; + *starving = 1; + return entry; + } + pos += req->nr_segments; + } while ((entry = entry->prev) != &q->queue_head); + + *starving = 0; + + return entry->next; +} + +static inline void elevator_merge_requests(elevator_t * e, struct request * req, struct request * next) +{ + if (time_before(next->elevator_sequence, req->elevator_sequence)) + req->elevator_sequence = next->elevator_sequence; + if (req->cmd == READ) + e->read_pendings--; + +} + +static inline int elevator_sequence(elevator_t * e, int latency) +{ + return latency + e->sequence; +} + +#define elevator_merge_before(q, req, lat) __elevator_merge((q), (req), (lat), 0) +#define elevator_merge_after(q, req, lat) __elevator_merge((q), (req), (lat), 1) +static inline void __elevator_merge(request_queue_t * q, struct request * req, int latency, int after) +{ + int sequence = elevator_sequence(&q->elevator, latency); + if (after) + sequence -= req->nr_segments; + if (time_before(sequence, req->elevator_sequence)) + req->elevator_sequence = sequence; +} + +static inline void elevator_queue(request_queue_t * q, + struct request * req, + struct list_head * entry, + int latency, int starving) +{ + struct request * tmp, * __tmp; + int __latency = latency; + + __tmp = tmp = blkdev_entry_to_request(entry); + + for (;; tmp = blkdev_next_request(tmp)) + { + if ((latency -= tmp->nr_segments) <= 0) + { + tmp = __tmp; + latency = __latency; + + if (starving) + break; + + if (q->head_active && !q->plugged) + { + latency -= tmp->nr_segments; + break; + } + + list_add(&req->queue, &q->queue_head); + goto after_link; + } + + if (tmp->queue.next == &q->queue_head) + break; + + { + const int after_current = IN_ORDER(tmp,req); + const int before_next = IN_ORDER(req,blkdev_next_request(tmp)); + + if (!IN_ORDER(tmp,blkdev_next_request(tmp))) { + if (after_current || before_next) + break; + } else { + if (after_current && before_next) + break; + } + } + } + + list_add(&req->queue, &tmp->queue); + + after_link: + req->elevator_sequence = elevator_sequence(&q->elevator, latency); + } + /* * add-request adds a request to the linked list. * It disables interrupts (acquires the request spinlock) so that it can muck * which is important for drive_stat_acct() above.
*/ -static inline void __add_request(request_queue_t * q, struct request * req) +static inline void __add_request(request_queue_t * q, struct request * req, + int empty, struct list_head * entry, + int latency, int starving) { - int major = MAJOR(req->rq_dev); - struct request * tmp; + int major; drive_stat_acct(req, req->nr_sectors, 1); - req->next = NULL; - if (!(tmp = q->current_request)) { - q->current_request = req; + if (empty) { + req->elevator_sequence = elevator_sequence(&q->elevator, latency); + list_add(&req->queue, &q->queue_head); return; } - for ( ; tmp->next ; tmp = tmp->next) { - const int after_current = IN_ORDER(tmp,req); - const int before_next = IN_ORDER(req,tmp->next); - - if (!IN_ORDER(tmp,tmp->next)) { - if (after_current || before_next) - break; - } else { - if (after_current && before_next) - break; - } - } - req->next = tmp->next; - tmp->next = req; + elevator_queue(q, req, entry, latency, starving); /* * FIXME(eric) I don't understand why there is a need for this @@ -432,6 +546,7 @@ * I am leaving this in here until I hear back from the COMPAQ * people. */ + major = MAJOR(req->rq_dev); if (major >= COMPAQ_SMART2_MAJOR+0 && major <= COMPAQ_SMART2_MAJOR+7) { (q->request_fn)(q); @@ -448,12 +563,14 @@ */ static inline void attempt_merge (request_queue_t * q, struct request *req, - int max_sectors) + int max_sectors, + int max_segments) { - struct request *next = req->next; - - if (!next) + struct request *next; + + if (req->queue.next == &q->queue_head) return; + next = blkdev_next_request(req); if (req->sector + req->nr_sectors != next->sector) return; if (next->sem || req->cmd != next->cmd || req->rq_dev != next->rq_dev || req->nr_sectors + next->nr_sectors > max_sectors) @@ -464,25 +581,79 @@ * will have been updated to the appropriate number, * and we shouldn't do it here too. 
*/ - if(!(q->merge_requests_fn)(q, req, next)) + if(!(q->merge_requests_fn)(q, req, next, max_segments)) return; + elevator_merge_requests(&q->elevator, req, next); req->bhtail->b_reqnext = next->bh; req->bhtail = next->bhtail; req->nr_sectors += next->nr_sectors; next->rq_status = RQ_INACTIVE; - req->next = next->next; + list_del(&next->queue); wake_up (&wait_for_request); } +static inline void elevator_debug(request_queue_t * q, kdev_t dev) +{ +#ifdef DEBUG_ELEVATOR + int read_pendings = 0, nr_segments = 0; + elevator_t * elevator = &q->elevator; + struct list_head * entry = &q->queue_head; + static int counter; + + if (counter++ % 100) + return; + + while ((entry = entry->next) != &q->queue_head) + { + struct request * req; + + req = blkdev_entry_to_request(entry); + if (!req->q) + continue; + if (req->cmd == READ) + read_pendings++; + nr_segments += req->nr_segments; + } + + if (read_pendings != elevator->read_pendings) + { + printk(KERN_WARNING + "%s: elevator read_pendings %d should be %d\n", + kdevname(dev), elevator->read_pendings, + read_pendings); + elevator->read_pendings = read_pendings; + } + if (nr_segments != q->nr_segments) + { + printk(KERN_WARNING + "%s: elevator nr_segments %d should be %d\n", + kdevname(dev), q->nr_segments, + nr_segments); + q->nr_segments = nr_segments; + } +#endif +} + +static inline void elevator_account_request(request_queue_t * q, struct request * req) +{ + q->elevator.sequence++; + if (req->cmd == READ) + q->elevator.read_pendings++; + q->nr_segments++; +} +
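elevator_account_request() above is what keeps the starvation check cheap: the queue maintains a running sequence number, a count of pending reads, and a total segment count, and seek_to_not_starving_chunk() (earlier hunk) walks the queue from the tail looking for the first request whose stamped sequence has already expired. A simplified standalone model of that scan, using a plain array instead of the kernel's list_head so it compiles on its own:

    /* Simplified model of the starvation scan (plain C). Walk the queue
     * from the tail towards the head and stop at the first request whose
     * elevator_sequence has already been reached; new I/O must not be
     * inserted or merged in front of it. Values are invented. */
    #include <stdio.h>

    struct req { int elevator_sequence; };

    static int first_starving(struct req *q, int n, int sequence)
    {
            int i;
            for (i = n - 1; i >= 0; i--)            /* tail towards head */
                    if (q[i].elevator_sequence <= sequence)
                            return i;               /* this one is starving */
            return -1;                              /* nobody is starving */
    }

    int main(void)
    {
            struct req q[4] = { {90}, {105}, {140}, {200} };
            int idx = first_starving(q, 4, 100);    /* current sequence: 100 */
            printf("starving request at index %d\n", idx);  /* prints 0 */
            return 0;
    }

The real code additionally converts the scan position into a remaining-latency budget, which is why it tracks nr_segments alongside the list walk.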
static inline void __make_request(request_queue_t * q, int rw, struct buffer_head * bh) { int major = MAJOR(bh->b_rdev); unsigned int sector, count; - struct request * req; + int max_segments = MAX_SEGMENTS; + struct request * req, * prev; int rw_ahead, max_req, max_sectors; unsigned long flags; + int orig_latency, latency, __latency, starving, __starving, empty; + struct list_head * entry, * __entry; count = bh->b_size >> 9; sector = bh->b_rsector; @@ -569,13 +740,18 @@ */ max_sectors = get_max_sectors(bh->b_rdev); + __latency = orig_latency = get_request_latency(&q->elevator, rw); + /* * Now we acquire the request spinlock, we have to be mega careful * not to schedule or do something nonatomic */ spin_lock_irqsave(&io_request_lock,flags); - req = q->current_request; - if (!req) { + elevator_debug(q, bh->b_rdev); + + empty = 0; + if (list_empty(&q->queue_head)) { + empty = 1; /* MD and loop can't handle plugging without deadlocking */ if (q->plug_device_fn) q->plug_device_fn(q, bh->b_rdev); /* is atomic */ @@ -584,6 +760,17 @@ goto get_rq; } + /* avoid write-bombs to not hurt interactiveness of reads */ + if (rw != READ && q->elevator.read_pendings) + max_segments = q->elevator.max_bomb_segments; + + entry = seek_to_not_starving_chunk(q, &__latency, &starving); + + __entry = entry; + __starving = starving; + + latency = __latency; + if (q->head_active && !q->plugged) { /* * The scsi disk and cdrom drivers completely remove the request * entry may be busy being processed and we thus can't change * it. */ - if ((req = req->next) == NULL) - goto get_rq; + if (entry == q->queue_head.next) { + latency -= blkdev_entry_to_request(entry)->nr_segments; + if ((entry = entry->next) == &q->queue_head) + goto get_rq; + starving = 0; + } } + prev = NULL; do { + req = blkdev_entry_to_request(entry); + if (req->sem) continue; if (req->cmd != rw) continue; @@ -610,6 +804,8 @@ continue; /* Can we add it to the end of this request? */ if (req->sector + req->nr_sectors == sector) { + if (latency - req->nr_segments < 0) + break; /* * The merge_fn is a more advanced way * of accomplishing the same task. Instead * may suggest that we shouldn't merge * this */ - if(!(q->merge_fn)(q, req, bh)) + if(!(q->merge_fn)(q, req, bh, max_segments)) continue; req->bhtail->b_reqnext = bh; req->bhtail = bh; req->nr_sectors += count; drive_stat_acct(req, count, 0); + + elevator_merge_after(q, req, latency); + /* Can we now merge this req with the next? */ - attempt_merge(q, req, max_sectors); + attempt_merge(q, req, max_sectors, max_segments); /* or to the beginning? */ } else if (req->sector - count == sector) { + if (!prev && starving) + continue; /* * The merge_fn is a more advanced way * of accomplishing the same task. Instead * may suggest that we shouldn't merge * this */ - if(!(q->merge_fn)(q, req, bh)) + if(!(q->merge_fn)(q, req, bh, max_segments)) continue; bh->b_reqnext = req->bh; req->bh = bh; req->buffer = bh->b_data; req->current_nr_sectors = count; req->sector = sector; req->nr_sectors += count; drive_stat_acct(req, count, 0); + + elevator_merge_before(q, req, latency); + + if (prev) + attempt_merge(q, prev, max_sectors, max_segments); } else continue; + q->elevator.sequence++; spin_unlock_irqrestore(&io_request_lock,flags); return; - } while ((req = req->next) != NULL); + } while (prev = req, + (latency -= req->nr_segments) >= 0 && + (entry = entry->next) != &q->queue_head); /* find an unused request. */ get_rq: @@ -675,6 +884,14 @@ goto end_io; req = __get_request_wait(max_req, bh->b_rdev); spin_lock_irqsave(&io_request_lock,flags); + + /* lock got dropped so revalidate elevator */ + empty = 1; + if (!list_empty(&q->queue_head)) { + empty = 0; + __latency = orig_latency; + __entry = seek_to_not_starving_chunk(q, &__latency, &__starving); + } } /* * Don't start the IO if the buffer has been @@ -707,8 +924,10 @@ req->sem = NULL; req->bh = bh; req->bhtail = bh; - req->next = NULL; - __add_request(q, req); + req->q = q; + __add_request(q, req, empty, __entry, __latency, __starving); + elevator_account_request(q, req); + spin_unlock_irqrestore(&io_request_lock, flags); return; @@ -867,6 +1086,8 @@ void end_that_request_last(struct request *req) { + if (req->q) + BUG(); if (req->sem != NULL) up(req->sem); req->rq_status = RQ_INACTIVE; @@ -886,7 +1107,6 @@ req = all_requests + NR_REQUEST; while (--req >= all_requests) { req->rq_status = RQ_INACTIVE; - req->next = NULL; } memset(ro_bits,0,sizeof(ro_bits)); memset(max_readahead, 0, sizeof(max_readahead)); @@ -977,6 +1197,9 @@ #ifdef CONFIG_SJCD sjcd_init(); #endif CONFIG_SJCD +#ifdef CONFIG_BLK_DEV_LVM + lvm_init(); +#endif #ifdef CONFIG_BLK_DEV_MD md_init(); #endif CONFIG_BLK_DEV_MD diff -urN 2.3.46pre1/drivers/block/loop.c 2.3.46pre1aa1/drivers/block/loop.c --- 2.3.46pre1/drivers/block/loop.c Tue Feb 15 03:06:47 2000 +++ 2.3.46pre1aa1/drivers/block/loop.c Wed Feb 16 00:28:25 2000 @@ -277,7 +277,7 @@ repeat: INIT_REQUEST; current_request=CURRENT; - CURRENT=current_request->next; + blkdev_dequeue_request(current_request); if (MINOR(current_request->rq_dev) >= max_loop) goto error_out; lo = &loop_dev[MINOR(current_request->rq_dev)]; @@ -375,15 +375,13 @@ spin_lock_irq(&io_request_lock); current_request->sector += current_request->current_nr_sectors; current_request->nr_sectors -= current_request->current_nr_sectors; - current_request->next=CURRENT; - CURRENT=current_request; + list_add(&current_request->queue, &current_request->q->queue_head); end_request(1); goto repeat; error_out_lock: spin_lock_irq(&io_request_lock); error_out: - current_request->next=CURRENT; - CURRENT=current_request; + list_add(&current_request->queue, &current_request->q->queue_head); end_request(0); goto repeat; } @@ -790,6 +788,7 @@ } blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); + blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); for (i=0; i < max_loop; i++) { memset(&loop_dev[i], 0, sizeof(struct loop_device)); loop_dev[i].lo_number = i;
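Because requests now live on a list, loop.c can no longer relink a partially served request through rq->next: it dequeues the request up front and, when sectors remain (or on error), pushes what is left back onto the head of its queue with list_add(); blk_queue_headactive(..., 0) tells the block layer not to assume the head request is currently being serviced. A plain-C model of that push-back, with stand-in list helpers rather than the kernel's:

    /* Plain-C model of the loop driver's requeue: take the request off
     * the queue, advance it, and push the remainder back on the head.
     * Minimal list helpers again, for illustration only. */
    #include <stdio.h>

    struct list_head { struct list_head *next, *prev; };

    static void list_add(struct list_head *entry, struct list_head *head)
    {
            entry->prev = head;
            entry->next = head->next;
            head->next->prev = entry;
            head->next = entry;
    }

    static void list_del(struct list_head *entry)
    {
            entry->prev->next = entry->next;
            entry->next->prev = entry->prev;
    }

    struct request {
            long sector, nr_sectors, current_nr_sectors;
            struct list_head queue;
    };

    int main(void)
    {
            struct list_head queue_head = { &queue_head, &queue_head };
            struct request rq = { 0, 8, 2, { 0, 0 } };

            list_add(&rq.queue, &queue_head);       /* initially queued */

            list_del(&rq.queue);                    /* blkdev_dequeue_request() */
            rq.sector += rq.current_nr_sectors;     /* part of it is done... */
            rq.nr_sectors -= rq.current_nr_sectors;
            list_add(&rq.queue, &queue_head);       /* ...requeue the rest */

            printf("requeued at sector %ld, %ld sectors left\n",
                   rq.sector, rq.nr_sectors);
            return 0;
    }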
diff -urN 2.3.46pre1/drivers/block/lvm-snap.c 2.3.46pre1aa1/drivers/block/lvm-snap.c --- 2.3.46pre1/drivers/block/lvm-snap.c Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/drivers/block/lvm-snap.c Wed Feb 16 00:28:24 2000 @@ -0,0 +1,414 @@ +/* linux/drivers/block/lvm-snap.c + + Copyright (C) 2000 Andrea Arcangeli SuSE + + LVM snapshotting */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include <linux/blkdev.h> +#include <linux/iobuf.h> +#include <linux/lvm.h> + + +extern const char *const lvm_name; +extern int lvm_blocksizes[]; + +void lvm_snapshot_release(lv_t *); + +#define hashfn(dev,block,mask,chunk_size) \ + ((HASHDEV(dev)^((block)/(chunk_size))) & (mask)) + +static inline lv_block_exception_t * +lvm_find_exception_table(kdev_t org_dev, unsigned long org_start, lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table, * next; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * ret; + int i = 0; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + ret = NULL; + for (next = hash_table->next; next != hash_table; next = next->next) + { + lv_block_exception_t * exception; + + exception = list_entry(next, lv_block_exception_t, hash); + if (exception->rsector_org == org_start && + exception->rdev_org == org_dev) + { + if (i) + { + /* fun, isn't it? :) */ + list_del(next); + list_add(next, hash_table); + } + ret = exception; + break; + } + i++; + } + return ret; +} + +static inline void lvm_hash_link(lv_block_exception_t * exception, + kdev_t org_dev, unsigned long org_start, + lv_t * lv) +{ + struct list_head * hash_table = lv->lv_snapshot_hash_table; + unsigned long mask = lv->lv_snapshot_hash_mask; + int chunk_size = lv->lv_chunk_size; + + hash_table = &hash_table[hashfn(org_dev, org_start, mask, chunk_size)]; + list_add(&exception->hash, hash_table); +} + +int lvm_snapshot_remap_block(kdev_t * org_dev, unsigned long * org_sector, + unsigned long pe_start, lv_t * lv) +{ + int ret; + unsigned long pe_off, pe_adjustment, __org_start; + kdev_t __org_dev; + int chunk_size = lv->lv_chunk_size; + lv_block_exception_t * exception; + + pe_off = pe_start % chunk_size; + pe_adjustment = (*org_sector-pe_off) % chunk_size; + __org_start = *org_sector - pe_adjustment; + __org_dev = *org_dev; + + ret = 0; + exception = lvm_find_exception_table(__org_dev, __org_start, lv); + if (exception) + { + *org_dev = exception->rdev_new; + *org_sector = exception->rsector_new + pe_adjustment; + ret = 1; + } + return ret; +}
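lvm_snapshot_remap_block() above decides whether a block already has a copy-on-write exception: the faulting sector is rounded down to its chunk boundary (taking the physical-extent offset pe_off into account), and that chunk address is looked up in the exception hash. The rounding is the subtle part; a standalone sketch of it with invented numbers:

    /* Sketch of the remap arithmetic only (plain C, invented values).
     * chunk_size is in 512-byte sectors; chunks are aligned relative to
     * the start of the physical extent, not to sector zero. */
    #include <stdio.h>

    int main(void)
    {
            unsigned long pe_start = 100, sector = 10000;   /* invented */
            unsigned long chunk_size = 64;          /* sectors per chunk */

            unsigned long pe_off = pe_start % chunk_size;           /* 36 */
            unsigned long pe_adjustment = (sector - pe_off) % chunk_size;
            unsigned long chunk_start = sector - pe_adjustment;

            printf("chunk starts at sector %lu, offset in chunk %lu\n",
                   chunk_start, pe_adjustment);     /* 9956 and 44 */
            /* hash key, as in hashfn():
             * (dev ^ (chunk_start / chunk_size)) & hash_mask */
            return 0;
    }

On a hit, the same in-chunk offset (pe_adjustment) is simply re-applied on the snapshot device, so only whole chunks ever need to be copied.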
 +static void lvm_drop_snapshot(lv_t * lv_snap, const char * reason) +{ + kdev_t last_dev; + int i; + + /* no exception storage space available for this snapshot + or error on this snapshot --> release it */ + invalidate_buffers(lv_snap->lv_dev); + + for (i = last_dev = 0; i < lv_snap->lv_remap_ptr; i++) { + if ( lv_snap->lv_block_exception[i].rdev_new != last_dev) { + last_dev = lv_snap->lv_block_exception[i].rdev_new; + invalidate_buffers(last_dev); + } + } + + lvm_snapshot_release(lv_snap); + + printk(KERN_INFO + "%s -- giving up snapshotting %s on %s due to %s\n", + lvm_name, lv_snap->lv_snapshot_org->lv_name, lv_snap->lv_name, + reason); +} + +static inline void lvm_snapshot_prepare_blocks(unsigned long * blocks, + unsigned long start, + int nr_sectors, + int blocksize) +{ + int i, sectors_per_block, nr_blocks; + + sectors_per_block = blocksize >> 9; + nr_blocks = nr_sectors / sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr_blocks; i++) + blocks[i] = start++; +} + +static inline int get_blksize(kdev_t dev) +{ + int correct_size = BLOCK_SIZE, i, major; + + major = MAJOR(dev); + if (blksize_size[major]) + { + i = blksize_size[major][MINOR(dev)]; + if (i) + correct_size = i; + } + return correct_size; +} + +#ifdef DEBUG_SNAPSHOT +static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, + kdev_t dev) +{ + struct buffer_head * bh; + int sectors_per_block, i, blksize, minor; + + minor = MINOR(dev); + blksize = lvm_blocksizes[minor]; + sectors_per_block = blksize >> 9; + nr /= sectors_per_block; + start /= sectors_per_block; + + for (i = 0; i < nr; i++) + { + bh = get_hash_table(dev, start++, blksize); + if (bh) + bforget(bh); + } +} +#endif + +/* + * copy on write handler for one snapshot logical volume + * + * read the original blocks and store them on the new one(s). + * if there is no free exception storage space left --> release the snapshot. + * + * this routine gets called for each _first_ write to a physical chunk. + */ +int lvm_snapshot_COW(kdev_t org_phys_dev, + unsigned long org_phys_sector, + unsigned long org_pe_start, + unsigned long org_virt_sector, + lv_t * lv_snap) +{ + const char * reason; + unsigned long org_start, snap_start, snap_phys_dev, virt_start, pe_off; + int idx = lv_snap->lv_remap_ptr, chunk_size = lv_snap->lv_chunk_size; + struct kiobuf * iobuf; + unsigned long blocks[KIO_MAX_SECTORS]; + int blksize_snap, blksize_org, min_blksize, max_blksize; + int max_sectors, nr_sectors; + + /* check if we are out of snapshot space */ + if (idx >= lv_snap->lv_remap_end) + goto fail_out_of_space; + + /* calculate physical boundaries of source chunk */ + pe_off = org_pe_start % chunk_size; + org_start = org_phys_sector - ((org_phys_sector-pe_off) % chunk_size); + virt_start = org_virt_sector - (org_phys_sector - org_start); + + /* calculate physical boundaries of destination chunk */ + snap_phys_dev = lv_snap->lv_block_exception[idx].rdev_new; + snap_start = lv_snap->lv_block_exception[idx].rsector_new; + +#ifdef DEBUG_SNAPSHOT + printk(KERN_INFO + "%s -- COW: " + "org %02d:%02d faulting %lu start %lu, " + "snap %02d:%02d start %lu, " + "size %d, pe_start %lu pe_off %lu, virt_sec %lu\n", + lvm_name, + MAJOR(org_phys_dev), MINOR(org_phys_dev), org_phys_sector, + org_start, + MAJOR(snap_phys_dev), MINOR(snap_phys_dev), snap_start, + chunk_size, + org_pe_start, pe_off, + org_virt_sector); +#endif + + iobuf = lv_snap->lv_iobuf; + + blksize_org = get_blksize(org_phys_dev); + blksize_snap = get_blksize(snap_phys_dev); + max_blksize = max(blksize_org, blksize_snap); + min_blksize = min(blksize_org, blksize_snap); + max_sectors = KIO_MAX_SECTORS * (min_blksize>>9); + + if (chunk_size % (max_blksize>>9)) + goto fail_blksize; + + while (chunk_size) + { + nr_sectors = min(chunk_size, max_sectors); + chunk_size -= nr_sectors; + + iobuf->length = nr_sectors << 9; + + lvm_snapshot_prepare_blocks(blocks, org_start, + nr_sectors, blksize_org); + if (brw_kiovec(READ, 1, &iobuf, org_phys_dev, + blocks, blksize_org) != (nr_sectors<<9)) + goto fail_raw_read; + + lvm_snapshot_prepare_blocks(blocks, snap_start, + nr_sectors, blksize_snap); + if (brw_kiovec(WRITE, 1, &iobuf,
snap_phys_dev, + blocks, blksize_snap) != (nr_sectors<<9)) + goto fail_raw_write; + } + +#ifdef DEBUG_SNAPSHOT + /* invalidate the logical snapshot buffer cache */ + invalidate_snap_cache(virt_start, lv_snap->lv_chunk_size, + lv_snap->lv_dev); +#endif + + /* the original chunk is now stored on the snapshot volume + so update the exception table */ + lv_snap->lv_block_exception[idx].rdev_org = org_phys_dev; + lv_snap->lv_block_exception[idx].rsector_org = org_start; + lvm_hash_link(lv_snap->lv_block_exception + idx, + org_phys_dev, org_start, lv_snap); + lv_snap->lv_remap_ptr = idx + 1; + return 0; + + /* slow path */ + out: + lvm_drop_snapshot(lv_snap, reason); + return 1; + + fail_out_of_space: + reason = "out of space"; + goto out; + fail_raw_read: + reason = "read error"; + goto out; + fail_raw_write: + reason = "write error"; + goto out; + fail_blksize: + reason = "blocksize error"; + goto out; +} + +static int lvm_snapshot_alloc_iobuf_pages(struct kiobuf * iobuf, int sectors) +{ + int bytes, nr_pages, err, i; + + bytes = sectors << 9; + nr_pages = (bytes + ~PAGE_MASK) >> PAGE_SHIFT; + err = expand_kiobuf(iobuf, nr_pages); + if (err) + goto out; + + err = -ENOMEM; + iobuf->locked = 1; + iobuf->nr_pages = 0; + for (i = 0; i < nr_pages; i++) + { + struct page * page; + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,3,27) + page = alloc_page(GFP_KERNEL); + if (!page) + goto out; +#else + { + unsigned long addr = __get_free_page(GFP_USER); + if (!addr) + goto out; + iobuf->pagelist[i] = addr; + page = mem_map + MAP_NR(addr); + } +#endif + + iobuf->maplist[i] = page; + /* the only point to lock the page here is to be allowed + to share unmap_kiobuf() in the fail-path */ +#ifndef LockPage +#define LockPage(map) set_bit(PG_locked, &(map)->flags) +#endif + LockPage(page); + iobuf->nr_pages++; + } + iobuf->offset = 0; + + err = 0; + out: + return err; +} + +static int calc_max_buckets(void) +{ + unsigned long mem; + + mem = num_physpages << PAGE_SHIFT; + mem /= 100; + mem *= 2; + mem /= sizeof(struct list_head); + + return mem; +} + +static int lvm_snapshot_alloc_hash_table(lv_t * lv) +{ + int err; + unsigned long buckets, max_buckets, size; + struct list_head * hash; + + buckets = lv->lv_remap_end; + max_buckets = calc_max_buckets(); + buckets = min(buckets, max_buckets); + while (buckets & (buckets-1)) + buckets &= (buckets-1); + + size = buckets * sizeof(struct list_head); + + err = -ENOMEM; + hash = vmalloc(size); + lv->lv_snapshot_hash_table = hash; + + if (!hash) + goto out; + + lv->lv_snapshot_hash_mask = buckets-1; + while (buckets--) + INIT_LIST_HEAD(hash+buckets); + err = 0; + out: + return err; +} + +int lvm_snapshot_alloc(lv_t * lv_snap) +{ + int err, blocksize, max_sectors; + + err = alloc_kiovec(1, &lv_snap->lv_iobuf); + if (err) + goto out; + + blocksize = lvm_blocksizes[MINOR(lv_snap->lv_dev)]; + max_sectors = KIO_MAX_SECTORS << (PAGE_SHIFT-9); + + err = lvm_snapshot_alloc_iobuf_pages(lv_snap->lv_iobuf, max_sectors); + if (err) + goto out_free_kiovec; + + err = lvm_snapshot_alloc_hash_table(lv_snap); + if (err) + goto out_free_kiovec; + out: + return err; + + out_free_kiovec: + unmap_kiobuf(lv_snap->lv_iobuf); + free_kiovec(1, &lv_snap->lv_iobuf); + goto out; +} + +void lvm_snapshot_release(lv_t * lv) +{ + if (lv->lv_block_exception) + { + vfree(lv->lv_block_exception); + lv->lv_block_exception = NULL; + } + if (lv->lv_snapshot_hash_table) + { + vfree(lv->lv_snapshot_hash_table); + lv->lv_snapshot_hash_table = NULL; + } + if (lv->lv_iobuf) + { + free_kiovec(1, &lv->lv_iobuf); + lv->lv_iobuf = NULL; + } +}
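The hash table sizing in lvm_snapshot_alloc_hash_table() is worth spelling out: ideally one bucket per remappable chunk, capped by calc_max_buckets() at roughly 2% of physical memory, then rounded down to a power of two so lookups can mask instead of divide. The same arithmetic as a standalone example (memory size and remap count are invented):

    /* Standalone version of the bucket-sizing policy (plain C).
     * sizeof(void *) * 2 stands in for sizeof(struct list_head). */
    #include <stdio.h>

    int main(void)
    {
            unsigned long mem = 64UL << 20;          /* pretend 64 MB RAM */
            unsigned long entry = sizeof(void *) * 2;
            unsigned long max_buckets = mem / 100 * 2 / entry;  /* ~2% cap */
            unsigned long buckets = 100000;          /* lv_remap_end */

            if (buckets > max_buckets)
                    buckets = max_buckets;
            while (buckets & (buckets - 1))          /* round down to 2^n */
                    buckets &= buckets - 1;

            printf("%lu buckets, mask 0x%lx\n", buckets, buckets - 1);
            return 0;
    }

With the example numbers this settles on 65536 buckets and a mask of 0xffff, which is exactly the lv_snapshot_hash_mask form hashfn() expects.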
 diff -urN 2.3.46pre1/drivers/block/lvm.c 2.3.46pre1aa1/drivers/block/lvm.c --- 2.3.46pre1/drivers/block/lvm.c Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/drivers/block/lvm.c Wed Feb 16 00:28:24 2000 @@ -0,0 +1,2616 @@ +/* + * kernel/lvm.c + * + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany + * + * February-November 1997 + * April-May,July-August,November 1998 + * January-March,May,July,September,October 1999 + * + * + * LVM driver is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * LVM driver is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +/* + * Changelog + * + * 09/11/1997 - added chr ioctls VG_STATUS_GET_COUNT + * and VG_STATUS_GET_NAMELIST + * 18/01/1998 - change lvm_chr_open/close lock handling + * 30/04/1998 - changed LV_STATUS ioctl to LV_STATUS_BYNAME and + * - added LV_STATUS_BYINDEX ioctl + * - used lvm_status_byname_req_t and + * lvm_status_byindex_req_t vars + * 04/05/1998 - added multiple device support + * 08/05/1998 - added support to set/clear extendable flag in volume group + * 09/05/1998 - changed output of lvm_proc_get_info() because of + * support for free (eg. longer) logical volume names + * 12/05/1998 - added spin_locks (thanks to Pascal van Dam + * ) + * 25/05/1998 - fixed handling of locked PEs in lvm_map() and lvm_chr_ioctl() + * 26/05/1998 - reactivated verify_area by access_ok + * 07/06/1998 - used vmalloc/vfree instead of kmalloc/kfree to go + * beyond 128/256 KB max allocation limit per call + * - #ifdef blocked spin_lock calls to avoid compile errors + * with 2.0.x + * 11/06/1998 - another enhancement to spinlock code in lvm_chr_open() + * and use of LVM_VERSION_CODE instead of my own macros + * (thanks to Michael Marxmeier ) + * 07/07/1998 - added statistics in lvm_map() + * 08/07/1998 - saved statistics in do_lv_extend_reduce() + * 25/07/1998 - used __initfunc macro + * 02/08/1998 - changes for official char/block major numbers + * 07/08/1998 - avoided init_module() and cleanup_module() to be static + * 30/08/1998 - changed VG lv_open counter from sum of LV lv_open counters + * to sum of LVs open (no matter how often each is) + * 01/09/1998 - fixed lvm_gendisk.part[] index error + * 07/09/1998 - added copying of lv_current_pe-array + * in LV_STATUS_BYINDEX ioctl + * 17/11/1998 - added KERN_* levels to printk + * 13/01/1999 - fixed LV index bug in do_lv_create() which hit lvrename + * 07/02/1999 - fixed spinlock handling bug in case of LVM_RESET + * by moving spinlock code from lvm_chr_open() + * to lvm_chr_ioctl() + * - added LVM_LOCK_LVM ioctl to lvm_chr_ioctl() + * - allowed LVM_RESET and retrieval commands to go ahead; + * only other update ioctls are blocked now + * - fixed pv->pe to NULL for pv_status + * - using lv_req structure in lvm_chr_ioctl() now + * - fixed NULL ptr reference bug in do_lv_extend_reduce() + * caused by uncontiguous PV array in lvm_chr_ioctl(VG_REDUCE) + * 09/02/1999 - changed BLKRASET and BLKRAGET in
lvm_chr_ioctl() to + * handle logical volume private read ahead sector + * - implemented LV read_ahead handling with lvm_blk_read() + * and lvm_blk_write() + * 10/02/1999 - implemented 2.[12].* support function lvm_hd_name() + * to be used in drivers/block/genhd.c by disk_name() + * 12/02/1999 - fixed index bug in lvm_blk_ioctl(), HDIO_GETGEO + * - enhanced gendisk insert/remove handling + * 16/02/1999 - changed to dynamic block minor number allocation to + * have as many as 99 volume groups with 256 logical volumes + * as the grand total; this allows having 1 volume group with + * up to 256 logical volumes in it + * 21/02/1999 - added LV open count information to proc filesystem + * - substituted redundant LVM_RESET code by calls + * to do_vg_remove() + * 22/02/1999 - used schedule_timeout() to be more responsive + * in case of do_vg_remove() with lots of logical volumes + * 19/03/1999 - fixed NULL pointer bug in module_init/lvm_init + * 17/05/1999 - used DECLARE_WAIT_QUEUE_HEAD macro (>2.3.0) + * - enhanced lvm_hd_name support + * 03/07/1999 - avoided use of KERNEL_VERSION macro based ifdefs and + * memcpy_tofs/memcpy_fromfs macro redefinitions + * 06/07/1999 - corrected reads/writes statistic counter copy in case + * of striped logical volume + * 28/07/1999 - implemented snapshot logical volumes + * - lvm_chr_ioctl + * - LV_STATUS_BYINDEX + * - LV_STATUS_BYNAME + * - do_lv_create + * - do_lv_remove + * - lvm_map + * - new lvm_snapshot_remap_block + * - new lvm_snapshot_remap_new_block + * 08/10/1999 - implemented support for multiple snapshots per + * original logical volume + * 12/10/1999 - support for 2.3.19 + * 11/11/1999 - support for 2.3.28 + * 21/11/1999 - changed lvm_map() interface to buffer_head based + * 19/12/1999 - support for 2.3.33 + * 01/01/2000 - changed locking concept in lvm_map(), + * do_vg_create() and do_lv_remove() + * + */ + + +/* + * TODO + * + * - implement special handling of unavailable physical volumes + * + */ + +char *lvm_version = "LVM version 0.8e by Heinz Mauelshagen (4/1/2000)\n"; +char *lvm_short_version = "version 0.8e (4/1/2000)"; + +#define MAJOR_NR LVM_BLK_MAJOR +#define DEVICE_OFF(device) + +#include +#include + +#ifdef MODVERSIONS +# undef MODULE +# define MODULE +# include +#endif + +#ifdef MODULE +# include +#endif + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_KERNELD +#include +#endif + +#include +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0) +#include +#endif + +#include +#include + +#define LVM_CORRECT_READ_AHEAD(a) \ +do { \ + if ((a) < LVM_MIN_READ_AHEAD) \ + (a) = LVM_MIN_READ_AHEAD; \ + if ((a) > LVM_MAX_READ_AHEAD) \ + (a) = LVM_MAX_READ_AHEAD; \ +} while(0) + +#define suser() ( current->uid == 0 && current->euid == 0) + + +/* + * External function prototypes + */ +#ifdef MODULE +int init_module ( void); +void cleanup_module ( void); +#else +extern int lvm_init ( void); +#endif + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) +static void lvm_dummy_device_request ( request_queue_t*); +#else +static void lvm_dummy_device_request ( void); +#endif +static int lvm_blk_ioctl ( struct inode *, struct file *, uint, ulong); +static int lvm_blk_open ( struct inode *, struct file *); + +static int lvm_chr_open ( struct inode *, struct file *); + +static int lvm_chr_release ( struct inode *, struct file *); +static int lvm_blk_release ( struct inode *, struct file *); + +static int lvm_chr_ioctl ( struct inode *, struct
file *, uint, ulong); + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) +static int lvm_proc_get_info ( char *, char **, off_t, int); +static int (*lvm_proc_get_info_ptr)(char *, char **, off_t, int) = + &lvm_proc_get_info; +#else +static int lvm_proc_get_info ( char *, char **, off_t, int, int); +#endif +#endif + +#ifdef LVM_HD_NAME +void lvm_hd_name ( char*, int); +#endif + +/* external snapshot calls */ +int lvm_snapshot_remap_block ( kdev_t*, ulong*, unsigned long, lv_t*); +int lvm_snapshot_COW(kdev_t, unsigned long, unsigned long, + unsigned long, lv_t *); +int lvm_snapshot_alloc(lv_t *); +void lvm_snapshot_release(lv_t *); + +/* End external function prototypes */ + + +/* + * Internal function prototypes + */ +static void lvm_init_vars ( void); +#if LINUX_VERSION_CODE < KERNEL_VERSION( 2, 3, 43) +extern int (*lvm_map_ptr) ( struct buffer_head*, int); +#endif + + +#ifdef LVM_HD_NAME +extern void (*lvm_hd_name_ptr) ( char*, int); +#endif +static int lvm_map ( struct buffer_head*, int); +static int do_vg_create ( int, void *); +static int do_vg_remove ( int); +static int do_lv_create ( int, char *, lv_t *); +static int do_lv_remove ( int, char *, int); +static int do_lv_extend_reduce ( int, char *, lv_t *); +static void lvm_geninit ( struct gendisk *); +#ifdef LVM_GET_INODE + static struct inode *lvm_get_inode ( int); + void lvm_clear_inode ( struct inode *); +#endif +inline int lvm_strlen ( char *); +inline void lvm_memcpy ( char *, char *, int); +inline int lvm_strcmp ( char *, char *); +inline char *lvm_strrchr ( char *, char c); +/* END Internal function prototypes */ + + +/* volume group descriptor area pointers */ +static vg_t *vg[ABS_MAX_VG + 1]; +static pv_t *pvp = NULL; +static lv_t *lvp = NULL; +static pe_t *pep = NULL; +static pe_t *pep1 = NULL; + + +/* map from block minor number to VG and LV numbers */ +typedef struct { + int vg_number; + int lv_number; +} vg_lv_map_t; +static vg_lv_map_t vg_lv_map[ABS_MAX_LV]; + + +/* Request structures (lvm_chr_ioctl()) */ +static pv_change_req_t pv_change_req; +static pv_flush_req_t pv_flush_req; +static pv_status_req_t pv_status_req; +static pe_lock_req_t pe_lock_req; +static le_remap_req_t le_remap_req; +static lv_req_t lv_req; + +#ifdef LVM_TOTAL_RESET +static int lvm_reset_spindown = 0; +#endif + +static char pv_name[NAME_LEN]; +/* static char rootvg[NAME_LEN] = { 0, }; */ +static uint lv_open = 0; +const char *const lvm_name = LVM_NAME; +static int lock = 0; +static int loadtime = 0; +static uint vg_count = 0; +static long lvm_chr_open_count = 0; +static ushort lvm_iop_version = LVM_DRIVER_IOP_VERSION; +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 0) +static DECLARE_WAIT_QUEUE_HEAD ( lvm_wait); +static DECLARE_WAIT_QUEUE_HEAD ( lvm_map_wait); +#else +struct wait_queue *lvm_wait = NULL; +struct wait_queue *lvm_map_wait = NULL; +#endif + +static spinlock_t lvm_lock = SPIN_LOCK_UNLOCKED; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 3, 31) +static struct proc_dir_entry lvm_proc_entry = { + 0, 3, LVM_NAME, S_IFREG | S_IRUGO, + 1, 0, 0, 0, + NULL, + lvm_proc_get_info, + NULL, NULL, NULL, NULL, NULL, +}; +#endif +#endif + +static struct file_operations lvm_chr_fops = { + ioctl: lvm_chr_ioctl, + open: lvm_chr_open, + release: lvm_chr_release, +}; + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 38) +static struct file_operations lvm_blk_fops = { + read: block_read, + write: block_write, + ioctl: lvm_blk_ioctl, + 
open: lvm_blk_open, + release: lvm_blk_release, + fsync: block_fsync, +}; +#else +static struct block_device_operations lvm_blk_fops = +{ + open: lvm_blk_open, + release: lvm_blk_release, + ioctl: lvm_blk_ioctl, +}; +#endif + +/* gendisk structures */ +static struct hd_struct lvm_hd_struct[MAX_LV]; +int lvm_blocksizes[MAX_LV] = { 0, }; +static int lvm_size[MAX_LV] = { 0, }; +static struct gendisk lvm_gendisk = { + MAJOR_NR, /* major # */ + LVM_NAME, /* name of major */ + 0, /* number of times minor is shifted + to get real minor */ + 1, /* maximum partitions per device */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 40) + MAX_LV, /* maximum number of real devices */ + lvm_geninit, /* initialization called before we + do other things */ +#endif + lvm_hd_struct, /* partition table */ + lvm_size, /* device size in blocks, copied + to block_size[] */ + MAX_LV, /* number of real devices */ + NULL, /* internal */ + NULL, /* pointer to next gendisk struct (internal) */ +}; + + +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 3, 43) +static void lvm_make_request (int rw, struct buffer_head * bh) +{ +#ifdef CONFIG_BLK_DEV_MD + request_queue_t * q; + unsigned long flags; +#endif + + if (lvm_map(bh, rw)) + goto sorry; + +#ifdef CONFIG_BLK_DEV_MD + q = blk_get_queue(bh->b_rdev); + + if (q->make_request_fn) + q->make_request_fn(rw, bh); + else +#endif + generic_make_request(rw, bh); + return; + + sorry: + printk(KERN_ERR "Bad lvm_map in ll_rw_block\n"); +} +#endif + +#ifdef MODULE +/* + * Module initialization... + */ +int init_module ( void) +#else +/* + * Driver initialization... + */ +#ifdef __initfunc +__initfunc ( int lvm_init ( void)) +#else +int __init lvm_init ( void) +#endif +#endif /* #ifdef MODULE */ +{ + struct gendisk *gendisk_ptr = NULL; + + lvm_init_vars (); + + /* insert our gendisk at the corresponding major */ + lvm_geninit ( &lvm_gendisk); + if ( gendisk_head != NULL) { + gendisk_ptr = gendisk_head; + while ( gendisk_ptr->next != NULL && + gendisk_ptr->major > lvm_gendisk.major) { + gendisk_ptr = gendisk_ptr->next; + } + lvm_gendisk.next = gendisk_ptr->next; + gendisk_ptr->next = &lvm_gendisk; + } else { + gendisk_head = &lvm_gendisk; + lvm_gendisk.next = NULL; + } + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 43) + /* reference from drivers/block/ll_rw_blk.c */ + lvm_map_ptr = lvm_map; +#endif + +#ifdef LVM_HD_NAME + /* reference from drivers/block/genhd.c */ + lvm_hd_name_ptr = lvm_hd_name; +#endif + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) + blk_init_queue ( BLK_DEFAULT_QUEUE ( MAJOR_NR), lvm_dummy_device_request); +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 43) + blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); +#else + blk_queue_pluggable(BLK_DEFAULT_QUEUE(MAJOR_NR), plug_device_noop); + blk_queue_make_request(BLK_DEFAULT_QUEUE(MAJOR_NR), lvm_make_request); +#endif +#else + blk_dev[MAJOR_NR].request_fn = lvm_dummy_device_request; + blk_dev[MAJOR_NR].current_request = NULL; +#endif + + /* optional read root VGDA */ +/* + if ( *rootvg != 0) { + vg_read_with_pv_and_lv ( rootvg, &vg); + } +*/ + + if ( register_chrdev ( LVM_CHAR_MAJOR, lvm_name, &lvm_chr_fops) < 0) { + printk ( KERN_ERR "%s -- register_chrdev failed\n", lvm_name); + return -EIO; + } + if ( register_blkdev ( MAJOR_NR, lvm_name, &lvm_blk_fops) < 0) { + printk ( "%s -- register_blkdev failed\n", lvm_name); + if ( unregister_chrdev ( LVM_CHAR_MAJOR, lvm_name) < 0) + printk ( KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + return -EIO; + } + +#if defined CONFIG_LVM_PROC_FS && defined
CONFIG_PROC_FS +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 25) + create_proc_info_entry ( LVM_NAME, S_IFREG | S_IRUGO, + &proc_root, lvm_proc_get_info_ptr); +# else + proc_register ( &proc_root, &lvm_proc_entry); +# endif +#endif + + printk ( KERN_INFO + "%s%s -- " +#ifdef MODULE + "Module" +#else + "Driver" +#endif + " successfully initialized\n", + lvm_version, lvm_name); + + return 0; +} /* init_module () / lvm_init () */ + + +#ifdef MODULE +/* + * Module cleanup... + */ +void cleanup_module ( void) { + struct gendisk *gendisk_ptr = NULL, *gendisk_ptr_prev = NULL; + + if ( unregister_chrdev ( LVM_CHAR_MAJOR, lvm_name) < 0) { + printk ( KERN_ERR "%s -- unregister_chrdev failed\n", lvm_name); + } + if ( unregister_blkdev ( MAJOR_NR, lvm_name) < 0) { + printk ( KERN_ERR "%s -- unregister_blkdev failed\n", lvm_name); + } + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) + blk_cleanup_queue ( BLK_DEFAULT_QUEUE ( MAJOR_NR)); +#else + blk_dev[MAJOR_NR].request_fn = NULL; + blk_dev[MAJOR_NR].current_request = NULL; +#endif + + gendisk_ptr = gendisk_ptr_prev = gendisk_head; + while ( gendisk_ptr != NULL) { + if ( gendisk_ptr == &lvm_gendisk) break; + gendisk_ptr_prev = gendisk_ptr; + gendisk_ptr = gendisk_ptr->next; + } + /* delete our gendisk from chain */ + if ( gendisk_ptr == &lvm_gendisk) gendisk_ptr_prev->next = gendisk_ptr->next; + + blk_size[MAJOR_NR] = NULL; + blksize_size[MAJOR_NR] = NULL; + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) + remove_proc_entry ( LVM_NAME, &proc_root); +# else + proc_unregister ( &proc_root, lvm_proc_entry.low_ino); +# endif +#endif + +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 43) + /* reference from linux/drivers/block/ll_rw_blk.c */ + lvm_map_ptr = NULL; +#endif + +#ifdef LVM_HD_NAME + /* reference from linux/drivers/block/genhd.c */ + lvm_hd_name_ptr = NULL; +#endif + + printk ( KERN_INFO "%s -- Module successfully deactivated\n", lvm_name); + + return; +} /* void cleanup_module () */ +#endif /* #ifdef MODULE */ + + +/* + * support function to initialize lvm variables + */ +#ifdef __initfunc +__initfunc ( void lvm_init_vars ( void)) +#else +void __init lvm_init_vars ( void) +#endif +{ + int v; + + loadtime = CURRENT_TIME; + + lvm_lock = SPIN_LOCK_UNLOCKED; + + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ + pe_lock_req.data.pv_offset = 0; + + /* Initialize VG pointers */ + for ( v = 0; v <= ABS_MAX_VG; v++) vg[v] = NULL; + + /* Initialize LV -> VG association */ + for ( v = 0; v < ABS_MAX_LV; v++) { + /* index ABS_MAX_VG never used for real VG */ + vg_lv_map[v].vg_number = ABS_MAX_VG; + vg_lv_map[v].lv_number = -1; + } + + return; +} /* lvm_init_vars () */ + + +/******************************************************************** + * + * Character device functions + * + ********************************************************************/ + +/* + * character device open routine + */ +static int lvm_chr_open ( struct inode *inode, + struct file *file) { + int minor = MINOR ( inode->i_rdev); + +#ifdef DEBUG + printk ( KERN_DEBUG + "%s -- lvm_chr_open MINOR: %d VG#: %d mode: 0x%X lock: %d\n", + lvm_name, minor, VG_CHR(minor), file->f_mode, lock); +#endif + + /* super user validation */ + if ( ! 
suser()) return -EACCES; + + /* Group special file open */ + if ( VG_CHR(minor) > MAX_VG) return -ENXIO; + +#ifdef MODULE + MOD_INC_USE_COUNT; +#endif + + lvm_chr_open_count++; + return 0; +} /* lvm_chr_open () */ + + +/* + * character device i/o-control routine + * + * Only one changing process can do ioctl at one time, others will block. + * + */ +static int lvm_chr_ioctl ( struct inode *inode, struct file *file, + uint command, ulong a) { + int minor = MINOR ( inode->i_rdev); + int extendable; + ulong l, le, p, v; + ulong size; + void *arg = ( void*) a; +#ifdef LVM_GET_INODE + struct inode *inode_sav; +#endif + lv_status_byname_req_t lv_status_byname_req; + lv_status_byindex_req_t lv_status_byindex_req; + lv_t lv; + + /* otherwise cc will complain about unused variables */ + ( void) lvm_lock; + + +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_chr_ioctl: command: 0x%X MINOR: %d " + "VG#: %d mode: 0x%X\n", + lvm_name, command, minor, VG_CHR(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if ( lvm_reset_spindown > 0) return -EACCES; +#endif + + + /* Main command switch */ + switch ( command) { + /* lock the LVM */ + case LVM_LOCK_LVM: +lock_try_again: + spin_lock ( &lvm_lock); + if( lock != 0 && lock != current->pid ) { +#ifdef DEBUG_IOCTL + printk ( KERN_INFO "lvm_chr_ioctl: %s is locked by pid %d ...\n", + lvm_name, lock); +#endif + spin_unlock ( &lvm_lock); + interruptible_sleep_on ( &lvm_wait); + if ( current->sigpending != 0) return -EINTR; +#ifdef LVM_TOTAL_RESET + if ( lvm_reset_spindown > 0) return -EACCES; +#endif + goto lock_try_again; + } + lock = current->pid; + spin_unlock ( &lvm_lock); + return 0; + + + /* check lvm version to ensure driver/tools+lib interoperability */ + case LVM_GET_IOP_VERSION: + if ( copy_to_user ( arg, &lvm_iop_version, sizeof ( ushort)) != 0) + return -EFAULT; + return 0; + + +#ifdef LVM_TOTAL_RESET + /* lock reset function */ + case LVM_RESET: + lvm_reset_spindown = 1; + for ( v = 0; v < ABS_MAX_VG; v++) { + if ( vg[v] != NULL) { + do_vg_remove ( v); + } + } + +#ifdef MODULE + while ( GET_USE_COUNT ( &__this_module) < 1) + MOD_INC_USE_COUNT; + while ( GET_USE_COUNT ( &__this_module) > 1) + MOD_DEC_USE_COUNT; +#endif /* MODULE */ + lock = 0; /* release lock */ + wake_up_interruptible ( &lvm_wait); + return 0; +#endif /* LVM_TOTAL_RESET */ + + + /* lock/unlock i/o to a physical extent to move it to another + physical volume (move's done in user space's pvmove) */ + case PE_LOCK_UNLOCK: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &pe_lock_req, arg, sizeof ( pe_lock_req_t)) != 0) + return -EFAULT; + + switch ( pe_lock_req.lock) { + case LOCK_PE: + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + if ( vg[VG_CHR(minor)]->pv[p] != NULL && + pe_lock_req.data.pv_dev == + vg[VG_CHR(minor)]->pv[p]->pv_dev) + break; + } + + if ( p == vg[VG_CHR(minor)]->pv_max) return -ENXIO; + + pe_lock_req.lock = UNLOCK_PE; + fsync_dev ( pe_lock_req.data.lv_dev); + pe_lock_req.lock = LOCK_PE; + break; + + case UNLOCK_PE: + pe_lock_req.lock = UNLOCK_PE; + pe_lock_req.data.lv_dev = \ + pe_lock_req.data.pv_dev = \ + pe_lock_req.data.pv_offset = 0; + wake_up ( &lvm_map_wait); + break; + + default: + return -EINVAL; + } + + return 0; + + + /* remap a logical extent (after moving the physical extent) */ + case LE_REMAP: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &le_remap_req, arg, + sizeof ( le_remap_req_t)) != 0) + return -EFAULT; + + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( 
vg[VG_CHR(minor)]->lv[l] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->lv[l]->lv_name, + le_remap_req.lv_name) == 0) { + for ( le = 0; le < vg[VG_CHR(minor)]->lv[l]->lv_allocated_le; + le++) { + if ( vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].dev == + le_remap_req.old_dev && + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].pe == + le_remap_req.old_pe) { + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].dev = + le_remap_req.new_dev; + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].pe = + le_remap_req.new_pe; + return 0; + } + } + return -EINVAL; + } + } + + return -ENXIO; + + + /* create a VGDA */ + case VG_CREATE: + return do_vg_create ( minor, arg); + + + /* remove an inactive VGDA */ + case VG_REMOVE: + return do_vg_remove ( minor); + + + /* extend a volume group */ + case VG_EXTEND: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( vg[VG_CHR(minor)]->pv_cur < vg[VG_CHR(minor)]->pv_max) { + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + if ( vg[VG_CHR(minor)]->pv[p] == NULL) { + if ( ( vg[VG_CHR(minor)]->pv[p] = + kmalloc ( sizeof ( pv_t), GFP_USER)) == NULL) { + printk ( KERN_CRIT + "%s -- VG_EXTEND: kmalloc error PV at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + if ( copy_from_user ( vg[VG_CHR(minor)]->pv[p], arg, + sizeof ( pv_t)) != 0) + return -EFAULT; + + vg[VG_CHR(minor)]->pv[p]->pv_status = PV_ACTIVE; + /* We don't need the PE list + in kernel space as with the LV's pe_t list */ + vg[VG_CHR(minor)]->pv[p]->pe = NULL; + vg[VG_CHR(minor)]->pv_cur++; + vg[VG_CHR(minor)]->pv_act++; + vg[VG_CHR(minor)]->pe_total += + vg[VG_CHR(minor)]->pv[p]->pe_total; +#ifdef LVM_GET_INODE + /* insert a dummy inode for fs_may_mount */ + vg[VG_CHR(minor)]->pv[p]->inode = + lvm_get_inode ( vg[VG_CHR(minor)]->pv[p]->pv_dev); +#endif + return 0; + } + } + } + return -EPERM; + + + /* reduce a volume group */ + case VG_REDUCE: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( pv_name, arg, sizeof ( pv_name)) != 0) + return -EFAULT; + + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + if ( vg[VG_CHR(minor)]->pv[p] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->pv[p]->pv_name, + pv_name) == 0) { + if ( vg[VG_CHR(minor)]->pv[p]->lv_cur > 0) return -EPERM; + vg[VG_CHR(minor)]->pe_total -= + vg[VG_CHR(minor)]->pv[p]->pe_total; + vg[VG_CHR(minor)]->pv_cur--; + vg[VG_CHR(minor)]->pv_act--; +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG + "%s -- kfree %d\n", lvm_name, __LINE__); +#endif +#ifdef LVM_GET_INODE + lvm_clear_inode ( vg[VG_CHR(minor)]->pv[p]->inode); +#endif + kfree ( vg[VG_CHR(minor)]->pv[p]); + /* Make PV pointer array contiguous */ + for ( ; p < vg[VG_CHR(minor)]->pv_max-1; p++) + vg[VG_CHR(minor)]->pv[p] = vg[VG_CHR(minor)]->pv[p + 1]; + vg[VG_CHR(minor)]->pv[p] = NULL; + return 0; + } + } + return -ENXIO; + + + /* set/clear extendability flag of volume group */ + case VG_SET_EXTENDABLE: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &extendable, arg, sizeof ( extendable)) != 0) + return -EFAULT; + + if ( extendable == VG_EXTENDABLE || + extendable == ~VG_EXTENDABLE) { + if ( extendable == VG_EXTENDABLE) + vg[VG_CHR(minor)]->vg_status |= VG_EXTENDABLE; + else + vg[VG_CHR(minor)]->vg_status &= ~VG_EXTENDABLE; + } else return -EINVAL; + return 0; + + + /* get volume group data (only the vg_t struct) */ + case VG_STATUS: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_to_user ( arg, vg[VG_CHR(minor)], sizeof ( vg_t)) != 0) + return -EFAULT; + + return 0; + 
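+ /* + * Illustration (assumed user-space usage, not part of this patch): + * VG_STATUS is a plain ioctl on the VG's character device, which the + * tools conventionally open via /dev/VolumeGroupName/group (the path + * is an assumption here): + * + * vg_t vg_buf; + * int fd = open ( "/dev/test_vg/group", O_RDONLY); + * if ( fd >= 0 && ioctl ( fd, VG_STATUS, &vg_buf) == 0) + * printf ( "%u/%u PVs active\n", vg_buf.pv_act, vg_buf.pv_cur); + */ + 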
+ /* get volume group count */ + case VG_STATUS_GET_COUNT: + if ( copy_to_user ( arg, &vg_count, sizeof ( vg_count)) != 0) + return -EFAULT; + + return 0; + + + /* get volume group name list */ + case VG_STATUS_GET_NAMELIST: + for ( l = v = 0; v < ABS_MAX_VG; v++) { + if ( vg[v] != NULL) { + if ( copy_to_user ( arg + l++ * NAME_LEN, + vg[v]->vg_name, + NAME_LEN) != 0) + return -EFAULT; + } + } + return 0; + + + /* create, remove, extend or reduce a logical volume */ + case LV_CREATE: + case LV_REMOVE: + case LV_EXTEND: + case LV_REDUCE: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &lv_req, arg, sizeof ( lv_req)) != 0) + return -EFAULT; + + if ( command != LV_REMOVE) { + if ( copy_from_user ( &lv, lv_req.lv, sizeof ( lv_t)) != 0) + return -EFAULT; + } + + switch ( command) { + case LV_CREATE: + return do_lv_create ( minor, lv_req.lv_name, &lv); + + case LV_REMOVE: + return do_lv_remove ( minor, lv_req.lv_name, -1); + + case LV_EXTEND: + case LV_REDUCE: + return do_lv_extend_reduce ( minor, lv_req.lv_name, &lv); + } + + + /* get status of a logical volume by name */ + case LV_STATUS_BYNAME: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &lv_status_byname_req, arg, + sizeof ( lv_status_byname_req_t)) != 0) + return -EFAULT; + + if ( lv_status_byname_req.lv == NULL) return -EINVAL; + if ( copy_from_user ( &lv, lv_status_byname_req.lv, + sizeof ( lv_t)) != 0) + return -EFAULT; + + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->lv[l]->lv_name, + lv_status_byname_req.lv_name) == 0) { + if ( copy_to_user ( lv_status_byname_req.lv, + vg[VG_CHR(minor)]->lv[l], + sizeof ( lv_t)) != 0) + return -EFAULT; + + if ( lv.lv_current_pe != NULL) { + size = vg[VG_CHR(minor)]->lv[l]->lv_allocated_le * + sizeof ( pe_t); + if ( copy_to_user ( lv.lv_current_pe, + vg[VG_CHR(minor)]->lv[l]->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; + } + } + return -ENXIO; + + + /* get status of a logical volume by index */ + case LV_STATUS_BYINDEX: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &lv_status_byindex_req, arg, + sizeof ( lv_status_byindex_req)) != 0) + return -EFAULT; + + if ( ( lvp = lv_status_byindex_req.lv) == NULL) return -EINVAL; + l = lv_status_byindex_req.lv_index; + if ( l >= vg[VG_CHR(minor)]->lv_max || + vg[VG_CHR(minor)]->lv[l] == NULL) return -ENXIO; + + if ( copy_from_user ( &lv, lvp, sizeof ( lv_t)) != 0) + return -EFAULT; + + if ( copy_to_user ( lvp, vg[VG_CHR(minor)]->lv[l], + sizeof ( lv_t)) != 0) + return -EFAULT; + + if ( lv.lv_current_pe != NULL) { + size = vg[VG_CHR(minor)]->lv[l]->lv_allocated_le * sizeof ( pe_t); + if ( copy_to_user ( lv.lv_current_pe, + vg[VG_CHR(minor)]->lv[l]->lv_current_pe, + size) != 0) + return -EFAULT; + } + return 0; + 
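+ /* + * Illustration (assumed caller behaviour, not part of this patch): + * LV_STATUS_BYNAME/LV_STATUS_BYINDEX above copy the lv_t back into + * the caller's buffer and, when the caller leaves lv_current_pe + * pointing at a user-space array, also fill that array with the + * per-PE read/write counters, roughly: + * + * lv_status_byname_req_t req; + * lv_t lv_buf; + * pe_t pe_buf[1024]; -- hypothetical bound + * lv_buf.lv_current_pe = pe_buf; + * req.lv = &lv_buf; + * strcpy ( req.lv_name, "/dev/test_vg/test_lv"); -- assumes a NAME_LEN array + * ioctl ( fd, LV_STATUS_BYNAME, &req); + */ + 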
+ /* change a physical volume */ + case PV_CHANGE: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &pv_change_req, arg, + sizeof ( pv_change_req)) != 0) + return -EFAULT; + + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + if ( vg[VG_CHR(minor)]->pv[p] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->pv[p]->pv_name, + pv_change_req.pv_name) == 0) { +#ifdef LVM_GET_INODE + inode_sav = vg[VG_CHR(minor)]->pv[p]->inode; +#endif + if ( copy_from_user ( vg[VG_CHR(minor)]->pv[p], + pv_change_req.pv, + sizeof ( pv_t)) != 0) + return -EFAULT; + + /* We don't need the PE list + in kernel space as with the LV's pe_t list */ + vg[VG_CHR(minor)]->pv[p]->pe = NULL; +#ifdef LVM_GET_INODE + vg[VG_CHR(minor)]->pv[p]->inode = inode_sav; +#endif + return 0; + } + } + return -ENXIO; + + + /* get physical volume data (pv_t structure only) */ + case PV_STATUS: + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + if ( copy_from_user ( &pv_status_req, arg, + sizeof ( pv_status_req)) != 0) + return -EFAULT; + + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + if ( vg[VG_CHR(minor)]->pv[p] != NULL) { + if ( lvm_strcmp ( vg[VG_CHR(minor)]->pv[p]->pv_name, + pv_status_req.pv_name) == 0) { + if ( copy_to_user ( pv_status_req.pv, + vg[VG_CHR(minor)]->pv[p], + sizeof ( pv_t)) != 0) + return -EFAULT; + return 0; + } + } + } + return -ENXIO; + + + /* physical volume buffer flush/invalidate */ + case PV_FLUSH: + if ( copy_from_user ( &pv_flush_req, arg, sizeof ( pv_flush_req)) != 0) + return -EFAULT; + + for ( v = 0; v < ABS_MAX_VG; v++) { + if ( vg[v] == NULL) continue; + for ( p = 0; p < vg[v]->pv_max; p++) { + if ( vg[v]->pv[p] != NULL && + lvm_strcmp ( vg[v]->pv[p]->pv_name, + pv_flush_req.pv_name) == 0) { + fsync_dev ( vg[v]->pv[p]->pv_dev); + invalidate_buffers ( vg[v]->pv[p]->pv_dev); + return 0; + } + } + } + return 0; + + + default: + printk ( KERN_WARNING + "%s -- lvm_chr_ioctl: unknown command %x\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_chr_ioctl */ + + +/* + * character device close routine + */ +static int lvm_chr_release ( struct inode *inode, struct file *file) +{ +#ifdef DEBUG + int minor = MINOR ( inode->i_rdev); + printk ( KERN_DEBUG + "%s -- lvm_chr_release VG#: %d\n", lvm_name, VG_CHR(minor)); +#endif + +#ifdef MODULE + if ( GET_USE_COUNT ( &__this_module) > 0) MOD_DEC_USE_COUNT; +#endif + +#ifdef LVM_TOTAL_RESET + if ( lvm_reset_spindown > 0) { + lvm_reset_spindown = 0; + lvm_chr_open_count = 1; + } +#endif + + if ( lvm_chr_open_count > 0) lvm_chr_open_count--; + if ( lock == current->pid) { + lock = 0; /* release lock */ + wake_up_interruptible ( &lvm_wait); + } + + return 0; +} /* lvm_chr_release () */ + 
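+/* + * Note on units (derived from the code below; the user-space side is + * assumed): lv_size is kept in 512-byte sectors, so the byte size of an + * open LV follows from the standard BLKGETSIZE convention: + * + * long sectors = 0; + * ioctl ( fd, BLKGETSIZE, &sectors); + * long long bytes = ( long long) sectors * 512; + */ + 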
+/******************************************************************** + * + * Block device functions + * + ********************************************************************/ + +/* + * block device open routine + */ +static int lvm_blk_open ( struct inode *inode, struct file *file) { + int minor = MINOR ( inode->i_rdev); + +#ifdef DEBUG_LVM_BLK_OPEN + printk ( KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d mode: 0x%X\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), file->f_mode); +#endif + +#ifdef LVM_TOTAL_RESET + if ( lvm_reset_spindown > 0) return -EPERM; +#endif + + if ( vg[VG_BLK(minor)] != NULL && + ( vg[VG_BLK(minor)]->vg_status & VG_ACTIVE) && + LV_BLK(minor) >= 0 && + LV_BLK(minor) < vg[VG_BLK(minor)]->lv_max && + vg[VG_BLK(minor)]->lv[LV_BLK(minor)] != NULL) { + + /* Check parallel LV spindown (LV remove) */ + if ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_status & LV_SPINDOWN) + return -EPERM; + + /* Check inactive LV and open for read/write */ + if ( file->f_mode & O_RDWR) { + if ( ! ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_status & LV_ACTIVE)) + return -EPERM; + if ( ! ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_access & LV_WRITE)) + return -EACCES; + } + + if ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_open == 0) + vg[VG_BLK(minor)]->lv_open++; + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_open++; + +#ifdef MODULE + MOD_INC_USE_COUNT; +#endif + +#ifdef DEBUG_LVM_BLK_OPEN + printk ( KERN_DEBUG + "%s -- lvm_blk_open MINOR: %d VG#: %d LV#: %d size: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor), + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_size); +#endif + + return 0; + } + + return -ENXIO; +} /* lvm_blk_open () */ + + +/* + * block device i/o-control routine + */ +static int lvm_blk_ioctl (struct inode *inode, struct file *file, + uint command, ulong a) { + int minor = MINOR ( inode->i_rdev); + void *arg = ( void*) a; + struct hd_geometry *hd = ( struct hd_geometry *) a; + +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl MINOR: %d command: 0x%X arg: %X " + "VG#: %d LV#: %d\n", + lvm_name, minor, command, ( ulong) arg, + VG_BLK(minor), LV_BLK(minor)); +#endif + + switch ( command) { + /* return device size */ + case BLKGETSIZE: +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKGETSIZE: %u\n", + lvm_name, vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_size); +#endif + if ( copy_to_user ( ( long*) arg, &vg[VG_BLK(minor)]->\ + lv[LV_BLK(minor)]->lv_size, + sizeof ( vg[VG_BLK(minor)]->\ + lv[LV_BLK(minor)]->lv_size)) != 0) + return -EFAULT; + break; + + + /* flush buffer cache */ + case BLKFLSBUF: + /* super user validation */ + if ( ! suser ()) return -EACCES; + +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKFLSBUF\n", lvm_name); +#endif + fsync_dev ( inode->i_rdev); + invalidate_buffers(inode->i_rdev); + break; + + + /* set read ahead for block device */ + case BLKRASET: + /* super user validation */ + if ( ! suser ()) return -EACCES; + +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRASET: %d sectors for %02X:%02X\n", + lvm_name, ( long) arg, MAJOR( inode->i_rdev), minor); +#endif + if ( ( long) arg < LVM_MIN_READ_AHEAD || + ( long) arg > LVM_MAX_READ_AHEAD) return -EINVAL; + read_ahead[MAJOR_NR] = + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_read_ahead = ( long) arg; + break; + + + /* get current read ahead setting */ + case BLKRAGET: +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- BLKRAGET\n", lvm_name); +#endif + if ( copy_to_user ( ( long*) arg, + &vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_read_ahead, + sizeof ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->\ + lv_read_ahead)) != 0) + return -EFAULT; + break; + + + /* get disk geometry */ + case HDIO_GETGEO: +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- HDIO_GETGEO\n", lvm_name); +#endif + if ( hd == NULL) return -EINVAL; + { + unsigned char heads = 64; + unsigned char sectors = 32; + long start = 0; + short cylinders = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_size / + heads / sectors; + + if ( copy_to_user ( ( char*) &hd->heads, &heads, + sizeof ( heads)) != 0 || + copy_to_user ( ( char*) &hd->sectors, &sectors, + sizeof ( sectors)) != 0 || + copy_to_user ( ( short*) &hd->cylinders, + &cylinders, sizeof ( cylinders)) != 0 || + copy_to_user ( ( long*) &hd->start, &start, + sizeof ( start)) != 0) + return -EFAULT; + +#ifdef DEBUG_IOCTL + printk ( KERN_DEBUG + "%s -- lvm_blk_ioctl -- cylinders: %d\n", + lvm_name, cylinders); +#endif + } + break; + + + /* set access flags of a logical volume */ + case LV_SET_ACCESS: + /* super user validation */ + if ( ! 
suser ()) return -EACCES; + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_access = ( ulong) arg; + break; + + + /* set status flags of a logical volume */ + case LV_SET_STATUS: + /* super user validation */ + if ( ! suser ()) return -EACCES; + if ( ! ( ( ulong) arg & LV_ACTIVE) && + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_open > 1) return -EPERM; + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_status = ( ulong) arg; + break; + + + /* set allocation flags of a logical volume */ + case LV_SET_ALLOCATION: + /* super user validation */ + if ( ! suser ()) return -EACCES; + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_allocation = ( ulong) arg; + break; + + + default: + printk ( KERN_WARNING + "%s -- lvm_blk_ioctl: unknown command %d\n", + lvm_name, command); + return -EINVAL; + } + + return 0; +} /* lvm_blk_ioctl () */ + + +/* + * block device close routine + */ +static int lvm_blk_release ( struct inode *inode, struct file *file) +{ + int minor = MINOR ( inode->i_rdev); + +#ifdef DEBUG + printk ( KERN_DEBUG + "%s -- lvm_blk_release MINOR: %d VG#: %d LV#: %d\n", + lvm_name, minor, VG_BLK(minor), LV_BLK(minor)); +#endif + + sync_dev ( inode->i_rdev); + if ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_open == 1) + vg[VG_BLK(minor)]->lv_open--; + vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_open--; + +#ifdef MODULE + MOD_DEC_USE_COUNT; +#endif + + return 0; +} /* lvm_blk_release () */ + + +#if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS +/* + * Support function /proc-Filesystem + */ +#define LVM_PROC_BUF ( i == 0 ? dummy_buf : &buf[sz]) + +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 25) +static int lvm_proc_get_info ( char *page, char **start, off_t pos, int count) +#else +static int lvm_proc_get_info ( char *page, char **start, off_t pos, + int count, int whence) +#endif +{ + int c, i, l, p, v, vg_counter, pv_counter, lv_counter, lv_open_counter, + lv_open_total, pe_t_bytes, lv_block_exception_t_bytes, seconds; + static off_t sz; + off_t sz_last; + char allocation_flag, inactive_flag, rw_flag, stripes_flag; + char *lv_name = NULL; + static char *buf = NULL; + static char dummy_buf[160]; /* sized for 2 lines */ + +#ifdef DEBUG_LVM_PROC_GET_INFO + printk ( KERN_DEBUG + "%s - lvm_proc_get_info CALLED pos: %lu count: %d whence: %d\n", + lvm_name, pos, count, whence); +#endif + + if ( pos == 0 || buf == NULL) { + sz_last = vg_counter = pv_counter = lv_counter = lv_open_counter = \ + lv_open_total = pe_t_bytes = lv_block_exception_t_bytes = 0; + + /* search for activity */ + for ( v = 0; v < ABS_MAX_VG; v++) { + if ( vg[v] != NULL) { + vg_counter++; + pv_counter += vg[v]->pv_cur; + lv_counter += vg[v]->lv_cur; + if ( vg[v]->lv_cur > 0) { + for ( l = 0; l < vg[v]->lv_max; l++) { + if ( vg[v]->lv[l] != NULL) { + pe_t_bytes += vg[v]->lv[l]->lv_allocated_le; + if ( vg[v]->lv[l]->lv_block_exception != NULL) { + lv_block_exception_t_bytes += + vg[v]->lv[l]->lv_remap_end; + } + if ( vg[v]->lv[l]->lv_open > 0) { + lv_open_counter++; + lv_open_total += vg[v]->lv[l]->lv_open; + } + } + } + } + } + } + pe_t_bytes *= sizeof ( pe_t); + lv_block_exception_t_bytes *= sizeof ( lv_block_exception_t); + + if ( buf != NULL) { +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG + "%s -- vfree %d\n", lvm_name, __LINE__); +#endif + vfree ( buf); + buf = NULL; + } + + /* 2 times: first to get size to allocate buffer, + 2nd to fill the vmalloced buffer */ + for ( i = 0; i < 2; i++) { + sz = 0; + sz += sprintf ( LVM_PROC_BUF, + "LVM " +#ifdef MODULE + "module" +#else + "driver" +#endif + " %s\n\n" + "Total: %d VG%s %d PV%s %d LV%s ", + 
lvm_short_version, + vg_counter, vg_counter == 1 ? "" : "s", + pv_counter, pv_counter == 1 ? "" : "s", + lv_counter, lv_counter == 1 ? "" : "s"); + sz += sprintf ( LVM_PROC_BUF, + "(%d LV%s open", + lv_open_counter, + lv_open_counter == 1 ? "" : "s"); + if ( lv_open_total > 0) sz += sprintf ( LVM_PROC_BUF, + " %d times)\n", + lv_open_total); + else sz += sprintf ( LVM_PROC_BUF, ")"); + sz += sprintf ( LVM_PROC_BUF, + "\nGlobal: %lu bytes vmalloced IOP version: %d ", + vg_counter * sizeof ( vg_t) + + pv_counter * sizeof ( pv_t) + + lv_counter * sizeof ( lv_t) + + pe_t_bytes + lv_block_exception_t_bytes + sz_last, + lvm_iop_version); + + seconds = CURRENT_TIME - loadtime; + if ( seconds < 0) loadtime = CURRENT_TIME + seconds; + if ( seconds / 86400 > 0) { + sz += sprintf ( LVM_PROC_BUF, "%d day%s ", + seconds / 86400, + seconds / 86400 == 0 || + seconds / 86400 > 1 ? "s": ""); + } + sz += sprintf ( LVM_PROC_BUF, "%d:%02d:%02d active\n", + ( seconds % 86400) / 3600, + ( seconds % 3600) / 60, + seconds % 60); + + if ( vg_counter > 0) { + for ( v = 0; v < ABS_MAX_VG; v++) { + /* volume group */ + if ( vg[v] != NULL) { + inactive_flag = ' '; + if ( ! ( vg[v]->vg_status & VG_ACTIVE)) + inactive_flag = 'I'; + sz += sprintf ( LVM_PROC_BUF, + "\nVG: %c%s [%d PV, %d LV/%d open] " + " PE Size: %d KB\n" + " Usage [KB/PE]: %d /%d total " + "%d /%d used %d /%d free", + inactive_flag, + vg[v]->vg_name, + vg[v]->pv_cur, + vg[v]->lv_cur, + vg[v]->lv_open, + vg[v]->pe_size >> 1, + vg[v]->pe_size * vg[v]->pe_total >> 1, + vg[v]->pe_total, + vg[v]->pe_allocated * vg[v]->pe_size >> 1, + vg[v]->pe_allocated, + ( vg[v]->pe_total - vg[v]->pe_allocated) * + vg[v]->pe_size >> 1, + vg[v]->pe_total - vg[v]->pe_allocated); + + /* physical volumes */ + sz += sprintf ( LVM_PROC_BUF, + "\n PV%s ", + vg[v]->pv_cur == 1 ? ": " : "s:"); + c = 0; + for ( p = 0; p < vg[v]->pv_max; p++) { + if ( vg[v]->pv[p] != NULL) { + inactive_flag = 'A'; + if ( ! ( vg[v]->pv[p]->pv_status & PV_ACTIVE)) + inactive_flag = 'I'; + allocation_flag = 'A'; + if ( ! ( vg[v]->pv[p]->pv_allocatable & PV_ALLOCATABLE)) + allocation_flag = 'N'; + sz += sprintf ( LVM_PROC_BUF, + "[%c%c] %-21s %8d /%-6d " + "%8d /%-6d %8d /%-6d", + inactive_flag, + allocation_flag, + vg[v]->pv[p]->pv_name, + vg[v]->pv[p]->pe_total * + vg[v]->pv[p]->pe_size >> 1, + vg[v]->pv[p]->pe_total, + vg[v]->pv[p]->pe_allocated * + vg[v]->pv[p]->pe_size >> 1, + vg[v]->pv[p]->pe_allocated, + ( vg[v]->pv[p]->pe_total - + vg[v]->pv[p]->pe_allocated) * + vg[v]->pv[p]->pe_size >> 1, + vg[v]->pv[p]->pe_total - + vg[v]->pv[p]->pe_allocated); + c++; + if ( c < vg[v]->pv_cur) sz += sprintf ( LVM_PROC_BUF, + "\n "); + } + } + + /* logical volumes */ + sz += sprintf ( LVM_PROC_BUF, + "\n LV%s ", + vg[v]->lv_cur == 1 ? ": " : "s:"); + c = 0; + for ( l = 0; l < vg[v]->lv_max; l++) { + if ( vg[v]->lv[l] != NULL) { + inactive_flag = 'A'; + if ( ! 
( vg[v]->lv[l]->lv_status & LV_ACTIVE)) + inactive_flag = 'I'; + rw_flag = 'R'; + if ( vg[v]->lv[l]->lv_access & LV_WRITE) rw_flag = 'W'; + allocation_flag = 'D'; + if ( vg[v]->lv[l]->lv_allocation & LV_CONTIGUOUS) + allocation_flag = 'C'; + stripes_flag = 'L'; + if ( vg[v]->lv[l]->lv_stripes > 1) stripes_flag = 'S'; + sz += sprintf ( LVM_PROC_BUF, + "[%c%c%c%c", + inactive_flag, + rw_flag, + allocation_flag, + stripes_flag); + if ( vg[v]->lv[l]->lv_stripes > 1) + sz += sprintf ( LVM_PROC_BUF, "%-2d", + vg[v]->lv[l]->lv_stripes); + else + sz += sprintf ( LVM_PROC_BUF, " "); + lv_name = lvm_strrchr ( vg[v]->lv[l]->lv_name, '/'); + if ( lv_name != NULL) lv_name++; + else lv_name = vg[v]->lv[l]->lv_name; + sz += sprintf ( LVM_PROC_BUF, "] %-25s", lv_name); + if ( lvm_strlen ( lv_name) > 25) + sz += sprintf ( LVM_PROC_BUF, + "\n "); + sz += sprintf ( LVM_PROC_BUF, "%9d /%-6d ", + vg[v]->lv[l]->lv_size >> 1, + vg[v]->lv[l]->lv_size / vg[v]->pe_size); + + if ( vg[v]->lv[l]->lv_open == 0) + sz += sprintf ( LVM_PROC_BUF, "close"); + else + sz += sprintf ( LVM_PROC_BUF, "%dx open", + vg[v]->lv[l]->lv_open); + c++; + if ( c < vg[v]->lv_cur) sz += sprintf ( LVM_PROC_BUF, + "\n "); + } + } + if ( vg[v]->lv_cur == 0) + sz += sprintf ( LVM_PROC_BUF, "none"); + sz += sprintf ( LVM_PROC_BUF, "\n"); + } + } + } + + if ( buf == NULL) { + if ( ( buf = vmalloc ( sz)) == NULL) { + sz = 0; + return sprintf ( page, "%s - vmalloc error at line %d\n", + lvm_name, __LINE__); + } + } + sz_last = sz; + } + } + + if ( pos > sz - 1) { + vfree ( buf); + buf = NULL; + return 0; + } + + *start = &buf[pos]; + if ( sz - pos < count) return sz - pos; + else return count; +} /* lvm_proc_get_info () */ +#endif /* #if defined CONFIG_LVM_PROC_FS && defined CONFIG_PROC_FS */ + + +/* + * block device support function for /usr/src/linux/drivers/block/ll_rw_blk.c + * (see init_module/lvm_init) + */ +static int lvm_map ( struct buffer_head *bh, int rw) { + int minor = MINOR ( bh->b_dev); + int ret = 0; + ulong index; + ulong size = bh->b_size >> 9; + ulong rsector_tmp = bh->b_blocknr * size; + ulong rsector_sav; + kdev_t rdev_tmp = bh->b_dev; + kdev_t rdev_sav; + lv_t *lv = vg[VG_BLK(minor)]->lv[LV_BLK(minor)]; + unsigned long pe_start; + + + if ( ! ( lv->lv_status & LV_ACTIVE)) { + printk ( KERN_ALERT + "%s - lvm_map: ll_rw_blk for inactive LV %s\n", + lvm_name, lv->lv_name); + return -1; + } + +/* +if ( lv->lv_access & LV_SNAPSHOT) +printk ( "%s -- %02d:%02d block: %lu rw: %d\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, rw); +*/ + + /* take care of snapshot chunk writes before + check for writable logical volume */ + if ( ( lv->lv_access & LV_SNAPSHOT) && + MAJOR ( bh->b_dev) != 0 && + MAJOR ( bh->b_dev) != MAJOR_NR && +#ifdef WRITEA + ( rw == WRITEA || rw == WRITE)) +#else + rw == WRITE) +#endif + { +/* +printk ( "%s -- doing snapshot write for %02d:%02d[%02d:%02d] b_blocknr: %lu b_rsector: %lu\n", lvm_name, MAJOR ( bh->b_dev), MINOR ( bh->b_dev), MAJOR ( bh->b_dev), MINOR ( bh->b_dev), bh->b_blocknr, bh->b_rsector); +*/ + return 0; + } + +#ifdef WRITEA + if ( ( rw == WRITE || rw == WRITEA) && +#else + if ( rw == WRITE && +#endif + ! 
( lv->lv_access & LV_WRITE)) { + printk ( KERN_CRIT + "%s - lvm_map: ll_rw_blk write for readonly LV %s\n", + lvm_name, lv->lv_name); + return -1; + } + + +#ifdef DEBUG_MAP + printk ( KERN_DEBUG + "%s - lvm_map minor:%d *rdev: %02d:%02d *rsector: %lu " + "size:%lu\n", + lvm_name, minor, + MAJOR ( rdev_tmp), + MINOR ( rdev_tmp), + rsector_tmp, size); +#endif + + if ( rsector_tmp + size > lv->lv_size) { + printk ( KERN_ALERT + "%s - lvm_map *rsector: %lu or size: %lu wrong for" + " minor: %2d\n", lvm_name, rsector_tmp, size, minor); + return -1; + } + + rsector_sav = rsector_tmp; + rdev_sav = rdev_tmp; + +lvm_second_remap: + /* linear mapping */ + if ( lv->lv_stripes < 2) { + index = rsector_tmp / vg[VG_BLK(minor)]->pe_size; /* get the index */ + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + ( rsector_tmp % vg[VG_BLK(minor)]->pe_size); + rdev_tmp = lv->lv_current_pe[index].dev; + +#ifdef DEBUG_MAP + printk ( KERN_DEBUG + "lv_current_pe[%ld].pe: %d rdev: %02d:%02d rsector:%ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR ( rdev_tmp), + MINOR ( rdev_tmp), + rsector_tmp); +#endif + + /* striped mapping */ + } else { + ulong stripe_index; + ulong stripe_length; + + stripe_length = vg[VG_BLK(minor)]->pe_size * lv->lv_stripes; + stripe_index = ( rsector_tmp % stripe_length) / lv->lv_stripesize; + index = rsector_tmp / stripe_length + + ( stripe_index % lv->lv_stripes) * + ( lv->lv_allocated_le / lv->lv_stripes); + pe_start = lv->lv_current_pe[index].pe; + rsector_tmp = lv->lv_current_pe[index].pe + + ( rsector_tmp % stripe_length) - + ( stripe_index % lv->lv_stripes) * lv->lv_stripesize - + stripe_index / lv->lv_stripes * + ( lv->lv_stripes - 1) * lv->lv_stripesize; + rdev_tmp = lv->lv_current_pe[index].dev; + +#ifdef DEBUG_MAP + printk(KERN_DEBUG + "lv_current_pe[%ld].pe: %d rdev: %02d:%02d rsector:%ld\n" + "stripe_length: %ld stripe_index: %ld\n", + index, + lv->lv_current_pe[index].pe, + MAJOR ( rdev_tmp), + MINOR ( rdev_tmp), + rsector_tmp, + stripe_length, + stripe_index); +#endif + } + + /* handle physical extents on the move */ + if ( pe_lock_req.lock == LOCK_PE) { + if ( rdev_tmp == pe_lock_req.data.pv_dev && + rsector_tmp >= pe_lock_req.data.pv_offset && + rsector_tmp < ( pe_lock_req.data.pv_offset + + vg[VG_BLK(minor)]->pe_size)) { + sleep_on ( &lvm_map_wait); + rsector_tmp = rsector_sav; + rdev_tmp = rdev_sav; + goto lvm_second_remap; + } + } + + /* statistics */ +#ifdef WRITEA + if ( rw == WRITE || rw == WRITEA) +#else + if ( rw == WRITE) +#endif + lv->lv_current_pe[index].writes++; + else + lv->lv_current_pe[index].reads++; + + /* snapshot volume exception handling based on physical device addresses */ + if ( lv->lv_access & ( LV_SNAPSHOT | LV_SNAPSHOT_ORG)) { + /* original logical volume */ + if ( lv->lv_access & LV_SNAPSHOT_ORG) { +#ifdef WRITEA + if ( rw == WRITE || rw == WRITEA) +#else + if ( rw == WRITE) +#endif + { + lv_t *lv_ptr; + + /* start with first snapshot and loop through all of them */ + for ( lv_ptr = lv->lv_snapshot_next; + lv_ptr != NULL; + lv_ptr = lv_ptr->lv_snapshot_next) { + down(&lv_ptr->lv_snapshot_sem); + /* do we still have free exception storage for this snapshot? 
*/ + if ( lv_ptr->lv_block_exception != NULL) { + kdev_t __dev; + unsigned long __sector; + + __dev = rdev_tmp; + __sector = rsector_tmp; + if (!lvm_snapshot_remap_block(&rdev_tmp, + &rsector_tmp, + pe_start, + lv_ptr)) + /* create a new mapping */ + ret = lvm_snapshot_COW(rdev_tmp, + rsector_tmp, + pe_start, + rsector_sav, + lv_ptr); + rdev_tmp = __dev; + rsector_tmp = __sector; + } + up(&lv_ptr->lv_snapshot_sem); + } + } + } else { + /* remap snapshot logical volume */ + down(&lv->lv_snapshot_sem); + if ( lv->lv_block_exception != NULL) + lvm_snapshot_remap_block ( &rdev_tmp, &rsector_tmp, pe_start, lv); + up(&lv->lv_snapshot_sem); + } + } + + bh->b_rdev = rdev_tmp; + bh->b_rsector = rsector_tmp; + + return ret; +} /* lvm_map () */ + + +/* + * lvm_map snapshot logical volume support functions + */ + +/* + * end lvm_map snapshot logical volume support functions + */ + + +/* + * internal support functions + */ + +#ifdef LVM_HD_NAME +/* + * generate "hard disk" name + */ +void lvm_hd_name ( char *buf, int minor) { + int len = 0; + + if ( vg[VG_BLK(minor)] == NULL || + vg[VG_BLK(minor)]->lv[LV_BLK(minor)] == NULL) return; + len = lvm_strlen ( vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_name) - 5; + lvm_memcpy ( buf, &vg[VG_BLK(minor)]->lv[LV_BLK(minor)]->lv_name[5], len); + buf[len] = 0; + return; +} +#endif + + +/* + * this one never should be called... + */ +#if LINUX_VERSION_CODE > KERNEL_VERSION ( 2, 3, 30) +static void lvm_dummy_device_request ( request_queue_t *t) +#else +static void lvm_dummy_device_request ( void) +#endif +{ + printk ( KERN_EMERG + "%s -- oops, got lvm request for %02d:%02d [sector: %lu]\n", + lvm_name, + MAJOR ( CURRENT->rq_dev), + MINOR ( CURRENT->rq_dev), + CURRENT->sector); + return; +} + + +/* + * character device support function VGDA create + */ +int do_vg_create ( int minor, void *arg) { + int snaporg_minor = 0; + ulong l, p; + lv_t lv; + vg_t *vg_ptr; + + if ( vg[VG_CHR(minor)] != NULL) return -EPERM; + + if ( ( vg_ptr = kmalloc ( sizeof ( vg_t), GFP_USER)) == NULL) { + printk ( KERN_CRIT + "%s -- VG_CREATE: kmalloc error VG at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + + /* get the volume group structure */ + if ( copy_from_user ( vg_ptr, arg, sizeof ( vg_t)) != 0) { + kfree ( vg_ptr); + return -EFAULT; + } + + /* we are not that active so far... 
*/ + vg_ptr->vg_status &= ~VG_ACTIVE; + vg[VG_CHR(minor)] = vg_ptr; + + vg[VG_CHR(minor)]->pe_allocated = 0; + if ( vg[VG_CHR(minor)]->pv_max > ABS_MAX_PV) { + printk ( KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_PV too small\n", + lvm_name); + kfree ( vg[VG_CHR(minor)]); + vg[VG_CHR(minor)] = NULL; + return -EPERM; + } + if ( vg[VG_CHR(minor)]->lv_max > ABS_MAX_LV) { + printk ( KERN_WARNING + "%s -- Can't activate VG: ABS_MAX_LV too small for %u\n", + lvm_name, vg[VG_CHR(minor)]->lv_max); + kfree ( vg[VG_CHR(minor)]); + vg[VG_CHR(minor)] = NULL; + return -EPERM; + } + + /* get the physical volume structures */ + vg[VG_CHR(minor)]->pv_act = vg[VG_CHR(minor)]->pv_cur = 0; + for ( p = 0; p < vg[VG_CHR(minor)]->pv_max; p++) { + /* user space address */ + if ( ( pvp = vg[VG_CHR(minor)]->pv[p]) != NULL) { + vg[VG_CHR(minor)]->pv[p] = kmalloc ( sizeof ( pv_t), GFP_USER); + if ( vg[VG_CHR(minor)]->pv[p] == NULL) { + printk ( KERN_CRIT + "%s -- VG_CREATE: kmalloc error PV at line %d\n", + lvm_name, __LINE__); + do_vg_remove ( minor); + return -ENOMEM; + } + if ( copy_from_user ( vg[VG_CHR(minor)]->pv[p], pvp, + sizeof ( pv_t)) != 0) { + do_vg_remove ( minor); + return -EFAULT; + } + + /* We don't need the PE list + in kernel space as with the LV's pe_t list (see below) */ + vg[VG_CHR(minor)]->pv[p]->pe = NULL; + vg[VG_CHR(minor)]->pv[p]->pe_allocated = 0; + vg[VG_CHR(minor)]->pv[p]->pv_status = PV_ACTIVE; + vg[VG_CHR(minor)]->pv_act++; + vg[VG_CHR(minor)]->pv_cur++; + +#ifdef LVM_GET_INODE + /* insert a dummy inode for fs_may_mount */ + vg[VG_CHR(minor)]->pv[p]->inode = + lvm_get_inode ( vg[VG_CHR(minor)]->pv[p]->pv_dev); +#endif + } + } + + /* get the logical volume structures */ + vg[VG_CHR(minor)]->lv_cur = 0; + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + /* user space address */ + if ( ( lvp = vg[VG_CHR(minor)]->lv[l]) != NULL) { + if ( copy_from_user ( &lv, lvp, sizeof ( lv_t)) != 0) { + do_vg_remove ( minor); + return -EFAULT; + } + vg[VG_CHR(minor)]->lv[l] = NULL; + { + int err; + + err = do_lv_create(minor, lv.lv_name, &lv); + if (err) + { + do_vg_remove(minor); + return err; + } + } + } + } + + /* Second pass to correct snapshot logical volumes which are not + in place during the first pass above */ + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] != NULL && + vg[VG_CHR(minor)]->lv[l]->lv_access & LV_SNAPSHOT) { + snaporg_minor = vg[VG_CHR(minor)]->lv[l]->lv_snapshot_minor; + if ( vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)] != NULL) { + /* get pointer to original logical volume */ + lv_t *lv_ptr = vg[VG_CHR(minor)]->lv[l]->lv_snapshot_org = + vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)]; + + /* set necessary fields of original logical volume */ + lv_ptr->lv_access |= LV_SNAPSHOT_ORG; + lv_ptr->lv_snapshot_minor = 0; + lv_ptr->lv_snapshot_org = lv_ptr; + lv_ptr->lv_snapshot_prev = NULL; + + /* find last snapshot logical volume in the chain */ + while ( lv_ptr->lv_snapshot_next != NULL) + lv_ptr = lv_ptr->lv_snapshot_next; + + /* set back pointer to this last one in our new logical volume */ + vg[VG_CHR(minor)]->lv[l]->lv_snapshot_prev = lv_ptr; + + /* last logical volume now points to our new snapshot volume */ + lv_ptr->lv_snapshot_next = vg[VG_CHR(minor)]->lv[l]; + + /* now point to the new one */ + lv_ptr = lv_ptr->lv_snapshot_next; + + /* set necessary fields of new snapshot logical volume */ + lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_current_pe = + vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)]->lv_current_pe; + 
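+ /* + * At this point the chain is a doubly linked list rooted at the + * origin (org <-> snap1 <-> ... <-> new snapshot -> NULL), with + * every snapshot's lv_snapshot_org pointing back at org. The + * fields below are simply borrowed from the origin: a snapshot + * has no PE allocation of its own, so lvm_map () resolves its + * blocks through the origin's map unless the exception table + * redirects them. + */ + 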
lv_ptr->lv_allocated_le = + vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)]->lv_allocated_le; + lv_ptr->lv_current_le = + vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)]->lv_current_le; + lv_ptr->lv_size = + vg[VG_CHR(minor)]->lv[LV_BLK(snaporg_minor)]->lv_size; + } + } + } + + vg_count++; + + /* let's go active */ + vg[VG_CHR(minor)]->vg_status |= VG_ACTIVE; + +#ifdef MODULE + MOD_INC_USE_COUNT; +#endif + return 0; +} /* do_vg_create () */ + + +/* + * character device support function VGDA remove + */ +static int do_vg_remove ( int minor) { + int i; + + if ( vg[VG_CHR(minor)] == NULL) return -ENXIO; + +#ifdef LVM_TOTAL_RESET + if ( vg[VG_CHR(minor)]->lv_open > 0 && lvm_reset_spindown == 0) +#else + if ( vg[VG_CHR(minor)]->lv_open > 0) +#endif + return -EPERM; + + /* let's go inactive */ + vg[VG_CHR(minor)]->vg_status &= ~VG_ACTIVE; + + /* free LVs */ + /* first free snapshot logical volumes */ + for ( i = 0; i < vg[VG_CHR(minor)]->lv_max; i++) { + if ( vg[VG_CHR(minor)]->lv[i] != NULL && + vg[VG_CHR(minor)]->lv[i]->lv_access & LV_SNAPSHOT) { + do_lv_remove ( minor, NULL, i); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout ( 1); + } + } + /* then free the rest */ + for ( i = 0; i < vg[VG_CHR(minor)]->lv_max; i++) { + if ( vg[VG_CHR(minor)]->lv[i] != NULL) { + do_lv_remove ( minor, NULL, i); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout ( 1); + } + } + + /* free PVs */ + for ( i = 0; i < vg[VG_CHR(minor)]->pv_max; i++) { + if ( vg[VG_CHR(minor)]->pv[i] != NULL) { +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG + "%s -- kfree %d\n", lvm_name, __LINE__); +#endif +#ifdef LVM_GET_INODE + lvm_clear_inode ( vg[VG_CHR(minor)]->pv[i]->inode); +#endif + kfree ( vg[VG_CHR(minor)]->pv[i]); + vg[VG_CHR(minor)]->pv[i] = NULL; + } + } + +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree ( vg[VG_CHR(minor)]); + vg[VG_CHR(minor)] = NULL; + + vg_count--; + +#ifdef MODULE + MOD_DEC_USE_COUNT; +#endif + return 0; +} /* do_vg_remove () */ + 
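+/* + * Illustration (assumed tool behaviour, not part of this patch): the + * vgcreate/vgchange binaries drive the two support functions above + * through the VG's character device, roughly: + * + * vg_t vg_buf; -- filled from the on-disk VGDA by the tool + * ioctl ( fd, VG_CREATE, &vg_buf); -- builds and activates the VGDA + * ioctl ( fd, VG_REMOVE, 0); -- allowed only with no open LVs + */ + 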
+ +/* + * character device support function logical volume create + */ +static int do_lv_create ( int minor, char *lv_name, lv_t *lv) { + int l, le, l_new, p, size; + ulong lv_status_save; + lv_block_exception_t *lvbe = lv->lv_block_exception; + lv_t *lv_ptr = NULL; + + if ( ( pep = lv->lv_current_pe) == NULL) return -EINVAL; + if ( lv->lv_chunk_size > LVM_SNAPSHOT_MAX_CHUNK) return -EINVAL; + + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->lv[l]->lv_name, lv_name) == 0) + return -EEXIST; + } + + /* in case of an lv_remove(), lv_create() pair; e.g. lvrename does this */ + l_new = -1; + if ( vg[VG_CHR(minor)]->lv[lv->lv_number] == NULL) l_new = lv->lv_number; + else { + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] == NULL) if ( l_new == -1) l_new = l; + } + } + if ( l_new == -1) return -EPERM; + l = l_new; + + if ( ( lv_ptr = kmalloc ( sizeof ( lv_t), GFP_USER)) == NULL) { + printk ( KERN_CRIT "%s -- LV_CREATE: kmalloc error LV at line %d\n", + lvm_name, __LINE__); + return -ENOMEM; + } + + /* copy preloaded LV */ + lvm_memcpy ( ( char*) lv_ptr, ( char *) lv, sizeof ( lv_t)); + + lv_status_save = lv_ptr->lv_status; + lv_ptr->lv_status &= ~LV_ACTIVE; + lv_ptr->lv_snapshot_org = \ + lv_ptr->lv_snapshot_prev = \ + lv_ptr->lv_snapshot_next = NULL; + lv_ptr->lv_block_exception = NULL; +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 4) + lv_ptr->lv_snapshot_sem = MUTEX; +#else + init_MUTEX(&lv_ptr->lv_snapshot_sem); +#endif + vg[VG_CHR(minor)]->lv[l] = lv_ptr; + + /* get the PE structures from user space if this + is no snapshot logical volume */ + if ( ! ( lv_ptr->lv_access & LV_SNAPSHOT)) { + size = lv_ptr->lv_allocated_le * sizeof ( pe_t); + if ( ( lv_ptr->lv_current_pe = vmalloc ( size)) == NULL) { + printk ( KERN_CRIT + "%s -- LV_CREATE: vmalloc error LV_CURRENT_PE of %d Byte " + "at line %d\n", + lvm_name, size, __LINE__); +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG "%s -- vfree %d\n", lvm_name, __LINE__); +#endif + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -ENOMEM; + } + + if ( copy_from_user ( lv_ptr->lv_current_pe, pep, size)) { + vfree ( lv_ptr->lv_current_pe); + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -EFAULT; + } + + /* correct the PE count in PVs */ + for ( le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg[VG_CHR(minor)]->pe_allocated++; + for ( p = 0; p < vg[VG_CHR(minor)]->pv_cur; p++) { + if ( vg[VG_CHR(minor)]->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) + vg[VG_CHR(minor)]->pv[p]->pe_allocated++; + } + } + } else { + /* Get snapshot exception data and block list */ + if ( lvbe != NULL) { + lv_ptr->lv_snapshot_org = + vg[VG_CHR(minor)]->lv[LV_BLK(lv_ptr->lv_snapshot_minor)]; + if ( lv_ptr->lv_snapshot_org != NULL) { + size = lv_ptr->lv_remap_end * sizeof ( lv_block_exception_t); + if ( ( lv_ptr->lv_block_exception = vmalloc ( size)) == NULL) { + printk ( KERN_CRIT + "%s -- do_lv_create: vmalloc error LV_BLOCK_EXCEPTION " + "of %d byte at line %d\n", + lvm_name, size, __LINE__); +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG "%s -- vfree %d\n", lvm_name, __LINE__); +#endif + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -ENOMEM; + } + + if ( copy_from_user ( lv_ptr->lv_block_exception, lvbe, size)) { + vfree ( lv_ptr->lv_block_exception); + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -EFAULT; + } + + /* get pointer to original logical volume */ + lv_ptr = lv_ptr->lv_snapshot_org; + + lv_ptr->lv_snapshot_minor = 0; + lv_ptr->lv_snapshot_org = lv_ptr; + lv_ptr->lv_snapshot_prev = NULL; + /* walk through the snapshot list */ + while ( lv_ptr->lv_snapshot_next != NULL) + lv_ptr = lv_ptr->lv_snapshot_next; + /* now lv_ptr points to the last existing snapshot in the chain */ + vg[VG_CHR(minor)]->lv[l]->lv_snapshot_prev = lv_ptr; + /* our new one now points back to the previous last in the chain */ + lv_ptr = vg[VG_CHR(minor)]->lv[l]; + /* now lv_ptr points to our new last snapshot logical volume */ + lv_ptr->lv_snapshot_org = lv_ptr->lv_snapshot_prev->lv_snapshot_org; + lv_ptr->lv_snapshot_next = NULL; + 
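+ /* the new snapshot borrows the origin's PE map and geometry + below; only blocks present in the exception table are + remapped by lvm_map () at run time */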
lv_ptr->lv_current_pe = lv_ptr->lv_snapshot_org->lv_current_pe; + lv_ptr->lv_allocated_le = lv_ptr->lv_snapshot_org->lv_allocated_le; + lv_ptr->lv_current_le = lv_ptr->lv_snapshot_org->lv_current_le; + lv_ptr->lv_size = lv_ptr->lv_snapshot_org->lv_size; + lv_ptr->lv_stripes = lv_ptr->lv_snapshot_org->lv_stripes; + lv_ptr->lv_stripesize = lv_ptr->lv_snapshot_org->lv_stripesize; + { + int err; + + err = lvm_snapshot_alloc(lv_ptr); + if (err) + { + vfree(lv_ptr->lv_block_exception); + kfree(lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return err; + } + } + } else { + vfree ( lv_ptr->lv_block_exception); + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -EFAULT; + } + } else { + kfree ( vg[VG_CHR(minor)]->lv[l]); + vg[VG_CHR(minor)]->lv[l] = NULL; + return -EINVAL; + } + } /* if ( vg[VG_CHR(minor)]->lv[l]->lv_access & LV_SNAPSHOT) */ + + lv_ptr = vg[VG_CHR(minor)]->lv[l]; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = 0; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = lv_ptr->lv_size; + lvm_size[MINOR(lv_ptr->lv_dev)] = lv_ptr->lv_size >> 1; + vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = vg[VG_CHR(minor)]->vg_number; + vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = lv_ptr->lv_number; + LVM_CORRECT_READ_AHEAD ( lv_ptr->lv_read_ahead); + read_ahead[MAJOR_NR] = lv_ptr->lv_read_ahead; + vg[VG_CHR(minor)]->lv_cur++; + lv_ptr->lv_status = lv_status_save; + + /* optionally add our new snapshot LV */ + if ( lv_ptr->lv_access & LV_SNAPSHOT) { + /* sync the original logical volume */ + fsync_dev ( lv_ptr->lv_snapshot_org->lv_dev); + /* put ourselves into the chain */ + lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr; + lv_ptr->lv_snapshot_org->lv_access |= LV_SNAPSHOT_ORG; + } + + return 0; +} /* do_lv_create () */ + + +/* + * character device support function logical volume remove + */ +static int do_lv_remove ( int minor, char *lv_name, int l) { + uint le, p; + lv_t *lv_ptr = NULL; + + if ( l == -1) { + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->lv[l]->lv_name, lv_name) == 0) { + break; + } + } + } + + if ( l < vg[VG_CHR(minor)]->lv_max) { + lv_ptr = vg[VG_CHR(minor)]->lv[l]; +#ifdef LVM_TOTAL_RESET + if ( lv_ptr->lv_open > 0 && lvm_reset_spindown == 0) +#else + if ( lv_ptr->lv_open > 0) +#endif + return -EBUSY; + + /* check for deletion of snapshot source while + snapshot volume still exists */ + if ( ( lv_ptr->lv_access & LV_SNAPSHOT_ORG) && + lv_ptr->lv_snapshot_next != NULL) + return -EPERM; + + lv_ptr->lv_status |= LV_SPINDOWN; + + /* sync the buffers */ + fsync_dev ( lv_ptr->lv_dev); + + lv_ptr->lv_status &= ~LV_ACTIVE; + + /* invalidate the buffers */ + invalidate_buffers ( lv_ptr->lv_dev); + + /* reset generic hd */ + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].start_sect = -1; + lvm_gendisk.part[MINOR(lv_ptr->lv_dev)].nr_sects = 0; + lvm_size[MINOR(lv_ptr->lv_dev)] = 0; + + /* reset VG/LV mapping */ + vg_lv_map[MINOR(lv_ptr->lv_dev)].vg_number = ABS_MAX_VG; + vg_lv_map[MINOR(lv_ptr->lv_dev)].lv_number = -1; + + /* correct the PE count in PVs if this is no snapshot logical volume */ + if ( ! 
( lv_ptr->lv_access & LV_SNAPSHOT)) { + /* only if this is no snapshot logical volume because we share + the lv_current_pe[] structs with the original logical volume */ + for ( le = 0; le < lv_ptr->lv_allocated_le; le++) { + vg[VG_CHR(minor)]->pe_allocated--; + for ( p = 0; p < vg[VG_CHR(minor)]->pv_cur; p++) { + if ( vg[VG_CHR(minor)]->pv[p]->pv_dev == + lv_ptr->lv_current_pe[le].dev) + vg[VG_CHR(minor)]->pv[p]->pe_allocated--; + } + } + vfree ( lv_ptr->lv_current_pe); + /* LV_SNAPSHOT */ + } else { +/* + if ( lv_ptr->lv_block_exception != NULL) { + int i; + kdev_t last_dev; + for ( i = last_dev = 0; i < lv_ptr->lv_remap_ptr; i++) { + if ( lv_ptr->lv_block_exception[i].rdev_new != last_dev) { + last_dev = lv_ptr->lv_block_exception[i].rdev_new; + invalidate_buffers ( last_dev); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout ( 1); + } + } + } +*/ + /* remove this snapshot logical volume from the chain */ + lv_ptr->lv_snapshot_prev->lv_snapshot_next = lv_ptr->lv_snapshot_next; + if ( lv_ptr->lv_snapshot_next != NULL) { + lv_ptr->lv_snapshot_next->lv_snapshot_prev = + lv_ptr->lv_snapshot_prev; + } + /* no more snapshots? */ + if ( lv_ptr->lv_snapshot_org->lv_snapshot_next == NULL) + lv_ptr->lv_snapshot_org->lv_access &= ~LV_SNAPSHOT_ORG; + lvm_snapshot_release(lv_ptr); + } + +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG "%s -- kfree %d\n", lvm_name, __LINE__); +#endif + kfree ( lv_ptr); + vg[VG_CHR(minor)]->lv[l] = NULL; + vg[VG_CHR(minor)]->lv_cur--; + return 0; + } + + return -ENXIO; +} /* do_lv_remove () */ + + +/* + * character device support function logical volume extend / reduce + */ +static int do_lv_extend_reduce ( int minor, char *lv_name, lv_t *lv) { + int l, le, p, size, old_allocated_le; + uint32_t end, lv_status_save; + pe_t *pe; + + if ( ( pep = lv->lv_current_pe) == NULL) return -EINVAL; + + for ( l = 0; l < vg[VG_CHR(minor)]->lv_max; l++) { + if ( vg[VG_CHR(minor)]->lv[l] != NULL && + lvm_strcmp ( vg[VG_CHR(minor)]->lv[l]->lv_name, lv_name) == 0) + break; + } + if ( l == vg[VG_CHR(minor)]->lv_max) return -ENXIO; + + /* check for active snapshot */ + if ( lv->lv_access & ( LV_SNAPSHOT|LV_SNAPSHOT_ORG)) return -EPERM; + + if ( ( pe = vmalloc ( size = lv->lv_current_le * sizeof ( pe_t))) == NULL) { + printk ( KERN_CRIT + "%s -- do_lv_extend_reduce: vmalloc error LV_CURRENT_PE " + "of %d Byte at line %d\n", + lvm_name, size, __LINE__); + return -ENOMEM; + } + + /* get the PE structures from user space */ + if ( copy_from_user ( pe, pep, size)) { + vfree ( pe); + return -EFAULT; + } + +#ifdef DEBUG + printk ( KERN_DEBUG + "%s -- fsync_dev and " + "invalidate_buffers for %s [%s] in %s\n", + lvm_name, vg[VG_CHR(minor)]->lv[l]->lv_name, + kdevname ( vg[VG_CHR(minor)]->lv[l]->lv_dev), + vg[VG_CHR(minor)]->vg_name); +#endif + + vg[VG_CHR(minor)]->lv[l]->lv_status |= LV_SPINDOWN; + fsync_dev ( vg[VG_CHR(minor)]->lv[l]->lv_dev); + vg[VG_CHR(minor)]->lv[l]->lv_status &= ~LV_ACTIVE; + invalidate_buffers ( vg[VG_CHR(minor)]->lv[l]->lv_dev); + + /* reduce allocation counters on PV(s) */ + for ( le = 0; le < vg[VG_CHR(minor)]->lv[l]->lv_allocated_le; le++) { + vg[VG_CHR(minor)]->pe_allocated--; + for ( p = 0; p < vg[VG_CHR(minor)]->pv_cur; p++) { + if ( vg[VG_CHR(minor)]->pv[p]->pv_dev == + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].dev) { + vg[VG_CHR(minor)]->pv[p]->pe_allocated--; + break; + } + } + } + +#ifdef DEBUG_VFREE + printk ( KERN_DEBUG "%s -- vfree %d\n", lvm_name, __LINE__); +#endif + + /* save pointer to "old" lv/pe pointer array */ + pep1 = 
vg[VG_CHR(minor)]->lv[l]->lv_current_pe; + end = vg[VG_CHR(minor)]->lv[l]->lv_current_le; + + /* save open counter */ + lv_open = vg[VG_CHR(minor)]->lv[l]->lv_open; + + /* save # of old allocated logical extents */ + old_allocated_le = vg[VG_CHR(minor)]->lv[l]->lv_allocated_le; + + /* copy preloaded LV */ + lv_status_save = lv->lv_status; + lv->lv_status |= LV_SPINDOWN; + lv->lv_status &= ~LV_ACTIVE; + lvm_memcpy ( ( char*) vg[VG_CHR(minor)]->lv[l], ( char*) lv, sizeof ( lv_t)); + vg[VG_CHR(minor)]->lv[l]->lv_current_pe = pe; + vg[VG_CHR(minor)]->lv[l]->lv_open = lv_open; + + /* save available i/o statistics */ + /* linear logical volume */ + if ( vg[VG_CHR(minor)]->lv[l]->lv_stripes < 2) { + /* Check which last LE shall be used */ + if ( end > vg[VG_CHR(minor)]->lv[l]->lv_current_le) + end = vg[VG_CHR(minor)]->lv[l]->lv_current_le; + for ( le = 0; le < end; le++) { + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].reads = pep1[le].reads; + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].writes = pep1[le].writes; + } + /* striped logical volume */ + } else { + uint i, j, source, dest, end, old_stripe_size, new_stripe_size; + + old_stripe_size = old_allocated_le / vg[VG_CHR(minor)]->lv[l]->lv_stripes; + new_stripe_size = vg[VG_CHR(minor)]->lv[l]->lv_allocated_le / + vg[VG_CHR(minor)]->lv[l]->lv_stripes; + end = old_stripe_size; + if ( end > new_stripe_size) end = new_stripe_size; + for ( i = source = dest = 0; + i < vg[VG_CHR(minor)]->lv[l]->lv_stripes; i++) { + for ( j = 0; j < end; j++) { + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[dest+j].reads = + pep1[source+j].reads; + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[dest+j].writes = + pep1[source+j].writes; + } + source += old_stripe_size; + dest += new_stripe_size; + } + } + vfree ( pep1); pep1 = NULL; + + + /* extend the PE count in PVs */ + for ( le = 0; le < vg[VG_CHR(minor)]->lv[l]->lv_allocated_le; le++) { + vg[VG_CHR(minor)]->pe_allocated++; + for ( p = 0; p < vg[VG_CHR(minor)]->pv_cur; p++) { + if ( vg[VG_CHR(minor)]->pv[p]->pv_dev == + vg[VG_CHR(minor)]->lv[l]->lv_current_pe[le].dev) { + vg[VG_CHR(minor)]->pv[p]->pe_allocated++; + break; + } + } + } + + lvm_gendisk.part[MINOR(vg[VG_CHR(minor)]->lv[l]->lv_dev)].start_sect = 0; + lvm_gendisk.part[MINOR(vg[VG_CHR(minor)]->lv[l]->lv_dev)].nr_sects = + vg[VG_CHR(minor)]->lv[l]->lv_size; + lvm_size[MINOR(vg[VG_CHR(minor)]->lv[l]->lv_dev)] = + vg[VG_CHR(minor)]->lv[l]->lv_size >> 1; + /* vg_lv_map array doesn't have to be changed here */ + + LVM_CORRECT_READ_AHEAD ( vg[VG_CHR(minor)]->lv[l]->lv_read_ahead); + read_ahead[MAJOR_NR] = vg[VG_CHR(minor)]->lv[l]->lv_read_ahead; + vg[VG_CHR(minor)]->lv[l]->lv_status = lv_status_save; + + return 0; +} /* do_lv_extend_reduce () */ + + +/* + * support function to initialize gendisk variables + */ +#ifdef __initfunc +__initfunc ( void lvm_geninit ( struct gendisk *lvm_gdisk)) +#else +void __init lvm_geninit ( struct gendisk *lvm_gdisk) +#endif +{ + int i = 0; + +#ifdef DEBUG_GENDISK + printk ( KERN_DEBUG "%s -- lvm_gendisk\n", lvm_name); +#endif + + for ( i = 0; i < MAX_LV; i++) { + lvm_gendisk.part[i].start_sect = -1; /* avoid partition check */ + lvm_size[i] = lvm_gendisk.part[i].nr_sects = 0; + lvm_blocksizes[i] = BLOCK_SIZE; + } + + blksize_size[MAJOR_NR] = lvm_blocksizes; + blk_size[MAJOR_NR] = lvm_size; + + return; +} /* lvm_geninit () */ + 
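+/* + * Note on units (derived from lvm_geninit () above): lv_size is kept in + * 512-byte sectors, so blk_size[MAJOR_NR][minor] = lv_size >> 1 is the + * size in 1 KB blocks, which is the convention the block layer expects. + */ + 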
+#ifdef LVM_GET_INODE +/* + * support function to get an empty inode + * + * Gets an empty inode to be inserted into the inode hash, + * so that a physical volume can't be mounted. + * This is analogous to drivers/block/md.c + * + * Is this the real thing? + * + */ +struct inode *lvm_get_inode ( int dev) { + struct inode *inode_this = NULL; + + /* Lock the device by inserting a dummy inode. */ + inode_this = get_empty_inode (); + inode_this->i_dev = dev; + insert_inode_hash ( inode_this); + return inode_this; +} + + +/* + * support function to clear an inode + * + */ +void lvm_clear_inode ( struct inode *inode) { +#ifdef I_FREEING + inode->i_state |= I_FREEING; +#endif + clear_inode ( inode); + return; +} +#endif /* #ifdef LVM_GET_INODE */ + + +/* my strlen */ +inline int lvm_strlen ( char *s1) { + int len = 0; + + while ( s1[len] != 0) len++; + return len; +} + + +/* my strcmp */ +inline int lvm_strcmp ( char *s1, char *s2) { + while ( *s1 != 0 && *s2 != 0) { + if ( *s1 != *s2) return -1; + s1++; s2++; + } + if ( *s1 == 0 && *s2 == 0) return 0; + return -1; +} + + +/* my strrchr */ +inline char *lvm_strrchr ( char *s1, char c) { + char *s2 = NULL; + + while ( *s1 != 0) { + if ( *s1 == c) s2 = s1; + s1++; + } + return s2; +} + + +/* my memcpy */ +inline void lvm_memcpy ( char *dest, char *source, int size) { + for ( ;size > 0; size--) *dest++ = *source++; +} diff -urN 2.3.46pre1/drivers/block/nbd.c 2.3.46pre1aa1/drivers/block/nbd.c --- 2.3.46pre1/drivers/block/nbd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/nbd.c Wed Feb 16 00:28:25 2000 @@ -184,10 +184,10 @@ DEBUG("reading control, "); reply.magic = 0; result = nbd_xmit(0, lo->sock, (char *) &reply, sizeof(reply)); - req = lo->tail; if (result <= 0) HARDFAIL("Recv control failed."); memcpy(&xreq, reply.handle, sizeof(xreq)); + req = blkdev_entry_prev_request(&lo->queue_head); if (xreq != req) FAIL("Unexpected handle received.\n"); @@ -216,47 +216,42 @@ { struct request *req; - while (1) { + down (&lo->queue_lock); + while (!list_empty(&lo->queue_head)) { req = nbd_read_stat(lo); if (!req) - return; - down (&lo->queue_lock); + goto out; #ifdef PARANOIA - if (req != lo->tail) { + if (req != blkdev_entry_prev_request(&lo->queue_head)) { printk(KERN_ALERT "NBD: I have problem...\n"); } if (lo != &nbd_dev[MINOR(req->rq_dev)]) { printk(KERN_ALERT "NBD: request corrupted!\n"); - goto next; + continue; } if (lo->magic != LO_MAGIC) { printk(KERN_ALERT "NBD: nbd_dev[] corrupted: Not enough magic\n"); - up (&lo->queue_lock); - return; + goto out; } #endif - nbd_end_request(req); - if (lo->tail == lo->head) { -#ifdef PARANOIA - if (lo->tail->next) - printk(KERN_ERR "NBD: I did not expect this\n"); -#endif - lo->head = NULL; - } - lo->tail = lo->tail->next; - next: + list_del(&req->queue); up (&lo->queue_lock); + + nbd_end_request(req); + + down (&lo->queue_lock); } + out: + up (&lo->queue_lock); } void nbd_clear_que(struct nbd_device *lo) { struct request *req; + unsigned long flags; - while (1) { - req = lo->tail; - if (!req) - return; + while (!list_empty(&lo->queue_head)) { + req = blkdev_entry_prev_request(&lo->queue_head); #ifdef PARANOIA if (lo != &nbd_dev[MINOR(req->rq_dev)]) { printk(KERN_ALERT "NBD: request corrupted when clearing!\n"); @@ -268,15 +263,12 @@ } #endif req->errors++; + list_del(&req->queue); + up(&lo->queue_lock); + nbd_end_request(req); - if (lo->tail == lo->head) { -#ifdef PARANOIA - if (lo->tail->next) - printk(KERN_ERR "NBD: I did not assume this\n"); -#endif - lo->head = NULL; - } - lo->tail = lo->tail->next; + + down(&lo->queue_lock); } } @@ -296,7 +288,7 @@ int dev; struct nbd_device *lo; - while (CURRENT) { + while (!QUEUE_EMPTY) { req = CURRENT; dev = MINOR(req->rq_dev); #ifdef 
PARANOIA @@ -314,28 +306,23 @@ requests_in++; #endif req->errors = 0; - CURRENT = CURRENT->next; - req->next = NULL; - + blkdev_dequeue_request(req); spin_unlock_irq(&io_request_lock); - down (&lo->queue_lock); - if (lo->head == NULL) { - lo->head = req; - lo->tail = req; - } else { - lo->head->next = req; - lo->head = req; - } + down (&lo->queue_lock); + list_add(&req->queue, &lo->queue_head); nbd_send_req(lo->sock, req); /* Why does this block? */ up (&lo->queue_lock); + spin_lock_irq(&io_request_lock); continue; error_out: req->errors++; + blkdev_dequeue_request(req); + spin_unlock(&io_request_lock); nbd_end_request(req); - CURRENT = CURRENT->next; + spin_lock(&io_request_lock); } return; } @@ -359,11 +346,14 @@ lo = &nbd_dev[dev]; switch (cmd) { case NBD_CLEAR_SOCK: + down(&lo->queue_lock); nbd_clear_que(lo); - if (lo->head || lo->tail) { + if (!list_empty(&lo->queue_head)) { + up(&lo->queue_lock); printk(KERN_ERR "nbd: Some requests are in progress -> can not turn off.\n"); return -EBUSY; } + up(&lo->queue_lock); file = lo->file; if (!file) return -EINVAL; @@ -415,8 +405,8 @@ return 0; #ifdef PARANOIA case NBD_PRINT_DEBUG: - printk(KERN_INFO "NBD device %d: head = %lx, tail = %lx. Global: in %d, out %d\n", - dev, (long) lo->head, (long) lo->tail, requests_in, requests_out); + printk(KERN_INFO "NBD device %d: queue_head = %p. Global: in %d, out %d\n", + dev, lo->queue_head, requests_in, requests_out); return 0; #endif case BLKGETSIZE: @@ -480,6 +470,7 @@ blksize_size[MAJOR_NR] = nbd_blksizes; blk_size[MAJOR_NR] = nbd_sizes; blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), do_nbd_request); + blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); for (i = 0; i < MAX_NBD; i++) { nbd_dev[i].refcnt = 0; nbd_dev[i].file = NULL; diff -urN 2.3.46pre1/drivers/block/paride/pcd.c 2.3.46pre1aa1/drivers/block/paride/pcd.c --- 2.3.46pre1/drivers/block/paride/pcd.c Tue Dec 14 15:48:49 1999 +++ 2.3.46pre1aa1/drivers/block/paride/pcd.c Wed Feb 16 00:28:25 2000 @@ -756,7 +756,7 @@ if (pcd_busy) return; while (1) { - if ((!CURRENT) || (CURRENT->rq_status == RQ_INACTIVE)) return; + if (QUEUE_EMPTY || (CURRENT->rq_status == RQ_INACTIVE)) return; INIT_REQUEST; if (CURRENT->cmd == READ) { unit = MINOR(CURRENT->rq_dev); diff -urN 2.3.46pre1/drivers/block/paride/pd.c 2.3.46pre1aa1/drivers/block/paride/pd.c --- 2.3.46pre1/drivers/block/paride/pd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/paride/pd.c Wed Feb 16 00:28:25 2000 @@ -868,7 +868,7 @@ if (pd_busy) return; repeat: - if ((!CURRENT) || (CURRENT->rq_status == RQ_INACTIVE)) return; + if (QUEUE_EMPTY || (CURRENT->rq_status == RQ_INACTIVE)) return; INIT_REQUEST; pd_dev = MINOR(CURRENT->rq_dev); @@ -890,7 +890,7 @@ pd_cmd = CURRENT->cmd; pd_run = pd_count; while ((pd_run <= cluster) && - (req = req->next) && + (req = blkdev_next_request(req)) && (pd_block+pd_run == req->sector) && (pd_cmd == req->cmd) && (pd_dev == MINOR(req->rq_dev))) @@ -922,7 +922,7 @@ /* paranoia */ - if ((!CURRENT) || + if (QUEUE_EMPTY || (CURRENT->cmd != pd_cmd) || (MINOR(CURRENT->rq_dev) != pd_dev) || (CURRENT->rq_status == RQ_INACTIVE) || diff -urN 2.3.46pre1/drivers/block/paride/pf.c 2.3.46pre1aa1/drivers/block/paride/pf.c --- 2.3.46pre1/drivers/block/paride/pf.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/paride/pf.c Wed Feb 16 00:28:25 2000 @@ -854,7 +854,7 @@ if (pf_busy) return; repeat: - if ((!CURRENT) || (CURRENT->rq_status == RQ_INACTIVE)) return; + if (QUEUE_EMPTY || (CURRENT->rq_status == RQ_INACTIVE)) return; INIT_REQUEST; pf_unit = unit = 
DEVICE_NR(CURRENT->rq_dev); @@ -874,7 +874,7 @@ pf_cmd = CURRENT->cmd; pf_run = pf_count; while ((pf_run <= cluster) && - (req = req->next) && + (req = blkdev_next_request(req)) && (pf_block+pf_run == req->sector) && (pf_cmd == req->cmd) && (pf_unit == DEVICE_NR(req->rq_dev))) @@ -904,7 +904,7 @@ /* paranoia */ - if ((!CURRENT) || + if (QUEUE_EMPTY || (CURRENT->cmd != pf_cmd) || (DEVICE_NR(CURRENT->rq_dev) != pf_unit) || (CURRENT->rq_status == RQ_INACTIVE) || diff -urN 2.3.46pre1/drivers/block/ps2esdi.c 2.3.46pre1aa1/drivers/block/ps2esdi.c --- 2.3.46pre1/drivers/block/ps2esdi.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/ps2esdi.c Wed Feb 16 00:28:25 2000 @@ -476,7 +476,7 @@ if (virt_to_bus(CURRENT->buffer + CURRENT->nr_sectors * 512) > 16 * MB) { printk("%s: DMA above 16MB not supported\n", DEVICE_NAME); end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(q); return; } /* check for above 16Mb dmas */ @@ -510,7 +510,7 @@ default: printk("%s: Unknown command\n", DEVICE_NAME); end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(q); break; } /* handle different commands */ @@ -520,7 +520,7 @@ printk("Grrr. error. ps2esdi_drives: %d, %lu %lu\n", ps2esdi_drives, CURRENT->sector, ps2esdi[MINOR(CURRENT->rq_dev)].nr_sects); end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(q); } @@ -591,7 +591,7 @@ return do_ps2esdi_request(NULL); else { end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(NULL); } } @@ -894,7 +894,7 @@ do_ps2esdi_request(NULL); else { end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(NULL); } break; @@ -940,7 +940,7 @@ do_ps2esdi_request(NULL); else { end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(NULL); } break; @@ -950,7 +950,7 @@ outb((int_ret_code & 0xe0) | ATT_EOI, ESDI_ATTN); outb(CTRL_ENABLE_INTR, ESDI_CONTROL); end_request(FAIL); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(NULL); break; @@ -986,7 +986,7 @@ do_ps2esdi_request(NULL); } else { end_request(SUCCES); - if (CURRENT) + if (!QUEUE_EMPTY) do_ps2esdi_request(NULL); } } diff -urN 2.3.46pre1/drivers/block/swim3.c 2.3.46pre1aa1/drivers/block/swim3.c --- 2.3.46pre1/drivers/block/swim3.c Thu Feb 3 06:05:57 2000 +++ 2.3.46pre1aa1/drivers/block/swim3.c Wed Feb 16 00:28:25 2000 @@ -305,7 +305,7 @@ wake_up(&fs->wait); return; } - while (CURRENT && fs->state == idle) { + while (!QUEUE_EMPTY && fs->state == idle) { if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) panic(DEVICE_NAME ": request list destroyed"); if (CURRENT->bh && !buffer_locked(CURRENT->bh)) diff -urN 2.3.46pre1/drivers/block/swim_iop.c 2.3.46pre1aa1/drivers/block/swim_iop.c --- 2.3.46pre1/drivers/block/swim_iop.c Thu Feb 3 06:05:57 2000 +++ 2.3.46pre1aa1/drivers/block/swim_iop.c Wed Feb 16 00:28:25 2000 @@ -550,7 +550,7 @@ wake_up(&fs->wait); return; } - while (CURRENT && fs->state == idle) { + while (!QUEUE_EMPTY && fs->state == idle) { if (MAJOR(CURRENT->rq_dev) != MAJOR_NR) panic(DEVICE_NAME ": request list destroyed"); if (CURRENT->bh && !buffer_locked(CURRENT->bh)) diff -urN 2.3.46pre1/drivers/block/xd.c 2.3.46pre1aa1/drivers/block/xd.c --- 2.3.46pre1/drivers/block/xd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/block/xd.c Wed Feb 16 00:28:25 2000 @@ -287,7 +287,7 @@ sti(); if (xdc_busy) return; - while (code = 0, CURRENT) { + while (code = 0, !QUEUE_EMPTY) { INIT_REQUEST; /* do some checking on the request structure */ if (CURRENT_DEV < xd_drives diff -urN 2.3.46pre1/drivers/cdrom/aztcd.c 
2.3.46pre1aa1/drivers/cdrom/aztcd.c --- 2.3.46pre1/drivers/cdrom/aztcd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/cdrom/aztcd.c Wed Feb 16 00:28:25 2000 @@ -234,7 +234,7 @@ #endif #define CURRENT_VALID \ - (CURRENT && MAJOR(CURRENT -> rq_dev) == MAJOR_NR && CURRENT -> cmd == READ \ + (!QUEUE_EMPTY && MAJOR(CURRENT -> rq_dev) == MAJOR_NR && CURRENT -> cmd == READ \ && CURRENT -> sector != -1) #define AFL_STATUSorDATA (AFL_STATUS | AFL_DATA) diff -urN 2.3.46pre1/drivers/cdrom/cdu31a.c 2.3.46pre1aa1/drivers/cdrom/cdu31a.c --- 2.3.46pre1/drivers/cdrom/cdu31a.c Tue Dec 14 15:48:50 1999 +++ 2.3.46pre1aa1/drivers/cdrom/cdu31a.c Wed Feb 16 00:28:25 2000 @@ -1672,7 +1672,7 @@ if (signal_pending(current)) { restore_flags(flags); - if (CURRENT && CURRENT->rq_status != RQ_INACTIVE) + if (!QUEUE_EMPTY && CURRENT->rq_status != RQ_INACTIVE) { end_request(0); } @@ -1705,7 +1705,7 @@ * The beginning here is stolen from the hard disk driver. I hope * it's right. */ - if (!(CURRENT) || CURRENT->rq_status == RQ_INACTIVE) + if (QUEUE_EMPTY || CURRENT->rq_status == RQ_INACTIVE) { goto end_do_cdu31a_request; } diff -urN 2.3.46pre1/drivers/cdrom/cm206.c 2.3.46pre1aa1/drivers/cdrom/cm206.c --- 2.3.46pre1/drivers/cdrom/cm206.c Tue Dec 14 15:48:50 1999 +++ 2.3.46pre1aa1/drivers/cdrom/cm206.c Wed Feb 16 00:28:25 2000 @@ -816,7 +816,7 @@ while(1) { /* repeat until all requests have been satisfied */ INIT_REQUEST; - if (CURRENT == NULL || CURRENT->rq_status == RQ_INACTIVE) + if (QUEUE_EMPTY || CURRENT->rq_status == RQ_INACTIVE) return; if (CURRENT->cmd != READ) { debug(("Non-read command %d on cdrom\n", CURRENT->cmd)); diff -urN 2.3.46pre1/drivers/cdrom/gscd.c 2.3.46pre1aa1/drivers/cdrom/gscd.c --- 2.3.46pre1/drivers/cdrom/gscd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/cdrom/gscd.c Wed Feb 16 00:28:25 2000 @@ -279,13 +279,13 @@ unsigned int nsect; repeat: - if (!(CURRENT) || CURRENT->rq_status == RQ_INACTIVE) return; + if (QUEUE_EMPTY || CURRENT->rq_status == RQ_INACTIVE) return; INIT_REQUEST; dev = MINOR(CURRENT->rq_dev); block = CURRENT->sector; nsect = CURRENT->nr_sectors; - if (CURRENT == NULL || CURRENT -> sector == -1) + if (QUEUE_EMPTY || CURRENT -> sector == -1) return; if (CURRENT -> cmd != READ) diff -urN 2.3.46pre1/drivers/cdrom/mcd.c 2.3.46pre1aa1/drivers/cdrom/mcd.c --- 2.3.46pre1/drivers/cdrom/mcd.c Fri Feb 11 00:05:33 2000 +++ 2.3.46pre1aa1/drivers/cdrom/mcd.c Wed Feb 16 00:28:25 2000 @@ -134,7 +134,7 @@ /* #define DOUBLE_QUICK_ONLY */ #define CURRENT_VALID \ -(CURRENT && MAJOR(CURRENT -> rq_dev) == MAJOR_NR && CURRENT -> cmd == READ \ +(!QUEUE_EMPTY && MAJOR(CURRENT -> rq_dev) == MAJOR_NR && CURRENT -> cmd == READ \ && CURRENT -> sector != -1) #define MFL_STATUSorDATA (MFL_STATUS | MFL_DATA) diff -urN 2.3.46pre1/drivers/cdrom/mcdx.c 2.3.46pre1aa1/drivers/cdrom/mcdx.c --- 2.3.46pre1/drivers/cdrom/mcdx.c Tue Dec 14 15:48:50 1999 +++ 2.3.46pre1aa1/drivers/cdrom/mcdx.c Wed Feb 16 00:28:25 2000 @@ -530,7 +530,7 @@ again: - if (CURRENT == NULL) { + if (QUEUE_EMPTY) { xtrace(REQUEST, "end_request(0): CURRENT == NULL\n"); return; } diff -urN 2.3.46pre1/drivers/cdrom/optcd.c 2.3.46pre1aa1/drivers/cdrom/optcd.c --- 2.3.46pre1/drivers/cdrom/optcd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/cdrom/optcd.c Wed Feb 16 00:28:25 2000 @@ -980,7 +980,7 @@ #define CURRENT_VALID \ - (CURRENT && MAJOR(CURRENT -> rq_dev) == MAJOR_NR \ + (!QUEUE_EMPTY && MAJOR(CURRENT -> rq_dev) == MAJOR_NR \ && CURRENT -> cmd == READ && CURRENT -> sector != -1) diff -urN 2.3.46pre1/drivers/cdrom/sbpcd.c 
2.3.46pre1aa1/drivers/cdrom/sbpcd.c --- 2.3.46pre1/drivers/cdrom/sbpcd.c Tue Dec 14 15:48:50 1999 +++ 2.3.46pre1aa1/drivers/cdrom/sbpcd.c Wed Feb 16 00:28:25 2000 @@ -4791,9 +4791,7 @@ */ #undef DEBUG_GTL static inline void sbpcd_end_request(struct request *req, int uptodate) { - req->next=CURRENT; - CURRENT=req; - up(&ioctl_read_sem); + list_add(&req->queue, &req->q->queue_head); end_request(uptodate); } /*==========================================================================*/ @@ -4815,7 +4813,7 @@ #ifdef DEBUG_GTL xnr=++xx_nr; - if(!CURRENT) + if(QUEUE_EMPTY) { printk( "do_sbpcd_request[%di](NULL), Pid:%d, Time:%li\n", xnr, current->pid, jiffies); @@ -4830,15 +4828,15 @@ #endif INIT_REQUEST; req=CURRENT; /* take out our request so no other */ - CURRENT=req->next; /* task can fuck it up GTL */ - spin_unlock_irq(&io_request_lock); /* FIXME!!!! */ + blkdev_dequeue_request(req); /* task can fuck it up GTL */ - down(&ioctl_read_sem); if (req->rq_status == RQ_INACTIVE) sbpcd_end_request(req, 0); if (req -> sector == -1) sbpcd_end_request(req, 0); + spin_unlock_irq(&io_request_lock); + down(&ioctl_read_sem); if (req->cmd != READ) { msg(DBG_INF, "bad cmd %d\n", req->cmd); @@ -4875,8 +4873,9 @@ printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 2, Time:%li\n", xnr, req, req->sector, req->nr_sectors, jiffies); #endif + up(&ioctl_read_sem); + spin_lock_irq(&io_request_lock); sbpcd_end_request(req, 1); - spin_lock_irq(&io_request_lock); /* FIXME!!!! */ goto request_loop; } @@ -4915,8 +4914,9 @@ printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 3, Time:%li\n", xnr, req, req->sector, req->nr_sectors, jiffies); #endif + up(&ioctl_read_sem); + spin_lock_irq(&io_request_lock); sbpcd_end_request(req, 1); - spin_lock_irq(&io_request_lock); /* FIXME!!!! */ goto request_loop; } } @@ -4929,9 +4929,10 @@ printk(" do_sbpcd_request[%do](%p:%ld+%ld) end 4 (error), Time:%li\n", xnr, req, req->sector, req->nr_sectors, jiffies); #endif - sbpcd_end_request(req, 0); + up(&ioctl_read_sem); sbp_sleep(0); /* wait a bit, try again */ - spin_lock_irq(&io_request_lock); /* FIXME!!!! */ + spin_lock_irq(&io_request_lock); + sbpcd_end_request(req, 0); goto request_loop; } /*==========================================================================*/ @@ -5741,6 +5742,7 @@ #endif MODULE } blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); + blk_queue_headactive(BLK_DEFAULT_QUEUE(MAJOR_NR), 0); read_ahead[MAJOR_NR] = buffers * (CD_FRAMESIZE / 512); request_region(CDo_command,4,major_name); diff -urN 2.3.46pre1/drivers/cdrom/sjcd.c 2.3.46pre1aa1/drivers/cdrom/sjcd.c --- 2.3.46pre1/drivers/cdrom/sjcd.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/cdrom/sjcd.c Wed Feb 16 00:28:25 2000 @@ -938,7 +938,7 @@ */ #define CURRENT_IS_VALID \ - ( CURRENT != NULL && MAJOR( CURRENT->rq_dev ) == MAJOR_NR && \ + ( !QUEUE_EMPTY && MAJOR( CURRENT->rq_dev ) == MAJOR_NR && \ CURRENT->cmd == READ && CURRENT->sector != -1 ) static void sjcd_transfer( void ){ diff -urN 2.3.46pre1/drivers/cdrom/sonycd535.c 2.3.46pre1aa1/drivers/cdrom/sonycd535.c --- 2.3.46pre1/drivers/cdrom/sonycd535.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/cdrom/sonycd535.c Wed Feb 16 00:28:25 2000 @@ -803,7 +803,7 @@ * The beginning here is stolen from the hard disk driver. I hope * it's right. 
*/ - if (!(CURRENT) || CURRENT->rq_status == RQ_INACTIVE) { + if (QUEUE_EMPTY || CURRENT->rq_status == RQ_INACTIVE) { return; } INIT_REQUEST; diff -urN 2.3.46pre1/drivers/char/rtc.c 2.3.46pre1aa1/drivers/char/rtc.c --- 2.3.46pre1/drivers/char/rtc.c Fri Feb 11 00:05:34 2000 +++ 2.3.46pre1aa1/drivers/char/rtc.c Wed Feb 16 00:28:24 2000 @@ -97,14 +97,18 @@ static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); +#ifndef __alpha__ static unsigned int rtc_poll(struct file *file, poll_table *wait); +#endif static void get_rtc_time (struct rtc_time *rtc_tm); static void get_rtc_alm_time (struct rtc_time *alm_tm); +#ifndef __alpha__ static void rtc_dropped_irq(unsigned long data); static void set_rtc_irq_bit(unsigned char bit); static void mask_rtc_irq_bit(unsigned char bit); +#endif static inline unsigned char rtc_is_updating(void); @@ -132,6 +136,7 @@ static const unsigned char days_in_mo[] = {0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; +#ifndef __alpha__ /* * A very tiny interrupt handler. It runs with SA_INTERRUPT set, * so that there is no possibility of conflicting with the @@ -162,6 +167,7 @@ if (atomic_read(&rtc_status) & RTC_TIMER_ON) mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100); } +#endif /* * Now all the various file operations that we export. @@ -175,6 +181,9 @@ static ssize_t rtc_read(struct file *file, char *buf, size_t count, loff_t *ppos) { +#ifdef __alpha__ + return -EIO; +#else DECLARE_WAITQUEUE(wait, current); unsigned long data; ssize_t retval; @@ -206,6 +215,7 @@ remove_wait_queue(&rtc_wait, &wait); return retval; +#endif } static int rtc_ioctl(struct inode *inode, struct file *file, unsigned int cmd, @@ -216,6 +226,7 @@ struct rtc_time wtime; switch (cmd) { +#ifndef __alpha__ case RTC_AIE_OFF: /* Mask alarm int. enab. bit */ { mask_rtc_irq_bit(RTC_AIE); @@ -265,6 +276,7 @@ set_rtc_irq_bit(RTC_UIE); return 0; } +#endif case RTC_ALM_READ: /* Read the present alarm time */ { /* @@ -398,6 +410,7 @@ spin_unlock_irqrestore(&rtc_lock, flags); return 0; } +#ifndef __alpha__ case RTC_IRQP_READ: /* Read the periodic IRQ rate. */ { return put_user(rtc_freq, (unsigned long *)arg); @@ -437,7 +450,7 @@ spin_unlock_irqrestore(&rtc_lock, flags); return 0; } -#ifdef __alpha__ +#else case RTC_EPOCH_READ: /* Read the epoch. */ { return put_user (epoch, (unsigned long *)arg); @@ -494,13 +507,14 @@ static int rtc_release(struct inode *inode, struct file *file) { + unsigned long flags; +#ifndef __alpha__ /* * Turn off all interrupts once the device is no longer * in use, and clear the data. */ unsigned char tmp; - unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); tmp = CMOS_READ(RTC_CONTROL); @@ -520,6 +534,7 @@ rtc_fasync (-1, file, 0); } +#endif MOD_DEC_USE_COUNT; spin_lock_irqsave (&rtc_lock, flags); @@ -529,6 +544,7 @@ return 0; } +#ifndef __alpha__ static unsigned int rtc_poll(struct file *file, poll_table *wait) { unsigned long l, flags; @@ -543,6 +559,7 @@ return POLLIN | POLLRDNORM; return 0; } +#endif /* * The various file operations we support. @@ -551,7 +568,9 @@ static struct file_operations rtc_fops = { llseek: rtc_llseek, read: rtc_read, +#ifndef __alpha__ poll: rtc_poll, +#endif ioctl: rtc_ioctl, open: rtc_open, release: rtc_release, @@ -612,12 +631,14 @@ return -EIO; } +#ifndef __alpha__ if(request_irq(RTC_IRQ, rtc_interrupt, SA_INTERRUPT, "rtc", NULL)) { /* Yeah right, seeing as irq 8 doesn't even hit the bus. 
*/ printk(KERN_ERR "rtc: IRQ %d is not free.\n", RTC_IRQ); return -EIO; } +#endif request_region(RTC_PORT(0), RTC_IO_EXTENT, "rtc"); #endif /* __sparc__ vs. others */ @@ -654,12 +675,14 @@ if (guess) printk("rtc: %s epoch (%lu) detected\n", guess, epoch); #endif +#ifndef __alpha__ init_timer(&rtc_irq_timer); rtc_irq_timer.function = rtc_dropped_irq; spin_lock_irqsave(&rtc_lock, flags); /* Initialize periodic freq. to CMOS reset default, which is 1024Hz */ CMOS_WRITE(((CMOS_READ(RTC_FREQ_SELECT) & 0xF0) | 0x06), RTC_FREQ_SELECT); spin_unlock_irqrestore(&rtc_lock, flags); +#endif rtc_freq = 1024; printk(KERN_INFO "Real Time Clock Driver v" RTC_VERSION "\n"); @@ -689,6 +712,7 @@ module_exit(rtc_exit); EXPORT_NO_SYMBOLS; +#ifndef __alpha__ /* * At IRQ rates >= 4096Hz, an interrupt may get lost altogether. * (usually during an IDE disk interrupt, with IRQ unmasking off) @@ -714,6 +738,7 @@ rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0); /* restart */ spin_unlock_irqrestore(&rtc_lock, flags); } +#endif /* * Info exported via "/proc/driver/rtc". @@ -902,6 +927,7 @@ } } +#ifndef __alpha__ /* * Used to disable/enable interrupts for any one of UIE, AIE, PIE. * Rumour has it that if you frob the interrupt enable/disable @@ -939,3 +965,4 @@ rtc_irq_data = 0; spin_unlock_irqrestore(&rtc_lock, flags); } +#endif diff -urN 2.3.46pre1/drivers/i2o/i2o_block.c 2.3.46pre1aa1/drivers/i2o/i2o_block.c --- 2.3.46pre1/drivers/i2o/i2o_block.c Sun Jan 30 15:43:28 2000 +++ 2.3.46pre1aa1/drivers/i2o/i2o_block.c Wed Feb 16 00:28:25 2000 @@ -461,7 +461,7 @@ struct i2ob_device *dev; u32 m; - while (CURRENT) { + while (!QUEUE_EMPTY) { /* * On an IRQ completion if there is an inactive * request on the queue head it means it isnt yet @@ -515,8 +515,7 @@ } } req->errors = 0; - CURRENT = CURRENT->next; - req->next = NULL; + blkdev_dequeue_request(req); req->sem = NULL; ireq = i2ob_qhead; diff -urN 2.3.46pre1/drivers/scsi/scsi.c 2.3.46pre1aa1/drivers/scsi/scsi.c --- 2.3.46pre1/drivers/scsi/scsi.c Fri Feb 11 00:05:35 2000 +++ 2.3.46pre1aa1/drivers/scsi/scsi.c Wed Feb 16 00:28:25 2000 @@ -2193,19 +2193,24 @@ /* Now dump the request lists for each block device */ printk("Dump of pending block device requests\n"); for (i = 0; i < MAX_BLKDEV; i++) { - if (blk_dev[i].request_queue.current_request) { + struct list_head * queue_head; + + queue_head = &blk_dev[i].request_queue.queue_head; + if (!list_empty(queue_head)) { struct request *req; + struct list_head * entry; + printk("%d: ", i); - req = blk_dev[i].request_queue.current_request; - while (req) { + entry = queue_head->next; + do { + req = blkdev_entry_to_request(entry); printk("(%s %d %ld %ld %ld) ", kdevname(req->rq_dev), req->cmd, req->sector, req->nr_sectors, req->current_nr_sectors); - req = req->next; - } + } while ((entry = entry->next) != queue_head); printk("\n"); } } diff -urN 2.3.46pre1/drivers/scsi/scsi_lib.c 2.3.46pre1aa1/drivers/scsi/scsi_lib.c --- 2.3.46pre1/drivers/scsi/scsi_lib.c Fri Feb 11 00:05:35 2000 +++ 2.3.46pre1aa1/drivers/scsi/scsi_lib.c Wed Feb 16 00:28:25 2000 @@ -86,6 +86,7 @@ q = &SCpnt->device->request_queue; SCpnt->request.cmd = SPECIAL; SCpnt->request.special = (void *) SCpnt; + SCpnt->request.q = NULL; /* * We have the option of inserting the head or the tail of the queue. 
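The two scsi_lib.c hunks around this point replace open-coded pointer juggling on q->current_request with the generic list macros. A minimal sketch of the new idiom, not part of the patch itself (sketch_requeue is an illustrative name; the struct request "queue" member and the queue_head list come from the blkdev.h changes later in this patch):

static void sketch_requeue(request_queue_t *q, struct request *req,
			   int at_head)
{
	if (at_head)
		/* back to the front: next request the driver sees */
		list_add(&req->queue, &q->queue_head);
	else
		/* to the back: handled after everything already queued */
		list_add_tail(&req->queue, &q->queue_head);
}

Either way a later blkdev_dequeue_request() removes the request in O(1), without the caller having to know at which end it was inserted.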
@@ -96,8 +97,7 @@ spin_lock_irqsave(&io_request_lock, flags); if (at_head) { - SCpnt->request.next = q->current_request; - q->current_request = &SCpnt->request; + list_add(&SCpnt->request.queue, &q->queue_head); } else { /* * FIXME(eric) - we always insert at the tail of the @@ -107,19 +107,7 @@ * request might not float high enough in the queue * to be scheduled. */ - SCpnt->request.next = NULL; - if (q->current_request == NULL) { - q->current_request = &SCpnt->request; - } else { - struct request *req; - - for (req = q->current_request; req; req = req->next) { - if (req->next == NULL) { - req->next = &SCpnt->request; - break; - } - } - } + list_add_tail(&SCpnt->request.queue, &q->queue_head); } /* @@ -239,9 +227,8 @@ * in which case we need to request the blocks that come after * the bad sector. */ - SCpnt->request.next = q->current_request; - q->current_request = &SCpnt->request; SCpnt->request.special = (void *) SCpnt; + list_add(&SCpnt->request.queue, &q->queue_head); } /* @@ -260,7 +247,7 @@ * use function pointers to pick the right one. */ if (SDpnt->single_lun - && q->current_request == NULL + && list_empty(&q->queue_head) && SDpnt->device_busy == 0) { request_queue_t *q; @@ -850,18 +837,18 @@ } /* - * Loop through all of the requests in this queue, and find - * one that is queueable. - */ - req = q->current_request; - - /* * If we couldn't find a request that could be queued, then we * can also quit. */ - if (!req) { + if (list_empty(&q->queue_head)) break; - } + + /* + * Loop through all of the requests in this queue, and find + * one that is queueable. + */ + req = blkdev_entry_next_request(&q->queue_head); + /* * Find the actual device driver associated with this command. * The SPECIAL requests are things like character device or @@ -922,8 +909,7 @@ * reason to search the list, because all of the commands * in this queue are for the same device. */ - q->current_request = req->next; - SCpnt->request.next = NULL; + blkdev_dequeue_request(req); if (req != &SCpnt->request) { memcpy(&SCpnt->request, req, sizeof(struct request)); @@ -932,7 +918,6 @@ * We have copied the data out of the request block - it is now in * a field in SCpnt. Release the request block. */ - req->next = NULL; req->rq_status = RQ_INACTIVE; wake_up(&wait_for_request); } diff -urN 2.3.46pre1/drivers/scsi/scsi_merge.c 2.3.46pre1aa1/drivers/scsi/scsi_merge.c --- 2.3.46pre1/drivers/scsi/scsi_merge.c Fri Feb 11 00:05:35 2000 +++ 2.3.46pre1aa1/drivers/scsi/scsi_merge.c Wed Feb 16 00:28:25 2000 @@ -343,6 +343,7 @@ __inline static int __scsi_merge_fn(request_queue_t * q, struct request *req, struct buffer_head *bh, + int max_segments, int use_clustering, int dma_host) { @@ -357,6 +358,9 @@ count = bh->b_size >> 9; sector = bh->b_rsector; + if (max_segments > 64) + max_segments = 64; + /* * We come in here in one of two cases. The first is that we * are checking to see if we can add the buffer to the end of the @@ -447,10 +451,11 @@ * scsi.c allocates for this purpose * min(64,sg_tablesize) entries. */ - if (req->nr_segments >= 64 && + if (req->nr_segments >= max_segments && req->nr_segments >= SHpnt->sg_tablesize) return 0; req->nr_segments++; + q->nr_segments++; return 1; new_segment: /* @@ -459,20 +464,25 @@ * check if things fit into sg_tablesize. 
*/ if (req->nr_hw_segments >= SHpnt->sg_tablesize || - (req->nr_segments >= 64 && + (req->nr_segments >= max_segments && req->nr_segments >= SHpnt->sg_tablesize)) return 0; + if (req->nr_segments >= max_segments) + return 0; req->nr_hw_segments++; req->nr_segments++; + q->nr_segments++; return 1; #else new_segment: - if (req->nr_segments < SHpnt->sg_tablesize) { + if (req->nr_segments < SHpnt->sg_tablesize && + req->nr_segments < max_segments) { /* * This will form the start of a new segment. Bump the * counter. */ req->nr_segments++; + q->nr_segments++; return 1; } else { return 0; @@ -500,11 +510,12 @@ #define MERGEFCT(_FUNCTION, _CLUSTER, _DMA) \ static int _FUNCTION(request_queue_t * q, \ struct request * req, \ - struct buffer_head * bh) \ + struct buffer_head * bh, \ + int max_segments) \ { \ int ret; \ SANITY_CHECK(req, _CLUSTER, _DMA); \ - ret = __scsi_merge_fn(q, req, bh, _CLUSTER, _DMA); \ + ret = __scsi_merge_fn(q, req, bh, max_segments, _CLUSTER, _DMA); \ return ret; \ } @@ -550,6 +561,7 @@ __inline static int __scsi_merge_requests_fn(request_queue_t * q, struct request *req, struct request *next, + int max_segments, int use_clustering, int dma_host) { @@ -559,11 +571,14 @@ SDpnt = (Scsi_Device *) q->queuedata; SHpnt = SDpnt->host; + if (max_segments > 64) + max_segments = 64; + #ifdef DMA_CHUNK_SIZE /* If it would not fit into prepared memory space for sg chain, * then don't allow the merge. */ - if (req->nr_segments + next->nr_segments - 1 > 64 && + if (req->nr_segments + next->nr_segments - 1 > max_segments && req->nr_segments + next->nr_segments - 1 > SHpnt->sg_tablesize) { return 0; } @@ -619,6 +634,7 @@ * This one is OK. Let it go. */ req->nr_segments += next->nr_segments - 1; + q->nr_segments--; #ifdef DMA_CHUNK_SIZE req->nr_hw_segments += next->nr_hw_segments - 1; #endif @@ -627,7 +643,7 @@ } dont_combine: #ifdef DMA_CHUNK_SIZE - if (req->nr_segments + next->nr_segments > 64 && + if (req->nr_segments + next->nr_segments > max_segments && req->nr_segments + next->nr_segments > SHpnt->sg_tablesize) { return 0; } @@ -650,7 +666,8 @@ * Make sure we can fix something that is the sum of the two. * A slightly stricter test than we had above. 
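 * (With this patch, the test below refuses the merge only when the combined segment count exceeds both the caller-supplied max_segments, itself clamped to 64 at the top of this helper, and the adapter's sg_tablesize.)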
*/ - if (req->nr_segments + next->nr_segments > SHpnt->sg_tablesize) { + if (req->nr_segments + next->nr_segments > max_segments && + req->nr_segments + next->nr_segments > SHpnt->sg_tablesize) { return 0; } else { /* @@ -683,11 +700,12 @@ #define MERGEREQFCT(_FUNCTION, _CLUSTER, _DMA) \ static int _FUNCTION(request_queue_t * q, \ struct request * req, \ - struct request * next) \ + struct request * next, \ + int max_segments) \ { \ int ret; \ SANITY_CHECK(req, _CLUSTER, _DMA); \ - ret = __scsi_merge_requests_fn(q, req, next, _CLUSTER, _DMA); \ + ret = __scsi_merge_requests_fn(q, req, next, max_segments, _CLUSTER, _DMA); \ return ret; \ } diff -urN 2.3.46pre1/fs/buffer.c 2.3.46pre1aa1/fs/buffer.c --- 2.3.46pre1/fs/buffer.c Fri Feb 11 00:05:36 2000 +++ 2.3.46pre1aa1/fs/buffer.c Wed Feb 16 00:28:25 2000 @@ -148,9 +148,9 @@ atomic_inc(&bh->b_count); add_wait_queue(&bh->b_wait, &wait); repeat: - run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (buffer_locked(bh)) { + run_task_queue(&tq_disk); schedule(); goto repeat; } diff -urN 2.3.46pre1/fs/dcache.c 2.3.46pre1aa1/fs/dcache.c --- 2.3.46pre1/fs/dcache.c Sun Jan 30 15:43:39 2000 +++ 2.3.46pre1aa1/fs/dcache.c Wed Feb 16 00:28:25 2000 @@ -57,6 +57,15 @@ int dummy[2]; } dentry_stat = {0, 0, 45, 0,}; +struct { + /* Enlarging too much is not a good idea since an overly large cache + may generate too many collisions in the hash, potentially + slowing down the system. */ + int limit_percent; +} dcache_ctl = { 2, }; +int dcache_ctl_min[] = { 0, }; +int dcache_ctl_max[] = { 100, }; + static inline void d_free(struct dentry *dentry) { if (dentry->d_op && dentry->d_op->d_release) @@ -428,6 +437,20 @@ return 0; } +static inline void preshrink_dcache_memory(void) +{ + unsigned long size, limit; + + size = (dentry_stat.nr_unused * sizeof(struct dentry)) >> PAGE_SHIFT; + limit = num_physpages * dcache_ctl.limit_percent / 100; + if (size > limit) + { + lock_kernel(); + prune_dcache(dentry_stat.nr_unused >> 2); + unlock_kernel(); + } +} + #define NAME_ALLOC_LEN(len) ((len+16) & ~15) struct dentry * d_alloc(struct dentry * parent, const struct qstr *name) @@ -435,6 +458,7 @@ char * str; struct dentry *dentry; + preshrink_dcache_memory(); dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) return NULL; diff -urN 2.3.46pre1/fs/exec.c 2.3.46pre1aa1/fs/exec.c --- 2.3.46pre1/fs/exec.c Fri Feb 11 00:05:36 2000 +++ 2.3.46pre1aa1/fs/exec.c Wed Feb 16 00:28:25 2000 @@ -277,13 +277,13 @@ pmd = pmd_alloc(pgd, address); if (!pmd) { __free_page(page); - oom(tsk); + force_sig(SIGKILL, tsk); return; } pte = pte_alloc(pmd, address); if (!pte) { __free_page(page); - oom(tsk); + force_sig(SIGKILL, tsk); return; } if (!pte_none(*pte)) { diff -urN 2.3.46pre1/fs/inode.c 2.3.46pre1aa1/fs/inode.c --- 2.3.46pre1/fs/inode.c Tue Feb 15 03:06:49 2000 +++ 2.3.46pre1aa1/fs/inode.c Wed Feb 16 00:28:25 2000 @@ -70,10 +70,19 @@ int dummy[5]; } inodes_stat = {0, 0,}; +struct { + /* Enlarging too much is not a good idea since an overly large cache + may generate too many collisions in the hash, potentially + slowing down the system.
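+ For example, with the default limit_percent of 2 on a machine with 128 MB of RAM (32768 pages of 4 KB, purely as an illustration), the preshrink helper below starts pruning a quarter of the unused inodes at a time once the unused inodes alone occupy more than 655 pages, i.e. roughly 2.5 MB.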
*/ + int limit_percent; +} icache_ctl = { 2, }; +int icache_ctl_min[] = { 0, }; +int icache_ctl_max[] = { 100, }; + static kmem_cache_t * inode_cachep; #define alloc_inode() \ - ((struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL)) + (preshrink_icache_memory(), (struct inode *) kmem_cache_alloc(inode_cachep, SLAB_KERNEL)) #define destroy_inode(inode) kmem_cache_free(inode_cachep, (inode)) /* @@ -411,6 +420,16 @@ kmem_cache_shrink(inode_cachep); return 0; +} + +static inline void preshrink_icache_memory(void) +{ + unsigned long size, limit; + + size = (inodes_stat.nr_unused * sizeof(struct inode)) >> PAGE_SHIFT; + limit = num_physpages * icache_ctl.limit_percent / 100; + if (size > limit) + prune_icache(inodes_stat.nr_unused >> 2); } static inline void __iget(struct inode * inode) diff -urN 2.3.46pre1/fs/partitions/check.c 2.3.46pre1aa1/fs/partitions/check.c --- 2.3.46pre1/fs/partitions/check.c Fri Feb 11 00:05:37 2000 +++ 2.3.46pre1aa1/fs/partitions/check.c Wed Feb 16 00:28:24 2000 @@ -37,6 +37,11 @@ extern void rd_load(void); extern void initrd_load(void); +#if defined CONFIG_BLK_DEV_LVM || defined CONFIG_BLK_DEV_LVM_MODULE +#include <linux/lvm.h> +void ( *lvm_hd_name_ptr) ( char *, int) = NULL; +#endif + struct gendisk *gendisk_head; static int (*check_part[])(struct gendisk *hd, kdev_t dev, unsigned long first_sect, int first_minor) = { @@ -88,6 +93,13 @@ * This requires special handling here. */ switch (hd->major) { +#if defined CONFIG_BLK_DEV_LVM || defined CONFIG_BLK_DEV_LVM_MODULE + case LVM_BLK_MAJOR: + *buf = 0; + if ( lvm_hd_name_ptr != NULL) + ( lvm_hd_name_ptr) ( buf, minor); + return buf; +#endif case IDE9_MAJOR: unit += 2; case IDE8_MAJOR: diff -urN 2.3.46pre1/include/asm-alpha/hardirq.h 2.3.46pre1aa1/include/asm-alpha/hardirq.h --- 2.3.46pre1/include/asm-alpha/hardirq.h Tue Feb 15 16:37:45 2000 +++ 2.3.46pre1aa1/include/asm-alpha/hardirq.h Wed Feb 16 00:28:24 2000 @@ -8,8 +8,11 @@ #ifndef __SMP__ extern int __local_irq_count; #define local_irq_count(cpu) ((void)(cpu), __local_irq_count) +extern unsigned long __irq_attempt[]; +#define irq_attempt(cpu, irq) ((void)(cpu), __irq_attempt[irq]) #else #define local_irq_count(cpu) (cpu_data[cpu].irq_count) +#define irq_attempt(cpu, irq) (cpu_data[cpu].irq_attempt[irq]) #endif /* diff -urN 2.3.46pre1/include/asm-alpha/hw_irq.h 2.3.46pre1aa1/include/asm-alpha/hw_irq.h --- 2.3.46pre1/include/asm-alpha/hw_irq.h Tue Feb 15 03:16:53 2000 +++ 2.3.46pre1aa1/include/asm-alpha/hw_irq.h Wed Feb 16 00:28:24 2000 @@ -18,21 +18,22 @@ outb(0, DMA1_CLR_MASK_REG); \ outb(0, DMA2_CLR_MASK_REG) -extern unsigned long _alpha_irq_masks[2]; -#define alpha_irq_mask _alpha_irq_masks[0] - extern void common_ack_irq(unsigned long irq); extern void isa_device_interrupt(unsigned long vector, struct pt_regs * regs); extern void srm_device_interrupt(unsigned long vector, struct pt_regs * regs); -extern void handle_irq(int irq, int ack, struct pt_regs * regs); +extern void handle_irq(int irq, struct pt_regs * regs); #define RTC_IRQ 8 +#if 0 /* on Alpha we want to use only the RTC as the timer, for SMP issues */ #ifdef CONFIG_RTC #define TIMER_IRQ 0 /* timer is the pit */ #else #define TIMER_IRQ RTC_IRQ /* timer is the rtc */ #endif +#else +#define TIMER_IRQ RTC_IRQ /* timer is the rtc */ +#endif /* * PROBE_MASK is the bitset of irqs that we consider for autoprobing.
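The irq_attempt() macro added to hardirq.h above gives the alpha a per-irq delivery counter on both UP and SMP (per-cpu in the SMP case, via the new irq_attempt[] field added to struct cpuinfo_alpha below). A sketch of the intended use from the arch interrupt path; illustrative only, since the arch/alpha/kernel side of this patch is not among these hunks, and note that the UP variant relies on gcc's comma-expression lvalue extension of that era:

static void sketch_count_irq(int irq)
{
	int cpu = smp_processor_id();

	/* irq_attempt() expands to an lvalue, so it can be incremented */
	irq_attempt(cpu, irq)++;
}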
@@ -71,10 +72,11 @@ #endif -extern char _stext; static inline void alpha_do_profile (unsigned long pc) { if (prof_buffer && current->pid) { + extern char _stext; + pc -= (unsigned long) &_stext; pc >>= prof_shift; /* @@ -87,5 +89,10 @@ atomic_inc((atomic_t *)&prof_buffer[pc]); } } + +static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) {} +extern void no_action(int cpl, void *dev_id, struct pt_regs *regs); +extern void init_ISA_irqs(void); +extern void init_RTC_irq(void); #endif diff -urN 2.3.46pre1/include/asm-alpha/pgalloc.h 2.3.46pre1aa1/include/asm-alpha/pgalloc.h --- 2.3.46pre1/include/asm-alpha/pgalloc.h Tue Feb 15 03:15:06 2000 +++ 2.3.46pre1aa1/include/asm-alpha/pgalloc.h Wed Feb 16 00:28:24 2000 @@ -3,13 +3,28 @@ #include -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) +/* The icache is not coherent with the dcache on the alpha, so before + running self-modified code we must always issue an imb(). + Using flush_cache_all() for this is real overkill, as it is called from + vmalloc() before accessing pagetables, and on the alpha we are not required + to flush the icache before doing that; but the semantics of flush_cache_all() + require us to flush _all_ the caches, so we must be correct here. It is + really vmalloc() that should be changed to use a more fine-grained cache + flush operation (I suspect other archs don't need an icache + flush while handling pagetables either). OTOH vmalloc is not a performance-critical + path, so we can live with it for now. */ +#define flush_cache_all() flush_icache_range(0, 0) #define flush_cache_mm(mm) do { } while (0) #define flush_cache_range(mm, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr) do { } while (0) #define flush_page_to_ram(page) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) +#ifndef __SMP__ +#define flush_icache_range(start, end) imb() +#else +#define flush_icache_range(start, end) smp_imb() +extern void smp_imb(void); +#endif +#define flush_icache_page(vma,pg) do { } while (0) /* * Use a few helper functions to hide the ugly broken ASN diff -urN 2.3.46pre1/include/asm-alpha/smp.h 2.3.46pre1aa1/include/asm-alpha/smp.h --- 2.3.46pre1/include/asm-alpha/smp.h Tue Feb 15 03:15:06 2000 +++ 2.3.46pre1aa1/include/asm-alpha/smp.h Wed Feb 16 00:28:24 2000 @@ -20,6 +20,7 @@ #ifdef __SMP__ #include +#include struct cpuinfo_alpha { unsigned long loops_per_sec; @@ -28,6 +29,8 @@ unsigned long *pte_cache; unsigned long pgtable_cache_sz; unsigned long ipi_count; + unsigned long irq_attempt[NR_IRQS]; + unsigned long smp_local_irq_count; unsigned long prof_multiplier; unsigned long prof_counter; int irq_count, bh_count; diff -urN 2.3.46pre1/include/linux/blk.h 2.3.46pre1aa1/include/linux/blk.h --- 2.3.46pre1/include/linux/blk.h Tue Feb 15 03:17:30 2000 +++ 2.3.46pre1aa1/include/linux/blk.h Wed Feb 16 00:28:25 2000 @@ -96,6 +96,18 @@ * code duplication in drivers.
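 * (blkdev_dequeue_request() below is the helper drivers now call instead of advancing CURRENT by hand: it unlinks the request from the queue's list and, while the request still belongs to a queue, removes its contribution from the queue's nr_segments and, for reads, from the elevator's read_pendings.)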
*/ +extern inline void blkdev_dequeue_request(struct request * req) +{ + if (req->q) + { + if (req->cmd == READ) + req->q->elevator.read_pendings--; + req->q->nr_segments -= req->nr_segments; + req->q = NULL; + } + list_del(&req->queue); +} + int end_that_request_first(struct request *req, int uptodate, char *name); void end_that_request_last(struct request *req); @@ -373,7 +385,10 @@ #if !defined(IDE_DRIVER) #ifndef CURRENT -#define CURRENT (blk_dev[MAJOR_NR].request_queue.current_request) +#define CURRENT blkdev_entry_next_request(&blk_dev[MAJOR_NR].request_queue.queue_head) +#endif +#ifndef QUEUE_EMPTY +#define QUEUE_EMPTY list_empty(&blk_dev[MAJOR_NR].request_queue.queue_head) #endif #ifndef DEVICE_NAME @@ -418,7 +433,7 @@ #endif #define INIT_REQUEST \ - if (!CURRENT) {\ + if (QUEUE_EMPTY) {\ CLEAR_INTR; \ return; \ } \ @@ -446,7 +461,7 @@ add_blkdev_randomness(MAJOR(req->rq_dev)); #endif DEVICE_OFF(req->rq_dev); - CURRENT = req->next; + blkdev_dequeue_request(req); end_that_request_last(req); } diff -urN 2.3.46pre1/include/linux/blkdev.h 2.3.46pre1aa1/include/linux/blkdev.h --- 2.3.46pre1/include/linux/blkdev.h Tue Feb 15 03:15:06 2000 +++ 2.3.46pre1aa1/include/linux/blkdev.h Wed Feb 16 00:28:25 2000 @@ -5,6 +5,10 @@ #include #include #include +#include + +struct request_queue; +typedef struct request_queue request_queue_t; /* * Ok, this is an expanded form so that we can use the same @@ -13,6 +17,9 @@ * for read/write completion. */ struct request { + struct list_head queue; + int elevator_sequence; + volatile int rq_status; /* should split this into a few status bits */ #define RQ_INACTIVE (-1) #define RQ_ACTIVE 1 @@ -33,25 +40,39 @@ struct semaphore * sem; struct buffer_head * bh; struct buffer_head * bhtail; - struct request * next; + request_queue_t * q; }; -typedef struct request_queue request_queue_t; typedef int (merge_request_fn) (request_queue_t *q, struct request *req, - struct buffer_head *bh); + struct buffer_head *bh, + int); typedef int (merge_requests_fn) (request_queue_t *q, struct request *req, - struct request *req2); + struct request *req2, + int); typedef void (request_fn_proc) (request_queue_t *q); typedef request_queue_t * (queue_proc) (kdev_t dev); typedef void (make_request_fn) (int rw, struct buffer_head *bh); typedef void (plug_device_fn) (request_queue_t *q, kdev_t device); typedef void (unplug_device_fn) (void *q); +typedef struct elevator_s +{ + int sequence; + int read_latency; + int write_latency; + int max_bomb_segments; + int read_pendings; +} elevator_t; + struct request_queue { - struct request * current_request; + struct list_head queue_head; + /* together with queue_head for cacheline sharing */ + elevator_t elevator; + unsigned int nr_segments; + request_fn_proc * request_fn; merge_request_fn * merge_fn; merge_requests_fn * merge_requests_fn; @@ -108,6 +129,7 @@ extern void register_disk(struct gendisk *dev, kdev_t first, unsigned minors, struct block_device_operations *ops, long size); extern void generic_unplug_device(void * data); extern void generic_plug_device (request_queue_t *q, kdev_t dev); +extern void plug_device_noop(request_queue_t *q, kdev_t dev); extern void generic_make_request(int rw, struct buffer_head * bh); extern request_queue_t * blk_get_queue(kdev_t dev); @@ -141,5 +163,13 @@ /* read-ahead in pages.. 
*/ #define MAX_READAHEAD 31 #define MIN_READAHEAD 3 + +#define ELEVATOR_DEFAULTS ((elevator_t) { 0, NR_REQUEST>>1, NR_REQUEST<<5, 4, 0, }) + +#define blkdev_entry_to_request(entry) list_entry((entry), struct request, queue) +#define blkdev_entry_next_request(entry) blkdev_entry_to_request((entry)->next) +#define blkdev_entry_prev_request(entry) blkdev_entry_to_request((entry)->prev) +#define blkdev_next_request(req) blkdev_entry_to_request((req)->queue.next) +#define blkdev_prev_request(req) blkdev_entry_to_request((req)->queue.prev) #endif diff -urN 2.3.46pre1/include/linux/irq.h 2.3.46pre1aa1/include/linux/irq.h --- 2.3.46pre1/include/linux/irq.h Tue Feb 15 03:17:05 2000 +++ 2.3.46pre1aa1/include/linux/irq.h Wed Feb 16 00:28:24 2000 @@ -11,6 +11,7 @@ #define IRQ_REPLAY 8 /* IRQ has been replayed but not acked yet */ #define IRQ_AUTODETECT 16 /* IRQ is being autodetected */ #define IRQ_WAITING 32 /* IRQ not yet seen - for autodetection */ +#define IRQ_LEVEL 64 /* IRQ level triggered */ /* * Interrupt controller descriptor. This is all we need diff -urN 2.3.46pre1/include/linux/lvm.h 2.3.46pre1aa1/include/linux/lvm.h --- 2.3.46pre1/include/linux/lvm.h Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/include/linux/lvm.h Wed Feb 16 00:28:24 2000 @@ -0,0 +1,827 @@ +/* + * kernel/lvm.h + * + * Copyright (C) 1997 - 2000 Heinz Mauelshagen, Germany + * + * February-November 1997 + * May-July 1998 + * January-March, July, September, October, December 1999 + * January 2000 + * + * lvm is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * lvm is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + */ + +/* + * Changelog + * + * 10/10/1997 - beginning of new structure creation + * 12/05/1998 - incorporated structures from lvm_v1.h and deleted lvm_v1.h + * 07/06/1998 - avoided LVM_KMALLOC_MAX define by using vmalloc/vfree + * instead of kmalloc/kfree + * 01/07/1998 - fixed wrong LVM_MAX_SIZE + * 07/07/1998 - extended pe_t structure by ios member (for statistics) + * 02/08/1998 - changes for official char/block major numbers + * 07/08/1998 - avoided making init_module() and cleanup_module() static + * 29/08/1998 - separated core and disk structure type definitions + * 01/09/1998 - merged kernel integration version (mike) + * 20/01/1999 - added LVM_PE_DISK_OFFSET macro for use in + * vg_read_with_pv_and_lv(), pv_move_pe(), pv_show_pe_text()...
+ 18/02/1999 - added definition of time_disk_t structure; + * keeps time stamps on disk for nonatomic writes (future) + * 15/03/1999 - corrected LV() and VG() macro definition to use argument + * instead of minor + * 03/07/1999 - define for genhd.c name handling + * 23/07/1999 - implemented snapshot part + * 08/12/1999 - changed LVM_LV_SIZE_MAX macro to reflect current 1TB limit + * 01/01/2000 - extended lv_v2 core structure by wait_queue member + * + */ + + +#ifndef _LVM_H_INCLUDE +#define _LVM_H_INCLUDE + +#define _LVM_H_VERSION "LVM 0.8 (1/1/2000)" + +/* + * preprocessor definitions + */ +/* if you like emergency reset code in the driver */ +#define LVM_TOTAL_RESET + +#define LVM_GET_INODE +#define LVM_HD_NAME + +/* lots of debugging output (see driver source) +#define DEBUG_LVM_GET_INFO +#define DEBUG +#define DEBUG_MAP +#define DEBUG_MAP_SIZE +#define DEBUG_IOCTL +#define DEBUG_READ +#define DEBUG_GENDISK +#define DEBUG_VG_CREATE +#define DEBUG_LVM_BLK_OPEN +#define DEBUG_VFREE +#define DEBUG_SNAPSHOT +*/ +/* + * end of preprocessor definitions + */ + +#ifndef LINUX_VERSION_CODE +# include + /* for 2.0.x series */ +# ifndef KERNEL_VERSION +# define KERNEL_VERSION(a,b,c) (((a) << 16) + ((b) << 8) + (c)) +# endif +#endif + +#include +#include +#include +#if LINUX_VERSION_CODE >= KERNEL_VERSION ( 2, 3 ,0) +# include +#else +# include +#endif + +/* leave this for now until major.h is updated (mike) */ +#ifndef LVM_BLK_MAJOR +# define LVM_BLK_MAJOR 58 +#endif +#ifndef LVM_CHAR_MAJOR +# define LVM_CHAR_MAJOR 109 +#endif + +#if !defined ( LVM_BLK_MAJOR) || !defined ( LVM_CHAR_MAJOR) + #error Bad include/linux/major.h - LVM MAJOR undefined +#endif + + +#if LINUX_VERSION_CODE < KERNEL_VERSION ( 2, 1 ,0) +# ifndef uint8_t +# define uint8_t __u8 +# endif +# ifndef uint16_t +# define uint16_t __u16 +# endif +# ifndef uint32_t +# define uint32_t __u32 +# endif +# ifndef uint64_t +# define uint64_t __u64 +# endif +#endif + +#define LVM_STRUCT_VERSION 1 /* structure version */ + +#ifndef min +#define min(a,b) (((a)<(b))?(a):(b)) +#endif +#ifndef max +#define max(a,b) (((a)>(b))?(a):(b)) +#endif + +/* set the default structure version */ +#if ( LVM_STRUCT_VERSION == 1) +# define pv_t pv_v1_t +# define lv_t lv_v2_t +# define vg_t vg_v1_t +# define pv_disk_t pv_disk_v1_t +# define lv_disk_t lv_disk_v1_t +# define vg_disk_t vg_disk_v1_t +# define lv_exception_t lv_v2_exception_t +#endif + + +/* + * i/o protocol version + * + * defined here for the driver and defined separately in the + * user-land LVM parts + * + */ +#define LVM_DRIVER_IOP_VERSION 6 + +#define LVM_NAME "lvm" + +/* + * VG/LV indexing macros + */ +/* character minor maps directly to volume group */ +#define VG_CHR(a) ( a) + +/* block minor indexes into a volume group/logical volume indirection table */ +#define VG_BLK(a) ( vg_lv_map[a].vg_number) +#define LV_BLK(a) ( vg_lv_map[a].lv_number) + +/* + * absolute limits for VGs, PVs per VG and LVs per VG + */ +#define ABS_MAX_VG 99 +#define ABS_MAX_PV 256 +#define ABS_MAX_LV 256 /* caused by 8 bit minor */ + +#define MAX_VG ABS_MAX_VG +#define MAX_LV ABS_MAX_LV +#define MAX_PV ABS_MAX_PV + +#if ( MAX_VG > ABS_MAX_VG) +# undef MAX_VG +# define MAX_VG ABS_MAX_VG +#endif + +#if ( MAX_LV > ABS_MAX_LV) +# undef MAX_LV +# define MAX_LV ABS_MAX_LV +#endif + + +/* + * VGDA: default disk spaces and offsets + * + * there's space after the structures for later extensions.
+ * + * offset what size + * --------------- ---------------------------------- ------------ + * 0 physical volume structure ~500 byte + * + * 1K volume group structure ~200 byte + * + * 5K time stamp structure ~ + * + * 6K namelist of physical volumes 128 byte each + * + * 6k + n * 128byte n logical volume structures ~300 byte each + * + * + m * 328byte m physical extent alloc. structs 4 byte each + * + * End of disk - first physical extent typical 4 megabyte + * PE total * + * PE size + * + * + */ + +/* DONT TOUCH THESE !!! */ +/* base of PV structure in disk partition */ +#define LVM_PV_DISK_BASE 0L + +/* size reserved for PV structure on disk */ +#define LVM_PV_DISK_SIZE 1024L + +/* base of VG structure in disk partition */ +#define LVM_VG_DISK_BASE LVM_PV_DISK_SIZE + +/* size reserved for VG structure */ +#define LVM_VG_DISK_SIZE ( 9 * 512L) + +/* size reserved for timekeeping */ +#define LVM_TIMESTAMP_DISK_BASE ( LVM_VG_DISK_BASE + LVM_VG_DISK_SIZE) +#define LVM_TIMESTAMP_DISK_SIZE 512L /* reserved for timekeeping */ + +/* name list of physical volumes on disk */ +#define LVM_PV_NAMELIST_DISK_BASE ( LVM_TIMESTAMP_DISK_BASE + \ + LVM_TIMESTAMP_DISK_SIZE) + +/* now for the dynamically calculated parts of the VGDA */ +#define LVM_LV_DISK_OFFSET(a, b) ( (a)->lv_on_disk.base + sizeof ( lv_t) * b) +#define LVM_DISK_SIZE(pv) ( (pv)->pe_on_disk.base + \ + (pv)->pe_on_disk.size) +#define LVM_PE_DISK_OFFSET(pe, pv) ( pe * pv->pe_size + \ + ( LVM_DISK_SIZE ( pv) / SECTOR_SIZE)) +#define LVM_PE_ON_DISK_BASE(pv) \ + { int rest; \ + pv->pe_on_disk.base = pv->lv_on_disk.base + pv->lv_on_disk.size; \ + if ( ( rest = pv->pe_on_disk.base % SECTOR_SIZE) != 0) \ + pv->pe_on_disk.base += ( SECTOR_SIZE - rest); \ + } +/* END default disk spaces and offsets for PVs */ + + +/* + * LVM_PE_T_MAX corresponds to: + * + * 8KB PE size can map a ~512 MB logical volume at the cost of 1MB memory, + * + * 128MB PE size can map a 8TB logical volume at the same cost of memory. + * + * Default PE size of 4 MB gives a maximum logical volume size of 256 GB. + * + * Maximum PE size of 16GB gives a maximum logical volume size of 1024 TB. + * + * AFAIK, the actual kernels limit this to 1 TB. + * + * Should be a sufficient spectrum ;*) + */ + +/* This is the usable size of disk_pe_t.le_num !!! v v */ +#define LVM_PE_T_MAX ( ( 1 << ( sizeof ( uint16_t) * 8)) - 2) + +#define LVM_LV_SIZE_MAX(a) ( ( long long) LVM_PE_T_MAX * (a)->pe_size > ( long long) 2*1024*1024*1024 ? 
( long long) 2*1024*1024*1024 : ( long long) LVM_PE_T_MAX * (a)->pe_size) +#define LVM_MIN_PE_SIZE ( 8L * 2) /* 8 KB in sectors */ +#define LVM_MAX_PE_SIZE ( 16L * 1024L * 1024L * 2) /* 16GB in sectors */ +#define LVM_DEFAULT_PE_SIZE ( 4096L * 2) /* 4 MB in sectors */ +#define LVM_DEFAULT_STRIPE_SIZE 16L /* 16 KB */ +#define LVM_MIN_STRIPE_SIZE 2L /* 1 KB in sectors */ +#define LVM_MAX_STRIPE_SIZE ( 512L * 2) /* 512 KB in sectors */ +#define LVM_MAX_STRIPES 128 /* max # of stripes */ +#define LVM_MAX_SIZE ( 1024LU * 1024 * 1024 * 2) /* 1TB[sectors] */ +#define LVM_MAX_MIRRORS 2 /* future use */ +#define LVM_MIN_READ_AHEAD 0 /* minimum read ahead sectors */ +#define LVM_MAX_READ_AHEAD 256 /* maximum read ahead sectors */ +#define LVM_DEF_READ_AHEAD ((LVM_MAX_READ_AHEAD-LVM_MIN_READ_AHEAD)/2 + LVM_MIN_READ_AHEAD) +#define LVM_MAX_LV_IO_TIMEOUT 60 /* seconds I/O timeout (future use) */ +#define LVM_PARTITION 0xfe /* LVM partition id */ +#define LVM_NEW_PARTITION 0x8e /* new LVM partition id (10/09/1999) */ +#define LVM_PE_SIZE_PV_SIZE_REL 5 /* max relation PV size and PE size */ + +#define LVM_SNAPSHOT_MAX_CHUNK 256 /* 256 KB */ +#define LVM_SNAPSHOT_DEF_CHUNK 64 /* 64 KB */ +#define LVM_SNAPSHOT_MIN_CHUNK 1 /* 1 KB */ + +#define UNDEF -1 +#define FALSE 0 +#define TRUE 1 + + +/* + * ioctls + */ +/* volume group */ +#define VG_CREATE _IOW ( 0xfe, 0x00, 1) +#define VG_REMOVE _IOW ( 0xfe, 0x01, 1) + +#define VG_EXTEND _IOW ( 0xfe, 0x03, 1) +#define VG_REDUCE _IOW ( 0xfe, 0x04, 1) + +#define VG_STATUS _IOWR ( 0xfe, 0x05, 1) +#define VG_STATUS_GET_COUNT _IOWR ( 0xfe, 0x06, 1) +#define VG_STATUS_GET_NAMELIST _IOWR ( 0xfe, 0x07, 1) + +#define VG_SET_EXTENDABLE _IOW ( 0xfe, 0x08, 1) + + +/* logical volume */ +#define LV_CREATE _IOW ( 0xfe, 0x20, 1) +#define LV_REMOVE _IOW ( 0xfe, 0x21, 1) + +#define LV_ACTIVATE _IO ( 0xfe, 0x22) +#define LV_DEACTIVATE _IO ( 0xfe, 0x23) + +#define LV_EXTEND _IOW ( 0xfe, 0x24, 1) +#define LV_REDUCE _IOW ( 0xfe, 0x25, 1) + +#define LV_STATUS_BYNAME _IOWR ( 0xfe, 0x26, 1) +#define LV_STATUS_BYINDEX _IOWR ( 0xfe, 0x27, 1) + +#define LV_SET_ACCESS _IOW ( 0xfe, 0x28, 1) +#define LV_SET_ALLOCATION _IOW ( 0xfe, 0x29, 1) +#define LV_SET_STATUS _IOW ( 0xfe, 0x2a, 1) + +#define LE_REMAP _IOW ( 0xfe, 0x2b, 1) + + +/* physical volume */ +#define PV_STATUS _IOWR ( 0xfe, 0x40, 1) +#define PV_CHANGE _IOWR ( 0xfe, 0x41, 1) +#define PV_FLUSH _IOW ( 0xfe, 0x42, 1) + +/* physical extent */ +#define PE_LOCK_UNLOCK _IOW ( 0xfe, 0x50, 1) + +/* i/o protocol version */ +#define LVM_GET_IOP_VERSION _IOR ( 0xfe, 0x98, 1) + +#ifdef LVM_TOTAL_RESET +/* special reset function for testing purposes */ +#define LVM_RESET _IO ( 0xfe, 0x99) +#endif + +/* lock the logical volume manager */ +#define LVM_LOCK_LVM _IO ( 0xfe, 0x100) +/* END ioctls */ + + +/* + * Status flags + */ +/* volume group */ +#define VG_ACTIVE 0x01 /* vg_status */ +#define VG_EXPORTED 0x02 /* " */ +#define VG_EXTENDABLE 0x04 /* " */ + +#define VG_READ 0x01 /* vg_access */ +#define VG_WRITE 0x02 /* " */ + +/* logical volume */ +#define LV_ACTIVE 0x01 /* lv_status */ +#define LV_SPINDOWN 0x02 /* " */ + +#define LV_READ 0x01 /* lv_access */ +#define LV_WRITE 0x02 /* " */ +#define LV_SNAPSHOT 0x04 /* " */ +#define LV_SNAPSHOT_ORG 0x08 /* " */ + +#define LV_BADBLOCK_ON 0x01 /* lv_badblock */ + +#define LV_STRICT 0x01 /* lv_allocation */ +#define LV_CONTIGUOUS 0x02 /* " */ + +/* physical volume */ +#define PV_ACTIVE 0x01 /* pv_status */ +#define PV_ALLOCATABLE 0x02 /* pv_allocatable */ + + +/* + * Structure definitions core/disk follow + 
* + * conditional conversion takes place on big endian architectures + * in functions * pv_copy_*(), vg_copy_*() and lv_copy_*() + * + */ + +#define NAME_LEN 128 /* don't change!!! */ +#define UUID_LEN 16 /* don't change!!! */ + +/* remap physical sector/rdev pairs */ +typedef struct { + struct list_head hash; + ulong rsector_org; + kdev_t rdev_org; + ulong rsector_new; + kdev_t rdev_new; +} lv_block_exception_t; + + +/* disk stored pe information */ +typedef struct { + uint16_t lv_num; + uint16_t le_num; +} disk_pe_t; + +/* disk stored PV, VG, LV and PE size and offset information */ +typedef struct { + uint32_t base; + uint32_t size; +} lvm_disk_data_t; + + +/* + * Structure Physical Volume (PV) Version 1 + */ + +/* core */ +typedef struct { + uint8_t id[2]; /* Identifier */ + uint16_t version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_namelist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + uint8_t pv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ + kdev_t pv_dev; + uint32_t pv_number; + uint32_t pv_status; + uint32_t pv_allocatable; + uint32_t pv_size; /* HM */ + uint32_t lv_cur; + uint32_t pe_size; + uint32_t pe_total; + uint32_t pe_allocated; + uint32_t pe_stale; /* for future use */ + disk_pe_t *pe; /* HM */ + struct inode *inode; /* HM */ +} pv_v1_t; + +/* disk */ +typedef struct { + uint8_t id[2]; /* Identifier */ + uint16_t version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_namelist_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + uint8_t pv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ + uint32_t pv_major; + uint32_t pv_number; + uint32_t pv_status; + uint32_t pv_allocatable; + uint32_t pv_size; /* HM */ + uint32_t lv_cur; + uint32_t pe_size; + uint32_t pe_total; + uint32_t pe_allocated; + uint32_t dummy1; + uint32_t dummy2; + uint32_t dummy3; +} pv_disk_v1_t; + + +/* + * Structure Physical Volume (PV) Version 2 (future!) 
+ */ + +typedef struct { + uint8_t id[2]; /* Identifier */ + uint16_t version; /* HM lvm version */ + lvm_disk_data_t pv_on_disk; + lvm_disk_data_t vg_on_disk; + lvm_disk_data_t pv_uuid_on_disk; + lvm_disk_data_t lv_on_disk; + lvm_disk_data_t pe_on_disk; + uint8_t pv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint8_t system_id[NAME_LEN]; /* for vgexport/vgimport */ + kdev_t pv_dev; + uint32_t pv_number; + uint32_t pv_status; + uint32_t pv_allocatable; + uint32_t pv_size; /* HM */ + uint32_t lv_cur; + uint32_t pe_size; + uint32_t pe_total; + uint32_t pe_allocated; + uint32_t pe_stale; /* for future use */ + disk_pe_t *pe; /* HM */ + struct inode *inode; /* HM */ + /* delta to version 1 starts here */ + uint8_t pv_uuid[UUID_LEN]; + uint32_t pv_atime; /* PV access time */ + uint32_t pv_ctime; /* PV creation time */ + uint32_t pv_mtime; /* PV modification time */ +} pv_v2_t; + + +/* + * Structures for Logical Volume (LV) + */ + +/* core PE information */ +typedef struct { + kdev_t dev; + uint32_t pe; /* to be changed if > 2TB */ + uint32_t reads; + uint32_t writes; +} pe_t; + +typedef struct { + uint8_t lv_name[NAME_LEN]; + kdev_t old_dev; + kdev_t new_dev; + ulong old_pe; + ulong new_pe; +} le_remap_req_t; + + + +/* + * Structure Logical Volume (LV) Version 1 + */ + +/* core */ +typedef struct { + uint8_t lv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint32_t lv_access; + uint32_t lv_status; + uint32_t lv_open; /* HM */ + kdev_t lv_dev; /* HM */ + uint32_t lv_number; /* HM */ + uint32_t lv_mirror_copies; /* for future use */ + uint32_t lv_recovery; /* " */ + uint32_t lv_schedule; /* " */ + uint32_t lv_size; + pe_t *lv_current_pe; /* HM */ + uint32_t lv_current_le; /* for future use */ + uint32_t lv_allocated_le; + uint32_t lv_stripes; + uint32_t lv_stripesize; + uint32_t lv_badblock; /* for future use */ + uint32_t lv_allocation; + uint32_t lv_io_timeout; /* for future use */ + uint32_t lv_read_ahead; +} lv_v1_t; + +/* disk */ +typedef struct { + uint8_t lv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint32_t lv_access; + uint32_t lv_status; + uint32_t lv_open; /* HM */ + uint32_t lv_dev; /* HM */ + uint32_t lv_number; /* HM */ + uint32_t lv_mirror_copies; /* for future use */ + uint32_t lv_recovery; /* " */ + uint32_t lv_schedule; /* " */ + uint32_t lv_size; + uint32_t dummy; + uint32_t lv_current_le; /* for future use */ + uint32_t lv_allocated_le; + uint32_t lv_stripes; + uint32_t lv_stripesize; + uint32_t lv_badblock; /* for future use */ + uint32_t lv_allocation; + uint32_t lv_io_timeout; /* for future use */ + uint32_t lv_read_ahead; /* HM, for future use */ +} lv_disk_v1_t; + + +/* + * Structure Logical Volume (LV) Version 2 + */ + +/* core */ +typedef struct lv_v2 { + uint8_t lv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint32_t lv_access; + uint32_t lv_status; + uint32_t lv_open; /* HM */ + kdev_t lv_dev; /* HM */ + uint32_t lv_number; /* HM */ + uint32_t lv_mirror_copies; /* for future use */ + uint32_t lv_recovery; /* " */ + uint32_t lv_schedule; /* " */ + uint32_t lv_size; + pe_t *lv_current_pe; /* HM */ + uint32_t lv_current_le; /* for future use */ + uint32_t lv_allocated_le; + uint32_t lv_stripes; + uint32_t lv_stripesize; + uint32_t lv_badblock; /* for future use */ + uint32_t lv_allocation; + uint32_t lv_io_timeout; /* for future use */ + uint32_t lv_read_ahead; + /* delta to version 1 starts here */ + struct lv_v2 *lv_snapshot_org; + struct lv_v2 *lv_snapshot_prev; + struct lv_v2 *lv_snapshot_next; + lv_block_exception_t *lv_block_exception; + uint8_t 
__unused2; + uint32_t lv_remap_ptr; + uint32_t lv_remap_end; + uint32_t lv_chunk_size; + uint32_t lv_snapshot_minor; + struct kiobuf * lv_iobuf; + struct semaphore lv_snapshot_sem; + struct list_head * lv_snapshot_hash_table; + unsigned long lv_snapshot_hash_mask; +} lv_v2_t; + +/* disk */ +typedef struct { + uint8_t lv_name[NAME_LEN]; + uint8_t vg_name[NAME_LEN]; + uint32_t lv_access; + uint32_t lv_status; + uint32_t lv_open; /* HM */ + uint32_t lv_dev; /* HM */ + uint32_t lv_number; /* HM */ + uint32_t lv_mirror_copies; /* for future use */ + uint32_t lv_recovery; /* " */ + uint32_t lv_schedule; /* " */ + uint32_t lv_size; + uint32_t dummy; + uint32_t lv_current_le; /* for future use */ + uint32_t lv_allocated_le; + uint32_t lv_stripes; + uint32_t lv_stripesize; + uint32_t lv_badblock; /* for future use */ + uint32_t lv_allocation; + uint32_t lv_io_timeout; /* for future use */ + uint32_t lv_read_ahead; /* HM, for future use */ +} lv_disk_v2_t; + + +/* + * Structure Volume Group (VG) Version 1 + */ + +typedef struct { + uint8_t vg_name[NAME_LEN]; /* volume group name */ + uint32_t vg_number; /* volume group number */ + uint32_t vg_access; /* read/write */ + uint32_t vg_status; /* active or not */ + uint32_t lv_max; /* maximum logical volumes */ + uint32_t lv_cur; /* current logical volumes */ + uint32_t lv_open; /* open logical volumes */ + uint32_t pv_max; /* maximum physical volumes */ + uint32_t pv_cur; /* current physical volumes FU */ + uint32_t pv_act; /* active physical volumes */ + uint32_t dummy; /* was obsolete max_pe_per_pv */ + uint32_t vgda; /* volume group descriptor arrays FU */ + uint32_t pe_size; /* physical extent size in sectors */ + uint32_t pe_total; /* total of physical extents */ + uint32_t pe_allocated; /* allocated physical extents */ + uint32_t pvg_total; /* physical volume groups FU */ + struct proc_dir_entry *proc; + pv_t *pv[ABS_MAX_PV+1]; /* physical volume struct pointers */ + lv_t *lv[ABS_MAX_LV+1]; /* logical volume struct pointers */ +} vg_v1_t; + +typedef struct { + uint8_t vg_name[NAME_LEN]; /* volume group name */ + uint32_t vg_number; /* volume group number */ + uint32_t vg_access; /* read/write */ + uint32_t vg_status; /* active or not */ + uint32_t lv_max; /* maximum logical volumes */ + uint32_t lv_cur; /* current logical volumes */ + uint32_t lv_open; /* open logical volumes */ + uint32_t pv_max; /* maximum physical volumes */ + uint32_t pv_cur; /* current physical volumes FU */ + uint32_t pv_act; /* active physical volumes */ + uint32_t dummy; + uint32_t vgda; /* volume group descriptor arrays FU */ + uint32_t pe_size; /* physical extent size in sectors */ + uint32_t pe_total; /* total of physical extents */ + uint32_t pe_allocated; /* allocated physical extents */ + uint32_t pvg_total; /* physical volume groups FU */ +} vg_disk_v1_t; + +/* + * Structure Volume Group (VG) Version 2 + */ + +typedef struct { + uint8_t vg_name[NAME_LEN]; /* volume group name */ + uint32_t vg_number; /* volume group number */ + uint32_t vg_access; /* read/write */ + uint32_t vg_status; /* active or not */ + uint32_t lv_max; /* maximum logical volumes */ + uint32_t lv_cur; /* current logical volumes */ + uint32_t lv_open; /* open logical volumes */ + uint32_t pv_max; /* maximum physical volumes */ + uint32_t pv_cur; /* current physical volumes FU */ + uint32_t pv_act; /* future: active physical volumes */ + uint32_t max_pe_per_pv; /* OBSOLETE maximum PE/PV */ + uint32_t vgda; /* volume group descriptor arrays FU */ + uint32_t pe_size; /* physical extent size in 
sectors */ + uint32_t pe_total; /* total of physical extents */ + uint32_t pe_allocated; /* allocated physical extents */ + uint32_t pvg_total; /* physical volume groups FU */ + struct proc_dir_entry *proc; + pv_t *pv[ABS_MAX_PV+1]; /* physical volume struct pointers */ + lv_t *lv[ABS_MAX_LV+1]; /* logical volume struct pointers */ + /* delta to version 1 starts here */ + uint8_t vg_uuid[UUID_LEN]; /* volume group UUID */ + time_t vg_atime; /* VG access time */ + time_t vg_ctime; /* VG creation time */ + time_t vg_mtime; /* VG modification time */ +} vg_v2_t; + + +/* + * Timekeeping structure on disk (0.7 feature) + * + * Holds several timestamps for start/stop time of non + * atomic VGDA disk i/o operations + * + */ + +typedef struct { + uint32_t seconds; /* seconds since the epoch */ + uint32_t jiffies; /* micro timer */ +} lvm_time_t; + +#define TIMESTAMP_ID_SIZE 2 +typedef struct { + uint8_t id[TIMESTAMP_ID_SIZE]; /* Identifier */ + lvm_time_t pv_vg_lv_pe_io_begin; + lvm_time_t pv_vg_lv_pe_io_end; + lvm_time_t pv_io_begin; + lvm_time_t pv_io_end; + lvm_time_t vg_io_begin; + lvm_time_t vg_io_end; + lvm_time_t lv_io_begin; + lvm_time_t lv_io_end; + lvm_time_t pe_io_begin; + lvm_time_t pe_io_end; + lvm_time_t pe_move_io_begin; + lvm_time_t pe_move_io_end; + uint8_t dummy[LVM_TIMESTAMP_DISK_SIZE - + TIMESTAMP_ID_SIZE - + 12 * sizeof(lvm_time_t)]; + /* ATTENTION ^^ */ +} timestamp_disk_t; + +/* same on disk and in core so far */ +typedef timestamp_disk_t timestamp_t; + +/* function identifiers for timestamp actions */ +typedef enum { PV_VG_LV_PE_IO_BEGIN, + PV_VG_LV_PE_IO_END, + PV_IO_BEGIN, + PV_IO_END, + VG_IO_BEGIN, + VG_IO_END, + LV_IO_BEGIN, + LV_IO_END, + PE_IO_BEGIN, + PE_IO_END, + PE_MOVE_IO_BEGIN, + PE_MOVE_IO_END} ts_fct_id_t; + + +/* + * Request structures for ioctls + */ + +/* Request structure PV_STATUS */ +typedef struct { + char pv_name[NAME_LEN]; + pv_t *pv; +} pv_status_req_t, pv_change_req_t; + +/* Request structure PV_FLUSH */ +typedef struct { + char pv_name[NAME_LEN]; +} pv_flush_req_t; + + +/* Request structure PE_MOVE */ +typedef struct { + enum { LOCK_PE, UNLOCK_PE} lock; + struct { + kdev_t lv_dev; + kdev_t pv_dev; + uint32_t pv_offset; + } data; +} pe_lock_req_t; + + +/* Request structure LV_STATUS_BYNAME */ +typedef struct { + char lv_name[NAME_LEN]; + lv_t *lv; +} lv_status_byname_req_t, lv_req_t; + +/* Request structure LV_STATUS_BYINDEX */ +typedef struct { + ulong lv_index; + lv_t *lv; +} lv_status_byindex_req_t; + +#endif /* #ifndef _LVM_H_INCLUDE */ diff -urN 2.3.46pre1/include/linux/major.h 2.3.46pre1aa1/include/linux/major.h --- 2.3.46pre1/include/linux/major.h Sun Jan 30 15:43:30 2000 +++ 2.3.46pre1aa1/include/linux/major.h Wed Feb 16 00:28:24 2000 @@ -92,8 +92,6 @@ #define SCSI_DISK7_MAJOR 71 -#define LVM_BLK_MAJOR 58 /* Logical Volume Manager */ - #define COMPAQ_SMART2_MAJOR 72 #define COMPAQ_SMART2_MAJOR1 73 #define COMPAQ_SMART2_MAJOR2 74 diff -urN 2.3.46pre1/include/linux/mm.h 2.3.46pre1aa1/include/linux/mm.h --- 2.3.46pre1/include/linux/mm.h Tue Feb 15 17:19:32 2000 +++ 2.3.46pre1aa1/include/linux/mm.h Wed Feb 16 00:28:25 2000 @@ -399,7 +399,6 @@ unsigned long * zones_size, unsigned long zone_start_paddr); extern void mem_init(void); extern void show_mem(void); -extern void oom(struct task_struct * tsk); extern void si_meminfo(struct sysinfo * val); extern void swapin_readahead(swp_entry_t); diff -urN 2.3.46pre1/include/linux/nbd.h 2.3.46pre1aa1/include/linux/nbd.h --- 2.3.46pre1/include/linux/nbd.h Sat Feb 12 05:15:10 2000 +++ 
2.3.46pre1aa1/include/linux/nbd.h Wed Feb 16 00:28:25 2000 @@ -60,8 +60,7 @@ struct socket * sock; struct file * file; /* If == NULL, device is not ready, yet */ int magic; /* FIXME: not if debugging is off */ - struct request *head; /* Requests are added here... */ - struct request *tail; + struct list_head queue_head; /* Requests are added here... */ struct semaphore queue_lock; }; #endif diff -urN 2.3.46pre1/include/linux/rbtree.h 2.3.46pre1aa1/include/linux/rbtree.h --- 2.3.46pre1/include/linux/rbtree.h Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/include/linux/rbtree.h Wed Feb 16 00:28:25 2000 @@ -0,0 +1,128 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/include/linux/rbtree.h + + To use rbtrees you'll have to implement your own insert and search cores. + This avoids the use of callbacks, which would cost a dramatic amount of + performance. I know it's not the cleanest way, but it is what it takes in + C (as opposed to C++) to get both performance and genericity... + + Examples of insert and search follow. The search is a plain, ordinary + search over an ordered tree. The insert, however, must be implemented in + two steps: first the code must insert the element in order as a red leaf + in the tree, then the support library function rb_insert_color() must be + called. That function does the non-trivial work of rebalancing the rbtree + if necessary.
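Before the in-tree page cache example that follows, here is a minimal sketch of the erase side, which the comment above does not demonstrate: rb_erase() needs no two-step protocol, since a single call both unlinks the node and rebalances the tree. The int-keyed int_node type and its helpers are invented for illustration and are not part of this patch; only rb_node_t, rb_root_t, rb_entry() and rb_erase() come from the header itself.

    /* A hypothetical container keyed by a plain int. */
    struct int_node {
        rb_node_t rb;                   /* embedded rbtree linkage */
        int key;
    };

    /* Hand-rolled search core, as the comment above prescribes. */
    static inline struct int_node * int_tree_search(rb_root_t * root, int key)
    {
        rb_node_t * n = root->rb_node;

        while (n) {
            struct int_node * node = rb_entry(n, struct int_node, rb);

            if (key < node->key)
                n = n->rb_left;
            else if (key > node->key)
                n = n->rb_right;
            else
                return node;
        }
        return NULL;
    }

    /* Erase by key: find the node, then one call unlinks and rebalances. */
    static inline void int_tree_erase(rb_root_t * root, int key)
    {
        struct int_node * node = int_tree_search(root, key);

        if (node)
            rb_erase(&node->rb, root);
    }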
+ +----------------------------------------------------------------------- +static inline struct page * rb_search_page_cache(struct inode * inode, + unsigned long offset) +{ + rb_node_t * n = inode->i_rb_page_cache.rb_node; + struct page * page; + + while (n) + { + page = rb_entry(n, struct page, rb_page_cache); + + if (offset < page->offset) + n = n->rb_left; + else if (offset > page->offset) + n = n->rb_right; + else + return page; + } + return NULL; +} + +static inline struct page * __rb_insert_page_cache(struct inode * inode, + unsigned long offset, + rb_node_t * node) +{ + rb_node_t ** p = &inode->i_rb_page_cache.rb_node; + rb_node_t * parent = NULL; + struct page * page; + + while (*p) + { + parent = *p; + page = rb_entry(parent, struct page, rb_page_cache); + + if (offset < page->offset) + p = &(*p)->rb_left; + else if (offset > page->offset) + p = &(*p)->rb_right; + else + return page; + } + + node->rb_parent = parent; + node->rb_color = RB_RED; + node->rb_left = node->rb_right = NULL; + + *p = node; + + return NULL; +} + +static inline struct page * rb_insert_page_cache(struct inode * inode, + unsigned long offset, + rb_node_t * node) +{ + struct page * ret; + if ((ret = __rb_insert_page_cache(inode, offset, node))) + goto out; + rb_insert_color(node, &inode->i_rb_page_cache); + out: + return ret; +} +----------------------------------------------------------------------- +*/ + +#ifndef _LINUX_RBTREE_H +#define _LINUX_RBTREE_H + +#include <linux/kernel.h> +#include <linux/stddef.h> + +typedef struct rb_node_s +{ + struct rb_node_s * rb_parent; + int rb_color; +#define RB_RED 0 +#define RB_BLACK 1 + struct rb_node_s * rb_right; + struct rb_node_s * rb_left; +} +rb_node_t; + +typedef struct rb_root_s +{ + struct rb_node_s * rb_node; +} +rb_root_t; + +#define RB_ROOT (rb_root_t) { NULL, } +#define rb_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +extern void rb_insert_color(rb_node_t *, rb_root_t *); +extern void rb_erase(rb_node_t *, rb_root_t *); + +#endif /* _LINUX_RBTREE_H */ diff -urN 2.3.46pre1/include/linux/sched.h 2.3.46pre1aa1/include/linux/sched.h --- 2.3.46pre1/include/linux/sched.h Tue Feb 15 17:19:32 2000 +++ 2.3.46pre1aa1/include/linux/sched.h Wed Feb 16 00:28:25 2000 @@ -356,6 +356,9 @@ u32 self_exec_id; /* Protection of fields allocation/deallocation */ struct semaphore exit_sem; + +/* oom handling, left at the end since it's not critical info */ + int oom_kill_try; }; /* @@ -426,6 +429,7 @@ /* signals */ SPIN_LOCK_UNLOCKED, &init_signals, {{0}}, {{0}}, NULL, &init_task.sigqueue, 0, 0, \ /* exec cts */ 0,0, \ /* exit_sem */ __MUTEX_INITIALIZER(name.exit_sem), \ +/* oom */ 0, \ } #ifndef INIT_TASK_SIZE diff -urN 2.3.46pre1/include/linux/sysctl.h 2.3.46pre1aa1/include/linux/sysctl.h --- 2.3.46pre1/include/linux/sysctl.h Fri Feb 11 00:05:38 2000 +++ 2.3.46pre1aa1/include/linux/sysctl.h Wed Feb 16 00:28:25 2000 @@ -501,7 +501,9 @@ FS_NRSUPER=9, /* int:current number of allocated super_blocks */ FS_MAXSUPER=10, /* int:maximum number of super_blocks that can be allocated */ FS_OVERFLOWUID=11, /* int: overflow UID */ - FS_OVERFLOWGID=12 /* int: overflow GID */ + FS_OVERFLOWGID=12, /* int: overflow GID */ + FS_DCACHE_CTL=13, /* dentry cache controls */ + FS_ICACHE_CTL=14, /* inode cache controls */ }; /* CTL_DEBUG names: */ diff -urN 2.3.46pre1/include/linux/timer.h 2.3.46pre1aa1/include/linux/timer.h --- 2.3.46pre1/include/linux/timer.h Tue Feb 15 03:15:06 2000 +++ 2.3.46pre1aa1/include/linux/timer.h Wed Feb 16 00:28:25 2000 @@ -105,10 +105,10 @@ * good
compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. */ -#define time_after(a,b) ((long)(b) - (long)(a) < 0) +#define time_after(a,b) ((signed)(b) - (signed)(a) < 0) #define time_before(a,b) time_after(b,a) -#define time_after_eq(a,b) ((long)(a) - (long)(b) >= 0) +#define time_after_eq(a,b) ((signed)(a) - (signed)(b) >= 0) #define time_before_eq(a,b) time_after_eq(b,a) #endif diff -urN 2.3.46pre1/kernel/ksyms.c 2.3.46pre1aa1/kernel/ksyms.c --- 2.3.46pre1/kernel/ksyms.c Tue Feb 15 03:06:49 2000 +++ 2.3.46pre1aa1/kernel/ksyms.c Wed Feb 16 00:28:24 2000 @@ -71,6 +71,15 @@ }; #endif +#ifdef CONFIG_BLK_DEV_LVM_MODULE +#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 3, 43) + extern int (*lvm_map_ptr) ( int, kdev_t *, unsigned long *, + unsigned long, int); + EXPORT_SYMBOL(lvm_map_ptr); +#endif + extern void (*lvm_hd_name_ptr) ( char*, int); + EXPORT_SYMBOL(lvm_hd_name_ptr); +#endif #ifdef CONFIG_KMOD EXPORT_SYMBOL(request_module); @@ -159,6 +168,8 @@ EXPORT_SYMBOL(free_kiovec); EXPORT_SYMBOL(brw_kiovec); EXPORT_SYMBOL(alloc_kiovec); +EXPORT_SYMBOL(expand_kiobuf); +EXPORT_SYMBOL(unmap_kiobuf); EXPORT_SYMBOL(get_empty_filp); EXPORT_SYMBOL(init_private_file); EXPORT_SYMBOL(filp_open); diff -urN 2.3.46pre1/kernel/ptrace.c 2.3.46pre1aa1/kernel/ptrace.c --- 2.3.46pre1/kernel/ptrace.c Tue Feb 15 03:06:49 2000 +++ 2.3.46pre1aa1/kernel/ptrace.c Wed Feb 16 00:28:25 2000 @@ -26,6 +26,7 @@ unsigned long mapnr; unsigned long maddr; struct page *page; + int fault; repeat: pgdir = pgd_offset(vma->vm_mm, addr); @@ -65,8 +66,12 @@ fault_in_page: /* -1: out of memory. 0 - unmapped page */ - if (handle_mm_fault(tsk, vma, addr, write) > 0) + fault = handle_mm_fault(tsk, vma, addr, write); + if (fault > 0) goto repeat; + if (fault < 0) + /* the out-of-memory condition was triggered by the current task */ + force_sig(SIGKILL, current); return 0; bad_pgd: diff -urN 2.3.46pre1/kernel/sched.c 2.3.46pre1aa1/kernel/sched.c --- 2.3.46pre1/kernel/sched.c Fri Feb 11 00:05:39 2000 +++ 2.3.46pre1aa1/kernel/sched.c Wed Feb 16 00:28:25 2000 @@ -141,7 +141,7 @@ #endif /* ..
and a slight advantage to the current MM */ - if (p->mm == this_mm) + if (p->mm == this_mm || !p->mm) weight += 1; weight += p->priority; @@ -173,7 +173,7 @@ */ static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) { - return goodness(p, cpu, prev->mm) - goodness(prev, cpu, prev->mm); + return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); } /* diff -urN 2.3.46pre1/kernel/sysctl.c 2.3.46pre1aa1/kernel/sysctl.c --- 2.3.46pre1/kernel/sysctl.c Fri Feb 11 00:05:39 2000 +++ 2.3.46pre1aa1/kernel/sysctl.c Wed Feb 16 00:28:25 2000 @@ -47,6 +47,9 @@ static int maxolduid = 65535; static int minolduid = 0; +extern int dcache_ctl[], dcache_ctl_min[], dcache_ctl_max[]; +extern int icache_ctl[], icache_ctl_min[], icache_ctl_max[]; + #ifdef CONFIG_KMOD extern char modprobe_path[]; #endif @@ -294,6 +297,12 @@ {FS_OVERFLOWGID, "overflowgid", &fs_overflowgid, sizeof(int), 0644, NULL, &proc_dointvec_minmax, &sysctl_intvec, NULL, &minolduid, &maxolduid}, + {FS_DCACHE_CTL, "dcache_ctl", &dcache_ctl, 1*sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, + &dcache_ctl_min, &dcache_ctl_max}, + {FS_ICACHE_CTL, "icache_ctl", &icache_ctl, 1*sizeof(int), 0644, NULL, + &proc_dointvec_minmax, &sysctl_intvec, NULL, + &icache_ctl_min, &icache_ctl_max}, {0} }; diff -urN 2.3.46pre1/kernel/timer.c 2.3.46pre1aa1/kernel/timer.c --- 2.3.46pre1/kernel/timer.c Fri Feb 11 00:05:39 2000 +++ 2.3.46pre1aa1/kernel/timer.c Wed Feb 16 00:28:25 2000 @@ -101,6 +101,8 @@ (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5 }; +static struct timer_list ** run_timer_list_running; + #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0])) static unsigned long timer_jiffies = 0; @@ -125,7 +127,13 @@ unsigned long idx = expires - timer_jiffies; struct timer_list ** vec; - if (idx < TVR_SIZE) { + if (run_timer_list_running) { + if ((signed) idx < 0) + printk(KERN_WARNING __FUNCTION__ + ": potential recursion idx %ld from %p\n", + (signed) idx, __builtin_return_address(0)); + vec = run_timer_list_running; + } else if (idx < TVR_SIZE) { int i = expires & TVR_MASK; vec = tv1.vec + i; } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { @@ -266,13 +274,14 @@ { spin_lock_irq(&timerlist_lock); while ((long)(jiffies - timer_jiffies) >= 0) { - struct timer_list *timer; + struct timer_list *timer, * queued = NULL; if (!tv1.index) { int n = 1; do { cascade_timers(tvecs[n]); } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS); } + run_timer_list_running = &queued; while ((timer = tv1.vec[tv1.index])) { void (*fn)(unsigned long) = timer->function; unsigned long data = timer->data; @@ -283,8 +292,14 @@ fn(data); spin_lock_irq(&timerlist_lock); } + run_timer_list_running = NULL; ++timer_jiffies; tv1.index = (tv1.index + 1) & TVR_MASK; + while (queued) { + timer = queued; + queued = queued->next; + internal_add_timer(timer); + } } spin_unlock_irq(&timerlist_lock); } diff -urN 2.3.46pre1/lib/Makefile 2.3.46pre1aa1/lib/Makefile --- 2.3.46pre1/lib/Makefile Mon Jan 18 02:27:00 1999 +++ 2.3.46pre1aa1/lib/Makefile Wed Feb 16 00:28:25 2000 @@ -7,6 +7,6 @@ # L_TARGET := lib.a -L_OBJS := errno.o ctype.o string.o vsprintf.o +L_OBJS := errno.o ctype.o string.o vsprintf.o rbtree.o include $(TOPDIR)/Rules.make diff -urN 2.3.46pre1/lib/rbtree.c 2.3.46pre1aa1/lib/rbtree.c --- 2.3.46pre1/lib/rbtree.c Thu Jan 1 01:00:00 1970 +++ 2.3.46pre1aa1/lib/rbtree.c Wed Feb 16 00:28:25 2000 @@ -0,0 +1,293 @@ +/* + Red Black Trees + (C) 1999 Andrea Arcangeli + + This program is free software; you can 
redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + linux/lib/rbtree.c +*/ + +#include <linux/rbtree.h> + +static void __rb_rotate_left(rb_node_t * node, rb_root_t * root) +{ + rb_node_t * right = node->rb_right; + + if ((node->rb_right = right->rb_left)) + right->rb_left->rb_parent = node; + right->rb_left = node; + + if ((right->rb_parent = node->rb_parent)) + { + if (node == node->rb_parent->rb_left) + node->rb_parent->rb_left = right; + else + node->rb_parent->rb_right = right; + } + else + root->rb_node = right; + node->rb_parent = right; +} + +static void __rb_rotate_right(rb_node_t * node, rb_root_t * root) +{ + rb_node_t * left = node->rb_left; + + if ((node->rb_left = left->rb_right)) + left->rb_right->rb_parent = node; + left->rb_right = node; + + if ((left->rb_parent = node->rb_parent)) + { + if (node == node->rb_parent->rb_right) + node->rb_parent->rb_right = left; + else + node->rb_parent->rb_left = left; + } + else + root->rb_node = left; + node->rb_parent = left; +} + +void rb_insert_color(rb_node_t * node, rb_root_t * root) +{ + rb_node_t * parent, * gparent; + + while ((parent = node->rb_parent) && parent->rb_color == RB_RED) + { + gparent = parent->rb_parent; + + if (parent == gparent->rb_left) + { + { + register rb_node_t * uncle = gparent->rb_right; + if (uncle && uncle->rb_color == RB_RED) + { + uncle->rb_color = RB_BLACK; + parent->rb_color = RB_BLACK; + gparent->rb_color = RB_RED; + node = gparent; + continue; + } + } + + if (parent->rb_right == node) + { + register rb_node_t * tmp; + __rb_rotate_left(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + parent->rb_color = RB_BLACK; + gparent->rb_color = RB_RED; + __rb_rotate_right(gparent, root); + } else { + { + register rb_node_t * uncle = gparent->rb_left; + if (uncle && uncle->rb_color == RB_RED) + { + uncle->rb_color = RB_BLACK; + parent->rb_color = RB_BLACK; + gparent->rb_color = RB_RED; + node = gparent; + continue; + } + } + + if (parent->rb_left == node) + { + register rb_node_t * tmp; + __rb_rotate_right(parent, root); + tmp = parent; + parent = node; + node = tmp; + } + + parent->rb_color = RB_BLACK; + gparent->rb_color = RB_RED; + __rb_rotate_left(gparent, root); + } + } + + root->rb_node->rb_color = RB_BLACK; +} + +static void __rb_erase_color(rb_node_t * node, rb_node_t * parent, + rb_root_t * root) +{ + rb_node_t * other; + + while ((!node || node->rb_color == RB_BLACK) && node != root->rb_node) + { + if (parent->rb_left == node) + { + other = parent->rb_right; + if (other->rb_color == RB_RED) + { + other->rb_color = RB_BLACK; + parent->rb_color = RB_RED; + __rb_rotate_left(parent, root); + other = parent->rb_right; + } + if ((!other->rb_left || + other->rb_left->rb_color == RB_BLACK) + && (!other->rb_right || + other->rb_right->rb_color == RB_BLACK)) + { + other->rb_color = RB_RED; + node = parent; + parent = node->rb_parent; + } + else + { + if (!other->rb_right || + other->rb_right->rb_color ==
RB_BLACK) + { + register rb_node_t * o_left; + if ((o_left = other->rb_left)) + o_left->rb_color = RB_BLACK; + other->rb_color = RB_RED; + __rb_rotate_right(other, root); + other = parent->rb_right; + } + other->rb_color = parent->rb_color; + parent->rb_color = RB_BLACK; + if (other->rb_right) + other->rb_right->rb_color = RB_BLACK; + __rb_rotate_left(parent, root); + node = root->rb_node; + break; + } + } + else + { + other = parent->rb_left; + if (other->rb_color == RB_RED) + { + other->rb_color = RB_BLACK; + parent->rb_color = RB_RED; + __rb_rotate_right(parent, root); + other = parent->rb_left; + } + if ((!other->rb_left || + other->rb_left->rb_color == RB_BLACK) + && (!other->rb_right || + other->rb_right->rb_color == RB_BLACK)) + { + other->rb_color = RB_RED; + node = parent; + parent = node->rb_parent; + } + else + { + if (!other->rb_left || + other->rb_left->rb_color == RB_BLACK) + { + register rb_node_t * o_right; + if ((o_right = other->rb_right)) + o_right->rb_color = RB_BLACK; + other->rb_color = RB_RED; + __rb_rotate_left(other, root); + other = parent->rb_left; + } + other->rb_color = parent->rb_color; + parent->rb_color = RB_BLACK; + if (other->rb_left) + other->rb_left->rb_color = RB_BLACK; + __rb_rotate_right(parent, root); + node = root->rb_node; + break; + } + } + } + if (node) + node->rb_color = RB_BLACK; +} + +void rb_erase(rb_node_t * node, rb_root_t * root) +{ + rb_node_t * child, * parent; + int color; + + if (!node->rb_left) + child = node->rb_right; + else if (!node->rb_right) + child = node->rb_left; + else + { + rb_node_t * old = node, * left; + + node = node->rb_right; + while ((left = node->rb_left)) + node = left; + child = node->rb_right; + parent = node->rb_parent; + color = node->rb_color; + + if (child) + child->rb_parent = parent; + if (parent) + { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } + else + root->rb_node = child; + + if (node->rb_parent == old) + parent = node; + node->rb_parent = old->rb_parent; + node->rb_color = old->rb_color; + node->rb_right = old->rb_right; + node->rb_left = old->rb_left; + + if (old->rb_parent) + { + if (old->rb_parent->rb_left == old) + old->rb_parent->rb_left = node; + else + old->rb_parent->rb_right = node; + } else + root->rb_node = node; + + old->rb_left->rb_parent = node; + if (old->rb_right) + old->rb_right->rb_parent = node; + goto color; + } + + parent = node->rb_parent; + color = node->rb_color; + + if (child) + child->rb_parent = parent; + if (parent) + { + if (parent->rb_left == node) + parent->rb_left = child; + else + parent->rb_right = child; + } + else + root->rb_node = child; + + color: + if (color == RB_BLACK) + __rb_erase_color(child, parent, root); +} diff -urN 2.3.46pre1/mm/filemap.c 2.3.46pre1aa1/mm/filemap.c --- 2.3.46pre1/mm/filemap.c Tue Feb 15 03:06:49 2000 +++ 2.3.46pre1aa1/mm/filemap.c Wed Feb 16 00:28:25 2000 @@ -586,10 +586,10 @@ add_wait_queue(&page->wait, &wait); do { - run_task_queue(&tq_disk); set_task_state(tsk, TASK_UNINTERRUPTIBLE); if (!PageLocked(page)) break; + run_task_queue(&tq_disk); schedule(); } while (PageLocked(page)); tsk->state = TASK_RUNNING; @@ -631,13 +631,13 @@ struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); - run_task_queue(&tq_disk); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); add_wait_queue(&page->wait, &wait); - if (PageLocked(page)) + if (PageLocked(page)) { + run_task_queue(&tq_disk); schedule(); + } __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&page->wait, &wait); @@ -681,13 
+681,13 @@ struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); - run_task_queue(&tq_disk); - __set_task_state(tsk, TASK_UNINTERRUPTIBLE); add_wait_queue(&page->wait, &wait); - if (PageLocked(page)) + if (PageLocked(page)) { + run_task_queue(&tq_disk); schedule(); + } __set_task_state(tsk, TASK_RUNNING); remove_wait_queue(&page->wait, &wait); diff -urN 2.3.46pre1/mm/memory.c 2.3.46pre1aa1/mm/memory.c --- 2.3.46pre1/mm/memory.c Tue Feb 15 03:06:49 2000 +++ 2.3.46pre1aa1/mm/memory.c Wed Feb 16 00:28:25 2000 @@ -70,16 +70,6 @@ mem_map_t * mem_map = NULL; /* - * oom() prints a message (so that the user knows why the process died), - * and gives the process an untrappable SIGKILL. - */ -void oom(struct task_struct * task) -{ - printk("\nOut of memory for %s.\n", task->comm); - force_sig(SIGKILL, task); -} - -/* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ diff -urN 2.3.46pre1/mm/vmscan.c 2.3.46pre1aa1/mm/vmscan.c --- 2.3.46pre1/mm/vmscan.c Sat Feb 12 21:03:24 2000 +++ 2.3.46pre1aa1/mm/vmscan.c Wed Feb 16 00:28:25 2000 @@ -325,6 +325,7 @@ struct task_struct * p; int counter; int __ret = 0; + int assign = 0; lock_kernel(); /* @@ -344,12 +345,9 @@ counter = nr_threads / (priority+1); if (counter < 1) counter = 1; - if (counter > nr_threads) - counter = nr_threads; for (; counter >= 0; counter--) { - int assign = 0; - int max_cnt = 0; + unsigned long max_cnt = 0; struct mm_struct *best = NULL; int pid = 0; select: @@ -362,7 +360,7 @@ if (mm->rss <= 0) continue; /* Refresh swap_cnt? */ - if (assign) + if (assign == 1) mm->swap_cnt = mm->rss; if (mm->swap_cnt > max_cnt) { max_cnt = mm->swap_cnt; @@ -371,6 +369,8 @@ } } read_unlock(&tasklist_lock); + if (assign == 1) + assign = 2; if (!best) { if (!assign) { assign = 1;
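A usage note on the time_after()/time_after_eq() change in the timer.h hunk above: the macros stay correct when jiffies wraps around, because they compare the signed difference of the two counters rather than their raw values. A minimal sketch of the intended use follows; device_ready() and the five-second budget are invented for illustration and are not part of this patch, while jiffies, HZ, schedule() and time_after() are the kernel's own:

    #include <linux/sched.h>    /* jiffies, HZ, schedule() */
    #include <linux/timer.h>    /* time_after() */
    #include <linux/errno.h>    /* ETIMEDOUT */

    extern int device_ready(void);  /* hypothetical readiness test */

    /* Poll a (hypothetical) device for at most five seconds. */
    static int wait_for_device(void)
    {
        unsigned long timeout = jiffies + 5 * HZ;

        while (!device_ready()) {
            /* signed difference: correct even across a jiffies wrap */
            if (time_after(jiffies, timeout))
                return -ETIMEDOUT;
            schedule();
        }
        return 0;
    }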