diff -prauN linux-2.6.0-test7/Documentation/filesystems/Locking wli-2.6.0-test7-bk1-29/Documentation/filesystems/Locking --- linux-2.6.0-test7/Documentation/filesystems/Locking 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Documentation/filesystems/Locking 2003-10-09 19:42:26.000000000 -0700 @@ -204,7 +204,7 @@ currently-in-progress I/O. If the filesystem is not called for "sync" and it determines that it would need to block against in-progress I/O to be able to start new I/O against the page the filesystem should redirty the page (usually with -__set_page_dirty_nobuffers()), then unlock the page and return zero. +set_page_dirty_nobuffers()), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. If the filesystem is called for sync then it must wait on any diff -prauN linux-2.6.0-test7/Documentation/filesystems/jfs.txt wli-2.6.0-test7-bk1-29/Documentation/filesystems/jfs.txt --- linux-2.6.0-test7/Documentation/filesystems/jfs.txt 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Documentation/filesystems/jfs.txt 2003-10-09 19:27:22.000000000 -0700 @@ -32,6 +32,10 @@ integrity Default. Commit metadata chan option to remount a volume where the nointegrity option was previously specified in order to restore normal behavior. +errors=continue Keep going on a filesystem error. +errors=remount-ro Default. Remount the filesystem read-only on an error. +errors=panic Panic and halt the machine if an error occurs. + JFS TODO list: Plans for our near term development items diff -prauN linux-2.6.0-test7/Documentation/filesystems/xfs.txt wli-2.6.0-test7-bk1-29/Documentation/filesystems/xfs.txt --- linux-2.6.0-test7/Documentation/filesystems/xfs.txt 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Documentation/filesystems/xfs.txt 2003-10-09 19:27:22.000000000 -0700 @@ -29,6 +29,11 @@ When mounting an XFS filesystem, the fol The preferred buffered I/O size can also be altered on an individual file basis using the ioctl(2) system call. + ikeep + When inode clusters are emptied of inodes, keep them around + on the disk; this is the old XFS behavior. Default is now to + return the inode cluster to the free space pool. + logbufs=value Set the number of in-memory log buffers. Valid numbers range from 2-8 inclusive. diff -prauN linux-2.6.0-test7/Documentation/vm/locking wli-2.6.0-test7-bk1-29/Documentation/vm/locking --- linux-2.6.0-test7/Documentation/vm/locking 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Documentation/vm/locking 2003-10-09 19:40:04.000000000 -0700 @@ -66,7 +66,7 @@ in some cases it is not really needed. E expand_stack(), it is hard to come up with a destructive scenario without having the vmlist protection in this case. -The page_table_lock nests with the inode i_shared_sem and the kmem cache +The page_table_lock nests with the inode i_shared_lock and the kmem cache c_spinlock spinlocks. This is okay, since the kmem code asks for pages after dropping c_spinlock.
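As a usage note for the errors= options documented in the jfs.txt hunk above: they are selected through the filesystem-private option string of mount(2). A minimal sketch, assuming a JFS filesystem on a hypothetical device /dev/hda5 and mount point /jfs (both names made up for illustration):

/* Sketch only: picking one of the jfs errors= behaviors documented
 * above.  The device and mount point are hypothetical examples. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The last argument is the filesystem-private option string. */
	if (mount("/dev/hda5", "/jfs", "jfs", 0, "errors=remount-ro") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}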
The page_table_lock also nests with pagecache_lock and pagemap_lru_lock spinlocks, and no code asks for memory with these locks diff -prauN linux-2.6.0-test7/Documentation/x86_64/boot-options.txt wli-2.6.0-test7-bk1-29/Documentation/x86_64/boot-options.txt --- linux-2.6.0-test7/Documentation/x86_64/boot-options.txt 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Documentation/x86_64/boot-options.txt 2003-10-09 19:27:22.000000000 -0700 @@ -20,18 +20,15 @@ Machine check APICs - nolocalapic Don't use a local or IO-APIC. This should only - be needed if you have a buggy BIOS. The newer - kernels already turn it off by default if the - BIOS didn't enable the local APIC, so it will - be hopefully not needed. - Note this code path is not very well tested, you are on - your own. - apic Use IO-APIC. Default + Unless you have an NVidia or VIA/Uniprocessor board. + Then it defaults to off. noapic Don't use the IO-APIC. - Also only lightly tested. + + disableapic Don't use the local APIC + + nolapic Don't use the local APIC (alias for i386 compatibility) pirq=... See Documentation/i386/IO-APIC.txt @@ -60,13 +57,16 @@ Timing Report when timer interrupts are lost because some code turned off interrupts for too long. - nmi_watchdog=NUMBER + nmi_watchdog=NUMBER[,panic] NUMBER can be: 0 don't use an NMI watchdog 1 use the IO-APIC timer for the NMI watchdog 2 use the local APIC for the NMI watchdog using a performance counter. Note This will use one performance counter and the local APIC's performance vector. + When panic is specified, panic when an NMI watchdog timeout occurs. + This is useful when you use a panic=... timeout and need the box + back up quickly. Idle loop @@ -127,6 +127,9 @@ NUMA ACPI acpi=off Don't enable ACPI + acpi=ht Use ACPI boot table parsing, but don't enable the ACPI + interpreter + acpi=force Force ACPI on (currently not needed) PCI diff -prauN linux-2.6.0-test7/Makefile wli-2.6.0-test7-bk1-29/Makefile --- linux-2.6.0-test7/Makefile 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/Makefile 2003-10-09 19:27:22.000000000 -0700 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 0 -EXTRAVERSION = -test7 +EXTRAVERSION = -test7-bk1 # *DOCUMENTATION* # To see a list of typical targets execute "make help" diff -prauN linux-2.6.0-test7/arch/alpha/mm/remap.c wli-2.6.0-test7-bk1-29/arch/alpha/mm/remap.c --- linux-2.6.0-test7/arch/alpha/mm/remap.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/alpha/mm/remap.c 2003-10-09 19:28:46.000000000 -0700 @@ -73,7 +73,7 @@ __alpha_remap_area_pages(unsigned long a spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/arm/Makefile wli-2.6.0-test7-bk1-29/arch/arm/Makefile --- linux-2.6.0-test7/arch/arm/Makefile 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/Makefile 2003-10-09 19:27:22.000000000 -0700 @@ -155,10 +155,8 @@ zImage Image bootpImage: vmlinux zinstall install: vmlinux $(Q)$(MAKE) $(build)=$(boot) $@ -MRPROPER_FILES += \ - include/asm-arm/arch include/asm-arm/.arch \ - include/asm-arm/constants.h* \ - include/asm-arm/mach-types.h +CLEAN_FILES += include/asm-arm/constants.h* include/asm-arm/mach-types.h +MRPROPER_FILES += include/asm-arm/arch include/asm-arm/.arch # We use MRPROPER_FILES and CLEAN_FILES now archclean: diff -prauN linux-2.6.0-test7/arch/arm/configs/lart_defconfig
wli-2.6.0-test7-bk1-29/arch/arm/configs/lart_defconfig --- linux-2.6.0-test7/arch/arm/configs/lart_defconfig 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/configs/lart_defconfig 2003-10-09 19:27:22.000000000 -0700 @@ -2,76 +2,107 @@ # Automatically generated make config: don't edit # CONFIG_ARM=y -# CONFIG_EISA is not set -# CONFIG_SBUS is not set -# CONFIG_MCA is not set +CONFIG_MMU=y CONFIG_UID16=y CONFIG_RWSEM_GENERIC_SPINLOCK=y -# CONFIG_RWSEM_XCHGADD_ALGORITHM is not set # # Code maturity level options # CONFIG_EXPERIMENTAL=y -# CONFIG_OBSOLETE is not set +CONFIG_CLEAN_COMPILE=y +CONFIG_STANDALONE=y +CONFIG_BROKEN_ON_SMP=y + +# +# General setup +# +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +# CONFIG_BSD_PROCESS_ACCT is not set +CONFIG_SYSCTL=y +CONFIG_LOG_BUF_SHIFT=14 +# CONFIG_IKCONFIG is not set +# CONFIG_EMBEDDED is not set +CONFIG_KALLSYMS=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y # # Loadable module support # CONFIG_MODULES=y +# CONFIG_MODULE_UNLOAD is not set +CONFIG_OBSOLETE_MODPARM=y # CONFIG_MODVERSIONS is not set CONFIG_KMOD=y # # System Type # +# CONFIG_ARCH_ADIFCC is not set # CONFIG_ARCH_ANAKIN is not set -# CONFIG_ARCH_ARCA5K is not set # CONFIG_ARCH_CLPS7500 is not set # CONFIG_ARCH_CLPS711X is not set # CONFIG_ARCH_CO285 is not set +# CONFIG_ARCH_PXA is not set # CONFIG_ARCH_EBSA110 is not set +# CONFIG_ARCH_CAMELOT is not set # CONFIG_ARCH_FOOTBRIDGE is not set # CONFIG_ARCH_INTEGRATOR is not set +# CONFIG_ARCH_IOP3XX is not set # CONFIG_ARCH_L7200 is not set # CONFIG_ARCH_RPC is not set CONFIG_ARCH_SA1100=y # CONFIG_ARCH_SHARK is not set # -# Archimedes/A5000 Implementations +# CLPS711X/EP721X Implementations # # -# Archimedes/A5000 Implementations (select only ONE) +# Epxa10db # -# CONFIG_ARCH_ARC is not set -# CONFIG_ARCH_A5K is not set # # Footbridge Implementations # -# CONFIG_ARCH_CATS is not set -# CONFIG_ARCH_PERSONAL_SERVER is not set -# CONFIG_ARCH_EBSA285_ADDIN is not set -# CONFIG_ARCH_EBSA285_HOST is not set -# CONFIG_ARCH_NETWINDER is not set + +# +# IOP3xx Implementation Options +# +# CONFIG_ARCH_IOP310 is not set +# CONFIG_ARCH_IOP321 is not set + +# +# IOP3xx Chipset Features +# + +# +# Intel PXA250/210 Implementations +# # # SA11x0 Implementations # # CONFIG_SA1100_ASSABET is not set -# CONFIG_ASSABET_NEPONSET is not set # CONFIG_SA1100_ADSBITSY is not set # CONFIG_SA1100_BRUTUS is not set # CONFIG_SA1100_CERF is not set +# CONFIG_SA1100_H3100 is not set # CONFIG_SA1100_H3600 is not set +# CONFIG_SA1100_H3800 is not set # CONFIG_SA1100_EXTENEX1 is not set # CONFIG_SA1100_FLEXANET is not set # CONFIG_SA1100_FREEBIRD is not set # CONFIG_SA1100_GRAPHICSCLIENT is not set # CONFIG_SA1100_GRAPHICSMASTER is not set +# CONFIG_SA1100_BADGE4 is not set # CONFIG_SA1100_JORNADA720 is not set +# CONFIG_SA1100_HACKKIT is not set # CONFIG_SA1100_HUW_WEBPANEL is not set # CONFIG_SA1100_ITSY is not set CONFIG_SA1100_LART=y @@ -79,76 +110,72 @@ CONFIG_SA1100_LART=y # CONFIG_SA1100_OMNIMETER is not set # CONFIG_SA1100_PANGOLIN is not set # CONFIG_SA1100_PLEB is not set +# CONFIG_SA1100_PT_SYSTEM3 is not set +# CONFIG_SA1100_SHANNON is not set # CONFIG_SA1100_SHERMAN is not set # CONFIG_SA1100_SIMPAD is not set # CONFIG_SA1100_PFS168 is not set # CONFIG_SA1100_VICTOR is not set # CONFIG_SA1100_XP860 is not set # CONFIG_SA1100_YOPY is not set +# CONFIG_SA1100_STORK is not set +# CONFIG_SA1100_SSP is not set CONFIG_SA1100_USB=m CONFIG_SA1100_USB_NETLINK=m CONFIG_SA1100_USB_CHAR=m # -# 
CLPS711X/EP721X Implementations +# Processor Type # -# CONFIG_ARCH_CDB89712 is not set -# CONFIG_ARCH_CLEP7312 is not set -# CONFIG_ARCH_EDB7211 is not set -# CONFIG_ARCH_P720T is not set -# CONFIG_ARCH_EP7211 is not set -# CONFIG_ARCH_EP7212 is not set -# CONFIG_ARCH_ACORN is not set -# CONFIG_FOOTBRIDGE is not set -# CONFIG_FOOTBRIDGE_HOST is not set -# CONFIG_FOOTBRIDGE_ADDIN is not set CONFIG_CPU_32=y -# CONFIG_CPU_26 is not set +CONFIG_CPU_SA1100=y +CONFIG_CPU_32v4=y +CONFIG_CPU_ABRT_EV4=y +CONFIG_CPU_CACHE_V4WB=y +CONFIG_CPU_TLB_V4WB=y +CONFIG_CPU_MINICACHE=y # -# Processor Type +# Processor Features # -# CONFIG_CPU_32v3 is not set -CONFIG_CPU_32v4=y -# CONFIG_CPU_ARM610 is not set -# CONFIG_CPU_ARM710 is not set -# CONFIG_CPU_ARM720T is not set -# CONFIG_CPU_ARM920T is not set -# CONFIG_CPU_ARM1020 is not set -# CONFIG_CPU_SA110 is not set -CONFIG_CPU_SA1100=y -CONFIG_DISCONTIGMEM=y -# CONFIG_CPU_BIG_ENDIAN is not set # # General setup # -# CONFIG_PCI is not set -# CONFIG_ISA is not set -# CONFIG_ISA_DMA is not set +CONFIG_DISCONTIGMEM=y +CONFIG_ISA=y +# CONFIG_ZBOOT_ROM is not set +CONFIG_ZBOOT_ROM_TEXT=0x0 +CONFIG_ZBOOT_ROM_BSS=0x0 CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_SA1100=y +# CONFIG_CPU_FREQ_PROC_INTF is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_24_API=y # CONFIG_HOTPLUG is not set -# CONFIG_PCMCIA is not set -CONFIG_NET=y -CONFIG_SYSVIPC=y -# CONFIG_BSD_PROCESS_ACCT is not set -CONFIG_SYSCTL=y # # At least one math emulation must be selected # CONFIG_FPE_NWFPE=y +# CONFIG_FPE_NWFPE_XP is not set # CONFIG_FPE_FASTFPE is not set -CONFIG_KCORE_ELF=y -# CONFIG_KCORE_AOUT is not set -CONFIG_BINFMT_AOUT=y CONFIG_BINFMT_ELF=y +CONFIG_BINFMT_AOUT=y # CONFIG_BINFMT_MISC is not set + +# +# Generic Driver Options +# CONFIG_PM=y +# CONFIG_PREEMPT is not set CONFIG_APM=m # CONFIG_ARTHUR is not set CONFIG_CMDLINE="console=ttySA0,9600 root=/dev/ram" -# CONFIG_PFS168_CMDLINE is not set CONFIG_LEDS=y # CONFIG_LEDS_TIMER is not set CONFIG_LEDS_CPU=y @@ -166,8 +193,9 @@ CONFIG_MTD=y CONFIG_MTD_DEBUG=y CONFIG_MTD_DEBUG_VERBOSE=1 CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_CONCAT is not set # CONFIG_MTD_REDBOOT_PARTS is not set -# CONFIG_MTD_BOOTLDR_PARTS is not set +# CONFIG_MTD_CMDLINE_PARTS is not set # CONFIG_MTD_AFS_PARTS is not set # @@ -177,51 +205,26 @@ CONFIG_MTD_CHAR=y CONFIG_MTD_BLOCK=y # CONFIG_FTL is not set # CONFIG_NFTL is not set +# CONFIG_INFTL is not set # # RAM/ROM/Flash chip drivers # # CONFIG_MTD_CFI is not set -# CONFIG_MTD_CFI_INTELEXT is not set -# CONFIG_MTD_CFI_AMDSTD is not set -# CONFIG_MTD_AMDSTD is not set -# CONFIG_MTD_SHARP is not set +# CONFIG_MTD_JEDECPROBE is not set # CONFIG_MTD_RAM is not set # CONFIG_MTD_ROM is not set -# CONFIG_MTD_JEDEC is not set +# CONFIG_MTD_ABSENT is not set +# CONFIG_MTD_OBSOLETE_CHIPS is not set # # Mapping drivers for chip access # -# CONFIG_MTD_PHYSMAP is not set -# CONFIG_MTD_SUN_UFLASH is not set -# CONFIG_MTD_NORA is not set -# CONFIG_MTD_PNC2000 is not set -# CONFIG_MTD_RPXLITE is not set -# CONFIG_MTD_TQM8XXL is not set -# CONFIG_MTD_SC520CDP is not set -# CONFIG_MTD_NETSC520 is not set -# CONFIG_MTD_SBC_GXX is not set -# CONFIG_MTD_ELAN_104NC is not set -# CONFIG_MTD_DBOX2 is not set -# CONFIG_MTD_CSTM_MIPS_IXX is not set -# CONFIG_MTD_CFI_FLAGADM is not set -# CONFIG_MTD_SOLUTIONENGINE is not set -# CONFIG_MTD_MIXMEM is not set -# 
CONFIG_MTD_OCTAGON is not set -# CONFIG_MTD_VMAX is not set -# CONFIG_MTD_OCELOT is not set -# CONFIG_MTD_L440GX is not set -# CONFIG_MTD_ARM_INTEGRATOR is not set -# CONFIG_MTD_CDB89712 is not set -# CONFIG_MTD_SA1100 is not set -# CONFIG_MTD_DC21285 is not set -# CONFIG_MTD_IQ80310 is not set +# CONFIG_MTD_COMPLEX_MAPPINGS is not set # # Self-contained MTD device drivers # -# CONFIG_MTD_PMC551 is not set # CONFIG_MTD_SLRAM is not set CONFIG_MTD_LART=y # CONFIG_MTD_MTDRAM is not set @@ -230,10 +233,9 @@ CONFIG_MTD_LART=y # # Disk-On-Chip Device Drivers # -# CONFIG_MTD_DOC1000 is not set # CONFIG_MTD_DOC2000 is not set # CONFIG_MTD_DOC2001 is not set -# CONFIG_MTD_DOCPROBE is not set +# CONFIG_MTD_DOC2001PLUS is not set # # NAND Flash Device Drivers @@ -241,21 +243,15 @@ CONFIG_MTD_LART=y # CONFIG_MTD_NAND is not set # -# Plug and Play configuration +# Plug and Play support # # CONFIG_PNP is not set -# CONFIG_ISAPNP is not set -# CONFIG_PNPBIOS is not set # # Block devices # # CONFIG_BLK_DEV_FD is not set # CONFIG_BLK_DEV_XD is not set -# CONFIG_PARIDE is not set -# CONFIG_BLK_CPQ_DA is not set -# CONFIG_BLK_CPQ_CISS_DA is not set -# CONFIG_BLK_DEV_DAC960 is not set # CONFIG_BLK_DEV_LOOP is not set # CONFIG_BLK_DEV_NBD is not set CONFIG_BLK_DEV_RAM=y @@ -266,44 +262,49 @@ CONFIG_BLK_DEV_INITRD=y # Multi-device support (RAID and LVM) # # CONFIG_MD is not set -# CONFIG_BLK_DEV_MD is not set -# CONFIG_MD_LINEAR is not set -# CONFIG_MD_RAID0 is not set -# CONFIG_MD_RAID1 is not set -# CONFIG_MD_RAID5 is not set -# CONFIG_BLK_DEV_LVM is not set + +# +# Networking support +# +CONFIG_NET=y # # Networking options # CONFIG_PACKET=m # CONFIG_PACKET_MMAP is not set -# CONFIG_NETLINK is not set -# CONFIG_NETFILTER is not set -# CONFIG_FILTER is not set +# CONFIG_NETLINK_DEV is not set CONFIG_UNIX=y +# CONFIG_NET_KEY is not set CONFIG_INET=y # CONFIG_IP_MULTICAST is not set # CONFIG_IP_ADVANCED_ROUTER is not set # CONFIG_IP_PNP is not set # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set +# CONFIG_ARPD is not set CONFIG_INET_ECN=y CONFIG_SYN_COOKIES=y +# CONFIG_INET_AH is not set +# CONFIG_INET_ESP is not set +# CONFIG_INET_IPCOMP is not set # CONFIG_IPV6 is not set -# CONFIG_KHTTPD is not set -# CONFIG_ATM is not set +# CONFIG_DECNET is not set +# CONFIG_BRIDGE is not set +# CONFIG_NETFILTER is not set # -# +# SCTP Configuration (EXPERIMENTAL) # +CONFIG_IPV6_SCTP__=y +# CONFIG_IP_SCTP is not set +# CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set +# CONFIG_LLC2 is not set # CONFIG_IPX is not set # CONFIG_ATALK is not set -# CONFIG_DECNET is not set -# CONFIG_BRIDGE is not set # CONFIG_X25 is not set # CONFIG_LAPB is not set -# CONFIG_LLC is not set # CONFIG_NET_DIVERT is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set @@ -316,8 +317,9 @@ CONFIG_SYN_COOKIES=y # CONFIG_NET_SCHED is not set # -# Network device support +# Network testing # +# CONFIG_NET_PKTGEN is not set CONFIG_NETDEVICES=y # @@ -333,15 +335,14 @@ CONFIG_DUMMY=m # Ethernet (10 or 100Mbit) # CONFIG_NET_ETHERNET=y -# CONFIG_SUNLANCE is not set -# CONFIG_SUNBMAC is not set -# CONFIG_SUNQE is not set -# CONFIG_SUNLANCE is not set -# CONFIG_SUNGEM is not set +# CONFIG_MII is not set # CONFIG_NET_VENDOR_3COM is not set # CONFIG_LANCE is not set # CONFIG_NET_VENDOR_SMC is not set # CONFIG_NET_VENDOR_RACAL is not set +# CONFIG_AT1700 is not set +# CONFIG_DEPCA is not set +# CONFIG_HP100 is not set # CONFIG_NET_ISA is not set # CONFIG_NET_PCI is not set # CONFIG_NET_POCKET is not set @@ -349,16 +350,10 @@ 
CONFIG_NET_ETHERNET=y # # Ethernet (1000 Mbit) # -# CONFIG_ACENIC is not set -# CONFIG_DL2K is not set -# CONFIG_MYRI_SBUS is not set -# CONFIG_NS83820 is not set -# CONFIG_HAMACHI is not set -# CONFIG_YELLOWFIN is not set -# CONFIG_SK98LIN is not set -# CONFIG_FDDI is not set -# CONFIG_HIPPI is not set -# CONFIG_PLIP is not set + +# +# Ethernet (10000 Mbit) +# CONFIG_PPP=m # CONFIG_PPP_MULTILINK is not set # CONFIG_PPP_FILTER is not set @@ -381,8 +376,6 @@ CONFIG_SLIP_COMPRESSED=y # Token Ring devices # # CONFIG_TR is not set -# CONFIG_NET_FC is not set -# CONFIG_RCPCI is not set # CONFIG_SHAPER is not set # @@ -407,10 +400,9 @@ CONFIG_IRLAN=m CONFIG_IRNET=m CONFIG_IRCOMM=m # CONFIG_IRDA_ULTRA is not set -CONFIG_IRDA_OPTIONS=y # -# IrDA options +# IrDA options # CONFIG_IRDA_CACHE_LAST_LSAP=y # CONFIG_IRDA_FAST_RR is not set @@ -424,132 +416,145 @@ CONFIG_IRDA_DEBUG=y # SIR device drivers # # CONFIG_IRTTY_SIR is not set -# CONFIG_IRPORT_SIR is not set # # Dongle support # -# CONFIG_DONGLE is not set + +# +# Old SIR device drivers +# +# CONFIG_IRPORT_SIR is not set + +# +# Old Serial dongle support +# # # FIR device drivers # -# CONFIG_USB_IRDA is not set # CONFIG_NSC_FIR is not set # CONFIG_WINBOND_FIR is not set # CONFIG_TOSHIBA_FIR is not set # CONFIG_SMC_IRCC_FIR is not set # CONFIG_ALI_FIR is not set -# CONFIG_VLSI_FIR is not set CONFIG_SA1100_FIR=m +# CONFIG_VIA_FIR is not set # -# ATA/IDE/MFM/RLL support +# Bluetooth support # -CONFIG_IDE=m +# CONFIG_BT is not set # -# IDE, ATA and ATAPI Block devices +# ATA/ATAPI/MFM/RLL support # +CONFIG_IDE=m CONFIG_BLK_DEV_IDE=m # # Please see Documentation/ide.txt for help/info on IDE drives # -# CONFIG_BLK_DEV_HD_IDE is not set -# CONFIG_BLK_DEV_HD is not set CONFIG_BLK_DEV_IDEDISK=m # CONFIG_IDEDISK_MULTI_MODE is not set -# CONFIG_BLK_DEV_IDECS is not set +# CONFIG_IDEDISK_STROKE is not set CONFIG_BLK_DEV_IDECD=m # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set -# CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set +# CONFIG_IDE_TASKFILE_IO is not set # # IDE chipset support/bugfixes # -# CONFIG_BLK_DEV_CMD640 is not set -# CONFIG_BLK_DEV_CMD640_ENHANCED is not set -# CONFIG_BLK_DEV_ISAPNP is not set # CONFIG_IDE_CHIPSETS is not set +# CONFIG_BLK_DEV_IDEDMA is not set # CONFIG_IDEDMA_AUTO is not set -# CONFIG_BLK_DEV_ATARAID is not set -# CONFIG_BLK_DEV_ATARAID_PDC is not set -# CONFIG_BLK_DEV_ATARAID_HPT is not set +# CONFIG_DMA_NONPCI is not set +# CONFIG_BLK_DEV_HD is not set # -# SCSI support +# SCSI device support # # CONFIG_SCSI is not set # # I2O device support # -# CONFIG_I2O is not set -# CONFIG_I2O_BLOCK is not set -# CONFIG_I2O_LAN is not set -# CONFIG_I2O_SCSI is not set -# CONFIG_I2O_PROC is not set # # ISDN subsystem # -# CONFIG_ISDN is not set +# CONFIG_ISDN_BOOL is not set + +# +# Input device support +# +CONFIG_INPUT=y # -# Input core support +# Userland interfaces # -# CONFIG_INPUT is not set -# CONFIG_INPUT_KEYBDEV is not set -# CONFIG_INPUT_MOUSEDEV is not set +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 # CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_TSDEV is not set # CONFIG_INPUT_EVDEV is not set +# CONFIG_INPUT_EVBUG is not set + +# +# Input I/O drivers +# +# CONFIG_GAMEPORT is not set +CONFIG_SOUND_GAMEPORT=y +CONFIG_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=y +# CONFIG_SERIO_CT82C710 is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ATKBD=y +# 
CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_XTKBD is not set +# CONFIG_KEYBOARD_NEWTON is not set +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=y +# CONFIG_MOUSE_PS2_SYNAPTICS is not set +# CONFIG_MOUSE_SERIAL is not set +# CONFIG_MOUSE_INPORT is not set +# CONFIG_MOUSE_LOGIBM is not set +# CONFIG_MOUSE_PC110PAD is not set +# CONFIG_INPUT_JOYSTICK is not set +# CONFIG_INPUT_TOUCHSCREEN is not set +# CONFIG_INPUT_MISC is not set # # Character devices # -# CONFIG_VT is not set -# CONFIG_SERIAL is not set -# CONFIG_SERIAL_EXTENDED is not set +CONFIG_VT=y +CONFIG_VT_CONSOLE=y +CONFIG_HW_CONSOLE=y # CONFIG_SERIAL_NONSTANDARD is not set # # Serial drivers # -# CONFIG_SERIAL_ANAKIN is not set -# CONFIG_SERIAL_ANAKIN_CONSOLE is not set -# CONFIG_SERIAL_AMBA is not set -# CONFIG_SERIAL_AMBA_CONSOLE is not set -# CONFIG_SERIAL_CLPS711X is not set -# CONFIG_SERIAL_CLPS711X_CONSOLE is not set -# CONFIG_SERIAL_21285 is not set -# CONFIG_SERIAL_21285_OLD is not set -# CONFIG_SERIAL_21285_CONSOLE is not set +# CONFIG_SERIAL_8250 is not set + +# +# Non-8250 serial port support +# CONFIG_SERIAL_SA1100=y CONFIG_SERIAL_SA1100_CONSOLE=y -CONFIG_SA1100_DEFAULT_BAUDRATE=9600 -# CONFIG_SERIAL_8250 is not set -# CONFIG_SERIAL_8250_CONSOLE is not set -# CONFIG_SERIAL_8250_EXTENDED is not set -# CONFIG_SERIAL_8250_MANY_PORTS is not set -# CONFIG_SERIAL_8250_SHARE_IRQ is not set -# CONFIG_SERIAL_8250_DETECT_IRQ is not set -# CONFIG_SERIAL_8250_MULTIPORT is not set -# CONFIG_SERIAL_8250_HUB6 is not set CONFIG_SERIAL_CORE=y CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_UNIX98_PTYS=y CONFIG_UNIX98_PTY_COUNT=256 -CONFIG_UCB1200=m -CONFIG_TOUCHSCREEN_UCB1200=m -CONFIG_AUDIO_UCB1200=m -CONFIG_ADC_UCB1200=m -# CONFIG_TOUCHSCREEN_H3600 is not set -CONFIG_PROFILER=m -# CONFIG_PFS168_SPI is not set -# CONFIG_PFS168_DTMF is not set -# CONFIG_PFS168_MISC is not set # # I2C support @@ -557,55 +562,39 @@ CONFIG_PROFILER=m # CONFIG_I2C is not set # -# L3 serial bus support +# I2C Algorithms # -# CONFIG_L3 is not set -# CONFIG_L3_ALGOBIT is not set -# CONFIG_L3_BIT_SA1100_GPIO is not set # -# Other L3 adapters +# I2C Hardware Bus support # -# CONFIG_L3_SA1111 is not set # -# L3 driver support +# I2C Hardware Sensors Chip support # -# CONFIG_L3_DRV_UDA1341 is not set -# CONFIG_BIT_SA1100_GPIO is not set +# CONFIG_I2C_SENSOR is not set # # Mice # # CONFIG_BUSMOUSE is not set -# CONFIG_MOUSE is not set - -# -# Joysticks -# -# CONFIG_INPUT_GAMEPORT is not set - -# -# Input core support is needed for gameports -# +# CONFIG_QIC02_TAPE is not set # -# Input core support is needed for joysticks +# IPMI # -# CONFIG_QIC02_TAPE is not set +# CONFIG_IPMI_HANDLER is not set # # Watchdog Cards # # CONFIG_WATCHDOG is not set -# CONFIG_INTEL_RNG is not set # CONFIG_NVRAM is not set # CONFIG_RTC is not set -CONFIG_SA1100_RTC=m +# CONFIG_GEN_RTC is not set # CONFIG_DTLK is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set -# CONFIG_SONYPI is not set # # Ftape, the floppy tape device driver @@ -613,6 +602,7 @@ CONFIG_SA1100_RTC=m # CONFIG_FTAPE is not set # CONFIG_AGP is not set # CONFIG_DRM is not set +# CONFIG_RAW_DRIVER is not set # # Multimedia devices @@ -620,87 +610,104 @@ CONFIG_SA1100_RTC=m # CONFIG_VIDEO_DEV is not set # +# Digital Video Broadcasting Devices +# +# CONFIG_DVB is not set + +# # File systems # -# CONFIG_QUOTA is not set -# CONFIG_AUTOFS_FS is not set -# CONFIG_AUTOFS4_FS is not set +CONFIG_EXT2_FS=y +# CONFIG_EXT2_FS_XATTR is not set +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_XATTR=y +# CONFIG_EXT3_FS_POSIX_ACL is not set +# 
CONFIG_EXT3_FS_SECURITY is not set +CONFIG_JBD=m +# CONFIG_JBD_DEBUG is not set +CONFIG_FS_MBCACHE=y CONFIG_REISERFS_FS=m # CONFIG_REISERFS_CHECK is not set # CONFIG_REISERFS_PROC_INFO is not set +# CONFIG_JFS_FS is not set +# CONFIG_XFS_FS is not set +# CONFIG_MINIX_FS is not set +# CONFIG_ROMFS_FS is not set +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +# CONFIG_ZISOFS is not set +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_FAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +# CONFIG_DEVFS_FS is not set +CONFIG_DEVPTS_FS=y +# CONFIG_DEVPTS_FS_XATTR is not set +CONFIG_TMPFS=y +# CONFIG_HUGETLB_PAGE is not set +CONFIG_RAMFS=y + +# +# Miscellaneous filesystems +# # CONFIG_ADFS_FS is not set -# CONFIG_ADFS_FS_RW is not set # CONFIG_AFFS_FS is not set # CONFIG_HFS_FS is not set +# CONFIG_BEFS_FS is not set # CONFIG_BFS_FS is not set -# CONFIG_CMS_FS is not set -CONFIG_EXT3_FS=m -CONFIG_JBD=m -# CONFIG_JBD_DEBUG is not set -# CONFIG_FAT_FS is not set -# CONFIG_MSDOS_FS is not set -# CONFIG_UMSDOS_FS is not set -# CONFIG_VFAT_FS is not set # CONFIG_EFS_FS is not set # CONFIG_JFFS_FS is not set CONFIG_JFFS2_FS=m CONFIG_JFFS2_FS_DEBUG=1 +# CONFIG_JFFS2_FS_NAND is not set CONFIG_CRAMFS=m -CONFIG_TMPFS=y -CONFIG_RAMFS=m -CONFIG_ISO9660_FS=m -CONFIG_JOLIET=y -# CONFIG_MINIX_FS is not set -# CONFIG_FREEVXFS_FS is not set -# CONFIG_NTFS_FS is not set -# CONFIG_NTFS_DEBUG is not set -# CONFIG_NTFS_RW is not set +# CONFIG_VXFS_FS is not set # CONFIG_HPFS_FS is not set -CONFIG_PROC_FS=y -# CONFIG_DEVFS_FS is not set -# CONFIG_DEVFS_MOUNT is not set -# CONFIG_DEVFS_DEBUG is not set -CONFIG_DEVPTS_FS=y # CONFIG_QNX4FS_FS is not set -# CONFIG_QNX4FS_RW is not set -# CONFIG_ROMFS_FS is not set -CONFIG_EXT2_FS=y # CONFIG_SYSV_FS is not set -CONFIG_UDF_FS=m -# CONFIG_UDF_RW is not set # CONFIG_UFS_FS is not set -# CONFIG_UFS_FS_WRITE is not set # # Network File Systems # -# CONFIG_CODA_FS is not set -# CONFIG_INTERMEZZO_FS is not set CONFIG_NFS_FS=m CONFIG_NFS_V3=y -# CONFIG_ROOT_NFS is not set +# CONFIG_NFS_V4 is not set CONFIG_NFSD=m CONFIG_NFSD_V3=y -CONFIG_SUNRPC=m +# CONFIG_NFSD_V4 is not set +# CONFIG_NFSD_TCP is not set CONFIG_LOCKD=m CONFIG_LOCKD_V4=y +CONFIG_EXPORTFS=m +CONFIG_SUNRPC=m +# CONFIG_SUNRPC_GSS is not set # CONFIG_SMB_FS is not set +# CONFIG_CIFS is not set # CONFIG_NCP_FS is not set -# CONFIG_NCPFS_PACKET_SIGNING is not set -# CONFIG_NCPFS_IOCTL_LOCKING is not set -# CONFIG_NCPFS_STRONG is not set -# CONFIG_NCPFS_NFS_NS is not set -# CONFIG_NCPFS_OS2_NS is not set -# CONFIG_NCPFS_SMALLDOS is not set -# CONFIG_NCPFS_NLS is not set -# CONFIG_NCPFS_EXTRAS is not set +# CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set +# CONFIG_AFS_FS is not set # # Partition Types # # CONFIG_PARTITION_ADVANCED is not set -CONFIG_MSDOS_PARTITION=y -# CONFIG_SMB_NLS is not set CONFIG_NLS=y # @@ -728,6 +735,7 @@ CONFIG_NLS_CODEPAGE_850=m # CONFIG_NLS_CODEPAGE_949 is not set # CONFIG_NLS_CODEPAGE_874 is not set # CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set # CONFIG_NLS_CODEPAGE_1251 is not set CONFIG_NLS_ISO8859_1=m # CONFIG_NLS_ISO8859_2 is not set @@ -745,149 +753,62 @@ CONFIG_NLS_ISO8859_15=m CONFIG_NLS_UTF8=m # -# Sound -# -CONFIG_SOUND=m -# CONFIG_SOUND_BT878 is not set -# CONFIG_SOUND_CMPCI is not set -# CONFIG_SOUND_EMU10K1 is not set -# CONFIG_SOUND_FUSION is not set -# CONFIG_SOUND_CS4281 is 
not set -# CONFIG_SOUND_ES1370 is not set -# CONFIG_SOUND_ES1371 is not set -# CONFIG_SOUND_ESSSOLO1 is not set -# CONFIG_SOUND_MAESTRO is not set -# CONFIG_SOUND_MAESTRO3 is not set -# CONFIG_SOUND_ICH is not set -# CONFIG_SOUND_RME96XX is not set -# CONFIG_SOUND_SONICVIBES is not set -# CONFIG_SOUND_TRIDENT is not set -# CONFIG_SOUND_MSNDCLAS is not set -# CONFIG_SOUND_MSNDPIN is not set -# CONFIG_SOUND_VIA82CXXX is not set -# CONFIG_MIDI_VIA82CXXX is not set -# CONFIG_SOUND_ASSABET_UDA1341 is not set -# CONFIG_SOUND_H3600_UDA1341 is not set -# CONFIG_SOUND_PANGOLIN_UDA1341 is not set -# CONFIG_SOUND_SA1111_UDA1341 is not set -CONFIG_SOUND_SA1100SSP=m -# CONFIG_SOUND_OSS is not set -# CONFIG_SOUND_WAVEARTIST is not set -# CONFIG_SOUND_TVMIXER is not set - -# -# USB support -# -# CONFIG_USB is not set - -# -# USB Controllers -# -# CONFIG_USB_UHCI is not set -# CONFIG_USB_UHCI_ALT is not set -# CONFIG_USB_OHCI is not set - -# -# USB Device Class drivers +# Graphics support # -# CONFIG_USB_AUDIO is not set -# CONFIG_USB_BLUETOOTH is not set -# CONFIG_USB_STORAGE is not set -# CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_DATAFAB is not set -# CONFIG_USB_STORAGE_FREECOM is not set -# CONFIG_USB_STORAGE_JUMPSHOT is not set -# CONFIG_USB_STORAGE_DPCM is not set -# CONFIG_USB_STORAGE_SDDR09 is not set -# CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set +# CONFIG_FB is not set # -# USB Human Interface Devices (HID) +# Console display driver support # +# CONFIG_VGA_CONSOLE is not set +# CONFIG_MDA_CONSOLE is not set +CONFIG_DUMMY_CONSOLE=y # -# Input core support is needed for USB HID -# - -# -# USB Imaging devices +# Sound # -# CONFIG_USB_DC2XX is not set -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_SCANNER is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set +CONFIG_SOUND=m # -# USB Multimedia devices +# Advanced Linux Sound Architecture # +# CONFIG_SND is not set # -# Video4Linux support is needed for USB Multimedia device support +# Open Sound System # -# CONFIG_USB_DABUSB is not set +# CONFIG_SOUND_PRIME is not set # -# USB Network adaptors +# Misc devices # -# CONFIG_USB_PLUSB is not set -# CONFIG_USB_PEGASUS is not set -# CONFIG_USB_KAWETH is not set -# CONFIG_USB_CATC is not set -# CONFIG_USB_CDCETHER is not set -# CONFIG_USB_USBNET is not set # -# USB port drivers +# USB support # -# CONFIG_USB_USS720 is not set +# CONFIG_USB_GADGET is not set # -# USB Serial Converter support +# Kernel hacking # -# CONFIG_USB_SERIAL is not set -# CONFIG_USB_SERIAL_GENERIC is not set -# CONFIG_USB_SERIAL_BELKIN is not set -# CONFIG_USB_SERIAL_WHITEHEAT is not set -# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -# CONFIG_USB_SERIAL_EMPEG is not set -# CONFIG_USB_SERIAL_FTDI_SIO is not set -# CONFIG_USB_SERIAL_VISOR is not set -# CONFIG_USB_SERIAL_EDGEPORT is not set -# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set -# CONFIG_USB_SERIAL_KEYSPAN is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set -# CONFIG_USB_SERIAL_MCT_U232 is not set -# CONFIG_USB_SERIAL_PL2303 is not set -# CONFIG_USB_SERIAL_CYBERJACK is not set -# CONFIG_USB_SERIAL_OMNINET is not set +CONFIG_FRAME_POINTER=y +CONFIG_DEBUG_USER=y +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_KERNEL is not set # -# Miscellaneous USB drivers +# Security options # -# 
CONFIG_USB_RIO500 is not set -# CONFIG_USB_ID75 is not set +# CONFIG_SECURITY is not set # -# Bluetooth support +# Cryptographic options # -# CONFIG_BT is not set +# CONFIG_CRYPTO is not set # -# Kernel hacking +# Library routines # -CONFIG_FRAME_POINTER=y -CONFIG_DEBUG_ERRORS=y -CONFIG_DEBUG_USER=y -# CONFIG_DEBUG_INFO is not set -CONFIG_MAGIC_SYSRQ=y -# CONFIG_NO_PGT_CACHE is not set -CONFIG_DEBUG_LL=y -# CONFIG_DEBUG_DC21285_PORT is not set -# CONFIG_DEBUG_CLPS711X_UART2 is not set +CONFIG_CRC32=m +CONFIG_ZLIB_INFLATE=m +CONFIG_ZLIB_DEFLATE=m diff -prauN linux-2.6.0-test7/arch/arm/mach-sa1100/cpu-sa1100.c wli-2.6.0-test7-bk1-29/arch/arm/mach-sa1100/cpu-sa1100.c --- linux-2.6.0-test7/arch/arm/mach-sa1100/cpu-sa1100.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mach-sa1100/cpu-sa1100.c 2003-10-09 19:27:22.000000000 -0700 @@ -194,7 +194,7 @@ static int sa1100_target(struct cpufreq_ new_ppcr = sa11x0_freq_to_ppcr(target_freq); if ((sa11x0_ppcr_to_freq(new_ppcr) > target_freq) && (sa11x0_ppcr_to_freq(new_ppcr - 1) >= policy->min)) - mew_ppcr--; + new_ppcr--; break; } diff -prauN linux-2.6.0-test7/arch/arm/mach-sa1100/lart.c wli-2.6.0-test7-bk1-29/arch/arm/mach-sa1100/lart.c --- linux-2.6.0-test7/arch/arm/mach-sa1100/lart.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mach-sa1100/lart.c 2003-10-09 19:27:22.000000000 -0700 @@ -8,6 +8,7 @@ #include #include +#include #include #include diff -prauN linux-2.6.0-test7/arch/arm/mm/consistent.c wli-2.6.0-test7-bk1-29/arch/arm/mm/consistent.c --- linux-2.6.0-test7/arch/arm/mm/consistent.c 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mm/consistent.c 2003-10-09 19:28:46.000000000 -0700 @@ -327,7 +327,7 @@ static int __init consistent_init(void) do { pgd = pgd_offset(&init_mm, CONSISTENT_BASE); - pmd = pmd_alloc(&init_mm, pgd, CONSISTENT_BASE); + pmd = pmd_alloc_kernel(&init_mm, pgd, CONSISTENT_BASE); if (!pmd) { printk(KERN_ERR "consistent_init: no pmd tables\n"); ret = -ENOMEM; diff -prauN linux-2.6.0-test7/arch/arm/mm/fault-armv.c wli-2.6.0-test7-bk1-29/arch/arm/mm/fault-armv.c --- linux-2.6.0-test7/arch/arm/mm/fault-armv.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mm/fault-armv.c 2003-10-09 19:42:26.000000000 -0700 @@ -191,19 +191,22 @@ void __flush_dcache_page(struct page *pa __cpuc_flush_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* * With a VIVT cache, we need to also write back * and invalidate any user data. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. */ @@ -234,12 +237,15 @@ make_coherent(struct vm_area_struct *vma * space, then we need to handle them specially to maintain * cache coherency. */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page_mapping(page)->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; + /* * If this VMA is not in our MM, we can ignore it. 
* Note that we intentionally don't mask out the VMA @@ -292,7 +298,7 @@ void update_mmu_cache(struct vm_area_str if (!pfn_valid(pfn)) return; page = pfn_to_page(pfn); - if (page->mapping) { + if (page_mapping(page)) { int dirty = test_and_clear_bit(PG_dcache_dirty, &page->flags); if (dirty) diff -prauN linux-2.6.0-test7/arch/arm/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/arm/mm/ioremap.c --- linux-2.6.0-test7/arch/arm/mm/ioremap.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -95,7 +95,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/arm/mm/minicache.c wli-2.6.0-test7-bk1-29/arch/arm/mm/minicache.c --- linux-2.6.0-test7/arch/arm/mm/minicache.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mm/minicache.c 2003-10-09 19:28:46.000000000 -0700 @@ -59,7 +59,7 @@ static int __init minicache_init(void) spin_lock(&init_mm.page_table_lock); pgd = pgd_offset_k(minicache_address); - pmd = pmd_alloc(&init_mm, pgd, minicache_address); + pmd = pmd_alloc_kernel(&init_mm, pgd, minicache_address); if (!pmd) BUG(); minicache_pte = pte_alloc_kernel(&init_mm, pmd, minicache_address); diff -prauN linux-2.6.0-test7/arch/arm/mm/mm-armv.c wli-2.6.0-test7-bk1-29/arch/arm/mm/mm-armv.c --- linux-2.6.0-test7/arch/arm/mm/mm-armv.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm/mm/mm-armv.c 2003-10-09 19:28:46.000000000 -0700 @@ -132,7 +132,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm if (vectors_base() == 0) { /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_map() and pte_lock */ spin_lock(&mm->page_table_lock); @@ -140,20 +140,22 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it * contains the machine vectors. */ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_map(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); - if (!new_pte) + new_pte = pte_alloc_map(mm, new_pgd, &new_pmd, 0); + if (!new_pte) { + pmd_unmap(new_pmd); goto no_pte; + } init_pmd = pmd_offset(init_pgd, 0); init_pte = pte_offset_map_nested(init_pmd, 0); set_pte(new_pte, *init_pte); pte_unmap_nested(init_pte); pte_unmap(new_pte); - + pmd_unmap(new_pmd); spin_unlock(&mm->page_table_lock); } diff -prauN linux-2.6.0-test7/arch/arm26/mm/mm-memc.c wli-2.6.0-test7-bk1-29/arch/arm26/mm/mm-memc.c --- linux-2.6.0-test7/arch/arm26/mm/mm-memc.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/arm26/mm/mm-memc.c 2003-10-09 19:28:46.000000000 -0700 @@ -79,7 +79,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm goto no_pgd; /* - * This lock is here just to satisfy pmd_alloc and pte_lock + * This lock is here just to satisfy pmd_alloc_kernel() and pte_lock * FIXME: I bet we could avoid taking it pretty much altogether */ spin_lock(&mm->page_table_lock); @@ -88,7 +88,7 @@ pgd_t *get_pgd_slow(struct mm_struct *mm * On ARM, first page must always be allocated since it contains * the machine vectors. 
*/ - new_pmd = pmd_alloc(mm, new_pgd, 0); + new_pmd = pmd_alloc_kernel(mm, new_pgd, 0); if (!new_pmd) goto no_pmd; diff -prauN linux-2.6.0-test7/arch/cris/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/cris/mm/ioremap.c --- linux-2.6.0-test7/arch/cris/mm/ioremap.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/cris/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -78,7 +78,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/i386/Kconfig wli-2.6.0-test7-bk1-29/arch/i386/Kconfig --- linux-2.6.0-test7/arch/i386/Kconfig 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/Kconfig 2003-10-09 19:53:38.000000000 -0700 @@ -397,6 +397,11 @@ config X86_OOSTORE depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 default y +config X86_CMOV + bool + depends on M686 || MPENTIUMII || MPENTIUMIII || MPENTIUM4 || MK8 || MCRUSOE + default y + config HPET_TIMER bool "HPET Timer Support" help @@ -725,6 +730,26 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config HIGHPMD + bool "Allocate 2nd-level pagetables from highmem" + depends on HIGHMEM64G && HIGHPTE + help + The VM uses one lowmem-allocated pmd entry for each pagetable + page of physical memory allocated, and preallocates them all + for 12KB of per-process lowmem overhead. For systems with + extreme amounts of highmem, this cannot be tolerated. Setting + this option will put userspace 2nd-level pagetables in highmem. + +config 4K_STACK + bool "Use smaller 4k per-task stacks" + help + This option will shrink the kernel's per-task stack from 8k to + 4k. This will greatly increase your chance of overflowing it. + But, if you use the per-cpu interrupt stacks as well, your chances + go way down. Also try the CONFIG_X86_STACK_CHECK overflow + detection. It is much more reliable than the version currently + in the kernel. + config MATH_EMULATION bool "Math emulation" ---help--- @@ -1217,6 +1242,34 @@ config FRAME_POINTER If you don't debug the kernel, you can say N, but we may not be able to solve problems without frame pointers. +config X86_STACK_CHECK + bool "Detect stack overflows" + depends on FRAME_POINTER + help + Say Y here to have the kernel attempt to detect when the per-task + kernel stack overflows. This is much more robust checking than + the above overflow check, which will only occasionally detect + an overflow. The level of guarantee here is much greater. + + Some older versions of gcc don't handle the -p option correctly. + Kernprof is affected by the same problem, which is described here: + http://oss.sgi.com/projects/kernprof/faq.html#Q9 + + Basically, if you get oopses in __free_pages_ok during boot when + you have this turned on, you need to fix gcc. The Redhat 2.96 + version and gcc-3.x seem to work. + + If not debugging a stack overflow problem, say N. + +config MMAP_TOPDOWN + bool "Top-down vma allocation" + help + Say Y here to have the kernel change its vma allocation policy + to allocate VMAs from the top of the address space down, and + to shove the stack low so as to conserve virtual address space. + This is risky because various apps, including a number of versions + of ld.so, depend on the kernel's bottom-up behavior.
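The CONFIG_X86_STACK_CHECK machinery added above is driven by an mcount hook that appears later in this patch (see the entry.S hunks). In C terms, the test the hook performs on every function entry is roughly the following sketch; stack_nearly_full is a made-up name for illustration, while THREAD_SIZE and STACK_WARN are the symbols the patch itself uses:

/* Kernel stacks are THREAD_SIZE-aligned, so the low bits of %esp give
 * the distance from the bottom of the stack, where the thread_info
 * sits.  A small distance means the stack is close to overflowing. */
static inline int stack_nearly_full(unsigned long esp)
{
	return (esp & (THREAD_SIZE - 1)) <= STACK_WARN;
}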
+ config X86_EXTRA_IRQS bool depends on X86_LOCAL_APIC || X86_VOYAGER diff -prauN linux-2.6.0-test7/arch/i386/Makefile wli-2.6.0-test7-bk1-29/arch/i386/Makefile --- linux-2.6.0-test7/arch/i386/Makefile 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/Makefile 2003-10-09 19:36:58.000000000 -0700 @@ -84,6 +84,10 @@ mcore-$(CONFIG_X86_ES7000) := mach-es700 # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default +ifdef CONFIG_X86_STACK_CHECK +CFLAGS += -p +endif + head-y := arch/i386/kernel/head.o arch/i386/kernel/init_task.o libs-y += arch/i386/lib/ diff -prauN linux-2.6.0-test7/arch/i386/boot/compressed/misc.c wli-2.6.0-test7-bk1-29/arch/i386/boot/compressed/misc.c --- linux-2.6.0-test7/arch/i386/boot/compressed/misc.c 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/boot/compressed/misc.c 2003-10-09 19:36:58.000000000 -0700 @@ -379,3 +379,7 @@ asmlinkage int decompress_kernel(struct if (high_loaded) close_output_buffer_if_we_run_high(mv); return high_loaded; } + +/* We don't actually check for stack overflows this early. */ +__asm__(".globl mcount ; mcount: ret\n"); + diff -prauN linux-2.6.0-test7/arch/i386/kernel/acpi/wakeup.S wli-2.6.0-test7-bk1-29/arch/i386/kernel/acpi/wakeup.S --- linux-2.6.0-test7/arch/i386/kernel/acpi/wakeup.S 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/acpi/wakeup.S 2003-10-09 19:27:22.000000000 -0700 @@ -172,14 +172,13 @@ ENTRY(wakeup_end) .org 0x1000 wakeup_pmode_return: - movl $__KERNEL_DS, %eax - movl %eax, %ds - movw $0x0e00 + 'u', %ds:(0xb8016) - - # restore other segment registers - xorl %eax, %eax + movw $__KERNEL_DS, %ax + movw %ax, %ss + movw %ax, %ds + movw %ax, %es movw %ax, %fs movw %ax, %gs + movw $0x0e00 + 'u', 0xb8016 # reload the gdt, as we need the full 32 bit address lgdt saved_gdt @@ -192,46 +191,30 @@ wakeup_pmode_return: wbinvd # and restore the stack ... 
but you need gdt for this to work - movl $__KERNEL_DS, %eax - movw %ax, %ss - movw %ax, %ds - movw %ax, %es - movw %ax, %fs - movw %ax, %gs - movl saved_esp, %esp + movl saved_context_esp, %esp - movw $0x0e00 + 'W', %ds:(0xb8018) - movl $(1024*1024*3), %ecx - movl $0, %esi - rep lodsb - movw $0x0e00 + 'O', %ds:(0xb8018) + movw $0x0e00 + 'W', 0xb8018 + outl %eax, $0x80 + outl %eax, $0x80 + movw $0x0e00 + 'O', 0xb8018 movl %cs:saved_magic, %eax cmpl $0x12345678, %eax jne bogus_magic - # restore the other general registers - movl saved_ebx, %ebx - movl saved_edi, %edi - movl saved_esi, %esi - movl saved_ebp, %ebp - # jump to place where we left off movl saved_eip,%eax - movw $0x0e00 + 'x', %ds:(0xb8018) - pushl %eax - popl %eax - movw $0x0e00 + '!', %ds:(0xb801a) + movw $0x0e00 + 'x', 0xb8018 + outl %eax, $0x80 + outl %eax, $0x80 + movw $0x0e00 + '!', 0xb801a jmp *%eax bogus_magic: - movw $0x0e00 + 'B', %ds:(0xb8018) - jmp bogus_magic + movw $0x0e00 + 'B', 0xb8018 + jmp bogus_magic + -bogus_magic2: - movw $0x0e00 + '2', %ds:(0xb8018) - jmp bogus_magic2 - ## # acpi_copy_wakeup_routine # @@ -267,80 +250,45 @@ ENTRY(acpi_copy_wakeup_routine) .data ALIGN -ENTRY(saved_ebp) .long 0 -ENTRY(saved_esi) .long 0 -ENTRY(saved_edi) .long 0 -ENTRY(saved_ebx) .long 0 - -ENTRY(saved_eip) .long 0 -ENTRY(saved_esp) .long 0 - ENTRY(saved_magic) .long 0 +ENTRY(saved_eip) .long 0 -ENTRY(do_suspend_lowlevel) - cmpl $0,4(%esp) - jne ret_point - call save_processor_state - - movl %esp, saved_context_esp - movl %eax, saved_context_eax +save_registers: + leal 4(%esp), %eax + movl %eax, saved_context_esp movl %ebx, saved_context_ebx - movl %ecx, saved_context_ecx - movl %edx, saved_context_edx movl %ebp, saved_context_ebp movl %esi, saved_context_esi movl %edi, saved_context_edi pushfl ; popl saved_context_eflags movl $ret_point,saved_eip - movl %esp,saved_esp - movl %ebp,saved_ebp - movl %ebx,saved_ebx - movl %edi,saved_edi - movl %esi,saved_esi - - pushl $3 - call acpi_enter_sleep_state - addl $4,%esp ret - .p2align 4,,7 -ret_point: - movl $__KERNEL_DS,%eax - movw %ax, %ds - movl saved_context_esp, %esp + + +restore_registers: movl saved_context_ebp, %ebp - movl saved_context_eax, %eax movl saved_context_ebx, %ebx - movl saved_context_ecx, %ecx - movl saved_context_edx, %edx movl saved_context_esi, %esi movl saved_context_edi, %edi - call restore_processor_state pushl saved_context_eflags ; popfl + ret + +ENTRY(do_suspend_lowlevel) + call save_processor_state + call save_registers + pushl $3 + call acpi_enter_sleep_state + ret + .p2align 4,,7 +ret_point: + call restore_registers + call restore_processor_state ret ENTRY(do_suspend_lowlevel_s4bios) - cmpl $0,4(%esp) - jne ret_point call save_processor_state - - movl %esp, saved_context_esp - movl %eax, saved_context_eax - movl %ebx, saved_context_ebx - movl %ecx, saved_context_ecx - movl %edx, saved_context_edx - movl %ebp, saved_context_ebp - movl %esi, saved_context_esi - movl %edi, saved_context_edi - pushfl ; popl saved_context_eflags - - movl $ret_point,saved_eip - movl %esp,saved_esp - movl %ebp,saved_ebp - movl %ebx,saved_ebx - movl %edi,saved_edi - movl %esi,saved_esi - + call save_registers call acpi_enter_sleep_state_s4bios ret diff -prauN linux-2.6.0-test7/arch/i386/kernel/apic.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/apic.c --- linux-2.6.0-test7/arch/i386/kernel/apic.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/apic.c 2003-10-09 19:36:58.000000000 -0700 @@ -1069,7 +1069,8 @@ inline void smp_local_timer_interrupt(st * 
interrupt as well. Thus we cannot inline the local irq ... ] */ -void smp_apic_timer_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_apic_timer_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_apic_timer_interrupt(struct pt_regs* regs) { int cpu = smp_processor_id(); @@ -1089,14 +1090,16 @@ void smp_apic_timer_interrupt(struct pt_ * interrupt lock, which is the WrongThing (tm) to do. */ irq_enter(); - smp_local_timer_interrupt(®s); + smp_local_timer_interrupt(regs); irq_exit(); + return regs; } /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -asmlinkage void smp_spurious_interrupt(void) +struct pt_regs * IRQHANDLER(smp_spurious_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_spurious_interrupt(struct pt_regs* regs) { unsigned long v; @@ -1114,13 +1117,15 @@ asmlinkage void smp_spurious_interrupt(v printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); irq_exit(); + return regs; } /* * This interrupt should never happen with our APIC/SMP architecture */ -asmlinkage void smp_error_interrupt(void) +struct pt_regs * IRQHANDLER(smp_error_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_error_interrupt(struct pt_regs* regs) { unsigned long v, v1; @@ -1145,6 +1150,7 @@ asmlinkage void smp_error_interrupt(void printk (KERN_INFO "APIC error on CPU%d: %02lx(%02lx)\n", smp_processor_id(), v , v1); irq_exit(); + return regs; } /* diff -prauN linux-2.6.0-test7/arch/i386/kernel/cpu/mcheck/p4.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/cpu/mcheck/p4.c --- linux-2.6.0-test7/arch/i386/kernel/cpu/mcheck/p4.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/cpu/mcheck/p4.c 2003-10-09 19:36:58.000000000 -0700 @@ -61,11 +61,13 @@ static void intel_thermal_interrupt(stru /* Thermal interrupt handler for this CPU setup */ static void (*vendor_thermal_interrupt)(struct pt_regs *regs) = unexpected_thermal_interrupt; -asmlinkage void smp_thermal_interrupt(struct pt_regs regs) +struct pt_regs * IRQHANDLER(smp_thermal_interrupt(struct pt_regs* regs)); +struct pt_regs * smp_thermal_interrupt(struct pt_regs* regs) { irq_enter(); - vendor_thermal_interrupt(®s); + vendor_thermal_interrupt(regs); irq_exit(); + return regs; } /* P4/Xeon Thermal regulation detect and init */ diff -prauN linux-2.6.0-test7/arch/i386/kernel/entry.S wli-2.6.0-test7-bk1-29/arch/i386/kernel/entry.S --- linux-2.6.0-test7/arch/i386/kernel/entry.S 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/entry.S 2003-10-09 19:36:58.000000000 -0700 @@ -160,7 +160,7 @@ do_lcall: movl %eax,EFLAGS(%ebp) # movl %edx,EIP(%ebp) # Now we move them to their "normal" places movl %ecx,CS(%ebp) # - andl $-8192, %ebp # GET_THREAD_INFO + GET_THREAD_INFO_WITH_ESP(%ebp) # GET_THREAD_INFO movl TI_EXEC_DOMAIN(%ebp), %edx # Get the execution domain call *4(%edx) # Call the lcall7 handler for the domain addl $4, %esp @@ -394,17 +394,78 @@ ENTRY(irq_entries_start) vector=vector+1 .endr + +# lets play optimizing compiler... +#ifdef CONFIG_X86_CMOV +#define COND_MOVE cmovnz %esi,%esp; +#else +#define COND_MOVE \ + jz 1f; \ + mov %esi,%esp; \ +1: +#endif + +# These macros will switch you to, and from a per-cpu interrupt stack +# They take the pt_regs arg and move it from the normal place on the +# stack to %eax. Any handler function can retrieve it using regparm(1). +# The handlers are expected to return the stack to switch back to in +# the same register. 
+# +# This means that the irq handlers need to return their arg +# +# SWITCH_TO_IRQSTACK clobbers %ebx, %ecx, %edx, %esi +# old stack gets put in %eax + +.macro SWITCH_TO_IRQSTACK + GET_THREAD_INFO(%ebx); + movl TI_IRQ_STACK(%ebx),%ecx; + movl TI_TASK(%ebx),%edx; + movl %esp,%eax; + + # %ecx+THREAD_SIZE is next stack -4 keeps us in the right one + leal (THREAD_SIZE-4)(%ecx),%esi; + + # is there a valid irq_stack? + testl %ecx,%ecx; + COND_MOVE; + + # update the task pointer in the irq stack + GET_THREAD_INFO(%esi); + movl %edx,TI_TASK(%esi); + + # update the preempt count in the irq stack + movl TI_PRE_COUNT(%ebx),%ecx; + movl %ecx,TI_PRE_COUNT(%esi); +.endm + +# copy flags from the irq stack back into the task's thread_info +# %esi is saved over the irq handler call and contains the irq stack's +# thread_info pointer +# %eax was returned from the handler, as described above +# %ebx contains the original thread_info pointer + +.macro RESTORE_FROM_IRQSTACK + movl %eax,%esp; + movl TI_FLAGS(%esi),%eax; + movl $0,TI_FLAGS(%esi); + LOCK orl %eax,TI_FLAGS(%ebx); +.endm + ALIGN common_interrupt: SAVE_ALL + SWITCH_TO_IRQSTACK call do_IRQ + RESTORE_FROM_IRQSTACK jmp ret_from_intr #define BUILD_INTERRUPT(name, nr) \ ENTRY(name) \ pushl $nr-256; \ SAVE_ALL \ - call smp_/**/name; \ + SWITCH_TO_IRQSTACK; \ + call smp_/**/name; \ + RESTORE_FROM_IRQSTACK; \ jmp ret_from_intr; /* The include is where all of the SMP etc. interrupts come from */ @@ -604,6 +665,61 @@ ENTRY(spurious_interrupt_bug) pushl $do_spurious_interrupt_bug jmp error_code + +#ifdef CONFIG_X86_STACK_CHECK +.data + .globl stack_overflowed +stack_overflowed: + .long 0 +.text + +ENTRY(mcount) + push %eax + movl $(THREAD_SIZE - 1),%eax + andl %esp,%eax + cmpl $STACK_WARN,%eax /* more than half the stack is used*/ + jle 1f +2: + popl %eax + ret +1: + lock; btsl $0,stack_overflowed + jc 2b + + # switch to overflow stack + movl %esp,%eax + movl $(stack_overflow_stack + THREAD_SIZE - 4),%esp + + pushf + cli + pushl %eax + + # push eip then esp of error for stack_overflow_panic + pushl 4(%eax) + pushl %eax + + # update the task pointer and cpu in the overflow stack's thread_info. 
+ GET_THREAD_INFO_WITH_ESP(%eax) + movl TI_TASK(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_TASK + movl TI_CPU(%eax),%ebx + movl %ebx,stack_overflow_stack+TI_CPU + + call stack_overflow + + # pop off call arguments + addl $8,%esp + + popl %eax + popf + movl %eax,%esp + popl %eax + movl $0,stack_overflowed + ret + +#warning stack check enabled +#endif + .data ENTRY(sys_call_table) .long sys_restart_syscall /* 0 - old "setup()" system call, used for restarting */ diff -prauN linux-2.6.0-test7/arch/i386/kernel/head.S wli-2.6.0-test7-bk1-29/arch/i386/kernel/head.S --- linux-2.6.0-test7/arch/i386/kernel/head.S 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/head.S 2003-10-09 19:36:58.000000000 -0700 @@ -16,6 +16,7 @@ #include #include #include +#include #define OLD_CL_MAGIC_ADDR 0x90020 #define OLD_CL_MAGIC 0xA33F @@ -325,7 +326,7 @@ rp_sidt: ret ENTRY(stack_start) - .long init_thread_union+8192 + .long init_thread_union+THREAD_SIZE .long __BOOT_DS /* This is the default interrupt "handler" :-) */ diff -prauN linux-2.6.0-test7/arch/i386/kernel/i386_ksyms.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/i386_ksyms.c --- linux-2.6.0-test7/arch/i386/kernel/i386_ksyms.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/i386_ksyms.c 2003-10-09 19:37:59.000000000 -0700 @@ -212,4 +212,9 @@ EXPORT_SYMBOL(eddnr); EXPORT_SYMBOL(ist_info); #endif +#ifdef CONFIG_X86_STACK_CHECK +void mcount(void); +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(csum_partial); diff -prauN linux-2.6.0-test7/arch/i386/kernel/init_task.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/init_task.c --- linux-2.6.0-test7/arch/i386/kernel/init_task.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/init_task.c 2003-10-09 19:36:58.000000000 -0700 @@ -17,6 +17,14 @@ struct mm_struct init_mm = INIT_MM(init_ EXPORT_SYMBOL(init_mm); +union thread_union init_irq_union + __attribute__((__section__(".data.init_task"))); + +#ifdef CONFIG_X86_STACK_CHECK +union thread_union stack_overflow_stack + __attribute__((__section__(".data.init_task"))); +#endif + /* * Initial thread structure. * diff -prauN linux-2.6.0-test7/arch/i386/kernel/irq.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/irq.c --- linux-2.6.0-test7/arch/i386/kernel/irq.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/irq.c 2003-10-09 19:36:58.000000000 -0700 @@ -355,8 +355,10 @@ inline void disable_irq_nosync(unsigned void disable_irq(unsigned int irq) { + irq_desc_t *desc = irq_desc + irq; disable_irq_nosync(irq); - synchronize_irq(irq); + if (desc->action) + synchronize_irq(irq); } /** @@ -402,7 +404,8 @@ void enable_irq(unsigned int irq) * SMP cross-CPU interrupts have their own specific * handlers). */ -asmlinkage unsigned int do_IRQ(struct pt_regs regs) +struct pt_regs * IRQHANDLER(do_IRQ(struct pt_regs *regs)); +struct pt_regs * do_IRQ(struct pt_regs *regs) { /* * We ack quickly, we don't want the irq controller @@ -414,7 +417,7 @@ asmlinkage unsigned int do_IRQ(struct pt * 0 return value means that this irq is already being * handled by some other CPU. 
(or is disabled) */ - int irq = regs.orig_eax & 0xff; /* high bits used in ret_from_ code */ + int irq = regs->orig_eax & 0xff; /* high bits used in ret_from_ code */ irq_desc_t *desc = irq_desc + irq; struct irqaction * action; unsigned int status; @@ -427,7 +430,7 @@ asmlinkage unsigned int do_IRQ(struct pt long esp; __asm__ __volatile__("andl %%esp,%0" : - "=r" (esp) : "0" (8191)); + "=r" (esp) : "0" (THREAD_SIZE - 1)); if (unlikely(esp < (sizeof(struct thread_info) + 1024))) { printk("do_IRQ: stack overflow: %ld\n", esp - sizeof(struct thread_info)); @@ -480,7 +483,7 @@ asmlinkage unsigned int do_IRQ(struct pt irqreturn_t action_ret; spin_unlock(&desc->lock); - action_ret = handle_IRQ_event(irq, ®s, action); + action_ret = handle_IRQ_event(irq, regs, action); spin_lock(&desc->lock); if (!noirqdebug) note_interrupt(irq, desc, action_ret); @@ -500,7 +503,7 @@ out: irq_exit(); - return 1; + return regs; } /** diff -prauN linux-2.6.0-test7/arch/i386/kernel/process.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/process.c --- linux-2.6.0-test7/arch/i386/kernel/process.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/process.c 2003-10-09 19:36:58.000000000 -0700 @@ -213,7 +213,25 @@ static int __init idle_setup (char *str) __setup("idle=", idle_setup); -void show_regs(struct pt_regs * regs) +void stack_overflow(unsigned long esp, unsigned long eip) +{ + int panicing = ((esp&(THREAD_SIZE-1)) <= STACK_PANIC); + + printk( "esp: 0x%lx masked: 0x%lx STACK_PANIC:0x%lx %d %d\n", + esp, (esp&(THREAD_SIZE-1)), STACK_PANIC, (((esp&(THREAD_SIZE-1)) <= STACK_PANIC)), panicing ); + + if (panicing) + print_symbol("stack overflow from %s\n", eip); + else + print_symbol("excessive stack use from %s\n", eip); + printk("esp: %p\n", (void*)esp); + show_trace(NULL, (void*)esp); + + if (panicing) + panic("stack overflow\n"); +} + +asmlinkage void show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; @@ -501,7 +519,7 @@ struct task_struct * __switch_to(struct struct tss_struct *tss = init_tss + cpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - + next_p->thread_info->irq_stack = prev_p->thread_info->irq_stack; __unlazy_fpu(prev_p); /* diff -prauN linux-2.6.0-test7/arch/i386/kernel/smp.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/smp.c --- linux-2.6.0-test7/arch/i386/kernel/smp.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/smp.c 2003-10-09 19:36:58.000000000 -0700 @@ -308,7 +308,8 @@ static inline void leave_mm (unsigned lo * 2) Leave the mm if we are in the lazy tlb mode. */ -asmlinkage void smp_invalidate_interrupt (void) +struct pt_regs * IRQHANDLER(smp_invalidate_interrupt(struct pt_regs *regs)); +struct pt_regs * smp_invalidate_interrupt(struct pt_regs *regs) { unsigned long cpu; @@ -340,6 +341,7 @@ asmlinkage void smp_invalidate_interrupt smp_mb__after_clear_bit(); out: put_cpu_no_resched(); + return regs; } static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, @@ -576,12 +578,15 @@ void smp_send_stop(void) * all the work is done automatically when * we return from the interrupt. 
*/ -asmlinkage void smp_reschedule_interrupt(void) +struct pt_regs *IRQHANDLER(smp_reschedule_interrupt(struct pt_regs *)); +struct pt_regs *smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + return regs; } -asmlinkage void smp_call_function_interrupt(void) +struct pt_regs *IRQHANDLER(smp_call_function_interrupt(struct pt_regs *)); +struct pt_regs *smp_call_function_interrupt(struct pt_regs *regs) { void (*func) (void *info) = call_data->func; void *info = call_data->info; @@ -605,5 +610,6 @@ asmlinkage void smp_call_function_interr mb(); atomic_inc(&call_data->finished); } + return regs; } diff -prauN linux-2.6.0-test7/arch/i386/kernel/smpboot.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/smpboot.c --- linux-2.6.0-test7/arch/i386/kernel/smpboot.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/smpboot.c 2003-10-09 19:49:59.000000000 -0700 @@ -71,6 +71,11 @@ static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; +/* Per CPU interrupt stacks */ +extern union thread_union init_irq_union; +union thread_union *irq_stacks[NR_CPUS] __cacheline_aligned = + { &init_irq_union, }; + /* Set when the idlers are all forked */ int smp_threads_ready; @@ -770,6 +775,24 @@ wakeup_secondary_cpu(int phys_apicid, un } #endif /* WAKE_SECONDARY_VIA_INIT */ +static void __init setup_irq_stack(task_t *task, int cpu) +{ + unsigned long stack; + + stack = __get_free_pages(GFP_KERNEL, THREAD_ORDER); + if (!stack) + panic("Cannot allocate irq stack\n"); + irq_stacks[cpu] = (void *)stack; + memset(irq_stacks[cpu], 0, THREAD_SIZE); + irq_stacks[cpu]->thread_info.cpu = cpu; + irq_stacks[cpu]->thread_info.preempt_count = 1; + task->thread_info->irq_stack = &irq_stacks[cpu]->thread_info; + /* + * If we want to make the irq stack more than one unit + * deep, we can chain them off the irq_stack pointer here.
+ */ +} + extern cpumask_t cpu_initialized; static int __init do_boot_cpu(int apicid) @@ -793,6 +816,7 @@ static int __init do_boot_cpu(int apicid idle = fork_by_hand(); if (IS_ERR(idle)) panic("failed fork for CPU %d", cpu); + setup_irq_stack(idle, cpu); wake_up_forked_process(idle); /* diff -prauN linux-2.6.0-test7/arch/i386/kernel/vm86.c wli-2.6.0-test7-bk1-29/arch/i386/kernel/vm86.c --- linux-2.6.0-test7/arch/i386/kernel/vm86.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/kernel/vm86.c 2003-10-09 19:34:30.000000000 -0700 @@ -127,16 +127,17 @@ struct pt_regs * save_v86_state(struct k return ret; } -static void mark_screen_rdonly(struct task_struct * tsk) +static void mark_screen_rdonly(task_t *task) { + struct mm_struct *mm = task->mm; pgd_t *pgd; pmd_t *pmd; pte_t *pte, *mapped; int i; preempt_disable(); - spin_lock(&tsk->mm->page_table_lock); - pgd = pgd_offset(tsk->mm, 0xA0000); + spin_lock(&mm->page_table_lock); + pgd = pgd_offset(mm, 0xA0000); if (pgd_none(*pgd)) goto out; if (pgd_bad(*pgd)) { @@ -144,23 +145,26 @@ static void mark_screen_rdonly(struct ta pgd_clear(pgd); goto out; } - pmd = pmd_offset(pgd, 0xA0000); - if (pmd_none(*pmd)) + pmd = pmd_offset_map(pgd, 0xA0000); + if (pmd_none(*pmd)) { + pmd_unmap(pmd); goto out; - if (pmd_bad(*pmd)) { + } else if (pmd_bad(*pmd)) { pmd_ERROR(*pmd); pmd_clear(pmd); + pmd_unmap(pmd); goto out; } pte = mapped = pte_offset_map(pmd, 0xA0000); for (i = 0; i < 32; i++) { if (pte_present(*pte)) - set_pte(pte, pte_wrprotect(*pte)); + vm_ptep_set_wrprotect(mm, pte); pte++; } pte_unmap(mapped); + pmd_unmap(pmd); out: - spin_unlock(&tsk->mm->page_table_lock); + spin_unlock(&mm->page_table_lock); preempt_enable(); flush_tlb(); } diff -prauN linux-2.6.0-test7/arch/i386/mm/discontig.c wli-2.6.0-test7-bk1-29/arch/i386/mm/discontig.c --- linux-2.6.0-test7/arch/i386/mm/discontig.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/discontig.c 2003-10-09 19:49:59.000000000 -0700 @@ -72,8 +72,6 @@ extern unsigned long max_low_pfn; extern unsigned long totalram_pages; extern unsigned long totalhigh_pages; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - unsigned long node_remap_start_pfn[MAX_NUMNODES]; unsigned long node_remap_size[MAX_NUMNODES]; unsigned long node_remap_offset[MAX_NUMNODES]; @@ -128,6 +126,48 @@ static void __init find_max_pfn_node(int BUG(); } +extern char __per_cpu_start[], __per_cpu_end[]; +unsigned long __per_cpu_offset[NR_CPUS]; + +#define PER_CPU_PAGES PFN_UP((unsigned long)(__per_cpu_end-__per_cpu_start)) +#define MEM_MAP_SIZE(n) PFN_UP((node_end_pfn[n]-node_start_pfn[n]+1)*sizeof(struct page)) + +static void __init allocate_per_cpu_pages(int cpu) +{ + int cpu_in_node, node = cpu_to_node(cpu); + unsigned long vaddr; + cpumask_t nodemask = node_to_cpumask(node); + + if (!PER_CPU_PAGES || node >= numnodes) + return; + + if (!node) { + vaddr = (unsigned long)alloc_bootmem(PER_CPU_PAGES*PAGE_SIZE); + __per_cpu_offset[cpu] = vaddr - (unsigned long)__per_cpu_start; + } else { + int k; + vaddr = (unsigned long)node_remap_start_vaddr[node]; + for (k = 0, cpu_in_node = 0; k < cpu; ++k) + if (cpu_isset(k, nodemask)) + ++cpu_in_node; + __per_cpu_offset[cpu] = vaddr + PAGE_SIZE*MEM_MAP_SIZE(node) + + PAGE_SIZE*PFN_UP(sizeof(pg_data_t)) + + PAGE_SIZE*PER_CPU_PAGES*cpu_in_node + - (unsigned long)__per_cpu_start; + } + memcpy(RELOC_HIDE((char *)__per_cpu_start, __per_cpu_offset[cpu]), + __per_cpu_start, + PER_CPU_PAGES*PAGE_SIZE); +} + +void __init setup_per_cpu_areas(void) +{ + int cpu; 
+ for (cpu = 0; cpu < NR_CPUS; ++cpu) + allocate_per_cpu_pages(cpu); +} + + /* * Allocate memory for the pg_data_t via a crude pre-bootmem method * We ought to relocate these onto their own node later on during boot. @@ -205,13 +245,11 @@ static unsigned long calculate_numa_rema unsigned long size, reserve_pages = 0; for (nid = 1; nid < numnodes; nid++) { - /* calculate the size of the mem_map needed in bytes */ - size = (node_end_pfn[nid] - node_start_pfn[nid] + 1) - * sizeof(struct page) + sizeof(pg_data_t); - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; + /* calculate the size of the mem_map needed in pages */ + size = MEM_MAP_SIZE(nid) + PFN_UP(sizeof(pg_data_t)) + + PER_CPU_PAGES*MAX_NODE_CPUS; + /* round up to nearest pmd boundary */ + size = (size + PTRS_PER_PTE - 1) & ~(PTRS_PER_PTE - 1); printk("Reserving %ld pages of KVA for lmem_map of node %d\n", size, nid); node_remap_size[nid] = size; diff -prauN linux-2.6.0-test7/arch/i386/mm/fault.c wli-2.6.0-test7-bk1-29/arch/i386/mm/fault.c --- linux-2.6.0-test7/arch/i386/mm/fault.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/fault.c 2003-10-09 19:28:46.000000000 -0700 @@ -247,6 +247,13 @@ no_context: printk(" printing eip:\n"); printk("%08lx\n", regs->eip); asm("movl %%cr3,%0":"=r" (page)); +#ifdef CONFIG_HIGHPMD /* Oh boy. Error reporting is going to blow major goats. */ + printk(KERN_ALERT "%%cr3 = 0x%lx\n", page); + /* Mask off flag bits. It should end up 32B-aligned. */ + page &= ~(PTRS_PER_PGD*sizeof(pgd_t) - 1); + printk(KERN_ALERT "*pdpte = 0x%Lx\n", + pgd_val(((pgd_t *)__va(page))[address >> PGDIR_SHIFT])); +#else /* !CONFIG_HIGHPMD */ page = ((unsigned long *) __va(page))[address >> 22]; printk(KERN_ALERT "*pde = %08lx\n", page); /* @@ -262,7 +269,8 @@ no_context: page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; printk(KERN_ALERT "*pte = %08lx\n", page); } -#endif +#endif /* !CONFIG_HIGHPTE */ +#endif /* CONFIG_HIGHPMD */ die("Oops", regs, error_code); bust_spinlocks(0); do_exit(SIGKILL); @@ -330,8 +338,8 @@ vmalloc_fault: * and redundant with the set_pmd() on non-PAE. */ - pmd = pmd_offset(pgd, address); - pmd_k = pmd_offset(pgd_k, address); + pmd = pmd_offset_kernel(pgd, address); + pmd_k = pmd_offset_kernel(pgd_k, address); if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); diff -prauN linux-2.6.0-test7/arch/i386/mm/highmem.c wli-2.6.0-test7-bk1-29/arch/i386/mm/highmem.c --- linux-2.6.0-test7/arch/i386/mm/highmem.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/highmem.c 2003-10-09 19:35:27.000000000 -0700 @@ -1,22 +1,5 @@ #include -void *kmap(struct page *page) -{ - might_sleep(); - if (page < highmem_start_page) - return page_address(page); - return kmap_high(page); -} - -void kunmap(struct page *page) -{ - if (in_interrupt()) - BUG(); - if (page < highmem_start_page) - return; - kunmap_high(page); -} - /* * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because * no global lock is needed and because the kmap code must perform a global TLB @@ -25,40 +8,39 @@ void kunmap(struct page *page) * However when holding an atomic kmap it is not legal to sleep, so atomic * kmaps are appropriate for short, tight code paths only.
*/ -void *kmap_atomic(struct page *page, enum km_type type) +void *__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr) { enum fixed_addresses idx; - unsigned long vaddr; + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + pte_t old_pte, pte, *kpte; - inc_preempt_count(); - if (page < highmem_start_page) - return page_address(page); - - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + idx = type + offset; + vaddr -= PAGE_SIZE*offset; + kpte = kmap_pte - idx; + old_pte = *kpte; #ifdef CONFIG_DEBUG_HIGHMEM - if (!pte_none(*(kmap_pte-idx))) - BUG(); + BUG_ON(!pte_none(old_pte)); #endif - set_pte(kmap_pte-idx, mk_pte(page, kmap_prot)); - __flush_tlb_one(vaddr); - - return (void*) vaddr; + pte = mk_pte(page, kmap_prot); + if (!pte_same(old_pte, pte)) { + set_pte(kpte, pte); + if (!pte_none(old_pte)) + __flush_tlb_one(vaddr); + } + return (void *)vaddr; } -void kunmap_atomic(void *kvaddr, enum km_type type) -{ #ifdef CONFIG_DEBUG_HIGHMEM - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); +void __kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr) +{ + unsigned long offset = KM_TYPE_NR*smp_processor_id(); + unsigned long uvaddr = (unsigned long) kvaddr & PAGE_MASK; + enum fixed_addresses idx; - if (vaddr < FIXADDR_START) { // FIXME - dec_preempt_count(); - return; - } + idx = type + offset; + vaddr -= PAGE_SIZE*offset; - if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) - BUG(); + BUG_ON(uvaddr != vaddr); /* * force other mappings to Oops if they'll try to access @@ -66,21 +48,5 @@ void kunmap_atomic(void *kvaddr, enum km */ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); -#endif - - dec_preempt_count(); } - -struct page *kmap_atomic_to_page(void *ptr) -{ - unsigned long idx, vaddr = (unsigned long)ptr; - pte_t *pte; - - if (vaddr < FIXADDR_START) - return virt_to_page(ptr); - - idx = virt_to_fix(vaddr); - pte = kmap_pte - (idx - FIX_KMAP_BEGIN); - return pte_page(*pte); -} - +#endif diff -prauN linux-2.6.0-test7/arch/i386/mm/hugetlbpage.c wli-2.6.0-test7-bk1-29/arch/i386/mm/hugetlbpage.c --- linux-2.6.0-test7/arch/i386/mm/hugetlbpage.c 2003-10-08 12:24:45.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/hugetlbpage.c 2003-10-09 19:42:26.000000000 -0700 @@ -87,8 +87,8 @@ static pte_t *huge_pte_alloc(struct mm_s pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_alloc(mm, pgd, addr); - return (pte_t *) pmd; + pmd = pmd_alloc_map(mm, pgd, addr); + return (pte_t *)pmd; } static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) @@ -97,11 +97,13 @@ static pte_t *huge_pte_offset(struct mm_ pmd_t *pmd = NULL; pgd = pgd_offset(mm, addr); - pmd = pmd_offset(pgd, addr); - return (pte_t *) pmd; + pmd = pmd_offset_map_nested(pgd, addr); + return (pte_t *)pmd; } -static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page, pte_t * page_table, int write_access) +static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, pte_t * page_table, + unsigned long addr, int write_access) { pte_t entry; @@ -114,6 +116,7 @@ static void set_huge_pte(struct mm_struc entry = pte_mkyoung(entry); mk_pte_huge(entry); set_pte(page_table, entry); + vm_account_huge_inc(vma, *page_table, addr); } /* @@ -145,6 +148,8 @@ int copy_hugetlb_page_range(struct mm_st ptepage = pte_page(entry); get_page(ptepage); set_pte(dst_pte, entry); + pmd_unmap(dst_pte); + 
pmd_unmap_nested(src_pte); dst->rss += (HPAGE_SIZE / PAGE_SIZE); addr += HPAGE_SIZE; } @@ -182,6 +187,7 @@ follow_hugetlb_page(struct mm_struct *mm get_page(page); pages[i] = page; + pmd_unmap_nested(pte); } if (vmas) @@ -271,6 +277,7 @@ follow_huge_pmd(struct mm_struct *mm, un page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); get_page(page); } + pmd_unmap(pmd); return page; } #endif @@ -278,7 +285,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); @@ -314,6 +321,8 @@ void unmap_hugepage_range(struct vm_area page = pte_page(*pte); huge_page_release(page); pte_clear(pte); + vm_account_huge_dec(vma, *pte, address); + pmd_unmap_nested(pte); } mm->rss -= (end - start) >> PAGE_SHIFT; flush_tlb_range(vma, start, end); @@ -348,8 +357,10 @@ int hugetlb_prefault(struct address_spac ret = -ENOMEM; goto out; } - if (!pte_none(*pte)) + if (!pte_none(*pte)) { + pmd_unmap(pte); continue; + } idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); @@ -358,12 +369,14 @@ int hugetlb_prefault(struct address_spac /* charge the fs quota first */ if (hugetlb_get_quota(mapping)) { ret = -ENOMEM; + pmd_unmap(pte); goto out; } page = alloc_hugetlb_page(); if (!page) { hugetlb_put_quota(mapping); ret = -ENOMEM; + pmd_unmap(pte); goto out; } ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC); @@ -371,10 +384,12 @@ int hugetlb_prefault(struct address_spac if (ret) { hugetlb_put_quota(mapping); free_huge_page(page); + pmd_unmap(pte); goto out; } } - set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE); + set_huge_pte(mm, vma, page, pte, addr, vma->vm_flags & VM_WRITE); + pmd_unmap(pte); } out: spin_unlock(&mm->page_table_lock); diff -prauN linux-2.6.0-test7/arch/i386/mm/init.c wli-2.6.0-test7-bk1-29/arch/i386/mm/init.c --- linux-2.6.0-test7/arch/i386/mm/init.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/init.c 2003-10-09 19:30:59.000000000 -0700 @@ -57,10 +57,10 @@ static pmd_t * __init one_md_table_init( #ifdef CONFIG_X86_PAE pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - if (pmd_table != pmd_offset(pgd, 0)) + if (pmd_table != pmd_offset_kernel(pgd, 0)) BUG(); #else - pmd_table = pmd_offset(pgd, 0); + pmd_table = pmd_offset_kernel(pgd, 0); #endif return pmd_table; @@ -111,7 +111,7 @@ static void __init page_table_range_init if (pgd_none(*pgd)) one_md_table_init(pgd); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { if (pmd_none(*pmd)) one_page_table_init(pmd); @@ -195,7 +195,7 @@ EXPORT_SYMBOL(kmap_prot); EXPORT_SYMBOL(kmap_pte); #define kmap_get_fixmap_pte(vaddr) \ - pte_offset_kernel(pmd_offset(pgd_offset_k(vaddr), (vaddr)), (vaddr)) + pte_offset_kernel(pmd_offset_kernel(pgd_offset_k(vaddr), (vaddr)), (vaddr)) void __init kmap_init(void) { @@ -219,7 +219,7 @@ void __init permanent_kmaps_init(pgd_t * page_table_range_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, pgd_base); pgd = swapper_pg_dir + pgd_index(vaddr); - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } @@ -466,7 +466,7 @@ void __init mem_init(void) /* this will put all low memory onto the freelists */ totalram_pages += __free_all_bootmem(); - + tlb_init(); reservedpages = 0; for (tmp = 0; tmp < 
max_low_pfn; tmp++) /* @@ -514,20 +514,9 @@ void __init mem_init(void) } kmem_cache_t *pgd_cache; -kmem_cache_t *pmd_cache; void __init pgtable_cache_init(void) { - if (PTRS_PER_PMD > 1) { - pmd_cache = kmem_cache_create("pmd", - PTRS_PER_PMD*sizeof(pmd_t), - 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, - pmd_ctor, - NULL); - if (!pmd_cache) - panic("pgtable_cache_init(): cannot create pmd cache"); - } pgd_cache = kmem_cache_create("pgd", PTRS_PER_PGD*sizeof(pgd_t), 0, diff -prauN linux-2.6.0-test7/arch/i386/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/i386/mm/ioremap.c --- linux-2.6.0-test7/arch/i386/mm/ioremap.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/i386/mm/pageattr.c wli-2.6.0-test7-bk1-29/arch/i386/mm/pageattr.c --- linux-2.6.0-test7/arch/i386/mm/pageattr.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/pageattr.c 2003-10-09 19:28:46.000000000 -0700 @@ -23,7 +23,7 @@ static inline pte_t *lookup_address(unsi pmd_t *pmd; if (pgd_none(*pgd)) return NULL; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); if (pmd_none(*pmd)) return NULL; if (pmd_large(*pmd)) @@ -79,7 +79,7 @@ static void set_pmd_pte(pte_t *kpte, uns pgd_t *pgd; pmd_t *pmd; pgd = (pgd_t *)page_address(page) + pgd_index(address); - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_kernel(pgd, address); set_pte_atomic((pte_t *)pmd, pte); } spin_unlock_irqrestore(&pgd_lock, flags); @@ -92,7 +92,7 @@ static void set_pmd_pte(pte_t *kpte, uns static inline void revert_page(struct page *kpte_page, unsigned long address) { pte_t *linear = (pte_t *) - pmd_offset(pgd_offset(&init_mm, address), address); + pmd_offset_kernel(pgd_offset_k(address), address); set_pmd_pte(linear, address, pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); diff -prauN linux-2.6.0-test7/arch/i386/mm/pgtable.c wli-2.6.0-test7-bk1-29/arch/i386/mm/pgtable.c --- linux-2.6.0-test7/arch/i386/mm/pgtable.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/i386/mm/pgtable.c 2003-10-09 19:57:39.000000000 -0700 @@ -70,7 +70,7 @@ static void set_pte_pfn(unsigned long va BUG(); return; } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); if (pmd_none(*pmd)) { BUG(); return; @@ -110,7 +110,7 @@ void set_pmd_pfn(unsigned long vaddr, un printk ("set_pmd_pfn: pgd_none\n"); return; /* BUG(); */ } - pmd = pmd_offset(pgd, vaddr); + pmd = pmd_offset_kernel(pgd, vaddr); set_pmd(pmd, pfn_pmd(pfn, flags)); /* * It's enough to flush this one mapping. 
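The pmd_offset() to pmd_offset_kernel() conversions in the hunks above all follow one rule: walks of init_mm's pagetables are never highmem-backed, so they can skip the atomic-kmap machinery that user pmd access now requires. A minimal sketch of the resulting kernel-address walk, assuming the usual pgd_offset_k()/pte_offset_kernel() helpers; kvaddr_to_pte() itself is a hypothetical illustration, not a function from this patch:

/*
 * Hypothetical example: resolve a kernel virtual address to its pte
 * using only the _kernel pagetable walkers (no kmap, no pmd_unmap).
 */
static pte_t *kvaddr_to_pte(unsigned long vaddr)
{
	pgd_t *pgd = pgd_offset_k(vaddr);	/* walk init_mm */
	pmd_t *pmd;

	if (pgd_none(*pgd))
		return NULL;
	pmd = pmd_offset_kernel(pgd, vaddr);	/* direct-mapped pmd */
	if (pmd_none(*pmd))
		return NULL;
	return pte_offset_kernel(pmd, vaddr);
}

The drm_follow_page() conversion further down uses exactly this pgd_offset_k()/pmd_offset_kernel()/pte_offset_kernel() sequence.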
@@ -138,23 +138,76 @@ pte_t *pte_alloc_one_kernel(struct mm_st return pte; } -struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +void tlb_init(void) { - struct page *pte; + int cpu; + for (cpu = 0; cpu < NR_CPUS; ++cpu) { + int zone; + struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu); + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + INIT_LIST_HEAD(&tlb->active_list[zone]); + INIT_LIST_HEAD(&tlb->ready_list[zone]); + } + } +} -#ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); -#else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); -#endif - if (pte) - clear_highpage(pte); - return pte; +static inline struct page *pte_alloc_fresh(int gfp_mask) +{ + struct page *page = alloc_page(gfp_mask); + if (page) { + clear_highpage(page); + if (TestSetPagePTE(page)) + BUG(); + } + return page; } -void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags) +static inline int zone_high(struct zone *zone) { - memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); + if (!zone) + return 1; + else + return zone - zone->zone_pgdat->node_zones >= ZONE_HIGHMEM; +} + +static inline struct page *pte_alloc_ready(int gfp_flags) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + unsigned long flags; + struct page *page = NULL; + + smp_local_irq_save(flags); + if (tlb->nr_pte_ready) { + int z; + for (z = MAX_ZONE_ID - 1; z >= 0; --z) { + struct zone *zone = zone_table[z]; + if (!(gfp_flags & __GFP_HIGHMEM) && zone_high(zone)) + continue; + if (!list_empty(&tlb->ready_list[z])) + break; + } + page = list_entry(tlb->ready_list[z].next, struct page, list); + if (TestSetPagePTE(page)) + BUG(); + list_del(&page->list); + tlb->ready_count[z]--; + tlb->nr_pte_ready--; + } + smp_local_irq_restore(flags); + put_cpu(); + return page; +} + +struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) +{ + struct page *page = pte_alloc_ready(GFP_PTE); + return page ? page : pte_alloc_fresh(GFP_PTE); +} + +static inline struct page *__pmd_alloc_one(void) +{ + struct page *page = pte_alloc_ready(GFP_PMD); + return page ? page : pte_alloc_fresh(GFP_PMD); } /* @@ -212,16 +265,21 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; for (i = 0; i < USER_PTRS_PER_PGD; ++i) { - pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); + struct page *pmd = __pmd_alloc_one(); if (!pmd) goto out_oom; - set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd)))); + set_pgd(&pgd[i], __pgd(1ULL | (u64)page_to_pfn(pmd) << PAGE_SHIFT)); } return pgd; + /* + * This looks unusual. pte_free() is actually a convenient wrapper + * for queueing up preconstructed pmd and/or pte pages. The cases + * fall through to just queueing them in the per-cpu lists. 
+ */ out_oom: for (i--; i >= 0; i--) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + pte_free(pgd_page(pgd[i])); kmem_cache_free(pgd_cache, pgd); return NULL; } @@ -233,7 +291,110 @@ void pgd_free(pgd_t *pgd) /* in the PAE case user pgd entries are overwritten before usage */ if (PTRS_PER_PMD > 1) for (i = 0; i < USER_PTRS_PER_PGD; ++i) - kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1)); + pte_free(pgd_page(pgd[i])); /* in the non-PAE case, clear_page_tables() clears user pgd entries */ kmem_cache_free(pgd_cache, pgd); } + +static void shrink_cpu_pagetable_cache(void *__gfp_mask) +{ + int cpu, zone, high, gfp_mask = (int)__gfp_mask; + unsigned long flags; + struct mmu_gather *tlb; + + high = !!(gfp_mask & __GFP_HIGHMEM); + cpu = get_cpu(); + tlb = &per_cpu(mmu_gathers, cpu); + smp_local_irq_save(flags); + + if (tlb->nr_pte_active || tlb->nr_nonpte) + tlb_flush(tlb); + + if (tlb->nr_pte_active) { + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!high && zone_high(zone_table[zone])) + continue; + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + struct page *head; + + if (list_empty(&tlb->ready_list[zone])) + continue; + if (!high && zone_high(zone_table[zone])) + continue; + + head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } + + smp_local_irq_restore(flags); + put_cpu(); +} + +void shrink_pagetable_cache(int gfp_mask) +{ + BUG_ON(irqs_disabled()); + + preempt_disable(); + + /* disables interrupts appropriately internally */ + shrink_cpu_pagetable_cache((void *)gfp_mask); + + smp_call_function(shrink_cpu_pagetable_cache, (void *)gfp_mask, 1, 1); + preempt_enable(); +} + +unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma, *prev; + + len = PAGE_ALIGN(len); + addr = PAGE_ALIGN(addr); + + if (len > TASK_SIZE) + return -ENOMEM; + + if (addr) { + struct vm_area_struct *vma; + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) + goto out; + } + + if (!mm->mmap) { + addr = TASK_SIZE - len; + goto out; + } + + addr = -ENOMEM; + for (prev = NULL, vma = mm->mmap; vma; prev = vma, vma = vma->vm_next) { + unsigned long lo, hi; + lo = prev ?
prev->vm_end : 0; + hi = vma->vm_start; + if (hi - lo >= len && (addr == -ENOMEM || addr < hi - len)) + addr = hi - len; + } + /* we're at the last one; let's try the top */ + if (prev && TASK_SIZE - prev->vm_end >= len) + addr = TASK_SIZE - len; +out: + return addr; +} diff -prauN linux-2.6.0-test7/arch/ia64/ia32/binfmt_elf32.c wli-2.6.0-test7-bk1-29/arch/ia64/ia32/binfmt_elf32.c --- linux-2.6.0-test7/arch/ia64/ia32/binfmt_elf32.c 2003-10-08 12:24:46.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/ia64/ia32/binfmt_elf32.c 2003-10-09 19:34:30.000000000 -0700 @@ -203,7 +203,8 @@ ia32_setup_arg_pages (struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test7/arch/ia64/mm/hugetlbpage.c wli-2.6.0-test7-bk1-29/arch/ia64/mm/hugetlbpage.c --- linux-2.6.0-test7/arch/ia64/mm/hugetlbpage.c 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/ia64/mm/hugetlbpage.c 2003-10-09 19:42:26.000000000 -0700 @@ -60,9 +60,9 @@ huge_pte_alloc (struct mm_struct *mm, un pte_t *pte = NULL; pgd = pgd_offset(mm, taddr); - pmd = pmd_alloc(mm, pgd, taddr); + pmd = pmd_alloc_map(mm, pgd, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, pgd, &pmd, taddr); return pte; } @@ -223,7 +223,7 @@ follow_huge_pmd(struct mm_struct *mm, un void free_huge_page(struct page *page) { BUG_ON(page_count(page)); - BUG_ON(page->mapping); + BUG_ON(page_mapping(page)); INIT_LIST_HEAD(&page->list); diff -prauN linux-2.6.0-test7/arch/ia64/mm/init.c wli-2.6.0-test7-bk1-29/arch/ia64/mm/init.c --- linux-2.6.0-test7/arch/ia64/mm/init.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/ia64/mm/init.c 2003-10-09 19:28:46.000000000 -0700 @@ -232,10 +232,10 @@ put_kernel_page (struct page *page, unsi spin_lock(&init_mm.page_table_lock); { - pmd = pmd_alloc(&init_mm, pgd, address); + pmd = pmd_alloc_kernel(&init_mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(&init_mm, pmd, address); + pte = pte_alloc_map(&init_mm, pgd, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { diff -prauN linux-2.6.0-test7/arch/m68k/kernel/head.S wli-2.6.0-test7-bk1-29/arch/m68k/kernel/head.S --- linux-2.6.0-test7/arch/m68k/kernel/head.S 2003-10-08 12:24:52.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/m68k/kernel/head.S 2003-10-09 19:28:46.000000000 -0700 @@ -110,7 +110,7 @@ * * These routines are used by other mmu routines to get a pointer into * a table, if necessary a new table is allocated. These routines are working - * basically like pmd_alloc() and pte_alloc() in . The root + * basically like pmd_alloc_map() and pte_alloc_map() in . The root * table needs of course only to be allocated once in mmu_get_root_table_entry, * so that here also some mmu specific initialization is done. 
The second page * at the start of the kernel (the first page is unmapped later) is used for diff -prauN linux-2.6.0-test7/arch/m68k/mm/kmap.c wli-2.6.0-test7-bk1-29/arch/m68k/mm/kmap.c --- linux-2.6.0-test7/arch/m68k/mm/kmap.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/m68k/mm/kmap.c 2003-10-09 19:28:46.000000000 -0700 @@ -189,7 +189,7 @@ void *__ioremap(unsigned long physaddr, printk ("\npa=%#lx va=%#lx ", physaddr, virtaddr); #endif pgd_dir = pgd_offset_k(virtaddr); - pmd_dir = pmd_alloc(&init_mm, pgd_dir, virtaddr); + pmd_dir = pmd_alloc_kernel(&init_mm, pgd_dir, virtaddr); if (!pmd_dir) { printk("ioremap: no mem for pmd_dir\n"); return NULL; diff -prauN linux-2.6.0-test7/arch/m68k/sun3x/dvma.c wli-2.6.0-test7-bk1-29/arch/m68k/sun3x/dvma.c --- linux-2.6.0-test7/arch/m68k/sun3x/dvma.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/m68k/sun3x/dvma.c 2003-10-09 19:28:46.000000000 -0700 @@ -102,7 +102,7 @@ inline int dvma_map_cpu(unsigned long ka pmd_t *pmd; unsigned long end2; - if((pmd = pmd_alloc(&init_mm, pgd, vaddr)) == NULL) { + if((pmd = pmd_alloc_kernel(&init_mm, pgd, vaddr)) == NULL) { ret = -ENOMEM; goto out; } diff -prauN linux-2.6.0-test7/arch/mips/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/mips/mm/ioremap.c --- linux-2.6.0-test7/arch/mips/mm/ioremap.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/mips/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -81,7 +81,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/parisc/kernel/cache.c wli-2.6.0-test7-bk1-29/arch/parisc/kernel/cache.c --- linux-2.6.0-test7/arch/parisc/kernel/cache.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/parisc/kernel/cache.c 2003-10-09 19:42:26.000000000 -0700 @@ -68,7 +68,7 @@ update_mmu_cache(struct vm_area_struct * { struct page *page = pte_page(pte); - if (VALID_PAGE(page) && page->mapping && + if (VALID_PAGE(page) && page_mapping(page) && test_bit(PG_dcache_dirty, &page->flags)) { flush_kernel_dcache_page(page_address(page)); @@ -234,15 +234,17 @@ void __flush_dcache_page(struct page *pa flush_kernel_dcache_page(page_address(page)); - if (!page->mapping) + if (!page_mapping(page)) return; /* check shared list first if it's not empty...it's usually * the shortest */ - list_for_each(l, &page->mapping->i_mmap_shared) { + list_for_each_rcu(l, &page->mapping->i_mmap_shared) { struct vm_area_struct *mpnt; unsigned long off; mpnt = list_entry(l, struct vm_area_struct, shared); + if (mpnt->vm_flags & VM_DEAD) + continue; /* * If this VMA is not in our MM, we can ignore it. 
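The parisc hunk above is one of many in this patch that replace direct page->mapping dereferences with the page_mapping() accessor. The flushing logic only needs to know whether the page belongs to an address_space with user mappings; going through an accessor keeps that test working if the field is overloaded elsewhere. A hedged sketch of the caller-side pattern, assuming page_mapping() returns the page's struct address_space or NULL; try_flush_page() is purely illustrative:

/*
 * Hypothetical example: test for user mappings via page_mapping()
 * rather than touching page->mapping directly.
 */
static void try_flush_page(struct page *page)
{
	struct address_space *mapping = page_mapping(page);

	if (!mapping)
		return;	/* no address_space: nothing maps this page */
	if (list_empty(&mapping->i_mmap) &&
	    list_empty(&mapping->i_mmap_shared))
		return;	/* pagecache page, but no user mappings */
	flush_kernel_dcache_page(page_address(page));
}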
diff -prauN linux-2.6.0-test7/arch/parisc/kernel/pci-dma.c wli-2.6.0-test7-bk1-29/arch/parisc/kernel/pci-dma.c --- linux-2.6.0-test7/arch/parisc/kernel/pci-dma.c 2003-10-08 12:24:45.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/parisc/kernel/pci-dma.c 2003-10-09 19:28:46.000000000 -0700 @@ -133,7 +133,7 @@ static inline int map_uncached_pages(uns do { pmd_t *pmd; - pmd = pmd_alloc(NULL, dir, vaddr); + pmd = pmd_alloc_kernel(NULL, dir, vaddr); if (!pmd) return -ENOMEM; if (map_pmd_uncached(pmd, vaddr, end - vaddr, &paddr)) diff -prauN linux-2.6.0-test7/arch/parisc/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/parisc/mm/ioremap.c --- linux-2.6.0-test7/arch/parisc/mm/ioremap.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/parisc/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -77,7 +77,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(dir, address); + pmd = pmd_alloc_kernel(dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/ppc/mm/init.c wli-2.6.0-test7-bk1-29/arch/ppc/mm/init.c --- linux-2.6.0-test7/arch/ppc/mm/init.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/ppc/mm/init.c 2003-10-09 19:42:26.000000000 -0700 @@ -477,14 +477,14 @@ void __init mem_init(void) printk(KERN_INFO "AGP special page: 0x%08lx\n", agp_special_page); #endif - /* Make sure all our pagetable pages have page->mapping + /* Make sure all our pagetable pages have page_mapping(page) and page->index set correctly. */ for (addr = KERNELBASE; addr != 0; addr += PGDIR_SIZE) { struct page *pg; pmd_t *pmd = pmd_offset(pgd_offset_k(addr), addr); if (pmd_present(*pmd)) { pg = pmd_page(*pmd); - pg->mapping = (void *) &init_mm; + set_page_mapping(pg, &init_mm); pg->index = addr; } } diff -prauN linux-2.6.0-test7/arch/ppc64/mm/init.c wli-2.6.0-test7-bk1-29/arch/ppc64/mm/init.c --- linux-2.6.0-test7/arch/ppc64/mm/init.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/ppc64/mm/init.c 2003-10-09 19:28:46.000000000 -0700 @@ -205,7 +205,7 @@ static void map_io_page(unsigned long ea if (mem_init_done) { spin_lock(&ioremap_mm.page_table_lock); pgdp = pgd_offset_i(ea); - pmdp = pmd_alloc(&ioremap_mm, pgdp, ea); + pmdp = pmd_alloc_kernel(&ioremap_mm, pgdp, ea); ptep = pte_alloc_kernel(&ioremap_mm, pmdp, ea); pa = absolute_to_phys(pa); diff -prauN linux-2.6.0-test7/arch/s390/kernel/compat_exec.c wli-2.6.0-test7-bk1-29/arch/s390/kernel/compat_exec.c --- linux-2.6.0-test7/arch/s390/kernel/compat_exec.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/s390/kernel/compat_exec.c 2003-10-09 19:34:30.000000000 -0700 @@ -81,7 +81,8 @@ int setup_arg_pages32(struct linux_binpr struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test7/arch/s390/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/s390/mm/ioremap.c --- linux-2.6.0-test7/arch/s390/mm/ioremap.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/s390/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -83,7 +83,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/arch/sh/mm/ioremap.c 
wli-2.6.0-test7-bk1-29/arch/sh/mm/ioremap.c --- linux-2.6.0-test7/arch/sh/mm/ioremap.c 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sh/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -45,7 +45,7 @@ static inline void remap_area_pte(pte_t } while (address && (address < end)); } -static inline int remap_area_pmd(pmd_t * pmd, unsigned long address, +static inline int remap_area_pmd(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) { unsigned long end; @@ -83,11 +83,11 @@ int remap_area_pages(unsigned long addre spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_map(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; - if (remap_area_pmd(pmd, address, end - address, + if (remap_area_pmd(dir, pmd, address, end - address, phys_addr + address, flags)) break; error = 0; diff -prauN linux-2.6.0-test7/arch/sparc/mm/generic.c wli-2.6.0-test7-bk1-29/arch/sparc/mm/generic.c --- linux-2.6.0-test7/arch/sparc/mm/generic.c 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc/mm/generic.c 2003-10-09 19:28:46.000000000 -0700 @@ -67,7 +67,7 @@ static inline void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -78,7 +78,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -103,11 +103,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test7/arch/sparc/mm/srmmu.c wli-2.6.0-test7-bk1-29/arch/sparc/mm/srmmu.c --- linux-2.6.0-test7/arch/sparc/mm/srmmu.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc/mm/srmmu.c 2003-10-09 19:28:46.000000000 -0700 @@ -2180,7 +2180,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_pfn, srmmu_pte_pfn, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pmd_page, srmmu_pmd_page, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pgd_page, srmmu_pgd_page, BTFIXUPCALL_NORM); BTFIXUPSET_SETHI(none_mask, 0xF0000000); @@ -2212,7 +2212,7 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, srmmu_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, srmmu_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, srmmu_pmd_free, BTFIXUPCALL_NORM); - BTFIXUPSET_CALL(pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); + BTFIXUPSET_CALL(__pmd_alloc_one, srmmu_pmd_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pgd_fast, srmmu_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, srmmu_get_pgd_fast, BTFIXUPCALL_NORM); diff -prauN 
linux-2.6.0-test7/arch/sparc/mm/sun4c.c wli-2.6.0-test7-bk1-29/arch/sparc/mm/sun4c.c --- linux-2.6.0-test7/arch/sparc/mm/sun4c.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc/mm/sun4c.c 2003-10-09 19:28:46.000000000 -0700 @@ -2211,7 +2211,7 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_CALL(pte_alloc_one_kernel, sun4c_pte_alloc_one_kernel, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(pte_alloc_one, sun4c_pte_alloc_one, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(free_pmd_fast, sun4c_free_pmd_fast, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pmd_alloc_one, sun4c_pmd_alloc_one, BTFIXUPCALL_RETO0); BTFIXUPSET_CALL(free_pgd_fast, sun4c_free_pgd_fast, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(get_pgd_fast, sun4c_get_pgd_fast, BTFIXUPCALL_NORM); @@ -2252,5 +2252,5 @@ void __init ld_mmu_sun4c(void) /* These should _never_ get called with two level tables. */ BTFIXUPSET_CALL(pgd_set, sun4c_pgd_set, BTFIXUPCALL_NOP); - BTFIXUPSET_CALL(pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); + BTFIXUPSET_CALL(__pgd_page, sun4c_pgd_page, BTFIXUPCALL_RETO0); } diff -prauN linux-2.6.0-test7/arch/sparc64/kernel/smp.c wli-2.6.0-test7-bk1-29/arch/sparc64/kernel/smp.c --- linux-2.6.0-test7/arch/sparc64/kernel/smp.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc64/kernel/smp.c 2003-10-09 19:42:26.000000000 -0700 @@ -675,9 +675,9 @@ static __inline__ void __local_flush_dca #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -698,7 +698,7 @@ void smp_flush_dcache_page_impl(struct p if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), @@ -730,7 +730,7 @@ void flush_dcache_page_all(struct mm_str goto flush_self; if (tlb_type == spitfire) { data0 = ((u64)&xcall_flush_dcache_page_spitfire); - if (page->mapping != NULL) + if (page_mapping(page) != NULL) data0 |= ((u64)1 << 32); spitfire_xcall_deliver(data0, __pa(page->virtual), diff -prauN linux-2.6.0-test7/arch/sparc64/mm/generic.c wli-2.6.0-test7-bk1-29/arch/sparc64/mm/generic.c --- linux-2.6.0-test7/arch/sparc64/mm/generic.c 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc64/mm/generic.c 2003-10-09 19:28:46.000000000 -0700 @@ -85,7 +85,7 @@ static inline void io_remap_pte_range(pt } while (address < end); } -static inline int io_remap_pmd_range(pmd_t * pmd, unsigned long address, unsigned long size, +static inline int io_remap_pmd_range(pgd_t *pgd, pmd_t * pmd, unsigned long address, unsigned long size, unsigned long offset, pgprot_t prot, int space) { unsigned long end; @@ -96,7 +96,7 @@ static inline int io_remap_pmd_range(pmd end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(current->mm, pmd, address); + pte_t * pte = pte_alloc_map(current->mm, pgd, &pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(pte, address, end - address, address + offset, prot, space); @@ -122,11 +122,11 @@ int io_remap_page_range(struct vm_area_s spin_lock(&mm->page_table_lock); while (from < end) { - pmd_t *pmd = pmd_alloc(current->mm, dir, from); + pmd_t *pmd = pmd_alloc_map(current->mm, dir, from); error = -ENOMEM; if (!pmd) break; - 
error = io_remap_pmd_range(pmd, from, end - from, offset + from, prot, space); + error = io_remap_pmd_range(pgd, pmd, from, end - from, offset + from, prot, space); if (error) break; from = (from + PGDIR_SIZE) & PGDIR_MASK; diff -prauN linux-2.6.0-test7/arch/sparc64/mm/hugetlbpage.c wli-2.6.0-test7-bk1-29/arch/sparc64/mm/hugetlbpage.c --- linux-2.6.0-test7/arch/sparc64/mm/hugetlbpage.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc64/mm/hugetlbpage.c 2003-10-09 19:42:26.000000000 -0700 @@ -74,8 +74,8 @@ static struct page *alloc_hugetlb_page(v static void free_hugetlb_page(struct page *page) { spin_lock(&htlbpage_lock); - if ((page->mapping != NULL) && (page_count(page) == 2)) { - struct inode *inode = page->mapping->host; + if ((page_mapping(page) != NULL) && (page_count(page) == 2)) { + struct inode *inode = page_mapping(page)->host; int i; ClearPageDirty(page); @@ -107,9 +107,11 @@ static pte_t *huge_pte_alloc_map(struct pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_alloc(mm, pgd, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); + if (pmd) { + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); + } } return pte; } @@ -122,9 +124,11 @@ static pte_t *huge_pte_offset_map(struct pgd = pgd_offset(mm, addr); if (pgd) { - pmd = pmd_offset(pgd, addr); - if (pmd) + pmd = pmd_offset_map(pgd, addr); + if (pmd) { pte = pte_offset_map(pmd, addr); + pmd_unmap(pmd); + } } return pte; } diff -prauN linux-2.6.0-test7/arch/sparc64/mm/init.c wli-2.6.0-test7-bk1-29/arch/sparc64/mm/init.c --- linux-2.6.0-test7/arch/sparc64/mm/init.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc64/mm/init.c 2003-10-09 19:42:26.000000000 -0700 @@ -128,9 +128,9 @@ __inline__ void flush_dcache_page_impl(s #if (L1DCACHE_SIZE > PAGE_SIZE) __flush_dcache_page(page->virtual, ((tlb_type == spitfire) && - page->mapping != NULL)); + page_mapping(page) != NULL)); #else - if (page->mapping != NULL && + if (page_mapping(page) != NULL && tlb_type == spitfire) __flush_icache_page(__pa(page->virtual)); #endif @@ -192,7 +192,7 @@ void update_mmu_cache(struct vm_area_str pfn = pte_pfn(pte); if (pfn_valid(pfn) && - (page = pfn_to_page(pfn), page->mapping) && + (page = pfn_to_page(pfn), page_mapping(page)) && ((pg_flags = page->flags) & (1UL << PG_dcache_dirty))) { int cpu = ((pg_flags >> 24) & (NR_CPUS - 1UL)); @@ -216,9 +216,9 @@ void flush_dcache_page(struct page *page int dirty = test_bit(PG_dcache_dirty, &page->flags); int dirty_cpu = dcache_dirty_cpu(page); - if (page->mapping && - list_empty(&page->mapping->i_mmap) && - list_empty(&page->mapping->i_mmap_shared)) { + if (page_mapping(page) && + list_empty(&page_mapping(page)->i_mmap) && + list_empty(&page_mapping(page)->i_mmap_shared)) { if (dirty) { if (dirty_cpu == smp_processor_id()) return; @@ -226,7 +226,7 @@ void flush_dcache_page(struct page *page } set_dcache_dirty(page); } else { - /* We could delay the flush for the !page->mapping + /* We could delay the flush for the !page_mapping(page) * case too. But that case is for exec env/arg * pages and those are %99 certainly going to get * faulted into the tlb (and thus flushed) anyways. 
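The sparc64 hugetlbpage.c hunk above spells out the user-pagetable calling convention used throughout this patch: pmd_alloc_map() hands back a mapped pmd, pte_alloc_map() now takes the pgd plus a pmd_t ** so it can drop and remap the pmd while allocating, and the pmd is unmapped again once the pte has been handed back. A condensed sketch of that discipline, modeled directly on huge_pte_alloc_map() above; get_mapped_pte() is a hypothetical wrapper, not part of the patch:

/*
 * Hypothetical example: allocate and map a user pte under
 * mm->page_table_lock, pairing each map with an unmap.
 */
static pte_t *get_mapped_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd = pmd_alloc_map(mm, pgd, addr);
	pte_t *pte = NULL;

	if (pmd) {
		/* may drop and remap the pmd internally, hence &pmd */
		pte = pte_alloc_map(mm, pgd, &pmd, addr);
		pmd_unmap(pmd);
	}
	return pte;	/* caller does pte_unmap() when finished */
}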
@@ -268,7 +268,7 @@ static inline void flush_cache_pte_range if (!pfn_valid(pfn)) continue; page = pfn_to_page(pfn); - if (PageReserved(page) || !page->mapping) + if (PageReserved(page) || !page_mapping(page)) continue; pgaddr = (unsigned long) page_address(page); uaddr = address + offset; diff -prauN linux-2.6.0-test7/arch/sparc64/mm/ultra.S wli-2.6.0-test7-bk1-29/arch/sparc64/mm/ultra.S --- linux-2.6.0-test7/arch/sparc64/mm/ultra.S 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/sparc64/mm/ultra.S 2003-10-09 19:42:26.000000000 -0700 @@ -615,7 +615,7 @@ xcall_flush_dcache_page_cheetah: /* %g1 .globl xcall_flush_dcache_page_spitfire xcall_flush_dcache_page_spitfire: /* %g1 == physical page address %g7 == kernel page virtual address - %g5 == (page->mapping != NULL) */ + %g5 == (page_mapping(page) != NULL) */ #if (L1DCACHE_SIZE > PAGE_SIZE) srlx %g1, (13 - 2), %g1 ! Form tag comparitor sethi %hi(L1DCACHE_SIZE), %g3 ! D$ size == 16K diff -prauN linux-2.6.0-test7/arch/x86_64/ia32/ia32_binfmt.c wli-2.6.0-test7-bk1-29/arch/x86_64/ia32/ia32_binfmt.c --- linux-2.6.0-test7/arch/x86_64/ia32/ia32_binfmt.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/ia32/ia32_binfmt.c 2003-10-09 19:34:30.000000000 -0700 @@ -371,7 +371,8 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base,PAGE_COPY_EXEC); + put_dirty_page(current, mpnt, page, + stack_base, PAGE_COPY_EXEC); } stack_base += PAGE_SIZE; } diff -prauN linux-2.6.0-test7/arch/x86_64/ia32/syscall32.c wli-2.6.0-test7-bk1-29/arch/x86_64/ia32/syscall32.c --- linux-2.6.0-test7/arch/x86_64/ia32/syscall32.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/ia32/syscall32.c 2003-10-09 19:28:46.000000000 -0700 @@ -29,12 +29,15 @@ char *syscall32_page; and let it be handled by generic VM */ int map_syscall32(struct mm_struct *mm, unsigned long address) { + pgd_t *pgd; + pmd_t *pmd; pte_t *pte; int err = 0; down_read(&mm->mmap_sem); spin_lock(&mm->page_table_lock); - pmd_t *pmd = pmd_alloc(mm, pgd_offset(mm, address), address); - if (pmd && (pte = pte_alloc_map(mm, pmd, address)) != NULL) { + pgd = pgd_offset(mm, address); + pmd = pmd_alloc_map(mm, pgd, address); + if (pmd && (pte = pte_alloc_map(mm, pgd, &pmd, address)) != NULL) { if (pte_none(*pte)) { set_pte(pte, mk_pte(virt_to_page(syscall32_page), diff -prauN linux-2.6.0-test7/arch/x86_64/kernel/acpi/boot.c wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/acpi/boot.c --- linux-2.6.0-test7/arch/x86_64/kernel/acpi/boot.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/acpi/boot.c 2003-10-09 19:27:22.000000000 -0700 @@ -46,10 +46,8 @@ #include #include -extern int acpi_disabled; int acpi_lapic = 0; int acpi_ioapic = 0; -extern int disable_apic; #define PREFIX "ACPI: " @@ -316,7 +314,7 @@ acpi_boot_init (void) { int result = 0; - if (acpi_disabled) + if (acpi_disabled && !acpi_ht) return 1; /* @@ -339,9 +337,10 @@ acpi_boot_init (void) return result; } - extern int disable_apic; - if (disable_apic) + if (disable_apic) { + printk(KERN_INFO PREFIX "Skipping MADT probe because local APIC is disabled\n"); return 0; + } #ifdef CONFIG_X86_LOCAL_APIC @@ -423,7 +422,7 @@ acpi_boot_init (void) /* * if "noapic" boot option, don't look for IO-APICs */ - if (disable_apic) { + if (skip_ioapic_setup) { printk(KERN_INFO PREFIX "Skipping IOAPIC probe " "due to 'noapic' option.\n"); return 1; diff -prauN 
linux-2.6.0-test7/arch/x86_64/kernel/apic.c wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/apic.c --- linux-2.6.0-test7/arch/x86_64/kernel/apic.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/apic.c 2003-10-09 19:27:22.000000000 -0700 @@ -1023,8 +1023,11 @@ static __init int setup_noapictimer(char return 0; } +/* dummy parsing: see setup.c */ + __setup("disableapic", setup_disableapic); __setup("nolapic", setup_nolapic); /* same as disableapic, for compatibility */ + __setup("noapictimer", setup_noapictimer); /* no "lapic" flag - we only use the lapic when the BIOS tells us so. */ diff -prauN linux-2.6.0-test7/arch/x86_64/kernel/cpufreq/Makefile wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/cpufreq/Makefile --- linux-2.6.0-test7/arch/x86_64/kernel/cpufreq/Makefile 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/cpufreq/Makefile 2003-10-09 19:27:22.000000000 -0700 @@ -4,9 +4,4 @@ obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o -$(obj)/powernow-k8.c: $(obj)/powernow-k8.h - @ln -sf ../../../i386/kernel/cpu/cpufreq/powernow-k8.c $(obj)/powernow-k8.c -$(obj)/powernow-k8.h: - @ln -sf ../../../i386/kernel/cpu/cpufreq/powernow-k8.h $(obj)/powernow-k8.h - -clean-files += powernow-k8.c powernow-k8.h +powernow-k8-objs := ../../../i386/kernel/cpu/cpufreq/powernow-k8.o diff -prauN linux-2.6.0-test7/arch/x86_64/kernel/io_apic.c wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/io_apic.c --- linux-2.6.0-test7/arch/x86_64/kernel/io_apic.c 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/io_apic.c 2003-10-09 19:27:22.000000000 -0700 @@ -176,14 +176,79 @@ static void clear_IO_APIC (void) int pirq_entries [MAX_PIRQS]; int pirqs_enabled; int skip_ioapic_setup; +int ioapic_force; -static int __init ioapic_setup(char *str) +/* dummy parsing: see setup.c */ + +static int __init disable_ioapic_setup(char *str) { skip_ioapic_setup = 1; return 1; } -__setup("noapic", ioapic_setup); +static int __init enable_ioapic_setup(char *str) +{ + ioapic_force = 1; + skip_ioapic_setup = 0; + return 1; +} + +__setup("noapic", disable_ioapic_setup); +__setup("apic", enable_ioapic_setup); + +#ifndef CONFIG_SMP +#include +#include +#include + +/* Temporary Hack. Nvidia and VIA boards currently only work with IO-APIC + off. Check for an Nvidia or VIA PCI bridge and turn it off. + Use pci direct infrastructure because this runs before the PCI subsystem. + + Can be overwritten with "apic" */ +void __init check_ioapic(void) +{ + int num,slot,func; + if (ioapic_force) + return; + + /* Poor man's PCI discovery */ + for (num = 0; num < 32; num++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u32 vendor; + class = read_pci_config(num,slot,func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + if ((class >> 16) != PCI_CLASS_BRIDGE_PCI) + continue; + + vendor = read_pci_config(num, slot, func, + PCI_VENDOR_ID); + vendor &= 0xffff; + switch (vendor) { + case PCI_VENDOR_ID_NVIDIA: + case PCI_VENDOR_ID_VIA: + printk(KERN_INFO + "PCI bridge %02x:%02x from %x found. Setting \"noapic\". Overwrite with \"apic\"\n", + num,slot,vendor); + skip_ioapic_setup = 1; + return; + } + + /* No multi-function device? 
*/ + u8 type = read_pci_config_byte(num,slot,func, + PCI_HEADER_TYPE); + if (!(type & 0x80)) + break; + } + } + } +} +#endif static int __init ioapic_pirq_setup(char *str) { diff -prauN linux-2.6.0-test7/arch/x86_64/kernel/setup.c wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/setup.c --- linux-2.6.0-test7/arch/x86_64/kernel/setup.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/kernel/setup.c 2003-10-09 19:27:22.000000000 -0700 @@ -65,6 +65,7 @@ unsigned long mmu_cr4_features; EXPORT_SYMBOL_GPL(mmu_cr4_features); int acpi_disabled = 0; +int acpi_ht = 0; /* For PCI or other memory-mapped resources */ unsigned long pci_mem_start = 0x10000000; @@ -204,9 +205,24 @@ static __init void parse_cmdline_early ( acpi_disabled = 0; } - if (!memcmp(from, "disableapic", 11)) + /* acpi=ht just means: do ACPI MADT parsing + at bootup, but don't enable the full ACPI interpreter */ + if (!memcmp(from, "acpi=ht", 7)) { + acpi_ht = 1; + } + + if (!memcmp(from, "nolapic", 7) || + !memcmp(from, "disableapic", 11)) disable_apic = 1; + if (!memcmp(from, "noapic", 6)) + skip_ioapic_setup = 1; + + if (!memcmp(from, "apic", 4)) { + skip_ioapic_setup = 0; + ioapic_force = 1; + } + if (!memcmp(from, "mem=", 4)) parse_memopt(from+4, &from); @@ -417,6 +433,13 @@ void __init setup_arch(char **cmdline_p) #endif paging_init(); + +#ifndef CONFIG_SMP + /* Temporary hack: disable the IO-APIC for UP Nvidia and VIA boards. + This is until we sort out the ACPI problems. */ + if (!acpi_disabled) + check_ioapic(); +#endif #ifdef CONFIG_ACPI_BOOT /* * Initialize the ACPI boot-time table parser (gets the RSDP and SDT). diff -prauN linux-2.6.0-test7/arch/x86_64/mm/Makefile wli-2.6.0-test7-bk1-29/arch/x86_64/mm/Makefile --- linux-2.6.0-test7/arch/x86_64/mm/Makefile 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/mm/Makefile 2003-10-09 19:27:22.000000000 -0700 @@ -7,7 +7,4 @@ obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpag obj-$(CONFIG_DISCONTIGMEM) += numa.o obj-$(CONFIG_K8_NUMA) += k8topology.o -$(obj)/hugetlbpage.c: - @ln -sf ../../i386/mm/hugetlbpage.c $(obj)/hugetlbpage.c - -clean-files += hugetlbpage.c +hugetlbpage-y = ../../i386/mm/hugetlbpage.o diff -prauN linux-2.6.0-test7/arch/x86_64/mm/ioremap.c wli-2.6.0-test7-bk1-29/arch/x86_64/mm/ioremap.c --- linux-2.6.0-test7/arch/x86_64/mm/ioremap.c 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/arch/x86_64/mm/ioremap.c 2003-10-09 19:28:46.000000000 -0700 @@ -82,7 +82,7 @@ static int remap_area_pages(unsigned lon spin_lock(&init_mm.page_table_lock); do { pmd_t *pmd; - pmd = pmd_alloc(&init_mm, dir, address); + pmd = pmd_alloc_kernel(&init_mm, dir, address); error = -ENOMEM; if (!pmd) break; diff -prauN linux-2.6.0-test7/drivers/acpi/sleep/main.c wli-2.6.0-test7-bk1-29/drivers/acpi/sleep/main.c --- linux-2.6.0-test7/drivers/acpi/sleep/main.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/acpi/sleep/main.c 2003-10-09 19:27:22.000000000 -0700 @@ -20,8 +20,8 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; static struct pm_ops acpi_pm_ops; -extern void do_suspend_lowlevel_s4bios(int); -extern void do_suspend_lowlevel(int); +extern void do_suspend_lowlevel_s4bios(void); +extern void do_suspend_lowlevel(void); static u32 acpi_suspend_states[] = { [PM_SUSPEND_ON] = ACPI_STATE_S0, @@ -95,14 +95,14 @@ static int acpi_pm_enter(u32 state) break; case PM_SUSPEND_MEM: - do_suspend_lowlevel(0); + do_suspend_lowlevel(); break; case PM_SUSPEND_DISK: if (acpi_pm_ops.pm_disk_mode == PM_DISK_PLATFORM) status =
acpi_enter_sleep_state(acpi_state); else - do_suspend_lowlevel_s4bios(0); + do_suspend_lowlevel_s4bios(); break; default: return -EINVAL; diff -prauN linux-2.6.0-test7/drivers/base/bus.c wli-2.6.0-test7-bk1-29/drivers/base/bus.c --- linux-2.6.0-test7/drivers/base/bus.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/base/bus.c 2003-10-09 19:27:22.000000000 -0700 @@ -459,10 +459,6 @@ int bus_add_driver(struct device_driver driver_attach(drv); up_write(&bus->subsys.rwsem); - if (error) { - kobject_unregister(&drv->kobj); - put_bus(bus); - } } return error; } diff -prauN linux-2.6.0-test7/drivers/base/core.c wli-2.6.0-test7-bk1-29/drivers/base/core.c --- linux-2.6.0-test7/drivers/base/core.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/base/core.c 2003-10-09 19:27:22.000000000 -0700 @@ -76,6 +76,8 @@ static struct sysfs_ops dev_sysfs_ops = static void device_release(struct kobject * kobj) { struct device * dev = to_dev(kobj); + struct completion * c = dev->complete; + if (dev->release) dev->release(dev); else { @@ -84,6 +86,8 @@ static void device_release(struct kobjec dev->bus_id); WARN_ON(1); } + if (c) + complete(c); } static struct kobj_type ktype_device = { @@ -349,6 +353,26 @@ void device_unregister(struct device * d put_device(dev); } + +/** + * device_unregister_wait - Unregister device and wait for it to be freed. + * @dev: Device to unregister. + * + * For the cases where the caller needs to wait for all references to + * be dropped from the device before continuing (e.g. modules with + * statically allocated devices), this function uses a completion struct + * to wait, along with a matching complete() in device_release() above. + */ + +void device_unregister_wait(struct device * dev) +{ + struct completion c; + init_completion(&c); + dev->complete = &c; + device_unregister(dev); + wait_for_completion(&c); +} + /** * device_for_each_child - device child iterator. * @dev: parent struct device. @@ -389,6 +413,7 @@ EXPORT_SYMBOL(device_register); EXPORT_SYMBOL(device_del); EXPORT_SYMBOL(device_unregister); +EXPORT_SYMBOL(device_unregister_wait); EXPORT_SYMBOL(get_device); EXPORT_SYMBOL(put_device); diff -prauN linux-2.6.0-test7/drivers/char/Kconfig wli-2.6.0-test7-bk1-29/drivers/char/Kconfig --- linux-2.6.0-test7/drivers/char/Kconfig 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/char/Kconfig 2003-10-09 19:27:22.000000000 -0700 @@ -851,7 +851,7 @@ config APPLICOM config SONYPI tristate "Sony Vaio Programmable I/O Control Device support (EXPERIMENTAL)" - depends on EXPERIMENTAL && X86 && PCI + depends on EXPERIMENTAL && X86 && PCI && !64BIT ---help--- This driver enables access to the Sony Programmable I/O Control Device which can be found in many (all ?) Sony Vaio laptops. 
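The device_unregister_wait() addition above exists for drivers whose struct device is statically allocated: plain device_unregister() can return while other code still holds references, and the release callback would then run after the module text is gone. A short usage sketch under that assumption; the my_dev device, its release stub, and the exit routine are illustrative only, not part of the patch:

/* Hypothetical example: module teardown with a static device. */
static void my_dev_release(struct device *dev)
{
	/* nothing to free: the device itself is static */
}

static struct device my_dev = {
	.bus_id  = "mydev0",
	.release = my_dev_release,
};

static void __exit my_module_exit(void)
{
	/* returns only after the final put_device(), when
	 * device_release() has fired the completion */
	device_unregister_wait(&my_dev);
}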
diff -prauN linux-2.6.0-test7/drivers/char/drm/drm_memory.h wli-2.6.0-test7-bk1-29/drivers/char/drm/drm_memory.h --- linux-2.6.0-test7/drivers/char/drm/drm_memory.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/char/drm/drm_memory.h 2003-10-09 19:28:46.000000000 -0700 @@ -125,7 +125,7 @@ static inline unsigned long drm_follow_page (void *vaddr) { pgd_t *pgd = pgd_offset_k((unsigned long) vaddr); - pmd_t *pmd = pmd_offset(pgd, (unsigned long) vaddr); + pmd_t *pmd = pmd_offset_kernel(pgd, (unsigned long)vaddr); pte_t *ptep = pte_offset_kernel(pmd, (unsigned long) vaddr); return pte_pfn(*ptep) << PAGE_SHIFT; } diff -prauN linux-2.6.0-test7/drivers/char/tty_io.c wli-2.6.0-test7-bk1-29/drivers/char/tty_io.c --- linux-2.6.0-test7/drivers/char/tty_io.c 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/char/tty_io.c 2003-10-09 20:04:57.000000000 -0700 @@ -481,8 +481,7 @@ void do_tty_hangup(void *data) read_lock(&tasklist_lock); if (tty->session > 0) { - struct list_head *l; - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) { if (process_tty(p) == tty) p->signal->tty = NULL; if (!process_session_leader(p)) @@ -560,8 +559,7 @@ EXPORT_SYMBOL(tty_hung_up_p); void disassociate_ctty(int on_exit) { struct tty_struct *tty; - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int tty_pgrp = -1; @@ -591,7 +589,7 @@ void disassociate_ctty(int on_exit) tty->pgrp = -1; read_lock(&tasklist_lock); - for_each_task_pid(process_session(current), PIDTYPE_SID, p, l, pid) + for_each_task_pid(process_session(current), PIDTYPE_SID, p, pid) p->signal->tty = NULL; read_unlock(&tasklist_lock); unlock_kernel(); @@ -1214,15 +1212,14 @@ static void release_dev(struct file * fi * tty. 
*/ if (tty_closing || o_tty_closing) { - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) p->signal->tty = NULL; if (o_tty) - for_each_task_pid(o_tty->session, PIDTYPE_SID, p,l, pid) + for_each_task_pid(o_tty->session, PIDTYPE_SID, p, pid) p->signal->tty = NULL; read_unlock(&tasklist_lock); } @@ -1537,7 +1534,6 @@ static int fionbio(struct file *file, in static int tiocsctty(struct tty_struct *tty, int arg) { - struct list_head *l; struct pid *pid; task_t *p; @@ -1561,7 +1557,7 @@ static int tiocsctty(struct tty_struct * */ read_lock(&tasklist_lock); - for_each_task_pid(tty->session, PIDTYPE_SID, p, l, pid) + for_each_task_pid(tty->session, PIDTYPE_SID, p, pid) p->signal->tty = NULL; read_unlock(&tasklist_lock); } else @@ -1867,8 +1863,7 @@ static void __do_SAK(void *arg) tty_hangup(tty); #else struct tty_struct *tty = arg; - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int session; int i; @@ -1882,7 +1877,7 @@ static void __do_SAK(void *arg) if (tty->driver->flush_buffer) tty->driver->flush_buffer(tty); read_lock(&tasklist_lock); - for_each_task_pid(session, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(session, PIDTYPE_SID, p, pid) { if (process_tty(p) == tty || session > 0) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): process_session(p)==tty->session\n", diff -prauN linux-2.6.0-test7/drivers/i2c/busses/Kconfig wli-2.6.0-test7-bk1-29/drivers/i2c/busses/Kconfig --- linux-2.6.0-test7/drivers/i2c/busses/Kconfig 2003-10-08 12:24:42.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/i2c/busses/Kconfig 2003-10-09 19:27:22.000000000 -0700 @@ -157,7 +157,7 @@ config I2C_PHILIPSPAR config I2C_PIIX4 tristate "Intel PIIX4" - depends on I2C && PCI && EXPERIMENTAL + depends on I2C && PCI && EXPERIMENTAL && !64BIT help If you say yes to this option, support will be included for the Intel PIIX4 family of mainboard I2C interfaces. Specifically, the following diff -prauN linux-2.6.0-test7/drivers/mtd/devices/lart.c wli-2.6.0-test7-bk1-29/drivers/mtd/devices/lart.c --- linux-2.6.0-test7/drivers/mtd/devices/lart.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/mtd/devices/lart.c 2003-10-09 19:27:22.000000000 -0700 @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #ifdef HAVE_PARTITIONS diff -prauN linux-2.6.0-test7/drivers/net/hamradio/Kconfig wli-2.6.0-test7-bk1-29/drivers/net/hamradio/Kconfig --- linux-2.6.0-test7/drivers/net/hamradio/Kconfig 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/net/hamradio/Kconfig 2003-10-09 19:27:22.000000000 -0700 @@ -159,7 +159,7 @@ config BAYCOM_PAR config BAYCOM_EPP tristate "BAYCOM epp driver for AX.25" - depends on PARPORT && AX25 + depends on PARPORT && AX25 && !64BIT ---help--- This is a driver for Baycom style simple amateur radio modems that connect to a parallel interface. 
The driver supports the EPP diff -prauN linux-2.6.0-test7/drivers/pci/hotplug/Kconfig wli-2.6.0-test7-bk1-29/drivers/pci/hotplug/Kconfig --- linux-2.6.0-test7/drivers/pci/hotplug/Kconfig 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/pci/hotplug/Kconfig 2003-10-09 19:27:22.000000000 -0700 @@ -44,7 +44,7 @@ config HOTPLUG_PCI_FAKE config HOTPLUG_PCI_COMPAQ tristate "Compaq PCI Hotplug driver" - depends on HOTPLUG_PCI && X86 + depends on HOTPLUG_PCI && X86 && PCI_BIOS help Say Y here if you have a motherboard with a Compaq PCI Hotplug controller. @@ -66,7 +66,7 @@ config HOTPLUG_PCI_COMPAQ_NVRAM config HOTPLUG_PCI_IBM tristate "IBM PCI Hotplug driver" - depends on HOTPLUG_PCI && X86_IO_APIC && X86 + depends on HOTPLUG_PCI && X86_IO_APIC && X86 && PCI_BIOS help Say Y here if you have a motherboard with a IBM PCI Hotplug controller. diff -prauN linux-2.6.0-test7/drivers/serial/8250_acpi.c wli-2.6.0-test7-bk1-29/drivers/serial/8250_acpi.c --- linux-2.6.0-test7/drivers/serial/8250_acpi.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/drivers/serial/8250_acpi.c 2003-10-09 19:27:22.000000000 -0700 @@ -38,8 +38,11 @@ static acpi_status acpi_serial_mmio(stru static acpi_status acpi_serial_port(struct serial_struct *req, struct acpi_resource_io *io) { - req->port = io->min_base_address; - req->io_type = SERIAL_IO_PORT; + if (io->range_length) { + req->port = io->min_base_address; + req->io_type = SERIAL_IO_PORT; + } else + printk(KERN_ERR "%s: zero-length IO port range?\n", __FUNCTION__); return AE_OK; } diff -prauN linux-2.6.0-test7/fs/Kconfig wli-2.6.0-test7-bk1-29/fs/Kconfig --- linux-2.6.0-test7/fs/Kconfig 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/Kconfig 2003-10-09 19:27:22.000000000 -0700 @@ -1309,6 +1309,30 @@ config NFS_V4 If unsure, say N. +config NFS_DIRECTIO + bool "Allow direct I/O on NFS files (EXPERIMENTAL)" + depends on NFS_FS && EXPERIMENTAL + help + This option enables applications to perform uncached I/O on files + in NFS file systems using the O_DIRECT open() flag. When O_DIRECT + is set for a file, its data is not cached in the system's page + cache. Data is moved to and from user-level application buffers + directly. Unlike local disk-based file systems, NFS O_DIRECT has + no alignment restrictions. + + Unless your program is designed to use O_DIRECT properly, you are + much better off allowing the NFS client to manage data caching for + you. Misusing O_DIRECT can cause poor server performance or network + storms. This kernel build option defaults OFF to avoid exposing + system administrators unwittingly to a potentially hazardous + feature. + + For more details on NFS O_DIRECT, see fs/nfs/direct.c. + + If unsure, say N. This reduces the size of the NFS client, and + causes open() to return EINVAL if a file residing in NFS is + opened with the O_DIRECT flag. 
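(Illustration, not part of the patch: what the NFS_DIRECTIO help text above means for applications. A minimal userspace sketch; the path /mnt/nfs/data is hypothetical. With CONFIG_NFS_DIRECTIO=n the open() below fails with EINVAL; with it enabled, reads bypass the client's page cache and go straight to the server, and unlike local disk filesystems no buffer alignment is required.)

#define _GNU_SOURCE		/* for O_DIRECT */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];		/* no alignment restrictions on NFS */
	ssize_t n;
	/* hypothetical file on an NFS mount */
	int fd = open("/mnt/nfs/data", O_RDONLY | O_DIRECT);

	if (fd < 0) {
		/* EINVAL here when the kernel was built without NFS_DIRECTIO */
		perror("open(O_DIRECT)");
		return 1;
	}
	n = read(fd, buf, sizeof buf);	/* uncached read from the server */
	if (n < 0)
		perror("read");
	close(fd);
	return 0;
}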
+ config NFSD tristate "NFS server support" depends on INET diff -prauN linux-2.6.0-test7/fs/adfs/inode.c wli-2.6.0-test7-bk1-29/fs/adfs/inode.c --- linux-2.6.0-test7/fs/adfs/inode.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/adfs/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -63,7 +63,7 @@ static int adfs_readpage(struct file *fi static int adfs_prepare_write(struct file *file, struct page *page, unsigned int from, unsigned int to) { return cont_prepare_write(page, from, to, adfs_get_block, - &ADFS_I(page->mapping->host)->mmu_private); + &ADFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) diff -prauN linux-2.6.0-test7/fs/affs/file.c wli-2.6.0-test7-bk1-29/fs/affs/file.c --- linux-2.6.0-test7/fs/affs/file.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/affs/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -418,7 +418,7 @@ static int affs_readpage(struct file *fi static int affs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page, from, to, affs_get_block, - &AFFS_I(page->mapping->host)->mmu_private); + &AFFS_I(page_mapping(page)->host)->mmu_private); } static sector_t _affs_bmap(struct address_space *mapping, sector_t block) { @@ -508,7 +508,7 @@ affs_file_write(struct file *file, const static int affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh; char *data; @@ -616,7 +616,7 @@ out: static int affs_readpage_ofs(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 to; int err; @@ -636,7 +636,7 @@ affs_readpage_ofs(struct file *file, str static int affs_prepare_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 size, offset; u32 tmp; int err = 0; @@ -677,7 +677,7 @@ static int affs_prepare_write_ofs(struct static int affs_commit_write_ofs(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct super_block *sb = inode->i_sb; struct buffer_head *bh, *prev_bh; char *data; diff -prauN linux-2.6.0-test7/fs/affs/symlink.c wli-2.6.0-test7-bk1-29/fs/affs/symlink.c --- linux-2.6.0-test7/fs/affs/symlink.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/affs/symlink.c 2003-10-09 19:42:26.000000000 -0700 @@ -20,7 +20,7 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); struct slink_front *lf; int err; diff -prauN linux-2.6.0-test7/fs/afs/file.c wli-2.6.0-test7-bk1-29/fs/afs/file.c --- linux-2.6.0-test7/fs/afs/file.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/afs/file.c 2003-10-09 19:57:39.000000000 -0700 @@ -119,7 +119,7 @@ static int afs_file_readpage(struct file afs_vnode_t *vnode; int ret; - inode = page->mapping->host; + inode = page_mapping(page)->host; _enter("{%lu},{%lu}",inode->i_ino,page->index); @@ -242,7 +242,7 @@ static int afs_file_invalidatepage(struc BUG_ON(!PageLocked(page)); if (PagePrivate(page)) 
{ #ifdef AFS_CACHING_SUPPORT - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(page_mapping(page)->host); cachefs_uncache_page(vnode->cache,page); #endif @@ -256,7 +256,7 @@ static int afs_file_invalidatepage(struc ret = 0; if (!PageWriteback(page)) - ret = page->mapping->a_ops->releasepage(page, 0); + ret = page_mapping(page)->a_ops->releasepage(page, 0); } } @@ -276,7 +276,7 @@ static int afs_file_releasepage(struct p if (PagePrivate(page)) { #ifdef AFS_CACHING_SUPPORT - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(page_mapping(page)->host); cachefs_uncache_page(vnode->cache,page); #endif diff -prauN linux-2.6.0-test7/fs/binfmt_elf.c wli-2.6.0-test7-bk1-29/fs/binfmt_elf.c --- linux-2.6.0-test7/fs/binfmt_elf.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/binfmt_elf.c 2003-10-09 19:53:38.000000000 -0700 @@ -7,6 +7,7 @@ * Tools". * * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com). + * Top-down vma allocation support, William Irwin, IBM, 2003 */ #include @@ -329,8 +330,13 @@ static unsigned long load_elf_interp(str if (retval < 0) goto out_close; +#ifndef CONFIG_MMAP_TOPDOWN eppnt = elf_phdata; for (i=0; i<interp_elf_ex->e_phnum; i++, eppnt++) { +#else + eppnt = &elf_phdata[interp_elf_ex->e_phnum - 1]; + for (i = interp_elf_ex->e_phnum - 1; i >= 0; --i, --eppnt) { +#endif if (eppnt->p_type == PT_LOAD) { int elf_type = MAP_PRIVATE | MAP_DENYWRITE; int elf_prot = 0; @@ -344,7 +350,8 @@ static unsigned long load_elf_interp(str if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) elf_type |= MAP_FIXED; - map_addr = elf_map(interpreter, load_addr + vaddr, eppnt, elf_prot, elf_type); + map_addr = load_addr_set ? load_addr + vaddr : 0; + map_addr = elf_map(interpreter, map_addr, eppnt, elf_prot, elf_type); if (BAD_ADDR(map_addr)) goto out_close; diff -prauN linux-2.6.0-test7/fs/buffer.c wli-2.6.0-test7-bk1-29/fs/buffer.c --- linux-2.6.0-test7/fs/buffer.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/buffer.c 2003-10-09 19:56:01.000000000 -0700 @@ -46,7 +46,7 @@ static void invalidate_bh_lrus(void); /* * Hashed waitqueue_head's for wait_on_buffer() */ -#define BH_WAIT_TABLE_ORDER 7 +#define BH_WAIT_TABLE_ORDER 12 static struct bh_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp bh_wait_queue_heads[1<<BH_WAIT_TABLE_ORDER]; [...] printk(KERN_WARNING "lost page write due to I/O error on %s\n", bdevname(bh->b_bdev, b)); - set_bit(AS_EIO, &page->mapping->flags); + set_bit(AS_EIO, &page_mapping(page)->flags); clear_buffer_uptodate(bh); SetPageError(page); } @@ -790,7 +790,7 @@ void write_boundary_block(struct block_d void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) { struct address_space *mapping = inode->i_mapping; - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); mark_buffer_dirty(bh); if (!mapping->assoc_mapping) { @@ -835,19 +835,10 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * FIXME: may need to call ->reservepage here as well. That's rather up to the * address_space though. - * - * For now, we treat swapper_space specially. It doesn't use the normal - * block a_ops.
*/ -int __set_page_dirty_buffers(struct page *page) +int set_page_dirty_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; - int ret = 0; - - if (mapping == NULL) { - SetPageDirty(page); - goto out; - } + struct address_space * const mapping = page_mapping(page); spin_lock(&mapping->private_lock); if (page_has_buffers(page)) { @@ -865,21 +856,19 @@ int __set_page_dirty_buffers(struct page spin_unlock(&mapping->private_lock); if (!TestSetPageDirty(page)) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } - -out: - return ret; + return 0; } -EXPORT_SYMBOL(__set_page_dirty_buffers); +EXPORT_SYMBOL(set_page_dirty_buffers); /* * Write out and wait upon a list of buffers. @@ -1251,7 +1240,7 @@ __getblk_slow(struct block_device *bdev, * address_space's dirty_pages list and then attach the address_space's * inode to its superblock's dirty inode list. * - * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, + * mark_buffer_dirty() is atomic. It takes page_mapping(bh->b_page)->private_lock, * mapping->page_lock and the global inode_lock. */ void mark_buffer_dirty(struct buffer_head *bh) @@ -1259,7 +1248,7 @@ void mark_buffer_dirty(struct buffer_hea if (!buffer_uptodate(bh)) buffer_error(); if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) - __set_page_dirty_nobuffers(bh->b_page); + set_page_dirty_nobuffers(bh->b_page); } /* @@ -1287,7 +1276,7 @@ void __bforget(struct buffer_head *bh) { clear_buffer_dirty(bh); if (!list_empty(&bh->b_assoc_buffers)) { - struct address_space *buffer_mapping = bh->b_page->mapping; + struct address_space *buffer_mapping = page_mapping(bh->b_page); spin_lock(&buffer_mapping->private_lock); list_del_init(&bh->b_assoc_buffers); @@ -1574,7 +1563,7 @@ static inline void discard_buffer(struct */ int try_to_release_page(struct page *page, int gfp_mask) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); if (!PageLocked(page)) BUG(); @@ -1640,7 +1629,7 @@ EXPORT_SYMBOL(block_invalidatepage); /* * We attach and possibly dirty the buffers atomically wrt - * __set_page_dirty_buffers() via private_lock. try_to_free_buffers + * set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */ void create_empty_buffers(struct page *page, @@ -1657,7 +1646,7 @@ void create_empty_buffers(struct page *p } while (bh); tail->b_this_page = head; - spin_lock(&page->mapping->private_lock); + spin_lock(&page_mapping(page)->private_lock); if (PageUptodate(page) || PageDirty(page)) { bh = head; do { @@ -1669,7 +1658,7 @@ void create_empty_buffers(struct page *p } while (bh != head); } __set_page_buffers(page, head); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page_mapping(page)->private_lock); } EXPORT_SYMBOL(create_empty_buffers); @@ -1753,12 +1742,12 @@ static int __block_write_full_page(struc } /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. 
If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -1809,7 +1798,7 @@ static int __block_write_full_page(struc lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); continue; } } @@ -2062,7 +2051,7 @@ static int __block_commit_write(struct i */ int block_read_full_page(struct page *page, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; sector_t iblock, lblock; struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; unsigned int blocksize; @@ -2202,7 +2191,7 @@ out: int cont_prepare_write(struct page *page, unsigned offset, unsigned to, get_block_t *get_block, loff_t *bytes) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; struct page *new_page; unsigned long pgpos; @@ -2284,7 +2273,7 @@ out: int block_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int err = __block_prepare_write(inode, page, from, to, get_block); if (err) ClearPageUptodate(page); @@ -2293,7 +2282,7 @@ int block_prepare_write(struct page *pag int block_commit_write(struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; __block_commit_write(inode,page,from,to); return 0; } @@ -2301,7 +2290,7 @@ int block_commit_write(struct page *page int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; __block_commit_write(inode,page,from,to); /* @@ -2322,7 +2311,7 @@ int generic_commit_write(struct file *fi int nobh_prepare_write(struct page *page, unsigned from, unsigned to, get_block_t *get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocksize = 1 << blkbits; struct buffer_head map_bh; @@ -2456,7 +2445,7 @@ EXPORT_SYMBOL(nobh_prepare_write); int nobh_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; set_page_dirty(page); @@ -2590,7 +2579,7 @@ out: int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc) { - struct inode * const inode = page->mapping->host; + struct inode * const inode = page_mapping(page)->host; loff_t i_size = i_size_read(inode); const unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; unsigned offset; @@ -2769,9 +2758,9 @@ void sync_dirty_buffer(struct buffer_hea static void check_ttfb_buffer(struct page *page, struct buffer_head *bh) { if (!buffer_uptodate(bh) && !buffer_req(bh)) { - if (PageUptodate(page) && page->mapping + if (PageUptodate(page) && page_mapping(page) && buffer_mapped(bh) /* discard_buffer */ - && S_ISBLK(page->mapping->host->i_mode)) + && 
S_ISBLK(page_mapping(page)->host->i_mode)) { buffer_error(); } @@ -2793,7 +2782,7 @@ static void check_ttfb_buffer(struct pag * * The same applies to regular filesystem pages: if all the buffers are * clean then we set the page clean and proceed. To do that, we require - * total exclusion from __set_page_dirty_buffers(). That is obtained with + * total exclusion from set_page_dirty_buffers(). That is obtained with * private_lock. * * try_to_free_buffers() is non-blocking. @@ -2815,7 +2804,7 @@ drop_buffers(struct page *page, struct b do { check_ttfb_buffer(page, bh); if (buffer_write_io_error(bh)) - set_bit(AS_EIO, &page->mapping->flags); + set_bit(AS_EIO, &page_mapping(page)->flags); if (buffer_busy(bh)) goto failed; if (!buffer_uptodate(bh) && !buffer_req(bh)) @@ -2842,7 +2831,7 @@ failed: int try_to_free_buffers(struct page *page) { - struct address_space * const mapping = page->mapping; + struct address_space * const mapping = page_mapping(page); struct buffer_head *buffers_to_free = NULL; int ret = 0; diff -prauN linux-2.6.0-test7/fs/cifs/file.c wli-2.6.0-test7-bk1-29/fs/cifs/file.c --- linux-2.6.0-test7/fs/cifs/file.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/cifs/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -471,14 +471,14 @@ cifs_write(struct file * file, const cha static int cifs_partialpagewrite(struct page *page,unsigned from, unsigned to) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; char * write_data; int rc = -EFAULT; int bytes_written = 0; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct cifsInodeInfo *cifsInode; struct cifsFileInfo *open_file = NULL; struct list_head *tmp; @@ -597,7 +597,7 @@ cifs_commit_write(struct file *file, str { int xid; int rc = 0; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; struct cifsFileInfo *open_file; struct cifs_sb_info *cifs_sb; @@ -651,7 +651,7 @@ cifs_sync_page(struct page *page) int rc = 0; cFYI(1,("sync page %p",page)); - mapping = page->mapping; + mapping = page_mapping(page); if (!mapping) return 0; inode = mapping->host; diff -prauN linux-2.6.0-test7/fs/coda/symlink.c wli-2.6.0-test7-bk1-29/fs/coda/symlink.c --- linux-2.6.0-test7/fs/coda/symlink.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/coda/symlink.c 2003-10-09 19:42:26.000000000 -0700 @@ -24,7 +24,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; diff -prauN linux-2.6.0-test7/fs/cramfs/inode.c wli-2.6.0-test7-bk1-29/fs/cramfs/inode.c --- linux-2.6.0-test7/fs/cramfs/inode.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/cramfs/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -402,7 +402,7 @@ static struct dentry * cramfs_lookup(str static int cramfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; u32 maxblock, bytes_filled; void *pgdata; diff -prauN linux-2.6.0-test7/fs/dnotify.c wli-2.6.0-test7-bk1-29/fs/dnotify.c --- linux-2.6.0-test7/fs/dnotify.c 2003-10-08 12:24:03.000000000 -0700 +++ 
wli-2.6.0-test7-bk1-29/fs/dnotify.c 2003-10-09 19:27:22.000000000 -0700 @@ -93,7 +93,7 @@ int fcntl_dirnotify(int fd, struct file prev = &odn->dn_next; } - error = f_setown(filp, current->tgid, 1); + error = f_setown(filp, current->pid, 0); if (error) goto out_free; diff -prauN linux-2.6.0-test7/fs/efs/symlink.c wli-2.6.0-test7-bk1-29/fs/efs/symlink.c --- linux-2.6.0-test7/fs/efs/symlink.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/efs/symlink.c 2003-10-09 19:42:26.000000000 -0700 @@ -16,7 +16,7 @@ static int efs_symlink_readpage(struct f { char *link = kmap(page); struct buffer_head * bh; - struct inode * inode = page->mapping->host; + struct inode * inode = page_mapping(page)->host; efs_block_t size = inode->i_size; int err; diff -prauN linux-2.6.0-test7/fs/exec.c wli-2.6.0-test7-bk1-29/fs/exec.c --- linux-2.6.0-test7/fs/exec.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/exec.c 2003-10-09 20:00:01.000000000 -0700 @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -189,6 +189,26 @@ static int count(char __user * __user * return i; } +static inline size_t exec_copy_from_user(struct page *page, + unsigned long offset, + const char __user *buf, + unsigned bytes) +{ + int left; + char *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap_atomic(kaddr, KM_USER0); + + if (left) { + kaddr = kmap(page); + left = __copy_from_user(kaddr + offset, buf, bytes); + kunmap(page); + } + return left; +} + /* * 'copy_strings()' copies argument/environment strings from user * memory to free pages in kernel mem. These are in a format ready @@ -196,8 +216,6 @@ static int count(char __user * __user * */ int copy_strings(int argc,char __user * __user * argv, struct linux_binprm *bprm) { - struct page *kmapped_page = NULL; - char *kaddr = NULL; int ret; while (argc-- > 0) { @@ -224,6 +242,7 @@ int copy_strings(int argc,char __user * int i, new, err; int offset, bytes_to_copy; struct page *page; + char *kaddr = NULL; offset = pos % PAGE_SIZE; i = pos/PAGE_SIZE; @@ -239,22 +258,26 @@ int copy_strings(int argc,char __user * new = 1; } - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } + bytes_to_copy = PAGE_SIZE - offset; + + if ((new && offset) || bytes_to_copy > len) + kaddr = kmap_atomic(page, KM_USER0); + if (new && offset) memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; + if (bytes_to_copy > len) { bytes_to_copy = len; if (new) memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); + + if (kaddr) + kunmap_atomic(kaddr, KM_USER0); + + fault_in_pages_readable(str, bytes_to_copy); + err = exec_copy_from_user(page, offset, str, bytes_to_copy); if (err) { ret = -EFAULT; goto out; @@ -267,8 +290,6 @@ int copy_strings(int argc,char __user * } ret = 0; out: - if (kmapped_page) - kunmap(kmapped_page); return ret; } @@ -292,52 +313,48 @@ EXPORT_SYMBOL(copy_strings_kernel); * This routine is used to map in a page into an address space: needed by * execve() for the initial stack and environment pages. * - * tsk->mmap_sem is held for writing. + * The caller should hold task->mm->mmap_sem for writing. 
*/ -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot) +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot) { - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct pte_chain *pte_chain; + struct mm_struct *mm = task->mm; + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; if (page_count(page) != 1) printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto out_sig; - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); + pgd = pgd_offset(mm, address); + spin_lock(&mm->page_table_lock); + pmd = pmd_alloc_map(mm, pgd, address); if (!pmd) goto out; - pte = pte_alloc_map(tsk->mm, pmd, address); + pte = pte_alloc_map(mm, pgd, &pmd, address); if (!pte) goto out; if (!pte_none(*pte)) { pte_unmap(pte); + pmd_unmap(pmd); goto out; } + mm->rss++; lru_cache_add_active(page); flush_dcache_page(page); - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot))), address); + page_add_rmap(page, vma, address, 1); pte_unmap(pte); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); + pmd_unmap(pmd); + spin_unlock(&mm->page_table_lock); /* no need for flush_tlb */ - pte_chain_free(pte_chain); return; out: - spin_unlock(&tsk->mm->page_table_lock); -out_sig: + spin_unlock(&mm->page_table_lock); __free_page(page); - force_sig(SIGKILL, tsk); - pte_chain_free(pte_chain); + force_sig(SIGKILL, task); return; } @@ -439,7 +456,7 @@ int setup_arg_pages(struct linux_binprm struct page *page = bprm->page[i]; if (page) { bprm->page[i] = NULL; - put_dirty_page(current, page, stack_base, + put_dirty_page(current, mpnt, page, stack_base, mpnt->vm_page_prot); } stack_base += PAGE_SIZE; @@ -700,8 +717,9 @@ static inline int de_thread(struct task_ __ptrace_link(current, parent); } - list_del(¤t->tasks); - list_add_tail(¤t->tasks, &init_task.tasks); + /* is this necessary? only if the tgid changes... 
*/ + remove_task_list(current); + insert_task_list(current); current->exit_signal = SIGCHLD; state = leader->state; @@ -723,7 +741,7 @@ no_thread_group: spin_lock(&newsighand->siglock); if (current == oldsig->curr_target) - oldsig->curr_target = next_thread(current); + oldsig->curr_target = another_thread(current); if (newsig) current->signal = newsig; current->sighand = newsighand; diff -prauN linux-2.6.0-test7/fs/ext2/dir.c wli-2.6.0-test7-bk1-29/fs/ext2/dir.c --- linux-2.6.0-test7/fs/ext2/dir.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/ext2/dir.c 2003-10-09 19:42:26.000000000 -0700 @@ -64,10 +64,10 @@ ext2_last_byte(struct inode *inode, unsi static int ext2_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; int err = 0; dir->i_version++; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -77,7 +77,7 @@ static int ext2_commit_chunk(struct page static void ext2_check_page(struct page *page) { - struct inode *dir = page->mapping->host; + struct inode *dir = page_mapping(page)->host; struct super_block *sb = dir->i_sb; unsigned chunk_size = ext2_chunk_size(dir); char *kaddr = page_address(page); @@ -412,7 +412,7 @@ void ext2_set_link(struct inode *dir, st int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_le32(inode->i_ino); @@ -495,7 +495,7 @@ int ext2_add_link (struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + rec_len; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; if (de->inode) { @@ -528,7 +528,7 @@ out_unlock: */ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char*)dir - kaddr) & ~(ext2_chunk_size(inode)-1); diff -prauN linux-2.6.0-test7/fs/ext3/inode.c wli-2.6.0-test7-bk1-29/fs/ext3/inode.c --- linux-2.6.0-test7/fs/ext3/inode.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/ext3/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -1085,7 +1085,7 @@ static int do_journal_get_write_access(h static int ext3_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret, needed_blocks = ext3_writepage_trans_blocks(inode); handle_t *handle; @@ -1140,7 +1140,7 @@ static int ext3_ordered_commit_write(str unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; ret = walk_page_buffers(handle, page_buffers(page), @@ -1169,7 +1169,7 @@ static int ext3_writeback_commit_write(s unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; loff_t new_i_size; @@ -1187,7 +1187,7 @@ static int ext3_journalled_commit_write( struct 
page *page, unsigned from, unsigned to) { handle_t *handle = ext3_journal_current_handle(); - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int ret = 0, ret2; int partial = 0; loff_t pos; @@ -1342,7 +1342,7 @@ static int journal_dirty_data_fn(handle_ static int ext3_ordered_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bufs; handle_t *handle = NULL; int ret = 0; @@ -1402,7 +1402,7 @@ static int ext3_ordered_writepage(struct return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1410,7 +1410,7 @@ out_fail: static int ext3_writeback_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1431,7 +1431,7 @@ static int ext3_writeback_writepage(stru return ret; out_fail: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return ret; } @@ -1439,7 +1439,7 @@ out_fail: static int ext3_journalled_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; handle_t *handle = NULL; int ret = 0; int err; @@ -1487,7 +1487,7 @@ out: return ret; no_write: - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); out_unlock: unlock_page(page); goto out; @@ -1507,7 +1507,7 @@ ext3_readpages(struct file *file, struct static int ext3_invalidatepage(struct page *page, unsigned long offset) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); /* * If it's a full truncate we just forget about the pending dirtying @@ -1520,7 +1520,7 @@ static int ext3_invalidatepage(struct pa static int ext3_releasepage(struct page *page, int wait) { - journal_t *journal = EXT3_JOURNAL(page->mapping->host); + journal_t *journal = EXT3_JOURNAL(page_mapping(page)->host); WARN_ON(PageChecked(page)); return journal_try_to_free_buffers(journal, page, wait); @@ -1607,7 +1607,7 @@ out: static int ext3_journalled_set_page_dirty(struct page *page) { SetPageChecked(page); - return __set_page_dirty_nobuffers(page); + return set_page_dirty_nobuffers(page); } static struct address_space_operations ext3_ordered_aops = { diff -prauN linux-2.6.0-test7/fs/fat/inode.c wli-2.6.0-test7-bk1-29/fs/fat/inode.c --- linux-2.6.0-test7/fs/fat/inode.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/fat/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -1096,7 +1096,7 @@ fat_prepare_write(struct file *file, str { kmap(page); return cont_prepare_write(page,from,to,fat_get_block, - &MSDOS_I(page->mapping->host)->mmu_private); + &MSDOS_I(page_mapping(page)->host)->mmu_private); } static int diff -prauN linux-2.6.0-test7/fs/fcntl.c wli-2.6.0-test7-bk1-29/fs/fcntl.c --- linux-2.6.0-test7/fs/fcntl.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/fcntl.c 2003-10-09 20:00:01.000000000 -0700 @@ -488,9 +488,8 @@ void send_sigio(struct fown_struct *fown send_sigio_to_task(p, fown, fd, band); } } else { - struct list_head *l; struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + for_each_task_pid(-pid, PIDTYPE_PGID, p, pidptr) { send_sigio_to_task(p, fown, fd, band); } } @@ -525,9 +524,8 @@ int 
send_sigurg(struct fown_struct *fown send_sigurg_to_task(p, fown); } } else { - struct list_head *l; struct pid *pidptr; - for_each_task_pid(-pid, PIDTYPE_PGID, p, l, pidptr) { + for_each_task_pid(-pid, PIDTYPE_PGID, p, pidptr) { send_sigurg_to_task(p, fown); } } diff -prauN linux-2.6.0-test7/fs/freevxfs/vxfs_immed.c wli-2.6.0-test7-bk1-29/fs/freevxfs/vxfs_immed.c --- linux-2.6.0-test7/fs/freevxfs/vxfs_immed.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/freevxfs/vxfs_immed.c 2003-10-09 19:42:26.000000000 -0700 @@ -122,7 +122,7 @@ vxfs_immed_follow_link(struct dentry *dp static int vxfs_immed_readpage(struct file *fp, struct page *pp) { - struct vxfs_inode_info *vip = VXFS_INO(pp->mapping->host); + struct vxfs_inode_info *vip = VXFS_INO(page_mapping(pp)->host); u_int64_t offset = pp->index << PAGE_CACHE_SHIFT; caddr_t kaddr; diff -prauN linux-2.6.0-test7/fs/fs-writeback.c wli-2.6.0-test7-bk1-29/fs/fs-writeback.c --- linux-2.6.0-test7/fs/fs-writeback.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/fs-writeback.c 2003-10-09 19:36:12.000000000 -0700 @@ -152,10 +152,10 @@ __sync_single_inode(struct inode *inode, * read speculatively by this cpu before &= ~I_DIRTY -- mikulas */ - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); if (wait || !wbc->for_kupdate || list_empty(&mapping->io_pages)) list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); spin_unlock(&inode_lock); do_writepages(mapping, wbc); diff -prauN linux-2.6.0-test7/fs/hfs/inode.c wli-2.6.0-test7-bk1-29/fs/hfs/inode.c --- linux-2.6.0-test7/fs/hfs/inode.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/hfs/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -240,7 +240,7 @@ static int hfs_readpage(struct file *fil static int hfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hfs_get_block, - &HFS_I(page->mapping->host)->mmu_private); + &HFS_I(page_mapping(page)->host)->mmu_private); } static sector_t hfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test7/fs/hpfs/file.c wli-2.6.0-test7-bk1-29/fs/hpfs/file.c --- linux-2.6.0-test7/fs/hpfs/file.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/hpfs/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -109,7 +109,7 @@ static int hpfs_readpage(struct file *fi static int hpfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { return cont_prepare_write(page,from,to,hpfs_get_block, - &hpfs_i(page->mapping->host)->mmu_private); + &hpfs_i(page_mapping(page)->host)->mmu_private); } static sector_t _hpfs_bmap(struct address_space *mapping, sector_t block) { diff -prauN linux-2.6.0-test7/fs/hpfs/namei.c wli-2.6.0-test7-bk1-29/fs/hpfs/namei.c --- linux-2.6.0-test7/fs/hpfs/namei.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/hpfs/namei.c 2003-10-09 19:42:26.000000000 -0700 @@ -452,7 +452,7 @@ int hpfs_rmdir(struct inode *dir, struct int hpfs_symlink_readpage(struct file *file, struct page *page) { char *link = kmap(page); - struct inode *i = page->mapping->host; + struct inode *i = page_mapping(page)->host; struct fnode *fnode; struct buffer_head *bh; int err; diff -prauN linux-2.6.0-test7/fs/hugetlbfs/inode.c wli-2.6.0-test7-bk1-29/fs/hugetlbfs/inode.c --- linux-2.6.0-test7/fs/hugetlbfs/inode.c 2003-10-08 12:24:43.000000000 -0700 +++ 
wli-2.6.0-test7-bk1-29/fs/hugetlbfs/inode.c 2003-10-09 19:40:04.000000000 -0700 @@ -269,12 +269,15 @@ hugetlb_vmtruncate_list(struct list_head { struct vm_area_struct *vma; - list_for_each_entry(vma, list, shared) { + list_for_each_entry_rcu(vma, list, shared) { unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; unsigned long v_offset; + if (vma->vm_flags & VM_DEAD) + continue; + h_vm_pgoff = vma->vm_pgoff << (HPAGE_SHIFT - PAGE_SHIFT); v_length = vma->vm_end - vma->vm_start; h_length = v_length >> HPAGE_SHIFT; @@ -319,12 +322,12 @@ static int hugetlb_vmtruncate(struct ino pgoff = offset >> HPAGE_SHIFT; inode->i_size = offset; - down(&mapping->i_shared_sem); + rcu_read_lock(); /* mapping->i_shared_lock */ if (!list_empty(&mapping->i_mmap)) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); if (!list_empty(&mapping->i_mmap_shared)) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); - up(&mapping->i_shared_sem); + rcu_read_unlock(); /* mapping->i_shared_lock */ truncate_hugepages(mapping, offset); return 0; } diff -prauN linux-2.6.0-test7/fs/inode.c wli-2.6.0-test7-bk1-29/fs/inode.c --- linux-2.6.0-test7/fs/inode.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/inode.c 2003-10-09 19:56:01.000000000 -0700 @@ -184,8 +184,8 @@ void inode_init_once(struct inode *inode INIT_LIST_HEAD(&inode->i_devices); sema_init(&inode->i_sem, 1); INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC); - spin_lock_init(&inode->i_data.page_lock); - init_MUTEX(&inode->i_data.i_shared_sem); + mapping_rwlock_init(&inode->i_data.page_lock); + spin_lock_init(&inode->i_data.i_shared_lock); atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); @@ -1260,7 +1260,7 @@ void remove_dquot_ref(struct super_block * Hashed waitqueues for wait_on_inode(). The table is pretty small - the * kernel doesn't lock many inodes at the same time. 
*/ -#define I_WAIT_TABLE_ORDER 3 +#define I_WAIT_TABLE_ORDER 12 static struct i_wait_queue_head { wait_queue_head_t wqh; } ____cacheline_aligned_in_smp i_wait_queue_heads[1<<I_WAIT_TABLE_ORDER]; [...] dd = presto_d2d(file->f_dentry); if (dd && dd->dd_fset) { - int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long ) = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; + int (*cache_ioctl)(struct inode *, struct file *, unsigned int, unsigned long); + cache_ioctl = filter_c2cdfops(dd->dd_fset->fset_cache->cache_filter)->ioctl; rc = -ENOTTY; if (cache_ioctl) rc = cache_ioctl(inode, file, cmd, arg); @@ -904,47 +906,49 @@ int presto_ioctl(struct inode *inode, st return -EPERM; } - memset(buf, 0, sizeof(buf)); - - if (izo_ioctl_getdata(buf, buf + 1024, (void *)arg)) { + /* allocate a zero'd buffer for data */ + PRESTO_ALLOC(buf, bufsz); + if (!buf) { + EXIT; + return -ENOMEM; + } + + if (izo_ioctl_getdata(buf, buf + bufsz, (void *)arg)) { CERROR("intermezzo ioctl: data error\n"); - return -EINVAL; + rc = -EINVAL; + goto done; } data = (struct izo_ioctl_data *)buf; switch(cmd) { case IZO_IOC_REINTKML: { - int rc; int cperr; rc = kml_reint_rec(file, data); - EXIT; cperr = copy_to_user((char *)arg, data, sizeof(*data)); if (cperr) { CERROR("WARNING: cperr %d\n", cperr); rc = -EFAULT; } - return rc; + goto done; } case IZO_IOC_GET_RCVD: { struct izo_rcvd_rec rec; struct presto_file_set *fset; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; - } + rc = -ENODEV; + goto done; + } + rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + goto done; } case IZO_IOC_REPSTATUS: { @@ -953,12 +957,11 @@ int presto_ioctl(struct inode *inode, st struct izo_rcvd_rec rec; struct presto_file_set *fset; int minor; - int rc; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); @@ -967,13 +970,11 @@ int presto_ioctl(struct inode *inode, st rc = izo_repstatus(fset, client_kmlsize, lr_client, &rec); - if (rc < 0) { - EXIT; - return rc; - } + if (rc < 0) + goto done; - EXIT; - return copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + rc = copy_to_user((char *)arg, &rec, sizeof(rec))? -EFAULT : 0; + goto done; } case IZO_IOC_GET_CHANNEL: { @@ -981,30 +982,28 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } data->ioc_dev = fset->fset_cache->cache_psdev->uc_minor; CDEBUG(D_PSDEV, "CHANNEL %d\n", data->ioc_dev); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))?
-EFAULT : 0; + goto done; } case IZO_IOC_SET_IOCTL_UID: izo_authorized_uid = data->ioc_uid; - EXIT; - return 0; + rc = 0; + goto done; case IZO_IOC_SET_PID: rc = izo_psdev_setpid(data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_SET_CHANNEL: rc = izo_psdev_setchannel(file, data->ioc_dev); - EXIT; - return rc; + goto done; case IZO_IOC_GET_KML_SIZE: { struct presto_file_set *fset; @@ -1012,14 +1011,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } kmlsize = presto_kml_offset(fset) + fset->fset_kml_logical_off; - EXIT; - return copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + rc = copy_to_user((char *)arg, &kmlsize, sizeof(kmlsize))?-EFAULT : 0; + goto done; } case IZO_IOC_PURGE_FILE_DATA: { @@ -1027,37 +1026,37 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_purge_file(fset, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_GET_FILEID: { rc = izo_get_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_SET_FILEID: { rc = izo_set_fileid(file, data); - EXIT; if (rc) - return rc; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; + + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } case IZO_IOC_ADJUST_LML: { struct lento_vfs_context *info; info = (struct lento_vfs_context *)data->ioc_inlbuf1; rc = presto_adjust_lml(file, info); - EXIT; - return rc; + goto done; } case IZO_IOC_CONNECT: { @@ -1066,16 +1065,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_connect(minor, data->ioc_ino, data->ioc_generation, data->ioc_uuid, data->ioc_flags); - EXIT; - return rc; + goto done; } case IZO_IOC_GO_FETCH_KML: { @@ -1084,15 +1082,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_go_fetch_kml(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - EXIT; - return rc; + goto done; } case IZO_IOC_REVOKE_PERMIT: @@ -1100,26 +1097,23 @@ int presto_ioctl(struct inode *inode, st rc = izo_revoke_permit(file->f_dentry, data->ioc_uuid); else rc = izo_revoke_permit(file->f_dentry, NULL); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_FSET: rc = izo_clear_fsetroot(file->f_dentry); - EXIT; - return rc; + goto done; case IZO_IOC_CLEAR_ALL_FSETS: { struct presto_file_set *fset; fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } rc = izo_clear_all_fsetroots(fset->fset_cache); - EXIT; - return rc; + goto done; } case IZO_IOC_SET_FSET: @@ -1129,9 +1123,7 @@ int presto_ioctl(struct inode *inode, st rc = presto_set_fsetroot_from_ioc(file->f_dentry, data->ioc_inlbuf1, data->ioc_flags); - EXIT; - return rc; - + goto done; case IZO_IOC_MARK: { int res = 0; /* resulting flags - returned to user */ @@ -1187,16 +1179,16 @@ int presto_ioctl(struct inode *inode, st } if (error) { - EXIT; - return error; + rc = error; + goto done; } data->ioc_mark_what = res; 
CDEBUG(D_DOWNCALL, "mark inode: %ld, and: %x, or: %x, what %x\n", file->f_dentry->d_inode->i_ino, data->ioc_and_flag, data->ioc_or_flag, data->ioc_mark_what); - EXIT; - return copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + rc = copy_to_user((char *)arg, data, sizeof(*data))? -EFAULT : 0; + goto done; } #if 0 case IZO_IOC_CLIENT_MAKE_BRANCH: { @@ -1205,16 +1197,15 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_client_make_branch(minor, fset->fset_name, data->ioc_inlbuf1, data->ioc_inlbuf2); - EXIT; - return rc; + goto done; } #endif case IZO_IOC_SERVER_MAKE_BRANCH: { @@ -1223,14 +1214,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); izo_upc_server_make_branch(minor, data->ioc_inlbuf1); - EXIT; - return 0; + rc = 0; + goto done; } case IZO_IOC_SET_KMLSIZE: { struct presto_file_set *fset; @@ -1239,38 +1230,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_set_kmlsize(minor, fset->fset_name, data->ioc_uuid, data->ioc_kmlsize); - if (rc != 0) { - EXIT; - return rc; - } + if (rc != 0) + goto done; rc = izo_rcvd_get(&rec, fset, data->ioc_uuid); if (rc == -EINVAL) { /* We don't know anything about this uuid yet; no * worries. */ memset(&rec, 0, sizeof(rec)); - } else if (rc <= 0) { + } else if (rc <= 0) { /* do we really want to return 0 if rc == 0 here? */ CERROR("InterMezzo: error reading last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } rec.lr_remote_offset = data->ioc_kmlsize; rc = izo_rcvd_write(fset, &rec); if (rc <= 0) { CERROR("InterMezzo: error writing last_rcvd: %d\n", rc); - EXIT; - return rc; + goto done; } - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_UNDO: { struct presto_file_set *fset; @@ -1278,15 +1264,14 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_undo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case IZO_IOC_BRANCH_REDO: { struct presto_file_set *fset; @@ -1294,28 +1279,33 @@ int presto_ioctl(struct inode *inode, st fset = presto_fset(file->f_dentry); if (fset == NULL) { - EXIT; - return -ENODEV; + rc = -ENODEV; + goto done; } minor = presto_f2m(fset); rc = izo_upc_branch_redo(minor, fset->fset_name, data->ioc_inlbuf1); - EXIT; - return rc; + goto done; } case TCGETS: - EXIT; - return -EINVAL; + rc = -EINVAL; + goto done; default: EXIT; - return -EINVAL; - + rc = -EINVAL; + goto done; + } + + rc = 0; + + done: + PRESTO_FREE(buf, bufsz); EXIT; - return 0; + return rc; } struct file_operations presto_dir_fops = { diff -prauN linux-2.6.0-test7/fs/intermezzo/journal.c wli-2.6.0-test7-bk1-29/fs/intermezzo/journal.c --- linux-2.6.0-test7/fs/intermezzo/journal.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/intermezzo/journal.c 2003-10-09 19:56:47.000000000 -0700 @@ -1235,12 +1235,16 @@ int presto_write_kml_logical_offset(stru return izo_rcvd_write(fset, &rec); } +/* we are called from presto_finish_kml_truncate, which is called */ +/* with fset->fset_kml.fd_lock held. 
Allocations must be GFP_ATOMIC */ struct file * presto_copy_kml_tail(struct presto_file_set *fset, unsigned long int start) { struct file *f; int len; loff_t read_off, write_off, bytes; + char* buf; + size_t bufsz; ENTRY; @@ -1254,21 +1258,31 @@ struct file * presto_copy_kml_tail(struc write_off = 0; read_off = start; bytes = fset->fset_kml.fd_offset - start; - while (bytes > 0) { - char buf[4096]; - int toread; - if (bytes > sizeof(buf)) - toread = sizeof(buf); - else - toread = bytes; + bufsz = bytes; + /* can't use PRESTO_ALLOC - allocation must be atomic */ + buf = kmalloc(bufsz, GFP_ATOMIC); + if (!buf) { + CERROR("IZO: out of memory at %s:%d (trying to " + "allocate %d)\n", __FILE__, __LINE__, + bufsz); + filp_close(f, NULL); + EXIT; + return ERR_PTR(-ENOMEM); + } + + presto_kmem_inc(buf, bufsz); + memset(buf, 0, bufsz); - len = presto_fread(fset->fset_kml.fd_file, buf, toread, + while (bytes > 0) { + len = presto_fread(fset->fset_kml.fd_file, buf, bufsz, &read_off); if (len <= 0) break; if (presto_fwrite(f, buf, len, &write_off) != len) { + kfree(buf); + presto_kmem_dec(buf, bufsz); filp_close(f, NULL); EXIT; return ERR_PTR(-EIO); @@ -1276,7 +1290,9 @@ struct file * presto_copy_kml_tail(struc bytes -= len; } - + + kfree(buf); + presto_kmem_dec(buf, bufsz); EXIT; return f; } @@ -1585,11 +1601,12 @@ int presto_get_fileid(int minor, struct { int opcode = KML_OPCODE_GET_FILEID; struct rec_info rec; - char *buffer, *path, *logrecord, record[4096]; /*include path*/ + char *buffer, *path, *logrecord, *record; /*include path*/ struct dentry *root; __u32 uid, gid, pathlen; int error, size; struct kml_suffix *suffix; + size_t record_size; ENTRY; @@ -1605,9 +1622,13 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)) + sizeof(struct kml_suffix); + record_size = max(4096, size); + error = -ENOMEM; + PRESTO_ALLOC(record, record_size); + if (!record) + goto free_buffer; + CDEBUG(D_FILE, "kml size: %d\n", size); - if ( size > sizeof(record) ) - CERROR("InterMezzo: BUFFER OVERFLOW in %s!\n", __FUNCTION__); memset(&rec, 0, sizeof(rec)); rec.is_kml = 1; @@ -1628,6 +1649,9 @@ int presto_get_fileid(int minor, struct size_round(le32_to_cpu(pathlen)), path, fset->fset_name); + PRESTO_FREE(record, record_size); + + free_buffer: BUFF_FREE(buffer); EXIT; return error; diff -prauN linux-2.6.0-test7/fs/isofs/rock.c wli-2.6.0-test7-bk1-29/fs/isofs/rock.c --- linux-2.6.0-test7/fs/isofs/rock.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/isofs/rock.c 2003-10-09 19:42:26.000000000 -0700 @@ -430,7 +430,7 @@ int parse_rock_ridge_inode(struct iso_di static int rock_ridge_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *link = kmap(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); unsigned char bufbits = ISOFS_BUFFER_BITS(inode); diff -prauN linux-2.6.0-test7/fs/jbd/commit.c wli-2.6.0-test7-bk1-29/fs/jbd/commit.c --- linux-2.6.0-test7/fs/jbd/commit.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jbd/commit.c 2003-10-09 19:42:26.000000000 -0700 @@ -60,7 +60,7 @@ static void release_buffer_page(struct b page = bh->b_page; if (!page) goto nope; - if (page->mapping) + if (page_mapping(page)) goto nope; /* OK, it's a truncated page */ diff -prauN linux-2.6.0-test7/fs/jbd/journal.c wli-2.6.0-test7-bk1-29/fs/jbd/journal.c --- linux-2.6.0-test7/fs/jbd/journal.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jbd/journal.c
2003-10-09 19:42:26.000000000 -0700 @@ -1676,7 +1676,7 @@ repeat: } else { J_ASSERT_BH(bh, (atomic_read(&bh->b_count) > 0) || - (bh->b_page && bh->b_page->mapping)); + (bh->b_page && page_mapping(bh->b_page))); if (!new_jh) { jbd_unlock_bh_journal_head(bh); diff -prauN linux-2.6.0-test7/fs/jffs/inode-v23.c wli-2.6.0-test7-bk1-29/fs/jffs/inode-v23.c --- linux-2.6.0-test7/fs/jffs/inode-v23.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jffs/inode-v23.c 2003-10-09 19:42:26.000000000 -0700 @@ -743,7 +743,7 @@ jffs_do_readpage_nolock(struct file *fil void *buf; unsigned long read_len; int result; - struct inode *inode = (struct inode*)page->mapping->host; + struct inode *inode = (struct inode*)page_mapping(page)->host; struct jffs_file *f = (struct jffs_file *)inode->u.generic_ip; struct jffs_control *c = (struct jffs_control *)inode->i_sb->s_fs_info; int r; diff -prauN linux-2.6.0-test7/fs/jffs2/file.c wli-2.6.0-test7-bk1-29/fs/jffs2/file.c --- linux-2.6.0-test7/fs/jffs2/file.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jffs2/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -266,18 +266,18 @@ int jffs2_do_readpage_unlock(struct inod int jffs2_readpage (struct file *filp, struct page *pg) { - struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(page_mapping(pg)->host); int ret; down(&f->sem); - ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); + ret = jffs2_do_readpage_unlock(page_mapping(pg)->host, pg); up(&f->sem); return ret; } int jffs2_prepare_write (struct file *filp, struct page *pg, unsigned start, unsigned end) { - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); uint32_t pageofs = pg->index << PAGE_CACHE_SHIFT; int ret = 0; @@ -362,7 +362,7 @@ int jffs2_commit_write (struct file *fil /* Actually commit the write from the page cache page we're looking at. * For now, we write the full page out each time. It sucks, but it's simple */ - struct inode *inode = pg->mapping->host; + struct inode *inode = page_mapping(pg)->host; struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_raw_inode *ri; diff -prauN linux-2.6.0-test7/fs/jfs/file.c wli-2.6.0-test7-bk1-29/fs/jfs/file.c --- linux-2.6.0-test7/fs/jfs/file.c 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/file.c 2003-10-09 19:27:22.000000000 -0700 @@ -34,10 +34,12 @@ int jfs_fsync(struct file *file, struct struct inode *inode = dentry->d_inode; int rc = 0; - if (!(inode->i_state & I_DIRTY)) - return rc; - if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + if (!(inode->i_state & I_DIRTY) || + (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); return rc; + } rc |= jfs_commit_inode(inode, 1); diff -prauN linux-2.6.0-test7/fs/jfs/inode.c wli-2.6.0-test7-bk1-29/fs/jfs/inode.c --- linux-2.6.0-test7/fs/jfs/inode.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/inode.c 2003-10-09 19:27:22.000000000 -0700 @@ -107,14 +107,18 @@ int jfs_commit_inode(struct inode *inode void jfs_write_inode(struct inode *inode, int wait) { + if (test_cflag(COMMIT_Nolink, inode)) + return; /* * If COMMIT_DIRTY is not set, the inode isn't really dirty. 
* It has been committed since the last change, but was still - * on the dirty inode list + * on the dirty inode list. */ - if (test_cflag(COMMIT_Nolink, inode) || - !test_cflag(COMMIT_Dirty, inode)) + if (!test_cflag(COMMIT_Dirty, inode)) { + /* Make sure committed changes hit the disk */ + jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); return; + } if (jfs_commit_inode(inode, wait)) { jfs_err("jfs_write_inode: jfs_commit_inode failed!"); diff -prauN linux-2.6.0-test7/fs/jfs/jfs_dmap.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_dmap.c --- linux-2.6.0-test7/fs/jfs/jfs_dmap.c 2003-10-08 12:24:42.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_dmap.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,6 +18,7 @@ #include #include "jfs_incore.h" +#include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_lock.h" @@ -134,7 +135,6 @@ static int dbFreeDmap(struct bmap * bmp, static int dbMaxBud(u8 * cp); s64 dbMapFileSizeToMapSize(struct inode *ipbmap); int blkstol2(s64 nb); -void fsDirty(void); int cntlz(u32 value); int cnttz(u32 word); @@ -382,7 +382,15 @@ int dbFree(struct inode *ip, s64 blkno, IREAD_LOCK(ipbmap); /* block to be freed better be within the mapsize. */ - assert(blkno + nblocks <= bmp->db_mapsize); + if (blkno + nblocks > bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ip->i_sb, + "dbFree: block to be freed is outside the map"); + return -EIO; + } /* * free the blocks a dmap at a time. @@ -465,7 +473,14 @@ dbUpdatePMap(struct inode *ipbmap, int lsn, difft, diffp; /* the blocks better be within the mapsize. */ - assert(blkno + nblocks <= bmp->db_mapsize); + if (blkno + nblocks > bmp->db_mapsize) { + printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(ipbmap->i_sb, + "dbUpdatePMap: blocks are outside the map"); + return -EIO; + } /* compute delta of transaction lsn from log syncpt */ lsn = tblk->lsn; @@ -757,7 +772,10 @@ int dbAlloc(struct inode *ip, s64 hint, mapSize = bmp->db_mapsize; /* the hint should be within the map */ - assert(hint < mapSize); + if (hint >= mapSize) { + jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map"); + return -EIO; + } /* if the number of blocks to be allocated is greater than the * allocation group size, try to allocate anywhere. 
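The dbFree() and dbAlloc() hunks above set the pattern this patch applies throughout JFS: each fatal assert() on on-disk state becomes a recoverable check that reports through the new jfs_error() helper (added to fs/jfs/super.c later in this patch) and unwinds with -EIO, so bitmap corruption downgrades from a guaranteed oops to a policy decision controlled by the errors= mount option. A minimal sketch of the converted shape; check_map_range() is a hypothetical illustration, not a function from the patch:

static int check_map_range(struct inode *ip, s64 blkno, s64 nblocks,
			   s64 mapsize)
{
	/* before: assert(blkno + nblocks <= mapsize); -- panics on corruption */
	if (blkno + nblocks > mapsize) {
		/* after: log it, let jfs_handle_error() apply the errors=
		 * policy (continue/remount-ro/panic), and unwind cleanly */
		jfs_error(ip->i_sb, "check_map_range: range outside the map");
		return -EIO;
	}
	return 0;
}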
@@ -1104,7 +1122,12 @@ int dbExtend(struct inode *ip, s64 blkno /* better be within the file system */ bmp = sbi->bmap; - assert(lastblkno >= 0 && lastblkno < bmp->db_mapsize); + if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + jfs_error(ip->i_sb, + "dbExtend: the block is outside the filesystem"); + return -EIO; + } /* we'll attempt to extend the current allocation in place by * allocating the additional blocks as the blocks immediately @@ -1145,11 +1168,10 @@ int dbExtend(struct inode *ip, s64 blkno DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno, addnblocks); write_metapage(mp); - } else { + } else /* we were not successful */ release_metapage(mp); - assert(rc == -ENOSPC || rc == -EIO); - } + return (rc); } @@ -1414,7 +1436,12 @@ dbAllocAG(struct bmap * bmp, int agno, s /* allocation request should not be for more than the * allocation group size. */ - assert(l2nb <= bmp->db_agl2size); + if (l2nb > bmp->db_agl2size) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: allocation request is larger than the " + "allocation group size"); + return -EIO; + } /* determine the starting block number of the allocation * group. @@ -1441,13 +1468,13 @@ dbAllocAG(struct bmap * bmp, int agno, s if (bmp->db_agsize == BPERDMAP || bmp->db_agfree[agno] == bmp->db_agsize) { rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); - /* assert(!(rc == -ENOSPC && bmp->db_agfree[agno] == bmp->db_agsize)); */ if ((rc == -ENOSPC) && (bmp->db_agfree[agno] == bmp->db_agsize)) { - jfs_err("dbAllocAG: removed assert, but still need to " - "debug here\nblkno = 0x%Lx, nblocks = 0x%Lx", - (unsigned long long) blkno, - (unsigned long long) nblocks); + printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: dbAllocCtl failed in free AG"); } return (rc); } @@ -1496,7 +1523,11 @@ dbAllocAG(struct bmap * bmp, int agno, s break; } } - assert(n < 4); + if (n == 4) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: failed descending stree"); + return -EIO; + } } /* determine the block number within the file system @@ -1531,7 +1562,12 @@ dbAllocAG(struct bmap * bmp, int agno, s if ((rc = dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, &blkno))) { - assert(rc != -ENOSPC); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: control page " + "inconsistent"); + return -EIO; + } return (rc); } } @@ -1539,7 +1575,11 @@ dbAllocAG(struct bmap * bmp, int agno, s /* allocate the blocks. */ rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); - assert(rc != -ENOSPC); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAG: unable to allocate blocks"); + rc = -EIO; + } return (rc); } @@ -1595,7 +1635,11 @@ static int dbAllocAny(struct bmap * bmp, /* allocate the blocks. */ rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); - assert(rc != -ENOSPC); + if (rc == -ENOSPC) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocAny: unable to allocate blocks"); + return -EIO; + } return (rc); } @@ -1666,7 +1710,11 @@ static int dbFindCtl(struct bmap * bmp, /* space found ? */ if (rc) { - assert(lev == level); + if (lev != level) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbFindCtl: dmap inconsistent"); + return -EIO; + } return -ENOSPC; } @@ -1785,7 +1833,13 @@ dbAllocCtl(struct bmap * bmp, s64 nblock /* the dmap better be all free. 
*/ - assert(dp->tree.stree[ROOT] == L2BPERDMAP); + if (dp->tree.stree[ROOT] != L2BPERDMAP) { + release_metapage(mp); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: the dmap is not all free"); + rc = -EIO; + goto backout; + } /* determine how many blocks to allocate from this dmap. */ @@ -1828,8 +1882,8 @@ dbAllocCtl(struct bmap * bmp, s64 nblock /* could not back out. mark the file system * to indicate that we have leaked blocks. */ - fsDirty(); /* !!! */ - jfs_err("dbAllocCtl: I/O Error: Block Leakage."); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: I/O Error: Block Leakage."); continue; } dp = (struct dmap *) mp->data; @@ -1841,8 +1895,8 @@ dbAllocCtl(struct bmap * bmp, s64 nblock * to indicate that we have leaked blocks. */ release_metapage(mp); - fsDirty(); /* !!! */ - jfs_err("dbAllocCtl: Block Leakage."); + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocCtl: Block Leakage."); continue; } @@ -2137,7 +2191,12 @@ static void dbAllocBits(struct bmap * bm * the allocated words. */ for (; nwords > 0; nwords -= nw) { - assert(leaf[word] >= BUDMIN); + if (leaf[word] < BUDMIN) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAllocBits: leaf page " + "corrupt"); + break; + } /* determine what the leaf value should be * updated to as the minimum of the l2 number @@ -2489,7 +2548,11 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, i * of the maximum free buddy system. */ assert(level == bmp->db_maxlevel); - assert(bmp->db_maxfreebud == oldroot); + if (bmp->db_maxfreebud != oldroot) { + jfs_error(bmp->db_ipbmap->i_sb, + "dbAdjCtl: the maximum free buddy is " + "not the old root"); + } bmp->db_maxfreebud = dcp->stree[ROOT]; } } @@ -3040,24 +3103,6 @@ int blkstol2(s64 nb) /* - * NAME: fsDirty() - * - * FUNCTION: xxx - * - * PARAMETERS: - * ipmnt - mount inode - * - * RETURN VALUES: - * none - */ -void fsDirty(void) -{ - printk("fsDirty(): bye-bye\n"); - assert(0); -} - - -/* * NAME: dbAllocBottomUp() * * FUNCTION: alloc the specified block range from the working block @@ -3343,7 +3388,10 @@ int dbExtendFS(struct inode *ipbmap, s64 /* get L2 page */ p = BMAPBLKNO + nbperpage; /* L2 page */ l2mp = read_metapage(ipbmap, p, PSIZE, 0); - assert(l2mp); + if (!l2mp) { + jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read"); + return -EIO; + } l2dcp = (struct dmapctl *) l2mp->data; /* compute start L1 */ @@ -3504,7 +3552,9 @@ int dbExtendFS(struct inode *ipbmap, s64 } } /* for each L1 in a L2 */ - assert(0); + jfs_error(ipbmap->i_sb, + "dbExtendFS: function has not returned as expected"); + return -EIO; /* * finalize bmap control page @@ -3568,7 +3618,10 @@ void dbFinalizeBmap(struct inode *ipbmap if (bmp->db_agfree[bmp->db_agpref] >= avgfree) break; } - assert(bmp->db_agpref < bmp->db_numag); + if (bmp->db_agpref >= bmp->db_numag) { + jfs_error(ipbmap->i_sb, + "cannot find ag with average freespace"); + } } /* @@ -3589,10 +3642,6 @@ void dbFinalizeBmap(struct inode *ipbmap n <<= 2; } -/* -printk("bmap: agpref:%d aglevel:%d agheigth:%d agwidth:%d\n", - bmp->db_agpref, bmp->db_aglevel, bmp->db_agheigth, bmp->db_agwidth); -*/ } @@ -3616,9 +3665,6 @@ printk("bmap: agpref:%d aglevel:%d aghei static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks) { int blkno, w, b, r, nw, nb, i; -/* -printk("sbh_dmap: in dbInitDmap blkno:%Ld nblocks:%ld\n", Blkno, nblocks); -*/ /* starting block number within the dmap */ blkno = Blkno & (BPERDMAP - 1); @@ -3678,10 +3724,6 @@ printk("sbh_dmap: in dbInitDmap blkno:% * mark bits following the range to be freed (non-existing * blocks) as allocated (ONES) */ -/* 
-printk("sbh_dmap: in dbInitDmap, preparing to mark unbacked, blkno:%ld nblocks:%ld\n", - blkno, nblocks); -*/ if (blkno == BPERDMAP) goto initTree; @@ -3691,9 +3733,6 @@ printk("sbh_dmap: in dbInitDmap, prepar /* does nblocks fall on a 32-bit boundary ? */ b = blkno & (DBWORD - 1); -/* -printk("sbh_dmap: in dbInitDmap, b:%ld w:%ld mask: %lx\n", b, w, (ONES>>b)); -*/ if (b) { /* mark a partial word allocated */ dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); @@ -3990,7 +4029,7 @@ static void DBinitmap(s64 size, struct i dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap); if (dbmap == NULL) - assert(0); + BUG(); /* Not robust since this is only unused debug code */ for (n = 0, d = dbmap; n < npages; n++, d += 1024) bzero(d, 4096); @@ -4004,7 +4043,9 @@ static void DBinitmap(s64 size, struct i db_l2nbperpage); mp = read_metapage(ipbmap, lblkno, PSIZE, 0); if (mp == NULL) { - assert(0); + jfs_error(ipbmap->i_sb, + "DBinitmap: could not read disk map page"); + continue; } dp = (struct dmap *) mp->data; diff -prauN linux-2.6.0-test7/fs/jfs/jfs_dtree.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_dtree.c --- linux-2.6.0-test7/fs/jfs/jfs_dtree.c 2003-10-08 12:24:25.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_dtree.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -130,9 +130,8 @@ struct dtsplit { if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ {\ - jfs_err("DT_GETPAGE: dtree page corrupt");\ BT_PUTPAGE(MP);\ - updateSuper((IP)->i_sb, FM_DIRTY);\ + jfs_error((IP)->i_sb, "DT_GETPAGE: dtree page corrupt");\ MP = NULL;\ RC = -EIO;\ }\ @@ -768,8 +767,7 @@ int dtSearch(struct inode *ip, struct co /* Something's corrupted, mark filesytem dirty so * chkdsk will fix it. 
*/ - jfs_err("stack overrun in dtSearch!"); - updateSuper(sb, FM_DIRTY); + jfs_error(sb, "stack overrun in dtSearch!"); rc = -EIO; goto out; } @@ -3204,11 +3202,12 @@ int jfs_readdir(struct file *filp, void d_namleft -= len; /* Sanity Check */ if (d_namleft == 0) { - jfs_err("JFS:Dtree error: ino = " - "%ld, bn=%Ld, index = %d", - (long)ip->i_ino,(long long)bn, - i); - updateSuper(ip->i_sb, FM_DIRTY); + jfs_error(ip->i_sb, + "JFS:Dtree error: ino = " + "%ld, bn=%Ld, index = %d", + (long)ip->i_ino, + (long long)bn, + i); goto skip_one; } len = min(d_namleft, DTSLOTDATALEN); diff -prauN linux-2.6.0-test7/fs/jfs/jfs_extent.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_extent.c --- linux-2.6.0-test7/fs/jfs/jfs_extent.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_extent.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -18,6 +18,7 @@ #include #include "jfs_incore.h" +#include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_extent.h" #include "jfs_debug.h" @@ -403,8 +404,10 @@ int extHint(struct inode *ip, s64 offset */ xp->flag &= XAD_NOTRECORDED; - assert(xadl.nxad == 1); - assert(lengthXAD(xp) == nbperpage); + if(xadl.nxad != 1 || lengthXAD(xp) != nbperpage) { + jfs_error(ip->i_sb, "extHint: corrupt xtree"); + return -EIO; + } return (0); } diff -prauN linux-2.6.0-test7/fs/jfs/jfs_filsys.h wli-2.6.0-test7-bk1-29/fs/jfs/jfs_filsys.h --- linux-2.6.0-test7/fs/jfs/jfs_filsys.h 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_filsys.h 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2003 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -32,6 +32,11 @@ /* mount time flag to disable journaling to disk */ #define JFS_NOINTEGRITY 0x00000010 +/* mount time flags for error handling */ +#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */ +#define JFS_ERR_CONTINUE 0x00000004 /* continue */ +#define JFS_ERR_PANIC 0x00000008 /* panic */ + /* platform option (conditional compilation) */ #define JFS_AIX 0x80000000 /* AIX support */ /* POSIX name/directory support */ diff -prauN linux-2.6.0-test7/fs/jfs/jfs_imap.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_imap.c --- linux-2.6.0-test7/fs/jfs/jfs_imap.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_imap.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -410,8 +410,7 @@ int diRead(struct inode *ip) dp += rel_inode; if (ip->i_ino != le32_to_cpu(dp->di_number)) { - jfs_err("diRead: i_ino != di_number"); - updateSuper(ip->i_sb, FM_DIRTY); + jfs_error(ip->i_sb, "diRead: i_ino != di_number"); rc = -EIO; } else if (le32_to_cpu(dp->di_nlink) == 0) rc = -ESTALE; @@ -641,9 +640,12 @@ int diWrite(tid_t tid, struct inode *ip) ino = ip->i_ino & (INOSPERIAG - 1); - assert(lengthPXD(&(jfs_ip->ixpxd)) == 
- JFS_IP(ipimap)->i_imap->im_nbperiext); - assert(addressPXD(&(jfs_ip->ixpxd))); + if (!addressPXD(&(jfs_ip->ixpxd)) || + (lengthPXD(&(jfs_ip->ixpxd)) != + JFS_IP(ipimap)->i_imap->im_nbperiext)) { + jfs_error(ip->i_sb, "diWrite: ixpxd invalid"); + return -EIO; + } /* * read the page of disk inode containing the specified inode: @@ -918,12 +920,11 @@ int diFree(struct inode *ip) /* make sure that the iag is contained within * the map. */ - //assert(iagno < imap->im_nextiag); if (iagno >= imap->im_nextiag) { - jfs_err("diFree: inum = %d, iagno = %d, nextiag = %d", - (uint) inum, iagno, imap->im_nextiag); dump_mem("imap", imap, 32); - updateSuper(ip->i_sb, FM_DIRTY); + jfs_error(ip->i_sb, + "diFree: inum = %d, iagno = %d, nextiag = %d", + (uint) inum, iagno, imap->im_nextiag); return -EIO; } @@ -957,22 +958,28 @@ int diFree(struct inode *ip) bitno = ino & (INOSPEREXT - 1); mask = HIGHORDER >> bitno; - assert(le32_to_cpu(iagp->wmap[extno]) & mask); -#ifdef _STILL_TO_PORT - assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); -#endif /* _STILL_TO_PORT */ - assert(addressPXD(&iagp->inoext[extno])); + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ip->i_sb, + "diFree: wmap shows inode already free"); + } + + if (!addressPXD(&iagp->inoext[extno])) { + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, "diFree: invalid inoext"); + return -EIO; + } /* compute the bitmap for the extent reflecting the freed inode. */ bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { - jfs_err("diFree: numfree > numinos"); release_metapage(mp); IREAD_UNLOCK(ipimap); AG_UNLOCK(imap, agno); - updateSuper(ip->i_sb, FM_DIRTY); + jfs_error(ip->i_sb, "diFree: numfree > numinos"); return -EIO; } /* @@ -1136,7 +1143,6 @@ int diFree(struct inode *ip) if ((rc = diIAGRead(imap, inofreefwd, &cmp))) goto error_out; - assert(cmp != NULL); ciagp = (struct iag *) cmp->data; } assert(ciagp != NULL); @@ -1151,7 +1157,6 @@ int diFree(struct inode *ip) if ((rc = diIAGRead(imap, inofreeback, &dmp))) goto error_out; - assert(dmp != NULL); diagp = (struct iag *) dmp->data; } assert(diagp != NULL); @@ -1224,7 +1229,9 @@ int diFree(struct inode *ip) * the permanent map should have been updated already * for the inode being freed. 
*/ - assert(iagp->pmap[extno] == 0); + if (iagp->pmap[extno] != 0) { + jfs_error(ip->i_sb, "diFree: the pmap does not show inode free"); + } iagp->wmap[extno] = 0; DBG_DIFREE(imap, inum); PXDlength(&iagp->inoext[extno], 0); @@ -1304,7 +1311,7 @@ int diFree(struct inode *ip) iplist[1] = (struct inode *) (size_t)iagno; iplist[2] = (struct inode *) (size_t)extno; - rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); // D233382 + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); txEnd(tid); @@ -1434,6 +1441,7 @@ int diAlloc(struct inode *pip, boolean_t iagno = INOTOIAG(inum); if ((rc = diIAGRead(imap, iagno, &mp))) { IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); return (rc); } iagp = (struct iag *) mp->data; @@ -1536,10 +1544,16 @@ int diAlloc(struct inode *pip, boolean_t */ rem = diFindFree(inosmap, 0); extno = (sword << L2EXTSPERSUM) + rem; - rem = - diFindFree(le32_to_cpu - (iagp->wmap[extno]), 0); - assert(rem < INOSPEREXT); + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), + 0); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + jfs_error(ip->i_sb, + "diAlloc: can't find free bit " + "in wmap"); + return EIO; + } /* determine the inode number within the * iag and allocate the inode from the @@ -1548,9 +1562,9 @@ int diAlloc(struct inode *pip, boolean_t ino = (extno << L2INOSPEREXT) + rem; rc = diAllocBit(imap, iagp, ino); IREAD_UNLOCK(ipimap); - if (rc) { + if (rc) assert(rc == -EIO); - } else { + else { /* set the results of the allocation * and write the iag. */ @@ -1678,8 +1692,7 @@ diAllocAG(struct inomap * imap, int agno numinos = imap->im_agctl[agno].numinos; if (numfree > numinos) { - jfs_err("diAllocAG: numfree > numinos"); - updateSuper(ip->i_sb, FM_DIRTY); + jfs_error(ip->i_sb, "diAllocAG: numfree > numinos"); return -EIO; } @@ -1827,12 +1840,10 @@ static int diAllocIno(struct inomap * im /* better be free inodes in this iag if it is on the * list. */ - //assert(iagp->nfreeinos); if (!iagp->nfreeinos) { - jfs_err("diAllocIno: nfreeinos = 0, but iag on freelist"); - jfs_err(" agno = %d, iagno = %d", agno, iagno); - dump_mem("iag", iagp, 64); - updateSuper(ip->i_sb, FM_DIRTY); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, + "diAllocIno: nfreeinos = 0, but iag on freelist"); return -EIO; } @@ -1840,7 +1851,12 @@ static int diAllocIno(struct inomap * im * with free inodes. */ for (sword = 0;; sword++) { - assert(sword < SMAPSZ); + if (sword >= SMAPSZ) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, + "diAllocIno: free inode not found in summary map"); + return -EIO; + } if (~iagp->inosmap[sword]) break; @@ -1850,13 +1866,21 @@ static int diAllocIno(struct inomap * im * the extent number. */ rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); - assert(rem < EXTSPERSUM); + if (rem >= EXTSPERSUM) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocIno: no free extent found"); + return -EIO; + } extno = (sword << L2EXTSPERSUM) + rem; /* find the first free inode in the extent. */ rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); - assert(rem < INOSPEREXT); + if (rem >= INOSPEREXT) { + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocIno: free inode not found"); + return -EIO; + } /* compute the inode number within the iag. 
*/ @@ -1939,7 +1963,9 @@ static int diAllocExt(struct inomap * im */ IREAD_LOCK(imap->im_ipimap); if ((rc = diIAGRead(imap, iagno, &mp))) { - assert(0); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: error reading iag"); + return rc; } iagp = (struct iag *) mp->data; } @@ -1947,7 +1973,13 @@ static int diAllocExt(struct inomap * im /* using the free extent summary map, find a free extent. */ for (sword = 0;; sword++) { - assert(sword < SMAPSZ); + if (sword >= SMAPSZ) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, + "diAllocExt: free ext summary map not found"); + return -EIO; + } if (~iagp->extsmap[sword]) break; } @@ -1955,7 +1987,12 @@ static int diAllocExt(struct inomap * im /* determine the extent number of the free extent. */ rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); - assert(rem < EXTSPERSUM); + if (rem >= EXTSPERSUM) { + release_metapage(mp); + IREAD_UNLOCK(imap->im_ipimap); + jfs_error(ip->i_sb, "diAllocExt: free extent not found"); + return -EIO; + } extno = (sword << L2EXTSPERSUM) + rem; /* initialize the new extent. @@ -2066,9 +2103,18 @@ static int diAllocBit(struct inomap * im /* the inode should be free and backed. */ - assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); - assert((le32_to_cpu(iagp->wmap[extno]) & mask) == 0); - assert(addressPXD(&iagp->inoext[extno]) != 0); + if (((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) || + ((le32_to_cpu(iagp->wmap[extno]) & mask) != 0) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + + jfs_error(imap->im_ipimap->i_sb, + "diAllocBit: iag inconsistent"); + return -EIO; + } /* mark the inode as allocated in the working map. */ @@ -2172,7 +2218,10 @@ static int diNewExt(struct inomap * imap /* better have free extents. */ - assert(iagp->nfreeexts); + if (!iagp->nfreeexts) { + jfs_error(imap->im_ipimap->i_sb, "diNewExt: no free extents"); + return -EIO; + } /* get the inode map inode. 
*/ @@ -2240,7 +2289,12 @@ static int diNewExt(struct inomap * imap goto error_out; ciagp = (struct iag *) cmp->data; } - assert(ciagp != NULL); + if (ciagp == NULL) { + jfs_error(imap->im_ipimap->i_sb, + "diNewExt: ciagp == NULL"); + rc = -EIO; + goto error_out; + } } } @@ -2474,7 +2528,14 @@ diNewIAG(struct inomap * imap, int *iagn /* acquire inode map lock */ IWRITE_LOCK(ipimap); - assert(ipimap->i_size >> L2PSIZE == imap->im_nextiag + 1); + if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) { + IWRITE_UNLOCK(ipimap); + IAGFREE_UNLOCK(imap); + jfs_error(imap->im_ipimap->i_sb, + "diNewIAG: ipimap->i_size is wrong"); + return -EIO; + } + /* get the next avaliable iag number */ iagno = imap->im_nextiag; @@ -2507,7 +2568,6 @@ diNewIAG(struct inomap * imap, int *iagn /* assign a buffer for the page */ mp = get_metapage(ipimap, xaddr, PSIZE, 1); - //bp = bmAssign(ipimap, blkno, xaddr, PSIZE, bmREAD_PAGE); if (!mp) { /* Free the blocks allocated for the iag since it was * not successfully added to the inode map @@ -2734,7 +2794,11 @@ diUpdatePMap(struct inode *ipimap, /* get the iag number containing the inode */ iagno = INOTOIAG(inum); /* make sure that the iag is contained within the map */ - assert(iagno < imap->im_nextiag); + if (iagno >= imap->im_nextiag) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: the iag is outside the map"); + return -EIO; + } /* read the iag */ IREAD_LOCK(ipimap); rc = diIAGRead(imap, iagno, &mp); @@ -2759,14 +2823,14 @@ diUpdatePMap(struct inode *ipimap, * of last reference release; */ if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { - jfs_err("diUpdatePMap: inode %ld not marked as " - "allocated in wmap!", inum); - updateSuper(ipimap->i_sb, FM_DIRTY); + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in wmap!", inum); } if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { - jfs_err("diUpdatePMap: inode %ld not marked as " - "allocated in pmap!", inum); - updateSuper(ipimap->i_sb, FM_DIRTY); + jfs_error(ipimap->i_sb, + "diUpdatePMap: inode %ld not marked as " + "allocated in pmap!", inum); } /* update the bitmap for the extent of the freed inode */ iagp->pmap[extno] &= cpu_to_le32(~mask); @@ -2778,8 +2842,18 @@ diUpdatePMap(struct inode *ipimap, /* The inode should be already allocated in the working map * and should be free in persistent map; */ - assert(le32_to_cpu(iagp->wmap[extno]) & mask); - assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not allocated in " + "the working map"); + return -EIO; + } + if ((le32_to_cpu(iagp->pmap[extno]) & mask) != 0) { + jfs_error(ipimap->i_sb, + "diUpdatePMap: the inode is not free in the " + "persistent map"); + return -EIO; + } /* update the bitmap for the extent of the allocated inode */ iagp->pmap[extno] |= cpu_to_le32(mask); } @@ -2817,7 +2891,6 @@ diUpdatePMap(struct inode *ipimap, mp->clsn = tblk->clsn; LOGSYNC_UNLOCK(log); } -// bmLazyWrite(mp, log->flag & JFS_COMMIT); write_metapage(mp); return (0); } @@ -2872,7 +2945,12 @@ int diExtendFS(struct inode *ipimap, str continue; } iagp = (struct iag *) bp->data; - assert(le32_to_cpu(iagp->iagnum) == i); + if (le32_to_cpu(iagp->iagnum) != i) { + release_metapage(bp); + jfs_error(ipimap->i_sb, + "diExtendFs: unexpected value of iagnum"); + return -EIO; + } /* leave free iag in the free iag list */ if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { @@ -2884,9 +2962,6 @@ int diExtendFS(struct inode *ipimap, str agstart = 
le64_to_cpu(iagp->agstart); /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ n = agstart >> mp->db_agl2size; -/* -printf("diExtendFS: iag:%d agstart:%Ld agno:%d\n", i, agstart, n); -*/ /* compute backed inodes */ numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) @@ -2947,8 +3022,12 @@ printf("diExtendFS: iag:%d agstart:%Ld a write_metapage(bp); } - ASSERT(xnuminos == atomic_read(&imap->im_numinos) && - xnumfree == atomic_read(&imap->im_numfree)); + if (xnuminos != atomic_read(&imap->im_numinos) || + xnumfree != atomic_read(&imap->im_numfree)) { + jfs_error(ipimap->i_sb, + "diExtendFs: numinos or numfree incorrect"); + return -EIO; + } return rcx; } diff -prauN linux-2.6.0-test7/fs/jfs/jfs_metapage.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_metapage.c --- linux-2.6.0-test7/fs/jfs/jfs_metapage.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_metapage.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +22,7 @@ #include #include #include "jfs_incore.h" +#include "jfs_superblock.h" #include "jfs_filsys.h" #include "jfs_metapage.h" #include "jfs_txnmgr.h" @@ -233,14 +234,23 @@ struct metapage *__get_metapage(struct i if (mp) { page_found: if (test_bit(META_discard, &mp->flag)) { - assert(new); /* It's okay to reuse a discarded - * if we expect it to be empty - */ + if (!new) { + spin_unlock(&meta_lock); + jfs_error(inode->i_sb, + "__get_metapage: using a " + "discarded metapage"); + return NULL; + } clear_bit(META_discard, &mp->flag); } mp->count++; jfs_info("__get_metapage: found 0x%p, in hash", mp); - assert(mp->logical_size == size); + if (mp->logical_size != size) { + spin_unlock(&meta_lock); + jfs_error(inode->i_sb, + "__get_metapage: mp->logical_size != size"); + return NULL; + } lock_metapage(mp); spin_unlock(&meta_lock); } else { diff -prauN linux-2.6.0-test7/fs/jfs/jfs_superblock.h wli-2.6.0-test7-bk1-29/fs/jfs/jfs_superblock.h --- linux-2.6.0-test7/fs/jfs/jfs_superblock.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_superblock.h 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -108,5 +108,6 @@ struct jfs_superblock { extern int readSuper(struct super_block *, struct buffer_head **); extern int updateSuper(struct super_block *, uint); +extern void jfs_error(struct super_block *, const char *, ...); #endif /*_H_JFS_SUPERBLOCK */ diff -prauN linux-2.6.0-test7/fs/jfs/jfs_txnmgr.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_txnmgr.c --- linux-2.6.0-test7/fs/jfs/jfs_txnmgr.c 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_txnmgr.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2003 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This 
program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -1442,7 +1442,6 @@ int diLog(struct jfs_log * log, struct t * page is not itself logged, to prevent pageout of the map * page before the log; */ - assert(tlck->type & tlckFREE); /* log LOG_NOREDOINOEXT of the freed inode extent for * logredo() to start NoRedoPage filters, and to update @@ -2655,7 +2654,7 @@ void txAbort(tid_t tid, int dirty) * mark filesystem dirty */ if (dirty) - updateSuper(tblk->sb, FM_DIRTY); + jfs_error(tblk->sb, "txAbort"); return; } @@ -2714,7 +2713,7 @@ static void txAbortCommit(struct commit /* * mark filesystem dirty */ - updateSuper(cd->sb, FM_DIRTY); + jfs_error(cd->sb, "txAbortCommit"); } diff -prauN linux-2.6.0-test7/fs/jfs/jfs_xtree.c wli-2.6.0-test7-bk1-29/fs/jfs/jfs_xtree.c --- linux-2.6.0-test7/fs/jfs/jfs_xtree.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/jfs_xtree.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -60,21 +60,21 @@ #define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) /* get page buffer for specified block address */ +/* ToDo: Replace this ugly macro with a function */ #define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ {\ - BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ - if (!(RC))\ - {\ - if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ - (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ - (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ - {\ - jfs_err("XT_GETPAGE: xtree page corrupt");\ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ + if (!(RC))\ + {\ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ + (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ + (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ + {\ + jfs_error((IP)->i_sb, "XT_GETPAGE: xtree page corrupt");\ BT_PUTPAGE(MP);\ - updateSuper((IP)->i_sb, FM_DIRTY);\ MP = NULL;\ - RC = -EIO;\ - }\ + RC = -EIO;\ + }\ }\ } @@ -1611,14 +1611,21 @@ int xtExtend(tid_t tid, /* transaction /* there must exist extent to be extended */ if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT))) return rc; - assert(cmp == 0); + if (cmp != 0) { + jfs_error(ip->i_sb, "xtExtend: xtSearch did not find extent"); + return -EIO; + } /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* extension must be contiguous */ xad = &p->xad[index]; - assert((offsetXAD(xad) + lengthXAD(xad)) == xoff); + if ((offsetXAD(xad) + lengthXAD(xad)) != xoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtExtend: extension is not contiguous"); + return -EIO; + } /* * acquire a transaction lock on the leaf page; @@ -1771,14 +1778,22 @@ printf("xtTailgate: nxoff:0x%lx nxlen:0x /* there must exist extent to be tailgated */ if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) return rc; - assert(cmp == 0); + if (cmp != 0) { + jfs_error(ip->i_sb, "xtTailgate: couldn't find extent"); + return -EIO; + } /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); /* entry found must be last entry */ nextindex = le16_to_cpu(p->header.nextindex); - assert(index == 
nextindex - 1); + if (index != nextindex - 1) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtTailgate: the entry found is not the last entry"); + return -EIO; + } BT_MARK_DIRTY(mp, ip); /* @@ -1941,13 +1956,14 @@ int xtUpdate(tid_t tid, struct inode *ip nxoff = offsetXAD(nxad); nxlen = lengthXAD(nxad); nxaddr = addressXAD(nxad); -/* -printf("xtUpdate: nxflag:0x%x nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", - nxad->flag, (ulong)nxoff, nxlen, (ulong)nxaddr); -*/ + if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) return rc; + + if (cmp != 0) { + jfs_error(ip->i_sb, "xtUpdate: Could not find extent"); + return -EIO; + } /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); @@ -1966,14 +1982,15 @@ int xtUpdate(tid_t tid, struct inode *ip xoff = offsetXAD(xad); xlen = lengthXAD(xad); xaddr = addressXAD(xad); -/* -printf("xtUpdate: xflag:0x%x xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", - xflag, (ulong)xoff, xlen, (ulong)xaddr); -*/ /* nXAD must be completely contained within XAD */ - assert(xoff <= nxoff); - assert(nxoff + nxlen <= xoff + xlen); + if ((xoff > nxoff) || + (nxoff + nxlen > xoff + xlen)) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: nXAD is not completely contained within XAD"); + return -EIO; + } index = index0; newindex = index + 1; @@ -2118,7 +2135,11 @@ printf("xtUpdate: xflag:0x%x xoff:0x%lx } else if (xoff == nxoff) goto out; - assert(xoff < nxoff); + if (xoff >= nxoff) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, "xtUpdate: xoff >= nxoff"); + return -EIO; + } /* #endif _JFS_WIP_COALESCE */ /* @@ -2135,9 +2156,6 @@ printf("xtUpdate: xflag:0x%x xoff:0x%lx /* insert nXAD:recorded */ if (nextindex == le16_to_cpu(p->header.maxentry)) { -/* -printf("xtUpdate.updateRight.split p:0x%p\n", p); -*/ rootsplit = p->header.flag & BT_ROOT; /* xtSpliUp() unpins leaf pages */ @@ -2248,18 +2266,23 @@ printf("xtUpdate.updateRight.split p:0x% /* recompute split pages */ if (nextindex == le16_to_cpu(p->header.maxentry)) { -/* -printf("xtUpdate: updateRight+Left recompute split pages: p:0x%p\n", p); -*/ XT_PUTPAGE(mp); if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) return rc; - assert(cmp == 0); + if (cmp != 0) { + jfs_error(ip->i_sb, "xtUpdate: xtSearch failed"); + return -EIO; + } /* retrieve search result */ XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); - assert(index0 == index); + if (index0 != index) { + XT_PUTPAGE(mp); + jfs_error(ip->i_sb, + "xtUpdate: unexpected value of index"); + return -EIO; + } } /* @@ -2755,6 +2778,7 @@ xtDeleteUp(tid_t tid, struct inode *ip, * txCommit() to commit all the allocation before call * this routine.
*/ +int xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ s64 nxaddr, /* new xaddr */ int xtype) @@ -3925,7 +3949,11 @@ s64 xtTruncate_pmap(tid_t tid, struct in rc = xtSearch(ip, xoff, &cmp, &btstack, 0); if (rc) return rc; - assert(cmp == 0); + if (cmp != 0) { + jfs_error(ip->i_sb, + "xtTruncate_pmap: did not find extent"); + return -EIO; + } XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); } else { /* diff -prauN linux-2.6.0-test7/fs/jfs/namei.c wli-2.6.0-test7-bk1-29/fs/jfs/namei.c --- linux-2.6.0-test7/fs/jfs/namei.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/namei.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,6 +19,7 @@ #include #include "jfs_incore.h" +#include "jfs_superblock.h" #include "jfs_inode.h" #include "jfs_dinode.h" #include "jfs_dmap.h" @@ -1138,7 +1139,17 @@ int jfs_rename(struct inode *old_dir, st new_ip->i_nlink--; if (S_ISDIR(new_ip->i_mode)) { new_ip->i_nlink--; - assert(new_ip->i_nlink == 0); + if (new_ip->i_nlink) { + up(&JFS_IP(new_dir)->commit_sem); + up(&JFS_IP(old_ip)->commit_sem); + if (old_dir != new_dir) + up(&JFS_IP(old_dir)->commit_sem); + if (!S_ISDIR(old_ip->i_mode) && new_ip) + IWRITE_UNLOCK(new_ip); + jfs_error(new_ip->i_sb, + "jfs_rename: new_ip->i_nlink != 0"); + return -EIO; + } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; tblk->ip = new_ip; diff -prauN linux-2.6.0-test7/fs/jfs/resize.c wli-2.6.0-test7-bk1-29/fs/jfs/resize.c --- linux-2.6.0-test7/fs/jfs/resize.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/resize.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2003 + * Copyright (C) International Business Machines Corp., 2000-2003 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -523,7 +523,7 @@ int jfs_extendfs(struct super_block *sb, goto resume; error_out: - updateSuper(sb, FM_DIRTY); + jfs_error(sb, "jfs_extendfs"); resume: /* diff -prauN linux-2.6.0-test7/fs/jfs/super.c wli-2.6.0-test7-bk1-29/fs/jfs/super.c --- linux-2.6.0-test7/fs/jfs/super.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/super.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2003 - * Portions Copyright (c) Christoph Hellwig, 2001-2002 + * Copyright (C) International Business Machines Corp., 2000-2003 + * Portions Copyright (C) Christoph Hellwig, 2001-2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -85,6 +85,42 @@ extern wait_queue_head_t jfs_IO_thread_w extern wait_queue_head_t jfs_commit_thread_wait; extern wait_queue_head_t jfs_sync_thread_wait; +static void jfs_handle_error(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (sb->s_flags & MS_RDONLY) + return; + + updateSuper(sb, FM_DIRTY); + + if (sbi->flag & JFS_ERR_PANIC) + panic("JFS (device %s): panic forced after error\n", + sb->s_id); + else if 
(sbi->flag & JFS_ERR_REMOUNT_RO) { + jfs_err("ERROR: (device %s): remounting filesystem " + "as read-only\n", + sb->s_id); + sb->s_flags |= MS_RDONLY; + } + + /* nothing is done for continue beyond marking the superblock dirty */ +} + +void jfs_error(struct super_block *sb, const char * function, ...) +{ + static char error_buf[256]; + va_list args; + + va_start(args, function); + vsprintf(error_buf, function, args); + va_end(args); + + printk(KERN_ERR "ERROR: (device %s): %s\n", sb->s_id, error_buf); + + jfs_handle_error(sb); +} + static struct inode *jfs_alloc_inode(struct super_block *sb) { struct jfs_inode_info *jfs_inode; @@ -167,7 +203,7 @@ static void jfs_put_super(struct super_b enum { Opt_integrity, Opt_nointegrity, Opt_iocharset, Opt_resize, - Opt_ignore, Opt_err, + Opt_errors, Opt_ignore, Opt_err, }; static match_table_t tokens = { @@ -175,6 +211,7 @@ static match_table_t tokens = { {Opt_nointegrity, "nointegrity"}, {Opt_iocharset, "iocharset=%s"}, {Opt_resize, "resize=%u"}, + {Opt_errors, "errors=%s"}, {Opt_ignore, "noquota"}, {Opt_ignore, "quota"}, {Opt_ignore, "usrquota"}, @@ -234,6 +271,31 @@ static int parse_options(char *options, *newLVSize = simple_strtoull(resize, &resize, 0); break; } + case Opt_errors: + { + char *errors = args[0].from; + if (!errors || !*errors) + goto cleanup; + if (!strcmp(errors, "continue")) { + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_CONTINUE; + } else if (!strcmp(errors, "remount-ro")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_PANIC; + *flag |= JFS_ERR_REMOUNT_RO; + } else if (!strcmp(errors, "panic")) { + *flag &= ~JFS_ERR_CONTINUE; + *flag &= ~JFS_ERR_REMOUNT_RO; + *flag |= JFS_ERR_PANIC; + } else { + printk(KERN_ERR + "JFS: %s is an invalid error handler\n", + errors); + goto cleanup; + } + break; + } default: printk("jfs: Unrecognized mount option \"%s\" " " or missing value\n", p); @@ -316,7 +378,9 @@ static int jfs_fill_super(struct super_b memset(sbi, 0, sizeof (struct jfs_sb_info)); sb->s_fs_info = sbi; - flag = 0; + /* initialize the mount flag and determine the default error handler */ + flag = JFS_ERR_REMOUNT_RO; + if (!parse_options((char *) data, sb, &newLVSize, &flag)) { kfree(sbi); return -EINVAL; diff -prauN linux-2.6.0-test7/fs/jfs/xattr.c wli-2.6.0-test7-bk1-29/fs/jfs/xattr.c --- linux-2.6.0-test7/fs/jfs/xattr.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/jfs/xattr.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,6 +1,6 @@ /* - * Copyright (c) International Business Machines Corp., 2000-2002 - * Copyright (c) Christoph Hellwig, 2002 + * Copyright (C) International Business Machines Corp., 2000-2003 + * Copyright (C) Christoph Hellwig, 2002 * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -20,6 +20,7 @@ #include #include #include "jfs_incore.h" +#include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_debug.h" #include "jfs_dinode.h" @@ -381,7 +382,10 @@ static int ea_read(struct inode *ip, str return ea_read_inline(ip, ealist); nbytes = sizeDXD(&ji->ea); - assert(nbytes); + if (!nbytes) { + jfs_error(sb, "ea_read: nbytes is 0"); + return -EIO; + } /* * Figure out how many blocks were allocated when this EA list was @@ -477,7 +481,10 @@ static int ea_get(struct inode *inode, s } current_blocks = 0; } else { - assert(ji->ea.flag & DXD_EXTENT); + if (!(ji->ea.flag & DXD_EXTENT)) { + jfs_error(sb, "ea_get: invalid ea.flag"); + return -EIO; + } current_blocks =
(ea_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits; } diff -prauN linux-2.6.0-test7/fs/libfs.c wli-2.6.0-test7-bk1-29/fs/libfs.c --- linux-2.6.0-test7/fs/libfs.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/libfs.c 2003-10-09 19:42:26.000000000 -0700 @@ -326,7 +326,7 @@ int simple_prepare_write(struct file *fi int simple_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; /* diff -prauN linux-2.6.0-test7/fs/locks.c wli-2.6.0-test7-bk1-29/fs/locks.c --- linux-2.6.0-test7/fs/locks.c 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/locks.c 2003-10-09 19:27:22.000000000 -0700 @@ -1288,7 +1288,7 @@ int fcntl_setlease(unsigned int fd, stru locks_insert_lock(before, fl); - error = f_setown(filp, current->tgid, 1); + error = f_setown(filp, current->pid, 0); out_unlock: unlock_kernel(); return error; diff -prauN linux-2.6.0-test7/fs/minix/dir.c wli-2.6.0-test7-bk1-29/fs/minix/dir.c --- linux-2.6.0-test7/fs/minix/dir.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/minix/dir.c 2003-10-09 19:42:26.000000000 -0700 @@ -47,9 +47,9 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -240,7 +240,7 @@ int minix_add_link(struct dentry *dentry got_it: from = (char*)de - (char*)page_address(page); to = from + sbi->s_dirsize; - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -260,7 +260,7 @@ out_unlock: int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = page_address(page); unsigned from = (char*)de - kaddr; @@ -364,14 +364,14 @@ not_empty: void minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + sbi->s_dirsize; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err == 0) { de->inode = inode->i_ino; err = dir_commit_chunk(page, from, to); diff -prauN linux-2.6.0-test7/fs/mpage.c wli-2.6.0-test7-bk1-29/fs/mpage.c --- linux-2.6.0-test7/fs/mpage.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/mpage.c 2003-10-09 19:42:26.000000000 -0700 @@ -129,7 +129,7 @@ mpage_alloc(struct block_device *bdev, static void map_buffer_to_page(struct page *page, struct buffer_head *bh, int page_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *page_bh, *head; int block = 0; @@ -209,7 +209,7 
@@ static struct bio * do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, sector_t *last_block_in_bio, get_block_t get_block) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; const unsigned blocksize = 1 << blkbits; @@ -388,8 +388,8 @@ static struct bio * mpage_writepage(struct bio *bio, struct page *page, get_block_t get_block, sector_t *last_block_in_bio, int *ret, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; - struct inode *inode = page->mapping->host; + struct address_space *mapping = page_mapping(page); + struct inode *inode = page_mapping(page)->host; const unsigned blkbits = inode->i_blkbits; unsigned long end_index; const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; @@ -416,7 +416,7 @@ mpage_writepage(struct bio *bio, struct if (!buffer_mapped(bh)) { /* * unmapped dirty buffers are created by - * __set_page_dirty_buffers -> mmapped data + * set_page_dirty_buffers -> mmapped data */ if (buffer_dirty(bh)) goto confused; @@ -562,7 +562,7 @@ alloc_new: confused: if (bio) bio = mpage_bio_submit(WRITE, bio); - *ret = page->mapping->a_ops->writepage(page, wbc); + *ret = page_mapping(page)->a_ops->writepage(page, wbc); /* * The caller has a ref on the inode, so *mapping is stable */ @@ -635,7 +635,7 @@ mpage_writepages(struct address_space *m if (get_block == NULL) writepage = mapping->a_ops->writepage; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->io_pages) && !done) { struct page *page = list_entry(mapping->io_pages.prev, struct page, list); @@ -655,12 +655,12 @@ mpage_writepages(struct address_space *m list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* * At this point we hold neither mapping->page_lock nor * lock on the page itself: the page may be truncated or - * invalidated (changing page->mapping to NULL), or even + * invalidated (changing page_mapping(page) to NULL), or even * swizzled back from swapper_space to tmpfs file mapping. 
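/*
 * A condensed sketch of the recheck the comment above calls for, under
 * the patch's page_mapping() convention: once the page lock is retaken,
 * the mapping must be re-read and compared before writeback proceeds,
 * since truncation may have set it to NULL or swapped it. The helper
 * name is hypothetical; the real logic lives inline in
 * mpage_writepages() in this hunk.
 */
static int writeback_if_still_ours(struct page *page,
				   struct address_space *mapping,
				   struct writeback_control *wbc)
{
	lock_page(page);
	if (page_mapping(page) == mapping && !PageWriteback(page) &&
	    test_clear_page_dirty(page))
		return mapping->a_ops->writepage(page, wbc); /* unlocks page */
	unlock_page(page);	/* truncated away, or writeback in flight */
	return 0;
}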
*/ @@ -669,7 +669,7 @@ mpage_writepages(struct address_space *m if (wbc->sync_mode != WB_SYNC_NONE) wait_on_page_writeback(page); - if (page->mapping == mapping && !PageWriteback(page) && + if (page_mapping(page) == mapping && !PageWriteback(page) && test_clear_page_dirty(page)) { if (writepage) { ret = (*writepage)(page, wbc); @@ -695,12 +695,12 @@ mpage_writepages(struct address_space *m unlock_page(page); } page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } /* * Leave any remaining dirty pages on ->io_pages */ - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (bio) mpage_bio_submit(WRITE, bio); return ret; diff -prauN linux-2.6.0-test7/fs/ncpfs/symlink.c wli-2.6.0-test7-bk1-29/fs/ncpfs/symlink.c --- linux-2.6.0-test7/fs/ncpfs/symlink.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/ncpfs/symlink.c 2003-10-09 19:42:26.000000000 -0700 @@ -43,7 +43,7 @@ static int ncp_symlink_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error, length, len; char *link, *rawlink; char *buf = kmap(page); diff -prauN linux-2.6.0-test7/fs/nfs/direct.c wli-2.6.0-test7-bk1-29/fs/nfs/direct.c --- linux-2.6.0-test7/fs/nfs/direct.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/direct.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,7 +1,7 @@ /* * linux/fs/nfs/direct.c * - * Copyright (C) 2001 by Chuck Lever + * Copyright (C) 2003 by Chuck Lever * * High-performance uncached I/O for the Linux NFS client * @@ -26,19 +26,23 @@ * also supports uncaching whole NFS partitions with "-o forcedirectio," * an undocumented mount option. * - * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust. + * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with + * help from Andrew Morton. 
* * 18 Dec 2001 Initial implementation for 2.4 --cel * 08 Jul 2002 Version for 2.4.19, with bug fixes --trondmy - * 24 Sep 2002 Rewrite to use asynchronous RPCs, port to 2.5 --cel + * 08 Jun 2003 Port to 2.5 APIs --cel * */ #include +#include #include #include +#include #include -#include +#include + #include #include #include @@ -46,35 +50,41 @@ #include #include -#define NFSDBG_FACILITY (NFSDBG_PAGECACHE | NFSDBG_VFS) +#define NFSDBG_FACILITY NFSDBG_VFS #define VERF_SIZE (2 * sizeof(__u32)) +#define MAX_DIRECTIO_SIZE (4096UL << PAGE_SHIFT) /** - * nfs_get_user_pages - find and set up page representing user buffer - * addr: user-space address of target buffer - * size: total size in bytes of target buffer - * @pages: returned array of page struct pointers underlying target buffer - * write: whether or not buffer is target of a write operation + * nfs_get_user_pages - find and set up pages underlying user's buffer + * rw: direction (read or write) + * user_addr: starting address of this segment of user's buffer + * count: size of this segment + * @pages: returned array of page struct pointers underlying user's buffer */ static inline int -nfs_get_user_pages(unsigned long addr, size_t size, - struct page ***pages, int rw) +nfs_get_user_pages(int rw, unsigned long user_addr, size_t size, + struct page ***pages) { int result = -ENOMEM; - unsigned page_count = (unsigned) size >> PAGE_SHIFT; - unsigned array_size = (page_count * sizeof(struct page *)) + 2U; + unsigned long page_count; + size_t array_size; - *pages = (struct page **) kmalloc(array_size, GFP_KERNEL); + /* set an arbitrary limit to prevent arithmetic overflow */ + if (size > MAX_DIRECTIO_SIZE) + return -EFBIG; + + page_count = (user_addr + size + PAGE_SIZE - 1) >> PAGE_SHIFT; + page_count -= user_addr >> PAGE_SHIFT; + + array_size = (page_count * sizeof(struct page *)); + *pages = kmalloc(array_size, GFP_KERNEL); if (*pages) { down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, addr, - page_count, (rw == WRITE), 0, + result = get_user_pages(current, current->mm, user_addr, + page_count, (rw == READ), 0, *pages, NULL); up_read(¤t->mm->mmap_sem); - if (result < 0) - printk(KERN_ERR "%s: get_user_pages result %d\n", - __FUNCTION__, result); } return result; } @@ -83,177 +93,366 @@ nfs_get_user_pages(unsigned long addr, s * nfs_free_user_pages - tear down page struct array * @pages: array of page struct pointers underlying target buffer */ -static inline void -nfs_free_user_pages(struct page **pages, unsigned count) +static void +nfs_free_user_pages(struct page **pages, int npages, int do_dirty) { - unsigned page = 0; + int i; + for (i = 0; i < npages; i++) { + if (do_dirty) + set_page_dirty_lock(pages[i]); + page_cache_release(pages[i]); + } + kfree(pages); +} - while (count--) - page_cache_release(pages[page++]); +/** + * nfs_direct_read_seg - Read in one iov segment. Generate separate + * read RPCs for each "rsize" bytes. 
+ * @inode: target inode + * @file: target file (may be NULL) + * user_addr: starting address of this segment of user's buffer + * count: size of this segment + * file_offset: offset in file to begin the operation + * @pages: array of addresses of page structs defining user's buffer + * nr_pages: size of pages array + */ +static int +nfs_direct_read_seg(struct inode *inode, struct file *file, + unsigned long user_addr, size_t count, loff_t file_offset, + struct page **pages, int nr_pages) +{ + const unsigned int rsize = NFS_SERVER(inode)->rsize; + int tot_bytes = 0; + int curpage = 0; + struct nfs_read_data rdata = { + .inode = inode, + .args = { + .fh = NFS_FH(inode), + }, + .res = { + .fattr = &rdata.fattr, + }, + }; + + rdata.args.pgbase = user_addr & ~PAGE_MASK; + rdata.args.offset = file_offset; + do { + int result; + + rdata.args.count = count; + if (rdata.args.count > rsize) + rdata.args.count = rsize; + rdata.args.pages = &pages[curpage]; + + dprintk("NFS: direct read: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", + rdata.args.count, (long long) rdata.args.offset, + user_addr + tot_bytes, rdata.args.pgbase, curpage); + + lock_kernel(); + result = NFS_PROTO(inode)->read(&rdata, file); + unlock_kernel(); + + if (result <= 0) { + if (tot_bytes > 0) + break; + if (result == -EISDIR) + result = -EINVAL; + return result; + } - kfree(pages); + tot_bytes += result; + if (rdata.res.eof) + break; + + rdata.args.offset += result; + rdata.args.pgbase += result; + curpage += rdata.args.pgbase >> PAGE_SHIFT; + rdata.args.pgbase &= ~PAGE_MASK; + count -= result; + } while (count != 0); + + /* XXX: should we zero the rest of the user's buffer if we + * hit eof? */ + + return tot_bytes; } /** - * nfs_iov2pagelist - convert an array of iovecs to a list of page requests - * @inode: inode of target file - * @cred: credentials of user who requested I/O + * nfs_direct_read - For each iov segment, map the user's buffer + * then generate read RPCs. + * @inode: target inode + * @file: target file (may be NULL) * @iov: array of vectors that define I/O buffer - * offset: where in file to begin the read + * file_offset: offset in file to begin the operation * nr_segs: size of iovec array - * @requests: append new page requests to this list head + * + * generic_file_direct_IO has already pushed out any non-direct + * writes so that this read will see them when we read from the + * server. 
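/*
 * A hedged user-space view of how the read path above is reached:
 * opening with O_DIRECT sends reads through nfs_direct_IO() instead of
 * the client page cache. The page-aligned buffer below is conventional
 * O_DIRECT hygiene, assumed here rather than stated by this patch.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

static ssize_t read_direct(const char *path, void **bufp, size_t len)
{
	void *buf;
	ssize_t n;
	int fd = open(path, O_RDONLY | O_DIRECT);

	if (fd < 0)
		return -1;
	if (posix_memalign(&buf, 4096, len)) {	/* page-aligned buffer */
		close(fd);
		return -1;
	}
	n = read(fd, buf, len);		/* arrives at nfs_direct_read() */
	close(fd);
	*bufp = buf;			/* caller frees */
	return n;
}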
*/ static int -nfs_iov2pagelist(int rw, const struct inode *inode, - const struct rpc_cred *cred, - const struct iovec *iov, loff_t offset, - unsigned long nr_segs, struct list_head *requests) +nfs_direct_read(struct inode *inode, struct file *file, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) { - unsigned seg; int tot_bytes = 0; - struct page **pages; + unsigned long seg = 0; + + while ((seg < nr_segs) && (tot_bytes >= 0)) { + int result, page_count; + struct page **pages; + const struct iovec *vec = &iov[seg++]; + unsigned long user_addr = (unsigned long) vec->iov_base; + size_t size = vec->iov_len; + + page_count = nfs_get_user_pages(READ, user_addr, size, &pages); + if (page_count < 0) { + nfs_free_user_pages(pages, 0, 0); + if (tot_bytes > 0) + break; + return page_count; + } + + result = nfs_direct_read_seg(inode, file, user_addr, size, + file_offset, pages, page_count); + + nfs_free_user_pages(pages, page_count, 1); + + if (result <= 0) { + if (tot_bytes > 0) + break; + return result; + } + tot_bytes += result; + file_offset += result; + if (result < size) + break; + } + + return tot_bytes; +} - /* for each iovec in the array... */ - for (seg = 0; seg < nr_segs; seg++) { - const unsigned long user_addr = - (unsigned long) iov[seg].iov_base; - size_t bytes = iov[seg].iov_len; - unsigned int pg_offset = (user_addr & ~PAGE_MASK); - int page_count, page = 0; - - page_count = nfs_get_user_pages(user_addr, bytes, &pages, rw); - if (page_count < 0) { - nfs_release_list(requests); - return page_count; +/** + * nfs_direct_write_seg - Write out one iov segment. Generate separate + * write RPCs for each "wsize" bytes, then commit. + * @inode: target inode + * @file: target file (may be NULL) + * user_addr: starting address of this segment of user's buffer + * count: size of this segment + * file_offset: offset in file to begin the operation + * @pages: array of addresses of page structs defining user's buffer + * nr_pages: size of pages array + */ +static int +nfs_direct_write_seg(struct inode *inode, struct file *file, + unsigned long user_addr, size_t count, loff_t file_offset, + struct page **pages, int nr_pages) +{ + const unsigned int wsize = NFS_SERVER(inode)->wsize; + size_t request; + int need_commit; + int tot_bytes; + int curpage; + struct nfs_writeverf first_verf; + struct nfs_write_data wdata = { + .inode = inode, + .args = { + .fh = NFS_FH(inode), + }, + .res = { + .fattr = &wdata.fattr, + .verf = &wdata.verf, + }, + }; + + wdata.args.stable = NFS_UNSTABLE; + if (IS_SYNC(inode) || NFS_PROTO(inode)->version == 2 || count <= wsize) + wdata.args.stable = NFS_FILE_SYNC; + +retry: + need_commit = 0; + tot_bytes = 0; + curpage = 0; + request = count; + wdata.args.pgbase = user_addr & ~PAGE_MASK; + wdata.args.offset = file_offset; + do { + int result; + + wdata.args.count = request; + if (wdata.args.count > wsize) + wdata.args.count = wsize; + wdata.args.pages = &pages[curpage]; + + dprintk("NFS: direct write: c=%u o=%Ld ua=%lu, pb=%u, cp=%u\n", + wdata.args.count, (long long) wdata.args.offset, + user_addr + tot_bytes, wdata.args.pgbase, curpage); + + lock_kernel(); + result = NFS_PROTO(inode)->write(&wdata, file); + unlock_kernel(); + + if (result <= 0) { + if (tot_bytes > 0) + break; + return result; } - /* ...build as many page requests as required */ - while (bytes > 0) { - struct nfs_page *new; - const unsigned int pg_bytes = (bytes > PAGE_SIZE) ? 
- PAGE_SIZE : bytes; - - new = nfs_create_request((struct rpc_cred *) cred, - (struct inode *) inode, - pages[page], - pg_offset, pg_bytes); - if (IS_ERR(new)) { - nfs_free_user_pages(pages, page_count); - nfs_release_list(requests); - return PTR_ERR(new); - } - new->wb_index = offset; - nfs_list_add_request(new, requests); - - /* after the first page */ - pg_offset = 0; - offset += PAGE_SIZE; - tot_bytes += pg_bytes; - bytes -= pg_bytes; - page++; + if (tot_bytes == 0) + memcpy(&first_verf.verifier, &wdata.verf.verifier, + VERF_SIZE); + if (wdata.verf.committed != NFS_FILE_SYNC) { + need_commit = 1; + if (memcmp(&first_verf.verifier, + &wdata.verf.verifier, VERF_SIZE)) + goto sync_retry; } - /* don't release pages here -- I/O completion will do that */ - nfs_free_user_pages(pages, 0); + tot_bytes += result; + wdata.args.offset += result; + wdata.args.pgbase += result; + curpage += wdata.args.pgbase >> PAGE_SHIFT; + wdata.args.pgbase &= ~PAGE_MASK; + request -= result; + } while (request != 0); + + /* + * Commit data written so far, even in the event of an error + */ + if (need_commit) { + int result; + + wdata.args.count = tot_bytes; + wdata.args.offset = file_offset; + + lock_kernel(); + result = NFS_PROTO(inode)->commit(&wdata, file); + unlock_kernel(); + + if (result < 0 || memcmp(&first_verf.verifier, + &wdata.verf.verifier, + VERF_SIZE) != 0) + goto sync_retry; } return tot_bytes; + +sync_retry: + wdata.args.stable = NFS_FILE_SYNC; + goto retry; } /** - * do_nfs_direct_IO - Read or write data without caching - * @inode: inode of target file - * @cred: credentials of user who requested I/O + * nfs_direct_write - For each iov segment, map the user's buffer + * then generate write and commit RPCs. + * @inode: target inode + * @file: target file (may be NULL) * @iov: array of vectors that define I/O buffer - * offset: where in file to begin the read + * file_offset: offset in file to begin the operation * nr_segs: size of iovec array * - * Break the passed-in iovec into a series of page-sized or smaller - * requests, where each page is mapped for direct user-land I/O. - * - * For each of these pages, create an NFS page request and - * append it to an automatic list of page requests. - * - * When all page requests have been queued, start the I/O on the - * whole list. The underlying routines coalesce the pages on the - * list into a bunch of asynchronous "r/wsize" network requests. - * - * I/O completion automatically unmaps and releases the pages. + * Upon return, generic_file_direct_IO invalidates any cached pages + * that non-direct readers might access, so they will pick up these + * writes immediately. 
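nfs_direct_write_seg() above hand-rolls the NFSv3 unstable-write protocol: issue UNSTABLE WRITEs, remember the verifier from the first reply, and if any later WRITE or the closing COMMIT returns a different verifier the server has lost its write cache (typically a reboot), so the whole segment is replayed with FILE_SYNC. A toy userspace model of that discipline (all names hypothetical):

/* Toy model: a verifier change means "server rebooted", so fall back
 * to synchronous writes, exactly like the sync_retry path above. */
#include <stdio.h>

typedef unsigned long long verifier_t;

static verifier_t server_verf = 0x1111;	/* what each reply carries */

static verifier_t write_unstable(int chunk)
{
	if (chunk == 2)
		server_verf = 0x2222;	/* simulate a reboot mid-stream */
	return server_verf;
}

int main(void)
{
	verifier_t first;
	int file_sync = 0;		/* like wdata.args.stable */
	int chunk;

retry:
	if (file_sync) {
		printf("replaying every chunk with FILE_SYNC\n");
		return 0;		/* sync writes need no COMMIT */
	}
	first = write_unstable(0);	/* like saving first_verf */
	for (chunk = 1; chunk < 4; chunk++)
		if (write_unstable(chunk) != first) {
			file_sync = 1;	/* like goto sync_retry */
			goto retry;
		}
	/* the real code now sends COMMIT and re-checks the verifier once */
	printf("all chunks written under one verifier, committed\n");
	return 0;
}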
*/ static int -do_nfs_direct_IO(int rw, const struct inode *inode, - const struct rpc_cred *cred, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) +nfs_direct_write(struct inode *inode, struct file *file, + const struct iovec *iov, loff_t file_offset, + unsigned long nr_segs) { - LIST_HEAD(requests); - int result, tot_bytes; - - result = nfs_iov2pagelist(rw, inode, cred, iov, offset, nr_segs, - &requests); - if (result < 0) - return result; - tot_bytes = result; + int tot_bytes = 0; + unsigned long seg = 0; - switch (rw) { - case READ: - if (IS_SYNC(inode) || (NFS_SERVER(inode)->rsize < PAGE_SIZE)) { - result = nfs_direct_read_sync(inode, cred, iov, offset, nr_segs); - break; + while ((seg < nr_segs) && (tot_bytes >= 0)) { + int result, page_count; + struct page **pages; + const struct iovec *vec = &iov[seg++]; + unsigned long user_addr = (unsigned long) vec->iov_base; + size_t size = vec->iov_len; + + page_count = nfs_get_user_pages(WRITE, user_addr, size, &pages); + if (page_count < 0) { + nfs_free_user_pages(pages, 0, 0); + if (tot_bytes > 0) + break; + return page_count; + } + + result = nfs_direct_write_seg(inode, file, user_addr, size, + file_offset, pages, page_count); + nfs_free_user_pages(pages, page_count, 0); + + if (result <= 0) { + if (tot_bytes > 0) + break; + return result; } - result = nfs_pagein_list(&requests, NFS_SERVER(inode)->rpages); - nfs_wait_for_reads(&requests); - break; - case WRITE: - if (IS_SYNC(inode) || (NFS_SERVER(inode)->wsize < PAGE_SIZE)) - result = nfs_direct_write_sync(inode, cred, iov, offset, nr_segs); - else - result = nfs_flush_list(&requests, - NFS_SERVER(inode)->wpages, FLUSH_WAIT); - - /* invalidate cache so non-direct readers pick up changes */ - invalidate_inode_pages((struct inode *) inode); - break; - default: - result = -EINVAL; - break; + tot_bytes += result; + file_offset += result; + if (result < size) + break; } + /* Zap the page cache if we managed to write */ + if (tot_bytes > 0) + invalidate_remote_inode(inode); - if (result < 0) - return result; return tot_bytes; } /** * nfs_direct_IO - NFS address space operation for direct I/O * rw: direction (read or write) - * @file: file struct of target file + * @iocb: target I/O control block * @iov: array of vectors that define I/O buffer - * offset: offset in file to begin the operation + * file_offset: offset in file to begin the operation * nr_segs: size of iovec array * + * Usually a file system implements direct I/O by calling out to + * blockdev_direct_IO. The NFS client doesn't have a backing block + * device, so we do everything by hand instead. + * * The inode's i_sem is no longer held by the VFS layer before it calls * this function to do a write. */ int nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, - loff_t offset, unsigned long nr_segs) + loff_t file_offset, unsigned long nr_segs) { - /* None of this works yet, so prevent it from compiling. */ -#if 0 - int result; + int result = -EINVAL; + struct file *file = iocb->ki_filp; struct dentry *dentry = file->f_dentry; - const struct inode *inode = dentry->d_inode->i_mapping->host; - const struct rpc_cred *cred = nfs_file_cred(file); -#endif - - dfprintk(VFS, "NFS: direct_IO(%s) (%s/%s) off/no(%Lu/%lu)\n", - ((rw == READ) ? 
"READ" : "WRITE"), - dentry->d_parent->d_name.name, - dentry->d_name.name, offset, nr_segs); + struct inode *inode = dentry->d_inode; + + /* + * No support for async yet + */ + if (!is_sync_kiocb(iocb)) + goto out; + + result = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (result < 0) + goto out; + + switch (rw) { + case READ: + dprintk("NFS: direct_IO(read) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); - result = do_nfs_direct_IO(rw, inode, cred, iov, offset, nr_segs); + result = nfs_direct_read(inode, file, iov, + file_offset, nr_segs); + break; + case WRITE: + dprintk("NFS: direct_IO(write) (%s) off/no(%Lu/%lu)\n", + dentry->d_name.name, file_offset, nr_segs); - dfprintk(VFS, "NFS: direct_IO result = %d\n", result); + result = nfs_direct_write(inode, file, iov, + file_offset, nr_segs); + break; + default: + break; + } +out: + dprintk("NFS: direct_IO result=%d\n", result); return result; } diff -prauN linux-2.6.0-test7/fs/nfs/file.c wli-2.6.0-test7-bk1-29/fs/nfs/file.c --- linux-2.6.0-test7/fs/nfs/file.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -216,7 +216,7 @@ static int nfs_commit_write(struct file struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, .writepage = nfs_writepage, .writepages = nfs_writepages, .prepare_write = nfs_prepare_write, diff -prauN linux-2.6.0-test7/fs/nfs/nfs3proc.c wli-2.6.0-test7-bk1-29/fs/nfs/nfs3proc.c --- linux-2.6.0-test7/fs/nfs/nfs3proc.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/nfs3proc.c 2003-10-09 19:27:22.000000000 -0700 @@ -284,6 +284,29 @@ nfs3_proc_write(struct nfs_write_data *w return status < 0? status : wdata->res.count; } +static int +nfs3_proc_commit(struct nfs_write_data *cdata, struct file *filp) +{ + struct inode * inode = cdata->inode; + struct nfs_fattr * fattr = cdata->res.fattr; + struct rpc_message msg = { + .rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT], + .rpc_argp = &cdata->args, + .rpc_resp = &cdata->res, + }; + int status; + + dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, + (long long) cdata->args.offset); + fattr->valid = 0; + msg.rpc_cred = nfs_cred(inode, filp); + status = rpc_call_sync(NFS_CLIENT(inode), &msg, 0); + if (status >= 0) + nfs3_write_refresh_inode(inode, fattr); + dprintk("NFS reply commit: %d\n", status); + return status; +} + /* * Create a regular file. * For now, we don't implement O_EXCL. 
@@ -883,6 +906,7 @@ struct nfs_rpc_ops nfs_v3_clientops = { .readlink = nfs3_proc_readlink, .read = nfs3_proc_read, .write = nfs3_proc_write, + .commit = nfs3_proc_commit, .create = nfs3_proc_create, .remove = nfs3_proc_remove, .unlink_setup = nfs3_proc_unlink_setup, diff -prauN linux-2.6.0-test7/fs/nfs/nfs4proc.c wli-2.6.0-test7-bk1-29/fs/nfs/nfs4proc.c --- linux-2.6.0-test7/fs/nfs/nfs4proc.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/nfs4proc.c 2003-10-09 19:27:22.000000000 -0700 @@ -1038,7 +1038,6 @@ nfs4_proc_read(struct nfs_read_data *rda .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ], .rpc_argp = &rdata->args, .rpc_resp = &rdata->res, - .rpc_cred = rdata->cred, }; unsigned long timestamp = jiffies; int status; @@ -1053,8 +1052,11 @@ nfs4_proc_read(struct nfs_read_data *rda struct nfs4_state *state; state = (struct nfs4_state *)filp->private_data; memcpy(&rdata->args.stateid, &state->stateid, sizeof(rdata->args.stateid)); - } else + msg.rpc_cred = state->owner->so_cred; + } else { memcpy(&rdata->args.stateid, &zero_stateid, sizeof(rdata->args.stateid)); + msg.rpc_cred = NFS_I(inode)->mm_cred; + } fattr->valid = 0; status = rpc_call_sync(server->client, &msg, flags); @@ -1079,7 +1081,6 @@ nfs4_proc_write(struct nfs_write_data *w .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE], .rpc_argp = &wdata->args, .rpc_resp = &wdata->res, - .rpc_cred = wdata->cred, }; int status; @@ -1093,15 +1094,54 @@ nfs4_proc_write(struct nfs_write_data *w struct nfs4_state *state; state = (struct nfs4_state *)filp->private_data; memcpy(&wdata->args.stateid, &state->stateid, sizeof(wdata->args.stateid)); - } else + msg.rpc_cred = state->owner->so_cred; + } else { memcpy(&wdata->args.stateid, &zero_stateid, sizeof(wdata->args.stateid)); + msg.rpc_cred = NFS_I(inode)->mm_cred; + } fattr->valid = 0; status = rpc_call_sync(server->client, &msg, rpcflags); + NFS_CACHEINV(inode); dprintk("NFS reply write: %d\n", status); return status; } +static int +nfs4_proc_commit(struct nfs_write_data *cdata, struct file *filp) +{ + struct inode *inode = cdata->inode; + struct nfs_fattr *fattr = cdata->res.fattr; + struct nfs_server *server = NFS_SERVER(inode); + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT], + .rpc_argp = &cdata->args, + .rpc_resp = &cdata->res, + }; + int status; + + dprintk("NFS call commit %d @ %Ld\n", cdata->args.count, + (long long) cdata->args.offset); + + /* + * Try first to use O_WRONLY, then O_RDWR stateid. + */ + if (filp) { + struct nfs4_state *state; + state = (struct nfs4_state *)filp->private_data; + memcpy(&cdata->args.stateid, &state->stateid, sizeof(cdata->args.stateid)); + msg.rpc_cred = state->owner->so_cred; + } else { + memcpy(&cdata->args.stateid, &zero_stateid, sizeof(cdata->args.stateid)); + msg.rpc_cred = NFS_I(inode)->mm_cred; + } + + fattr->valid = 0; + status = rpc_call_sync(server->client, &msg, 0); + dprintk("NFS reply commit: %d\n", status); + return status; +} + /* * Got race? * We will need to arrange for the VFS layer to provide an atomic open. 
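The stateid/credential selection just added to nfs4_proc_read(), nfs4_proc_write() and the new nfs4_proc_commit() is the same pattern three times over: given a struct file, use its open stateid and the open owner's credential; otherwise fall back to the zero stateid and the inode's mm_cred. A follow-up could hoist it into one helper, along these lines (hypothetical refactoring, not part of this patch):

/* Hypothetical helper folding the selection repeated above. */
static void nfs4_setup_state(struct file *filp, struct inode *inode,
			     nfs4_stateid *stateid, struct rpc_message *msg)
{
	if (filp) {
		struct nfs4_state *state = filp->private_data;

		memcpy(stateid, &state->stateid, sizeof(*stateid));
		msg->rpc_cred = state->owner->so_cred;
	} else {
		memcpy(stateid, &zero_stateid, sizeof(*stateid));
		msg->rpc_cred = NFS_I(inode)->mm_cred;
	}
}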
@@ -1772,7 +1812,7 @@ struct nfs_rpc_ops nfs_v4_clientops = { .readlink = nfs4_proc_readlink, .read = nfs4_proc_read, .write = nfs4_proc_write, - .commit = NULL, + .commit = nfs4_proc_commit, .create = nfs4_proc_create, .remove = nfs4_proc_remove, .unlink_setup = nfs4_proc_unlink_setup, diff -prauN linux-2.6.0-test7/fs/nfs/nfs4xdr.c wli-2.6.0-test7-bk1-29/fs/nfs/nfs4xdr.c --- linux-2.6.0-test7/fs/nfs/nfs4xdr.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/nfs4xdr.c 2003-10-09 19:27:22.000000000 -0700 @@ -2323,6 +2323,8 @@ nfs4_xdr_dec_read(struct rpc_rqst *rqstp status = decode_read_getattr(&xdr, res->fattr); if (!status) status = -nfs_stat_to_errno(hdr.status); + if (!status) + status = res->count; out: return status; } diff -prauN linux-2.6.0-test7/fs/nfs/pagelist.c wli-2.6.0-test7-bk1-29/fs/nfs/pagelist.c --- linux-2.6.0-test7/fs/nfs/pagelist.c 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/pagelist.c 2003-10-09 19:27:22.000000000 -0700 @@ -154,26 +154,6 @@ nfs_release_request(struct nfs_page *req } /** - * nfs_release_list - cleanly dispose of an unattached list of page requests - * @list: list of doomed page requests - */ -void -nfs_release_list(struct list_head *list) -{ - while (!list_empty(list)) { - struct nfs_page *req = nfs_list_entry(list); - - nfs_list_remove_request(req); - - page_cache_release(req->wb_page); - - /* Release struct file or cached credential */ - nfs_clear_request(req); - nfs_page_free(req); - } -} - -/** * nfs_list_add_request - Insert a request into a sorted list * @req: request * @head: head of list into which to insert the request. @@ -222,37 +202,6 @@ nfs_wait_on_request(struct nfs_page *req } /** - * nfs_wait_for_reads - wait for outstanding requests to complete - * @head: list of page requests to wait for - */ -int -nfs_wait_for_reads(struct list_head *head) -{ - struct list_head *p = head->next; - unsigned int res = 0; - - while (p != head) { - struct nfs_page *req = nfs_list_entry(p); - int error; - - if (!NFS_WBACK_BUSY(req)) - continue; - - req->wb_count++; - error = nfs_wait_on_request(req); - if (error < 0) - return error; - nfs_list_remove_request(req); - nfs_clear_request(req); - nfs_page_free(req); - - p = head->next; - res++; - } - return res; -} - -/** * nfs_coalesce_requests - Split coalesced requests out from a list. 
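The two lines added to nfs4_xdr_dec_read() carry more weight than they look: whatever a decode routine returns is what rpc_call_sync() eventually hands back, so folding res->count into a successful status is what lets the v4 read path report bytes-read the way v2/v3 already do, which the direct-read code above depends on. Schematically (a simplified illustration, not the full decode path):

/* Return convention: negative errno on failure, byte count on success. */
static int dec_read_status(int nfs_status, int decoded_count)
{
	int status = -nfs_stat_to_errno(nfs_status);	/* 0 when OK */

	if (!status)
		status = decoded_count;	/* propagate bytes read */
	return status;			/* surfaces from rpc_call_sync() */
}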
* @head: source list * @dst: destination list diff -prauN linux-2.6.0-test7/fs/nfs/read.c wli-2.6.0-test7-bk1-29/fs/nfs/read.c --- linux-2.6.0-test7/fs/nfs/read.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/read.c 2003-10-09 19:42:26.000000000 -0700 @@ -308,7 +308,7 @@ nfs_readpage_result(struct rpc_task *tas int nfs_readpage(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", @@ -349,14 +349,14 @@ static int readpage_sync_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - return nfs_readpage_sync(desc->filp, page->mapping->host, page); + return nfs_readpage_sync(desc->filp, page_mapping(page)->host, page); } static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *new; nfs_wb_page(inode, page); diff -prauN linux-2.6.0-test7/fs/nfs/write.c wli-2.6.0-test7-bk1-29/fs/nfs/write.c --- linux-2.6.0-test7/fs/nfs/write.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/nfs/write.c 2003-10-09 19:42:55.000000000 -0700 @@ -224,7 +224,7 @@ nfs_writepage_async(struct file *file, s int nfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; loff_t i_size = i_size_read(inode); @@ -629,7 +629,7 @@ nfs_strategy(struct inode *inode) int nfs_flush_incompatible(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; int status = 0; /* @@ -659,7 +659,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct dentry *dentry = file->f_dentry; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct nfs_page *req; loff_t end; int status = 0; diff -prauN linux-2.6.0-test7/fs/ntfs/aops.c wli-2.6.0-test7-bk1-29/fs/ntfs/aops.c --- linux-2.6.0-test7/fs/ntfs/aops.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/ntfs/aops.c 2003-10-09 19:42:26.000000000 -0700 @@ -55,7 +55,7 @@ static void ntfs_end_buffer_async_read(s int page_uptodate = 1; page = bh->b_page; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (likely(uptodate)) { s64 file_ofs; @@ -176,7 +176,7 @@ static int ntfs_read_block(struct page * int i, nr; unsigned char blocksize_bits; - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); vol = ni->vol; blocksize_bits = VFS_I(ni)->i_blkbits; @@ -359,7 +359,7 @@ int ntfs_readpage(struct file *file, str return 0; } - ni = NTFS_I(page->mapping->host); + ni = NTFS_I(page_mapping(page)->host); if (NInoNonResident(ni)) { /* @@ -473,7 +473,7 @@ static int ntfs_write_block(struct page BOOL need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -500,9 +500,9 @@ static int ntfs_write_block(struct page * buffer's dirty state as-is. 
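From here on, essentially every page->mapping->host in NFS, NTFS, reiserfs, sysv, udf and XFS becomes page_mapping(page)->host. The accessor itself is defined elsewhere in this tree; the indirection only pays off if page->mapping stops being a plain address_space pointer for some pages (anonymous pages being the usual suspects). Under that assumption it would look roughly like this (a guess at the shape, not the tree's actual definition):

/* Assumed shape of the accessor behind the sweeping rename. */
static inline struct address_space *page_mapping(struct page *page)
{
	/* if anon pages overload ->mapping, filter them out so that
	 * file-backed callers never dereference a non-mapping value */
	if (PageAnon(page))
		return NULL;
	return page->mapping;
}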
*/ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and return -EAGAIN instead + // set_page_dirty_nobuffers(page) and return -EAGAIN instead // of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); unlock_page(page); return 0; } @@ -519,12 +519,12 @@ static int ntfs_write_block(struct page iblock = ni->initialized_size >> blocksize_bits; /* - * Be very careful. We have no exclusion from __set_page_dirty_buffers + * Be very careful. We have no exclusion from set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * - * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; + * Buffers outside i_size may be dirtied by set_page_dirty_buffers; * handle that here by just cleaning them. */ @@ -579,7 +579,7 @@ static int ntfs_write_block(struct page // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -734,9 +734,9 @@ lock_retry_remap: * leave its buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove - // the __set_page_dirty_nobuffers(page) and set err to + // the set_page_dirty_nobuffers(page) and set err to // -EAGAIN instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else SetPageError(page); @@ -805,7 +805,7 @@ static int ntfs_writepage(struct page *p BUG_ON(!PageLocked(page)); - vi = page->mapping->host; + vi = page_mapping(page)->host; /* Is the page fully outside i_size? (truncate in progress) */ if (unlikely(page->index >= (vi->i_size + PAGE_CACHE_SIZE - 1) >> @@ -987,9 +987,9 @@ err_out: * buffer's dirty state as-is. */ // FIXME: Once Andrew's -EAGAIN patch goes in, remove the - // __set_page_dirty_nobuffers(page) and set err to -EAGAIN + // set_page_dirty_nobuffers(page) and set err to -EAGAIN // instead of zero. - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " @@ -1024,7 +1024,7 @@ static int ntfs_prepare_nonresident_writ BOOL is_retry; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); vol = ni->vol; @@ -1125,7 +1125,7 @@ static int ntfs_prepare_nonresident_writ // Update initialized size in the attribute and // in the inode. // Again, for each page do: - // __set_page_dirty_buffers(); + // set_page_dirty_buffers(); // page_cache_release() // We don't need to wait on the writes. // Update iblock. @@ -1361,7 +1361,7 @@ err_out: * ntfs_prepare_write - prepare a page for receiving data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has not yet been copied into the @page. 
* * Need to extend the attribute/fill in holes if necessary, create blocks and @@ -1382,7 +1382,7 @@ err_out: static int ntfs_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *vi = page->mapping->host; + struct inode *vi = page_mapping(page)->host; ntfs_inode *ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1491,7 +1491,7 @@ static int ntfs_commit_nonresident_write unsigned int block_start, block_end, blocksize; BOOL partial; - vi = page->mapping->host; + vi = page_mapping(page)->host; ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " "0x%lx, from = %u, to = %u.", vi->i_ino, @@ -1547,7 +1547,7 @@ static int ntfs_commit_nonresident_write * ntfs_commit_write - commit the received data * * This is called from generic_file_write() with i_sem held on the inode - * (@page->mapping->host). The @page is locked and kmap()ped so page_address() + * (@page_mapping(page)->host). The @page is locked and kmap()ped so page_address() * can simply be used. The source data has already been copied into the @page. * * Need to mark modified blocks dirty so they get written out later when @@ -1585,7 +1585,7 @@ static int ntfs_commit_write(struct file u32 attr_len, bytes; int err; - vi = page->mapping->host; + vi = page_mapping(page)->host; ni = NTFS_I(vi); ntfs_debug("Entering for inode %li, attribute type 0x%x, page index " @@ -1758,7 +1758,7 @@ err_out: * Put the page on mapping->dirty_pages, but leave its * buffer's dirty state as-is. */ - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); err = 0; } else ntfs_error(vi->i_sb, "Page is not uptodate. Written " diff -prauN linux-2.6.0-test7/fs/ntfs/compress.c wli-2.6.0-test7-bk1-29/fs/ntfs/compress.c --- linux-2.6.0-test7/fs/ntfs/compress.c 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/ntfs/compress.c 2003-10-09 19:42:26.000000000 -0700 @@ -209,7 +209,7 @@ return_error: /* Second stage: finalize completed pages. 
*/ if (nr_completed_pages > 0) { struct page *page = dest_pages[completed_pages[0]]; - ntfs_inode *ni = NTFS_I(page->mapping->host); + ntfs_inode *ni = NTFS_I(page_mapping(page)->host); for (i = 0; i < nr_completed_pages; i++) { int di = completed_pages[i]; @@ -467,7 +467,7 @@ return_overflow: */ int ntfs_read_compressed_block(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); ntfs_inode *ni = NTFS_I(mapping->host); ntfs_volume *vol = ni->vol; struct super_block *sb = vol->sb; diff -prauN linux-2.6.0-test7/fs/proc/array.c wli-2.6.0-test7-bk1-29/fs/proc/array.c --- linux-2.6.0-test7/fs/proc/array.c 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/array.c 2003-10-09 19:34:30.000000000 -0700 @@ -286,7 +286,7 @@ int proc_pid_status(struct task_struct * return buffer - orig; } -extern unsigned long task_vsize(struct mm_struct *); +unsigned long task_vsize(struct mm_struct *); int proc_pid_stat(struct task_struct *task, char * buffer) { unsigned long vsize, eip, esp, wchan; @@ -310,11 +310,9 @@ int proc_pid_stat(struct task_struct *ta } task_unlock(task); if (mm) { - down_read(&mm->mmap_sem); vsize = task_vsize(mm); eip = KSTK_EIP(task); esp = KSTK_ESP(task); - up_read(&mm->mmap_sem); } wchan = get_wchan(task); @@ -391,20 +389,20 @@ int proc_pid_stat(struct task_struct *ta return res; } -extern int task_statm(struct mm_struct *, int *, int *, int *, int *); +int task_statm(struct mm_struct *, int *, int *, int *, int *, int *, int *); int proc_pid_statm(struct task_struct *task, char *buffer) { - int size = 0, resident = 0, shared = 0, text = 0, lib = 0, data = 0; + int size, resident, shared, text, lib, data, dirty; struct mm_struct *mm = get_task_mm(task); - if (mm) { - down_read(&mm->mmap_sem); - size = task_statm(mm, &shared, &text, &data, &resident); - up_read(&mm->mmap_sem); - + if (!mm) + size = resident = shared = text = lib = data = dirty = 0; + else { + size = task_statm(mm, &shared, &text, &lib, &data, + &resident, &dirty); mmput(mm); } return sprintf(buffer,"%d %d %d %d %d %d %d\n", - size, resident, shared, text, lib, data, 0); + size, resident, shared, text, lib, data, dirty); } diff -prauN linux-2.6.0-test7/fs/proc/base.c wli-2.6.0-test7-bk1-29/fs/proc/base.c --- linux-2.6.0-test7/fs/proc/base.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/base.c 2003-10-09 20:00:01.000000000 -0700 @@ -716,8 +716,6 @@ static int proc_pid_readlink(struct dent struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) goto out; error = proc_check_root(inode); @@ -732,7 +730,6 @@ static int proc_pid_readlink(struct dent dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1624,99 +1621,64 @@ out: } #define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - -/* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. 
- */ -static int get_tgid_list(int index, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - for_each_process(p) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; - } - read_unlock(&tasklist_lock); - return nr_tgids; -} /* * Get a few tid's to return for filldir - we need to hold the * tasklist lock while doing this, and we must release it before * we actually do the filldir itself, so we use a temp buffer.. + * + * Rewrite this flaming bag of shit pronto. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - do { - int tid = task->pid; - if (!pid_alive(task)) - continue; - if (--index >= 0) - continue; - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; -} - /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; + int tgid_array[PROC_MAXPIDS]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; + int k, tgid, nr_tgids; + + pr_debug("proc_pid_readdir: f_pos = %Lx\n", filp->f_pos); if (!nr) { - ino_t ino = fake_ino(0,PROC_TGID_INO); + ino_t ino = fake_ino(0, PROC_TGID_INO); if (filldir(dirent, "self", 4, filp->f_pos, ino, DT_LNK) < 0) return 0; filp->f_pos++; - nr++; + nr = 1; } - nr_tgids = get_tgid_list(nr, tgid_array); - - for (i = 0; i < nr_tgids; i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; - - do buf[--j] = '0' + (tgid % 10); while (tgid/=10); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) - break; - filp->f_pos++; + tgid = nr - 1; + pr_debug("proc_pid_readdir: first tgid = %d\n", tgid); + nr_tgids = find_tgids_after(tgid, tgid_array); + pr_debug("proc_pid_readdir: got %d tgids from find_tgids_after()\n", + nr_tgids); + for (k = 0; k < nr_tgids; ++k) { + loff_t new_f_pos; + ino_t ino; + unsigned long i, j = PROC_NUMBUF; + + tgid = tgid_array[k]; + ino = fake_ino(tgid, PROC_TGID_INO); + pr_debug("proc_pid_readdir: saw tgid %d\n", tgid); + i = tgid; + do buf[--j] = '0' + (i % 10); while (i /= 10); + if (filldir(dirent, buf + j, PROC_NUMBUF - j, + filp->f_pos, ino, DT_DIR) < 0) + break; + new_f_pos = tgid + 1 + FIRST_PROCESS_ENTRY; + if (new_f_pos < filp->f_pos) + pr_debug("proc_pid_readdir: ->f_pos going backward!\n"); + filp->f_pos = max(filp->f_pos, new_f_pos); + pr_debug("proc_pid_readdir: f_pos = %Lx\n", filp->f_pos); } + pr_debug("proc_pid_readdir: return\n"); return 0; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; + int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; @@ -1744,13 +1706,13 @@ static int proc_task_readdir(struct file /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); + nr_tids = find_tids_after(proc_task(inode)->tgid, pos - 2, tid_array); for (i = 0; i < nr_tids; i++) { unsigned long j = 
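The rewritten proc_pid_readdir() above stops counting directory entries and instead encodes the last tgid emitted into f_pos as tgid + 1 + FIRST_PROCESS_ENTRY, so a getdents stream resumes at the right process no matter how many pids were created or reaped between calls. A userspace model of that cursor scheme, with the patch's find_tgids_after() stubbed out over a sorted array (the stub and the constants are illustrative only):

/* Model: restartable /proc iteration keyed by tgid, not entry count. */
#include <stdio.h>

#define PROC_MAXPIDS		20
#define FIRST_PROCESS_ENTRY	256	/* illustrative value */

static const int live_tgids[] = { 1, 7, 42, 999 };

/* stand-in for the patch's find_tgids_after(): tgids above 'tgid' */
static int find_tgids_after(int tgid, int *out)
{
	int i, n = 0;

	for (i = 0; i < (int)(sizeof(live_tgids)/sizeof(*live_tgids)); i++)
		if (live_tgids[i] > tgid && n < PROC_MAXPIDS)
			out[n++] = live_tgids[i];
	return n;
}

int main(void)
{
	long long f_pos = FIRST_PROCESS_ENTRY + 1;	/* just past "self" */
	int batch[PROC_MAXPIDS], n, k;

	while ((n = find_tgids_after(f_pos - 1 - FIRST_PROCESS_ENTRY,
				     batch)) > 0)
		for (k = 0; k < n; k++) {
			printf("emit /proc/%d\n", batch[k]);
			f_pos = batch[k] + 1 + FIRST_PROCESS_ENTRY;
		}
	return 0;
}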
PROC_NUMBUF; int tid = tid_array[i]; - ino = fake_ino(tid,PROC_TID_INO); + ino = fake_ino(tid, PROC_TID_INO); do buf[--j] = '0' + (tid % 10); @@ -1758,7 +1720,7 @@ static int proc_task_readdir(struct file if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) break; - pos++; + pos = tid + 2; } out: filp->f_pos = pos; diff -prauN linux-2.6.0-test7/fs/proc/proc_misc.c wli-2.6.0-test7-bk1-29/fs/proc/proc_misc.c --- linux-2.6.0-test7/fs/proc/proc_misc.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/proc_misc.c 2003-10-09 19:42:26.000000000 -0700 @@ -200,6 +200,7 @@ static int meminfo_read_proc(char *page, "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" "Writeback: %8lu kB\n" + "Deferred: %8lu kB\n" "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "Committed_AS: %8u kB\n" @@ -210,8 +211,8 @@ static int meminfo_read_proc(char *page, K(i.totalram), K(i.freeram), K(i.bufferram), - K(get_page_cache_size()-total_swapcache_pages-i.bufferram), - K(total_swapcache_pages), + K(get_page_cache_size() - i.bufferram - ps.nr_swapcache), + K(ps.nr_swapcache), K(active), K(inactive), K(i.totalhigh), @@ -222,6 +223,7 @@ static int meminfo_read_proc(char *page, K(i.freeswap), K(ps.nr_dirty), K(ps.nr_writeback), + K(nr_deferred_pages()), K(ps.nr_mapped), K(ps.nr_slab), K(committed), diff -prauN linux-2.6.0-test7/fs/proc/root.c wli-2.6.0-test7-bk1-29/fs/proc/root.c --- linux-2.6.0-test7/fs/proc/root.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/root.c 2003-10-09 19:57:40.000000000 -0700 @@ -103,17 +103,12 @@ static int proc_root_readdir(struct file unsigned int nr = filp->f_pos; int ret; - lock_kernel(); - if (nr < FIRST_PROCESS_ENTRY) { int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) { - unlock_kernel(); + if (error <= 0) return error; - } filp->f_pos = FIRST_PROCESS_ENTRY; } - unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); return ret; diff -prauN linux-2.6.0-test7/fs/proc/task_mmu.c wli-2.6.0-test7-bk1-29/fs/proc/task_mmu.c --- linux-2.6.0-test7/fs/proc/task_mmu.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/task_mmu.c 2003-10-09 19:34:30.000000000 -0700 @@ -5,27 +5,6 @@ char *task_mem(struct mm_struct *mm, char *buffer) { - unsigned long data = 0, stack = 0, exec = 0, lib = 0; - struct vm_area_struct *vma; - - down_read(&mm->mmap_sem); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - unsigned long len = (vma->vm_end - vma->vm_start) >> 10; - if (!vma->vm_file) { - data += len; - if (vma->vm_flags & VM_GROWSDOWN) - stack += len; - continue; - } - if (vma->vm_flags & VM_WRITE) - continue; - if (vma->vm_flags & VM_EXEC) { - exec += len; - if (vma->vm_flags & VM_EXECUTABLE) - continue; - lib += len; - } - } buffer += sprintf(buffer, "VmSize:\t%8lu kB\n" "VmLck:\t%8lu kB\n" @@ -37,9 +16,10 @@ char *task_mem(struct mm_struct *mm, cha mm->total_vm << (PAGE_SHIFT-10), mm->locked_vm << (PAGE_SHIFT-10), mm->rss << (PAGE_SHIFT-10), - data - stack, stack, - exec - lib, lib); - up_read(&mm->mmap_sem); + (mm->data - mm->stack) << (PAGE_SHIFT-10), + mm->stack << (PAGE_SHIFT-10), + mm->text << (PAGE_SHIFT-10), + mm->lib << (PAGE_SHIFT-10)); return buffer; } @@ -49,30 +29,15 @@ unsigned long task_vsize(struct mm_struc } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { - struct vm_area_struct *vma; - int size = 0; - + *shared = mm->shared; + *text = mm->text; + *lib = mm->lib; + *data = mm->data; + *dirty = mm->dirty; *resident = 
mm->rss; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - int pages = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; - - size += pages; - if (is_vm_hugetlb_page(vma)) { - if (!(vma->vm_flags & VM_DONTCOPY)) - *shared += pages; - continue; - } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) - *shared += pages; - if (vma->vm_flags & VM_EXECUTABLE) - *text += pages; - else - *data += pages; - } - - return size; + return mm->total_vm; } static int show_map(struct seq_file *m, void *v) diff -prauN linux-2.6.0-test7/fs/proc/task_nommu.c wli-2.6.0-test7-bk1-29/fs/proc/task_nommu.c --- linux-2.6.0-test7/fs/proc/task_nommu.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/proc/task_nommu.c 2003-10-09 19:34:30.000000000 -0700 @@ -67,19 +67,23 @@ unsigned long task_vsize(struct mm_struc struct mm_tblock_struct *tbp; unsigned long vsize = 0; + down_read(&mm->mmap_sem); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->rblock) vsize += kobjsize(tbp->rblock->kblock); } - + up_read(&mm->mmap_sem); return vsize; } int task_statm(struct mm_struct *mm, int *shared, int *text, - int *data, int *resident) + int *lib, int *data, int *resident, int *dirty) { struct mm_tblock_struct *tbp; - int size = kobjsize(mm); + int size; + + down_read(&mm->mmap_sem); + size = kobjsize(mm); for (tbp = &mm->context.tblock; tbp; tbp = tbp->next) { if (tbp->next) @@ -92,8 +96,9 @@ int task_statm(struct mm_struct *mm, int size += (*text = mm->end_code - mm->start_code); size += (*data = mm->start_stack - mm->start_data); - + *shared = *lib = *dirty = 0; *resident = size; + up_read(&mm->mmap_sem); return size; } diff -prauN linux-2.6.0-test7/fs/qnx4/inode.c wli-2.6.0-test7-bk1-29/fs/qnx4/inode.c --- linux-2.6.0-test7/fs/qnx4/inode.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/qnx4/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -434,7 +434,7 @@ static int qnx4_readpage(struct file *fi static int qnx4_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct qnx4_inode_info *qnx4_inode = qnx4_i(page->mapping->host); + struct qnx4_inode_info *qnx4_inode = qnx4_i(page_mapping(page)->host); return cont_prepare_write(page, from, to, qnx4_get_block, &qnx4_inode->mmu_private); } diff -prauN linux-2.6.0-test7/fs/reiserfs/inode.c wli-2.6.0-test7-bk1-29/fs/reiserfs/inode.c --- linux-2.6.0-test7/fs/reiserfs/inode.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/reiserfs/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -2050,7 +2050,7 @@ static void lock_buffer_for_writepage(st lock_buffer(bh); } else { if (test_set_buffer_locked(bh)) { - __set_page_dirty_nobuffers(page); + set_page_dirty_nobuffers(page); return; } } @@ -2069,7 +2069,7 @@ static void lock_buffer_for_writepage(st * code to handle reiserfs tails. 
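Stepping back to the /proc changes a few hunks up: task_mem() and task_statm() no longer walk the VMA list under mmap_sem, they just read counters (mm->shared, mm->text, mm->lib, mm->data, mm->stack, mm->dirty) that the rest of this tree has to keep current as mappings change. Presumably the bookkeeping sits where VMAs are inserted and removed, something like the following (purely hypothetical; none of the maintenance code is in this diff):

/* Hypothetical maintenance site for the counters /proc now reads;
 * npages > 0 on vma insertion, < 0 on removal. */
static void mm_account_vma(struct mm_struct *mm,
			   struct vm_area_struct *vma, long npages)
{
	if (!vma->vm_file) {
		mm->data += npages;
		if (vma->vm_flags & VM_GROWSDOWN)
			mm->stack += npages;
	} else if ((vma->vm_flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) {
		mm->text += npages;
		if (!(vma->vm_flags & VM_EXECUTABLE))
			mm->lib += npages;
	}
}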
*/ static int reiserfs_write_full_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; unsigned long end_index = inode->i_size >> PAGE_CACHE_SHIFT ; int error = 0; unsigned long block ; @@ -2222,7 +2222,7 @@ static int reiserfs_readpage (struct fil static int reiserfs_writepage (struct page * page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; return reiserfs_write_full_page(page, wbc) ; } @@ -2230,7 +2230,7 @@ static int reiserfs_writepage (struct pa int reiserfs_prepare_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; reiserfs_wait_on_write_block(inode->i_sb) ; fix_tail_page_for_writing(page) ; return block_prepare_write(page, from, to, reiserfs_get_block) ; @@ -2243,7 +2243,7 @@ static sector_t reiserfs_aop_bmap(struct static int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; int ret ; @@ -2345,7 +2345,7 @@ void i_attrs_to_sd_attrs( struct inode * */ static int reiserfs_releasepage(struct page *page, int unused_gfp_flags) { - struct inode *inode = page->mapping->host ; + struct inode *inode = page_mapping(page)->host ; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb) ; struct buffer_head *head ; struct buffer_head *bh ; diff -prauN linux-2.6.0-test7/fs/reiserfs/tail_conversion.c wli-2.6.0-test7-bk1-29/fs/reiserfs/tail_conversion.c --- linux-2.6.0-test7/fs/reiserfs/tail_conversion.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/reiserfs/tail_conversion.c 2003-10-09 19:57:40.000000000 -0700 @@ -149,7 +149,7 @@ void reiserfs_unmap_buffer(struct buffer interested in removing it from per-sb j_dirty_buffers list, to avoid BUG() on attempt to write not mapped buffer */ if ( !list_empty(&bh->b_assoc_buffers) && bh->b_page) { - struct inode *inode = bh->b_page->mapping->host; + struct inode *inode = page_mapping(bh->b_page)->host; struct reiserfs_journal *j = SB_JOURNAL(inode->i_sb); spin_lock(&j->j_dirty_buffers_lock); list_del_init(&bh->b_assoc_buffers); diff -prauN linux-2.6.0-test7/fs/romfs/inode.c wli-2.6.0-test7-bk1-29/fs/romfs/inode.c --- linux-2.6.0-test7/fs/romfs/inode.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/romfs/inode.c 2003-10-09 19:42:26.000000000 -0700 @@ -414,7 +414,7 @@ out: unlock_kernel(); static int romfs_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; unsigned long offset, avail, readlen; void *buf; int result = -EIO; diff -prauN linux-2.6.0-test7/fs/smbfs/file.c wli-2.6.0-test7-bk1-29/fs/smbfs/file.c --- linux-2.6.0-test7/fs/smbfs/file.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/smbfs/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -172,7 +172,7 @@ smb_writepage_sync(struct inode *inode, static int smb_writepage(struct page *page, struct writeback_control *wbc) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode; unsigned long end_index; unsigned offset = PAGE_CACHE_SIZE; diff -prauN 
linux-2.6.0-test7/fs/sysv/dir.c wli-2.6.0-test7-bk1-29/fs/sysv/dir.c --- linux-2.6.0-test7/fs/sysv/dir.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/sysv/dir.c 2003-10-09 19:42:26.000000000 -0700 @@ -39,10 +39,10 @@ static inline unsigned long dir_pages(st static int dir_commit_chunk(struct page *page, unsigned from, unsigned to) { - struct inode *dir = (struct inode *)page->mapping->host; + struct inode *dir = (struct inode *)page_mapping(page)->host; int err = 0; - page->mapping->a_ops->commit_write(NULL, page, from, to); + page_mapping(page)->a_ops->commit_write(NULL, page, from, to); if (IS_DIRSYNC(dir)) err = write_one_page(page, 1); else @@ -225,7 +225,7 @@ got_it: from = (char*)de - (char*)page_address(page); to = from + SYSV_DIRSIZE; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) goto out_unlock; memcpy (de->name, name, namelen); @@ -245,7 +245,7 @@ out_unlock: int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); struct inode *inode = (struct inode*)mapping->host; char *kaddr = (char*)page_address(page); unsigned from = (char*)de - kaddr; @@ -347,13 +347,13 @@ not_empty: void sysv_set_link(struct sysv_dir_entry *de, struct page *page, struct inode *inode) { - struct inode *dir = (struct inode*)page->mapping->host; + struct inode *dir = (struct inode*)page_mapping(page)->host; unsigned from = (char *)de-(char*)page_address(page); unsigned to = from + SYSV_DIRSIZE; int err; lock_page(page); - err = page->mapping->a_ops->prepare_write(NULL, page, from, to); + err = page_mapping(page)->a_ops->prepare_write(NULL, page, from, to); if (err) BUG(); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); diff -prauN linux-2.6.0-test7/fs/udf/file.c wli-2.6.0-test7-bk1-29/fs/udf/file.c --- linux-2.6.0-test7/fs/udf/file.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/udf/file.c 2003-10-09 19:42:26.000000000 -0700 @@ -46,7 +46,7 @@ static int udf_adinicb_readpage(struct file *file, struct page * page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -64,7 +64,7 @@ static int udf_adinicb_readpage(struct f static int udf_adinicb_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr; if (!PageLocked(page)) @@ -87,7 +87,7 @@ static int udf_adinicb_prepare_write(str static int udf_adinicb_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; char *kaddr = page_address(page); memcpy(UDF_I_DATA(inode) + UDF_I_LENEATTR(inode) + offset, diff -prauN linux-2.6.0-test7/fs/udf/symlink.c wli-2.6.0-test7-bk1-29/fs/udf/symlink.c --- linux-2.6.0-test7/fs/udf/symlink.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/udf/symlink.c 2003-10-09 19:42:26.000000000 -0700 @@ -80,7 +80,7 @@ static void udf_pc_to_char(struct super_ static int udf_symlink_filler(struct file *file, struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; struct buffer_head *bh = NULL; char *symlink; int err = -EIO; diff -prauN 
linux-2.6.0-test7/fs/xfs/linux/xfs_aops.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_aops.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_aops.c 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_aops.c 2003-10-09 19:42:26.000000000 -0700 @@ -236,7 +236,7 @@ probe_unwritten_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; unsigned long p_offset = 0; @@ -284,7 +284,7 @@ probe_unmapped_page( if (PageWriteback(page)) goto out; - if (page->mapping && PageDirty(page)) { + if (page_mapping(page) && PageDirty(page)) { if (page_has_buffers(page)) { struct buffer_head *bh, *head; @@ -363,7 +363,7 @@ probe_delalloc_page( if (PageWriteback(page)) goto out; - if (page->mapping && page_has_buffers(page)) { + if (page_mapping(page) && page_has_buffers(page)) { struct buffer_head *bh, *head; int acceptable = 0; @@ -984,8 +984,7 @@ linvfs_direct_IO( if (error) return -error; - return blockdev_direct_IO(rw, iocb, inode, - pbmap.pbm_target->pbr_bdev, + return blockdev_direct_IO(rw, iocb, inode, pbmap.pbm_target->pbr_bdev, iov, offset, nr_segs, linvfs_get_blocks_direct, linvfs_unwritten_convert_direct); @@ -1080,7 +1079,7 @@ linvfs_writepage( int error; int need_trans; int delalloc, unmapped, unwritten; - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; /* * We need a transaction if: @@ -1160,7 +1159,7 @@ linvfs_release_page( struct page *page, int gfp_mask) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; int delalloc, unmapped, unwritten; count_page_state(page, &delalloc, &unmapped, &unwritten); diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_file.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_file.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_file.c 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_file.c 2003-10-09 19:27:22.000000000 -0700 @@ -58,29 +58,55 @@ static struct vm_operations_struct linvfs_file_vm_ops; -STATIC ssize_t -linvfs_read( +STATIC inline ssize_t +__linvfs_read( struct kiocb *iocb, char __user *buf, + int ioflags, size_t count, loff_t pos) { struct iovec iov = {buf, count}; + struct file *file = iocb->ki_filp; vnode_t *vp; int error; BUG_ON(iocb->ki_pos != pos); - vp = LINVFS_GET_VP(iocb->ki_filp->f_dentry->d_inode); - VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, NULL, error); + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + vp = LINVFS_GET_VP(file->f_dentry->d_inode); + VOP_READ(vp, iocb, &iov, 1, &iocb->ki_pos, ioflags, NULL, error); return error; } STATIC ssize_t -linvfs_write( +linvfs_read( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, 0, count, pos); +} + +STATIC ssize_t +linvfs_read_invis( + struct kiocb *iocb, + char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_read(iocb, buf, IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_write( struct kiocb *iocb, const char *buf, + int ioflags, size_t count, loff_t pos) { @@ -89,25 +115,48 @@ linvfs_write( struct inode *inode = file->f_dentry->d_inode->i_mapping->host; vnode_t *vp = LINVFS_GET_VP(inode); int error; - int direct = file->f_flags & O_DIRECT; BUG_ON(iocb->ki_pos != pos); - - if (direct) { - VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, NULL, error); + if (unlikely(file->f_flags & O_DIRECT)) { + ioflags |= IO_ISDIRECT; + VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, + 
ioflags, NULL, error); } else { down(&inode->i_sem); - VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, NULL, error); + VOP_WRITE(vp, iocb, &iov, 1, &iocb->ki_pos, + ioflags, NULL, error); up(&inode->i_sem); } - return error; } + STATIC ssize_t -linvfs_readv( +linvfs_write( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, 0, count, pos); +} + +STATIC ssize_t +linvfs_write_invis( + struct kiocb *iocb, + const char __user *buf, + size_t count, + loff_t pos) +{ + return __linvfs_write(iocb, buf, IO_INVIS, count, pos); +} + + +STATIC inline ssize_t +__linvfs_readv( struct file *file, const struct iovec *iov, + int ioflags, unsigned long nr_segs, loff_t *ppos) { @@ -118,7 +167,10 @@ linvfs_readv( init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, NULL, error); + + if (unlikely(file->f_flags & O_DIRECT)) + ioflags |= IO_ISDIRECT; + VOP_READ(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, ioflags, NULL, error); if (-EIOCBQUEUED == error) error = wait_on_sync_kiocb(&kiocb); *ppos = kiocb.ki_pos; @@ -127,25 +179,49 @@ linvfs_readv( } STATIC ssize_t -linvfs_writev( +linvfs_readv( struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) { + return __linvfs_readv(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_readv_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_readv(file, iov, IO_INVIS, nr_segs, ppos); +} + + +STATIC inline ssize_t +__linvfs_writev( + struct file *file, + const struct iovec *iov, + int ioflags, + unsigned long nr_segs, + loff_t *ppos) +{ struct inode *inode = file->f_dentry->d_inode->i_mapping->host; vnode_t *vp = LINVFS_GET_VP(inode); struct kiocb kiocb; int error; - int direct = file->f_flags & O_DIRECT; init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - if (direct) { - VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, NULL, error); + if (unlikely(file->f_flags & O_DIRECT)) { + ioflags |= IO_ISDIRECT; + VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, + ioflags, NULL, error); } else { down(&inode->i_sem); - VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, NULL, error); + VOP_WRITE(vp, &kiocb, iov, nr_segs, &kiocb.ki_pos, + ioflags, NULL, error); up(&inode->i_sem); } if (-EIOCBQUEUED == error) @@ -155,6 +231,27 @@ linvfs_writev( return error; } + +STATIC ssize_t +linvfs_writev( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, 0, nr_segs, ppos); +} + +STATIC ssize_t +linvfs_writev_invis( + struct file *file, + const struct iovec *iov, + unsigned long nr_segs, + loff_t *ppos) +{ + return __linvfs_writev(file, iov, IO_INVIS, nr_segs, ppos); +} + STATIC ssize_t linvfs_sendfile( struct file *filp, @@ -166,8 +263,7 @@ linvfs_sendfile( vnode_t *vp = LINVFS_GET_VP(filp->f_dentry->d_inode); int error; - VOP_SENDFILE(vp, filp, ppos, count, actor, target, NULL, error); - + VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, error); return error; } @@ -260,7 +356,6 @@ linvfs_readdir( return -ENOMEM; uio.uio_iov = &iov; - uio.uio_fmode = filp->f_mode; uio.uio_segflg = UIO_SYSSPACE; curr_offset = filp->f_pos; if (filp->f_pos != 0x7fffffff) @@ -346,7 +441,30 @@ linvfs_ioctl( vnode_t *vp = LINVFS_GET_VP(inode); ASSERT(vp); - VOP_IOCTL(vp, inode, filp, cmd, arg, error); + VOP_IOCTL(vp, inode, filp, 0, cmd, arg, error); + VMODIFY(vp); + + /* NOTE: some of the ioctl's return positive #'s as a + * 
byte count indicating success, such as + * readlink_by_handle. So we don't "sign flip" + * like most other routines. This means true + * errors need to be returned as a negative value. + */ + return error; +} + +STATIC int +linvfs_ioctl_invis( + struct inode *inode, + struct file *filp, + unsigned int cmd, + unsigned long arg) +{ + int error; + vnode_t *vp = LINVFS_GET_VP(inode); + + ASSERT(vp); + VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, arg, error); VMODIFY(vp); /* NOTE: some of the ioctl's return positive #'s as a @@ -396,6 +514,23 @@ struct file_operations linvfs_file_opera .fsync = linvfs_fsync, }; +struct file_operations linvfs_invis_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .readv = linvfs_readv_invis, + .writev = linvfs_writev_invis, + .aio_read = linvfs_read_invis, + .aio_write = linvfs_write_invis, + .sendfile = linvfs_sendfile, + .ioctl = linvfs_ioctl_invis, + .mmap = linvfs_file_mmap, + .open = linvfs_open, + .release = linvfs_release, + .fsync = linvfs_fsync, +}; + + struct file_operations linvfs_dir_operations = { .read = generic_read_dir, .readdir = linvfs_readdir, diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_ioctl.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_ioctl.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_ioctl.c 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_ioctl.c 2003-10-09 19:27:22.000000000 -0700 @@ -373,7 +373,8 @@ xfs_open_by_handle( put_unused_fd(new_fd); return -XFS_ERROR(-PTR_ERR(filp)); } - filp->f_mode |= FINVIS; + if (inode->i_mode & S_IFREG) + filp->f_op = &linvfs_invis_file_operations; fd_install(new_fd, filp); return new_fd; @@ -415,12 +416,11 @@ xfs_readlink_by_handle( auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - auio.uio_fmode = FINVIS; auio.uio_offset = 0; auio.uio_segflg = UIO_USERSPACE; auio.uio_resid = olen; - VOP_READLINK(vp, &auio, NULL, error); + VOP_READLINK(vp, &auio, IO_INVIS, NULL, error); VN_RELE(vp); return (olen - auio.uio_resid); @@ -575,6 +575,7 @@ xfs_ioc_space( bhv_desc_t *bdp, vnode_t *vp, struct file *filp, + int flags, unsigned int cmd, unsigned long arg); @@ -606,6 +607,7 @@ STATIC int xfs_ioc_getbmap( bhv_desc_t *bdp, struct file *filp, + int flags, unsigned int cmd, unsigned long arg); @@ -619,6 +621,7 @@ xfs_ioctl( bhv_desc_t *bdp, struct inode *inode, struct file *filp, + int ioflags, unsigned int cmd, unsigned long arg) { @@ -652,7 +655,7 @@ xfs_ioctl( !capable(CAP_SYS_ADMIN)) return -EPERM; - return xfs_ioc_space(bdp, vp, filp, cmd, arg); + return xfs_ioc_space(bdp, vp, filp, ioflags, cmd, arg); case XFS_IOC_DIOINFO: { struct dioattr da; @@ -703,7 +706,7 @@ xfs_ioctl( case XFS_IOC_GETBMAP: case XFS_IOC_GETBMAPA: - return xfs_ioc_getbmap(bdp, filp, cmd, arg); + return xfs_ioc_getbmap(bdp, filp, ioflags, cmd, arg); case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(bdp, arg); @@ -865,6 +868,7 @@ xfs_ioc_space( bhv_desc_t *bdp, vnode_t *vp, struct file *filp, + int ioflags, unsigned int cmd, unsigned long arg) { @@ -886,7 +890,7 @@ xfs_ioc_space( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) attr_flags |= ATTR_NONBLOCK; - if (filp->f_mode & FINVIS) + if (ioflags & IO_INVIS) attr_flags |= ATTR_DMI; error = xfs_change_file_space(bdp, cmd, &bf, filp->f_pos, @@ -1153,6 +1157,7 @@ STATIC int xfs_ioc_getbmap( bhv_desc_t *bdp, struct file *filp, + int ioflags, unsigned int cmd, unsigned long arg) { @@ -1167,7 +1172,7 @@ xfs_ioc_getbmap( return -XFS_ERROR(EINVAL); iflags = (cmd == XFS_IOC_GETBMAPA ? 
BMV_IF_ATTRFORK : 0); - if (filp->f_mode & FINVIS) + if (ioflags & IO_INVIS) iflags |= BMV_IF_NO_DMAPI_READ; error = xfs_getbmap(bdp, &bm, (struct getbmap *)arg+1, iflags); diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_iops.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_iops.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_iops.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_iops.c 2003-10-09 19:27:22.000000000 -0700 @@ -386,9 +386,8 @@ linvfs_readlink( uio.uio_segflg = UIO_USERSPACE; uio.uio_resid = size; uio.uio_iovcnt = 1; - uio.uio_fmode = 0; - VOP_READLINK(vp, &uio, NULL, error); + VOP_READLINK(vp, &uio, 0, NULL, error); if (error) return -error; @@ -433,10 +432,9 @@ linvfs_follow_link( uio->uio_offset = 0; uio->uio_segflg = UIO_SYSSPACE; uio->uio_resid = MAXNAMELEN; - uio->uio_fmode = 0; uio->uio_iovcnt = 1; - VOP_READLINK(vp, uio, NULL, error); + VOP_READLINK(vp, uio, 0, NULL, error); if (error) { kfree(uio); kfree(link); diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_iops.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_iops.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_iops.h 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_iops.h 2003-10-09 19:27:22.000000000 -0700 @@ -61,6 +61,7 @@ extern struct inode_operations linvfs_di extern struct inode_operations linvfs_symlink_inode_operations; extern struct file_operations linvfs_file_operations; +extern struct file_operations linvfs_invis_file_operations; extern struct file_operations linvfs_dir_operations; extern struct address_space_operations linvfs_aops; diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_linux.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_linux.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_linux.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_linux.h 2003-10-09 19:27:22.000000000 -0700 @@ -195,8 +195,6 @@ static inline void set_buffer_unwritten_ #define MAXPATHLEN 1024 -#define FINVIS 0x0100 /* don't update timestamps - XFS */ - #define MIN(a,b) (min(a,b)) #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_lrw.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_lrw.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_lrw.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_lrw.c 2003-10-09 19:27:22.000000000 -0700 @@ -149,6 +149,7 @@ xfs_read( const struct iovec *iovp, unsigned int segs, loff_t *offset, + int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; @@ -159,8 +160,6 @@ xfs_read( xfs_mount_t *mp; vnode_t *vp; unsigned long seg; - int direct = (file->f_flags & O_DIRECT); - int invisible = (file->f_mode & FINVIS); ip = XFS_BHVTOI(bdp); vp = BHV_TO_VNODE(bdp); @@ -183,7 +182,7 @@ xfs_read( } /* END copy & waste from filemap.c */ - if (direct) { + if (ioflags & IO_ISDIRECT) { pb_target_t *target = (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; @@ -214,7 +213,8 @@ xfs_read( */ xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !invisible) { + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + !(ioflags & IO_INVIS)) { int error; vrwlock_t locktype = VRWLOCK_READ; @@ -226,14 +226,13 @@ xfs_read( } } - /* We need to deal with the iovec case seperately here */ ret = __generic_file_aio_read(iocb, iovp, segs, offset); xfs_iunlock(ip, XFS_IOLOCK_SHARED); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); - if (!invisible) + if (likely(!(ioflags & IO_INVIS))) xfs_ichgtime(ip, XFS_ICHGTIME_ACC); return ret; @@ -244,6 +243,7 @@ xfs_sendfile( bhv_desc_t *bdp, struct file *filp, loff_t *offset, + int ioflags, size_t count, read_actor_t actor, void *target, @@ -254,7 +254,6 @@ xfs_sendfile( xfs_inode_t *ip; xfs_mount_t *mp; vnode_t *vp; - int invisible = (filp->f_mode & FINVIS); ip = XFS_BHVTOI(bdp); vp = BHV_TO_VNODE(bdp); @@ -274,7 +273,9 @@ xfs_sendfile( return -EIO; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && !invisible) { + + if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { vrwlock_t locktype = VRWLOCK_READ; int error; @@ -289,8 +290,7 @@ xfs_sendfile( xfs_iunlock(ip, XFS_IOLOCK_SHARED); XFS_STATS_ADD(xs_read_bytes, ret); - if (!invisible) - xfs_ichgtime(ip, XFS_ICHGTIME_ACC); + xfs_ichgtime(ip, XFS_ICHGTIME_ACC); return ret; } @@ -518,6 +518,7 @@ xfs_write( const struct iovec *iovp, unsigned int segs, loff_t *offset, + int ioflags, cred_t *credp) { struct file *file = iocb->ki_filp; @@ -532,8 +533,6 @@ xfs_write( vnode_t *vp; unsigned long seg; int iolock; - int direct = (file->f_flags & O_DIRECT); - int invisible = (file->f_mode & FINVIS); int eventsent = 0; vrwlock_t locktype; @@ -569,7 +568,7 @@ xfs_write( return -EIO; } - if (direct) { + if (ioflags & IO_ISDIRECT) { pb_target_t *target = (xip->i_d.di_flags & XFS_DIFLAG_REALTIME) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -586,6 +585,7 @@ xfs_write( } xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); + isize = xip->i_d.di_size; limit = XFS_MAXIOFFSET(mp); @@ -608,7 +608,7 @@ start: } if ((DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_WRITE) && - !invisible && !eventsent)) { + !(ioflags & IO_INVIS) && !eventsent)) { loff_t savedsize = *offset; xfs_iunlock(xip, XFS_ILOCK_EXCL); @@ -642,7 +642,7 @@ start: * * We must update xfs' times since revalidate will overcopy xfs. */ - if (size && !invisible) + if (size && !(ioflags & IO_INVIS)) xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); /* @@ -654,7 +654,7 @@ start: * to zero it out up to the new size. */ - if (!direct && (*offset > isize && isize)) { + if (!(ioflags & IO_ISDIRECT) && (*offset > isize && isize)) { error = xfs_zero_eof(BHV_TO_VNODE(bdp), io, *offset, isize, *offset + size); if (error) { @@ -671,9 +671,9 @@ start: * setgid binaries. 
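This is the theme of the whole XFS half of the patch: the FINVIS bit squatting in file->f_mode is gone, replaced by an IO_INVIS value in the new ioflags argument threaded from the linvfs_*_invis file_operations through VOP_READ/VOP_WRITE into xfs_read() and xfs_write(). "Invisible" (DMAPI) I/O differs in exactly two ways: no DMAPI event delivery and no timestamp updates. The gating pattern, distilled with stub declarations standing in for the real XFS machinery:

/* Distillation of the IO_INVIS gates visible in xfs_read() above. */
#define IO_INVIS	0x02		/* illustrative flag value */

struct xfs_inode;
extern int dm_read_event_enabled(struct xfs_inode *ip);
extern void send_dm_read_event(struct xfs_inode *ip);
extern void touch_atime(struct xfs_inode *ip);
extern long generic_read(struct xfs_inode *ip);

long xfs_read_sketch(struct xfs_inode *ip, int ioflags)
{
	long ret;

	if (dm_read_event_enabled(ip) && !(ioflags & IO_INVIS))
		send_dm_read_event(ip);	/* let an HSM stage data back in */

	ret = generic_read(ip);

	if (!(ioflags & IO_INVIS))
		touch_atime(ip);	/* invisible I/O leaves no trace */
	return ret;
}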
*/ - if (((xip->i_d.di_mode & ISUID) || - ((xip->i_d.di_mode & (ISGID | (IEXEC >> 3))) == - (ISGID | (IEXEC >> 3)))) && + if (((xip->i_d.di_mode & S_ISUID) || + ((xip->i_d.di_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP))) && !capable(CAP_FSETID)) { error = xfs_write_clear_setuid(xip); if (error) { @@ -683,14 +683,15 @@ start: } retry: - if (direct) { + if (ioflags & IO_ISDIRECT) { xfs_inval_cached_pages(vp, &xip->i_iocore, *offset, 1, 1); } ret = generic_file_aio_write_nolock(iocb, iovp, segs, offset); if ((ret == -ENOSPC) && - DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && !invisible) { + DM_EVENT_ENABLED(vp->v_vfsp, xip, DM_EVENT_NOSPACE) && + !(ioflags & IO_INVIS)) { xfs_rwunlock(bdp, locktype); error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, @@ -851,7 +852,7 @@ xfs_bmap(bhv_desc_t *bdp, xfs_inode_t *ip = XFS_BHVTOI(bdp); xfs_iocore_t *io = &ip->i_iocore; - ASSERT((ip->i_d.di_mode & IFMT) == IFREG); + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFREG); ASSERT(((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) != 0) == ((ip->i_iocore.io_flags & XFS_IOCORE_RT) != 0)); diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_lrw.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_lrw.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_lrw.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_lrw.h 2003-10-09 19:27:22.000000000 -0700 @@ -56,12 +56,12 @@ extern int xfs_zero_eof(struct vnode *, xfs_fsize_t, xfs_fsize_t); extern ssize_t xfs_read(struct bhv_desc *, struct kiocb *, const struct iovec *, unsigned int, - loff_t *, struct cred *); + loff_t *, int, struct cred *); extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, const struct iovec *, unsigned int, - loff_t *, struct cred *); + loff_t *, int, struct cred *); extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, - loff_t *, size_t, read_actor_t, + loff_t *, int, size_t, read_actor_t, void *, struct cred *); extern int xfs_iomap(struct xfs_iocore *, xfs_off_t, ssize_t, int, diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_super.c wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_super.c --- linux-2.6.0-test7/fs/xfs/linux/xfs_super.c 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_super.c 2003-10-09 19:27:22.000000000 -0700 @@ -373,7 +373,7 @@ STATIC int init_inodecache( void ) { linvfs_inode_cachep = kmem_cache_create("linvfs_icache", - sizeof(vnode_t), 0, + sizeof(vnode_t), 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once, NULL); @@ -579,7 +579,7 @@ linvfs_freeze_fs( if (sb->s_flags & MS_RDONLY) return; VFS_ROOT(vfsp, &vp, error); - VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_FREEZE, 0, error); + VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_FREEZE, 0, error); VN_RELE(vp); } @@ -592,7 +592,7 @@ linvfs_unfreeze_fs( int error; VFS_ROOT(vfsp, &vp, error); - VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, XFS_IOC_THAW, 0, error); + VOP_IOCTL(vp, LINVFS_GET_IP(vp), NULL, 0, XFS_IOC_THAW, 0, error); VN_RELE(vp); } diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_sysctl.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_sysctl.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_sysctl.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_sysctl.h 2003-10-09 19:27:22.000000000 -0700 @@ -47,7 +47,7 @@ typedef struct xfs_sysctl_val { typedef struct xfs_param { xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/ - xfs_sysctl_val_t sgid_inherit; /* Inherit ISGID bit if process' GID + xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID bit if process' GID * is not a 
member of the parent dir * GID */ xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */ diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_vfs.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_vfs.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_vfs.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_vfs.h 2003-10-09 19:27:22.000000000 -0700 @@ -44,8 +44,8 @@ struct xfs_mount_args; typedef struct vfs { u_int vfs_flag; /* flags */ - __kernel_fsid_t vfs_fsid; /* file system ID */ - __kernel_fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ + fsid_t vfs_fsid; /* file system ID */ + fsid_t *vfs_altfsid; /* An ID fixed for life of FS */ bhv_head_t vfs_bh; /* head of vfs behavior chain */ struct super_block *vfs_super; /* Linux superblock structure */ struct task_struct *vfs_sync_task; diff -prauN linux-2.6.0-test7/fs/xfs/linux/xfs_vnode.h wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_vnode.h --- linux-2.6.0-test7/fs/xfs/linux/xfs_vnode.h 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/linux/xfs_vnode.h 2003-10-09 19:27:22.000000000 -0700 @@ -155,9 +155,17 @@ extern u_short vttoif_tab[]; #define VMODIFIED 0x8 /* XFS inode state possibly differs */ /* to the Linux inode state. */ -typedef enum vrwlock { VRWLOCK_NONE, VRWLOCK_READ, - VRWLOCK_WRITE, VRWLOCK_WRITE_DIRECT, - VRWLOCK_TRY_READ, VRWLOCK_TRY_WRITE } vrwlock_t; +/* + * Values for the VOP_RWLOCK and VOP_RWUNLOCK flags parameter. + */ +typedef enum vrwlock { + VRWLOCK_NONE, + VRWLOCK_READ, + VRWLOCK_WRITE, + VRWLOCK_WRITE_DIRECT, + VRWLOCK_TRY_READ, + VRWLOCK_TRY_WRITE +} vrwlock_t; /* * Return values for VOP_INACTIVE. A return value of @@ -182,15 +190,15 @@ typedef enum vchange { typedef int (*vop_open_t)(bhv_desc_t *, struct cred *); typedef ssize_t (*vop_read_t)(bhv_desc_t *, struct kiocb *, const struct iovec *, unsigned int, - loff_t *, struct cred *); + loff_t *, int, struct cred *); typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, const struct iovec *, unsigned int, - loff_t *, struct cred *); + loff_t *, int, struct cred *); typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, - loff_t *, size_t, read_actor_t, + loff_t *, int, size_t, read_actor_t, void *, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, - unsigned int, unsigned long); + int, unsigned int, unsigned long); typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, struct cred *); typedef int (*vop_setattr_t)(bhv_desc_t *, struct vattr *, int, @@ -212,7 +220,8 @@ typedef int (*vop_readdir_t)(bhv_desc_t int *); typedef int (*vop_symlink_t)(bhv_desc_t *, vname_t *, struct vattr *, char *, vnode_t **, struct cred *); -typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, struct cred *); +typedef int (*vop_readlink_t)(bhv_desc_t *, struct uio *, int, + struct cred *); typedef int (*vop_fsync_t)(bhv_desc_t *, int, struct cred *, xfs_off_t, xfs_off_t); typedef int (*vop_inactive_t)(bhv_desc_t *, struct cred *); @@ -284,12 +293,12 @@ typedef struct vnodeops { */ #define _VOP_(op, vp) (*((vnodeops_t *)(vp)->v_fops)->op) -#define VOP_READ(vp,file,iov,segs,offset,cr,rv) \ - rv = _VOP_(vop_read, vp)((vp)->v_fbhv,file,iov,segs,offset,cr) -#define VOP_WRITE(vp,file,iov,segs,offset,cr,rv) \ - rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,cr) -#define VOP_SENDFILE(vp,f,off,cnt,act,targ,cr,rv) \ - rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,cnt,act,targ,cr) +#define VOP_READ(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_read, 
vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_WRITE(vp,file,iov,segs,offset,ioflags,cr,rv) \ + rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) +#define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ + rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ @@ -318,8 +327,8 @@ typedef struct vnodeops { rv = _VOP_(vop_readdir, vp)((vp)->v_fbhv,uiop,cr,eofp) #define VOP_SYMLINK(dvp,d,vap,tnm,vpp,cr,rv) \ rv = _VOP_(vop_symlink, dvp) ((dvp)->v_fbhv,d,vap,tnm,vpp,cr) -#define VOP_READLINK(vp,uiop,cr,rv) \ - rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,cr) +#define VOP_READLINK(vp,uiop,fl,cr,rv) \ + rv = _VOP_(vop_readlink, vp)((vp)->v_fbhv,uiop,fl,cr) #define VOP_FSYNC(vp,f,cr,b,e,rv) \ rv = _VOP_(vop_fsync, vp)((vp)->v_fbhv,f,cr,b,e) #define VOP_INACTIVE(vp, cr, rv) \ @@ -366,15 +375,20 @@ typedef struct vnodeops { */ #define VOP_FLUSH_PAGES(vp, first, last, flags, fiopt, rv) \ rv = _VOP_(vop_flush_pages, vp)((vp)->v_fbhv,first,last,flags,fiopt) -#define VOP_IOCTL(vp, inode, filp, cmd, arg, rv) \ - rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,cmd,arg) +#define VOP_IOCTL(vp, inode, filp, fl, cmd, arg, rv) \ + rv = _VOP_(vop_ioctl, vp)((vp)->v_fbhv,inode,filp,fl,cmd,arg) #define VOP_IFLUSH(vp, flags, rv) \ rv = _VOP_(vop_iflush, vp)((vp)->v_fbhv, flags) /* - * Flags for VOP_IFLUSH call + * Flags for read/write calls - same values as IRIX */ +#define IO_ISDIRECT 0x00004 /* bypass page cache */ +#define IO_INVIS 0x00020 /* don't update inode timestamps */ +/* + * Flags for VOP_IFLUSH call + */ #define FLUSH_SYNC 1 /* wait for flush to complete */ #define FLUSH_INODE 2 /* flush the inode itself */ #define FLUSH_LOG 4 /* force the last log entry for diff -prauN linux-2.6.0-test7/fs/xfs/quota/xfs_qm.c wli-2.6.0-test7-bk1-29/fs/xfs/quota/xfs_qm.c --- linux-2.6.0-test7/fs/xfs/quota/xfs_qm.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/quota/xfs_qm.c 2003-10-09 19:27:22.000000000 -0700 @@ -1433,7 +1433,7 @@ xfs_qm_qino_alloc( } memset(&zerocr, 0, sizeof(zerocr)); - if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, IFREG, 1, 0, + if ((error = xfs_dir_ialloc(&tp, mp->m_rootip, S_IFREG, 1, 0, &zerocr, 0, 1, ip, &committed))) { xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); diff -prauN linux-2.6.0-test7/fs/xfs/support/move.c wli-2.6.0-test7-bk1-29/fs/xfs/support/move.c --- linux-2.6.0-test7/fs/xfs/support/move.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/support/move.c 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. 
* * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -45,30 +45,28 @@ int uio_read(caddr_t src, size_t len, struct uio *uio) { - struct iovec *iov; - u_int cnt; - int error; - - if (len > 0 && uio->uio_resid) { - iov = uio->uio_iov; - cnt = (u_int)iov->iov_len; - if (cnt == 0) - return 0; - if (cnt > len) - cnt = (u_int)len; - if (uio->uio_segflg == UIO_USERSPACE) { - error = copy_to_user(iov->iov_base, src, cnt); - if (error) - return EFAULT; - } else if (uio->uio_segflg == UIO_SYSSPACE) { - memcpy(iov->iov_base, src, cnt); - } else { - ASSERT(0); - } - iov->iov_base = (void *)((char *)iov->iov_base + cnt); - iov->iov_len -= cnt; - uio->uio_resid -= cnt; - uio->uio_offset += cnt; + size_t count; + + if (!len || !uio->uio_resid) + return 0; + + count = uio->uio_iov->iov_len; + if (!count) + return 0; + if (count > len) + count = len; + + if (uio->uio_segflg == UIO_USERSPACE) { + if (copy_to_user(uio->uio_iov->iov_base, src, count)) + return EFAULT; + } else { + ASSERT(uio->uio_segflg == UIO_SYSSPACE); + memcpy(uio->uio_iov->iov_base, src, count); } + + uio->uio_iov->iov_base = (void*)((char*)uio->uio_iov->iov_base + count); + uio->uio_iov->iov_len -= count; + uio->uio_offset += count; + uio->uio_resid -= count; return 0; } diff -prauN linux-2.6.0-test7/fs/xfs/support/move.h wli-2.6.0-test7-bk1-29/fs/xfs/support/move.h --- linux-2.6.0-test7/fs/xfs/support/move.h 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/support/move.h 2003-10-09 19:27:22.000000000 -0700 @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000-2002 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2000-2003 Silicon Graphics, Inc. All Rights Reserved. * * This program is free software; you can redistribute it and/or modify it * under the terms of version 2 of the GNU General Public License as @@ -28,32 +28,60 @@ * For further information regarding this notice, see: * * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/ + * + * Portions Copyright (c) 1982, 1986, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ - #ifndef __XFS_SUPPORT_MOVE_H__ #define __XFS_SUPPORT_MOVE_H__ #include #include -typedef struct iovec iovec_t; - -typedef struct uio { - iovec_t *uio_iov; /* pointer to array of iovecs */ - int uio_iovcnt; /* number of iovecs */ - int uio_fmode; /* file mode flags */ - xfs_off_t uio_offset; /* file offset */ - short uio_segflg; /* address space (kernel or user) */ - ssize_t uio_resid; /* residual count */ -} uio_t; +/* Segment flag values. */ +enum uio_seg { + UIO_USERSPACE, /* from user data space */ + UIO_SYSSPACE, /* from system space */ +}; + +struct uio { + struct iovec *uio_iov; + int uio_iovcnt; + xfs_off_t uio_offset; + int uio_resid; + enum uio_seg uio_segflg; +}; -/* - * Segment flag values. - */ -typedef enum uio_seg { - UIO_USERSPACE, /* uio_iov describes user space */ - UIO_SYSSPACE, /* uio_iov describes system space */ -} uio_seg_t; +typedef struct uio uio_t; +typedef struct iovec iovec_t; extern int uio_read (caddr_t, size_t, uio_t *); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_attr.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_attr.c --- linux-2.6.0-test7/fs/xfs/xfs_attr.c 2003-10-08 12:24:14.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_attr.c 2003-10-09 19:27:22.000000000 -0700 @@ -143,7 +143,7 @@ xfs_attr_get_int(xfs_inode_t *ip, char * /* * Do we answer them, or ignore them? */ - if ((error = xfs_iaccess(ip, IREAD, cred))) { + if ((error = xfs_iaccess(ip, S_IRUSR, cred))) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return(XFS_ERROR(error)); } @@ -239,7 +239,7 @@ xfs_attr_set(bhv_desc_t *bdp, char *name return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((error = xfs_iaccess(dp, IWRITE, cred))) { + if ((error = xfs_iaccess(dp, S_IWUSR, cred))) { xfs_iunlock(dp, XFS_ILOCK_SHARED); return(XFS_ERROR(error)); } @@ -498,7 +498,7 @@ xfs_attr_remove(bhv_desc_t *bdp, char *n return (EIO); xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((error = xfs_iaccess(dp, IWRITE, cred))) { + if ((error = xfs_iaccess(dp, S_IWUSR, cred))) { xfs_iunlock(dp, XFS_ILOCK_SHARED); return(XFS_ERROR(error)); } else if (XFS_IFORK_Q(dp) == 0 || @@ -687,7 +687,7 @@ xfs_attr_list(bhv_desc_t *bdp, char *buf * Do they have permission? */ xfs_ilock(dp, XFS_ILOCK_SHARED); - if ((error = xfs_iaccess(dp, IREAD, cred))) { + if ((error = xfs_iaccess(dp, S_IRUSR, cred))) { xfs_iunlock(dp, XFS_ILOCK_SHARED); return(XFS_ERROR(error)); } diff -prauN linux-2.6.0-test7/fs/xfs/xfs_bmap.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_bmap.c --- linux-2.6.0-test7/fs/xfs/xfs_bmap.c 2003-10-08 12:24:46.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_bmap.c 2003-10-09 19:27:22.000000000 -0700 @@ -521,7 +521,7 @@ xfs_bmap_add_attrfork_local( if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) return 0; - if ((ip->i_d.di_mode & IFMT) == IFDIR) { + if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { mp = ip->i_mount; memset(&dargs, 0, sizeof(dargs)); dargs.dp = ip; @@ -3354,7 +3354,7 @@ xfs_bmap_local_to_extents( * We don't want to deal with the case of keeping inode data inline yet. 
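The rewritten uio_read() above consumes at most one iovec per call, advances the uio cursor, and returns 0 or EFAULT. A hypothetical kernel-space usage sketch of that contract (the function and buffer names are invented):

	/* Illustration only: copy up to 'len' bytes from 'src' (kernel
	 * memory) through a one-iovec uio, as the new uio_read() expects. */
	static int copy_via_uio(caddr_t src, size_t len)
	{
		char		buf[64];
		struct iovec	iov = { .iov_base = buf, .iov_len = sizeof(buf) };
		struct uio	uio = {
			.uio_iov	= &iov,
			.uio_iovcnt	= 1,
			.uio_offset	= 0,
			.uio_resid	= sizeof(buf),
			.uio_segflg	= UIO_SYSSPACE,	/* kernel destination */
		};

		return uio_read(src, len, &uio);	/* 0 on success, EFAULT on fault */
	}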
* So sending the data fork of a regular inode is invalid. */ - ASSERT(!((ip->i_d.di_mode & IFMT) == IFREG && + ASSERT(!((ip->i_d.di_mode & S_IFMT) == S_IFREG && whichfork == XFS_DATA_FORK)); ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_buf_item.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_buf_item.c --- linux-2.6.0-test7/fs/xfs/xfs_buf_item.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_buf_item.c 2003-10-09 19:27:22.000000000 -0700 @@ -162,6 +162,7 @@ xfs_buf_item_log_check( #endif STATIC void xfs_buf_error_relse(xfs_buf_t *bp); +STATIC void xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip); /* * This returns the number of log iovecs needed to log the @@ -417,22 +418,25 @@ xfs_buf_item_unpin( ASSERT(XFS_BUF_VALUSEMA(bp) <= 0); ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); ASSERT(XFS_BUF_ISSTALE(bp)); -/** - ASSERT(bp->b_pincount == 0); -**/ ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL); xfs_buf_item_trace("UNPIN STALE", bip); xfs_buftrace("XFS_UNPIN STALE", bp); - AIL_LOCK(mp,s); /* * If we get called here because of an IO error, we may * or may not have the item on the AIL. xfs_trans_delete_ail() * will take care of that situation. * xfs_trans_delete_ail() drops the AIL lock. */ - xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); - xfs_buf_item_relse(bp); - ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + if (bip->bli_flags & XFS_BLI_STALE_INODE) { + xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip); + XFS_BUF_FSPRIVATE(bp, void *) = NULL; + XFS_BUF_CLR_IODONE_FUNC(bp); + } else { + AIL_LOCK(mp,s); + xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip, s); + xfs_buf_item_relse(bp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL); + } xfs_buf_relse(bp); } } diff -prauN linux-2.6.0-test7/fs/xfs/xfs_buf_item.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_buf_item.h --- linux-2.6.0-test7/fs/xfs/xfs_buf_item.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_buf_item.h 2003-10-09 19:27:22.000000000 -0700 @@ -96,6 +96,7 @@ typedef struct xfs_buf_log_format_t { #define XFS_BLI_STALE 0x04 #define XFS_BLI_LOGGED 0x08 #define XFS_BLI_INODE_ALLOC_BUF 0x10 +#define XFS_BLI_STALE_INODE 0x20 #ifdef __KERNEL__ @@ -130,7 +131,7 @@ typedef struct xfs_buf_log_item { * items which have been canceled and should not be replayed. 
*/ typedef struct xfs_buf_cancel { - xfs_daddr_t bc_blkno; + xfs_daddr_t bc_blkno; uint bc_len; int bc_refcount; struct xfs_buf_cancel *bc_next; diff -prauN linux-2.6.0-test7/fs/xfs/xfs_clnt.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_clnt.h --- linux-2.6.0-test7/fs/xfs/xfs_clnt.h 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_clnt.h 2003-10-09 19:27:22.000000000 -0700 @@ -99,5 +99,6 @@ struct xfs_mount_args { #define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */ #define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */ #define XFSMNT_NOLOGFLUSH 0x04000000 /* Don't flush for log blocks */ +#define XFSMNT_IDELETE 0x08000000 /* inode cluster delete */ #endif /* __XFS_CLNT_H__ */ diff -prauN linux-2.6.0-test7/fs/xfs/xfs_dfrag.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dfrag.c --- linux-2.6.0-test7/fs/xfs/xfs_dfrag.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dfrag.c 2003-10-09 19:27:22.000000000 -0700 @@ -154,12 +154,12 @@ xfs_swapext( goto error0; } if ((current->fsuid != ip->i_d.di_uid) && - (error = xfs_iaccess(ip, IWRITE, NULL)) && + (error = xfs_iaccess(ip, S_IWUSR, NULL)) && !capable_cred(NULL, CAP_FOWNER)) { goto error0; } if ((current->fsuid != tip->i_d.di_uid) && - (error = xfs_iaccess(tip, IWRITE, NULL)) && + (error = xfs_iaccess(tip, S_IWUSR, NULL)) && !capable_cred(NULL, CAP_FOWNER)) { goto error0; } diff -prauN linux-2.6.0-test7/fs/xfs/xfs_dinode.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dinode.h --- linux-2.6.0-test7/fs/xfs/xfs_dinode.h 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dinode.h 2003-10-09 19:27:22.000000000 -0700 @@ -107,7 +107,7 @@ typedef struct xfs_dinode xfs_dir_shortform_t di_dirsf; /* shortform directory */ xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */ char di_c[1]; /* local contents */ - xfs_dev_t di_dev; /* device for IFCHR/IFBLK */ + xfs_dev_t di_dev; /* device for S_IFCHR/S_IFBLK */ uuid_t di_muuid; /* mount point value */ char di_symlink[1]; /* local symbolic link */ } di_u; @@ -436,25 +436,6 @@ void xfs_dfork_next_set(xfs_dinode_t *di #endif -/* - * File types (mode field) - */ -#define IFMT S_IFMT -#define IFSOCK S_IFSOCK -#define IFLNK S_IFLNK -#define IFREG S_IFREG -#define IFBLK S_IFBLK -#define IFDIR S_IFDIR -#define IFCHR S_IFCHR -#define IFIFO S_IFIFO - -#define ISUID S_ISUID -#define ISGID S_ISGID -#define ISVTX S_ISVTX -#define IREAD S_IRUSR -#define IWRITE S_IWUSR -#define IEXEC S_IXUSR - #if XFS_WANT_FUNCS || (XFS_WANT_SPACE && XFSSO_XFS_BUF_TO_DINODE) xfs_dinode_t *xfs_buf_to_dinode(struct xfs_buf *bp); #define XFS_BUF_TO_DINODE(bp) xfs_buf_to_dinode(bp) diff -prauN linux-2.6.0-test7/fs/xfs/xfs_dir.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dir.c --- linux-2.6.0-test7/fs/xfs/xfs_dir.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dir.c 2003-10-09 19:27:22.000000000 -0700 @@ -216,7 +216,7 @@ xfs_dir_isempty(xfs_inode_t *dp) { xfs_dir_sf_hdr_t *hdr; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if (dp->i_d.di_size == 0) return(1); if (dp->i_d.di_size > XFS_IFORK_DSIZE(dp)) @@ -238,7 +238,7 @@ xfs_dir_init(xfs_trans_t *trans, xfs_ino args.dp = dir; args.trans = trans; - ASSERT((dir->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dir->i_d.di_mode & S_IFMT) == S_IFDIR); if ((error = xfs_dir_ino_validate(trans->t_mountp, parent_dir->i_ino))) return error; @@ -257,7 +257,7 @@ xfs_dir_createname(xfs_trans_t *trans, x xfs_da_args_t args; int retval, newsize, done; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + 
ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) return (retval); @@ -321,7 +321,7 @@ xfs_dir_canenter(xfs_trans_t *trans, xfs xfs_da_args_t args; int retval, newsize; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); /* * Fill in the arg structure for this request. */ @@ -366,7 +366,7 @@ xfs_dir_removename(xfs_trans_t *trans, x xfs_da_args_t args; int count, totallen, newsize, retval; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_remove); /* * Fill in the arg structure for this request. @@ -409,7 +409,7 @@ xfs_dir_lookup(xfs_trans_t *trans, xfs_i xfs_da_args_t args; int retval; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_lookup); /* @@ -455,7 +455,7 @@ xfs_dir_getdents(xfs_trans_t *trans, xfs xfs_dir_put_t put; XFS_STATS_INC(xs_dir_getdents); - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); /* * If our caller has given us a single contiguous memory buffer, @@ -499,7 +499,7 @@ xfs_dir_replace(xfs_trans_t *trans, xfs_ xfs_da_args_t args; int retval; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if ((retval = xfs_dir_ino_validate(trans->t_mountp, inum))) return retval; @@ -545,7 +545,7 @@ xfs_dir_shortform_validate_ondisk(xfs_mo - if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & IFMT) != IFDIR) { + if ((INT_GET(dp->di_core.di_mode, ARCH_CONVERT) & S_IFMT) != S_IFDIR) { return 0; } if (INT_GET(dp->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_LOCAL) { diff -prauN linux-2.6.0-test7/fs/xfs/xfs_dir2.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dir2.c --- linux-2.6.0-test7/fs/xfs/xfs_dir2.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_dir2.c 2003-10-09 19:27:22.000000000 -0700 @@ -155,7 +155,7 @@ xfs_dir2_isempty( { xfs_dir2_sf_t *sfp; /* shortform directory structure */ - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); /* * Might happen during shutdown. */ @@ -183,7 +183,7 @@ xfs_dir2_init( memset((char *)&args, 0, sizeof(args)); args.dp = dp; args.trans = tp; - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if ((error = xfs_dir_ino_validate(tp->t_mountp, pdp->i_ino))) { return error; } @@ -208,7 +208,7 @@ xfs_dir2_createname( int rval; /* return value */ int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) { return rval; } @@ -261,7 +261,7 @@ xfs_dir2_lookup( int rval; /* return value */ int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_lookup); /* @@ -319,7 +319,7 @@ xfs_dir2_removename( int rval; /* return value */ int v; /* type-checking value */ - ASSERT((dp->i_d.di_mode & IFMT) == IFDIR); + ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); XFS_STATS_INC(xs_dir_remove); /* * Fill in the arg structure for this request. 
@@ -369,7 +369,7 @@ xfs_dir2_getdents(
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
 
-	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	XFS_STATS_INC(xs_dir_getdents);
 	/*
 	 * If our caller has given us a single contiguous aligned memory buffer,
@@ -422,7 +422,7 @@ xfs_dir2_replace(
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
 
-	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 
 	if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) {
 		return rval;
@@ -473,7 +473,7 @@ xfs_dir2_canenter(
 	int		rval;		/* return value */
 	int		v;		/* type-checking value */
 
-	ASSERT((dp->i_d.di_mode & IFMT) == IFDIR);
+	ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
 	/*
 	 * Fill in the arg structure for this request.
 	 */
diff -prauN linux-2.6.0-test7/fs/xfs/xfs_ialloc.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc.c
--- linux-2.6.0-test7/fs/xfs/xfs_ialloc.c	2003-10-08 12:24:14.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc.c	2003-10-09 19:27:22.000000000 -0700
@@ -57,6 +57,7 @@
 #include "xfs_bit.h"
 #include "xfs_rtalloc.h"
 #include "xfs_error.h"
+#include "xfs_bmap.h"
 /*
  * Log specified fields for the inode given by bp and off.
  */
@@ -921,7 +922,10 @@ error0:
 int
 xfs_difree(
 	xfs_trans_t	*tp,		/* transaction pointer */
-	xfs_ino_t	inode)		/* inode to be freed */
+	xfs_ino_t	inode,		/* inode to be freed */
+	xfs_bmap_free_t	*flist,		/* extents to free */
+	int		*delete,	/* set if inode cluster was deleted */
+	xfs_ino_t	*first_ino)	/* first inode in deleted cluster */
 {
 	/* REFERENCED */
 	xfs_agblock_t	agbno;	/* block number containing inode */
@@ -932,6 +936,7 @@ xfs_difree(
 	xfs_btree_cur_t	*cur;	/* inode btree cursor */
 	int		error;	/* error return value */
 	int		i;	/* result code */
+	int		ilen;	/* inodes in an inode cluster */
 	xfs_mount_t	*mp;	/* mount structure for filesystem */
 	int		off;	/* offset of inode in inode chunk */
 	xfs_inobt_rec_t	rec;	/* btree record */
@@ -995,10 +1000,11 @@ xfs_difree(
 		if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino,
 				&rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT)))
 			goto error0;
-		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
-		freecount += rec.ir_freecount;
-		if ((error = xfs_inobt_increment(cur, 0, &i)))
-			goto error0;
+		if (i) {
+			freecount += rec.ir_freecount;
+			if ((error = xfs_inobt_increment(cur, 0, &i)))
+				goto error0;
+		}
 	} while (i == 1);
 	ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) ||
 		XFS_FORCED_SHUTDOWN(mp));
@@ -1033,20 +1039,60 @@ xfs_difree(
 	 */
 	XFS_INOBT_SET_FREE(&rec, off, ARCH_NOCONVERT);
 	rec.ir_freecount++;
-	if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) {
-		cmn_err(CE_WARN,
-			"xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.",
-			error, mp->m_fsname);
-		goto error0;
-	}
+
 	/*
-	 * Change the inode free counts and log the ag/sb changes.
+	 * When an inode cluster is free, it becomes eligible for removal.
 	 */
-	INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1);
-	xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
-	down_read(&mp->m_peraglock);
-	mp->m_perag[agno].pagi_freecount++;
-	up_read(&mp->m_peraglock);
+	if ((mp->m_flags & XFS_MOUNT_IDELETE) &&
+	    (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
+
+		*delete = 1;
+		*first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
+
+		/*
+		 * Remove the inode cluster from the AGI B+Tree, adjust the
+		 * AGI and Superblock inode counts, and mark the disk space
+		 * to be freed when the transaction is committed.
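The branch added here only fires when cluster deletion is enabled at mount time and every inode in the chunk is free. A hypothetical predicate capturing just that test (not part of the patch itself):

	/* Illustration only: an inode cluster may be reclaimed when the
	 * filesystem was mounted with XFS_MOUNT_IDELETE set (the mount-flag
	 * counterpart of XFSMNT_IDELETE from xfs_clnt.h) and the inobt
	 * record shows every inode in the chunk free. */
	static inline int cluster_deletable(xfs_mount_t *mp, xfs_inobt_rec_t *rec)
	{
		return (mp->m_flags & XFS_MOUNT_IDELETE) &&
		       (rec->ir_freecount == XFS_IALLOC_INODES(mp));
	}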
+ */ + ilen = XFS_IALLOC_INODES(mp); + INT_MOD(agi->agi_count, ARCH_CONVERT, -ilen); + INT_MOD(agi->agi_freecount, ARCH_CONVERT, -(ilen - 1)); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); + down_read(&mp->m_peraglock); + mp->m_perag[agno].pagi_freecount -= ilen - 1; + up_read(&mp->m_peraglock); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1)); + + if ((error = xfs_inobt_delete(cur, &i))) { + cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n", + error, mp->m_fsname); + goto error0; + } + + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, + agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)), + XFS_IALLOC_BLOCKS(mp), flist, mp); + } else { + *delete = 0; + + if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { + cmn_err(CE_WARN, + "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", + error, mp->m_fsname); + goto error0; + } + /* + * Change the inode free counts and log the ag/sb changes. + */ + INT_MOD(agi->agi_freecount, ARCH_CONVERT, 1); + xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); + down_read(&mp->m_peraglock); + mp->m_perag[agno].pagi_freecount++; + up_read(&mp->m_peraglock); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); + } + #ifdef DEBUG if (cur->bc_nlevels == 1) { int freecount = 0; @@ -1054,20 +1100,23 @@ xfs_difree( if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) goto error0; do { - if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, - &rec.ir_freecount, &rec.ir_free, &i, ARCH_NOCONVERT))) - goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - freecount += rec.ir_freecount; - if ((error = xfs_inobt_increment(cur, 0, &i))) + if ((error = xfs_inobt_get_rec(cur, + &rec.ir_startino, + &rec.ir_freecount, + &rec.ir_free, &i, + ARCH_NOCONVERT))) goto error0; + if (i) { + freecount += rec.ir_freecount; + if ((error = xfs_inobt_increment(cur, 0, &i))) + goto error0; + } } while (i == 1); ASSERT(freecount == INT_GET(agi->agi_freecount, ARCH_CONVERT) || XFS_FORCED_SHUTDOWN(mp)); } #endif xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); - xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); return 0; error0: @@ -1114,7 +1163,7 @@ xfs_dilocate( agbno = XFS_AGINO_TO_AGBNO(mp, agino); if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks || ino != XFS_AGINO_TO_INO(mp, agno, agino)) { -#if 0 +#ifdef DEBUG if (agno >= mp->m_sb.sb_agcount) { xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: agno (%d) >= " diff -prauN linux-2.6.0-test7/fs/xfs/xfs_ialloc.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc.h --- linux-2.6.0-test7/fs/xfs/xfs_ialloc.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc.h 2003-10-09 19:27:22.000000000 -0700 @@ -134,7 +134,10 @@ xfs_dialloc( int /* error */ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ - xfs_ino_t inode); /* inode to be freed */ + xfs_ino_t inode, /* inode to be freed */ + struct xfs_bmap_free *flist, /* extents to free */ + int *delete, /* set if inode cluster was deleted */ + xfs_ino_t *first_ino); /* first inode in deleted cluster */ /* * Return the location of the inode in bno/len/off, diff -prauN linux-2.6.0-test7/fs/xfs/xfs_ialloc_btree.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc_btree.c --- linux-2.6.0-test7/fs/xfs/xfs_ialloc_btree.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc_btree.c 2003-10-09 19:27:22.000000000 -0700 @@ -49,6 +49,7 @@ #include "xfs_btree.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" +#include 
"xfs_error.h" /* * Inode allocation management for XFS. @@ -73,7 +74,6 @@ STATIC int xfs_inobt_updkey(xfs_btree_cu * Internal functions. */ -#ifdef _NOTYET_ /* * Single level of the xfs_inobt_delete record deletion routine. * Delete record pointed to by cur/level. @@ -87,8 +87,7 @@ xfs_inobt_delrec( int *stat) /* fail/done/go-on */ { xfs_buf_t *agbp; /* buffer for a.g. inode header */ - xfs_agnumber_t agfbno; /* agf block of freed btree block */ - xfs_buf_t *agfbp; /* bp of agf block of freed block */ + xfs_mount_t *mp; /* mount structure */ xfs_agi_t *agi; /* allocation group inode header */ xfs_inobt_block_t *block; /* btree block record/key lives in */ xfs_agblock_t bno; /* btree block number */ @@ -96,15 +95,15 @@ xfs_inobt_delrec( int error; /* error return value */ int i; /* loop index */ xfs_inobt_key_t key; /* kp points here if block is level 0 */ - xfs_inobt_key_t *kp; /* pointer to btree keys */ + xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */ xfs_agblock_t lbno; /* left block's block number */ xfs_buf_t *lbp; /* left block's buffer pointer */ xfs_inobt_block_t *left; /* left btree block */ xfs_inobt_key_t *lkp; /* left block key pointer */ xfs_inobt_ptr_t *lpp; /* left block address pointer */ - int lrecs; /* number of records in left block */ + int lrecs = 0; /* number of records in left block */ xfs_inobt_rec_t *lrp; /* left block record pointer */ - xfs_inobt_ptr_t *pp; /* pointer to btree addresses */ + xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */ int ptr; /* index in btree block for this rec */ xfs_agblock_t rbno; /* right block's block number */ xfs_buf_t *rbp; /* right block's buffer pointer */ @@ -112,10 +111,12 @@ xfs_inobt_delrec( xfs_inobt_key_t *rkp; /* right block key pointer */ xfs_inobt_rec_t *rp; /* pointer to btree records */ xfs_inobt_ptr_t *rpp; /* right block address pointer */ - int rrecs; /* number of records in right block */ + int rrecs = 0; /* number of records in right block */ + int numrecs; xfs_inobt_rec_t *rrp; /* right block record pointer */ xfs_btree_cur_t *tcur; /* temporary btree cursor */ + mp = cur->bc_mp; /* * Get the index of the entry being deleted, check for nothing there. @@ -125,19 +126,22 @@ xfs_inobt_delrec( *stat = 0; return 0; } + /* * Get the buffer & block containing the record or key/ptr. */ bp = cur->bc_bufs[level]; block = XFS_BUF_TO_INOBT_BLOCK(bp); #ifdef DEBUG - if (error = xfs_btree_check_sblock(cur, block, level, bp)) + if ((error = xfs_btree_check_sblock(cur, block, level, bp))) return error; #endif /* * Fail if we're off the end of the block. 
*/ - if (ptr > INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); + if (ptr > numrecs) { *stat = 0; return 0; } @@ -150,18 +154,18 @@ xfs_inobt_delrec( kp = XFS_INOBT_KEY_ADDR(block, 1, cur); pp = XFS_INOBT_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = ptr; i < INT_GET(block->bb_numrecs, ARCH_CONVERT); i++) { - if (error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level)) + for (i = ptr; i < numrecs; i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i], ARCH_CONVERT), level))) return error; } #endif - if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (ptr < numrecs) { memmove(&kp[ptr - 1], &kp[ptr], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*kp)); + (numrecs - ptr) * sizeof(*kp)); memmove(&pp[ptr - 1], &pp[ptr], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*pp)); - xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); - xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + (numrecs - ptr) * sizeof(*kp)); + xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1); + xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1); } } /* @@ -170,24 +174,25 @@ xfs_inobt_delrec( */ else { rp = XFS_INOBT_REC_ADDR(block, 1, cur); - if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (ptr < numrecs) { memmove(&rp[ptr - 1], &rp[ptr], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr) * sizeof(*rp)); - xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT) - 1); + (numrecs - ptr) * sizeof(*rp)); + xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1); } /* * If it's the first record in the block, we'll need a key * structure to pass up to the next level (updkey). */ if (ptr == 1) { - INT_COPY(key.ir_startino, rp->ir_startino, ARCH_CONVERT); + key.ir_startino = rp->ir_startino; kp = &key; } } /* * Decrement and log the number of entries in the block. */ - INT_MOD(block->bb_numrecs, ARCH_CONVERT, -1); + numrecs--; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS); /* * Is this the root level? If so, we're almost done. @@ -199,7 +204,7 @@ xfs_inobt_delrec( * and it's NOT the leaf level, * then we can get rid of this level. */ - if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == 1 && level > 0) { + if (numrecs == 1 && level > 0) { agbp = cur->bc_private.i.agbp; agi = XFS_BUF_TO_AGI(agbp); /* @@ -207,12 +212,13 @@ xfs_inobt_delrec( * Make it the new root of the btree. */ bno = INT_GET(agi->agi_root, ARCH_CONVERT); - INT_COPY(agi->agi_root, *pp, ARCH_CONVERT); + agi->agi_root = *pp; INT_MOD(agi->agi_level, ARCH_CONVERT, -1); /* * Free the block. */ - if (error = xfs_free_extent(cur->bc_tp, bno, 1)) + if ((error = xfs_free_extent(cur->bc_tp, + XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1))) return error; xfs_trans_binval(cur->bc_tp, bp); xfs_ialloc_log_agi(cur->bc_tp, agbp, @@ -222,21 +228,6 @@ xfs_inobt_delrec( */ cur->bc_bufs[level] = NULL; cur->bc_nlevels--; - /* - * To ensure that the freed block is not used for - * user data until this transaction is permanent, - * we lock the agf buffer for this ag until the - * transaction record makes it to the on-disk log. 
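A recurring correction in this file: xfs_free_extent() takes a filesystem-relative block number, while the btree code holds AG-relative block numbers, so the new lines convert with XFS_AGB_TO_FSB() before freeing. The corrected call shape, as it appears in the + lines:

	/* The AG-relative bno must become a filesystem block number
	 * before the extent can be freed. */
	if ((error = xfs_free_extent(cur->bc_tp,
			XFS_AGB_TO_FSB(mp, cur->bc_private.i.agno, bno), 1)))
		return error;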
- */ - agfbno = XFS_AG_DADDR(cur->bc_mp, - cur->bc_private.i.agno, - XFS_AGF_DADDR(mp)); - if (error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp, - cur->bc_mp->m_ddev_targp, agfbno, - XFS_FSS_TO_BB(mp, 1), 0, &agfbp)) - return error; - ASSERT(!XFS_BUF_GETERROR(agfbp)); - xfs_trans_bhold_until_committed(cur->bc_tp, agfbp); } else if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i))) return error; @@ -253,7 +244,7 @@ xfs_inobt_delrec( * If the number of records remaining in the block is at least * the minimum, we're done. */ - if (INT_GET(block->bb_numrecs, ARCH_CONVERT) >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { + if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i))) return error; @@ -273,7 +264,7 @@ xfs_inobt_delrec( * Duplicate the cursor so our btree manipulations here won't * disrupt the next level up. */ - if (error = xfs_btree_dup_cursor(cur, &tcur)) + if ((error = xfs_btree_dup_cursor(cur, &tcur))) return error; /* * If there's a right sibling, see if it's ok to shift an entry @@ -286,7 +277,7 @@ xfs_inobt_delrec( */ i = xfs_btree_lastrec(tcur, level); XFS_WANT_CORRUPTED_GOTO(i == 1, error0); - if (error = xfs_inobt_increment(tcur, level, &i)) + if ((error = xfs_inobt_increment(tcur, level, &i))) goto error0; XFS_WANT_CORRUPTED_GOTO(i == 1, error0); i = xfs_btree_lastrec(tcur, level); @@ -297,7 +288,7 @@ xfs_inobt_delrec( rbp = tcur->bc_bufs[level]; right = XFS_BUF_TO_INOBT_BLOCK(rbp); #ifdef DEBUG - if (error = xfs_btree_check_sblock(cur, right, level, rbp)) + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) goto error0; #endif /* @@ -311,7 +302,7 @@ xfs_inobt_delrec( */ if (INT_GET(right->bb_numrecs, ARCH_CONVERT) - 1 >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if (error = xfs_inobt_lshift(tcur, level, &i)) + if ((error = xfs_inobt_lshift(tcur, level, &i))) goto error0; if (i) { ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= @@ -334,7 +325,7 @@ xfs_inobt_delrec( rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); if (lbno != NULLAGBLOCK) { xfs_btree_firstrec(tcur, level); - if (error = xfs_inobt_decrement(tcur, level, &i)) + if ((error = xfs_inobt_decrement(tcur, level, &i))) goto error0; } } @@ -348,7 +339,7 @@ xfs_inobt_delrec( * previous block. */ xfs_btree_firstrec(tcur, level); - if (error = xfs_inobt_decrement(tcur, level, &i)) + if ((error = xfs_inobt_decrement(tcur, level, &i))) goto error0; xfs_btree_firstrec(tcur, level); /* @@ -357,7 +348,7 @@ xfs_inobt_delrec( lbp = tcur->bc_bufs[level]; left = XFS_BUF_TO_INOBT_BLOCK(lbp); #ifdef DEBUG - if (error = xfs_btree_check_sblock(cur, left, level, lbp)) + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) goto error0; #endif /* @@ -371,7 +362,7 @@ xfs_inobt_delrec( */ if (INT_GET(left->bb_numrecs, ARCH_CONVERT) - 1 >= XFS_INOBT_BLOCK_MINRECS(level, cur)) { - if (error = xfs_inobt_rshift(tcur, level, &i)) + if ((error = xfs_inobt_rshift(tcur, level, &i))) goto error0; if (i) { ASSERT(INT_GET(block->bb_numrecs, ARCH_CONVERT) >= @@ -402,41 +393,44 @@ xfs_inobt_delrec( * See if we can join with the left neighbor block. */ if (lbno != NULLAGBLOCK && - lrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { /* * Set "right" to be the starting block, * "left" to be the left neighbor. 
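Many hunks in this function make no semantic change at all: they wrap assignments used as conditions in an extra pair of parentheses, which tells gcc the assignment is intentional and silences its -Wparentheses warning. A self-contained demonstration of the pattern:

	/* gcc warns on `if (x = f())` but accepts `if ((x = f()))`;
	 * the extra parentheses purely document intent. */
	static int f(void) { return 0; }

	static int demo(void)
	{
		int error;

		if ((error = f()))	/* intentional assignment */
			return error;
		return 0;
	}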
*/ rbno = bno; right = block; + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); rbp = bp; - if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.i.agno, lbno, 0, &lbp, - XFS_INO_BTREE_REF)) + XFS_INO_BTREE_REF))) return error; left = XFS_BUF_TO_INOBT_BLOCK(lbp); - if (error = xfs_btree_check_sblock(cur, left, level, lbp)) + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); + if ((error = xfs_btree_check_sblock(cur, left, level, lbp))) return error; } /* * If that won't work, see if we can join with the right neighbor block. */ else if (rbno != NULLAGBLOCK && - rrecs + INT_GET(block->bb_numrecs, ARCH_CONVERT) <= - XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) { /* * Set "left" to be the starting block, * "right" to be the right neighbor. */ lbno = bno; left = block; + lrecs = INT_GET(left->bb_numrecs, ARCH_CONVERT); lbp = bp; - if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.i.agno, rbno, 0, &rbp, - XFS_INO_BTREE_REF)) + XFS_INO_BTREE_REF))) return error; right = XFS_BUF_TO_INOBT_BLOCK(rbp); - if (error = xfs_btree_check_sblock(cur, right, level, rbp)) + rrecs = INT_GET(right->bb_numrecs, ARCH_CONVERT); + if ((error = xfs_btree_check_sblock(cur, right, level, rbp))) return error; } /* @@ -457,40 +451,53 @@ xfs_inobt_delrec( /* * It's a non-leaf. Move keys and pointers. */ - lkp = XFS_INOBT_KEY_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); - lpp = XFS_INOBT_PTR_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur); + lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur); rkp = XFS_INOBT_KEY_ADDR(right, 1, cur); rpp = XFS_INOBT_PTR_ADDR(right, 1, cur); #ifdef DEBUG - for (i = 0; i < INT_GET(right->bb_numrecs, ARCH_CONVERT); i++) { - if (error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level)) + for (i = 0; i < rrecs; i++) { + if ((error = xfs_btree_check_sptr(cur, INT_GET(rpp[i], ARCH_CONVERT), level))) return error; } #endif - memcpy(lkp, rkp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lkp)); - memcpy(lpp, rpp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lpp)); - xfs_inobt_log_keys(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, - INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); - xfs_inobt_log_ptrs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, - INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + memcpy(lkp, rkp, rrecs * sizeof(*lkp)); + memcpy(lpp, rpp, rrecs * sizeof(*lpp)); + xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs); + xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs); } else { /* * It's a leaf. Move records. */ - lrp = XFS_INOBT_REC_ADDR(left, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, cur); + lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur); rrp = XFS_INOBT_REC_ADDR(right, 1, cur); - memcpy(lrp, rrp, INT_GET(right->bb_numrecs, ARCH_CONVERT) * sizeof(*lrp)); - xfs_inobt_log_recs(cur, lbp, INT_GET(left->bb_numrecs, ARCH_CONVERT) + 1, - INT_GET(left->bb_numrecs, ARCH_CONVERT) + INT_GET(right->bb_numrecs, ARCH_CONVERT)); + memcpy(lrp, rrp, rrecs * sizeof(*lrp)); + xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs); + } + /* + * If we joined with the left neighbor, set the buffer in the + * cursor to the left block, and fix up the index. 
+ */ + if (bp != lbp) { + xfs_btree_setbuf(cur, level, lbp); + cur->bc_ptrs[level] += lrecs; } /* + * If we joined with the right neighbor and there's a level above + * us, increment the cursor at that level. + */ + else if (level + 1 < cur->bc_nlevels && + (error = xfs_alloc_increment(cur, level + 1, &i))) + return error; + /* * Fix up the number of records in the surviving block. */ - INT_MOD(left->bb_numrecs, ARCH_CONVERT, INT_GET(right->bb_numrecs, ARCH_CONVERT)); + lrecs += rrecs; + INT_SET(left->bb_numrecs, ARCH_CONVERT, lrecs); /* * Fix up the right block pointer in the surviving block, and log it. */ - INT_COPY(left->bb_rightsib, right->bb_rightsib, ARCH_CONVERT); + left->bb_rightsib = right->bb_rightsib; xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB); /* * If there is a right sibling now, make it point to the @@ -500,12 +507,12 @@ xfs_inobt_delrec( xfs_inobt_block_t *rrblock; xfs_buf_t *rrbp; - if (error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp, + if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, cur->bc_private.i.agno, INT_GET(left->bb_rightsib, ARCH_CONVERT), 0, - &rrbp, XFS_INO_BTREE_REF)) + &rrbp, XFS_INO_BTREE_REF))) return error; rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp); - if (error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)) + if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp))) return error; INT_SET(rrblock->bb_leftsib, ARCH_CONVERT, lbno); xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB); @@ -513,41 +520,11 @@ xfs_inobt_delrec( /* * Free the deleting block. */ - if (error = xfs_free_extent(cur->bc_tp, rbno, 1)) + if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp, + cur->bc_private.i.agno, rbno), 1))) return error; xfs_trans_binval(cur->bc_tp, rbp); /* - * To ensure that the freed block is not used for - * user data until this transaction is permanent, - * we lock the agf buffer for this ag until the - * transaction record makes it to the on-disk log. - */ - agfbno = XFS_AG_DADDR(cur->bc_mp, cur->bc_private.i.agno, - XFS_AGF_DADDR(mp)); - if (error = xfs_trans_read_buf(cur->bc_mp, cur->bc_tp, - cur->bc_mp->m_ddev_targp, agfbno, - XFS_FSS_TO_BB(mp, 1), 0, &agfbp)) - return error; - ASSERT(!XFS_BUF_GETERROR(agfbp)); - xfs_trans_bhold_until_committed(cur->bc_tp, agfbp); - /* - * If we joined with the left neighbor, set the buffer in the - * cursor to the left block, and fix up the index. - */ - if (bp != lbp) { - cur->bc_bufs[level] = lbp; - cur->bc_ptrs[level] += INT_GET(left->bb_numrecs, ARCH_CONVERT); - cur->bc_ra[level] = 0; - } - /* - * If we joined with the right neighbor and there's a level above - * us, increment the cursor at that level. - */ - else if (level + 1 < cur->bc_nlevels && - (error = xfs_inobt_increment(cur, level + 1, &i))) { - return error; - } - /* * Readjust the ptr at this level if it's not a leaf, since it's * still pointing at the deletion point, which makes the cursor * inconsistent. If this makes the ptr 0, the caller fixes it up. @@ -565,7 +542,6 @@ error0: xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR); return error; } -#endif /* _NOTYET_ */ /* * Insert one record/level. 
Return information to the caller @@ -590,6 +566,7 @@ xfs_inobt_insrec( xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */ xfs_inobt_key_t nkey; /* new key value, from split */ xfs_inobt_rec_t nrec; /* new record value, for caller */ + int numrecs; int optr; /* old ptr value */ xfs_inobt_ptr_t *pp; /* pointer to btree addresses */ int ptr; /* index in btree block for this rec */ @@ -622,13 +599,14 @@ xfs_inobt_insrec( */ bp = cur->bc_bufs[level]; block = XFS_BUF_TO_INOBT_BLOCK(bp); + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); #ifdef DEBUG if ((error = xfs_btree_check_sblock(cur, block, level, bp))) return error; /* * Check that the new entry is being inserted in the right place. */ - if (ptr <= INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (ptr <= numrecs) { if (level == 0) { rp = XFS_INOBT_REC_ADDR(block, ptr, cur); xfs_btree_check_rec(cur->bc_btnum, recp, rp); @@ -644,7 +622,7 @@ xfs_inobt_insrec( * If the block is full, we can't insert the new entry until we * make the block un-full. */ - if (INT_GET(block->bb_numrecs, ARCH_CONVERT) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { + if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) { /* * First, try shifting an entry to the right neighbor. */ @@ -695,6 +673,7 @@ xfs_inobt_insrec( * At this point we know there's room for our new entry in the block * we're pointing at. */ + numrecs = INT_GET(block->bb_numrecs, ARCH_CONVERT); if (level > 0) { /* * It's a non-leaf entry. Make a hole for the new data @@ -703,15 +682,15 @@ xfs_inobt_insrec( kp = XFS_INOBT_KEY_ADDR(block, 1, cur); pp = XFS_INOBT_PTR_ADDR(block, 1, cur); #ifdef DEBUG - for (i = INT_GET(block->bb_numrecs, ARCH_CONVERT); i >= ptr; i--) { + for (i = numrecs; i >= ptr; i--) { if ((error = xfs_btree_check_sptr(cur, INT_GET(pp[i - 1], ARCH_CONVERT), level))) return error; } #endif memmove(&kp[ptr], &kp[ptr - 1], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*kp)); + (numrecs - ptr + 1) * sizeof(*kp)); memmove(&pp[ptr], &pp[ptr - 1], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*pp)); + (numrecs - ptr + 1) * sizeof(*pp)); /* * Now stuff the new data in, bump numrecs and log the new data. */ @@ -721,23 +700,25 @@ xfs_inobt_insrec( #endif kp[ptr - 1] = key; /* INT_: struct copy */ INT_SET(pp[ptr - 1], ARCH_CONVERT, *bnop); - INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); - xfs_inobt_log_keys(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); - xfs_inobt_log_ptrs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_inobt_log_keys(cur, bp, ptr, numrecs); + xfs_inobt_log_ptrs(cur, bp, ptr, numrecs); } else { /* * It's a leaf entry. Make a hole for the new record. */ rp = XFS_INOBT_REC_ADDR(block, 1, cur); memmove(&rp[ptr], &rp[ptr - 1], - (INT_GET(block->bb_numrecs, ARCH_CONVERT) - ptr + 1) * sizeof(*rp)); + (numrecs - ptr + 1) * sizeof(*rp)); /* * Now stuff the new record in, bump numrecs * and log the new data. */ rp[ptr - 1] = *recp; /* INT_: struct copy */ - INT_MOD(block->bb_numrecs, ARCH_CONVERT, +1); - xfs_inobt_log_recs(cur, bp, ptr, INT_GET(block->bb_numrecs, ARCH_CONVERT)); + numrecs++; + INT_SET(block->bb_numrecs, ARCH_CONVERT, numrecs); + xfs_inobt_log_recs(cur, bp, ptr, numrecs); } /* * Log the new number of records in the btree header. @@ -747,7 +728,7 @@ xfs_inobt_insrec( /* * Check that the key/record is in the right place, now. 
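Insertion in xfs_inobt_insrec() above is the mirror image of the deletion pattern shown earlier: shift the tail up one slot to open a hole, then store the new record. In isolation, with generic types:

	#include <string.h>

	struct rec { unsigned int startino, freecount; };

	/* Insert rec at 1-based index ptr into an array currently
	 * holding *numrecs records; the memmove count matches the
	 * (numrecs - ptr + 1) used in the XFS code above. */
	static void insrec(struct rec *rp, int *numrecs, int ptr, struct rec rec)
	{
		memmove(&rp[ptr], &rp[ptr - 1],
			(*numrecs - ptr + 1) * sizeof(*rp));
		rp[ptr - 1] = rec;
		(*numrecs)++;
	}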
*/ - if (ptr < INT_GET(block->bb_numrecs, ARCH_CONVERT)) { + if (ptr < numrecs) { if (level == 0) xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1, rp + ptr); @@ -1774,7 +1755,6 @@ xfs_inobt_decrement( return 0; } -#ifdef _NOTYET_ /* * Delete the record pointed to by cur. * The cursor refers to the place where the record was (could be inserted) @@ -1795,13 +1775,13 @@ xfs_inobt_delete( * Otherwise we are done. */ for (level = 0, i = 2; i == 2; level++) { - if (error = xfs_inobt_delrec(cur, level, &i)) + if ((error = xfs_inobt_delrec(cur, level, &i))) return error; } if (i == 0) { for (level = 1; level < cur->bc_nlevels; level++) { if (cur->bc_ptrs[level] == 0) { - if (error = xfs_inobt_decrement(cur, level, &i)) + if ((error = xfs_inobt_decrement(cur, level, &i))) return error; break; } @@ -1810,7 +1790,7 @@ xfs_inobt_delete( *stat = i; return 0; } -#endif /* _NOTYET_ */ + /* * Get the data from the pointed-to record. diff -prauN linux-2.6.0-test7/fs/xfs/xfs_ialloc_btree.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc_btree.h --- linux-2.6.0-test7/fs/xfs/xfs_ialloc_btree.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_ialloc_btree.h 2003-10-09 19:27:22.000000000 -0700 @@ -225,7 +225,6 @@ xfs_inobt_decrement( int level, /* level in btree, 0 is leaf */ int *stat); /* success/failure */ -#ifdef _NOTYET_ /* * Delete the record pointed to by cur. * The cursor refers to the place where the record was (could be inserted) @@ -235,7 +234,6 @@ int /* error */ xfs_inobt_delete( struct xfs_btree_cur *cur, /* btree cursor */ int *stat); /* success/failure */ -#endif /* _NOTYET_ */ /* * Get the data from the pointed-to record. diff -prauN linux-2.6.0-test7/fs/xfs/xfs_iget.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_iget.c --- linux-2.6.0-test7/fs/xfs/xfs_iget.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_iget.c 2003-10-09 19:27:22.000000000 -0700 @@ -258,6 +258,7 @@ finish_inode: if (newnode) { xfs_iocore_inode_reinit(ip); } + ip->i_flags &= ~XFS_ISTALE; vn_trace_exit(vp, "xfs_iget.found", (inst_t *)__return_address); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_inode.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode.c --- linux-2.6.0-test7/fs/xfs/xfs_inode.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode.c 2003-10-09 19:27:22.000000000 -0700 @@ -36,6 +36,7 @@ #include "xfs_inum.h" #include "xfs_log.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_dir.h" @@ -486,11 +487,11 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } - switch (ip->i_d.di_mode & IFMT) { - case IFIFO: - case IFCHR: - case IFBLK: - case IFSOCK: + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFIFO: + case S_IFCHR: + case S_IFBLK: + case S_IFSOCK: if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) { XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW, ip->i_mount, dip); @@ -500,15 +501,15 @@ xfs_iformat( ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT); break; - case IFREG: - case IFLNK: - case IFDIR: + case S_IFREG: + case S_IFLNK: + case S_IFDIR: switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) { case XFS_DINODE_FMT_LOCAL: /* * no local regular files yet */ - if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & IFMT) == IFREG)) { + if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) { xfs_fs_cmn_err(CE_WARN, ip->i_mount, "corrupt inode (local format for regular file) %Lu. 
Unmount and run xfs_repair.",
 				(unsigned long long) ip->i_ino);
@@ -1171,20 +1172,20 @@ xfs_ialloc(
 	if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
 		ip->i_d.di_gid = pip->i_d.di_gid;
-		if ((pip->i_d.di_mode & ISGID) && (mode & IFMT) == IFDIR) {
-			ip->i_d.di_mode |= ISGID;
+		if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
+			ip->i_d.di_mode |= S_ISGID;
 		}
 	}
 
 	/*
 	 * If the group ID of the new file does not match the effective group
-	 * ID or one of the supplementary group IDs, the ISGID bit is cleared
+	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
 	 * (and only if the irix_sgid_inherit compatibility variable is set).
 	 */
 	if ((irix_sgid_inherit) &&
-	    (ip->i_d.di_mode & ISGID) &&
+	    (ip->i_d.di_mode & S_ISGID) &&
 	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
-		ip->i_d.di_mode &= ~ISGID;
+		ip->i_d.di_mode &= ~S_ISGID;
 	}
 
 	ip->i_d.di_size = 0;
@@ -1199,18 +1200,18 @@ xfs_ialloc(
 	ip->i_d.di_dmstate = 0;
 	ip->i_d.di_flags = 0;
 	flags = XFS_ILOG_CORE;
-	switch (mode & IFMT) {
-	case IFIFO:
-	case IFCHR:
-	case IFBLK:
-	case IFSOCK:
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFCHR:
+	case S_IFBLK:
+	case S_IFSOCK:
 		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
 		ip->i_df.if_u2.if_rdev = rdev;
 		ip->i_df.if_flags = 0;
 		flags |= XFS_ILOG_DEV;
 		break;
-	case IFREG:
-	case IFDIR:
+	case S_IFREG:
+	case S_IFDIR:
 		if (pip->i_d.di_flags &
 		    (XFS_DIFLAG_NOATIME|XFS_DIFLAG_NODUMP|XFS_DIFLAG_SYNC)) {
 			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
@@ -1223,7 +1224,7 @@ xfs_ialloc(
 			    xfs_inherit_sync)
 				ip->i_d.di_flags |= XFS_DIFLAG_SYNC;
 		}
-	case IFLNK:
+	case S_IFLNK:
 		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
 		ip->i_df.if_flags = XFS_IFEXTENTS;
 		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
@@ -1267,7 +1268,7 @@ xfs_isize_check(
 	int		nimaps;
 	xfs_bmbt_irec_t	imaps[2];
 
-	if ((ip->i_d.di_mode & IFMT) != IFREG)
+	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG)
 		return;
 
 	if ( ip->i_d.di_flags & XFS_DIFLAG_REALTIME )
@@ -2103,6 +2104,180 @@ xfs_iunlink_remove(
 	return 0;
 }
 
+static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
+{
+	return (((ip->i_itemp == NULL) ||
+		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
+		(ip->i_update_core == 0));
+}
+
+void
+xfs_ifree_cluster(
+	xfs_inode_t	*free_ip,
+	xfs_trans_t	*tp,
+	xfs_ino_t	inum)
+{
+	xfs_mount_t		*mp = free_ip->i_mount;
+	int			blks_per_cluster;
+	int			nbufs;
+	int			ninodes;
+	int			i, j, found, pre_flushed;
+	xfs_daddr_t		blkno;
+	xfs_buf_t		*bp;
+	xfs_ihash_t		*ih;
+	xfs_inode_t		*ip, **ip_found;
+	xfs_inode_log_item_t	*iip;
+	xfs_log_item_t		*lip;
+	SPLDECL(s);
+
+	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
+		blks_per_cluster = 1;
+		ninodes = mp->m_sb.sb_inopblock;
+		nbufs = XFS_IALLOC_BLOCKS(mp);
+	} else {
+		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
+					mp->m_sb.sb_blocksize;
+		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
+		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
+	}
+
+	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
+
+	for (j = 0; j < nbufs; j++, inum += ninodes) {
+		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
+					 XFS_INO_TO_AGBNO(mp, inum));
+
+
+		/*
+		 * Look for each inode in memory and attempt to lock it;
+		 * we can be racing with flush and tail pushing here.
+		 * Any inode we get the locks on is added to an array of
+		 * inode items to process later.
+		 *
+		 * When we get the buffer lock, we could beat a flush
+		 * or tail pushing thread to the lock here, in which
+		 * case they will go looking for the inode buffer
+		 * and fail; we need some other form of interlock
+		 * here.
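The gathering pass that follows takes the inode lock and the flush lock only via trylocks, so it cannot deadlock against a flusher that already holds them; anything it fails to lock is left for the buffer-callback interlock set up further down. A condensed, hypothetical form of the per-inode step (the helper name is invented, and the clean-inode fast path is omitted):

	/* Illustration only: returns nonzero if ip was marked stale and
	 * queued in list[] for post-buffer-lock processing. */
	static int try_gather(xfs_inode_t *ip, xfs_inode_t **list, int *n)
	{
		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
			return 0;		/* a racing flusher won */
		if (!xfs_iflock_nowait(ip)) {
			xfs_iunlock(ip, XFS_ILOCK_EXCL);
			return 0;
		}
		ip->i_flags |= XFS_ISTALE;
		list[(*n)++] = ip;
		return 1;
	}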
+ */ + found = 0; + for (i = 0; i < ninodes; i++) { + ih = XFS_IHASH(mp, inum + i); + read_lock(&ih->ih_lock); + for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) { + if (ip->i_ino == inum + i) + break; + } + + /* Inode not in memory or we found it already, + * nothing to do + */ + if (!ip || (ip->i_flags & XFS_ISTALE)) { + read_unlock(&ih->ih_lock); + continue; + } + + if (xfs_inode_clean(ip)) { + read_unlock(&ih->ih_lock); + continue; + } + + /* If we can get the locks then add it to the + * list, otherwise by the time we get the bp lock + * below it will already be attached to the + * inode buffer. + */ + + /* This inode will already be locked - by us, lets + * keep it that way. + */ + + if (ip == free_ip) { + if (xfs_iflock_nowait(ip)) { + ip->i_flags |= XFS_ISTALE; + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + } else { + ip_found[found++] = ip; + } + } + read_unlock(&ih->ih_lock); + continue; + } + + if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { + if (xfs_iflock_nowait(ip)) { + ip->i_flags |= XFS_ISTALE; + + if (xfs_inode_clean(ip)) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } else { + ip_found[found++] = ip; + } + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + } + + read_unlock(&ih->ih_lock); + } + + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, + mp->m_bsize * blks_per_cluster, + XFS_BUF_LOCK); + + pre_flushed = 0; + lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); + while (lip) { + if (lip->li_type == XFS_LI_INODE) { + iip = (xfs_inode_log_item_t *)lip; + ASSERT(iip->ili_logged == 1); + lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done; + AIL_LOCK(mp,s); + iip->ili_flush_lsn = iip->ili_item.li_lsn; + AIL_UNLOCK(mp, s); + iip->ili_inode->i_flags |= XFS_ISTALE; + pre_flushed++; + } + lip = lip->li_bio_list; + } + + for (i = 0; i < found; i++) { + ip = ip_found[i]; + iip = ip->i_itemp; + + if (!iip) { + ip->i_update_core = 0; + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + continue; + } + + iip->ili_last_fields = iip->ili_format.ilf_fields; + iip->ili_format.ilf_fields = 0; + iip->ili_logged = 1; + AIL_LOCK(mp,s); + iip->ili_flush_lsn = iip->ili_item.li_lsn; + AIL_UNLOCK(mp, s); + + xfs_buf_attach_iodone(bp, + (void(*)(xfs_buf_t*,xfs_log_item_t*)) + xfs_istale_done, (xfs_log_item_t *)iip); + if (ip != free_ip) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + } + + if (found || pre_flushed) + xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_binval(tp, bp); + } + + kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *)); +} + /* * This is called to return an inode to the inode free list. 
* The inode should already be truncated to 0 length and have @@ -2116,9 +2291,12 @@ xfs_iunlink_remove( int xfs_ifree( xfs_trans_t *tp, - xfs_inode_t *ip) + xfs_inode_t *ip, + xfs_bmap_free_t *flist) { - int error; + int error; + int delete; + xfs_ino_t first_ino; ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); ASSERT(ip->i_transp == tp); @@ -2126,7 +2304,7 @@ xfs_ifree( ASSERT(ip->i_d.di_nextents == 0); ASSERT(ip->i_d.di_anextents == 0); ASSERT((ip->i_d.di_size == 0) || - ((ip->i_d.di_mode & IFMT) != IFREG)); + ((ip->i_d.di_mode & S_IFMT) != S_IFREG)); ASSERT(ip->i_d.di_nblocks == 0); /* @@ -2137,7 +2315,7 @@ xfs_ifree( return error; } - error = xfs_difree(tp, ip->i_ino); + error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); if (error != 0) { return error; } @@ -2149,13 +2327,17 @@ xfs_ifree( XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t); ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS; ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; - /* * Bump the generation count so no one will be confused * by reincarnations of this inode. */ ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (delete) { + xfs_ifree_cluster(ip, tp, first_ino); + } + return 0; } @@ -2564,10 +2746,10 @@ xfs_idestroy( xfs_inode_t *ip) { - switch (ip->i_d.di_mode & IFMT) { - case IFREG: - case IFDIR: - case IFLNK: + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: xfs_idestroy_fork(ip, XFS_DATA_FORK); break; } @@ -3208,7 +3390,7 @@ xfs_iflush_int( ip->i_ino, ip, ip->i_d.di_magic); goto corrupt_out; } - if ((ip->i_d.di_mode & IFMT) == IFREG) { + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE), @@ -3218,7 +3400,7 @@ xfs_iflush_int( ip->i_ino, ip); goto corrupt_out; } - } else if ((ip->i_d.di_mode & IFMT) == IFDIR) { + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) { if (XFS_TEST_ERROR( (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) && (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) && @@ -3507,7 +3689,7 @@ xfs_iaccess( if ((error = _MAC_XFS_IACCESS(ip, mode, cr))) return XFS_ERROR(error); - if (mode & IWRITE) { + if (mode & S_IWUSR) { umode_t imode = inode->i_mode; if (IS_RDONLY(inode) && @@ -3540,13 +3722,13 @@ xfs_iaccess( * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. */ - if ((orgmode & (IREAD|IWRITE)) || (inode->i_mode & S_IXUGO)) + if ((orgmode & (S_IRUSR|S_IWUSR)) || (inode->i_mode & S_IXUGO)) if (capable_cred(cr, CAP_DAC_OVERRIDE)) return 0; - if ((orgmode == IREAD) || - (((ip->i_d.di_mode & IFMT) == IFDIR) && - (!(orgmode & ~(IWRITE|IEXEC))))) { + if ((orgmode == S_IRUSR) || + (((ip->i_d.di_mode & S_IFMT) == S_IFDIR) && + (!(orgmode & ~(S_IWUSR|S_IXUSR))))) { if (capable_cred(cr, CAP_DAC_READ_SEARCH)) return 0; #ifdef NOISE diff -prauN linux-2.6.0-test7/fs/xfs/xfs_inode.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode.h --- linux-2.6.0-test7/fs/xfs/xfs_inode.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode.h 2003-10-09 19:27:22.000000000 -0700 @@ -179,7 +179,7 @@ typedef struct xfs_ihash { * Inode hashing and hash bucket locking. */ #define XFS_BUCKETS(mp) (37*(mp)->m_sb.sb_agcount-1) -#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)ino) % (mp)->m_ihsize)) +#define XFS_IHASH(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize)) /* * This is the xfs inode cluster hash. 
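The XFS_IHASH change above looks cosmetic but is not: the macro parameter now gets its own parentheses. A sketch of what that guards against, with hypothetical arguments base and off (the _OLD/_NEW suffixes are editorial):

#define XFS_IHASH_OLD(mp,ino) ((mp)->m_ihash + (((uint)ino) % (mp)->m_ihsize))
#define XFS_IHASH_NEW(mp,ino) ((mp)->m_ihash + (((uint)(ino)) % (mp)->m_ihsize))

/* XFS_IHASH_OLD(mp, base + off) expands the modulus operand to
 *     (uint)base + off
 * because a cast binds more tightly than '+': only base is narrowed.
 * XFS_IHASH_NEW(mp, base + off) casts the whole expression, as the
 * hash intends. */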
This hash is used by xfs_iflush to @@ -362,7 +362,8 @@ void xfs_ifork_next_set(xfs_inode_t *ip, #define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */ #define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */ #define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */ -#define XFS_IRECLAIMABLE 0x0010 /* inode can be reclaimed */ +#define XFS_ISTALE 0x0010 /* inode has been staled */ +#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */ /* * Flags for inode locking. @@ -437,12 +438,12 @@ xfs_inode_t *xfs_bhvtoi(struct bhv_desc #define XFS_CHASH(mp,blk) ((mp)->m_chash + (((uint)blk) % (mp)->m_chsize)) /* - * For multiple groups support: if ISGID bit is set in the parent + * For multiple groups support: if S_ISGID bit is set in the parent * directory, group of new file is set to that of the parent, and - * new subdirectory gets ISGID bit from parent. + * new subdirectory gets S_ISGID bit from parent. */ #define XFS_INHERIT_GID(pip, vfsp) ((pip) != NULL && \ - (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & ISGID))) + (((vfsp)->vfs_flag & VFS_GRPID) || ((pip)->i_d.di_mode & S_ISGID))) /* * xfs_iget.c prototypes. @@ -487,7 +488,8 @@ int xfs_ialloc(struct xfs_trans *, xfs_ struct xfs_buf **, boolean_t *, xfs_inode_t **); void xfs_xlate_dinode_core(xfs_caddr_t, struct xfs_dinode_core *, int, xfs_arch_t); -int xfs_ifree(struct xfs_trans *, xfs_inode_t *); +int xfs_ifree(struct xfs_trans *, xfs_inode_t *, + struct xfs_bmap_free *); int xfs_atruncate_start(xfs_inode_t *); void xfs_itruncate_start(xfs_inode_t *, uint, xfs_fsize_t); int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *, diff -prauN linux-2.6.0-test7/fs/xfs/xfs_inode_item.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode_item.c --- linux-2.6.0-test7/fs/xfs/xfs_inode_item.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode_item.c 2003-10-09 19:27:22.000000000 -0700 @@ -631,6 +631,14 @@ xfs_inode_item_trylock( } /* NOTREACHED */ } + + /* Stale items should force out the iclog */ + if (ip->i_flags & XFS_ISTALE) { + xfs_ifunlock(ip); + xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); + return XFS_ITEM_PINNED; + } + #ifdef DEBUG if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { ASSERT(iip->ili_format.ilf_fields != 0); @@ -1074,3 +1082,11 @@ xfs_iflush_abort( */ xfs_ifunlock(ip); } + +void +xfs_istale_done( + xfs_buf_t *bp, + xfs_inode_log_item_t *iip) +{ + xfs_iflush_abort(iip->ili_inode); +} diff -prauN linux-2.6.0-test7/fs/xfs/xfs_inode_item.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode_item.h --- linux-2.6.0-test7/fs/xfs/xfs_inode_item.h 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_inode_item.h 2003-10-09 19:27:22.000000000 -0700 @@ -189,6 +189,7 @@ int xfs_ilog_fext(int w); void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); void xfs_inode_item_destroy(struct xfs_inode *); void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *); +void xfs_istale_done(struct xfs_buf *, xfs_inode_log_item_t *); void xfs_iflush_abort(struct xfs_inode *); #endif /* __KERNEL__ */ diff -prauN linux-2.6.0-test7/fs/xfs/xfs_log_recover.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_log_recover.c --- linux-2.6.0-test7/fs/xfs/xfs_log_recover.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_log_recover.c 2003-10-09 19:27:23.000000000 -0700 @@ -1529,17 +1529,35 @@ xlog_recover_reorder_trans( xlog_recover_t *trans) { xlog_recover_item_t *first_item, *itemq, *itemq_next; + xfs_buf_log_format_t *buf_f; + 
xfs_buf_log_format_v1_t *obuf_f; + ushort flags; first_item = itemq = trans->r_itemq; trans->r_itemq = NULL; do { itemq_next = itemq->ri_next; + buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; switch (ITEM_TYPE(itemq)) { case XFS_LI_BUF: + flags = buf_f->blf_flags; + break; case XFS_LI_6_1_BUF: case XFS_LI_5_3_BUF: - xlog_recover_insert_item_frontq(&trans->r_itemq, itemq); + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + flags = obuf_f->blf_flags; break; + } + + switch (ITEM_TYPE(itemq)) { + case XFS_LI_BUF: + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + if (!(flags & XFS_BLI_CANCEL)) { + xlog_recover_insert_item_frontq(&trans->r_itemq, + itemq); + break; + } case XFS_LI_INODE: case XFS_LI_6_1_INODE: case XFS_LI_5_3_INODE: @@ -1668,32 +1686,16 @@ xlog_recover_do_buffer_pass1( * made at that point. */ STATIC int -xlog_recover_do_buffer_pass2( +xlog_check_buffer_cancelled( xlog_t *log, - xfs_buf_log_format_t *buf_f) + xfs_daddr_t blkno, + uint len, + ushort flags) { xfs_buf_cancel_t *bcp; xfs_buf_cancel_t *prevp; xfs_buf_cancel_t **bucket; - xfs_buf_log_format_v1_t *obuf_f; - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; - case XFS_LI_6_1_BUF: - case XFS_LI_5_3_BUF: - obuf_f = (xfs_buf_log_format_v1_t*)buf_f; - blkno = (xfs_daddr_t) obuf_f->blf_blkno; - flags = obuf_f->blf_flags; - len = (xfs_daddr_t) obuf_f->blf_len; - break; - } if (log->l_buf_cancel_table == NULL) { /* * There is nothing in the table built in pass one, @@ -1755,6 +1757,34 @@ xlog_recover_do_buffer_pass2( return 0; } +STATIC int +xlog_recover_do_buffer_pass2( + xlog_t *log, + xfs_buf_log_format_t *buf_f) +{ + xfs_buf_log_format_v1_t *obuf_f; + xfs_daddr_t blkno = 0; + ushort flags = 0; + uint len = 0; + + switch (buf_f->blf_type) { + case XFS_LI_BUF: + blkno = buf_f->blf_blkno; + flags = buf_f->blf_flags; + len = buf_f->blf_len; + break; + case XFS_LI_6_1_BUF: + case XFS_LI_5_3_BUF: + obuf_f = (xfs_buf_log_format_v1_t*)buf_f; + blkno = (xfs_daddr_t) obuf_f->blf_blkno; + flags = obuf_f->blf_flags; + len = (xfs_daddr_t) obuf_f->blf_len; + break; + } + + return xlog_check_buffer_cancelled(log, blkno, len, flags); +} + /* * Perform recovery for a buffer full of inodes. In these buffers, * the only data which should be recovered is that which corresponds @@ -2009,7 +2039,7 @@ xfs_qm_dqcheck( if (id != -1 && id != INT_GET(ddq->d_id, ARCH_CONVERT)) { if (flags & XFS_QMOPT_DOWARN) cmn_err(CE_ALERT, - "%s : ondisk-dquot 0x%x, ID mismatch: " + "%s : ondisk-dquot 0x%p, ID mismatch: " "0x%x expected, found id 0x%x", str, ddq, id, INT_GET(ddq->d_id, ARCH_CONVERT)); errs++; @@ -2023,7 +2053,7 @@ xfs_qm_dqcheck( !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) { if (flags & XFS_QMOPT_DOWARN) cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%x) " + "%s : Dquot ID 0x%x (0x%p) " "BLK TIMER NOT STARTED", str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq); @@ -2037,7 +2067,7 @@ xfs_qm_dqcheck( !INT_ISZERO(ddq->d_id, ARCH_CONVERT)) { if (flags & XFS_QMOPT_DOWARN) cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%x) " + "%s : Dquot ID 0x%x (0x%p) " "INODE TIMER NOT STARTED", str, (int) INT_GET(ddq->d_id, ARCH_CONVERT), ddq); @@ -2289,6 +2319,14 @@ xlog_recover_do_inode_trans( imap.im_blkno = 0; xfs_imap(log->l_mp, 0, ino, &imap, 0); } + + /* + * Inode buffers can be freed; look out for that case, + * and do not replay the inode.
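A note on the cancel test in the reordering hunk above, corrected here to !(flags & XFS_BLI_CANCEL): logical '!' binds more tightly than bitwise '&' in C, so the unparenthesized form misparses. A standalone illustration, not part of the patch:

ushort flags = XFS_BLI_CANCEL;  /* suppose the buffer was cancelled */

/* "!flags & XFS_BLI_CANCEL" parses as "(!flags) & XFS_BLI_CANCEL";
 * !flags is 0 or 1, which cannot contain the cancel bit unless that
 * flag happens to be bit 0, so the test would essentially never fire. */
if (!(flags & XFS_BLI_CANCEL)) {
        /* front-queue for replay: correctly skipped for cancelled buffers */
}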
+ */ + if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) + return 0; + bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len, XFS_BUF_LOCK); if (XFS_BUF_ISERROR(bp)) { @@ -2345,7 +2383,7 @@ xlog_recover_do_inode_trans( /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & IFMT) == IFREG)) { + if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", @@ -2356,7 +2394,7 @@ xlog_recover_do_inode_trans( item, dip, bp, ino); return XFS_ERROR(EFSCORRUPTED); } - } else if (unlikely((dicp->di_mode & IFMT) == IFDIR)) { + } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { diff -prauN linux-2.6.0-test7/fs/xfs/xfs_mount.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_mount.c --- linux-2.6.0-test7/fs/xfs/xfs_mount.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_mount.c 2003-10-09 19:27:23.000000000 -0700 @@ -903,7 +903,7 @@ xfs_mountfs( * File systems that don't support user level file handles (i.e. * all of them except for XFS) will leave vfs_altfsid as NULL. */ - vfsp->vfs_altfsid = (__kernel_fsid_t *)mp->m_fixedfsid; + vfsp->vfs_altfsid = (fsid_t *)mp->m_fixedfsid; mp->m_dmevmask = 0; /* not persistent; set after each mount */ /* @@ -977,7 +977,7 @@ xfs_mountfs( rvp = XFS_ITOV(rip); VMAP(rvp, vmap); - if (unlikely((rip->i_d.di_mode & IFMT) != IFDIR)) { + if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { cmn_err(CE_WARN, "XFS: corrupted root inode"); prdev("Root inode %llu is not a directory", mp->m_ddev_targp, (unsigned long long)rip->i_ino); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_mount.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_mount.h --- linux-2.6.0-test7/fs/xfs/xfs_mount.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_mount.h 2003-10-09 19:27:23.000000000 -0700 @@ -416,6 +416,7 @@ typedef struct xfs_mount { #define XFS_MOUNT_32BITINOOPT 0x00008000 /* saved mount option state */ #define XFS_MOUNT_NOUUID 0x00010000 /* ignore uuid during mount */ #define XFS_MOUNT_NOLOGFLUSH 0x00020000 +#define XFS_MOUNT_IDELETE 0x00040000 /* delete empty inode clusters*/ /* * Default minimum read and write sizes. diff -prauN linux-2.6.0-test7/fs/xfs/xfs_rename.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_rename.c --- linux-2.6.0-test7/fs/xfs/xfs_rename.c 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_rename.c 2003-10-09 19:27:23.000000000 -0700 @@ -326,7 +326,7 @@ xfs_rename( ASSERT(src_ip != NULL); - if ((src_ip->i_d.di_mode & IFMT) == IFDIR) { + if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { /* * Check for link count overflow on target_dp */ @@ -340,7 +340,7 @@ xfs_rename( } new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & IFMT) == IFDIR); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); /* * Drop the locks on our inodes so that we can do the ancestor @@ -449,7 +449,7 @@ xfs_rename( * target and source are directories and that target can be * destroyed, or that neither is a directory. */ - if ((target_ip->i_d.di_mode & IFMT) == IFDIR) { + if ((target_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { /* * Make sure target dir is empty. 
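On the IFMT-to-S_IFMT conversions running through these hunks, such as the directory test just above: the open-coded form is exactly what the standard predicate macros expand to, so either spelling is equivalent. A small sketch (local variable hypothetical):

umode_t mode = ip->i_d.di_mode;

int is_dir_open_coded = (mode & S_IFMT) == S_IFDIR;
int is_dir_macro      = S_ISDIR(mode);  /* ((mode) & S_IFMT) == S_IFDIR */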
*/ diff -prauN linux-2.6.0-test7/fs/xfs/xfs_rw.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_rw.c --- linux-2.6.0-test7/fs/xfs/xfs_rw.c 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_rw.c 2003-10-09 19:27:23.000000000 -0700 @@ -86,17 +86,17 @@ xfs_write_clear_setuid( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - ip->i_d.di_mode &= ~ISUID; + ip->i_d.di_mode &= ~S_ISUID; /* * Note that we don't have to worry about mandatory * file locking being disabled here because we only - * clear the ISGID bit if the Group execute bit is + * clear the S_ISGID bit if the Group execute bit is * on, but if it was on then mandatory locking wouldn't * have been enabled. */ - if (ip->i_d.di_mode & (IEXEC >> 3)) { - ip->i_d.di_mode &= ~ISGID; + if (ip->i_d.di_mode & S_IXGRP) { + ip->i_d.di_mode &= ~S_ISGID; } xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); xfs_trans_set_sync(tp); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_trans.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans.c --- linux-2.6.0-test7/fs/xfs/xfs_trans.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans.c 2003-10-09 19:27:23.000000000 -0700 @@ -365,7 +365,6 @@ xfs_trans_mod_sb( switch (field) { case XFS_TRANS_SB_ICOUNT: - ASSERT(delta > 0); tp->t_icount_delta += delta; break; case XFS_TRANS_SB_IFREE: diff -prauN linux-2.6.0-test7/fs/xfs/xfs_trans.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans.h --- linux-2.6.0-test7/fs/xfs/xfs_trans.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans.h 2003-10-09 19:27:23.000000000 -0700 @@ -703,6 +703,8 @@ typedef struct xfs_trans { * the agi hash list and counters: sector size * the inode btree entry: block size * the on disk inode before ours in the agi hash list: inode cluster size + * the inode btree: max depth * blocksize + * the allocation btrees: 2 trees * (max depth - 1) * block size */ #define XFS_CALC_IFREE_LOG_RES(mp) \ ((mp)->m_sb.sb_inodesize + \ @@ -710,7 +712,10 @@ typedef struct xfs_trans { (mp)->m_sb.sb_sectsize + \ XFS_FSB_TO_B((mp), 1) + \ MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \ - (128 * 5)) + (128 * 5) + \ + (128 * (2 + XFS_IALLOC_BLOCKS(mp) + XFS_IN_MAXLEVELS(mp) + \ + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))) + #define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) @@ -918,6 +923,7 @@ typedef struct xfs_trans { #define XFS_DEFAULT_LOG_COUNT 1 #define XFS_DEFAULT_PERM_LOG_COUNT 2 #define XFS_ITRUNCATE_LOG_COUNT 2 +#define XFS_INACTIVE_LOG_COUNT 2 #define XFS_CREATE_LOG_COUNT 2 #define XFS_MKDIR_LOG_COUNT 3 #define XFS_SYMLINK_LOG_COUNT 3 @@ -991,6 +997,8 @@ void xfs_trans_bhold(xfs_trans_t *, str void xfs_trans_bhold_until_committed(xfs_trans_t *, struct xfs_buf *); void xfs_trans_binval(xfs_trans_t *, struct xfs_buf *); void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_inode_buf(xfs_trans_t *, struct xfs_buf *); +void xfs_trans_stale_inode_buf(xfs_trans_t *, struct xfs_buf *); void xfs_trans_dquot_buf(xfs_trans_t *, struct xfs_buf *, uint); void xfs_trans_inode_alloc_buf(xfs_trans_t *, struct xfs_buf *); int xfs_trans_iget(struct xfs_mount *, xfs_trans_t *, diff -prauN linux-2.6.0-test7/fs/xfs/xfs_trans_buf.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans_buf.c --- linux-2.6.0-test7/fs/xfs/xfs_trans_buf.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_trans_buf.c 2003-10-09 19:27:23.000000000 -0700 @@ -931,6 +931,35 @@ xfs_trans_inode_buf( bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF; } +/* + * This call 
is used to indicate that the buffer is going to + * be staled and was an inode buffer. This means it gets + * special processing during unpin - where any inodes + * associated with the buffer should be removed from the AIL. + * There is also special processing during recovery: + * any replay of the inodes in the buffer needs to be + * prevented, as the buffer may have been reused. + */ +void +xfs_trans_stale_inode_buf( + xfs_trans_t *tp, + xfs_buf_t *bp) +{ + xfs_buf_log_item_t *bip; + + ASSERT(XFS_BUF_ISBUSY(bp)); + ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp); + ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL); + + bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); + ASSERT(atomic_read(&bip->bli_refcount) > 0); + + bip->bli_flags |= XFS_BLI_STALE_INODE; + bip->bli_item.li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) + xfs_buf_iodone; +} + + /* * Mark the buffer as being one which contains newly allocated @@ -954,7 +983,6 @@ xfs_trans_inode_alloc_buf( bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *); ASSERT(atomic_read(&bip->bli_refcount) > 0); - ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; } diff -prauN linux-2.6.0-test7/fs/xfs/xfs_types.h wli-2.6.0-test7-bk1-29/fs/xfs/xfs_types.h --- linux-2.6.0-test7/fs/xfs/xfs_types.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_types.h 2003-10-09 19:27:23.000000000 -0700 @@ -75,8 +75,6 @@ typedef __uint64_t __psunsigned_t; #error BITS_PER_LONG must be 32 or 64 #endif -#endif /* __KERNEL__ */ - /* * Some types are conditional depending on the target system. * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits. @@ -95,6 +93,8 @@ typedef __uint64_t __psunsigned_t; # define XFS_BIG_INUMS 0 #endif +#endif /* __KERNEL__ */ + typedef __uint32_t xfs_agblock_t; /* blockno in alloc. group */ typedef __uint32_t xfs_extlen_t; /* extent length in blocks */ typedef __uint32_t xfs_agnumber_t; /* allocation group number */ @@ -197,4 +197,4 @@ typedef enum { XFS_BTNUM_MAX } xfs_btnum_t; -#endif /* !__XFS_TYPES_H */ +#endif /* __XFS_TYPES_H__ */ diff -prauN linux-2.6.0-test7/fs/xfs/xfs_vfsops.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_vfsops.c --- linux-2.6.0-test7/fs/xfs/xfs_vfsops.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_vfsops.c 2003-10-09 19:27:23.000000000 -0700 @@ -225,7 +225,7 @@ xfs_start_flags( /* * At this point the superblock has not been read * in, therefore we do not know the block size. - * Before, the mount call ends we will convert + * Before the mount call ends we will convert * these to FSBs. */ mp->m_dalign = ap->sunit; @@ -298,6 +298,8 @@ xfs_start_flags( mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE; mp->m_readio_log = mp->m_writeio_log = ap->iosizelog; } + if (ap->flags & XFSMNT_IDELETE) + mp->m_flags |= XFS_MOUNT_IDELETE; /* * no recovery flag requires a read-only mount @@ -603,6 +605,7 @@ xfs_mntupdate( struct vfs *vfsp = bhvtovfs(bdp); xfs_mount_t *mp = XFS_BHVTOM(bdp); int pincount, error; + int count = 0; if (args->flags & XFSMNT_NOATIME) mp->m_flags |= XFS_MOUNT_NOATIME; @@ -617,11 +620,19 @@ xfs_mntupdate( pagebuf_delwri_flush(mp->m_ddev_targp, 0, NULL); xfs_finish_reclaim_all(mp, 0); + /* This loop must run at least twice. + * The first pass of the loop will flush + * most metadata, but that will generate more + * metadata (typically directory updates), + * which must then be flushed and logged before + * we can write the unmount record.
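The loop that follows is terse; here is the same control flow with the reasoning spelled out in comments (an editorial restatement, not a replacement):

do {
        VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error);            /* push data and metadata */
        pagebuf_delwri_flush(mp->m_ddev_targp, PBDF_WAIT, &pincount);   /* drain delwri buffers */
        if (0 == pincount) {
                delay(50);      /* give freshly generated metadata time to land */
                count++;        /* one quiet pass completed */
        }
} while (count < 2);            /* insist on two quiet passes before the
                                   unmount record is written */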
+ */ do { VFS_SYNC(vfsp, REMOUNT_READONLY_FLAGS, NULL, error); pagebuf_delwri_flush(mp->m_ddev_targp, PBDF_WAIT, &pincount); - } while (pincount); + if (0 == pincount) { delay(50); count++; } + } while (count < 2); /* Ok now write out an unmount record */ xfs_log_unmount_write(mp); @@ -1588,6 +1599,7 @@ xfs_vget( #define MNTOPT_NOLOGFLUSH "nologflush" /* don't hard flush on log writes */ #define MNTOPT_OSYNCISOSYNC "osyncisosync" /* o_sync is REALLY o_sync */ #define MNTOPT_64BITINODE "inode64" /* inodes can be allocated anywhere */ +#define MNTOPT_IKEEP "ikeep" /* do not free empty inode clusters */ int @@ -1602,6 +1614,8 @@ xfs_parseargs( int dsunit, dswidth, vol_dsunit, vol_dswidth; int iosize; + args->flags |= XFSMNT_IDELETE; /* default to on */ + if (!options) return 0; @@ -1706,6 +1720,8 @@ xfs_parseargs( args->flags |= XFSMNT_NOUUID; } else if (!strcmp(this_char, MNTOPT_NOLOGFLUSH)) { args->flags |= XFSMNT_NOLOGFLUSH; + } else if (!strcmp(this_char, MNTOPT_IKEEP)) { + args->flags &= ~XFSMNT_IDELETE; } else if (!strcmp(this_char, "osyncisdsync")) { /* no-op, this is now the default */ printk("XFS: osyncisdsync is now the default, option is deprecated.\n"); diff -prauN linux-2.6.0-test7/fs/xfs/xfs_vnodeops.c wli-2.6.0-test7-bk1-29/fs/xfs/xfs_vnodeops.c --- linux-2.6.0-test7/fs/xfs/xfs_vnodeops.c 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfs_vnodeops.c 2003-10-09 19:27:23.000000000 -0700 @@ -78,7 +78,7 @@ #define SYMLINK_MAPS 2 extern int xfs_ioctl(bhv_desc_t *, struct inode *, struct file *, - unsigned int, unsigned long); + int, unsigned int, unsigned long); /* @@ -470,15 +470,15 @@ xfs_setattr( if (mask & XFS_AT_MODE) { mode_t m = 0; - if ((vap->va_mode & ISUID) && !file_owner) - m |= ISUID; - if ((vap->va_mode & ISGID) && + if ((vap->va_mode & S_ISUID) && !file_owner) + m |= S_ISUID; + if ((vap->va_mode & S_ISGID) && !in_group_p((gid_t)ip->i_d.di_gid)) - m |= ISGID; + m |= S_ISGID; #if 0 /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & ISVTX) && vp->v_type != VDIR) - m |= ISVTX; + if ((vap->va_mode & S_ISVTX) && vp->v_type != VDIR) + m |= S_ISVTX; #endif if (m && !capable(CAP_FSETID)) vap->va_mode &= ~m; @@ -755,8 +755,8 @@ xfs_setattr( * Change file access modes.
*/ if (mask & XFS_AT_MODE) { - ip->i_d.di_mode &= IFMT; - ip->i_d.di_mode |= vap->va_mode & ~IFMT; + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= vap->va_mode & ~S_IFMT; xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); timeflags |= XFS_ICHGTIME_CHG; @@ -776,9 +776,9 @@ xfs_setattr( * The set-user-ID and set-group-ID bits of a file will be * cleared upon successful return from chown() */ - if ((ip->i_d.di_mode & (ISUID|ISGID)) && + if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && !capable(CAP_FSETID)) { - ip->i_d.di_mode &= ~(ISUID|ISGID); + ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); } /* @@ -992,6 +992,7 @@ STATIC int xfs_readlink( bhv_desc_t *bdp, uio_t *uiop, + int ioflags, cred_t *credp) { xfs_inode_t *ip; @@ -1019,7 +1020,7 @@ xfs_readlink( xfs_ilock(ip, XFS_ILOCK_SHARED); - ASSERT((ip->i_d.di_mode & IFMT) == IFLNK); + ASSERT((ip->i_d.di_mode & S_IFMT) == S_IFLNK); offset = uiop->uio_offset; count = uiop->uio_resid; @@ -1033,7 +1034,7 @@ xfs_readlink( goto error_return; } - if (!(uiop->uio_fmode & FINVIS)) { + if (!(ioflags & IO_INVIS)) { xfs_ichgtime(ip, XFS_ICHGTIME_ACC); } @@ -1595,8 +1596,7 @@ xfs_inactive_symlink_local( STATIC int xfs_inactive_attrs( xfs_inode_t *ip, - xfs_trans_t **tpp, - int *commitflags) + xfs_trans_t **tpp) { xfs_trans_t *tp; int error; @@ -1606,9 +1606,8 @@ xfs_inactive_attrs( tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_d.di_forkoff != 0); - xfs_trans_commit(tp, *commitflags, NULL); + xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); xfs_iunlock(ip, XFS_ILOCK_EXCL); - *commitflags = 0; error = xfs_attr_inactive(ip); if (error) { @@ -1620,8 +1619,8 @@ xfs_inactive_attrs( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), - 0, 0, - XFS_DEFAULT_LOG_COUNT); + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); @@ -1664,7 +1663,7 @@ xfs_release( mp = ip->i_mount; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & IFMT) == IFREG) && + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)))) { @@ -1694,10 +1693,12 @@ xfs_inactive( { xfs_inode_t *ip; vnode_t *vp; + xfs_bmap_free_t free_list; + xfs_fsblock_t first_block; + int committed; xfs_trans_t *tp; xfs_mount_t *mp; int error; - int commit_flags; int truncate; vp = BHV_TO_VNODE(bdp); @@ -1723,7 +1724,7 @@ xfs_inactive( */ truncate = ((ip->i_d.di_nlink == 0) && ((ip->i_d.di_size != 0) || (ip->i_d.di_nextents > 0)) && - ((ip->i_d.di_mode & IFMT) == IFREG)); + ((ip->i_d.di_mode & S_IFMT) == S_IFREG)); mp = ip->i_mount; @@ -1739,7 +1740,7 @@ xfs_inactive( goto out; if (ip->i_d.di_nlink != 0) { - if ((((ip->i_d.di_mode & IFMT) == IFREG) && + if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && ((ip->i_d.di_size > 0) || (VN_CACHED(vp) > 0)) && (ip->i_df.if_flags & XFS_IFEXTENTS)) && (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)) || @@ -1795,14 +1796,14 @@ xfs_inactive( */ error = xfs_itruncate_finish(&tp, ip, 0, XFS_DATA_FORK, (!(mp->m_flags & XFS_MOUNT_WSYNC) ? 
1 : 0)); - commit_flags = XFS_TRANS_RELEASE_LOG_RES; if (error) { - xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT); + xfs_trans_cancel(tp, + XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); return (VN_INACTIVE_CACHE); } - } else if ((ip->i_d.di_mode & IFMT) == IFLNK) { + } else if ((ip->i_d.di_mode & S_IFMT) == S_IFLNK) { /* * If we get an error while cleaning up a @@ -1819,13 +1820,11 @@ xfs_inactive( xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - commit_flags = XFS_TRANS_RELEASE_LOG_RES; - } else { error = xfs_trans_reserve(tp, 0, XFS_IFREE_LOG_RES(mp), - 0, 0, - XFS_DEFAULT_LOG_COUNT); + 0, XFS_TRANS_PERM_LOG_RES, + XFS_INACTIVE_LOG_COUNT); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_trans_cancel(tp, 0); @@ -1835,7 +1834,6 @@ xfs_inactive( xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - commit_flags = 0; } /* @@ -1846,7 +1844,7 @@ xfs_inactive( * because we can't use it for xfs_attr_inactive(). */ if (ip->i_d.di_anextents > 0) { - error = xfs_inactive_attrs(ip, &tp, &commit_flags); + error = xfs_inactive_attrs(ip, &tp); /* * If we got an error, the transaction is already * cancelled, and the inode is unlocked. Just get out. @@ -1860,7 +1858,8 @@ xfs_inactive( /* * Free the inode. */ - error = xfs_ifree(tp, ip); + XFS_BMAP_INIT(&free_list, &first_block); + error = xfs_ifree(tp, ip, &free_list); if (error) { /* * If we fail to free the inode, shut down. The cancel @@ -1873,7 +1872,7 @@ xfs_inactive( error, mp->m_fsname); xfs_force_shutdown(mp, XFS_METADATA_IO_ERROR); } - xfs_trans_cancel(tp, commit_flags | XFS_TRANS_ABORT); + xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); } else { /* * Credit the quota account(s). The inode is gone. @@ -1884,7 +1883,9 @@ xfs_inactive( * Just ignore errors at this point. There is * nothing we can do except to try to keep going. */ - (void) xfs_trans_commit(tp, commit_flags, NULL); + (void) xfs_bmap_finish(&tp, &free_list, first_block, + &committed); + (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); } /* * Release the dquots held by inode, if any. @@ -3508,7 +3509,7 @@ xfs_symlink( /* * Allocate an inode for the symlink. */ - error = xfs_dir_ialloc(&tp, dp, IFLNK | (vap->va_mode&~IFMT), + error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (vap->va_mode&~S_IFMT), 1, 0, credp, prid, resblks > 0, &ip, NULL); if (error) { if (error == ENOSPC) @@ -3891,7 +3892,7 @@ xfs_reclaim( ASSERT(!VN_MAPPED(vp)); ip = XFS_BHVTOI(bdp); - if ((ip->i_d.di_mode & IFMT) == IFREG) { + if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) { if (ip->i_d.di_size > 0) { /* * Flush and invalidate any data left around that is @@ -4597,7 +4598,7 @@ xfs_change_file_space( xfs_ilock(ip, XFS_ILOCK_SHARED); - if ((error = xfs_iaccess(ip, IWRITE, credp))) { + if ((error = xfs_iaccess(ip, S_IWUSR, credp))) { xfs_iunlock(ip, XFS_ILOCK_SHARED); return error; } @@ -4704,17 +4705,17 @@ xfs_change_file_space( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_ihold(tp, ip); - ip->i_d.di_mode &= ~ISUID; + ip->i_d.di_mode &= ~S_ISUID; /* * Note that we don't have to worry about mandatory * file locking being disabled here because we only - * clear the ISGID bit if the Group execute bit is + * clear the S_ISGID bit if the Group execute bit is * on, but if it was on then mandatory locking wouldn't * have been enabled. 
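The conversion just below (and earlier in the xfs_rw.c hunk) replaces (IEXEC >> 3) with S_IXGRP. The two are the same bit: permission bits come in octal groups of three, so shifting the owner-execute bit right by three positions lands on the group-execute bit:

/* owner bits 0700, group bits 0070, other bits 0007
 * IEXEC == S_IXUSR == 0100 (owner execute)
 * 0100 >> 3 == 0010 == S_IXGRP (group execute) */
assert((S_IXUSR >> 3) == S_IXGRP);      /* userspace-style assert, for illustration only */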
*/ - if (ip->i_d.di_mode & (IEXEC >> 3)) - ip->i_d.di_mode &= ~ISGID; + if (ip->i_d.di_mode & S_IXGRP) + ip->i_d.di_mode &= ~S_ISGID; xfs_ichgtime(ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); diff -prauN linux-2.6.0-test7/fs/xfs/xfsidbg.c wli-2.6.0-test7-bk1-29/fs/xfs/xfsidbg.c --- linux-2.6.0-test7/fs/xfs/xfsidbg.c 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/fs/xfs/xfsidbg.c 2003-10-09 19:27:23.000000000 -0700 @@ -2643,6 +2643,7 @@ xfs_buf_item_print(xfs_buf_log_item_t *b "stale", /* 0x4 */ "logged", /* 0x8 */ "ialloc", /* 0x10 */ + "inode_stale", /* 0x20 */ 0 }; static char *blf_flags[] = { @@ -4811,6 +4812,7 @@ xfsidbg_xnode(xfs_inode_t *ip) "uiosize", /* XFS_IUIOSZ */ "quiesce", /* XFS_IQUIESCE */ "reclaim", /* XFS_IRECLAIM */ + "stale", /* XFS_ISTALE */ NULL }; diff -prauN linux-2.6.0-test7/include/asm-alpha/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-alpha/pgalloc.h --- linux-2.6.0-test7/include/asm-alpha/pgalloc.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-alpha/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -24,9 +24,9 @@ pmd_populate_kernel(struct mm_struct *mm } static inline void -pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } extern pgd_t *pgd_alloc(struct mm_struct *mm); @@ -37,19 +37,29 @@ pgd_free(pgd_t *pgd) free_page((unsigned long)pgd); } -static inline pmd_t * +static inline struct page * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); - if (ret) - clear_page(ret); - return ret; + struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT); + if (page) + clear_highpage(page); + return page; +} + +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) +{ + struct page *page = pmd_alloc_one(mm, addr); + if (page) + return page_address(page); + else + return NULL; } static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - free_page((unsigned long)pmd); + __free_page(pmd); } extern pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr); diff -prauN linux-2.6.0-test7/include/asm-alpha/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-alpha/pgtable.h --- linux-2.6.0-test7/include/asm-alpha/pgtable.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-alpha/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -228,9 +228,11 @@ pmd_page_kernel(pmd_t pmd) #define pmd_page(pmd) (mem_map + ((pmd_val(pmd) & _PFN_MASK) >> 32)) #endif -extern inline unsigned long pgd_page(pgd_t pgd) +extern inline unsigned long __pgd_page(pgd_t pgd) { return PAGE_OFFSET + ((pgd_val(pgd) & _PFN_MASK) >> (32-PAGE_SHIFT)); } +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) + extern inline int pte_none(pte_t pte) { return !pte_val(pte); } extern inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_VALID; } extern inline void pte_clear(pte_t *ptep) { pte_val(*ptep) = 0; } @@ -279,9 +281,15 @@ extern inline pte_t pte_mkyoung(pte_t pt /* Find an entry in the second-level page table.. 
*/ extern inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); + return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PAGE - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * dir, unsigned long address) { diff -prauN linux-2.6.0-test7/include/asm-arm/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-arm/pgalloc.h --- linux-2.6.0-test7/include/asm-arm/pgalloc.h 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-arm/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -17,7 +17,8 @@ /* * Since we have only two-level page tables, these are trivial */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-arm/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-arm/pgtable.h --- linux-2.6.0-test7/include/asm-arm/pgtable.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-arm/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -317,6 +317,11 @@ static inline pte_t *pmd_page_kernel(pmd /* Find an entry in the second-level page table.. */ #define pmd_offset(dir, addr) ((pmd_t *)(dir)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define __pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test7/include/asm-arm26/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-arm26/pgalloc.h --- linux-2.6.0-test7/include/asm-arm26/pgalloc.h 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-arm26/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -55,7 +55,8 @@ pmd_populate_kernel(struct mm_struct *mm * is thrown away. It just cant be zero. -IM */ -#define pmd_alloc_one(mm,addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(pmd) do { } while (0) #define pgd_populate(mm,pmd,pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-arm26/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-arm26/pgtable.h --- linux-2.6.0-test7/include/asm-arm26/pgtable.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-arm26/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -99,7 +99,7 @@ extern struct page *empty_zero_page; * on arm26 we have no 2nd level page table. we simulate this by removing the * PMD. * - * pgd_none is 0 to prevernt pmd_alloc() calling __pmd_alloc(). This causes it + * pgd_none is 0 to prevent pmd_alloc_map() calling __pmd_alloc(). This causes it + to return pmd_offset(pgd,addr) which is a pointer to the pgd (IOW, a no-op).
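On the two-level architectures touched in these hunks the pmd level is folded into the pgd, so the new map/unmap interface must exist but can compile to nothing. A sketch of a generic walk under that folding (the walk itself is editorial; the macros come from the hunks above):

pgd_t *pgd = pgd_offset(mm, addr);
pmd_t *pmd = pmd_offset_map(pgd, addr);   /* here just (pmd_t *)pgd */
pte_t *pte = pte_offset_map(pmd, addr);

/* ... examine or update *pte ... */

pte_unmap(pte);
pmd_unmap(pmd);                           /* expands to do { } while (0) */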
* * however, to work this way, whilst we are allocating 32 pgds, containing 32 @@ -134,7 +134,7 @@ extern struct page *empty_zero_page; #define _PMD_PRESENT (0x01) -/* These definitions allow us to optimise out stuff like pmd_alloc() */ +/* These definitions allow us to optimise out stuff like pmd_alloc_map() */ #define pgd_none(pgd) (0) #define pgd_bad(pgd) (0) #define pgd_present(pgd) (1) @@ -188,6 +188,12 @@ extern struct page *empty_zero_page; #define pte_unmap(pte) do { } while (0) #define pte_unmap_nested(pte) do { } while (0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pgd, addr) do { } while (0) +#define pmd_unmap_nested(pgd, addr) do { } while (0) + #define _PAGE_PRESENT 0x01 #define _PAGE_READONLY 0x02 diff -prauN linux-2.6.0-test7/include/asm-arm26/rmap.h wli-2.6.0-test7-bk1-29/include/asm-arm26/rmap.h --- linux-2.6.0-test7/include/asm-arm26/rmap.h 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-arm26/rmap.h 2003-10-09 19:42:26.000000000 -0700 @@ -14,14 +14,14 @@ static inline void pgtable_add_rmap(struct page *page, struct mm_struct * mm, unsigned long address) { - page->mapping = (void *)mm; + set_page_mapping(page, mm); page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); inc_page_state(nr_page_table_pages); } static inline void pgtable_remove_rmap(struct page *page) { - page->mapping = NULL; + set_page_mapping(page, NULL); page->index = 0; dec_page_state(nr_page_table_pages); } @@ -29,7 +29,7 @@ static inline void pgtable_remove_rmap(s static inline struct mm_struct * ptep_to_mm(pte_t * ptep) { struct page * page = virt_to_page(ptep); - return (struct mm_struct *)page->mapping; + return (struct mm_struct *)page_mapping(page); } /* The page table takes half of the page */ diff -prauN linux-2.6.0-test7/include/asm-cris/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-cris/pgalloc.h --- linux-2.6.0-test7/include/asm-cris/pgalloc.h 2003-10-08 12:24:42.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-cris/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -57,7 +57,8 @@ extern inline void pte_free(struct page * the pgd will always be present.. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-cris/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-cris/pgtable.h --- linux-2.6.0-test7/include/asm-cris/pgtable.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-cris/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -281,6 +281,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
*/ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test7/include/asm-generic/rmap.h wli-2.6.0-test7-bk1-29/include/asm-generic/rmap.h --- linux-2.6.0-test7/include/asm-generic/rmap.h 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-generic/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,90 +0,0 @@ -#ifndef _GENERIC_RMAP_H -#define _GENERIC_RMAP_H -/* - * linux/include/asm-generic/rmap.h - * - * Architecture dependent parts of the reverse mapping code, - * this version should work for most architectures with a - * 'normal' page table layout. - * - * We use the struct page of the page table page to find out - * the process and full address of a page table entry: - * - page->mapping points to the process' mm_struct - * - page->index has the high bits of the address - * - the lower bits of the address are calculated from the - * offset of the page table entry within the page table page - * - * For CONFIG_HIGHPTE, we need to represent the address of a pte in a - * scalar pte_addr_t. The pfn of the pte's page is shifted left by PAGE_SIZE - * bits and is then ORed with the byte offset of the pte within its page. - * - * For CONFIG_HIGHMEM4G, the pte_addr_t is 32 bits. 20 for the pfn, 12 for - * the offset. - * - * For CONFIG_HIGHMEM64G, the pte_addr_t is 64 bits. 52 for the pfn, 12 for - * the offset. - */ -#include - -static inline void pgtable_add_rmap(struct page * page, struct mm_struct * mm, unsigned long address) -{ -#ifdef BROKEN_PPC_PTE_ALLOC_ONE - /* OK, so PPC calls pte_alloc() before mem_map[] is setup ... ;( */ - extern int mem_init_done; - - if (!mem_init_done) - return; -#endif - page->mapping = (void *)mm; - page->index = address & ~((PTRS_PER_PTE * PAGE_SIZE) - 1); - inc_page_state(nr_page_table_pages); -} - -static inline void pgtable_remove_rmap(struct page * page) -{ - page->mapping = NULL; - page->index = 0; - dec_page_state(nr_page_table_pages); -} - -static inline struct mm_struct * ptep_to_mm(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - return (struct mm_struct *) page->mapping; -} - -static inline unsigned long ptep_to_address(pte_t * ptep) -{ - struct page * page = kmap_atomic_to_page(ptep); - unsigned long low_bits; - low_bits = ((unsigned long)ptep & ~PAGE_MASK) * PTRS_PER_PTE; - return page->index + low_bits; -} - -#ifdef CONFIG_HIGHPTE -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - pte_addr_t paddr; - paddr = ((pte_addr_t)page_to_pfn(kmap_atomic_to_page(ptep))) << PAGE_SHIFT; - return paddr + (pte_addr_t)((unsigned long)ptep & ~PAGE_MASK); -} -#else -static inline pte_addr_t ptep_to_paddr(pte_t *ptep) -{ - return (pte_addr_t)ptep; -} -#endif - -#ifndef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - return (pte_t *)pte_paddr; -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - return; -} -#endif - -#endif /* _GENERIC_RMAP_H */ diff -prauN linux-2.6.0-test7/include/asm-h8300/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-h8300/pgtable.h --- linux-2.6.0-test7/include/asm-h8300/pgtable.h 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-h8300/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -18,6 +18,11 @@ typedef pte_t *pte_addr_t; #define pmd_none(pmd) (1) #define pgd_offset_k(adrdress) ((pgd_t *)0) #define pte_offset_kernel(dir, address) ((pte_t *)0) +#define pmd_offset_kernel(a,b) pmd_offset(a,b) +#define pmd_offset_map(a,b) pmd_offset(a,b) +#define pmd_offset_map_nested(a,b) pmd_offset(a,b) 
+#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) /* these mean nothing to NO_MM */ #define PAGE_SHARED __pgprot(0) /* these mean nothing to NO_MM */ diff -prauN linux-2.6.0-test7/include/asm-i386/a.out.h wli-2.6.0-test7-bk1-29/include/asm-i386/a.out.h --- linux-2.6.0-test7/include/asm-i386/a.out.h 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/a.out.h 2003-10-09 19:53:38.000000000 -0700 @@ -19,7 +19,11 @@ struct exec #ifdef __KERNEL__ +#ifdef CONFIG_MMAP_TOPDOWN +#define STACK_TOP (128 << 20) +#else #define STACK_TOP TASK_SIZE +#endif #endif diff -prauN linux-2.6.0-test7/include/asm-i386/highmem.h wli-2.6.0-test7-bk1-29/include/asm-i386/highmem.h --- linux-2.6.0-test7/include/asm-i386/highmem.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/highmem.h 2003-10-09 19:35:27.000000000 -0700 @@ -41,9 +41,9 @@ extern void kmap_init(void); * chunk of RAM. */ #if NR_CPUS <= 32 -#define PKMAP_BASE (0xff800000UL) +#define PKMAP_BASE (0xff400000UL) #else -#define PKMAP_BASE (0xff600000UL) +#define PKMAP_BASE (0xfe800000UL) #endif #ifdef CONFIG_X86_PAE #define LAST_PKMAP 512 @@ -54,14 +54,60 @@ extern void kmap_init(void); #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -extern void * FASTCALL(kmap_high(struct page *page)); -extern void FASTCALL(kunmap_high(struct page *page)); +void *FASTCALL(kmap_high(struct page *page)); +void FASTCALL(kunmap_high(struct page *page)); -void *kmap(struct page *page); -void kunmap(struct page *page); -void *kmap_atomic(struct page *page, enum km_type type); -void kunmap_atomic(void *kvaddr, enum km_type type); -struct page *kmap_atomic_to_page(void *ptr); +void *FASTCALL(__kmap_atomic(struct page *page, enum km_type type, unsigned long vaddr)); + +static inline void *kmap(struct page *page) +{ + might_sleep(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return kmap_high(page); +} + +static inline void kunmap(struct page *page) +{ + BUG_ON(in_interrupt()); + if (page >= highmem_start_page) + kunmap_high(page); +} + +static inline void *kmap_atomic(struct page *page, enum km_type type) +{ + inc_preempt_count(); + if (page < highmem_start_page) + return lowmem_page_address(page); + else + return __kmap_atomic(page, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); +} + +#ifdef CONFIG_DEBUG_HIGHMEM +void FASTCALL(__kunmap_atomic(void *kvaddr, enum km_type type, unsigned long vaddr)); +#else +static inline void __kunmap_atomic(void *kvaddr, enum km_type idx, unsigned long vaddr) +{ +} +#endif + +static inline void kunmap_atomic(void *kvaddr, enum km_type type) +{ + if ((unsigned long)kvaddr >= FIXADDR_START) + __kunmap_atomic(kvaddr, type, __fix_to_virt(FIX_KMAP_BEGIN + type)); + dec_preempt_count(); +} + +static inline struct page *kmap_atomic_to_page(void *vaddr) +{ + if ((unsigned long)vaddr < FIXADDR_START) + return virt_to_page(vaddr); + else { + unsigned long idx = virt_to_fix((unsigned long)vaddr); + return pte_page(*(kmap_pte - (idx - FIX_KMAP_BEGIN))); + } +} #define flush_cache_kmaps() do { } while (0) diff -prauN linux-2.6.0-test7/include/asm-i386/kmap_types.h wli-2.6.0-test7-bk1-29/include/asm-i386/kmap_types.h --- linux-2.6.0-test7/include/asm-i386/kmap_types.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/kmap_types.h 2003-10-09 19:28:46.000000000 -0700 @@ -17,14 +17,16 @@ D(3) KM_USER0, D(4) 
KM_USER1, D(5) KM_BIO_SRC_IRQ, D(6) KM_BIO_DST_IRQ, -D(7) KM_PTE0, -D(8) KM_PTE1, -D(9) KM_PTE2, -D(10) KM_IRQ0, -D(11) KM_IRQ1, -D(12) KM_SOFTIRQ0, -D(13) KM_SOFTIRQ1, -D(14) KM_TYPE_NR +D(7) KM_PMD0, +D(8) KM_PMD1, +D(9) KM_PTE0, +D(10) KM_PTE1, +D(11) KM_PTE2, +D(12) KM_IRQ0, +D(13) KM_IRQ1, +D(14) KM_SOFTIRQ0, +D(15) KM_SOFTIRQ1, +D(16) KM_TYPE_NR }; #undef D diff -prauN linux-2.6.0-test7/include/asm-i386/linkage.h wli-2.6.0-test7-bk1-29/include/asm-i386/linkage.h --- linux-2.6.0-test7/include/asm-i386/linkage.h 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/linkage.h 2003-10-09 19:36:58.000000000 -0700 @@ -3,6 +3,7 @@ #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) #define FASTCALL(x) x __attribute__((regparm(3))) +#define IRQHANDLER(x) x __attribute__((regparm(1))) #ifdef CONFIG_X86_ALIGNMENT_16 #define __ALIGN .align 16,0x90 diff -prauN linux-2.6.0-test7/include/asm-i386/numaq.h wli-2.6.0-test7-bk1-29/include/asm-i386/numaq.h --- linux-2.6.0-test7/include/asm-i386/numaq.h 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/numaq.h 2003-10-09 19:52:21.000000000 -0700 @@ -28,7 +28,8 @@ #ifdef CONFIG_X86_NUMAQ -extern int get_memcfg_numaq(void); +#define MAX_NODE_CPUS 4 +int get_memcfg_numaq(void); /* * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the diff -prauN linux-2.6.0-test7/include/asm-i386/page.h wli-2.6.0-test7-bk1-29/include/asm-i386/page.h --- linux-2.6.0-test7/include/asm-i386/page.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/page.h 2003-10-09 19:36:58.000000000 -0700 @@ -3,7 +3,11 @@ /* PAGE_SHIFT determines the page size */ #define PAGE_SHIFT 12 -#define PAGE_SIZE (1UL << PAGE_SHIFT) +#ifndef __ASSEMBLY__ +#define PAGE_SIZE (1UL << PAGE_SHIFT) +#else +#define PAGE_SIZE (1 << PAGE_SHIFT) +#endif #define PAGE_MASK (~(PAGE_SIZE-1)) #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) diff -prauN linux-2.6.0-test7/include/asm-i386/percpu.h wli-2.6.0-test7-bk1-29/include/asm-i386/percpu.h --- linux-2.6.0-test7/include/asm-i386/percpu.h 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/percpu.h 2003-10-09 19:49:59.000000000 -0700 @@ -3,4 +3,9 @@ #include +#ifdef CONFIG_NUMA +#undef __GENERIC_PER_CPU +void setup_per_cpu_areas(void); +#endif + #endif /* __ARCH_I386_PERCPU__ */ diff -prauN linux-2.6.0-test7/include/asm-i386/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-i386/pgalloc.h --- linux-2.6.0-test7/include/asm-i386/pgalloc.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/pgalloc.h 2003-10-09 19:30:59.000000000 -0700 @@ -31,25 +31,36 @@ static inline void pte_free_kernel(pte_t free_page((unsigned long)pte); } -static inline void pte_free(struct page *pte) -{ - __free_page(pte); -} - - -#define __pte_free_tlb(tlb,pte) tlb_remove_page((tlb),(pte)) - /* * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. * (In the PAE case we free the pmds as part of the pgd.) 
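The PAGE_SIZE split in the asm-i386/page.h hunk above exists because that header is also pulled into assembly sources, and the assembler does not understand C integer suffixes. The pattern, restated with comments:

#ifndef __ASSEMBLY__
#define PAGE_SIZE (1UL << PAGE_SHIFT)   /* C: force unsigned long arithmetic */
#else
#define PAGE_SIZE (1 << PAGE_SHIFT)     /* asm: "1UL" would be a syntax error */
#endif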
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) +#include + +static inline void pte_free(struct page *page) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb_remove_page(tlb, page); + put_cpu(); +} + +static inline void pte_free_tlb(struct mmu_gather *tlb, struct page *page) +{ + tlb_remove_page(tlb, page); +} + +static inline void pmd_free_tlb(struct mmu_gather *tlb, struct page *page) +{ +} + #endif /* _I386_PGALLOC_H */ diff -prauN linux-2.6.0-test7/include/asm-i386/pgtable-2level.h wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable-2level.h --- linux-2.6.0-test7/include/asm-i386/pgtable-2level.h 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable-2level.h 2003-10-09 19:28:46.000000000 -0700 @@ -48,13 +48,15 @@ static inline int pgd_present(pgd_t pgd) #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) pfn_to_page(pgd_val(pgd) >> PAGE_SHIFT) + +#define pmd_offset_map(pgd, addr) ({ (pmd_t *)(pgd); }) +#define pmd_offset_map_nested(pgd, addr) pmd_offset_map(pgd, addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset_map(pgd, addr) + +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) -static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) -{ - return (pmd_t *) dir; -} #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) #define pte_page(x) pfn_to_page(pte_pfn(x)) diff -prauN linux-2.6.0-test7/include/asm-i386/pgtable-3level.h wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable-3level.h --- linux-2.6.0-test7/include/asm-i386/pgtable-3level.h 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable-3level.h 2003-10-09 19:28:46.000000000 -0700 @@ -64,12 +64,32 @@ static inline void set_pte(pte_t *ptep, */ static inline void pgd_clear (pgd_t * pgd) { } -#define pgd_page(pgd) \ -((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +static inline unsigned long pgd_pfn(pgd_t pgd) +{ + return pgd_val(pgd) >> PAGE_SHIFT; +} + +#define pgd_page(pgd) pfn_to_page(pgd_pfn(pgd)) + +#define pmd_offset_kernel(pgd, addr) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) /* Find an entry in the second-level page table.. 
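The defines that follow route pmd access through kmap_atomic when CONFIG_HIGHPMD places pmd pages in highmem, using the KM_PMD0/KM_PMD1 slots added to kmap_types.h above. A usage sketch; pairing and unmap order are the caller's responsibility:

pmd_t *dst, *src;

dst = pmd_offset_map(dst_pgd, addr);          /* maps through KM_PMD0 */
src = pmd_offset_map_nested(src_pgd, addr);   /* second mapping, KM_PMD1 */

/* ... e.g. copy page table entries from *src to *dst ... */

pmd_unmap_nested(src);                        /* release KM_PMD1 first */
pmd_unmap(dst);                               /* then KM_PMD0 */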
*/ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ - pmd_index(address)) +#ifdef CONFIG_HIGHPMD +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)kmap_atomic(pgd_page(*(pgd)), type) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) kunmap_atomic(pmd, type) +#else +#define __pmd_offset(pgd, addr, type) \ + ((pmd_t *)__va(pgd_val(*(pgd)) & PAGE_MASK) + pmd_index(addr)) +#define __pmd_unmap(pmd, type) do { } while (0) +#endif + +#define pmd_offset_map(pgd, addr) __pmd_offset(pgd, addr, KM_PMD0) +#define pmd_offset_map_nested(pgd, addr) __pmd_offset(pgd, addr, KM_PMD1) + +#define pmd_unmap(pmd) __pmd_unmap(pmd, KM_PMD0); +#define pmd_unmap_nested(pmd) __pmd_unmap(pmd, KM_PMD1); static inline pte_t ptep_get_and_clear(pte_t *ptep) { diff -prauN linux-2.6.0-test7/include/asm-i386/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable.h --- linux-2.6.0-test7/include/asm-i386/pgtable.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/pgtable.h 2003-10-09 19:53:38.000000000 -0700 @@ -25,6 +25,10 @@ #include #include +#ifdef CONFIG_MMAP_TOPDOWN +#define HAVE_ARCH_UNMAPPED_AREA +#endif + /* * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. @@ -33,16 +37,17 @@ extern unsigned long empty_zero_page[1024]; extern pgd_t swapper_pg_dir[1024]; extern kmem_cache_t *pgd_cache; -extern kmem_cache_t *pmd_cache; extern spinlock_t pgd_lock; extern struct list_head pgd_list; -void pmd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_ctor(void *, kmem_cache_t *, unsigned long); void pgd_dtor(void *, kmem_cache_t *, unsigned long); void pgtable_cache_init(void); void paging_init(void); +#define HAVE_ARCH_PAGETABLE_CACHE +void shrink_pagetable_cache(int gfp_mask); + #endif /* !__ASSEMBLY__ */ /* diff -prauN linux-2.6.0-test7/include/asm-i386/rmap.h wli-2.6.0-test7-bk1-29/include/asm-i386/rmap.h --- linux-2.6.0-test7/include/asm-i386/rmap.h 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/rmap.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,21 +0,0 @@ -#ifndef _I386_RMAP_H -#define _I386_RMAP_H - -/* nothing to see, move along */ -#include - -#ifdef CONFIG_HIGHPTE -static inline pte_t *rmap_ptep_map(pte_addr_t pte_paddr) -{ - unsigned long pfn = (unsigned long)(pte_paddr >> PAGE_SHIFT); - unsigned long off = ((unsigned long)pte_paddr) & ~PAGE_MASK; - return (pte_t *)((char *)kmap_atomic(pfn_to_page(pfn), KM_PTE2) + off); -} - -static inline void rmap_ptep_unmap(pte_t *pte) -{ - kunmap_atomic(pte, KM_PTE2); -} -#endif - -#endif diff -prauN linux-2.6.0-test7/include/asm-i386/srat.h wli-2.6.0-test7-bk1-29/include/asm-i386/srat.h --- linux-2.6.0-test7/include/asm-i386/srat.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/srat.h 2003-10-09 19:51:50.000000000 -0700 @@ -31,7 +31,8 @@ #error CONFIG_ACPI_SRAT not defined, and srat.h header has been included #endif -extern int get_memcfg_from_srat(void); -extern unsigned long *get_zholes_size(int); +#define MAX_NODE_CPUS 4 +int get_memcfg_from_srat(void); +unsigned long *get_zholes_size(int); #endif /* _ASM_SRAT_H_ */ diff -prauN linux-2.6.0-test7/include/asm-i386/system.h wli-2.6.0-test7-bk1-29/include/asm-i386/system.h --- linux-2.6.0-test7/include/asm-i386/system.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/system.h 2003-10-09 19:30:59.000000000 -0700 @@ -461,6 +461,18 @@ struct alt_instr { /* For spinlocks etc */ #define local_irq_save(x) __asm__ __volatile__("pushfl ; 
popl %0 ; cli":"=g" (x): /* no input */ :"memory") +#ifdef CONFIG_SMP +#define smp_local_irq_save(x) local_irq_save(x) +#define smp_local_irq_restore(x) local_irq_restore(x) +#define smp_local_irq_disable() local_irq_disable() +#define smp_local_irq_enable() local_irq_enable() +#else +#define smp_local_irq_save(x) do { (void)(x); } while (0) +#define smp_local_irq_restore(x) do { (void)(x); } while (0) +#define smp_local_irq_disable() do { } while (0) +#define smp_local_irq_enable() do { } while (0) +#endif /* CONFIG_SMP */ + /* * disable hlt during certain critical i/o operations */ diff -prauN linux-2.6.0-test7/include/asm-i386/thread_info.h wli-2.6.0-test7-bk1-29/include/asm-i386/thread_info.h --- linux-2.6.0-test7/include/asm-i386/thread_info.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/thread_info.h 2003-10-09 19:36:58.000000000 -0700 @@ -9,6 +9,8 @@ #ifdef __KERNEL__ +#include +#include #ifndef __ASSEMBLY__ #include #endif @@ -30,9 +32,11 @@ struct thread_info { __s32 preempt_count; /* 0 => preemptable, <0 => BUG */ mm_segment_t addr_limit; /* thread address space: + 0 for interrupts: illegal 0-0xBFFFFFFF for user-thead 0-0xFFFFFFFF for kernel-thread */ + struct thread_info *irq_stack; /* pointer to cpu irq stack */ struct restart_block restart_block; __u8 supervisor_stack[0]; @@ -48,7 +52,8 @@ struct thread_info { #define TI_CPU 0x00000010 #define TI_PRE_COUNT 0x00000014 #define TI_ADDR_LIMIT 0x00000018 -#define TI_RESTART_BLOCK 0x000001C +#define TI_IRQ_STACK 0x0000001C +#define TI_RESTART_BLOCK 0x00000020 #endif @@ -59,46 +64,60 @@ struct thread_info { * * preempt_count needs to be 1 initially, until the scheduler is functional. */ +#ifdef CONFIG_4K_STACK +#define THREAD_ORDER 0 +#else +#define THREAD_ORDER 1 +#endif +#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define INIT_THREAD_SIZE THREAD_SIZE +#define STACK_WARN (THREAD_SIZE/4) +#define STACK_PANIC (THREAD_SIZE/8) + #ifndef __ASSEMBLY__ -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .exec_domain = &default_exec_domain, \ - .flags = 0, \ - .cpu = 0, \ - .preempt_count = 1, \ - .addr_limit = KERNEL_DS, \ - .restart_block = { \ - .fn = do_no_restart_syscall, \ - }, \ +#define INIT_THREAD_INFO(tsk) \ +{ \ + .task = &tsk, \ + .exec_domain = &default_exec_domain, \ + .flags = 0, \ + .cpu = 0, \ + .preempt_count = 1, \ + .addr_limit = KERNEL_DS, \ + .irq_stack = &init_irq_union.thread_info, \ + .restart_block = { \ + .fn = do_no_restart_syscall, \ + } \ } #define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) +/* thread information allocation */ +#define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) +#define free_thread_info(info) kfree(info) +#define get_thread_info(ti) get_task_struct((ti)->task) +#define put_thread_info(ti) put_task_struct((ti)->task) + /* how to get the thread information struct from C */ static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~8191UL)); + __asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1))); return ti; } -/* thread information allocation */ -#define THREAD_SIZE (2*PAGE_SIZE) -#define alloc_thread_info(task) ((struct thread_info *)kmalloc(THREAD_SIZE, GFP_KERNEL)) -#define free_thread_info(info) kfree(info) -#define get_thread_info(ti) get_task_struct((ti)->task) -#define put_thread_info(ti) put_task_struct((ti)->task) - #else /* !__ASSEMBLY__ */ /* how to get the thread
information struct from ASM */ #define GET_THREAD_INFO(reg) \ - movl $-8192, reg; \ + movl $-THREAD_SIZE, reg; \ andl %esp, reg +/* use this one if reg already contains %esp */ +#define GET_THREAD_INFO_WITH_ESP(reg) \ + andl $-THREAD_SIZE, reg + #endif /* diff -prauN linux-2.6.0-test7/include/asm-i386/tlb.h wli-2.6.0-test7-bk1-29/include/asm-i386/tlb.h --- linux-2.6.0-test7/include/asm-i386/tlb.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-i386/tlb.h 2003-10-09 19:30:59.000000000 -0700 @@ -1,10 +1,58 @@ #ifndef _I386_TLB_H #define _I386_TLB_H +/* + * include/asm-i386/tlb.h + * (C) June 2003 William Irwin, IBM + * Routines for pagetable cacheing and release. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_HIGHPTE +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PTE (GFP_KERNEL|__GFP_REPEAT) +#endif + +#ifdef CONFIG_HIGHPMD +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT|__GFP_HIGHMEM) +#else +#define GFP_PMD (GFP_KERNEL|__GFP_REPEAT) +#endif + +#define PG_PTE PG_arch_1 +#define NR_PTE 128 +#define FREE_PTE_NR NR_PTE +#define NR_NONPTE 512 +#define MAX_ZONE_ID (MAX_NUMNODES * MAX_NR_ZONES) + +#define PagePTE(page) test_bit(PG_PTE, &(page)->flags) +#define SetPagePTE(page) set_bit(PG_PTE, &(page)->flags) +#define ClearPagePTE(page) clear_bit(PG_PTE, &(page)->flags) +#define TestSetPagePTE(page) test_and_set_bit(PG_PTE, &(page)->flags) +#define TestClearPagePTE(page) test_and_clear_bit(PG_PTE, &(page)->flags) +#define PageZoneID(page) ((page)->flags >> ZONE_SHIFT) /* - * x86 doesn't need any special per-pte or - * per-vma handling.. + * vmscan.c does smp_call_function() to shoot down cached pagetables under + * memory pressure. */ +struct mmu_gather { + struct mm_struct *mm; + int nr_pte_active, nr_pte_ready, nr_nonpte, need_flush, fullmm, freed; + struct list_head active_list[MAX_ZONE_ID], ready_list[MAX_ZONE_ID]; + int active_count[MAX_ZONE_ID], ready_count[MAX_ZONE_ID]; + struct page *nonpte[NR_NONPTE]; +}; + +DECLARE_PER_CPU(struct mmu_gather, mmu_gathers); + #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0) @@ -15,6 +63,122 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#include +void tlb_init(void); -#endif +static inline +struct mmu_gather *tlb_gather_mmu(struct mm_struct *mm, unsigned int flush) +{ + struct mmu_gather *tlb = &per_cpu(mmu_gathers, get_cpu()); + tlb->mm = mm; + tlb->fullmm = flush; + tlb->freed = 0; + put_cpu(); + return tlb; +} + +static inline +void tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *pte, unsigned long addr) +{ + tlb->need_flush = 1; +} + +static inline +void tlb_flush_ready(struct mmu_gather *tlb) +{ + int zone; + + for (zone = 0; tlb->nr_pte_ready >= NR_PTE && zone < MAX_ZONE_ID; ++zone) { + struct page *head; + + if (!tlb->ready_count[zone]) + continue; + + head = list_entry(tlb->ready_list[zone].next, struct page, list); + list_del_init(&head->list); + list_splice_init(&tlb->ready_list[zone], &head->list); + head->private = tlb->ready_count[zone]; + tlb->nr_pte_ready -= tlb->ready_count[zone]; + tlb->ready_count[zone] = 0; + free_pages_bulk(zone_table[zone], head, 0); + } +} + +static inline +void tlb_flush_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + int zone; + unsigned long flags; + + if (!tlb->need_flush && tlb->nr_nonpte < NR_NONPTE) + return; + + tlb->need_flush = 0; + tlb_flush(tlb); + + 
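+	/*
+	 * Interrupts must stay off across the list manipulation below:
+	 * vmscan.c's smp_call_function() shootdown (see the comment at the
+	 * top of this file) works on these per-CPU lists from IPI context.
+	 * On UP kernels the smp_local_irq_*() variants compile away.
+	 */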
smp_local_irq_save(flags); + + if (tlb->nr_nonpte) { + free_pages_and_swap_cache(tlb->nonpte, tlb->nr_nonpte); + tlb->nr_nonpte = 0; + } + + for (zone = 0; zone < MAX_ZONE_ID; ++zone) { + if (!tlb->active_count[zone]) + continue; + + list_splice_init(&tlb->active_list[zone], &tlb->ready_list[zone]); + tlb->ready_count[zone] += tlb->active_count[zone]; + tlb->active_count[zone] = 0; + } + tlb->nr_pte_ready += tlb->nr_pte_active; + tlb->nr_pte_active = 0; + if (tlb->nr_pte_ready >= NR_PTE) + tlb_flush_ready(tlb); + + smp_local_irq_restore(flags); +} + +static inline +void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) +{ + if (tlb->mm->rss >= tlb->freed) + tlb->mm->rss -= tlb->freed; + else + tlb->mm->rss = 0; + tlb_flush_mmu(tlb, start, end); +} + +static inline +void tlb_remove_nonpte_page(struct mmu_gather *tlb, struct page *page) +{ + tlb->nonpte[tlb->nr_nonpte] = page; + tlb->nr_nonpte++; + if (tlb->nr_nonpte >= NR_NONPTE) + tlb_flush_mmu(tlb, 0, 0); +} + +static inline +void tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) +{ + int zone = PageZoneID(page); + ClearPagePTE(page); + tlb->nr_pte_active++; + tlb->active_count[zone]++; + list_add(&page->list, &tlb->active_list[zone]); +} + +static inline +void tlb_remove_page(struct mmu_gather *tlb, struct page *page) +{ + unsigned long flags; + + smp_local_irq_save(flags); + tlb->need_flush = 1; + if (PagePTE(page)) + tlb_remove_pte_page(tlb, page); + else + tlb_remove_nonpte_page(tlb, page); + smp_local_irq_restore(flags); +} + +#endif /* _I386_TLB_H */ diff -prauN linux-2.6.0-test7/include/asm-ia64/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-ia64/pgalloc.h --- linux-2.6.0-test7/include/asm-ia64/pgalloc.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ia64/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -71,9 +71,9 @@ pgd_free (pgd_t *pgd) } static inline void -pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, pmd_t *pmd) +pgd_populate (struct mm_struct *mm, pgd_t *pgd_entry, struct page *pmd) { - pgd_val(*pgd_entry) = __pa(pmd); + pgd_val(*pgd_entry) = __pa(page_address(pmd)); } @@ -90,8 +90,8 @@ pmd_alloc_one_fast (struct mm_struct *mm return (pmd_t *)ret; } -static inline pmd_t* -pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t * +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); @@ -100,9 +100,16 @@ pmd_alloc_one (struct mm_struct *mm, uns return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + return pmd ? 
virt_to_page(pmd) : NULL; +} + static inline void -pmd_free (pmd_t *pmd) +pmd_free(struct page *page) { + pmd_t *pmd = page_address(page); *(unsigned long *)pmd = (unsigned long) pmd_quicklist; pmd_quicklist = (unsigned long *) pmd; ++pgtable_cache_size; diff -prauN linux-2.6.0-test7/include/asm-ia64/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-ia64/pgtable.h --- linux-2.6.0-test7/include/asm-ia64/pgtable.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ia64/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -256,7 +256,8 @@ ia64_phys_addr_valid (unsigned long addr #define pgd_bad(pgd) (!ia64_phys_addr_valid(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & _PFN_MASK)) +#define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _PFN_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following have defined behavior only work if pte_present() is true. @@ -325,7 +326,13 @@ pgd_offset (struct mm_struct *mm, unsign /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* * Find an entry in the third-level page table. This looks more complicated than it diff -prauN linux-2.6.0-test7/include/asm-m68k/motorola_pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-m68k/motorola_pgalloc.h --- linux-2.6.0-test7/include/asm-m68k/motorola_pgalloc.h 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-m68k/motorola_pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -63,19 +63,28 @@ static inline void __pte_free_tlb(struct } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { return get_pointer_table(); } -static inline int pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return free_pointer_table(pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; } -static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline int pmd_free(struct page *pmd) { - return free_pointer_table(pmd); + return free_pointer_table(page_address(pmd)); +} + +static inline int __pmd_free_tlb(struct mmu_gather *tlb, struct page *pmd) +{ + return free_pointer_table(page_address(pmd)); } @@ -100,9 +109,9 @@ static inline void pmd_populate(struct m pmd_set(pmd, page_address(page)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_set(pgd, pmd); + pgd_set(pgd, page_address(pmd)); } #endif /* _MOTOROLA_PGALLOC_H */ diff -prauN linux-2.6.0-test7/include/asm-m68k/motorola_pgtable.h wli-2.6.0-test7-bk1-29/include/asm-m68k/motorola_pgtable.h --- linux-2.6.0-test7/include/asm-m68k/motorola_pgtable.h 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-m68k/motorola_pgtable.h 2003-10-09 19:28:46.000000000 -0700
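The ia64 and m68k hunks above show the allocation split that every architecture repeats: pmd_alloc_one_kernel() hands back a kernel-virtual pmd_t * for kernel mappings, while pmd_alloc_one() returns the struct page, so user pmds can be tracked by page and, on i386 with CONFIG_HIGHPMD, kept in highmem. A minimal sketch of the pattern, assuming a generic port whose pagetable pages come straight from the page allocator (the bodies are illustrative, not any one architecture's):

	/* kernel pagetables: always lowmem, so return the virtual address
	 * (real ports also zero the new page and handle failure) */
	static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr)
	{
		return (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
	}

	/* user pagetables: handed around as struct page so they may live anywhere */
	static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
	{
		pmd_t *pmd = pmd_alloc_one_kernel(mm, addr);
		return pmd ? virt_to_page(pmd) : NULL;
	}

Matching this, pgd_populate() now takes the struct page and converts back with page_address() wherever the hardware wants a physical or virtual pointer.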
@@ -116,6 +116,7 @@ extern inline void pgd_set(pgd_t * pgdp, #define __pte_page(pte) ((unsigned long)__va(pte_val(pte) & PAGE_MASK)) #define __pmd_page(pmd) ((unsigned long)__va(pmd_val(pmd) & _TABLE_MASK)) #define __pgd_page(pgd) ((unsigned long)__va(pgd_val(pgd) & _TABLE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) @@ -204,6 +205,12 @@ extern inline pmd_t * pmd_offset(pgd_t * return (pmd_t *)__pgd_page(*dir) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD-1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ extern inline pte_t * pte_offset_kernel(pmd_t * pmdp, unsigned long address) { diff -prauN linux-2.6.0-test7/include/asm-m68k/sun3_pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-m68k/sun3_pgalloc.h --- linux-2.6.0-test7/include/asm-m68k/sun3_pgalloc.h 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-m68k/sun3_pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -18,7 +18,8 @@ extern const char bad_pmd_string[]; -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,address) ({ BUG(); ((pmd_t *)2); }) static inline void pte_free_kernel(pte_t * pte) diff -prauN linux-2.6.0-test7/include/asm-m68knommu/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-m68knommu/pgtable.h --- linux-2.6.0-test7/include/asm-m68knommu/pgtable.h 2003-10-08 12:24:15.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-m68knommu/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -21,7 +21,12 @@ typedef pte_t *pte_addr_t; #define pgd_bad(pgd) (0) #define pgd_clear(pgdp) #define kern_addr_valid(addr) (1) -#define pmd_offset(a, b) ((void *)0) +#define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(a, b) pmd_offset(a, b) +#define pmd_offset_map(a, b) pmd_offset(a, b) +#define pmd_offset_map_nested(a, b) pmd_offset(a, b) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define PAGE_NONE __pgprot(0) #define PAGE_SHARED __pgprot(0) diff -prauN linux-2.6.0-test7/include/asm-mips/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-mips/pgalloc.h --- linux-2.6.0-test7/include/asm-mips/pgalloc.h 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-mips/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -95,7 +95,8 @@ static inline void pte_free(struct page * allocating and freeing a pmd is trivial: the 1-entry pmd is * inside the pgd, so has no extra memory associated with it. 
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #endif diff -prauN linux-2.6.0-test7/include/asm-mips/pgtable-32.h wli-2.6.0-test7-bk1-29/include/asm-mips/pgtable-32.h --- linux-2.6.0-test7/include/asm-mips/pgtable-32.h 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-mips/pgtable-32.h 2003-10-09 19:28:46.000000000 -0700 @@ -180,6 +180,12 @@ static inline pmd_t *pmd_offset(pgd_t *d return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test7/include/asm-mips/pgtable-64.h wli-2.6.0-test7-bk1-29/include/asm-mips/pgtable-64.h --- linux-2.6.0-test7/include/asm-mips/pgtable-64.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-mips/pgtable-64.h 2003-10-09 19:28:46.000000000 -0700 @@ -161,10 +161,16 @@ static inline unsigned long pgd_page(pgd /* Find an entry in the second-level page table.. */ static inline pmd_t *pmd_offset(pgd_t * dir, unsigned long address) { - return (pmd_t *) pgd_page(*dir) + + return (pmd_t *)page_address(pgd_page(*dir)) + ((address >> PMD_SHIFT) & (PTRS_PER_PMD - 1)); } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. 
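 *
 * End to end, a walk through all three levels with the new interface
 * looks like this (an illustrative fragment, not code from this patch):
 *
 *	pgd_t *pgd = pgd_offset(mm, addr);
 *	if (!pgd_none(*pgd)) {
 *		pmd_t *pmd = pmd_offset_map(pgd, addr);
 *		if (pmd_present(*pmd)) {
 *			pte_t *pte = pte_offset_map(pmd, addr);
 *			... use the pte ...
 *			pte_unmap(pte);
 *		}
 *		pmd_unmap(pmd);
 *	}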
*/ #define __pte_offset(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test7/include/asm-parisc/cacheflush.h wli-2.6.0-test7-bk1-29/include/asm-parisc/cacheflush.h --- linux-2.6.0-test7/include/asm-parisc/cacheflush.h 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-parisc/cacheflush.h 2003-10-09 19:42:26.000000000 -0700 @@ -69,7 +69,7 @@ extern void __flush_dcache_page(struct p static inline void flush_dcache_page(struct page *page) { - if (page->mapping && list_empty(&page->mapping->i_mmap) && + if (page_mapping(page) && list_empty(&page_mapping(page)->i_mmap) && list_empty(&page->mapping->i_mmap_shared)) { set_bit(PG_dcache_dirty, &page->flags); } else { diff -prauN linux-2.6.0-test7/include/asm-parisc/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-parisc/pgalloc.h --- linux-2.6.0-test7/include/asm-parisc/pgalloc.h 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-parisc/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -27,12 +27,12 @@ static inline void pgd_free(pgd_t *pgd) /* Three Level Page Table Support for pmd's */ -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PAGE_TABLE + __pa((unsigned long)pmd); + pgd_val(*pgd) = _PAGE_TABLE + __pa(page_address(pmd)); } -static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); if (pmd) @@ -40,9 +40,18 @@ static inline pmd_t *pmd_alloc_one(struc return pmd; } -static inline void pmd_free(pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_page((unsigned long)pmd); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_page(pmd); } #else @@ -54,7 +63,8 @@ static inline void pmd_free(pmd_t *pmd) * inside the pgd, so has no extra memory associated with it. 
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-parisc/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-parisc/pgtable.h --- linux-2.6.0-test7/include/asm-parisc/pgtable.h 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-parisc/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -242,7 +242,8 @@ extern unsigned long *empty_zero_page; #ifdef __LP64__ -#define pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define __pgd_page(pgd) ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* For 64 bit we have three level tables */ @@ -339,11 +340,17 @@ extern inline pte_t pte_modify(pte_t pte #ifdef __LP64__ #define pmd_offset(dir,address) \ -((pmd_t *) pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) +((pmd_t *)__pgd_page(*(dir)) + (((address)>>PMD_SHIFT) & (PTRS_PER_PMD-1))) #else #define pmd_offset(dir,addr) ((pmd_t *) dir) #endif +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.6.0-test7/include/asm-ppc/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-ppc/pgalloc.h --- linux-2.6.0-test7/include/asm-ppc/pgalloc.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ppc/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -14,7 +14,8 @@ extern void pgd_free(pgd_t *pgd); * We don't have any real pmd's, and this code never triggers because * the pgd will always be present.. */ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-ppc/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-ppc/pgtable.h --- linux-2.6.0-test7/include/asm-ppc/pgtable.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ppc/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -426,8 +426,9 @@ static inline int pgd_bad(pgd_t pgd) { static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * The following only work if pte_present() is true. @@ -575,6 +576,12 @@ static inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table..
*/ #define pte_index(address) \ (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) diff -prauN linux-2.6.0-test7/include/asm-ppc64/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-ppc64/pgalloc.h --- linux-2.6.0-test7/include/asm-ppc64/pgalloc.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ppc64/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -26,10 +26,10 @@ pgd_free(pgd_t *pgd) free_page((unsigned long)pgd); } -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static inline pmd_t * -pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { pmd_t *pmd; @@ -39,10 +39,19 @@ pmd_alloc_one(struct mm_struct *mm, unsi return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline void -pmd_free(pmd_t *pmd) +pmd_free(struct page *pmd) { - free_page((unsigned long)pmd); + __free_page(pmd); } #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) diff -prauN linux-2.6.0-test7/include/asm-ppc64/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-ppc64/pgtable.h --- linux-2.6.0-test7/include/asm-ppc64/pgtable.h 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-ppc64/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -212,7 +212,8 @@ int hash_huge_page(struct mm_struct *mm, #define pgd_bad(pgd) ((pgd_val(pgd)) == 0) #define pgd_present(pgd) (pgd_val(pgd) != 0UL) #define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) -#define pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define __pgd_page(pgd) (__bpn_to_ba(pgd_val(pgd))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) /* * Find an entry in a page-table-directory. We combine the address region @@ -225,12 +226,18 @@ int hash_huge_page(struct mm_struct *mm, /* Find an entry in the second-level page table.. */ #define pmd_offset(dir,addr) \ - ((pmd_t *) pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) + ((pmd_t *)__pgd_page(*(dir)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))) /* Find an entry in the third-level page table.. */ #define pte_offset_kernel(dir,addr) \ ((pte_t *) pmd_page_kernel(*(dir)) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_offset_map_nested(dir,addr) pte_offset_kernel((dir), (addr)) #define pte_unmap(pte) do { } while(0) diff -prauN linux-2.6.0-test7/include/asm-s390/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-s390/pgalloc.h --- linux-2.6.0-test7/include/asm-s390/pgalloc.h 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-s390/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -61,12 +61,13 @@ static inline void pgd_free(pgd_t *pgd) * We use pmd cache only on s390x, so these are dummy routines. This * code never triggers because the pgd will always be present. 
*/ -#define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm,address) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm,addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #else /* __s390x__ */ -static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long vmaddr) +static inline pmd_t * pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long vmaddr) { pmd_t *pmd; int i; @@ -79,16 +80,25 @@ static inline pmd_t * pmd_alloc_one(stru return pmd; } -static inline void pmd_free (pmd_t *pmd) +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - free_pages((unsigned long) pmd, 2); + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + +static inline void pmd_free(struct page *pmd) +{ + __free_pages(pmd, 2); } #define __pmd_free_tlb(tlb,pmd) pmd_free(pmd) -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmd) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, struct page *pmd) { - pgd_val(*pgd) = _PGD_ENTRY | __pa(pmd); + pgd_val(*pgd) = _PGD_ENTRY | __pa(page_address(pmd)); } #endif /* __s390x__ */ diff -prauN linux-2.6.0-test7/include/asm-s390/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-s390/pgtable.h --- linux-2.6.0-test7/include/asm-s390/pgtable.h 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-s390/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -612,6 +612,7 @@ static inline pte_t mk_pte_phys(unsigned /* to find an entry in a page-table-directory */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) #define pgd_offset(mm, address) ((mm)->pgd+pgd_index(address)) +#define pgd_page(pgd) virt_to_page(pgd_page_kernel(pgd)) /* to find an entry in a kernel page-table-directory */ #define pgd_offset_k(address) pgd_offset(&init_mm, address) @@ -633,6 +634,12 @@ extern inline pmd_t * pmd_offset(pgd_t * #endif /* __s390x__ */ +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table.. */ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE-1)) #define pte_offset_kernel(pmd, address) \ diff -prauN linux-2.6.0-test7/include/asm-sh/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-sh/pgalloc.h --- linux-2.6.0-test7/include/asm-sh/pgalloc.h 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sh/pgalloc.h 2003-10-09 19:42:26.000000000 -0700 @@ -94,7 +94,8 @@ static inline void pte_free(struct page * inside the pgd, so has no extra memory associated with it. 
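 *
 * These BUG() stubs are unreachable by construction: on a folded
 * two-level table pgd_none() is constant-false, so the generic caller
 * in linux/mm.h short-circuits straight to pmd_offset_map():
 *
 *	pmd = pgd_none(*pgd) ? __pmd_alloc(mm, pgd, addr)
 *	                     : pmd_offset_map(pgd, addr);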
*/ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() @@ -115,8 +116,8 @@ static inline pte_t ptep_get_and_clear(p unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (!page->mapping - || list_empty(&page->mapping->i_mmap_shared)) + if (!page_mapping(page) + || list_empty(&page_mapping(page)->i_mmap_shared)) __clear_bit(PG_mapped, &page->flags); } } diff -prauN linux-2.6.0-test7/include/asm-sh/pgtable-2level.h wli-2.6.0-test7-bk1-29/include/asm-sh/pgtable-2level.h --- linux-2.6.0-test7/include/asm-sh/pgtable-2level.h 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sh/pgtable-2level.h 2003-10-09 19:28:46.000000000 -0700 @@ -48,14 +48,21 @@ static inline void pgd_clear (pgd_t * pg #define set_pmd(pmdptr, pmdval) (*(pmdptr) = pmdval) #define set_pgd(pgdptr, pgdval) (*(pgdptr) = pgdval) -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) { return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + #define pte_pfn(x) ((unsigned long)(((x).pte >> PAGE_SHIFT))) #define pfn_pte(pfn, prot) __pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pfn_pmd(pfn, prot) __pmd(((pfn) << PAGE_SHIFT) | pgprot_val(prot)) diff -prauN linux-2.6.0-test7/include/asm-sparc/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-sparc/pgalloc.h --- linux-2.6.0-test7/include/asm-sparc/pgalloc.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sparc/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -38,15 +38,24 @@ BTFIXUPDEF_CALL(void, free_pgd_fast, pgd BTFIXUPDEF_CALL(void, pgd_set, pgd_t *, pmd_t *) #define pgd_set(pgdp,pmdp) BTFIXUP_CALL(pgd_set)(pgdp,pmdp) -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) -BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) -#define pmd_alloc_one(mm, address) BTFIXUP_CALL(pmd_alloc_one)(mm, address) +BTFIXUPDEF_CALL(pmd_t *, __pmd_alloc_one, struct mm_struct *, unsigned long) +#define pmd_alloc_one_kernel(mm, address) BTFIXUP_CALL(__pmd_alloc_one)(mm, address) + +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define __pmd_free_tlb(tlb, pmd) pmd_free(pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) diff -prauN linux-2.6.0-test7/include/asm-sparc/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-sparc/pgtable.h --- linux-2.6.0-test7/include/asm-sparc/pgtable.h 2003-10-08 12:24:08.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sparc/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ 
-200,10 +200,11 @@ extern unsigned long empty_zero_page; /* */ BTFIXUPDEF_CALL_CONST(struct page *, pmd_page, pmd_t) -BTFIXUPDEF_CALL_CONST(unsigned long, pgd_page, pgd_t) +BTFIXUPDEF_CALL_CONST(unsigned long, __pgd_page, pgd_t) #define pmd_page(pmd) BTFIXUP_CALL(pmd_page)(pmd) -#define pgd_page(pgd) BTFIXUP_CALL(pgd_page)(pgd) +#define __pgd_page(pgd) BTFIXUP_CALL(__pgd_page)(pgd) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) BTFIXUPDEF_SETHI(none_mask) BTFIXUPDEF_CALL_CONST(int, pte_present, pte_t) @@ -350,6 +351,11 @@ extern __inline__ pte_t pte_modify(pte_t /* Find an entry in the second-level page table.. */ BTFIXUPDEF_CALL(pmd_t *, pmd_offset, pgd_t *, unsigned long) #define pmd_offset(dir,addr) BTFIXUP_CALL(pmd_offset)(dir,addr) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ BTFIXUPDEF_CALL(pte_t *, pte_offset_kernel, pmd_t *, unsigned long) diff -prauN linux-2.6.0-test7/include/asm-sparc64/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-sparc64/pgalloc.h --- linux-2.6.0-test7/include/asm-sparc64/pgalloc.h 2003-10-08 12:24:50.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sparc64/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -133,7 +133,7 @@ static __inline__ void free_pgd_slow(pgd #define DCACHE_COLOR(address) 0 #endif -#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, PMD) +#define pgd_populate(MM, PGD, PMD) pgd_set(PGD, page_address(PMD)) static __inline__ pmd_t *pmd_alloc_one_fast(struct mm_struct *mm, unsigned long address) { @@ -154,7 +154,7 @@ static __inline__ pmd_t *pmd_alloc_one_f return (pmd_t *)ret; } -static __inline__ pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long address) +static __inline__ pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pmd_t *pmd; @@ -167,6 +167,15 @@ static __inline__ pmd_t *pmd_alloc_one(s return pmd; } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static __inline__ void free_pmd_fast(pmd_t *pmd) { unsigned long color = DCACHE_COLOR((unsigned long)pmd); @@ -223,7 +232,7 @@ static __inline__ void free_pte_slow(pte #define pte_free_kernel(pte) free_pte_fast(pte) #define pte_free(pte) free_pte_fast(page_address(pte)) -#define pmd_free(pmd) free_pmd_fast(pmd) +#define pmd_free(pmd) free_pmd_fast(page_address(pmd)) #define pgd_free(pgd) free_pgd_fast(pgd) #define pgd_alloc(mm) get_pgd_fast() diff -prauN linux-2.6.0-test7/include/asm-sparc64/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-sparc64/pgtable.h --- linux-2.6.0-test7/include/asm-sparc64/pgtable.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-sparc64/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -227,7 +227,8 @@ static inline pte_t pte_modify(pte_t ori (pgd_val(*(pgdp)) = (__pa((unsigned long) (pmdp)) >> 11UL)) #define __pmd_page(pmd) ((unsigned long) __va((pmd_val(pmd)<<11UL))) #define pmd_page(pmd) virt_to_page((void *)__pmd_page(pmd)) -#define pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define __pgd_page(pgd) ((unsigned long) __va((pgd_val(pgd)<<11UL))) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define pte_none(pte) (!pte_val(pte)) #define pte_present(pte) 
(pte_val(pte) & _PAGE_PRESENT) #define pte_clear(pte) (pte_val(*(pte)) = 0UL) @@ -269,8 +270,13 @@ static inline pte_t pte_modify(pte_t ori #define pgd_offset_k(address) pgd_offset(&init_mm, address) /* Find an entry in the second-level page table.. */ -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ ((address >> PMD_SHIFT) & (REAL_PTRS_PER_PMD-1))) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) /* Find an entry in the third-level page table.. */ #define pte_index(dir, address) ((pte_t *) __pmd_page(*(dir)) + \ diff -prauN linux-2.6.0-test7/include/asm-um/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-um/pgalloc.h --- linux-2.6.0-test7/include/asm-um/pgalloc.h 2003-10-08 12:24:33.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-um/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -42,7 +42,8 @@ static inline void pte_free(struct page * inside the pgd, so has no extra memory associated with it. */ -#define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) +#define pmd_alloc_one(mm, addr) ({ BUG(); ((struct page *)2); }) +#define pmd_alloc_one_kernel(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(x) do { } while (0) #define __pmd_free_tlb(tlb,x) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() diff -prauN linux-2.6.0-test7/include/asm-um/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-um/pgtable.h --- linux-2.6.0-test7/include/asm-um/pgtable.h 2003-10-08 12:24:53.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-um/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -373,6 +373,12 @@ static inline pmd_t * pmd_offset(pgd_t * return (pmd_t *) dir; } +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) + /* Find an entry in the third-level page table..
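 *
 * The pte_index() arithmetic that follows is plain mask-and-shift; for
 * example, assuming 4K pages and 1024 pte slots per page, address
 * 0xC0105234 selects slot (0xC0105234 >> 12) & 1023 = 0x105.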
*/ #define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) #define pte_offset_kernel(dir, address) \ diff -prauN linux-2.6.0-test7/include/asm-v850/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-v850/pgtable.h --- linux-2.6.0-test7/include/asm-v850/pgtable.h 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-v850/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -13,6 +13,11 @@ typedef pte_t *pte_addr_t; #define pgd_clear(pgdp) ((void)0) #define pmd_offset(a, b) ((void *)0) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) do { } while (0) #define kern_addr_valid(addr) (1) diff -prauN linux-2.6.0-test7/include/asm-x86_64/pgalloc.h wli-2.6.0-test7-bk1-29/include/asm-x86_64/pgalloc.h --- linux-2.6.0-test7/include/asm-x86_64/pgalloc.h 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-x86_64/pgalloc.h 2003-10-09 19:28:46.000000000 -0700 @@ -10,7 +10,7 @@ #define pmd_populate_kernel(mm, pmd, pte) \ set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte))) #define pgd_populate(mm, pgd, pmd) \ - set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pmd))) + set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(page_address(pmd)))) static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { @@ -22,18 +22,25 @@ extern __inline__ pmd_t *get_pmd(void) return (pmd_t *)get_zeroed_page(GFP_KERNEL); } -extern __inline__ void pmd_free(pmd_t *pmd) +extern __inline__ void pmd_free(struct page *pmd) { - if ((unsigned long)pmd & (PAGE_SIZE-1)) - BUG(); - free_page((unsigned long)pmd); + __free_page(pmd); } -static inline pmd_t *pmd_alloc_one (struct mm_struct *mm, unsigned long addr) +static inline pmd_t *pmd_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); } +static inline struct page *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) +{ + pmd_t *pmd = pmd_alloc_one_kernel(mm, addr); + if (pmd) + return virt_to_page(pmd); + else + return NULL; +} + static inline pgd_t *pgd_alloc (struct mm_struct *mm) { return (pgd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); diff -prauN linux-2.6.0-test7/include/asm-x86_64/pgtable.h wli-2.6.0-test7-bk1-29/include/asm-x86_64/pgtable.h --- linux-2.6.0-test7/include/asm-x86_64/pgtable.h 2003-10-08 12:24:26.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-x86_64/pgtable.h 2003-10-09 19:28:46.000000000 -0700 @@ -98,8 +98,9 @@ static inline void set_pml4(pml4_t *dst, pml4_val(*dst) = pml4_val(val); } -#define pgd_page(pgd) \ +#define __pgd_page(pgd) \ ((unsigned long) __va(pgd_val(pgd) & PHYSICAL_PAGE_MASK)) +#define pgd_page(pgd) virt_to_page(__pgd_page(pgd)) #define ptep_get_and_clear(xp) __pte(xchg(&(xp)->pte, 0)) #define pte_same(a, b) ((a).pte == (b).pte) @@ -331,8 +332,13 @@ static inline pgd_t *current_pgd_offset_ #define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define pmd_offset(dir, address) ((pmd_t *) pgd_page(*(dir)) + \ +#define pmd_offset(dir, address) ((pmd_t *)__pgd_page(*(dir)) + \ pmd_index(address)) +#define pmd_offset_kernel(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map(pgd, addr) pmd_offset(pgd, addr) +#define pmd_offset_map_nested(pgd, addr) pmd_offset(pgd, addr) +#define pmd_unmap(pmd) do { } while (0) +#define pmd_unmap_nested(pmd) 
do { } while (0) #define pmd_none(x) (!pmd_val(x)) #define pmd_present(x) (pmd_val(x) & _PAGE_PRESENT) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) diff -prauN linux-2.6.0-test7/include/asm-x86_64/proto.h wli-2.6.0-test7-bk1-29/include/asm-x86_64/proto.h --- linux-2.6.0-test7/include/asm-x86_64/proto.h 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/asm-x86_64/proto.h 2003-10-09 19:27:23.000000000 -0700 @@ -70,7 +70,9 @@ extern void show_regs(struct pt_regs * r extern int map_syscall32(struct mm_struct *mm, unsigned long address); extern char *syscall32_page; -void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); +extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); + +extern void check_ioapic(void); extern unsigned long max_mapnr; extern unsigned long end_pfn; @@ -81,6 +83,10 @@ extern int force_iommu, no_iommu; extern int using_apic_timer; extern int disable_apic; extern unsigned cpu_khz; +extern int ioapic_force; +extern int skip_ioapic_setup; +extern int acpi_ht; +extern int acpi_disabled; extern int fallback_aper_order; extern int fallback_aper_force; diff -prauN linux-2.6.0-test7/include/linux/device.h wli-2.6.0-test7-bk1-29/include/linux/device.h --- linux-2.6.0-test7/include/linux/device.h 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/device.h 2003-10-09 19:27:23.000000000 -0700 @@ -254,6 +254,7 @@ struct device { struct list_head children; struct device * parent; + struct completion * complete; /* Notification for freeing device. */ struct kobject kobj; char bus_id[BUS_ID_SIZE]; /* position on parent bus */ @@ -301,6 +302,7 @@ dev_set_drvdata (struct device *dev, voi */ extern int device_register(struct device * dev); extern void device_unregister(struct device * dev); +extern void device_unregister_wait(struct device * dev); extern void device_initialize(struct device * dev); extern int device_add(struct device * dev); extern void device_del(struct device * dev); diff -prauN linux-2.6.0-test7/include/linux/fs.h wli-2.6.0-test7-bk1-29/include/linux/fs.h --- linux-2.6.0-test7/include/linux/fs.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/fs.h 2003-10-09 19:40:04.000000000 -0700 @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include struct iovec; @@ -315,11 +317,29 @@ struct address_space_operations { loff_t offset, unsigned long nr_segs); }; +#if NR_CPUS > 8 +typedef rwlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) read_lock(lock) +#define mapping_rdunlock(lock) read_unlock(lock) +#define mapping_wrlock(lock) write_lock(lock) +#define mapping_wrunlock(lock) write_unlock(lock) +#define mapping_rwlock_init(lock) rwlock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED RW_LOCK_UNLOCKED +#else +typedef spinlock_t mapping_rwlock_t; +#define mapping_rdlock(lock) spin_lock(lock) +#define mapping_rdunlock(lock) spin_unlock(lock) +#define mapping_wrlock(lock) spin_lock(lock) +#define mapping_wrunlock(lock) spin_unlock(lock) +#define mapping_rwlock_init(lock) spin_lock_init(lock) +#define MAPPING_RW_LOCK_UNLOCKED SPIN_LOCK_UNLOCKED +#endif + struct backing_dev_info; struct address_space { struct inode *host; /* owner: inode, block_device */ struct radix_tree_root page_tree; /* radix tree of all pages */ - spinlock_t page_lock; /* and spinlock protecting it */ + mapping_rwlock_t page_lock; /* and spinlock protecting it */ struct list_head clean_pages; /* list of clean pages */ struct list_head dirty_pages; /* list 
of dirty pages */ struct list_head locked_pages; /* list of locked pages */ @@ -328,7 +348,7 @@ struct address_space { struct address_space_operations *a_ops; /* methods */ struct list_head i_mmap; /* list of private mappings */ struct list_head i_mmap_shared; /* list of shared mappings */ - struct semaphore i_shared_sem; /* protect both above lists */ + spinlock_t i_shared_lock; /* protect both above lists */ atomic_t truncate_count; /* Cover race condition with truncate */ unsigned long dirtied_when; /* jiffies of first page dirtying */ unsigned long flags; /* error bits/gfp mask */ diff -prauN linux-2.6.0-test7/include/linux/gfp.h wli-2.6.0-test7-bk1-29/include/linux/gfp.h --- linux-2.6.0-test7/include/linux/gfp.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/gfp.h 2003-10-09 19:30:59.000000000 -0700 @@ -79,6 +79,7 @@ static inline struct page * alloc_pages_ extern unsigned long FASTCALL(__get_free_pages(unsigned int gfp_mask, unsigned int order)); extern unsigned long FASTCALL(get_zeroed_page(unsigned int gfp_mask)); +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask),0) diff -prauN linux-2.6.0-test7/include/linux/hugetlb.h wli-2.6.0-test7-bk1-29/include/linux/hugetlb.h --- linux-2.6.0-test7/include/linux/hugetlb.h 2003-10-08 12:24:05.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/hugetlb.h 2003-10-09 19:34:30.000000000 -0700 @@ -41,6 +41,11 @@ mark_mm_hugetlb(struct mm_struct *mm, st #define is_hugepage_only_range(addr, len) 0 #endif +#define vm_account_huge_inc(vma, pte, addr) \ + vm_account(vma, pte, addr, HPAGE_SIZE/PAGE_SIZE) +#define vm_account_huge_dec(vma, pte, addr) \ + vm_account(vma, pte, addr, -(HPAGE_SIZE/PAGE_SIZE)) + #else /* !CONFIG_HUGETLB_PAGE */ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) diff -prauN linux-2.6.0-test7/include/linux/init_task.h wli-2.6.0-test7-bk1-29/include/linux/init_task.h --- linux-2.6.0-test7/include/linux/init_task.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/init_task.h 2003-10-09 19:58:29.000000000 -0700 @@ -75,7 +75,12 @@ .active_mm = &init_mm, \ .run_list = LIST_HEAD_INIT(tsk.run_list), \ .time_slice = HZ, \ - .tasks = LIST_HEAD_INIT(tsk.tasks), \ + .tasks = { \ + .rb_parent = NULL, \ + .rb_left = NULL, \ + .rb_right = NULL, \ + .rb_color = RB_BLACK, \ + }, \ .ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \ .ptrace_list = LIST_HEAD_INIT(tsk.ptrace_list), \ .real_parent = &tsk, \ diff -prauN linux-2.6.0-test7/include/linux/ioport.h wli-2.6.0-test7-bk1-29/include/linux/ioport.h --- linux-2.6.0-test7/include/linux/ioport.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/ioport.h 2003-10-09 19:27:23.000000000 -0700 @@ -90,6 +90,7 @@ extern int get_resource_list(struct reso extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); +extern int insert_resource(struct resource *parent, struct resource *new); extern int allocate_resource(struct resource *root, struct resource *new, unsigned long size, unsigned long min, unsigned long max, diff -prauN linux-2.6.0-test7/include/linux/mm.h wli-2.6.0-test7-bk1-29/include/linux/mm.h --- linux-2.6.0-test7/include/linux/mm.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/mm.h 2003-10-09 19:54:33.000000000 -0700 @@ -12,6 +12,7 @@ #include #include #include +#include #ifndef 
CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -77,6 +78,7 @@ struct vm_area_struct { units, *not* PAGE_CACHE_SIZE */ struct file * vm_file; /* File we map to (can be NULL). */ void * vm_private_data; /* was vm_pte (shared mem) */ + struct rcu_head rcu; }; /* @@ -111,6 +113,7 @@ struct vm_area_struct { #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#define VM_DEAD 0x01000000 /* vma is dead, don't touch */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS @@ -147,8 +150,6 @@ struct vm_operations_struct { int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock); }; -/* forward declaration; pte_chain is meant to be internal to rmap.c */ -struct pte_chain; struct mmu_gather; struct inode; @@ -172,15 +173,12 @@ struct page { updated asynchronously */ atomic_t count; /* Usage count, see below. */ struct list_head list; /* ->mapping has some page lists. */ - struct address_space *mapping; /* The inode (or ...) we belong to. */ + unsigned long __mapping; /* The inode (or ...) we belong to. */ unsigned long index; /* Our offset within mapping. */ struct list_head lru; /* Pageout list, eg. active_list; protected by zone->lru_lock !! */ - union { - struct pte_chain *chain;/* Reverse pte mapping pointer. - * protected by PG_chainlock */ - pte_addr_t direct; - } pte; + atomic_t mapcount; + struct rmap_chain *chain; unsigned long private; /* mapping-private opaque data */ /* @@ -375,13 +373,41 @@ void page_address_init(void); #endif /* + * On an anonymous page mapped into a user virtual memory area, + * page->mapping points to its anonmm, not to a struct address_space. + * + * Please note that, confusingly, page_mapping() refers to the inode + * struct address_space which maps the page from disk, where page_mapped() + * refers to whether it's mapped into a user virtual address space. + */ +static inline struct address_space *page_mapping(struct page *page) +{ + if (PageAnon(page)) + return NULL; + else + return (struct address_space *)page->__mapping; +} + +struct anon; +static inline struct anon *page_anon(struct page *page) +{ + BUG_ON(!PageAnon(page)); + return (struct anon *)page->__mapping; +} + +static inline void set_page_mapping(struct page *page, void *ptr) +{ + page->__mapping = (unsigned long)ptr; +} + +/* * Return true if this page is mapped into pagetables: ->mapcount counts * pagetable references to the page, so the old subtlety of testing * pte.direct rather than pte.chain no longer applies.
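 *
 * Combined with page_mapping()/PageAnon() above, the usual discrimination
 * looks like this (an illustrative fragment, not code from this patch):
 *
 *	if (page_mapped(page)) {		/* some pagetable references it */
 *		if (PageAnon(page))
 *			... page_anon(page) ...		/* __mapping is an anonmm */
 *		else if (page_mapping(page))
 *			... page_mapping(page) ...	/* file-backed address_space */
 *	}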
*/ static inline int page_mapped(struct page *page) { - return page->pte.direct != 0; + return atomic_read(&page->mapcount) != 0; } /* @@ -428,8 +454,9 @@ extern void invalidate_mmap_range(struct loff_t const holelen); extern int vmtruncate(struct inode * inode, loff_t offset); extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)); +pmd_t *FASTCALL(__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pmd, unsigned long address)); extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); -extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)); +pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long address)); extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot); extern int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot); extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access); @@ -437,16 +464,19 @@ extern int make_pages_present(unsigned l extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); extern long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock); extern long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice); -void put_dirty_page(struct task_struct *tsk, struct page *page, - unsigned long address, pgprot_t prot); +void put_dirty_page(task_t *task, struct vm_area_struct *vma, + struct page *page, unsigned long address, pgprot_t prot); int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); -int __set_page_dirty_buffers(struct page *page); -int __set_page_dirty_nobuffers(struct page *page); +int set_page_dirty(struct page *page); +int set_page_dirty_buffers(struct page *page); +int set_page_dirty_nobuffers(struct page *page); int set_page_dirty_lock(struct page *page); +void free_vma(struct vm_area_struct *); + /* * Prototype to add a shrinker callback for ageable caches. * @@ -471,33 +501,15 @@ extern struct shrinker *set_shrinker(int extern void remove_shrinker(struct shrinker *shrinker); /* - * If the mapping doesn't provide a set_page_dirty a_op, then - * just fall through and assume that it wants buffer_heads. - * FIXME: make the method unconditional. - */ -static inline int set_page_dirty(struct page *page) -{ - if (page->mapping) { - int (*spd)(struct page *); - - spd = page->mapping->a_ops->set_page_dirty; - if (spd) - return (*spd)(page); - } - return __set_page_dirty_buffers(page); -} - -/* * On a two-level page table, this ends up being trivial. Thus the * inlining and the symmetry break with pte_alloc_map() that does all * of this out-of-line. */ -static inline pmd_t *pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) -{ - if (pgd_none(*pgd)) - return __pmd_alloc(mm, pgd, address); - return pmd_offset(pgd, address); -} +#define pmd_alloc_map(mm, pgd, addr) \ + (pgd_none(*(pgd))? __pmd_alloc(mm,pgd,addr): pmd_offset_map(pgd,addr)) + +#define pmd_alloc_kernel(mm, pgd, addr) \ + (pgd_none(*(pgd))? 
__pmd_alloc_kernel(mm,pgd,addr): pmd_offset_kernel(pgd,addr)) extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap, @@ -621,6 +633,75 @@ kernel_map_pages(struct page *page, int { } #endif + + static inline void vm_account(struct vm_area_struct *vma, pte_t pte, + unsigned long addr, long adjustment) + { + struct mm_struct *mm = vma->vm_mm; + unsigned long pfn; + struct page *page; + + if (!pte_present(pte)) + return; + + pfn = pte_pfn(pte); + if (!pfn_valid(pfn)) + goto out; + + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto out; + + if (vma->vm_flags & VM_EXECUTABLE) + mm->text += adjustment; + else if (vma->vm_flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN))) { + mm->data += adjustment; + mm->stack += adjustment; + } else if (addr >= TASK_UNMAPPED_BASE) + mm->lib += adjustment; + else + mm->data += adjustment; + + if (page_mapping(page)) + mm->shared += adjustment; + + out: + if (pte_write(pte)) + mm->dirty += adjustment; + } + + #define vm_account_inc(vma, pte, addr) vm_account(vma, pte, addr, +1) + #define vm_account_dec(vma, pte, addr) vm_account(vma, pte, addr, -1) + + static inline void vm_ptep_set_wrprotect(struct mm_struct *mm, pte_t *pte) + { + if (pte_write(*pte)) + mm->dirty--; + ptep_set_wrprotect(pte); + } + + static inline void vm_set_pte(struct vm_area_struct *vma, pte_t *dst, + pte_t val, unsigned long addr) + { + vm_account_inc(vma, val, addr); + set_pte(dst, val); + } + + static inline pte_t vm_ptep_get_and_clear(struct vm_area_struct *vma, + pte_t *pte, unsigned long addr) + { + pte_t val = ptep_get_and_clear(pte); + vm_account_dec(vma, val, addr); + return val; + } + + static inline void vm_pte_clear(struct vm_area_struct *vma, pte_t *pte, + unsigned long addr) + { + pte_t val = *pte; + pte_clear(pte); + vm_account_dec(vma, val, addr); + } #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff -prauN linux-2.6.0-test7/include/linux/mmzone.h wli-2.6.0-test7-bk1-29/include/linux/mmzone.h --- linux-2.6.0-test7/include/linux/mmzone.h 2003-10-08 12:24:08.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/mmzone.h 2003-10-09 19:30:08.000000000 -0700 @@ -21,8 +21,8 @@ #endif struct free_area { - struct list_head free_list; - unsigned long *map; + struct list_head free_list, deferred_pages; + unsigned long *map, globally_free, active, locally_free; }; struct pglist_data; diff -prauN linux-2.6.0-test7/include/linux/nfs_fs.h wli-2.6.0-test7-bk1-29/include/linux/nfs_fs.h --- linux-2.6.0-test7/include/linux/nfs_fs.h 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/nfs_fs.h 2003-10-09 19:27:23.000000000 -0700 @@ -264,7 +264,7 @@ nfs_file_cred(struct file *file) /* * linux/fs/nfs/direct.c */ -extern int nfs_direct_IO(int, struct file *, const struct iovec *, loff_t, +extern int nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, unsigned long); /* diff -prauN linux-2.6.0-test7/include/linux/nfs_page.h wli-2.6.0-test7-bk1-29/include/linux/nfs_page.h --- linux-2.6.0-test7/include/linux/nfs_page.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/nfs_page.h 2003-10-09 19:27:23.000000000 -0700 @@ -47,7 +47,6 @@ extern struct nfs_page *nfs_create_reque unsigned int, unsigned int); extern void nfs_clear_request(struct nfs_page *req); extern void nfs_release_request(struct nfs_page *req); -extern void nfs_release_list(struct list_head *list); extern void nfs_list_add_request(struct nfs_page *, struct list_head *); @@ -57,7 
+56,6 @@ extern int nfs_scan_list(struct list_hea extern int nfs_coalesce_requests(struct list_head *, struct list_head *, unsigned int); extern int nfs_wait_on_request(struct nfs_page *); -extern int nfs_wait_for_reads(struct list_head *); extern spinlock_t nfs_wreq_lock; diff -prauN linux-2.6.0-test7/include/linux/nfs_xdr.h wli-2.6.0-test7-bk1-29/include/linux/nfs_xdr.h --- linux-2.6.0-test7/include/linux/nfs_xdr.h 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/nfs_xdr.h 2003-10-09 19:27:23.000000000 -0700 @@ -639,8 +639,7 @@ struct nfs_rpc_ops { int (*readlink)(struct inode *, struct page *); int (*read) (struct nfs_read_data *, struct file *); int (*write) (struct nfs_write_data *, struct file *); - int (*commit) (struct inode *, struct nfs_fattr *, - unsigned long, unsigned int); + int (*commit) (struct nfs_write_data *, struct file *); struct inode * (*create) (struct inode *, struct qstr *, struct iattr *, int); int (*remove) (struct inode *, struct qstr *); diff -prauN linux-2.6.0-test7/include/linux/page-flags.h wli-2.6.0-test7-bk1-29/include/linux/page-flags.h --- linux-2.6.0-test7/include/linux/page-flags.h 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/page-flags.h 2003-10-09 19:54:33.000000000 -0700 @@ -69,12 +69,13 @@ #define PG_private 12 /* Has something at ->private */ #define PG_writeback 13 /* Page is under writeback */ #define PG_nosave 14 /* Used for system suspend/resume */ -#define PG_chainlock 15 /* lock bit for ->pte_chain */ +#define PG_rmaplock 15 /* lock bit for ->pte_chain */ -#define PG_direct 16 /* ->pte_chain points directly at pte */ -#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */ -#define PG_reclaim 18 /* To be reclaimed asap */ -#define PG_compound 19 /* Part of a compound page */ +#define PG_mappedtodisk 16 /* Has blocks allocated on-disk */ +#define PG_reclaim 17 /* To be reclaimed asap */ +#define PG_compound 18 /* Part of a compound page */ +#define PG_anon 19 /* Anonymous page */ +#define PG_swapcache 20 /* Swap page; swp_entry_t in ->private */ /* @@ -87,6 +88,7 @@ struct page_state { unsigned long nr_unstable; /* NFS unstable pages */ unsigned long nr_page_table_pages;/* Pages used for pagetables */ unsigned long nr_mapped; /* mapped into pagetables */ + unsigned long nr_swapcache; /* in swapcache */ unsigned long nr_slab; /* In slab */ #define GET_PAGE_STATE_LAST nr_slab @@ -250,12 +252,6 @@ extern void get_full_page_state(struct p #define ClearPageNosave(page) clear_bit(PG_nosave, &(page)->flags) #define TestClearPageNosave(page) test_and_clear_bit(PG_nosave, &(page)->flags) -#define PageDirect(page) test_bit(PG_direct, &(page)->flags) -#define SetPageDirect(page) set_bit(PG_direct, &(page)->flags) -#define TestSetPageDirect(page) test_and_set_bit(PG_direct, &(page)->flags) -#define ClearPageDirect(page) clear_bit(PG_direct, &(page)->flags) -#define TestClearPageDirect(page) test_and_clear_bit(PG_direct, &(page)->flags) - #define PageMappedToDisk(page) test_bit(PG_mappedtodisk, &(page)->flags) #define SetPageMappedToDisk(page) set_bit(PG_mappedtodisk, &(page)->flags) #define ClearPageMappedToDisk(page) clear_bit(PG_mappedtodisk, &(page)->flags) @@ -269,15 +265,16 @@ extern void get_full_page_state(struct p #define SetPageCompound(page) set_bit(PG_compound, &(page)->flags) #define ClearPageCompound(page) clear_bit(PG_compound, &(page)->flags) -/* - * The PageSwapCache predicate doesn't use a PG_flag at this time, - * but it may again do so one day. 
- */ +#define PageAnon(page) test_bit(PG_anon, &(page)->flags) +#define SetPageAnon(page) set_bit(PG_anon, &(page)->flags) +#define ClearPageAnon(page) clear_bit(PG_anon, &(page)->flags) + #ifdef CONFIG_SWAP -extern struct address_space swapper_space; -#define PageSwapCache(page) ((page)->mapping == &swapper_space) +#define PageSwapCache(page) test_bit(PG_swapcache, &(page)->flags) +#define SetPageSwapCache(page) set_bit(PG_swapcache, &(page)->flags) +#define ClearPageSwapCache(page) clear_bit(PG_swapcache, &(page)->flags) #else -#define PageSwapCache(page) 0 +#define PageSwapCache(page) 0 #endif struct page; /* forward declaration */ diff -prauN linux-2.6.0-test7/include/linux/pagemap.h wli-2.6.0-test7-bk1-29/include/linux/pagemap.h --- linux-2.6.0-test7/include/linux/pagemap.h 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/pagemap.h 2003-10-09 19:42:26.000000000 -0700 @@ -139,17 +139,6 @@ static inline unsigned long get_page_cac return atomic_read(&nr_pagecache); } -static inline void ___add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long index) -{ - list_add(&page->list, &mapping->clean_pages); - page->mapping = mapping; - page->index = index; - - mapping->nrpages++; - pagecache_acct(1); -} - extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); diff -prauN linux-2.6.0-test7/include/linux/pagevec.h wli-2.6.0-test7-bk1-29/include/linux/pagevec.h --- linux-2.6.0-test7/include/linux/pagevec.h 2003-10-08 12:24:51.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/pagevec.h 2003-10-09 19:56:01.000000000 -0700 @@ -4,8 +4,15 @@ * In many places it is efficient to batch an operation up against multiple * pages. A pagevec is a multipage container which is used for that. */ +#include -#define PAGEVEC_SIZE 16 +#define __MIN_PVEC_SIZE 16 +#define __MAX_PVEC_SIZE 1024 +#define __PVEC_MIN(x,y) ((x) < (y) ? (x) : (y)) +#define __PVEC_MAX(x,y) ((x) > (y) ? (x) : (y)) +#define __PVEC_SIZE (4*NR_CPUS) +#define __PAGEVEC_SIZE __PVEC_MIN(__PVEC_SIZE, __MAX_PVEC_SIZE) +#define PAGEVEC_SIZE __PVEC_MAX(__PAGEVEC_SIZE, __MIN_PVEC_SIZE) struct page; struct address_space; diff -prauN linux-2.6.0-test7/include/linux/pid.h wli-2.6.0-test7-bk1-29/include/linux/pid.h --- linux-2.6.0-test7/include/linux/pid.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/pid.h 2003-10-09 20:00:01.000000000 -0700 @@ -14,51 +14,76 @@ struct pid { int nr; atomic_t count; - struct task_struct *task; - struct list_head task_list; + task_t *task; + struct rb_root task_list; struct list_head hash_chain; }; struct pid_link { - struct list_head pid_chain; + struct rb_node pid_chain; struct pid *pidptr; struct pid pid; }; #define pid_task(elem, type) \ - list_entry(elem, struct task_struct, pids[type].pid_chain) + rb_entry(elem, task_t, pids[type].pid_chain) /* * attach_pid() and link_pid() must be called with the tasklist_lock * write-held. */ -extern int FASTCALL(attach_pid(struct task_struct *task, enum pid_type type, int nr)); +int FASTCALL(attach_pid(task_t *task, enum pid_type type, int nr)); -extern void FASTCALL(link_pid(struct task_struct *task, struct pid_link *link, struct pid *pid)); +void FASTCALL(link_pid(task_t *task, struct pid *pid, enum pid_type type)); /* * detach_pid() must be called with the tasklist_lock write-held. 
*/ -extern void FASTCALL(detach_pid(struct task_struct *task, enum pid_type)); +void FASTCALL(detach_pid(task_t *task, enum pid_type)); /* * look up a PID in the hash table. Must be called with the tasklist_lock * held. */ -extern struct pid *FASTCALL(find_pid(enum pid_type, int)); +struct pid *FASTCALL(find_pid(enum pid_type, int)); +int find_next_tgid(int); -extern int alloc_pidmap(void); -extern void FASTCALL(free_pidmap(int)); -extern void switch_exec_pids(struct task_struct *leader, struct task_struct *thread); +#define PROC_MAXPIDS 32 +int find_tgids_after(int tgid, int tgids[PROC_MAXPIDS]); +int find_tids_after(int tgid, int tid, int tids[PROC_MAXPIDS]); + +int alloc_pidmap(void); +void FASTCALL(free_pidmap(int)); +void switch_exec_pids(task_t *leader, task_t *thread); +void insert_task_list(task_t *); +void remove_task_list(task_t *); +task_t *first_task(void); + +#define __first_task_pid(pid, type) \ + pid_task(rb_first(&(pid)->task_list), type) +#define first_task_pid(task, type) \ + __first_task_pid((task)->pids[type].pidptr, type) +#define next_task_pid(task, type) \ +({ \ + struct rb_node *__node = rb_next(&(task)->pids[type].pid_chain);\ + __node ? pid_task(__node, type) : NULL; \ +}) +#define first_thread(task) first_task_pid(task, PIDTYPE_TGID) +#define next_thread(task) next_task_pid(task, PIDTYPE_TGID) +#define another_thread(task) \ +({ \ + task_t *__other = next_thread(task); \ + __other ? __other : first_task_pid(task, PIDTYPE_TGID); \ +}) + +#define __for_each_task_pid(type, task, pid) \ + for (task = __first_task_pid(pid, type); \ + task; \ + task = next_task_pid(task, type)) -#define for_each_task_pid(who, type, task, elem, pid) \ +#define for_each_task_pid(who, type, task, pid) \ if ((pid = find_pid(type, who))) \ - for (elem = pid->task_list.next, \ - prefetch(elem->next), \ - task = pid_task(elem, type); \ - elem != &pid->task_list; \ - elem = elem->next, prefetch(elem->next), \ - task = pid_task(elem, type)) + __for_each_task_pid(type, task, pid) #endif /* _LINUX_PID_H */ diff -prauN linux-2.6.0-test7/include/linux/rmap-locking.h wli-2.6.0-test7-bk1-29/include/linux/rmap-locking.h --- linux-2.6.0-test7/include/linux/rmap-locking.h 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/rmap-locking.h 1969-12-31 16:00:00.000000000 -0800 @@ -1,23 +0,0 @@ -/* - * include/linux/rmap-locking.h - * - * Locking primitives for exclusive access to a page's reverse-mapping - * pte chain. - */ - -#include - -struct pte_chain; -extern kmem_cache_t *pte_chain_cache; - -#define pte_chain_lock(page) bit_spin_lock(PG_chainlock, &page->flags) -#define pte_chain_unlock(page) bit_spin_unlock(PG_chainlock, &page->flags) - -struct pte_chain *pte_chain_alloc(int gfp_flags); -void __pte_chain_free(struct pte_chain *pte_chain); - -static inline void pte_chain_free(struct pte_chain *pte_chain) -{ - if (pte_chain) - __pte_chain_free(pte_chain); -} diff -prauN linux-2.6.0-test7/include/linux/rmap.h wli-2.6.0-test7-bk1-29/include/linux/rmap.h --- linux-2.6.0-test7/include/linux/rmap.h 1969-12-31 16:00:00.000000000 -0800 +++ wli-2.6.0-test7-bk1-29/include/linux/rmap.h 2003-10-09 19:55:13.000000000 -0700 @@ -0,0 +1,163 @@ +/* + * include/linux/rmap.h + * + * Locking primitives for exclusive access to a page's reverse-mapping + * pte chain. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +struct anon { + atomic_t count; + spinlock_t lock; + struct list_head list; + struct rcu_head rcu; +}; + +#ifdef CONFIG_MMU + +int FASTCALL(rmap_get_cpu(void)); +void FASTCALL(page_turn_rmap(struct page *, struct vm_area_struct *)); +void FASTCALL(page_move_rmap(struct page *, struct vm_area_struct *, unsigned long, unsigned long)); +void FASTCALL(add_rmap_address(struct page *, unsigned long)); +void FASTCALL(clear_page_chained(struct page *page)); + +/* + * Called from mm/vmscan.c to handle pageout + */ +int FASTCALL(page_referenced(struct page *)); +int FASTCALL(try_to_unmap(struct page *)); + +void init_rmap(void); +int exec_rmap(struct mm_struct *); +void dup_rmap(struct mm_struct *, struct mm_struct *); +void exit_rmap(struct mm_struct *); + +/* + * Return values of try_to_unmap(): + */ +#define SWAP_SUCCESS 0 +#define SWAP_AGAIN 1 +#define SWAP_FAIL 2 + +#else /* !CONFIG_MMU */ +#define page_referenced(page) TestClearPageReferenced(page) +#define init_rmap() do { } while (0) +#define exec_rmap(mm) ({ 0; }) +#define dup_rmap(new, old) ({ 0; }) +#define exit_rmap(mm) do { } while (0) +#define try_to_unmap(page) ({ SWAP_FAIL; }) +#endif /* CONFIG_MMU */ + +#define NOADDR (~0UL) + +static inline void rmap_lock(struct page *page) +{ + bit_spin_lock(PG_rmaplock, &page->flags); +} + +static inline void rmap_unlock(struct page *page) +{ + bit_spin_unlock(PG_rmaplock, &page->flags); +} + +#define NRSLOT ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(unsigned long)) + +struct rmap_chain { + unsigned long slot[NRSLOT]; /* first contains count, then */ + struct rmap_chain *next; /* user virtual addresses */ +}; + +static inline void page_dup_rmap(struct page *page) +{ + atomic_inc(&page->mapcount); +} + +static inline void clear_page_anon(struct page *page) +{ + set_page_mapping(page, NULL); + ClearPageAnon(page); +} + +/** + * page_remove_rmap - take down reverse mapping to a page + * @page: page to remove mapping from + * + * For general use: Remove the reverse mapping from the page. + * after that the caller can clear the page table entry and free + * the page. Caller needs to hold the mm->page_table_lock. + */ +static inline void page_remove_rmap(struct page *page) +{ + if (!atomic_dec_and_test(&page->mapcount)) + return; + + rmap_lock(page); + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + if (page->chain) + clear_page_chained(page); + rmap_unlock(page); +} + +static inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) +{ + unsigned long pgoff, address; + pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + if (address < vma->vm_start || address >= vma->vm_end) + return NOADDR; + else + return address; +} + +/** + * page_add_rmap - add reverse mapping entry to a page + * @page: the page to add the mapping to + * @vma: the vma into which this page is being mapped + * @address: the virtual address at which the page is being mapped + * @anon: is this an anonymous (not file-backed) page? + * + * Add a new pte reverse mapping to a page. + * The caller needs to hold the mm->page_table_lock. 
+ */ +static inline void page_add_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long address, int anon) +{ + if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) + return; + + address &= PAGE_MASK; + + rmap_lock(page); + + if (!page_mapped(page)) + inc_page_state(nr_mapped); + + atomic_inc(&page->mapcount); + + if (page->__mapping) { + if (anon) { + BUG_ON(!PageAnon(page)); + if (address != page->index) + add_rmap_address(page, address); + } else { + BUG_ON(PageAnon(page)); + if (address != vma_address(page, vma)) + add_rmap_address(page, address); + } + } else if (anon) { + SetPageAnon(page); + set_page_mapping(page, vma->vm_mm->anon); + page->index = address; + } + rmap_unlock(page); +} diff -prauN linux-2.6.0-test7/include/linux/sched.h wli-2.6.0-test7-bk1-29/include/linux/sched.h --- linux-2.6.0-test7/include/linux/sched.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/sched.h 2003-10-09 20:00:01.000000000 -0700 @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include struct exec_domain; @@ -145,34 +145,35 @@ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; typedef struct task_struct task_t; +#include -extern void sched_init(void); -extern void init_idle(task_t *idle, int cpu); +void sched_init(void); +void init_idle(task_t *idle, int cpu); -extern void show_state(void); -extern void show_regs(struct pt_regs *); +void show_state(void); +void show_regs(struct pt_regs *); /* * TASK is a pointer to the task whose backtrace we want to see (or NULL for current * task), SP is the stack pointer of the first frame that should be shown in the back * trace (or NULL if the entire call-chain of the task should be shown). */ -extern void show_stack(struct task_struct *task, unsigned long *sp); +void show_stack(task_t *task, unsigned long *sp); void io_schedule(void); long io_schedule_timeout(long timeout); -extern void cpu_init (void); -extern void trap_init(void); -extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +void cpu_init (void); +void trap_init(void); +void update_process_times(int user); +void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); -extern void scheduler_tick(int user_tick, int system); +void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; #define MAX_SCHEDULE_TIMEOUT LONG_MAX -extern signed long FASTCALL(schedule_timeout(signed long timeout)); +signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); struct namespace; @@ -198,11 +199,14 @@ struct mm_struct { * together off init_mm.mmlist, and are protected * by mmlist_lock */ + struct anon *anon; /* set of forks between execs */ + struct list_head anon_list; /* chain of mm's against anon */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; unsigned long rss, total_vm, locked_vm; + unsigned long shared, text, lib, data, dirty, stack; unsigned long def_flags; cpumask_t cpu_vm_mask; unsigned long swap_address; @@ -225,6 +229,7 @@ struct mm_struct { struct kioctx *ioctx_list; struct kioctx default_kioctx; + struct rcu_head rcu; }; extern int mmlist_nr; @@ -361,7 +366,7 @@ struct task_struct { cpumask_t cpus_allowed; unsigned int time_slice, first_time_slice; - struct list_head tasks; + struct rb_node tasks; struct list_head ptrace_children; struct list_head ptrace_list; @@ -560,7 
+565,7 @@ union thread_union { }; extern union thread_union init_thread_union; -extern struct task_struct init_task; +extern task_t init_task; extern struct mm_struct init_mm; @@ -708,47 +713,61 @@ extern void wait_task_inactive(task_t * #define REMOVE_LINKS(p) do { \ if (thread_group_leader(p)) \ - list_del_init(&(p)->tasks); \ + remove_task_list(p); \ remove_parent(p); \ } while (0) #define SET_LINKS(p) do { \ if (thread_group_leader(p)) \ - list_add_tail(&(p)->tasks,&init_task.tasks); \ + insert_task_list(p); \ add_parent(p, (p)->parent); \ } while (0) -#define next_task(p) list_entry((p)->tasks.next, struct task_struct, tasks) -#define prev_task(p) list_entry((p)->tasks.prev, struct task_struct, tasks) +static inline task_t *next_task(task_t *task) +{ + struct rb_node *node = rb_next(&task->tasks); + return node ? rb_entry(node, task_t, tasks) : NULL; +} + +static inline task_t *prev_task(task_t *task) +{ + struct rb_node *node = rb_prev(&task->tasks); + return node ? rb_entry(node, task_t, tasks) : NULL; +} #define for_each_process(p) \ - for (p = &init_task ; (p = next_task(p)) != &init_task ; ) + for (p = first_task(); p; p = next_task(p)) /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. */ #define do_each_thread(g, t) \ - for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do + for (g = first_task(), t = first_thread(g); \ + g; \ + g = next_task(g), t = g ? first_thread(g) : NULL) do #define while_each_thread(g, t) \ - while ((t = next_thread(t)) != g) + while ((t = next_thread(t))) -extern task_t * FASTCALL(next_thread(task_t *p)); - -#define thread_group_leader(p) (p->pid == p->tgid) - -static inline int thread_group_empty(task_t *p) +static inline int thread_group_leader(task_t *task) { - struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + return task->pid == task->tgid; +} - return pid->task_list.next->next == &pid->task_list; +/* a singleton's element has no successor */ +static inline int thread_group_empty(task_t *task) +{ + struct pid *pid = task->pids[PIDTYPE_TGID].pidptr; + return !rb_next(rb_first(&pid->task_list)); } -#define delay_group_leader(p) \ - (thread_group_leader(p) && !thread_group_empty(p)) +static inline int delay_group_leader(task_t *task) +{ + return thread_group_leader(task) && !thread_group_empty(task); +} -extern void unhash_process(struct task_struct *p); +void unhash_process(task_t *task); /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). * Nests both inside and outside of read_lock(&tasklist_lock). 
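The rb-tree task list changes the iteration contract visible above: next_task(), next_thread(), for_each_process() and do_each_thread() now terminate on NULL instead of wrapping back around to init_task. A minimal sketch of the new idiom, using only the iterators defined in the hunks above (the helper function itself is hypothetical and not part of the patch):

/* Count every thread in the system via the rb-tree based iterators.
 * As before, the walk requires tasklist_lock to be read-held. */
static int count_all_threads(void)
{
	task_t *g, *t;
	int n = 0;

	read_lock(&tasklist_lock);
	do_each_thread(g, t) {
		n++;	/* visits each thread of each thread group exactly once */
	} while_each_thread(g, t);
	read_unlock(&tasklist_lock);
	return n;
}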
diff -prauN linux-2.6.0-test7/include/linux/suspend.h wli-2.6.0-test7-bk1-29/include/linux/suspend.h --- linux-2.6.0-test7/include/linux/suspend.h 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/suspend.h 2003-10-09 19:27:23.000000000 -0700 @@ -47,12 +47,6 @@ extern void drain_local_pages(void); extern unsigned int nr_copy_pages __nosavedata; extern suspend_pagedir_t *pagedir_nosave __nosavedata; - -/* Communication between acpi and arch/i386/suspend.c */ - -extern void do_suspend_lowlevel(int resume); -extern void do_suspend_lowlevel_s4bios(int resume); - #endif /* CONFIG_PM */ #ifdef CONFIG_SOFTWARE_SUSPEND diff -prauN linux-2.6.0-test7/include/linux/swap.h wli-2.6.0-test7-bk1-29/include/linux/swap.h --- linux-2.6.0-test7/include/linux/swap.h 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/include/linux/swap.h 2003-10-09 19:42:26.000000000 -0700 @@ -76,7 +76,6 @@ struct reclaim_state { #ifdef __KERNEL__ struct address_space; -struct pte_chain; struct sysinfo; struct writeback_control; struct zone; @@ -162,6 +161,7 @@ extern unsigned int nr_free_pages(void); extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat); extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); +unsigned long nr_deferred_pages(void); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); @@ -177,25 +177,8 @@ extern int try_to_free_pages(struct zone extern int shrink_all_memory(int); extern int vm_swappiness; -/* linux/mm/rmap.c */ -#ifdef CONFIG_MMU -int FASTCALL(page_referenced(struct page *)); -struct pte_chain *FASTCALL(page_add_rmap(struct page *, pte_t *, - struct pte_chain *)); -void FASTCALL(page_remove_rmap(struct page *, pte_t *)); -int FASTCALL(try_to_unmap(struct page *)); - /* linux/mm/shmem.c */ -extern int shmem_unuse(swp_entry_t entry, struct page *page); -#else -#define page_referenced(page) TestClearPageReferenced(page) -#define try_to_unmap(page) SWAP_FAIL -#endif /* CONFIG_MMU */ - -/* return values of try_to_unmap */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 +int shmem_unuse(swp_entry_t entry, struct page *page); #ifdef CONFIG_SWAP /* linux/mm/page_io.c */ @@ -205,7 +188,6 @@ extern int rw_swap_page_sync(int, swp_en /* linux/mm/swap_state.c */ extern struct address_space swapper_space; -#define total_swapcache_pages swapper_space.nrpages extern void show_swap_cache_info(void); extern int add_to_swap(struct page *); extern void __delete_from_swap_cache(struct page *); @@ -244,7 +226,6 @@ extern spinlock_t swaplock; #else /* CONFIG_SWAP */ #define total_swap_pages 0 -#define total_swapcache_pages 0UL #define si_swapinfo(val) \ do { (val)->freeswap = (val)->totalswap = 0; } while (0) diff -prauN linux-2.6.0-test7/init/main.c wli-2.6.0-test7-bk1-29/init/main.c --- linux-2.6.0-test7/init/main.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/init/main.c 2003-10-09 19:44:30.000000000 -0700 @@ -80,7 +80,6 @@ extern void signals_init(void); extern void buffer_init(void); extern void pidhash_init(void); extern void pidmap_init(void); -extern void pte_chain_init(void); extern void radix_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); @@ -442,7 +441,6 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); - pte_chain_init(); fork_init(num_physpages); proc_caches_init(); buffer_init(); diff -prauN linux-2.6.0-test7/ipc/shm.c wli-2.6.0-test7-bk1-29/ipc/shm.c --- 
linux-2.6.0-test7/ipc/shm.c 2003-10-08 12:24:06.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/ipc/shm.c 2003-10-09 19:36:12.000000000 -0700 @@ -380,9 +380,9 @@ static void shm_get_stat(unsigned long * if (is_file_hugepages(shp->shm_file)) { struct address_space *mapping = inode->i_mapping; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); *rss += (HPAGE_SIZE/PAGE_SIZE)*mapping->nrpages; - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); } else { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); diff -prauN linux-2.6.0-test7/kernel/capability.c wli-2.6.0-test7-bk1-29/kernel/capability.c --- linux-2.6.0-test7/kernel/capability.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/capability.c 2003-10-09 20:00:01.000000000 -0700 @@ -89,10 +89,9 @@ static inline void cap_set_pg(int pgrp, kernel_cap_t *permitted) { task_t *g, *target; - struct list_head *l; struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, g, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, g, pid) { target = g; while_each_thread(g, target) security_capset_set(target, effective, inheritable, permitted); diff -prauN linux-2.6.0-test7/kernel/exit.c wli-2.6.0-test7-bk1-29/kernel/exit.c --- linux-2.6.0-test7/kernel/exit.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/exit.c 2003-10-09 20:00:47.000000000 -0700 @@ -113,12 +113,11 @@ void unhash_process(struct task_struct * int session_of_pgrp(int pgrp) { struct task_struct *p; - struct list_head *l; struct pid *pid; int sid = -1; read_lock(&tasklist_lock); - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) if (process_session(p) > 0) { sid = process_session(p); goto out; @@ -143,11 +142,10 @@ out: static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) { struct task_struct *p; - struct list_head *l; struct pid *pid; int ret = 1; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { if (p == ignored_task || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) @@ -176,10 +174,9 @@ static inline int has_stopped_jobs(int p { int retval = 0; struct task_struct *p; - struct list_head *l; struct pid *pid; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { if (p->state != TASK_STOPPED) continue; @@ -571,9 +568,10 @@ static inline void forget_original_paren * Send signals to all our closest relatives so that they know * to properly mourn us.. 
*/ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(task_t *tsk) { - struct task_struct *t; + task_t *t; + struct pid *pid = tsk->pids[PIDTYPE_TGID].pidptr; if (signal_pending(tsk) && !tsk->signal->group_exit && !thread_group_empty(tsk)) { @@ -588,12 +586,15 @@ static void exit_notify(struct task_stru */ read_lock(&tasklist_lock); spin_lock_irq(&tsk->sighand->siglock); - for (t = next_thread(tsk); t != tsk; t = next_thread(t)) + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == tsk) + continue; if (!signal_pending(t) && !(t->flags & PF_EXITING)) { recalc_sigpending_tsk(t); if (signal_pending(t)) signal_wake_up(t, 0); } + } spin_unlock_irq(&tsk->sighand->siglock); read_unlock(&tasklist_lock); } @@ -754,27 +755,6 @@ asmlinkage long sys_exit(int error_code) do_exit((error_code&0xff)<<8); } -task_t *next_thread(task_t *p) -{ - struct pid_link *link = p->pids + PIDTYPE_TGID; - struct list_head *tmp, *head = &link->pidptr->task_list; - -#ifdef CONFIG_SMP - if (!p->sighand) - BUG(); - if (!spin_is_locked(&p->sighand->siglock) && - !rwlock_is_locked(&tasklist_lock)) - BUG(); -#endif - tmp = link->pid_chain.next; - if (tmp == head) - tmp = head->next; - - return pid_task(tmp, PIDTYPE_TGID); -} - -EXPORT_SYMBOL(next_thread); - /* * Take down every thread in the group. This is called by fatal signals * as well as by sys_exit_group (below). @@ -1010,7 +990,8 @@ static int wait_task_stopped(task_t *p, asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) { DECLARE_WAITQUEUE(wait, current); - struct task_struct *tsk; + struct pid *tgrp_pid = current->pids[PIDTYPE_TGID].pidptr; + task_t *tsk; int flag, retval; if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) @@ -1021,14 +1002,14 @@ repeat: flag = 0; current->state = TASK_INTERRUPTIBLE; read_lock(&tasklist_lock); - tsk = current; - do { - struct task_struct *p; + __for_each_task_pid(PIDTYPE_TGID, tsk, tgrp_pid) { + task_t *p; struct list_head *_p; int ret; + BUG_ON(tsk->signal != current->signal); list_for_each(_p,&tsk->children) { - p = list_entry(_p,struct task_struct,sibling); + p = list_entry(_p, task_t, sibling); ret = eligible_child(pid, options, p); if (!ret) @@ -1068,10 +1049,7 @@ repeat: } if (options & __WNOTHREAD) break; - tsk = next_thread(tsk); - if (tsk->signal != current->signal) - BUG(); - } while (tsk != current); + } read_unlock(&tasklist_lock); if (flag) { retval = 0; diff -prauN linux-2.6.0-test7/kernel/fork.c wli-2.6.0-test7-bk1-29/kernel/fork.c --- linux-2.6.0-test7/kernel/fork.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/fork.c 2003-10-09 20:00:01.000000000 -0700 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -290,9 +291,9 @@ static inline int dup_mmap(struct mm_str atomic_dec(&inode->i_writecount); /* insert tmp into the share list, just after mpnt */ - down(&inode->i_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); - up(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); + list_add_tail_rcu(&tmp->shared, &mpnt->shared); + spin_unlock(&inode->i_mapping->i_shared_lock); } /* @@ -346,8 +347,21 @@ static inline void mm_free_pgd(struct mm spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; int mmlist_nr; +/* SLAB cache for mm_struct structures (tsk->mm) */ +static kmem_cache_t *mm_cachep; + #define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +static void 
__free_mm(void *mm) +{ + kmem_cache_free(mm_cachep, mm); +} + +void free_mm(struct mm_struct *mm) +{ + INIT_RCU_HEAD(&mm->rcu); + call_rcu(&mm->rcu, __free_mm, mm); +} #include @@ -362,6 +376,7 @@ static struct mm_struct * mm_init(struct mm->ioctx_list = NULL; mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm); mm->free_area_cache = TASK_UNMAPPED_BASE; + mm->shared = mm->text = mm->lib = mm->data = mm->dirty = mm->stack = 0; if (likely(!mm_alloc_pgd(mm))) { mm->def_flags = 0; @@ -379,11 +394,15 @@ struct mm_struct * mm_alloc(void) struct mm_struct * mm; mm = allocate_mm(); - if (mm) { - memset(mm, 0, sizeof(*mm)); - return mm_init(mm); + if (!mm) + return NULL; + memset(mm, 0, sizeof(*mm)); + if (exec_rmap(mm)) { + mm_free_pgd(mm); + free_mm(mm); + return NULL; } - return NULL; + return mm_init(mm); } /* @@ -410,6 +429,7 @@ void mmput(struct mm_struct *mm) spin_unlock(&mmlist_lock); exit_aio(mm); exit_mmap(mm); + exit_rmap(mm); mmdrop(mm); } } @@ -514,6 +534,8 @@ static int copy_mm(unsigned long clone_f if (!mm_init(mm)) goto fail_nomem; + dup_rmap(mm, oldmm); + if (init_new_context(tsk,mm)) goto fail_nocontext; @@ -1050,7 +1072,7 @@ struct task_struct *copy_process(unsigne if (p->pid) __get_cpu_var(process_counts)++; } else - link_pid(p, p->pids + PIDTYPE_TGID, &p->group_leader->pids[PIDTYPE_TGID].pid); + link_pid(p, &p->group_leader->pids[PIDTYPE_TGID].pid, PIDTYPE_TGID); nr_threads++; write_unlock_irq(&tasklist_lock); @@ -1193,8 +1215,7 @@ kmem_cache_t *fs_cachep; /* SLAB cache for vm_area_struct structures */ kmem_cache_t *vm_area_cachep; -/* SLAB cache for mm_struct structures (tsk->mm) */ -kmem_cache_t *mm_cachep; +void init_rmap(void); void __init proc_caches_init(void) { @@ -1233,4 +1254,6 @@ void __init proc_caches_init(void) SLAB_HWCACHE_ALIGN, NULL, NULL); if(!mm_cachep) panic("vma_init: Cannot alloc mm_struct SLAB cache"); + + init_rmap(); } diff -prauN linux-2.6.0-test7/kernel/futex.c wli-2.6.0-test7-bk1-29/kernel/futex.c --- linux-2.6.0-test7/kernel/futex.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/futex.c 2003-10-09 19:27:23.000000000 -0700 @@ -45,6 +45,9 @@ * Futexes are matched on equal values of this key. * The key type depends on whether it's a shared or private mapping. * Don't rearrange members without looking at hash_futex(). + * + * offset is aligned to a multiple of sizeof(u32) (== 4) by definition. + * We set bit 0 to indicate if it's an inode-based key. */ union futex_key { struct { @@ -66,12 +69,20 @@ union futex_key { /* * We use this hashed waitqueue instead of a normal wait_queue_t, so - * we can wake only the relevant ones (hashed queues may be shared): + * we can wake only the relevant ones (hashed queues may be shared). + * + * A futex_q has a woken state, just like tasks have TASK_RUNNING. + * It is considered woken when list_empty(&q->list) || q->lock_ptr == 0. + * The order of wakeup is always to make the first condition true, then + * wake up q->waiters, then make the second condition true. */ struct futex_q { struct list_head list; wait_queue_head_t waiters; + /* Which hash list lock to use. */ + spinlock_t *lock_ptr; + /* Key which the futex is hashed on. */ union futex_key key; @@ -124,8 +135,7 @@ static inline int match_futex(union fute * Returns: 0, or negative error code. * The key words are stored in *key on success. * - * Should be called with &current->mm->mmap_sem, - * but NOT &futex_lock or &current->mm->page_table_lock. + * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
*/ static int get_futex_key(unsigned long uaddr, union futex_key *key) { @@ -172,9 +182,10 @@ static int get_futex_key(unsigned long u } /* - * Linear mappings are also simple. + * Linear file mappings are also simple. */ key->shared.inode = vma->vm_file->f_dentry->d_inode; + key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ if (likely(!(vma->vm_flags & VM_NONLINEAR))) { key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff); @@ -214,16 +225,68 @@ static int get_futex_key(unsigned long u return err; } +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + * NOTE: mmap_sem MUST be held between get_futex_key() and calling this + * function, if it is called at all. mmap_sem keeps key->shared.inode valid. + */ +static inline void get_key_refs(union futex_key *key) +{ + if (key->both.ptr != 0) { + if (key->both.offset & 1) + atomic_inc(&key->shared.inode->i_count); + else + atomic_inc(&key->private.mm->mm_count); + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. + */ +static inline void drop_key_refs(union futex_key *key) +{ + if (key->both.ptr != 0) { + if (key->both.offset & 1) + iput(key->shared.inode); + else + mmdrop(key->private.mm); + } +} + +/* + * The hash bucket lock must be held when this is called. + * Afterwards, the futex_q must not be accessed. + */ +static inline void wake_futex(struct futex_q *q) +{ + list_del_init(&q->list); + if (q->filp) + send_sigio(&q->filp->f_owner, q->fd, POLL_IN); + /* + * The lock in wake_up_all() is a crucial memory barrier after the + * list_del_init() and also before assigning to q->lock_ptr. + */ + wake_up_all(&q->waiters); + /* + * The waiting task can free the futex_q as soon as this is written, + * without taking any locks. This must come last. 
+ */ + q->lock_ptr = 0; +} /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static int futex_wake(unsigned long uaddr, int num) +static int futex_wake(unsigned long uaddr, int nr_wake) { - struct list_head *i, *next, *head; - struct futex_hash_bucket *bh; union futex_key key; + struct futex_hash_bucket *bh; + struct list_head *head; + struct futex_q *this, *next; int ret; down_read(&current->mm->mmap_sem); @@ -236,21 +299,15 @@ spin_lock(&bh->lock); head = &bh->chain; - list_for_each_safe(i, next, head) { - struct futex_q *this = list_entry(i, struct futex_q, list); - + list_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - list_del_init(i); - wake_up_all(&this->waiters); - if (this->filp) - send_sigio(&this->filp->f_owner, this->fd, POLL_IN); - ret++; - if (ret >= num) + wake_futex(this); + if (++ret >= nr_wake) break; } } - spin_unlock(&bh->lock); + spin_unlock(&bh->lock); out: up_read(&current->mm->mmap_sem); return ret; @@ -263,10 +320,11 @@ out: static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_requeue) { - struct list_head *i, *next, *head1, *head2; - struct futex_hash_bucket *bh1, *bh2; union futex_key key1, key2; - int ret; + struct futex_hash_bucket *bh1, *bh2; + struct list_head *head1; + struct futex_q *this, *next; + int ret, drop_count = 0; down_read(&current->mm->mmap_sem); @@ -279,78 +337,107 @@ static int futex_requeue(unsigned long u bh1 = hash_futex(&key1); bh2 = hash_futex(&key2); - if (bh1 < bh2) { + + if (bh1 < bh2) + spin_lock(&bh1->lock); + spin_lock(&bh2->lock); + if (bh1 > bh2) spin_lock(&bh1->lock); - spin_lock(&bh2->lock); - } else { - spin_lock(&bh2->lock); - if (bh1 > bh2) - spin_lock(&bh1->lock); - } - head1 = &bh1->chain; - head2 = &bh2->chain; - list_for_each_safe(i, next, head1) { - struct futex_q *this = list_entry(i, struct futex_q, list); + head1 = &bh1->chain; + list_for_each_entry_safe(this, next, head1, list) { + if (!match_futex (&this->key, &key1)) + continue; + if (++ret <= nr_wake) { + wake_futex(this); + } else { + list_move_tail(&this->list, &bh2->chain); + this->lock_ptr = &bh2->lock; + this->key = key2; + get_key_refs(&key2); + drop_count++; - if (match_futex (&this->key, &key1)) { - list_del_init(i); - if (++ret <= nr_wake) { - wake_up_all(&this->waiters); - if (this->filp) - send_sigio(&this->filp->f_owner, - this->fd, POLL_IN); - } else { - list_add_tail(i, head2); - this->key = key2; - if (ret - nr_wake >= nr_requeue) - break; - /* Make sure to stop if key1 == key2 */ - if (head1 == head2 && head1 != next) - head1 = i; - } + if (ret - nr_wake >= nr_requeue) + break; + /* Make sure to stop if key1 == key2 */ + if (head1 == &bh2->chain && head1 != &next->list) + head1 = &this->list; } } - if (bh1 < bh2) { - spin_unlock(&bh2->lock); - spin_unlock(&bh1->lock); - } else { - if (bh1 > bh2) - spin_unlock(&bh1->lock); + + spin_unlock(&bh1->lock); + if (bh1 != bh2) spin_unlock(&bh2->lock); - } + + /* drop_key_refs() must be called outside the spinlocks. */ + while (--drop_count >= 0) + drop_key_refs(&key1); + out: up_read(&current->mm->mmap_sem); return ret; } -static inline void queue_me(struct futex_q *q, union futex_key *key, - int fd, struct file *filp) +/* + * queue_me and unqueue_me must be called as a pair, each + * exactly once. They are called with the hashed spinlock held. + */ + +/* The key must already be stored in q->key.
*/ +static inline void queue_me(struct futex_q *q, int fd, struct file *filp) { - struct futex_hash_bucket *bh = hash_futex(key); - struct list_head *head = &bh->chain; + struct futex_hash_bucket *bh; - q->key = *key; q->fd = fd; q->filp = filp; + init_waitqueue_head(&q->waiters); + + get_key_refs(&q->key); + bh = hash_futex(&q->key); + q->lock_ptr = &bh->lock; + spin_lock(&bh->lock); - list_add_tail(&q->list, head); + list_add_tail(&q->list, &bh->chain); spin_unlock(&bh->lock); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ -static inline int unqueue_me(struct futex_q *q) +static int unqueue_me(struct futex_q *q) { - struct futex_hash_bucket *bh = hash_futex(&q->key); int ret = 0; + spinlock_t *lock_ptr; - spin_lock(&bh->lock); - if (!list_empty(&q->list)) { - list_del(&q->list); - ret = 1; + /* In the common case we don't take the spinlock, which is nice. */ + retry: + lock_ptr = q->lock_ptr; + if (lock_ptr != 0) { + spin_lock(lock_ptr); + /* + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. + */ + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } + if (likely(!list_empty(&q->list))) { + list_del(&q->list); + ret = 1; + } + spin_unlock(lock_ptr); } - spin_unlock(&bh->lock); + + drop_key_refs(&q->key); return ret; } @@ -358,19 +445,15 @@ static int futex_wait(unsigned long uadd { DECLARE_WAITQUEUE(wait, current); int ret, curval; - union futex_key key; struct futex_q q; - struct futex_hash_bucket *bh = NULL; - - init_waitqueue_head(&q.waiters); down_read(&current->mm->mmap_sem); - ret = get_futex_key(uaddr, &key); + ret = get_futex_key(uaddr, &q.key); if (unlikely(ret != 0)) goto out_release_sem; - queue_me(&q, &key, -1, NULL); + queue_me(&q, -1, NULL); /* * Access the page after the futex is queued. @@ -400,23 +483,17 @@ static int futex_wait(unsigned long uadd * rely on the futex_wake() code removing us from hash when it * wakes us up. */ - add_wait_queue(&q.waiters, &wait); - bh = hash_futex(&key); - spin_lock(&bh->lock); - set_current_state(TASK_INTERRUPTIBLE); - - if (unlikely(list_empty(&q.list))) { - /* - * We were woken already. - */ - spin_unlock(&bh->lock); - set_current_state(TASK_RUNNING); - return 0; - } - spin_unlock(&bh->lock); - time = schedule_timeout(time); - set_current_state(TASK_RUNNING); + /* add_wait_queue is the barrier after __set_current_state. */ + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&q.waiters, &wait); + /* + * !list_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+ */ + if (likely(!list_empty(&q.list))) + time = schedule_timeout(time); + __set_current_state(TASK_RUNNING); /* * NOTE: we don't remove ourselves from the waitqueue because @@ -446,7 +523,7 @@ static int futex_close(struct inode *ino struct futex_q *q = filp->private_data; unqueue_me(q); - kfree(filp->private_data); + kfree(q); return 0; } @@ -455,14 +532,16 @@ static unsigned int futex_poll(struct fi struct poll_table_struct *wait) { struct futex_q *q = filp->private_data; - struct futex_hash_bucket *bh = hash_futex(&q->key); int ret = 0; poll_wait(filp, &q->waiters, wait); - spin_lock(&bh->lock); + + /* + * list_empty() is safe here without any lock. + * q->lock_ptr != 0 is not safe, because of ordering against wakeup. + */ if (list_empty(&q->list)) ret = POLLIN | POLLRDNORM; - spin_unlock(&bh->lock); return ret; } @@ -472,12 +551,13 @@ static struct file_operations futex_fops .poll = futex_poll, }; -/* Signal allows caller to avoid the race which would occur if they - set the sigio stuff up afterwards. */ +/* + * Signal allows caller to avoid the race which would occur if they + * set the sigio stuff up afterwards. + */ static int futex_fd(unsigned long uaddr, int signal) { struct futex_q *q; - union futex_key key; struct file *filp; int ret, err; @@ -500,7 +580,7 @@ static int futex_fd(unsigned long uaddr, if (signal) { int err; - err = f_setown(filp, current->tgid, 1); + err = f_setown(filp, current->pid, 1); if (err < 0) { put_unused_fd(ret); put_filp(filp); @@ -519,20 +599,24 @@ static int futex_fd(unsigned long uaddr, } down_read(&current->mm->mmap_sem); - err = get_futex_key(uaddr, &key); - up_read(&current->mm->mmap_sem); + err = get_futex_key(uaddr, &q->key); if (unlikely(err != 0)) { + up_read(&current->mm->mmap_sem); put_unused_fd(ret); put_filp(filp); kfree(q); return err; } - init_waitqueue_head(&q->waiters); + /* + * queue_me() must be called before releasing mmap_sem, because + * key->shared.inode needs to be referenced while holding it. + */ filp->private_data = q; - queue_me(q, &key, ret, filp); + queue_me(q, ret, filp); + up_read(&current->mm->mmap_sem); /* Now we map fd to filp, so userspace can access it */ fd_install(ret, filp); diff -prauN linux-2.6.0-test7/kernel/pid.c wli-2.6.0-test7-bk1-29/kernel/pid.c --- linux-2.6.0-test7/kernel/pid.c 2003-10-08 12:24:08.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/pid.c 2003-10-09 20:16:27.000000000 -0700 @@ -159,30 +159,110 @@ inline struct pid *find_pid(enum pid_typ return NULL; } -void link_pid(task_t *task, struct pid_link *link, struct pid *pid) +static struct rb_root tasklist_root = { .rb_node = &init_task.tasks }; + +task_t *first_task(void) +{ + struct rb_node *node = rb_first(&tasklist_root); + BUG_ON(!node); + return rb_entry(node, task_t, tasks); +} + +void insert_task_list(task_t *task) +{ + struct rb_node **node = &tasklist_root.rb_node, *parent = NULL; + task_t *candidate; + + while (*node) { + parent = *node; + candidate = rb_entry(parent, task_t, tasks); + if (candidate->tgid < task->tgid) + node = &parent->rb_right; + else if (candidate->tgid > task->tgid) + node = &parent->rb_left; + else /* already there?
give up */ + return; + } + rb_link_node(&task->tasks, parent, node); + rb_insert_color(&task->tasks, &tasklist_root); +} + +void remove_task_list(task_t *task) +{ + /* pray this is not called on something in the == case above */ + rb_erase(&task->tasks, &tasklist_root); +} + +/* + * Chains need to be sorted by the ids of the type's predecessor; + * if there's no predecessor (PIDTYPE_PID) then we expect a + * singleton and just use the type's own id for uniformity. + * With the lists sorted this way we can perform efficient + * incremental enumeration. + */ +static void insert_pid_chain(task_t *task, struct pid *pid, enum pid_type type) +{ + struct pid_link *candidate, *link = &task->pids[type]; + struct rb_node **node, *parent = NULL; + int n = type ? (link - 1)->pid.nr : link->pid.nr; + + node = &pid->task_list.rb_node; + while (*node) { + int k; + parent = *node; + candidate = rb_entry(parent, struct pid_link, pid_chain); + k = type ? (candidate - 1)->pid.nr : candidate->pid.nr; + + if (k <= n) + node = &parent->rb_right; + else /* if (k > n) */ + node = &parent->rb_left; + /* + * else + * something unexpected happened + * return; + */ + } + rb_link_node(&link->pid_chain, parent, node); + rb_insert_color(&link->pid_chain, &pid->task_list); +} + +static void remove_pid_chain(task_t *task, struct pid *pid, enum pid_type type) +{ + rb_erase(&task->pids[type].pid_chain, &pid->task_list); +} + +void link_pid(task_t *task, struct pid *pid, enum pid_type type) { atomic_inc(&pid->count); - list_add_tail(&link->pid_chain, &pid->task_list); - link->pidptr = pid; + insert_pid_chain(task, pid, type); + task->pids[type].pidptr = pid; } int attach_pid(task_t *task, enum pid_type type, int nr) { struct pid *pid = find_pid(type, nr); + struct pid_link *link = &task->pids[type]; if (pid) atomic_inc(&pid->count); else { - pid = &task->pids[type].pid; + struct list_head *bucket; + + pid = &link->pid; pid->nr = nr; atomic_set(&pid->count, 1); - INIT_LIST_HEAD(&pid->task_list); + pid->task_list.rb_node = NULL; pid->task = task; get_task_struct(task); - list_add(&pid->hash_chain, &pid_hash[type][pid_hashfn(nr)]); + bucket = &pid_hash[type][pid_hashfn(nr)]; + list_add_tail(&pid->hash_chain, bucket); } - list_add_tail(&task->pids[type].pid_chain, &pid->task_list); - task->pids[type].pidptr = pid; + link->pid_chain.rb_parent = link->pid_chain.rb_left + = link->pid_chain.rb_right = NULL; + link->pid_chain.rb_color = RB_BLACK; + link->pidptr = pid; + insert_pid_chain(task, pid, type); return 0; } @@ -193,7 +273,7 @@ static inline int __detach_pid(task_t *t struct pid *pid = link->pidptr; int nr; - list_del(&link->pid_chain); + remove_pid_chain(task, pid, type); if (!atomic_dec_and_test(&pid->count)) return 0; @@ -222,13 +302,85 @@ void detach_pid(task_t *task, enum pid_t free_pidmap(nr); } +/** + * find_tgids_after - Returns the tgids of tasks after tgid. + * @tgid: strict lower bound on tgids to return + * @tgids: buffer for return of tgids + * + * Returns the number of tgids returned in @tgids. + * The function works even if the input tgid value + * is not valid anymore.
+ */ +int find_tgids_after(int tgid, int tgids[PROC_MAXPIDS]) +{ + struct rb_node *node; + task_t *task = NULL, *lub = NULL; + int k; + + read_lock(&tasklist_lock); + node = tasklist_root.rb_node; + while (node) { + task = rb_entry(node, task_t, tasks); + if (task->tgid < tgid) + node = node->rb_right; + else if (task->tgid > tgid) { + node = node->rb_left; + lub = task; + } else { + struct rb_node *lub_node = rb_next(node); + if (lub_node) + lub = rb_entry(lub_node, task_t, tasks); + break; + } + } + + for (k = 0, task = lub; task && k < PROC_MAXPIDS; ++k, task = next_task(task)) + tgids[k] = task->tgid; + read_unlock(&tasklist_lock); + return k; +} + +int find_tids_after(int tgid, int tid, int tids[PROC_MAXPIDS]) +{ + struct pid *pid; + struct rb_node *node; + task_t *task = NULL, *lub = NULL; + int k = 0; + + read_lock(&tasklist_lock); + pid = find_pid(PIDTYPE_TGID, tgid); + if (!pid) + goto out; + node = pid->task_list.rb_node; + while (node) { + task = pid_task(node, PIDTYPE_TGID); + if (task->pid < tid) + node = node->rb_right; + else if (task->pid > tid) { + lub = task; + node = node->rb_left; + } else { + struct rb_node *lub_node = rb_next(node); + if (lub_node) + lub = pid_task(lub_node, PIDTYPE_TGID); + break; + } + } + + for (task = lub; task && k < PROC_MAXPIDS; ++k, task = next_task_pid(task, PIDTYPE_TGID)) + tids[k] = task->pid; +out: + read_unlock(&tasklist_lock); + return k; +} task_t *find_task_by_pid(int nr) { struct pid *pid = find_pid(PIDTYPE_PID, nr); if (!pid) return NULL; - return pid_task(pid->task_list.next, PIDTYPE_PID); + return __first_task_pid(pid, PIDTYPE_PID); } EXPORT_SYMBOL(find_task_by_pid); @@ -255,7 +407,7 @@ void switch_exec_pids(task_t *leader, ta attach_pid(thread, PIDTYPE_TGID, thread->tgid); attach_pid(thread, PIDTYPE_PGID, thread->signal->pgrp); attach_pid(thread, PIDTYPE_SID, thread->signal->session); - list_add_tail(&thread->tasks, &init_task.tasks); + insert_task_list(thread); attach_pid(leader, PIDTYPE_PID, leader->pid); attach_pid(leader, PIDTYPE_TGID, leader->tgid); diff -prauN linux-2.6.0-test7/kernel/power/disk.c wli-2.6.0-test7-bk1-29/kernel/power/disk.c --- linux-2.6.0-test7/kernel/power/disk.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/power/disk.c 2003-10-09 19:27:23.000000000 -0700 @@ -4,7 +4,7 @@ * Copyright (c) 2003 Patrick Mochel * Copyright (c) 2003 Open Source Development Lab * - * This file is release under the GPLv2 + * This file is released under the GPLv2. * */ diff -prauN linux-2.6.0-test7/kernel/power/pmdisk.c wli-2.6.0-test7-bk1-29/kernel/power/pmdisk.c --- linux-2.6.0-test7/kernel/power/pmdisk.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/power/pmdisk.c 2003-10-09 19:27:23.000000000 -0700 @@ -448,7 +448,7 @@ static void count_pages(void) /** - * copy_pages - Atmoically snapshot memory. + * copy_pages - Atomically snapshot memory. * * Iterate over all the pages in the system and copy each one * into its corresponding location in the pagedir. diff -prauN linux-2.6.0-test7/kernel/power/swsusp.c wli-2.6.0-test7-bk1-29/kernel/power/swsusp.c --- linux-2.6.0-test7/kernel/power/swsusp.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/power/swsusp.c 2003-10-09 19:27:23.000000000 -0700 @@ -5,7 +5,9 @@ * machine suspend feature using pretty near only high-level routines * * Copyright (C) 1998-2001 Gabor Kuti - * Copyright (C) 1998,2001,2002 Pavel Machek + * Copyright (C) 1998,2001-2003 Pavel Machek + * + * This file is released under the GPLv2.
* * I'd like to thank the following people for their work: * @@ -273,6 +275,17 @@ static void lock_swapdevices(void) /* Th swap_list_unlock(); } +/** + * write_suspend_image - Write entire image to disk. + * + * After writing suspend signature to the disk, suspend may no + * longer fail: we have a ready-to-run image in swap, and rollback + * would happen on next reboot -- corrupting data. + * + * Note: The buffer we allocate to use to write the suspend header is + * not freed; it's not needed since the system is going down anyway + * (plus it causes oops and I'm lazy^H^H^H^Htoo busy). + */ static int write_suspend_image(void) { int i; @@ -282,6 +295,9 @@ static int write_suspend_image(void) unsigned long address; struct page *page; + if (!buffer) + return -ENOMEM; + printk( "Writing data to swap (%d pages): ", nr_copy_pages ); for (i=0; i 1) { printk(KERN_WARNING "Software Suspend has malfunctioning SMP support. Disabled :(\n"); - return; + return -EINVAL; } /* We enable the possibility of machine suspend */ software_suspend_enabled = 1; if (!resume_status) - return; + return 0; printk( "%s", name_resume ); if (resume_status == NORESUME) { if(resume_file[0]) read_suspend_image(resume_file, 1); printk( "disabled\n" ); - return; + return 0; } MDELAY(1000); @@ -1061,7 +1085,7 @@ void software_resume(void) if (!resume_file[0] && resume_status == RESUME_SPECIFIED) { printk( "suspension device unspecified\n" ); - return; + return -EINVAL; } printk( "resuming from %s\n", resume_file); @@ -1072,9 +1096,11 @@ void software_resume(void) read_failure: pm_restore_console(); - return; + return 0; } +late_initcall(software_resume); + static int __init resume_setup(char *str) { if (resume_status == NORESUME) diff -prauN linux-2.6.0-test7/kernel/resource.c wli-2.6.0-test7-bk1-29/kernel/resource.c --- linux-2.6.0-test7/kernel/resource.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/resource.c 2003-10-09 19:27:23.000000000 -0700 @@ -279,6 +279,67 @@ int allocate_resource(struct resource *r EXPORT_SYMBOL(allocate_resource); +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is the equivalent of request_resource when no + * conflict happens. If a conflict happens, and the conflicting + * resources entirely fit within the range of the new resource, + * then the new resource is inserted and the conflicting resources + * become children of the new resource.
+ */ +int insert_resource(struct resource *parent, struct resource *new) +{ + int result = 0; + struct resource *first, *next; + + write_lock(&resource_lock); + first = __request_resource(parent, new); + if (!first) + goto out; + + result = -EBUSY; + if (first == parent) + goto out; + + for (next = first; next->sibling; next = next->sibling) + if (next->sibling->start > new->end) + break; + + /* existing resource overlaps end of new resource */ + if (next->end > new->end) + goto out; + + result = 0; + + new->parent = parent; + new->sibling = next->sibling; + new->child = first; + + next->sibling = NULL; + for (next = first; next; next = next->sibling) + next->parent = new; + + if (parent->child == first) { + parent->child = new; + } else { + next = parent->child; + while (next->sibling != first) + next = next->sibling; + next->sibling = new; + } + + out: + write_unlock(&resource_lock); + return result; +} + +EXPORT_SYMBOL(insert_resource); + /* * This is compatibility stuff for IO resources. * diff -prauN linux-2.6.0-test7/kernel/signal.c wli-2.6.0-test7-bk1-29/kernel/signal.c --- linux-2.6.0-test7/kernel/signal.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/signal.c 2003-10-09 20:02:01.000000000 -0700 @@ -353,7 +353,7 @@ void __exit_signal(struct task_struct *t spin_lock(&sighand->siglock); if (atomic_dec_and_test(&sig->count)) { if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); + sig->curr_target = another_thread(tsk); tsk->signal = NULL; spin_unlock(&sighand->siglock); flush_sigqueue(&sig->shared_pending); @@ -368,7 +368,7 @@ void __exit_signal(struct task_struct *t sig->group_exit_task = NULL; } if (tsk == sig->curr_target) - sig->curr_target = next_thread(tsk); + sig->curr_target = another_thread(tsk); tsk->signal = NULL; spin_unlock(&sighand->siglock); } @@ -613,20 +613,18 @@ static void do_notify_parent_cldstop(str * actual continuing for SIGCONT, but not the actual stopping for stop * signals. The process stop is done as a signal action for SIG_DFL. */ -static void handle_stop_signal(int sig, struct task_struct *p) +static void handle_stop_signal(int sig, task_t *p) { - struct task_struct *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + task_t *t; if (sig_kernel_stop(sig)) { /* * This is a stop signal. Remove SIGCONT from all queues. */ rm_from_queue(sigmask(SIGCONT), &p->signal->shared_pending); - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) rm_from_queue(sigmask(SIGCONT), &t->pending); - t = next_thread(t); - } while (t != p); } else if (sig == SIGCONT) { /* * Remove all stop signals from all queues, @@ -654,8 +652,7 @@ static void handle_stop_signal(int sig, p->group_leader->real_parent); } rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { unsigned int state; rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); @@ -679,9 +676,7 @@ static void handle_stop_signal(int sig, state |= TASK_INTERRUPTIBLE; } wake_up_state(t, state); - - t = next_thread(t); - } while (t != p); + } } } @@ -846,7 +841,8 @@ force_sig_specific(int sig, struct task_ static inline void __group_complete_signal(int sig, struct task_struct *p, unsigned int mask) { - struct task_struct *t; + task_t *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; /* * Now find a thread we can wake up to take the signal off the queue. 
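Throughout signal.c the old circular idiom "t = p; do { ... t = next_thread(t); } while (t != p);" becomes a NULL-terminated walk over the TGID pid's rb-tree, with an explicit self-skip wherever the old loop deliberately started past p. A minimal sketch of the recurring pattern (the helper is hypothetical, shown only to make the transformation explicit):

/* Apply fn to every thread in p's group except p itself.
 * The caller holds tasklist_lock and/or p->sighand->siglock,
 * as the call sites in this file do. */
static void for_other_threads(task_t *p, void (*fn)(task_t *))
{
	struct pid *pid = p->pids[PIDTYPE_TGID].pidptr;
	task_t *t;

	__for_each_task_pid(PIDTYPE_TGID, t, pid) {
		if (t == p)
			continue;	/* the old do/while never revisited p */
		fn(t);
	}
}

Callers that genuinely need wrap-around, such as the sig->curr_target rotation above, use another_thread() instead, which falls back to the group's first thread when the rb-tree walk runs off the end.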
@@ -873,7 +869,7 @@ __group_complete_signal(int sig, struct BUG_ON(t->tgid != p->tgid); while (!wants_signal(sig, t, mask)) { - t = next_thread(t); + t = another_thread(t); if (t == p->signal->curr_target) /* * No thread needs to be woken. @@ -905,12 +901,10 @@ __group_complete_signal(int sig, struct p->signal->group_exit = 1; p->signal->group_exit_code = sig; p->signal->group_stop_count = 0; - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); - t = next_thread(t); - } while (t != p); + } return; } @@ -928,12 +922,10 @@ __group_complete_signal(int sig, struct rm_from_queue(SIG_KERNEL_STOP_MASK, &p->signal->shared_pending); p->signal->group_stop_count = 0; p->signal->group_exit_task = t; - t = p; - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { p->signal->group_stop_count++; signal_wake_up(t, 0); - t = next_thread(t); - } while (t != p); + } wake_up_process(p->signal->group_exit_task); return; } @@ -996,16 +988,19 @@ __group_send_sig_info(int sig, struct si /* * Nuke all other threads in the group. */ -void zap_other_threads(struct task_struct *p) +void zap_other_threads(task_t *p) { - struct task_struct *t; + struct pid *pid = p->pids[PIDTYPE_TGID].pidptr; + task_t *t; p->signal->group_stop_count = 0; if (thread_group_empty(p)) return; - for (t = next_thread(p); t != p; t = next_thread(t)) { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == p) + continue; /* * Don't bother with already dead threads */ @@ -1054,15 +1049,14 @@ int group_send_sig_info(int sig, struct int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) { - struct task_struct *p; - struct list_head *l; + task_t *p; struct pid *pid; int err, retval = -ESRCH; if (pgrp <= 0) return -EINVAL; - for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(pgrp, PIDTYPE_PGID, p, pid) { err = group_send_sig_info(sig, info, p); if (retval) retval = err; @@ -1094,15 +1088,14 @@ kill_sl_info(int sig, struct siginfo *in { int err, retval = -EINVAL; struct pid *pid; - struct list_head *l; - struct task_struct *p; + task_t *p; if (sid <= 0) goto out; retval = -ESRCH; read_lock(&tasklist_lock); - for_each_task_pid(sid, PIDTYPE_SID, p, l, pid) { + for_each_task_pid(sid, PIDTYPE_SID, p, pid) { if (!process_session_leader(p)) continue; err = group_send_sig_info(sig, info, p); @@ -1375,25 +1368,23 @@ out: * Joy. Or not. Pthread wants us to wake up every thread * in our parent group. 
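All of the signal.c conversions above share one shape: the circular next_thread() walk over a thread group becomes iteration over the TGID pid hash. Side by side, with do_something() standing in for the per-thread work and the usual locks (tasklist_lock, siglock) assumed held as at the real sites:

	task_t *t;
	struct pid *pid = p->pids[PIDTYPE_TGID].pidptr;

	/* old idiom: follow the circular thread list back around to p */
	t = p;
	do {
		do_something(t);
		t = next_thread(t);
	} while (t != p);

	/* new idiom: visit every task hashed on p's TGID, p included */
	__for_each_task_pid(PIDTYPE_TGID, t, pid)
		do_something(t);
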
*/ -static inline void __wake_up_parent(struct task_struct *p, - struct task_struct *parent) +static inline void __wake_up_parent(task_t *p, task_t *parent) { - struct task_struct *tsk = parent; + task_t *task; + struct pid *pid = parent->pids[PIDTYPE_TGID].pidptr; /* * Fortunately this is not necessary for thread groups: */ - if (p->tgid == tsk->tgid) { - wake_up_interruptible(&tsk->wait_chldexit); + if (p->tgid == parent->tgid) { + wake_up_interruptible(&parent->wait_chldexit); return; } - do { - wake_up_interruptible(&tsk->wait_chldexit); - tsk = next_thread(tsk); - if (tsk->signal != parent->signal) - BUG(); - } while (tsk != parent); + __for_each_task_pid(PIDTYPE_TGID, task, pid) { + wake_up_interruptible(&task->wait_chldexit); + BUG_ON(task->signal != parent->signal); + } } /* @@ -1628,10 +1619,12 @@ do_signal_stop(int signr) } if (sig->group_stop_count == 0) { + struct pid *pid = current->pids[PIDTYPE_TGID].pidptr; sig->group_exit_code = signr; stop_count = 0; - for (t = next_thread(current); t != current; - t = next_thread(t)) + __for_each_task_pid(PIDTYPE_TGID, t, pid) { + if (t == current) + continue; /* * Setting state to TASK_STOPPED for a group * stop is always done with the siglock held, @@ -1641,9 +1634,9 @@ do_signal_stop(int signr) stop_count++; signal_wake_up(t, 0); } + } sig->group_stop_count = stop_count; - } - else { + } else { /* A race with another thread while unlocked. */ signr = sig->group_exit_code; stop_count = --sig->group_stop_count; @@ -2302,7 +2295,9 @@ do_sigaction(int sig, const struct k_sig * Now we must do this little unlock and relock * dance to maintain the lock hierarchy. */ - struct task_struct *t = current; + task_t *t = current; + struct pid *pid = t->pids[PIDTYPE_TGID].pidptr; + spin_unlock_irq(&t->sighand->siglock); read_lock(&tasklist_lock); spin_lock_irq(&t->sighand->siglock); @@ -2310,11 +2305,10 @@ do_sigaction(int sig, const struct k_sig sigdelsetmask(&k->sa.sa_mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); rm_from_queue(sigmask(sig), &t->signal->shared_pending); - do { + __for_each_task_pid(PIDTYPE_TGID, t, pid) { rm_from_queue(sigmask(sig), &t->pending); recalc_sigpending_tsk(t); - t = next_thread(t); - } while (t != current); + } spin_unlock_irq(&current->sighand->siglock); read_unlock(&tasklist_lock); return 0; diff -prauN linux-2.6.0-test7/kernel/sys.c wli-2.6.0-test7-bk1-29/kernel/sys.c --- linux-2.6.0-test7/kernel/sys.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/kernel/sys.c 2003-10-09 20:01:19.000000000 -0700 @@ -284,7 +284,6 @@ asmlinkage long sys_setpriority(int whic struct task_struct *g, *p; struct user_struct *user; struct pid *pid; - struct list_head *l; int error = -EINVAL; if (which > 2 || which < 0) @@ -309,7 +308,7 @@ asmlinkage long sys_setpriority(int whic case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(who, PIDTYPE_PGID, p, pid) error = set_one_prio(p, niceval, error); break; case PRIO_USER: @@ -341,8 +340,7 @@ out: */ asmlinkage long sys_getpriority(int which, int who) { - struct task_struct *g, *p; - struct list_head *l; + task_t *g, *p; struct pid *pid; struct user_struct *user; long niceval, retval = -ESRCH; @@ -365,7 +363,7 @@ asmlinkage long sys_getpriority(int whic case PRIO_PGRP: if (!who) who = process_group(current); - for_each_task_pid(who, PIDTYPE_PGID, p, l, pid) { + for_each_task_pid(who, PIDTYPE_PGID, p, pid) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; @@ -985,11 +983,10 @@ asmlinkage long
sys_setpgid(pid_t pid, p goto out; if (pgid != pid) { - struct task_struct *p; + task_t *p; struct pid *pid; - struct list_head *l; - for_each_task_pid(pgid, PIDTYPE_PGID, p, l, pid) + for_each_task_pid(pgid, PIDTYPE_PGID, p, pid) if (process_session(p) == process_session(current)) goto ok_pgid; goto out; diff -prauN linux-2.6.0-test7/lib/kobject.c wli-2.6.0-test7-bk1-29/lib/kobject.c --- linux-2.6.0-test7/lib/kobject.c 2003-10-08 12:24:44.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/lib/kobject.c 2003-10-09 19:27:23.000000000 -0700 @@ -331,6 +331,7 @@ int kobject_set_name(struct kobject * ko int limit = KOBJ_NAME_LEN; int need; va_list args; + char * name; va_start(args,fmt); /* @@ -338,25 +339,33 @@ int kobject_set_name(struct kobject * ko */ need = vsnprintf(kobj->name,limit,fmt,args); if (need < limit) - kobj->k_name = kobj->name; + name = kobj->name; else { /* * Need more space? Allocate it and try again */ - kobj->k_name = kmalloc(need,GFP_KERNEL); - if (!kobj->k_name) { + name = kmalloc(need,GFP_KERNEL); + if (!name) { error = -ENOMEM; goto Done; } limit = need; - need = vsnprintf(kobj->k_name,limit,fmt,args); + need = vsnprintf(name,limit,fmt,args); /* Still? Give up. */ if (need > limit) { - kfree(kobj->k_name); + kfree(name); error = -EFAULT; + goto Done; } } + + /* Free the old name, if necessary. */ + if (kobj->k_name && kobj->k_name != kobj->name) + kfree(kobj->k_name); + + /* Now, set the new name */ + kobj->k_name = name; Done: va_end(args); return error; @@ -627,6 +636,8 @@ EXPORT_SYMBOL(kobject_unregister); EXPORT_SYMBOL(kobject_get); EXPORT_SYMBOL(kobject_put); +EXPORT_SYMBOL(kset_register); +EXPORT_SYMBOL(kset_unregister); EXPORT_SYMBOL(kset_find_obj); EXPORT_SYMBOL(subsystem_init); diff -prauN linux-2.6.0-test7/mm/filemap.c wli-2.6.0-test7-bk1-29/mm/filemap.c --- linux-2.6.0-test7/mm/filemap.c 2003-10-08 12:24:03.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/filemap.c 2003-10-09 19:42:26.000000000 -0700 @@ -55,14 +55,14 @@ /* * Lock ordering: * - * ->i_shared_sem (vmtruncate) - * ->private_lock (__free_pte->__set_page_dirty_buffers) + * ->i_shared_lock (vmtruncate) + * ->private_lock (__free_pte->set_page_dirty_buffers) * ->swap_list_lock * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->page_lock * * ->mmap_sem - * ->i_shared_sem (various places) + * ->i_shared_lock (various places) * * ->mmap_sem * ->lock_page (access_process_vm) @@ -70,6 +70,9 @@ * ->mmap_sem * ->i_sem (msync) * + * ->lock_page + * ->i_shared_lock (page_convert_anon) + * * ->inode_lock * ->sb_lock (fs/fs-writeback.c) * ->mapping->page_lock (__sync_single_inode) @@ -88,11 +91,11 @@ */ void __remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); radix_tree_delete(&mapping->page_tree, page->index); list_del(&page->list); - page->mapping = NULL; + set_page_mapping(page, NULL); mapping->nrpages--; pagecache_acct(-1); @@ -100,22 +103,24 @@ void __remove_from_page_cache(struct pag void remove_from_page_cache(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); - if (unlikely(!PageLocked(page))) - PAGE_BUG(page); + BUG_ON(!PageLocked(page)); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); __remove_from_page_cache(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); + page_cache_release(page); } static inline int sync_page(struct page *page) { - struct address_space *mapping = 
page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && mapping->a_ops && mapping->a_ops->sync_page) return mapping->a_ops->sync_page(page); + if (PageSwapCache(page)) + blk_run_queues(); return 0; } @@ -139,9 +144,9 @@ static int __filemap_fdatawrite(struct a if (mapping->backing_dev_info->memory_backed) return 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_splice_init(&mapping->dirty_pages, &mapping->io_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = do_writepages(mapping, &wbc); return ret; } @@ -174,7 +179,7 @@ int filemap_fdatawait(struct address_spa restart: progress = 0; - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); while (!list_empty(&mapping->locked_pages)) { struct page *page; @@ -188,7 +193,7 @@ restart: if (!PageWriteback(page)) { if (++progress > 32) { if (need_resched()) { - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); __cond_resched(); goto restart; } @@ -198,16 +203,16 @@ restart: progress = 0; page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); wait_on_page_writeback(page); if (PageError(page)) ret = -EIO; page_cache_release(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); /* Check for outstanding write errors */ if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) @@ -224,16 +229,9 @@ EXPORT_SYMBOL(filemap_fdatawait); * This adds a page to the page cache, starting out as locked, unreferenced, * not uptodate and with no errors. * - * This function is used for two things: adding newly allocated pagecache - * pages and for moving existing anon pages into swapcache. - * - * In the case of pagecache pages, the page is new, so we can just run - * SetPageLocked() against it. The other page state flags were set by - * rmqueue() - * - * In the case of swapcache, try_to_swap_out() has already locked the page, so - * SetPageLocked() is ugly-but-OK there too. The required page state has been - * set up by swap_out_add_to_swap_cache(). + * This function is used to add newly allocated pagecache pages; + * the page is new, so we can just run SetPageLocked() against it. + * The other page state flags were set by rmqueue(). * * This function does not add the page to the LRU. The caller must do that. */ @@ -244,15 +242,19 @@ int add_to_page_cache(struct page *page, if (error == 0) { page_cache_get(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); - ___add_to_page_cache(page, mapping, offset); + list_add(&page->list, &mapping->clean_pages); + set_page_mapping(page, mapping); + page->index = offset; + mapping->nrpages++; + pagecache_acct(+1); } else { page_cache_release(page); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); radix_tree_preload_end(); } return error; @@ -388,11 +390,11 @@ struct page * find_get_page(struct addre * We scan the hash list read-only. Addition to and removal from * the hash-list needs a held write-lock. 
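The page_lock conversion running through filemap.c is directional: pure lookups of the radix tree take the new lock shared, while anything that inserts, removes, or migrates a page between the mapping's lists takes it exclusively. The discipline, reduced to its two cases as the hunks above and below apply it:

	/* reader side: lookup only, safe to run concurrently */
	mapping_rdlock(&mapping->page_lock);
	page = radix_tree_lookup(&mapping->page_tree, offset);
	if (page)
		page_cache_get(page);
	mapping_rdunlock(&mapping->page_lock);

	/* writer side: radix tree insert/delete or page-list motion */
	mapping_wrlock(&mapping->page_lock);
	list_del(&page->list);
	list_add(&page->list, &mapping->dirty_pages);
	mapping_wrunlock(&mapping->page_lock);
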
*/ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page) page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -405,11 +407,11 @@ struct page *find_trylock_page(struct ad { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); page = radix_tree_lookup(&mapping->page_tree, offset); if (page && TestSetPageLocked(page)) page = NULL; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -431,25 +433,25 @@ struct page *find_lock_page(struct addre { struct page *page; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); repeat: page = radix_tree_lookup(&mapping->page_tree, offset); if (page) { page_cache_get(page); if (TestSetPageLocked(page)) { - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); lock_page(page); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); /* Has the page been truncated while we slept? */ - if (page->mapping != mapping || page->index != offset) { + if (page_mapping(page) != mapping || page->index != offset) { unlock_page(page); page_cache_release(page); goto repeat; } } } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return page; } @@ -523,12 +525,12 @@ unsigned int find_get_pages(struct addre unsigned int i; unsigned int ret; - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); ret = radix_tree_gang_lookup(&mapping->page_tree, (void **)pages, start, nr_pages); for (i = 0; i < ret; i++) page_cache_get(pages[i]); - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); return ret; } @@ -658,8 +660,8 @@ page_not_up_to_date: /* Get exclusive access to the page ... */ lock_page(page); - /* Did it get unhashed before we got the lock? */ - if (!page->mapping) { + /* Did it get removed from the radix tree before we got the lock? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); continue; @@ -1101,8 +1103,8 @@ page_not_uptodate: inc_page_state(pgmajfault); lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1129,7 +1131,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? */ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry_all; @@ -1210,8 +1212,8 @@ no_cached_page: page_not_uptodate: lock_page(page); - /* Did it get unhashed while we waited for it? */ - if (!page->mapping) { + /* Did it get removed from the radix tree while we waited for it? */ + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1237,7 +1239,7 @@ page_not_uptodate: lock_page(page); /* Somebody truncated the page on us? 
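Every caller that sleeps in lock_page() then revalidates, because truncation may have detached the page in the meantime; the hunks above only change the test from a raw page->mapping load to page_mapping(). The idiom in isolation (the retry label stands in for whatever restart path the caller uses):

	lock_page(page);
	/* did truncate steal the page from this mapping while we slept? */
	if (page_mapping(page) != mapping || page->index != offset) {
		unlock_page(page);
		page_cache_release(page);
		goto retry;
	}
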
*/ - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); goto err; } @@ -1422,7 +1424,7 @@ retry: goto out; lock_page(page); - if (!page->mapping) { + if (!page_mapping(page)) { unlock_page(page); page_cache_release(page); goto retry; diff -prauN linux-2.6.0-test7/mm/fremap.c wli-2.6.0-test7-bk1-29/mm/fremap.c --- linux-2.6.0-test7/mm/fremap.c 2003-10-08 12:24:00.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/fremap.c 2003-10-09 19:47:30.000000000 -0700 @@ -12,13 +12,16 @@ #include #include #include -#include +#include #include #include #include #include +/* + * This is never done to an anonymous page so page->mapping is never altered. + */ static inline int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -30,13 +33,13 @@ static inline int zap_pte(struct mm_stru unsigned long pfn = pte_pfn(pte); flush_cache_page(vma, addr); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, addr); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - page_remove_rmap(page, ptep); + page_remove_rmap(page); page_cache_release(page); mm->rss--; } @@ -45,7 +48,7 @@ static inline int zap_pte(struct mm_stru } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, addr); return 0; } } @@ -62,19 +65,18 @@ int install_page(struct mm_struct *mm, s pgd_t *pgd; pmd_t *pmd; pte_t pte_val; - struct pte_chain *pte_chain; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto err; pgd = pgd_offset(mm, addr); + if (!rmap_get_cpu()) + goto err; spin_lock(&mm->page_table_lock); + put_cpu(); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); if (!pte) goto err_unlock; @@ -82,20 +84,20 @@ int install_page(struct mm_struct *mm, s mm->rss++; flush_icache_page(vma, page); - set_pte(pte, mk_pte(page, prot)); - pte_chain = page_add_rmap(page, pte, pte_chain); + vm_set_pte(vma, pte, mk_pte(page, prot), addr); + if (!PageReserved(page)) + page_add_rmap(page, vma, addr, 0); pte_val = *pte; pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); return 0; err_unlock: spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); err: return err; } @@ -118,19 +120,22 @@ int install_file_pte(struct mm_struct *m pgd = pgd_offset(mm, addr); spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (!pmd) goto err_unlock; - pte = pte_alloc_map(mm, pmd, addr); - if (!pte) + pte = pte_alloc_map(mm, pgd, &pmd, addr); + if (!pte) { + pmd_unmap(pmd); goto err_unlock; + } flush = zap_pte(mm, vma, addr, pte); set_pte(pte, pgoff_to_pte(pgoff)); pte_val = *pte; pte_unmap(pte); + pmd_unmap(pmd); if (flush) flush_tlb_page(vma, addr); update_mmu_cache(vma, addr, pte_val); diff -prauN linux-2.6.0-test7/mm/memory.c wli-2.6.0-test7-bk1-29/mm/memory.c --- linux-2.6.0-test7/mm/memory.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/memory.c 2003-10-09 20:16:18.000000000 -0700 @@ -43,11 +43,10 @@ #include #include #include -#include +#include #include #include -#include #include #include #include @@ -103,7 +102,7 @@ static inline void free_one_pmd(struct m } page = pmd_page(*dir); pmd_clear(dir); - 
pgtable_remove_rmap(page); + dec_page_state(nr_page_table_pages); pte_free_tlb(tlb, page); } @@ -111,6 +110,7 @@ static inline void free_one_pgd(struct m { int j; pmd_t * pmd; + struct page *page; if (pgd_none(*dir)) return; @@ -119,11 +119,13 @@ static inline void free_one_pgd(struct m pgd_clear(dir); return; } - pmd = pmd_offset(dir, 0); + page = pgd_page(*dir); + pmd = pmd_offset_map(dir, 0); pgd_clear(dir); for (j = 0; j < PTRS_PER_PMD ; j++) free_one_pmd(tlb, pmd+j); - pmd_free_tlb(tlb, pmd); + pmd_unmap(pmd); + pmd_free_tlb(tlb, page); } /* @@ -143,30 +145,38 @@ void clear_page_tables(struct mmu_gather } while (--nr); } -pte_t * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +/* + * error return happens with pmd unmapped + */ +pte_t *pte_alloc_map(struct mm_struct *mm, pgd_t *pgd, pmd_t **pmd, unsigned long addr) { - if (!pmd_present(*pmd)) { + if (!pmd_present(**pmd)) { struct page *new; + pmd_unmap(*pmd); spin_unlock(&mm->page_table_lock); - new = pte_alloc_one(mm, address); + new = pte_alloc_one(mm, addr); spin_lock(&mm->page_table_lock); - if (!new) + if (!new) { + *pmd = NULL; return NULL; + } + + *pmd = pmd_offset_map(pgd, addr); /* * Because we dropped the lock, we should re-check the * entry, as somebody else could have populated it.. */ - if (pmd_present(*pmd)) { + if (pmd_present(**pmd)) { pte_free(new); goto out; } - pgtable_add_rmap(new, mm, address); - pmd_populate(mm, pmd, new); + inc_page_state(nr_page_table_pages); + pmd_populate(mm, *pmd, new); } out: - return pte_offset_map(pmd, address); + return pte_offset_map(*pmd, addr); } pte_t * pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address) @@ -188,7 +198,7 @@ pte_t * pte_alloc_kernel(struct mm_struc pte_free_kernel(new); goto out; } - pgtable_add_rmap(virt_to_page(new), mm, address); + inc_page_state(nr_page_table_pages); pmd_populate_kernel(mm, pmd, new); } out: @@ -206,7 +216,7 @@ out: * variable count and make things faster. -jj * * dst->page_table_lock is held on entry and exit, - * but may be dropped within pmd_alloc() and pte_alloc_map(). + * but may be dropped within pmd_alloc_map() and pte_alloc_map(). 
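Since pmd pages may now live in highmem, a mapped pmd is a kmap that pte_alloc_map() can be forced to drop and re-establish; that is why it grows the pgd and pmd_t ** arguments, and why an error return leaves the pmd unmapped. The calling convention, as handle_mm_fault() uses it later in this patch:

	pgd_t *pgd = pgd_offset(mm, addr);
	pmd_t *pmd;
	pte_t *pte;

	spin_lock(&mm->page_table_lock);
	pmd = pmd_alloc_map(mm, pgd, addr);	/* kmaps the pmd page */
	if (pmd) {
		/* may unlock, allocate, relock, and remap *pmd */
		pte = pte_alloc_map(mm, pgd, &pmd, addr);
		if (pte) {
			/* ... operate on the pte ... */
			pte_unmap(pte);
			pmd_unmap(pmd);
		}
		/* on failure the pmd is already unmapped, per the
		   comment added above */
	}
	spin_unlock(&mm->page_table_lock);
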
*/ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma) @@ -215,20 +225,10 @@ int copy_page_range(struct mm_struct *ds unsigned long address = vma->vm_start; unsigned long end = vma->vm_end; unsigned long cow; - struct pte_chain *pte_chain = NULL; if (is_vm_hugetlb_page(vma)) return copy_hugetlb_page_range(dst, src, vma); - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - } - cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; src_pgd = pgd_offset(src, address)-1; dst_pgd = pgd_offset(dst, address)-1; @@ -251,11 +251,10 @@ skip_copy_pmd_range: address = (address continue; } - src_pmd = pmd_offset(src_pgd, address); - dst_pmd = pmd_alloc(dst, dst_pgd, address); + dst_pmd = pmd_alloc_map(dst, dst_pgd, address); if (!dst_pmd) goto nomem; - + src_pmd = pmd_offset_map_nested(src_pgd, address); do { pte_t * src_pte, * dst_pte; @@ -268,15 +267,20 @@ skip_copy_pmd_range: address = (address pmd_clear(src_pmd); skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; - if (address >= end) + if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); goto out; + } goto cont_copy_pmd_range; } - dst_pte = pte_alloc_map(dst, dst_pmd, address); + pmd_unmap_nested(src_pmd); + dst_pte = pte_alloc_map(dst, dst_pgd, &dst_pmd, address); if (!dst_pte) goto nomem; spin_lock(&src->page_table_lock); + src_pmd = pmd_offset_map_nested(src_pgd, address); src_pte = pte_offset_map_nested(src_pmd, address); do { pte_t pte = *src_pte; @@ -291,8 +295,7 @@ skip_copy_pte_range: if (!pte_present(pte)) { if (!pte_file(pte)) swap_duplicate(pte_to_swp_entry(pte)); - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + goto cont_copy_pte_range; } pfn = pte_pfn(pte); /* the pte points outside of valid memory, the @@ -300,13 +303,13 @@ skip_copy_pte_range: * and not mapped via rmap - duplicate the * mapping as is. */ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page || PageReserved(page)) { - set_pte(dst_pte, pte); - goto cont_copy_pte_range_noset; + if (!pfn_valid(pfn)) { + page = NULL; + goto cont_copy_pte_range; + } else { + page = pfn_to_page(pfn); + if (PageReserved(page)) + goto cont_copy_pte_range; } /* @@ -314,7 +317,7 @@ skip_copy_pte_range: * in the parent and the child */ if (cow) { - ptep_set_wrprotect(src_pte); + vm_ptep_set_wrprotect(src, src_pte); pte = *src_pte; } @@ -327,35 +330,14 @@ skip_copy_pte_range: pte = pte_mkold(pte); get_page(page); dst->rss++; - - set_pte(dst_pte, pte); - pte_chain = page_add_rmap(page, dst_pte, - pte_chain); - if (pte_chain) - goto cont_copy_pte_range_noset; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (pte_chain) - goto cont_copy_pte_range_noset; - - /* - * pte_chain allocation failed, and we need to - * run page reclaim. 
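Stripped of cursor movement and locking, the fork-time COW step that replaces the pte_chain logic comes down to: write-protect the parent pte, let the child inherit the protected copy, and account the sharing with an O(1) page_dup_rmap() instead of a pte_chain allocation that could fail mid-copy:

	if (cow) {
		vm_ptep_set_wrprotect(src, src_pte);	/* parent loses write */
		pte = *src_pte;
	}
	get_page(page);
	dst->rss++;
	page_dup_rmap(page);		/* bump the rmap count, no allocation */
	vm_set_pte(vma, dst_pte, pte, address);	/* child gets the same pte */
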
- */ - pte_unmap_nested(src_pte); - pte_unmap(dst_pte); - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - spin_lock(&dst->page_table_lock); - if (!pte_chain) - goto nomem; - spin_lock(&src->page_table_lock); - dst_pte = pte_offset_map(dst_pmd, address); - src_pte = pte_offset_map_nested(src_pmd, - address); + page_dup_rmap(page); +cont_copy_pte_range: + vm_set_pte(vma, dst_pte, pte, address); cont_copy_pte_range_noset: address += PAGE_SIZE; if (address >= end) { + pmd_unmap(dst_pmd); + pmd_unmap_nested(src_pmd); pte_unmap_nested(src_pte); pte_unmap(dst_pte); goto out_unlock; @@ -371,19 +353,19 @@ cont_copy_pmd_range: src_pmd++; dst_pmd++; } while ((unsigned long)src_pmd & PMD_TABLE_MASK); + pmd_unmap_nested(src_pmd-1); + pmd_unmap(dst_pmd-1); } out_unlock: spin_unlock(&src->page_table_lock); out: - pte_chain_free(pte_chain); return 0; nomem: - pte_chain_free(pte_chain); return -ENOMEM; } static void -zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd, +zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, unsigned long size) { unsigned long offset; @@ -408,32 +390,32 @@ zap_pte_range(struct mmu_gather *tlb, pm if (pte_present(pte)) { unsigned long pfn = pte_pfn(pte); - pte = ptep_get_and_clear(ptep); + pte = vm_ptep_get_and_clear(vma, ptep, address + offset); tlb_remove_tlb_entry(tlb, ptep, address+offset); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); if (!PageReserved(page)) { if (pte_dirty(pte)) set_page_dirty(page); - if (page->mapping && pte_young(pte) && + if (page_mapping(page) && pte_young(pte) && !PageSwapCache(page)) mark_page_accessed(page); tlb->freed++; - page_remove_rmap(page, ptep); + page_remove_rmap(page); tlb_remove_page(tlb, page); } } } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); - pte_clear(ptep); + vm_pte_clear(vma, ptep, address); } } pte_unmap(ptep-1); } static void -zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir, +zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *dir, unsigned long address, unsigned long size) { pmd_t * pmd; @@ -446,15 +428,16 @@ zap_pmd_range(struct mmu_gather *tlb, pg pgd_clear(dir); return; } - pmd = pmd_offset(dir, address); + pmd = pmd_offset_map(dir, address); end = address + size; if (end > ((address + PGDIR_SIZE) & PGDIR_MASK)) end = ((address + PGDIR_SIZE) & PGDIR_MASK); do { - zap_pte_range(tlb, pmd, address, end - address); + zap_pte_range(tlb, vma, pmd, address, end - address); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address < end); + pmd_unmap(pmd - 1); } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, @@ -472,7 +455,7 @@ void unmap_page_range(struct mmu_gather dir = pgd_offset(vma->vm_mm, address); tlb_start_vma(tlb, vma); do { - zap_pmd_range(tlb, dir, address, end - address); + zap_pmd_range(tlb, vma, dir, address, end - address); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -636,20 +619,27 @@ follow_page(struct mm_struct *mm, unsign if (pgd_none(*pgd) || pgd_bad(*pgd)) goto out; - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if (pmd_none(*pmd)) - goto out; - if (pmd_huge(*pmd)) - return follow_huge_pmd(mm, address, pmd, write); - if (pmd_bad(*pmd)) - goto out; + goto out_unmap; + if (pmd_bad(*pmd)) { + pmd_ERROR(*pmd); + pmd_clear(pmd); + goto out_unmap; + } + if (pmd_huge(*pmd)) { + struct page *page = follow_huge_pmd(mm, address, 
pmd, write); + pmd_unmap(pmd); + return page; + } ptep = pte_offset_map(pmd, address); if (!ptep) - goto out; + goto out_unmap; pte = *ptep; pte_unmap(ptep); + pmd_unmap(pmd); if (pte_present(pte)) { if (!write || (pte_write(pte) && pte_dirty(pte))) { pfn = pte_pfn(pte); @@ -664,6 +654,9 @@ follow_page(struct mm_struct *mm, unsign out: return NULL; +out_unmap: + pmd_unmap(pmd); + goto out; } /* @@ -722,7 +715,7 @@ int get_user_pages(struct task_struct *t pgd = pgd_offset_k(pg); if (!pgd) return i ? : -EFAULT; - pmd = pmd_offset(pgd, pg); + pmd = pmd_offset_kernel(pgd, pg); if (!pmd) return i ? : -EFAULT; pte = pte_offset_kernel(pmd, pg); @@ -798,8 +791,8 @@ out: EXPORT_SYMBOL(get_user_pages); -static void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static void zeromap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, pgprot_t prot) { unsigned long end; @@ -810,14 +803,14 @@ static void zeromap_pte_range(pte_t * pt do { pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(address), prot)); BUG_ON(!pte_none(*pte)); - set_pte(pte, zero_pte); + vm_set_pte(vma, pte, zero_pte, address); address += PAGE_SIZE; pte++; } while (address && (address < end)); } -static inline int zeromap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, - unsigned long size, pgprot_t prot) +static inline int zeromap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, pgprot_t prot) { unsigned long base, end; @@ -827,13 +820,13 @@ static inline int zeromap_pmd_range(stru if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, base + address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, base + address, end - address, prot); + zeromap_pte_range(vma, pte, base + address, end - address, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -853,13 +846,14 @@ int zeromap_page_range(struct vm_area_st spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, address); + pmd_t *pmd = pmd_alloc_map(mm, dir, address); error = -ENOMEM; if (!pmd) break; - error = zeromap_pmd_range(mm, pmd, address, end - address, prot); + error = zeromap_pmd_range(vma, dir, &pmd, address, end - address, prot); if (error) break; + pmd_unmap(pmd - 1); address = (address + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (address && (address < end)); @@ -873,8 +867,9 @@ int zeromap_page_range(struct vm_area_st * mappings are removed. 
any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ -static inline void remap_pte_range(pte_t * pte, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline void remap_pte_range(struct vm_area_struct *vma, pte_t *pte, + unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long end; unsigned long pfn; @@ -887,15 +882,16 @@ static inline void remap_pte_range(pte_t do { BUG_ON(!pte_none(*pte)); if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn))) - set_pte(pte, pfn_pte(pfn, prot)); + vm_set_pte(vma, pte, pfn_pte(pfn, prot), address); address += PAGE_SIZE; pfn++; pte++; } while (address && (address < end)); } -static inline int remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned long address, unsigned long size, - unsigned long phys_addr, pgprot_t prot) +static inline int remap_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + pmd_t **pmd, unsigned long address, unsigned long size, + unsigned long phys_addr, pgprot_t prot) { unsigned long base, end; @@ -906,13 +902,13 @@ static inline int remap_pmd_range(struct end = PGDIR_SIZE; phys_addr -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, base + address); + pte_t *pte = pte_alloc_map(vma->vm_mm, pgd, pmd, base + address); if (!pte) return -ENOMEM; - remap_pte_range(pte, base + address, end - address, address + phys_addr, prot); + remap_pte_range(vma, pte, base + address, end - address, address + phys_addr, prot); pte_unmap(pte); address = (address + PMD_SIZE) & PMD_MASK; - pmd++; + (*pmd)++; } while (address && (address < end)); return 0; } @@ -934,13 +930,14 @@ int remap_page_range(struct vm_area_stru spin_lock(&mm->page_table_lock); do { - pmd_t *pmd = pmd_alloc(mm, dir, from); + pmd_t *pmd = pmd_alloc_map(mm, dir, from); error = -ENOMEM; if (!pmd) break; - error = remap_pmd_range(mm, pmd, from, end - from, phys_addr + from, prot); + error = remap_pmd_range(vma, dir, &pmd, from, end - from, phys_addr + from, prot); if (error) break; + pmd_unmap(pmd - 1); from = (from + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (from && (from < end)); @@ -959,9 +956,10 @@ EXPORT_SYMBOL(remap_page_range); * * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void establish_pte(struct vm_area_struct * vma, unsigned long address, pte_t *page_table, pte_t entry) +static inline void establish_pte(struct vm_area_struct *vma, + unsigned long address, pte_t *page_table, pte_t entry) { - set_pte(page_table, entry); + vm_set_pte(vma, page_table, entry, address); flush_tlb_page(vma, address); update_mmu_cache(vma, address, entry); } @@ -969,8 +967,9 @@ static inline void establish_pte(struct /* * We hold the mm semaphore for reading and vma->vm_mm->page_table_lock */ -static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, - pte_t *page_table) +static inline void break_cow(struct vm_area_struct *vma, + struct page *new_page, unsigned long address, + pte_t *page_table) { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); @@ -1001,7 +1000,6 @@ static int do_wp_page(struct mm_struct * { struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); - struct pte_chain *pte_chain = NULL; int ret; if (unlikely(!pfn_valid(pfn))) { @@ -1011,6 +1009,7 @@ static int do_wp_page(struct mm_struct * * data, but for the moment just pretend this is OOM. 
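remap_page_range() keeps its external interface; only the walk underneath adopts the map/unmap pairs. For context, the conventional caller is a driver's mmap method, roughly as below — the driver name and physical address are made up:

	/* hypothetical driver: expose a device aperture to userspace */
	static int example_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;
		unsigned long phys = 0xe0000000;	/* made-up bus address */

		if (remap_page_range(vma, vma->vm_start, phys, size,
					vma->vm_page_prot))
			return -EAGAIN;
		return 0;
	}
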
*/ pte_unmap(page_table); + pmd_unmap(pmd); printk(KERN_ERR "do_wp_page: bogus page at address %08lx\n", address); goto oom; @@ -1019,17 +1018,22 @@ static int do_wp_page(struct mm_struct * if (!TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); - unlock_page(old_page); - if (reuse) { + if (!reuse) + unlock_page(old_page); + else { flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkyoung(pte_mkdirty(pte_mkwrite(pte)))); + page_turn_rmap(old_page, vma); pte_unmap(page_table); + pmd_unmap(pmd); ret = VM_FAULT_MINOR; + unlock_page(old_page); goto out; } } pte_unmap(page_table); + pmd_unmap(pmd); /* * Ok, we need to copy. Oh, well.. @@ -1037,9 +1041,6 @@ static int do_wp_page(struct mm_struct * page_cache_get(old_page); spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; new_page = alloc_page(GFP_HIGHUSER); if (!new_page) goto no_mem; @@ -1049,32 +1050,37 @@ static int do_wp_page(struct mm_struct * * Re-check the pte - we dropped the lock */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, pte)) { if (PageReserved(old_page)) ++mm->rss; - page_remove_rmap(old_page, page_table); + else + /* should be file-backed, ->__mapping not modified */ + page_remove_rmap(old_page); break_cow(vma, new_page, address, page_table); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + + /* we have a unique reference, so PG_locked need not be held */ + page_add_rmap(new_page, vma, address, 1); lru_cache_add_active(new_page); /* Free the old page.. */ new_page = old_page; } pte_unmap(page_table); + pmd_unmap(pmd); page_cache_release(new_page); page_cache_release(old_page); ret = VM_FAULT_MINOR; - goto out; +out: + spin_unlock(&mm->page_table_lock); + return ret; no_mem: page_cache_release(old_page); oom: ret = VM_FAULT_OOM; -out: - spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); - return ret; + goto out; } /* @@ -1098,17 +1104,30 @@ invalidate_mmap_range_list(struct list_h hea = hba + hlen - 1; /* avoid overflow. */ if (hea < hba) hea = ULONG_MAX; - list_for_each(curr, head) { + list_for_each_rcu(curr, head) { + struct mmu_gather *tlb; + unsigned long start, end; + vp = list_entry(curr, struct vm_area_struct, shared); + + if (vp->vm_flags & VM_DEAD) + continue; + vba = vp->vm_pgoff; vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; if (hea < vba || vea < hba) continue; /* Mapping disjoint from hole. */ zba = (hba <= vba) ? vba : hba; zea = (vea <= hea) ? 
vea : hea; - zap_page_range(vp, - ((zba - vba) << PAGE_SHIFT) + vp->vm_start, - (zea - zba + 1) << PAGE_SHIFT); + + start = vp->vm_start + ((zba - vba) << PAGE_SHIFT); + end = start + ((zea - zba + 1) << PAGE_SHIFT); + + spin_lock(&vp->vm_mm->page_table_lock); + tlb = tlb_gather_mmu(vp->vm_mm, 0); + unmap_page_range(tlb, vp, start, end); + tlb_finish_mmu(tlb, start, end); + spin_unlock(&vp->vm_mm->page_table_lock); } } @@ -1140,14 +1159,14 @@ void invalidate_mmap_range(struct addres if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } - down(&mapping->i_shared_sem); + rcu_read_lock(); /* Protect against page fault */ atomic_inc(&mapping->truncate_count); if (unlikely(!list_empty(&mapping->i_mmap))) invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen); if (unlikely(!list_empty(&mapping->i_mmap_shared))) invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen); - up(&mapping->i_shared_sem); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(invalidate_mmap_range); @@ -1230,9 +1249,9 @@ static int do_swap_page(struct mm_struct swp_entry_t entry = pte_to_swp_entry(orig_pte); pte_t pte; int ret = VM_FAULT_MINOR; - struct pte_chain *pte_chain = NULL; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = lookup_swap_cache(entry); if (!page) { @@ -1244,12 +1263,14 @@ static int do_swap_page(struct mm_struct * we released the page table lock. */ spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); if (pte_same(*page_table, orig_pte)) ret = VM_FAULT_OOM; else ret = VM_FAULT_MINOR; pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); goto out; } @@ -1260,26 +1281,27 @@ static int do_swap_page(struct mm_struct } mark_page_accessed(page); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { - ret = -ENOMEM; - goto out; - } lock_page(page); + if (!rmap_get_cpu()) { + ret = VM_FAULT_OOM; + goto outrel; + } + spin_lock(&mm->page_table_lock); + put_cpu(); + pmd = pmd_offset_map(pgd_offset(mm, address), address); + page_table = pte_offset_map(pmd, address); + /* * Back out if somebody else faulted in this pte while we * released the page table lock. */ - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, address); if (!pte_same(*page_table, orig_pte)) { pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto outrel; } /* The page isn't present yet, go ahead with the fault. 
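With i_shared_sem gone, truncation walks the shared lists under RCU: unlinkers flag the vma VM_DEAD before list_del_rcu(), and the walker both skips dead vmas and does its own unmapping with a private mmu_gather. The traversal skeleton used by invalidate_mmap_range_list() above:

	rcu_read_lock();
	list_for_each_rcu(curr, &mapping->i_mmap_shared) {
		struct vm_area_struct *vp =
			list_entry(curr, struct vm_area_struct, shared);

		if (vp->vm_flags & VM_DEAD)	/* unlinked, free deferred */
			continue;
		/* compute the overlap with the hole, then unmap it under
		   vp->vm_mm->page_table_lock with tlb_gather_mmu() */
	}
	rcu_read_unlock();
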
*/ @@ -1292,19 +1314,23 @@ static int do_swap_page(struct mm_struct pte = mk_pte(page, vma->vm_page_prot); if (write_access && can_share_swap_page(page)) pte = pte_mkdirty(pte_mkwrite(pte)); - unlock_page(page); flush_icache_page(vma, page); - set_pte(page_table, pte); - pte_chain = page_add_rmap(page, page_table, pte_chain); + vm_set_pte(vma, page_table, pte, address); + page_add_rmap(page, vma, address, 1); + unlock_page(page); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); + pmd_unmap(pmd); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); out: - pte_chain_free(pte_chain); return ret; +outrel: + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1319,20 +1345,8 @@ do_anonymous_page(struct mm_struct *mm, { pte_t entry; struct page * page = ZERO_PAGE(addr); - struct pte_chain *pte_chain; int ret; - pte_chain = pte_chain_alloc(GFP_ATOMIC); - if (!pte_chain) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto no_mem; - spin_lock(&mm->page_table_lock); - page_table = pte_offset_map(pmd, addr); - } - /* Read-only mapping of ZERO_PAGE. */ entry = pte_wrprotect(mk_pte(ZERO_PAGE(addr), vma->vm_page_prot)); @@ -1340,6 +1354,7 @@ do_anonymous_page(struct mm_struct *mm, if (write_access) { /* Allocate our own private page. */ pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); page = alloc_page(GFP_HIGHUSER); @@ -1348,9 +1363,11 @@ do_anonymous_page(struct mm_struct *mm, clear_user_highpage(page, addr); spin_lock(&mm->page_table_lock); + pmd = pmd_offset_map(pgd_offset(mm, addr), addr); page_table = pte_offset_map(pmd, addr); if (!pte_none(*page_table)) { + pmd_unmap(pmd); pte_unmap(page_table); page_cache_release(page); spin_unlock(&mm->page_table_lock); @@ -1359,26 +1376,26 @@ do_anonymous_page(struct mm_struct *mm, } mm->rss++; entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); + } + + vm_set_pte(vma, page_table, entry, addr); + if (write_access) { + page_add_rmap(page, vma, addr, 1); lru_cache_add_active(page); mark_page_accessed(page); } - - set_pte(page_table, entry); - /* ignores ZERO_PAGE */ - pte_chain = page_add_rmap(page, page_table, pte_chain); + pmd_unmap(pmd); pte_unmap(page_table); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, addr, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; - goto out; - -no_mem: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +no_mem: + ret = VM_FAULT_OOM; + goto out; } /* @@ -1400,14 +1417,14 @@ do_no_page(struct mm_struct *mm, struct struct page * new_page; struct address_space *mapping = NULL; pte_t entry; - struct pte_chain *pte_chain; int sequence = 0; - int ret; + int ret, anon = 0; if (!vma->vm_ops || !vma->vm_ops->nopage) return do_anonymous_page(mm, vma, page_table, pmd, write_access, address); pte_unmap(page_table); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); if (vma->vm_file) { @@ -1424,26 +1441,25 @@ retry: if (new_page == NOPAGE_OOM) return VM_FAULT_OOM; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) - goto oom; - /* * Should we do an early C-O-W break? 
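do_swap_page() and do_anonymous_page() now share one shape: to sleep or allocate, drop everything; to resume, re-walk pgd to pmd to pte (the pmd kmap cannot be held across the unlock) and back out if the pte changed underneath. Distilled, with orig_pte sampled before the unlock:

	pte_unmap(page_table);
	pmd_unmap(pmd);
	spin_unlock(&mm->page_table_lock);

	page = alloc_page(GFP_HIGHUSER);		/* may sleep */

	spin_lock(&mm->page_table_lock);
	pmd = pmd_offset_map(pgd_offset(mm, address), address);
	page_table = pte_offset_map(pmd, address);
	if (!pte_same(*page_table, orig_pte)) {
		/* another thread serviced the fault first; back out */
		pte_unmap(page_table);
		pmd_unmap(pmd);
		spin_unlock(&mm->page_table_lock);
		return VM_FAULT_MINOR;
	}
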
*/ - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((write_access || page_zone(new_page)->zone_pgdat->node_id != numa_node_id()) && !(vma->vm_flags & VM_SHARED)) { struct page * page = alloc_page(GFP_HIGHUSER); - if (!page) { - page_cache_release(new_page); + if (!page) goto oom; - } + /* start with refcount 1 */ copy_user_highpage(page, new_page, address); page_cache_release(new_page); - lru_cache_add_active(page); + anon = 1; new_page = page; } + if (!rmap_get_cpu()) + goto oom; spin_lock(&mm->page_table_lock); + put_cpu(); + /* * For a file-backed vma, someone could have truncated or otherwise * invalidated this page. If invalidate_mmap_range got called, @@ -1454,9 +1470,9 @@ retry: sequence = atomic_read(&mapping->truncate_count); spin_unlock(&mm->page_table_lock); page_cache_release(new_page); - pte_chain_free(pte_chain); goto retry; } + pmd = pmd_offset_map(pgd_offset(mm, address), address); page_table = pte_offset_map(pmd, address); /* @@ -1477,12 +1493,29 @@ retry: entry = mk_pte(new_page, vma->vm_page_prot); if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); - set_pte(page_table, entry); - pte_chain = page_add_rmap(new_page, page_table, pte_chain); + vm_set_pte(vma, page_table, entry, address); + + /* + * PG_locked not held for the anon case, but we have a + * unique reference, and ->__mapping is untouched when file-backed + */ + if (!PageReserved(new_page)) + page_add_rmap(new_page, vma, address, anon); + + /* kswapd can find us now, but we're already prepped */ + if (anon) + lru_cache_add_active(new_page); pte_unmap(page_table); + pmd_unmap(pmd); } else { /* One of our sibling threads was faster, back out. */ pte_unmap(page_table); + pmd_unmap(pmd); + /* + * In the anon case, we never hit the LRU, so we free instantly, + * where in mainline the LRU retains a reference. In the file- + * backed case, we merely release a reference acquired earlier. + */ page_cache_release(new_page); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MINOR; @@ -1493,12 +1526,12 @@ retry: update_mmu_cache(vma, address, entry); spin_unlock(&mm->page_table_lock); ret = VM_FAULT_MAJOR; - goto out; -oom: - ret = VM_FAULT_OOM; out: - pte_chain_free(pte_chain); return ret; +oom: + page_cache_release(new_page); + ret = VM_FAULT_OOM; + goto out; } /* @@ -1519,13 +1552,14 @@ static int do_file_page(struct mm_struct */ if (!vma->vm_ops || !vma->vm_ops->populate || (write_access && !(vma->vm_flags & VM_SHARED))) { - pte_clear(pte); + vm_pte_clear(vma, pte, address); return do_no_page(mm, vma, address, write_access, pte, pmd); } pgoff = pte_to_pgoff(*pte); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE, vma->vm_page_prot, pgoff, 0); @@ -1586,6 +1620,7 @@ static inline int handle_pte_fault(struc entry = pte_mkyoung(entry); establish_pte(vma, address, pte, entry); pte_unmap(pte); + pmd_unmap(pmd); spin_unlock(&mm->page_table_lock); return VM_FAULT_MINOR; } @@ -1612,10 +1647,10 @@ int handle_mm_fault(struct mm_struct *mm * and the SMP-safe atomic PTE updates. 
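Because faults no longer serialize against truncation on a semaphore, do_no_page() relies on the generation counter alone: sample truncate_count before calling ->nopage, re-sample under page_table_lock, and retry on mismatch. The pattern, reduced from the hunk above:

	sequence = atomic_read(&mapping->truncate_count);
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
	/* ... allocation, early COW ... */
	spin_lock(&mm->page_table_lock);
	if (mapping && unlikely(sequence !=
				atomic_read(&mapping->truncate_count))) {
		spin_unlock(&mm->page_table_lock);
		page_cache_release(new_page);
		goto retry;	/* the range was invalidated meanwhile */
	}
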
*/ spin_lock(&mm->page_table_lock); - pmd = pmd_alloc(mm, pgd, address); + pmd = pmd_alloc_map(mm, pgd, address); if (pmd) { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, pgd, &pmd, address); if (pte) return handle_pte_fault(mm, vma, address, write_access, pte, pmd); } @@ -1634,10 +1669,33 @@ int handle_mm_fault(struct mm_struct *mm */ pmd_t *__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) { + struct page *page; + + spin_unlock(&mm->page_table_lock); + page = pmd_alloc_one(mm, address); + spin_lock(&mm->page_table_lock); + if (!page) + return NULL; + + /* + * Because we dropped the lock, we should re-check the + * entry, as somebody else could have populated it.. + */ + if (pgd_present(*pgd)) { + pmd_free(page); + goto out; + } + pgd_populate(mm, pgd, page); +out: + return pmd_offset_map(pgd, address); +} + +pmd_t *__pmd_alloc_kernel(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ pmd_t *new; spin_unlock(&mm->page_table_lock); - new = pmd_alloc_one(mm, address); + new = pmd_alloc_one_kernel(mm, address); spin_lock(&mm->page_table_lock); if (!new) return NULL; @@ -1647,12 +1705,12 @@ pmd_t *__pmd_alloc(struct mm_struct *mm, * entry, as somebody else could have populated it.. */ if (pgd_present(*pgd)) { - pmd_free(new); + pmd_free(virt_to_page(new)); goto out; } - pgd_populate(mm, pgd, new); + pgd_populate(mm, pgd, virt_to_page(new)); out: - return pmd_offset(pgd, address); + return pmd_offset_kernel(pgd, address); } int make_pages_present(unsigned long addr, unsigned long end) @@ -1686,7 +1744,7 @@ struct page * vmalloc_to_page(void * vma pte_t *ptep, pte; if (!pgd_none(*pgd)) { - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map(pgd, addr); if (!pmd_none(*pmd)) { preempt_disable(); ptep = pte_offset_map(pmd, addr); @@ -1696,6 +1754,7 @@ struct page * vmalloc_to_page(void * vma pte_unmap(ptep); preempt_enable(); } + pmd_unmap(pmd); } return page; } diff -prauN linux-2.6.0-test7/mm/mmap.c wli-2.6.0-test7-bk1-29/mm/mmap.c --- linux-2.6.0-test7/mm/mmap.c 2003-10-08 12:24:16.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/mmap.c 2003-10-09 19:57:40.000000000 -0700 @@ -58,8 +58,19 @@ EXPORT_SYMBOL(sysctl_overcommit_memory); EXPORT_SYMBOL(sysctl_overcommit_ratio); EXPORT_SYMBOL(vm_committed_space); +static void __free_vma(void *vma) +{ + kmem_cache_free(vm_area_cachep, vma); +} + +void free_vma(struct vm_area_struct *vma) +{ + INIT_LIST_HEAD(&vma->rcu.list); + call_rcu(&vma->rcu, __free_vma, vma); +} + /* - * Requires inode->i_mapping->i_shared_sem + * Requires inode->i_mapping->i_shared_lock */ static inline void __remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) @@ -67,7 +78,8 @@ __remove_shared_vm_struct(struct vm_area if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + vma->vm_flags |= VM_DEAD; + list_del_rcu(&vma->shared); } } @@ -81,9 +93,9 @@ static void remove_shared_vm_struct(stru if (file) { struct inode *inode = file->f_dentry->d_inode; - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); __remove_shared_vm_struct(vma, inode); - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); } } @@ -241,9 +253,9 @@ static inline void __vma_link_file(struc atomic_dec(&inode->i_writecount); if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap_shared); else - list_add_tail(&vma->shared, 
&mapping->i_mmap); + list_add_tail_rcu(&vma->shared, &mapping->i_mmap); } } @@ -267,12 +279,12 @@ static void vma_link(struct mm_struct *m mapping = vma->vm_file->f_dentry->d_inode->i_mapping; if (mapping) - down(&mapping->i_shared_sem); + spin_lock(&mapping->i_shared_lock); spin_lock(&mm->page_table_lock); __vma_link(mm, vma, prev, rb_link, rb_parent); spin_unlock(&mm->page_table_lock); if (mapping) - up(&mapping->i_shared_sem); + spin_unlock(&mapping->i_shared_lock); mark_mm_hugetlb(mm, vma); mm->map_count++; @@ -299,6 +311,28 @@ static inline int is_mergeable_vma(struc return 1; } +static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) +{ + spinlock_t *lock = &vma->vm_mm->page_table_lock; + struct inode *inode = NULL; + + if (vma->vm_file) { + inode = vma->vm_file->f_dentry->d_inode; + spin_lock(&inode->i_mapping->i_shared_lock); + } + spin_lock(lock); + if (inode) + __remove_shared_vm_struct(vma, inode); + /* If no vm_file, perhaps we should always keep vm_pgoff at 0?? */ + vma->vm_pgoff += (long)(addr - vma->vm_start) >> PAGE_SHIFT; + vma->vm_start = addr; + if (inode) { + __vma_link_file(vma); + spin_unlock(&inode->i_mapping->i_shared_lock); + } + spin_unlock(lock); +} + /* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. @@ -351,8 +385,6 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; - /* * We later require that vma->vm_flags == vm_flags, so this tests * vma->vm_flags & VM_SPECIAL, too. @@ -372,12 +404,13 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; + spinlock_t *lock = &mm->page_table_lock; struct inode *inode = file ? file->f_dentry->d_inode : NULL; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(&inode->i_mapping->i_shared_sem); + spin_lock(&inode->i_mapping->i_shared_lock); need_up = 1; } spin_lock(lock); @@ -395,17 +428,17 @@ static int vma_merge(struct mm_struct *m __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); if (file) fput(file); mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); return 1; } spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + spin_unlock(&inode->i_mapping->i_shared_lock); return 1; } @@ -419,10 +452,7 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { - spin_lock(lock); - prev->vm_start = addr; - prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; - spin_unlock(lock); + move_vma_start(prev, addr); return 1; } } @@ -649,7 +679,7 @@ munmap_back: atomic_inc(&inode->i_writecount); fput(file); } - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); } out: mm->total_vm += len >> PAGE_SHIFT; @@ -674,7 +704,7 @@ unmap_and_free_vma: /* Undo any partial mapping done by a device driver. 
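Taken together, the unlink side pairs the VM_DEAD flag and list_del_rcu() under i_shared_lock with an RCU-deferred free, so a concurrent lockless walker never dereferences freed memory. Composed from the pieces above:

	spin_lock(&mapping->i_shared_lock);
	vma->vm_flags |= VM_DEAD;	/* RCU walkers skip it from here on */
	list_del_rcu(&vma->shared);
	spin_unlock(&mapping->i_shared_lock);
	/* ... */
	free_vma(vma);			/* kmem_cache_free() via call_rcu() */
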
*/ zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start); free_vma: - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); unacct_error: if (charged) vm_unacct_memory(charged); @@ -1067,7 +1097,7 @@ static void unmap_vma(struct mm_struct * area->vm_ops->close(area); if (area->vm_file) fput(area->vm_file); - kmem_cache_free(vm_area_cachep, area); + free_vma(area); } /* @@ -1157,8 +1187,7 @@ int split_vma(struct mm_struct * mm, str if (new_below) { new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + move_vma_start(vma, addr); } else { vma->vm_end = addr; new->vm_start = addr; @@ -1413,7 +1442,7 @@ void exit_mmap(struct mm_struct *mm) } if (vma->vm_file) fput(vma->vm_file); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); vma = next; } } diff -prauN linux-2.6.0-test7/mm/mprotect.c wli-2.6.0-test7-bk1-29/mm/mprotect.c --- linux-2.6.0-test7/mm/mprotect.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/mprotect.c 2003-10-09 19:40:04.000000000 -0700 @@ -24,11 +24,11 @@ #include static inline void -change_pte_range(pmd_t *pmd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, unsigned long size, pgprot_t newprot) { pte_t * pte; - unsigned long end; + unsigned long start, end; if (pmd_none(*pmd)) return; @@ -38,6 +38,7 @@ change_pte_range(pmd_t *pmd, unsigned lo return; } pte = pte_offset_map(pmd, address); + start = address & PMD_MASK; address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -50,8 +51,8 @@ change_pte_range(pmd_t *pmd, unsigned lo * bits by wiping the pte and then setting the new pte * into place. */ - entry = ptep_get_and_clear(pte); - set_pte(pte, pte_modify(entry, newprot)); + entry = vm_ptep_get_and_clear(vma, pte, address + start); + vm_set_pte(vma, pte, pte_modify(entry, newprot), start + address); } address += PAGE_SIZE; pte++; @@ -60,11 +61,11 @@ change_pte_range(pmd_t *pmd, unsigned lo } static inline void -change_pmd_range(pgd_t *pgd, unsigned long address, - unsigned long size, pgprot_t newprot) +change_pmd_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long address, unsigned long size, pgprot_t newprot) { pmd_t * pmd; - unsigned long end; + unsigned long start, end; if (pgd_none(*pgd)) return; @@ -73,16 +74,18 @@ change_pmd_range(pgd_t *pgd, unsigned lo pgd_clear(pgd); return; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); + start = address & PGDIR_MASK; address &= ~PGDIR_MASK; end = address + size; if (end > PGDIR_SIZE) end = PGDIR_SIZE; do { - change_pte_range(pmd, address, end - address, newprot); + change_pte_range(vma, pmd, start + address, end - address, newprot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); } static void @@ -98,7 +101,7 @@ change_protection(struct vm_area_struct BUG(); spin_lock(&current->mm->page_table_lock); do { - change_pmd_range(dir, start, end - start, newprot); + change_pmd_range(vma, dir, start, end - start, newprot); start = (start + PGDIR_SIZE) & PGDIR_MASK; dir++; } while (start && (start < end)); @@ -135,7 +138,7 @@ mprotect_attempt_merge(struct vm_area_st __vma_unlink(mm, vma, prev); spin_unlock(&mm->page_table_lock); - kmem_cache_free(vm_area_cachep, vma); + free_vma(vma); mm->map_count--; return 1; } @@ -322,7 +325,7 @@ sys_mprotect(unsigned long start, size_t __vma_unlink(prev->vm_mm, next, prev); spin_unlock(&prev->vm_mm->page_table_lock); -
kmem_cache_free(vm_area_cachep, next); + free_vma(next); prev->vm_mm->map_count--; } out: diff -prauN linux-2.6.0-test7/mm/mremap.c wli-2.6.0-test7-bk1-29/mm/mremap.c --- linux-2.6.0-test7/mm/mremap.c 2003-10-08 12:24:07.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/mremap.c 2003-10-09 19:47:30.000000000 -0700 @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include @@ -38,7 +38,7 @@ static pte_t *get_one_pte_map_nested(str goto end; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_map_nested(pgd, addr); if (pmd_none(*pmd)) goto end; if (pmd_bad(*pmd)) { @@ -53,6 +53,7 @@ static pte_t *get_one_pte_map_nested(str pte = NULL; } end: + pmd_unmap_nested(pmd); return pte; } @@ -60,50 +61,51 @@ static inline int page_table_present(str { pgd_t *pgd; pmd_t *pmd; + int ret; pgd = pgd_offset(mm, addr); if (pgd_none(*pgd)) return 0; - pmd = pmd_offset(pgd, addr); - return pmd_present(*pmd); + pmd = pmd_offset_map(pgd, addr); + ret = pmd_present(*pmd); + pmd_unmap(pmd); + return ret != 0; } static inline pte_t *alloc_one_pte_map(struct mm_struct *mm, unsigned long addr) { + pgd_t *pgd; pmd_t *pmd; pte_t *pte = NULL; - pmd = pmd_alloc(mm, pgd_offset(mm, addr), addr); + pgd = pgd_offset(mm, addr); + pmd = pmd_alloc_map(mm, pgd, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, pgd, &pmd, addr); + pmd_unmap(pmd); return pte; } static int -copy_one_pte(struct mm_struct *mm, pte_t *src, pte_t *dst, - struct pte_chain **pte_chainp) +copy_one_pte(struct vm_area_struct *vma, pte_t *src, pte_t *dst, + unsigned long old_addr, unsigned long new_addr) { - int error = 0; - pte_t pte; - struct page *page = NULL; - - if (pte_present(*src)) - page = pte_page(*src); - if (!pte_none(*src)) { - if (page) - page_remove_rmap(page, src); - pte = ptep_get_and_clear(src); - if (!dst) { - /* No dest? We must put it back. 
*/ - dst = src; - error++; + pte_t pte; + if (!dst) + return -1; + pte = vm_ptep_get_and_clear(vma, src, old_addr); + vm_set_pte(vma, dst, pte, new_addr); + if (pte_present(pte)) { + unsigned long pfn = pte_pfn(pte); + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + if (!PageReserved(page)) + page_move_rmap(page, vma, old_addr, new_addr); + } } - set_pte(dst, pte); - if (page) - *pte_chainp = page_add_rmap(page, dst, *pte_chainp); } - return error; + return 0; } static int @@ -111,16 +113,16 @@ move_one_page(struct vm_area_struct *vma unsigned long new_addr) { struct mm_struct *mm = vma->vm_mm; - int error = 0; pte_t *src, *dst; - struct pte_chain *pte_chain; + int error = 0; - pte_chain = pte_chain_alloc(GFP_KERNEL); - if (!pte_chain) { + if (!rmap_get_cpu()) { error = -ENOMEM; goto out; } + spin_lock(&mm->page_table_lock); + put_cpu(); src = get_one_pte_map_nested(mm, old_addr); if (src) { /* @@ -135,13 +137,12 @@ move_one_page(struct vm_area_struct *vma dst = alloc_one_pte_map(mm, new_addr); if (src == NULL) src = get_one_pte_map_nested(mm, old_addr); - error = copy_one_pte(mm, src, dst, &pte_chain); + error = copy_one_pte(vma, src, dst, old_addr, new_addr); pte_unmap_nested(src); pte_unmap(dst); } flush_tlb_page(vma, old_addr); spin_unlock(&mm->page_table_lock); - pte_chain_free(pte_chain); out: return error; } @@ -210,7 +211,7 @@ static unsigned long move_vma(struct vm_ if (vma == next) vma = prev; mm->map_count--; - kmem_cache_free(vm_area_cachep, next); + free_vma(next); } } else if (next->vm_start == new_addr + new_len && can_vma_merge(next, vma->vm_flags) && @@ -286,7 +287,7 @@ static unsigned long move_vma(struct vm_ return new_addr; } if (allocated_vma) - kmem_cache_free(vm_area_cachep, new_vma); + free_vma(new_vma); out: return -ENOMEM; } diff -prauN linux-2.6.0-test7/mm/msync.c wli-2.6.0-test7-bk1-29/mm/msync.c --- linux-2.6.0-test7/mm/msync.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/msync.c 2003-10-09 19:28:46.000000000 -0700 @@ -82,7 +82,7 @@ static inline int filemap_sync_pmd_range pgd_clear(pgd); return 0; } - pmd = pmd_offset(pgd, address); + pmd = pmd_offset_map(pgd, address); if ((address & PGDIR_MASK) != (end & PGDIR_MASK)) end = (address & PGDIR_MASK) + PGDIR_SIZE; error = 0; @@ -91,6 +91,7 @@ static inline int filemap_sync_pmd_range address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); + pmd_unmap(pmd - 1); return error; } diff -prauN linux-2.6.0-test7/mm/nommu.c wli-2.6.0-test7-bk1-29/mm/nommu.c --- linux-2.6.0-test7/mm/nommu.c 2003-10-08 12:24:43.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/nommu.c 2003-10-09 19:44:30.000000000 -0700 @@ -562,7 +562,3 @@ unsigned long get_unmapped_area(struct f { return -ENOMEM; } - -void pte_chain_init(void) -{ -} diff -prauN linux-2.6.0-test7/mm/page-writeback.c wli-2.6.0-test7-bk1-29/mm/page-writeback.c --- linux-2.6.0-test7/mm/page-writeback.c 2003-10-08 12:24:17.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/page-writeback.c 2003-10-09 19:42:26.000000000 -0700 @@ -457,7 +457,7 @@ int do_writepages(struct address_space * */ int write_one_page(struct page *page, int wait) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); int ret = 0; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, @@ -469,12 +469,12 @@ int write_one_page(struct page *page, in if (wait) wait_on_page_writeback(page); - spin_lock(&mapping->page_lock); + mapping_wrlock(&mapping->page_lock); list_del(&page->list); if 
(test_clear_page_dirty(page)) { list_add(&page->list, &mapping->locked_pages); page_cache_get(page); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); ret = mapping->a_ops->writepage(page, &wbc); if (ret == 0 && wait) { wait_on_page_writeback(page); @@ -484,7 +484,7 @@ int write_one_page(struct page *page, in page_cache_release(page); } else { list_add(&page->list, &mapping->clean_pages); - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); unlock_page(page); } return ret; @@ -496,31 +496,31 @@ EXPORT_SYMBOL(write_one_page); * and move it to the dirty_pages list. Also perform space reservation if * required. * - * __set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page + * set_page_dirty_nobuffers() may return -ENOSPC. But if it does, the page * is still safe, as long as it actually manages to find some blocks at * writeback time. * * This is also used when a single buffer is being dirtied: we want to set the * page dirty in that case, but not all the buffers. This is a "bottom-up" - * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying. + * dirtying, whereas set_page_dirty_buffers() is a "top-down" dirtying. */ -int __set_page_dirty_nobuffers(struct page *page) +int set_page_dirty_nobuffers(struct page *page) { int ret = 0; if (!TestSetPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping) { - spin_lock(&mapping->page_lock); - if (page->mapping) { /* Race with truncate? */ - BUG_ON(page->mapping != mapping); + mapping_wrlock(&mapping->page_lock); + if (page_mapping(page)) { /* Race with truncate? */ + BUG_ON(page_mapping(page) != mapping); if (!mapping->backing_dev_info->memory_backed) inc_page_state(nr_dirty); list_del(&page->list); list_add(&page->list, &mapping->dirty_pages); } - spin_unlock(&mapping->page_lock); + mapping_wrunlock(&mapping->page_lock); if (!PageSwapCache(page)) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -528,7 +528,28 @@ int __set_page_dirty_nobuffers(struct pa } return ret; } -EXPORT_SYMBOL(__set_page_dirty_nobuffers); +EXPORT_SYMBOL(set_page_dirty_nobuffers); + +/* + * If the mapping doesn't provide a set_page_dirty() a_op, then + * just fall through and assume that it wants bh's. + */ +int set_page_dirty(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + int (*spd)(struct page *); + + if (!mapping) { + SetPageDirty(page); + return 0; + } + spd = mapping->a_ops->set_page_dirty; + if (spd) + return (*spd)(page); + else + return set_page_dirty_buffers(page); +} +EXPORT_SYMBOL(set_page_dirty); /* * set_page_dirty() is racy if the caller has no reference against @@ -549,6 +570,7 @@ int set_page_dirty_lock(struct page *pag unlock_page(page); return ret; } +EXPORT_SYMBOL(set_page_dirty_lock); /* * Clear a page's dirty flag, while caring for dirty memory accounting. 
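The set_page_dirty() added above is a plain dispatcher: a mapping that supplies an a_ops->set_page_dirty method gets it called, and everything else falls through to the buffer_head-based set_page_dirty_buffers(). A minimal sketch of a filesystem opting in to the renamed no-buffers variant follows; example_writepage is a hypothetical stand-in, but the mm/shmem.c hunk later in this patch wires up set_page_dirty_nobuffers in exactly this way.

/* sketch: opting a mapping out of buffer_head-based dirtying */
static int example_writepage(struct page *page, struct writeback_control *wbc);	/* hypothetical */

static struct address_space_operations example_aops = {
	.writepage	= example_writepage,
	.set_page_dirty	= set_page_dirty_nobuffers,	/* dirty the page only */
};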
@@ -557,7 +579,7 @@ int set_page_dirty_lock(struct page *pag int test_clear_page_dirty(struct page *page) { if (TestClearPageDirty(page)) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_mapping(page); if (mapping && !mapping->backing_dev_info->memory_backed) dec_page_state(nr_dirty); diff -prauN linux-2.6.0-test7/mm/page_alloc.c wli-2.6.0-test7-bk1-29/mm/page_alloc.c --- linux-2.6.0-test7/mm/page_alloc.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/page_alloc.c 2003-10-09 19:57:40.000000000 -0700 @@ -74,7 +74,7 @@ static void bad_page(const char *functio { printk("Bad page state at %s\n", function); printk("flags:0x%08lx mapping:%p mapped:%d count:%d\n", - page->flags, page->mapping, + page->flags, (void *)page->__mapping, page_mapped(page), page_count(page)); printk("Backtrace:\n"); dump_stack(); @@ -84,9 +84,12 @@ static void bad_page(const char *functio 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); - page->mapping = NULL; + set_page_mapping(page, NULL); } #ifndef CONFIG_HUGETLB_PAGE @@ -168,7 +171,7 @@ static void destroy_compound_page(struct * -- wli */ -static inline void __free_pages_bulk (struct page *page, struct page *base, +static inline void buddy_free(struct page *page, struct page *base, struct zone *zone, struct free_area *area, unsigned long mask, unsigned int order) { @@ -181,7 +184,6 @@ static inline void __free_pages_bulk (st BUG(); index = page_idx >> (1 + order); - zone->free_pages -= mask; while (mask + (1 << (MAX_ORDER-1))) { struct page *buddy1, *buddy2; @@ -202,17 +204,45 @@ static inline void __free_pages_bulk (st BUG_ON(bad_range(zone, buddy2)); list_del(&buddy1->list); mask <<= 1; + area->globally_free--; area++; index >>= 1; page_idx &= mask; } list_add(&(base + page_idx)->list, &area->free_list); + area->globally_free++; +} + +static inline void __free_pages_bulk(struct page *page, struct page *base, + struct zone *zone, struct free_area *area, unsigned long mask, + unsigned int order) +{ + switch (area->active - area->locally_free) { + case 0: + if (!list_empty(&area->deferred_pages)) { + struct page *defer = list_entry(area->deferred_pages.next, struct page, list); + list_del(&defer->list); + area->locally_free--; + buddy_free(defer, base, zone, area, mask, order); + } + /* fall through */ + case 1: + buddy_free(page, base, zone, area, mask, order); + break; + default: + list_add(&page->list, &area->deferred_pages); + area->locally_free++; + break; + } + if (area->active) + area->active--; + zone->free_pages += 1 << order; } static inline void free_pages_check(const char *function, struct page *page) { if ( page_mapped(page) || - page->mapping != NULL || + page->__mapping != 0 || page_count(page) != 0 || (page->flags & ( 1 << PG_lru | @@ -220,6 +250,8 @@ static inline void free_pages_check(cons 1 << PG_locked | 1 << PG_active | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | 1 << PG_slab | 1 << PG_writeback ))) bad_page(function, page); @@ -238,41 +270,78 @@ static inline void free_pages_check(cons * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. 
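In the rewritten free_pages_bulk() below, a batch of order-0 pages travels as a single object: the head page's ->private carries the batch count, the remaining pages are threaded onto head->list, and head->lru links the whole batch into the free_area's deferred_pages list. A freestanding sketch of that bookkeeping, assuming only the conventions visible in this patch:

/*
 * sketch: defer a ready-made batch instead of coalescing it now.
 * head->private = pages in the batch (head included),
 * head->list    = the batch's other pages,
 * head->lru     = linkage into area->deferred_pages.
 */
static void defer_batch_sketch(struct free_area *area, struct page *head,
				unsigned long count)
{
	head->private = count;
	list_add(&head->lru, &area->deferred_pages);
	area->locally_free += count;	/* deferred, not yet in the buddy lists */
}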
*/ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +void free_pages_bulk(struct zone *zone, struct page *page, unsigned int order) { - unsigned long mask, flags; + unsigned long mask, flags, count; struct free_area *area; - struct page *base, *page = NULL; - int ret = 0; + struct page *base, *save; + LIST_HEAD(tmp); + count = page->private; mask = (~0UL) << order; base = zone->zone_mem_map; area = zone->free_area + order; spin_lock_irqsave(&zone->lock, flags); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { - page = list_entry(list->prev, struct page, list); - /* have to delete it as __free_pages_bulk list manipulates */ - list_del(&page->list); - __free_pages_bulk(page, base, zone, area, mask, order); - ret++; + + if (order || area->active - area->locally_free <= 2*count) { + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + page->private = 0; + } + + if (order) { + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + __free_pages_bulk(page, base, zone, area, mask, order); + } + } else if (area->active - area->locally_free <= 2*count) { + /* + * This is a somewhat ad hoc approach to dealing with + * the interaction of gang allocation and the deferred + * coalescing heuristics. + */ + if (area->active - area->locally_free < count) { + int local = 0; + + while (local < count && area->locally_free) { + struct page *follow, *head = + list_entry(area->deferred_pages.next, struct page, lru); + list_del(&head->lru); + list_for_each_entry_safe(follow, save, &head->list, list) { + list_del(&follow->list); + buddy_free(follow, base, zone, area, mask, 0); + } + local += head->private; + area->locally_free -= head->private; + head->private = 0; + buddy_free(head, base, zone, area, mask, 0); + } + } + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, base, zone, area, mask, order); + } + } else { + area->locally_free += count; + list_add(&page->lru, &area->deferred_pages); + } + if (!order) { + zone->free_pages += count; + area->active -= min(area->active, count); } spin_unlock_irqrestore(&zone->lock, flags); - return ret; } void __free_pages_ok(struct page *page, unsigned int order) { - LIST_HEAD(list); - mod_page_state(pgfree, 1 << order); free_pages_check(__FUNCTION__, page); - list_add(&page->list, &list); kernel_map_pages(page, 1<<order, 0); - free_pages_bulk(page_zone(page), 1, &list, order); + page->private = 1; + INIT_LIST_HEAD(&page->list); + free_pages_bulk(page_zone(page), page, order); } #define MARK_USED(index, order, area) \ @@ -285,10 +354,10 @@ expand(struct zone *zone, struct page *p unsigned long size = 1 << high; while (high > low) { - BUG_ON(bad_range(zone, page)); area--; high--; size >>= 1; + area->globally_free++; list_add(&page->list, &area->free_list); MARK_USED(index, high, area); index += size; @@ -318,7 +387,7 @@ static inline void set_page_refs(struct */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapped(page) || + if (page->__mapping || page_mapped(page) || (page->flags & ( 1 << PG_private | 1 << PG_locked | @@ -326,6 +395,9 @@ static void prep_new_page(struct page *p 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_rmaplock | + 1 << PG_anon | + 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -340,7 +412,7 @@ static void prep_new_page(struct page *p * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held.
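buddy_alloc() and buddy_free() are renames of mainline's __rmqueue()/__free_pages_bulk() and keep its index arithmetic, which is worth seeing with numbers: with mask = ~0UL << order, -mask equals 1 << order, so XOR flips exactly the bit that distinguishes a block from its buddy. A standalone illustration (the encoding is an assumption carried over from the mainline code being renamed, not shown in this hunk):

#include <stdio.h>

int main(void)
{
	unsigned long order = 3;
	unsigned long mask = (~0UL) << order;
	unsigned long page_idx = 48;			/* aligned to 1 << order */
	unsigned long buddy_idx = page_idx ^ -mask;	/* flip bit "order": 56 */
	unsigned long merged_idx = page_idx & (mask << 1);

	printf("buddy of %lu at order %lu is %lu; merged block starts at %lu\n",
	       page_idx, order, buddy_idx, merged_idx);
	return 0;
}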
*/ -static struct page *__rmqueue(struct zone *zone, unsigned int order) +static struct page *buddy_alloc(struct zone *zone, unsigned int order) { struct free_area * area; unsigned int current_order; @@ -354,16 +426,144 @@ static struct page *__rmqueue(struct zon page = list_entry(area->free_list.next, struct page, list); list_del(&page->list); + area->globally_free--; index = page - zone->zone_mem_map; if (current_order != MAX_ORDER-1) MARK_USED(index, current_order, area); - zone->free_pages -= 1UL << order; return expand(zone, page, index, order, current_order, area); } return NULL; } +/* + * This is bad; some way to avoid putting singleton pages on the + * deferred lists should be worked out at some point. + */ +static void split_pages(struct zone *zone, struct page *page, int page_order, int deferred_order) +{ + int split_order = deferred_order - 1; + unsigned long split_offset = 1UL << split_order; + struct page *split_page; + + while (split_order >= page_order) { + split_page = &page[split_offset]; + if (split_order) + list_add(&split_page->list, + &zone->free_area[split_order].deferred_pages); + else if (!zone->free_area[split_order].locally_free) { + INIT_LIST_HEAD(&split_page->list); + split_page->private = 1; + list_add(&split_page->lru, + &zone->free_area[split_order].deferred_pages); + } else { + struct page *head; + head = list_entry(zone->free_area[split_order].deferred_pages.next, struct page, lru); + head->private++; + list_add(&split_page->list, &head->list); + } + zone->free_area[split_order].locally_free++; + --split_order; + split_offset >>= 1; + } +} + +#define COALESCE_BATCH 256 +static inline struct page *steal_deferred_page(struct zone *zone, int order) +{ + struct page *page; + struct list_head *elem; + struct free_area *area = zone->free_area; + int found_order, k; + + if (zone->free_pages < (1 << order)) + return NULL; + + /* the range of found_order precludes order 0 */ + for (found_order = order + 1; found_order < MAX_ORDER; ++found_order) + if (!list_empty(&area[found_order].deferred_pages)) { + elem = area[found_order].deferred_pages.next; + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + split_pages(zone, page, order, found_order); + return page; + } + + for (found_order = order - 1; found_order >= 0; --found_order) { + for (k = 0; k < COALESCE_BATCH; ++k) { + unsigned long mask = (~0UL) << found_order; + if (list_empty(&area[found_order].deferred_pages)) + break; + elem = area[found_order].deferred_pages.next; + if (found_order) { + page = list_entry(elem, struct page, list); + list_del(elem); + area[found_order].locally_free--; + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } else { + LIST_HEAD(tmp); + struct page *save; + + page = list_entry(elem, struct page, lru); + list_del(elem); + area[found_order].locally_free -= page->private; + page->private = 0; + list_splice(&page->list, &tmp); + list_add(&page->list, &tmp); + list_for_each_entry_safe(page, save, &tmp, list) { + list_del(&page->list); + buddy_free(page, zone->zone_mem_map, zone, &area[found_order], mask, found_order); + } + } + } + page = buddy_alloc(zone, order); + if (page) + return page; + } + return buddy_alloc(zone, order); +} + +static inline int __rmqueue(struct zone *zone, unsigned int order, struct list_head *list) +{ + struct free_area *area = &zone->free_area[order]; + struct page *page; + int count; + + if (!list_empty(&area->deferred_pages)) { + if (order) { + page = 
list_entry(area->deferred_pages.next, struct page, list); + list_del(&page->list); + count = 1; + } else { + page = list_entry(area->deferred_pages.next, struct page, lru); + list_del(&page->lru); + count = page->private; + page->private = 0; + list_splice(&page->list, list); + } + + area->locally_free -= count; + area->active += count; + zone->free_pages -= count << order; + } else { + page = buddy_alloc(zone, order); + if (page) + count = 1; + else { + page = steal_deferred_page(zone, order); + if (page) + count = 1; + else + return 0; + } + area->active += count; + zone->free_pages -= count << order; + } + list_add(&page->list, list); + return count; +} + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -373,17 +573,14 @@ static int rmqueue_bulk(struct zone *zon unsigned long count, struct list_head *list) { unsigned long flags; - int i; - int allocated = 0; - struct page *page; + int i, j, allocated = 0; spin_lock_irqsave(&zone->lock, flags); - for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + for (i = 0; i < count && allocated < count; ++i) { + j = __rmqueue(zone, order, list); + if (!j) break; - allocated++; - list_add_tail(&page->list, list); + allocated += j; } spin_unlock_irqrestore(&zone->lock, flags); return allocated; @@ -428,10 +625,14 @@ void drain_local_pages(void) pset = &zone->pageset[smp_processor_id()]; for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; + struct page *page, *save; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + list_for_each_entry_safe(page, save, &pcp->list, lru) { + list_del(&page->lru); + pcp->count -= page->private; + free_pages_bulk(zone, page, 0); + } } } local_irq_restore(flags); @@ -447,15 +648,28 @@ static void free_hot_cold_page(struct pa struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + struct page *head; kernel_map_pages(page, 1, 0); inc_page_state(pgfree); free_pages_check(__FUNCTION__, page); pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); - list_add(&page->list, &pcp->list); + while (pcp->count >= pcp->high) { + struct page *free = list_entry(pcp->list.prev, struct page, lru); + list_del(&free->lru); + pcp->count -= free->private; + free_pages_bulk(zone, free, 0); + } + head = list_entry(pcp->list.next, struct page, lru); + if (!list_empty(&pcp->list) && head->private < pcp->batch) { + list_add(&page->list, &head->list); + head->private++; + } else { + INIT_LIST_HEAD(&page->list); + list_add(&page->lru, &pcp->list); + page->private = 1; + } pcp->count++; local_irq_restore(flags); put_cpu(); @@ -480,31 +694,75 @@ void free_cold_page(struct page *page) static struct page *buffered_rmqueue(struct zone *zone, int order, int cold) { unsigned long flags; - struct page *page = NULL; + struct page *head, *page = NULL; + struct per_cpu_pages *pcp = NULL; if (order == 0) { - struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list); + if (pcp->count <= pcp->low) { + LIST_HEAD(tmp); + int k; + + k = rmqueue_bulk(zone, 0, pcp->batch, &tmp); + if (k) { + pcp->count += k; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = k; 
+ list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + } if (pcp->count) { - page = list_entry(pcp->list.next, struct page, list); - list_del(&page->list); + head = list_entry(pcp->list.next, struct page, lru); + if (head->private == 1) { + list_del(&head->lru); + page = head; + page->private = 0; + } else { + page = list_entry(head->list.next, struct page,list); + list_del(&page->list); + head->private--; + } pcp->count--; } local_irq_restore(flags); put_cpu(); } - if (page == NULL) { + if (unlikely(!page)) { + LIST_HEAD(tmp); + int count; + + if (!order) + pcp = &zone->pageset[get_cpu()].pcp[cold]; + spin_lock_irqsave(&zone->lock, flags); - page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + count = __rmqueue(zone, order, &tmp); + spin_unlock(&zone->lock); + + if (!list_empty(&tmp)) + page = list_entry(tmp.next, struct page, list); + + if (!order && count > 1) { + struct page *head; + + list_del(&page->list); + pcp->count += count - 1; + head = list_entry(tmp.next, struct page, list); + list_del_init(&head->list); + head->private = count - 1; + list_splice(&tmp, &head->list); + list_add(&head->lru, &pcp->list); + } + + local_irq_restore(flags); + if (order && page) prep_compound_page(page, order); + else if (!order) + put_cpu(); } if (page != NULL) { @@ -845,6 +1103,17 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif +unsigned long nr_deferred_pages(void) +{ + struct zone *zone; + unsigned long order, pages = 0; + + for_each_zone(zone) + for (order = 0; order < MAX_ORDER; ++order) + pages += zone->free_area[order].locally_free << order; + return pages; +} + /* * Accumulate the page_state information across all CPUs. * The result is unavoidably approximate - it can change @@ -1018,8 +1287,7 @@ void show_free_areas(void) } for_each_zone(zone) { - struct list_head *elem; - unsigned long nr, flags, order, total = 0; + unsigned long order, total = 0; show_node(zone); printk("%s: ", zone->name); @@ -1028,16 +1296,20 @@ void show_free_areas(void) continue; } - spin_lock_irqsave(&zone->lock, flags); + printk("buddy: "); for (order = 0; order < MAX_ORDER; order++) { - nr = 0; - list_for_each(elem, &zone->free_area[order].free_list) - ++nr; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + printk("%lu*%lukB ", zone->free_area[order].globally_free, K(1UL) << order); + total += zone->free_area[order].globally_free << order; } - spin_unlock_irqrestore(&zone->lock, flags); - printk("= %lukB\n", K(total)); + printk("\ndefer: "); + for (order = 0; order < MAX_ORDER; order++) { + printk("%lu*%lukB ", zone->free_area[order].locally_free, K(1UL) << order); + total += zone->free_area[order].locally_free << order; + } + printk("\nactive: "); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", zone->free_area[order].active, K(1UL) << order); + printk("\n= %lukB\n", K(total)); } show_swap_cache_info(); @@ -1145,9 +1417,13 @@ static inline unsigned long wait_table_s * on IO we've got bigger problems than wait queue collision. * Limit the size of the wait table to a reasonable size. */ - size = min(size, 4096UL); + size = min(size, 1UL << (16 + fls(NR_CPUS))); - return max(size, 4UL); + /* + * Internal fragmentation in the bootmem allocator makes anything + * smaller than this a waste anyway. 
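The wait_table_size() change above replaces the old hard cap of 4096 entries with a ceiling that scales with the CPU count, and raises the floor to one whole bootmem allocation unit. A standalone re-run of the arithmetic; NR_CPUS, PAGE_SIZE and sizeof(wait_queue_head_t) below are assumed sample values:

#include <stdio.h>

static int fls_sketch(unsigned long x)	/* stand-in for the kernel's fls() */
{
	int r = 0;
	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long nr_cpus = 16;		/* assumed NR_CPUS */
	unsigned long page_size = 4096;		/* assumed PAGE_SIZE */
	unsigned long wqh_size = 24;		/* assumed sizeof(wait_queue_head_t) */
	unsigned long size = 1UL << 20;		/* candidate size before clamping */

	unsigned long ceiling = 1UL << (16 + fls_sketch(nr_cpus));
	unsigned long floor = 1UL << fls_sketch(page_size / wqh_size);

	if (size > ceiling)
		size = ceiling;
	if (size < floor)
		size = floor;
	printf("ceiling %lu, floor %lu -> size %lu\n", ceiling, floor, size);
	return 0;
}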
+ */ + return max(size, 1UL << fls(PAGE_SIZE/sizeof(wait_queue_head_t))); } /* @@ -1274,7 +1550,7 @@ static void __init free_area_init_core(s batch = zone->present_pages / 1024; if (batch * PAGE_SIZE > 256 * 1024) batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ + batch *= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -1334,8 +1610,11 @@ static void __init free_area_init_core(s for (i = 0; ; i++) { unsigned long bitmap_size; - + INIT_LIST_HEAD(&zone->free_area[i].deferred_pages); INIT_LIST_HEAD(&zone->free_area[i].free_list); + zone->free_area[i].globally_free = 0; + zone->free_area[i].locally_free = 0; + zone->free_area[i].active = 0; if (i == MAX_ORDER-1) { zone->free_area[i].map = NULL; break; @@ -1443,24 +1722,22 @@ static int frag_show(struct seq_file *m, pg_data_t *pgdat = (pg_data_t *)arg; struct zone *zone; struct zone *node_zones = pgdat->node_zones; - unsigned long flags; int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { if (!zone->present_pages) continue; - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) { - unsigned long nr_bufs = 0; - struct list_head *elem; - - list_for_each(elem, &(zone->free_area[order].free_list)) - ++nr_bufs; - seq_printf(m, "%6lu ", nr_bufs); - } - spin_unlock_irqrestore(&zone->lock, flags); + seq_printf(m, "Node %d, zone %8s\n", pgdat->node_id, zone->name); + seq_puts(m, "buddy: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].globally_free); + seq_puts(m, "\ndefer: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].locally_free); + seq_puts(m, "\nactive: "); + for (order = 0; order < MAX_ORDER; ++order) + seq_printf(m, "%6lu ", zone->free_area[order].active); seq_putc(m, '\n'); } return 0; @@ -1479,6 +1756,7 @@ static char *vmstat_text[] = { "nr_unstable", "nr_page_table_pages", "nr_mapped", + "nr_swapcache", "nr_slab", "pgpgin", diff -prauN linux-2.6.0-test7/mm/page_io.c wli-2.6.0-test7-bk1-29/mm/page_io.c --- linux-2.6.0-test7/mm/page_io.c 2003-10-08 12:24:02.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/page_io.c 2003-10-09 19:42:26.000000000 -0700 @@ -16,8 +16,6 @@ #include #include #include -#include /* for block_sync_page() */ -#include #include #include @@ -32,7 +30,7 @@ get_swap_bio(int gfp_flags, struct page swp_entry_t entry; BUG_ON(!PageSwapCache(page)); - entry.val = page->index; + entry.val = page->private; sis = get_swap_info_struct(swp_type(entry)); bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * @@ -130,13 +128,6 @@ out: return ret; } -struct address_space_operations swap_aops = { - .writepage = swap_writepage, - .readpage = swap_readpage, - .sync_page = block_sync_page, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - /* * A scruffy utility function to read or write an arbitrary swap page * and wait on the I/O. 
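Throughout this series the identity of a swapcache page moves out of page->mapping/page->index and into the PG_swapcache flag plus page->private, as the rw_swap_page_sync() hunk below shows. A sketch of the round trip, using only the helpers visible in this patch:

/* sketch: where a swapcache page's swp_entry_t now lives */
static void stash_swap_entry(struct page *page, swp_entry_t entry)
{
	SetPageSwapCache(page);		/* replaces mapping == &swapper_space */
	page->private = entry.val;	/* replaces page->index */
}

static swp_entry_t fetch_swap_entry(struct page *page)
{
	swp_entry_t entry;
	entry.val = page->private;	/* valid only while PageSwapCache(page) */
	return entry;
}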
@@ -150,9 +141,8 @@ int rw_swap_page_sync(int rw, swp_entry_ lock_page(page); - BUG_ON(page->mapping); - page->mapping = &swapper_space; - page->index = entry.val; + SetPageSwapCache(page); + page->private = entry.val; if (rw == READ) { ret = swap_readpage(NULL, page); @@ -161,7 +151,7 @@ int rw_swap_page_sync(int rw, swp_entry_ ret = swap_writepage(page, &swap_wbc); wait_on_page_writeback(page); } - page->mapping = NULL; + ClearPageSwapCache(page); if (ret == 0 && (!PageUptodate(page) || PageError(page))) ret = -EIO; return ret; diff -prauN linux-2.6.0-test7/mm/readahead.c wli-2.6.0-test7-bk1-29/mm/readahead.c --- linux-2.6.0-test7/mm/readahead.c 2003-10-08 12:24:01.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/readahead.c 2003-10-09 19:36:12.000000000 -0700 @@ -229,7 +229,7 @@ __do_page_cache_readahead(struct address /* * Preallocate as many pages as we will need. */ - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); for (page_idx = 0; page_idx < nr_to_read; page_idx++) { unsigned long page_offset = offset + page_idx; @@ -240,16 +240,16 @@ __do_page_cache_readahead(struct address if (page) continue; - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); page = page_cache_alloc_cold(mapping); - spin_lock(&mapping->page_lock); + mapping_rdlock(&mapping->page_lock); if (!page) break; page->index = page_offset; list_add(&page->list, &page_pool); ret++; } - spin_unlock(&mapping->page_lock); + mapping_rdunlock(&mapping->page_lock); /* * Now start the IO. We ignore I/O errors - if the page is not diff -prauN linux-2.6.0-test7/mm/rmap.c wli-2.6.0-test7-bk1-29/mm/rmap.c --- linux-2.6.0-test7/mm/rmap.c 2003-10-08 12:24:46.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/rmap.c 2003-10-09 19:55:13.000000000 -0700 @@ -5,527 +5,634 @@ * Released under the General Public License (GPL). * * - * Simple, low overhead pte-based reverse mapping scheme. - * This is kept modular because we may want to experiment - * with object-based reverse mapping schemes. Please try - * to keep this thing as modular as possible. + * Simple, low overhead reverse mapping scheme. + * Please try to keep this thing as modular as possible. */ /* * Locking: - * - the page->pte.chain is protected by the PG_chainlock bit, + * - the page->rmap field is protected by the PG_rmaplock bit, * which nests within the the mm->page_table_lock, * which nests within the page lock. * - because swapout locking is opposite to the locking order * in the page fault path, the swapout path uses trylocks * on the mm->page_table_lock */ + #include #include #include #include #include #include -#include +#include #include #include - -#include -#include -#include +#include +#include #include /* #define DEBUG_RMAP */ /* - * Shared pages have a chain of pte_chain structures, used to locate - * all the mappings to this page. We only need a pointer to the pte - * here, the page struct for the page table page contains the process - * it belongs to and the offset within that process. - * - * We use an array of pte pointers in this structure to minimise cache misses - * while traversing reverse maps. 
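For scale, the pte_chain blocks deleted below were sized to a single cacheline. A standalone check of the old NRPTE formula under assumed 32-bit parameters (64-byte L1 lines, 4-byte pte_addr_t):

#include <stdio.h>

int main(void)
{
	unsigned long l1_cache_bytes = 64;	/* assumed L1_CACHE_BYTES */
	unsigned long next_and_idx_size = 4;	/* sizeof(unsigned long), 32-bit */
	unsigned long pte_addr_size = 4;	/* assumed sizeof(pte_addr_t) */

	/* NRPTE == ((L1_CACHE_BYTES - sizeof(unsigned long)) / sizeof(pte_addr_t)) */
	printf("NRPTE = %lu pte slots per chain block\n",
	       (l1_cache_bytes - next_and_idx_size) / pte_addr_size);
	return 0;
}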
+ * struct addresser: for next_rmap_address to dole out user addresses + * one by one to page_referenced() or try_to_unmap() */ -#define NRPTE ((L1_CACHE_BYTES - sizeof(unsigned long))/sizeof(pte_addr_t)) +struct addresser { + unsigned long address, count; + struct rmap_chain *chain; + int index; +}; -/* - * next_and_idx encodes both the address of the next pte_chain and the - * offset of the highest-index used pte in ptes[]. - */ -struct pte_chain { - unsigned long next_and_idx; - pte_addr_t ptes[NRPTE]; -} ____cacheline_aligned; +static kmem_cache_t *rmap_chain_cache; + +static DEFINE_PER_CPU(struct rmap_chain *, rmap_chain) = NULL; -kmem_cache_t *pte_chain_cache; +kmem_cache_t *anon_cache; -static inline struct pte_chain *pte_chain_next(struct pte_chain *pte_chain) +static void anon_ctor(void *arg, kmem_cache_t *cache, unsigned long unused) { - return (struct pte_chain *)(pte_chain->next_and_idx & ~NRPTE); + struct anon *anon = (struct anon *)arg; + atomic_set(&anon->count, 1); + anon->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&anon->list); + INIT_RCU_HEAD(&anon->rcu); } -static inline struct pte_chain *pte_chain_ptr(unsigned long pte_chain_addr) +static void rmap_chain_ctor(void *arg, kmem_cache_t *cache, unsigned long flags) { - return (struct pte_chain *)(pte_chain_addr & ~NRPTE); + int i; + struct rmap_chain *chain = (struct rmap_chain *)arg; + + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; } -static inline int pte_chain_idx(struct pte_chain *pte_chain) +static inline void rmap_chain_dtor(struct rmap_chain *chain) { - return pte_chain->next_and_idx & NRPTE; + int i; + for (i = 0; i < NRSLOT; ++i) + if (chain->slot[i] != NOADDR) + chain->slot[i] = NOADDR; + if (chain->next) + chain->next = NULL; } -static inline unsigned long -pte_chain_encode(struct pte_chain *pte_chain, int idx) +void __init init_rmap(void) { - return (unsigned long)pte_chain | idx; + anon_cache = kmem_cache_create("anon", sizeof(struct anon), 0, 0, anon_ctor, NULL); + if (!anon_cache) + panic("init_rmap: Cannot alloc anon slab cache\n"); + rmap_chain_cache = kmem_cache_create("rmap_chain", sizeof(struct rmap_chain), 0, 0, rmap_chain_ctor, NULL); } -/* - * pte_chain list management policy: - * - * - If a page has a pte_chain list then it is shared by at least two processes, - * because a single sharing uses PageDirect. (Well, this isn't true yet, - * coz this code doesn't collapse singletons back to PageDirect on the remove - * path). - * - A pte_chain list has free space only in the head member - all succeeding - * members are 100% full. - * - If the head element has free space, it occurs in its leading slots. - * - All free space in the pte_chain is at the start of the head member. - * - Insertion into the pte_chain puts a pte pointer in the last free slot of - * the head member. - * - Removal from a pte chain moves the head pte of the head member onto the - * victim pte and frees the head member if it became empty. 
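exec_rmap(), dup_rmap() and exit_rmap() below manage the struct anon that replaces per-page pte chains for anonymous memory: one refcounted, RCU-freed list of all the mm's sharing the region. A sketch of the lifecycle they imply; example_dup_mm is a hypothetical stand-in for the fork-time mm copy, since the real call sites live in the exec/fork/exit paths elsewhere in this series:

static struct mm_struct *example_dup_mm(struct mm_struct *mm);	/* hypothetical */

/* sketch: one anon travels from exec through fork to the last exit */
static int anon_lifecycle_sketch(struct mm_struct *parent)
{
	struct mm_struct *child;

	if (exec_rmap(parent))			/* exec: fresh anon, count 1 */
		return -ENOMEM;

	child = example_dup_mm(parent);
	if (child) {
		dup_rmap(child, parent);	/* fork: share anon, take a ref */
		exit_rmap(child);		/* child exit: unlink, drop ref */
	}
	exit_rmap(parent);			/* last ref: anon freed via RCU */
	return 0;
}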
- */ +int exec_rmap(struct mm_struct *mm) +{ + struct anon *anon = kmem_cache_alloc(anon_cache, GFP_KERNEL); + if (!anon) + return -ENOMEM; + mm->anon = anon; + /* unique reference; no locking required */ + list_add_rcu(&mm->anon_list, &anon->list); + return 0; +} -/** - ** VM stuff below this comment - **/ +void dup_rmap(struct mm_struct *new, struct mm_struct *old) +{ + struct anon *anon = old->anon; + atomic_inc(&anon->count); + new->anon = anon; + spin_lock(&anon->lock); + list_add_tail_rcu(&new->anon_list, &anon->list); + spin_unlock(&anon->lock); +} -/** - * page_referenced - test if the page was referenced - * @page: the page to test - * - * Quick test_and_clear_referenced for all mappings to a page, - * returns the number of processes which referenced the page. - * Caller needs to hold the pte_chain_lock. - * - * If the page has a single-entry pte_chain, collapse that back to a PageDirect - * representation. This way, it's only done under memory pressure. - */ -int page_referenced(struct page * page) +static void free_anon(void *__anon) { - struct pte_chain *pc; - int referenced = 0; + struct anon *anon = (struct anon *)__anon; + INIT_LIST_HEAD(&anon->list); + atomic_set(&anon->count, 1); + kmem_cache_free(anon_cache, anon); +} - if (TestClearPageReferenced(page)) - referenced++; +void exit_rmap(struct mm_struct *mm) +{ + struct anon *anon = mm->anon; - if (PageDirect(page)) { - pte_t *pte = rmap_ptep_map(page->pte.direct); - if (ptep_test_and_clear_young(pte)) - referenced++; - rmap_ptep_unmap(pte); - } else { - int nr_chains = 0; + mm->anon = NULL; + spin_lock(&anon->lock); + list_del_rcu(&mm->anon_list); + spin_unlock(&anon->lock); + + if (!atomic_dec_and_test(&anon->count)) + return; + + call_rcu(&anon->rcu, free_anon, anon); +} + +/** + ** Functions for manipulating struct rmap_chain. + **/ - /* Check all the page tables mapping this page. */ - for (pc = page->pte.chain; pc; pc = pte_chain_next(pc)) { - int i; - - for (i = NRPTE-1; i >= 0; i--) { - pte_addr_t pte_paddr = pc->ptes[i]; - pte_t *p; - - if (!pte_paddr) - break; - p = rmap_ptep_map(pte_paddr); - if (ptep_test_and_clear_young(p)) - referenced++; - rmap_ptep_unmap(p); - nr_chains++; +/* + * Boolean rmap_get_cpu() ensures the cpu has an rmap_chain cached + * in case it is needed later while lock is held. It is never needed + * when page_add_rmap() is adding a freshly allocated anon page. + * caller does put_cpu() once ->page_table_lock prevents preemption. 
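rmap_get_cpu() below implements a reserve-then-lock pattern: refill the per-cpu rmap_chain cache while sleeping is still legal, then let the page_table_lock take over from the get_cpu() pin. The mm/mremap.c hunk earlier in this patch (move_one_page) is the canonical caller; its shape, as a sketch:

/* sketch: reserving a per-cpu rmap_chain before taking the lock */
static int rmap_reserve_sketch(struct mm_struct *mm)
{
	if (!rmap_get_cpu())		/* may sleep; cpu stays pinned on success */
		return -ENOMEM;
	spin_lock(&mm->page_table_lock);
	put_cpu();			/* the lock now prevents preemption */
	/* ... page table work that may consume the cached rmap_chain ... */
	spin_unlock(&mm->page_table_lock);
	return 0;
}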
+ */ +int rmap_get_cpu(void) +{ + struct rmap_chain **cache, *chain; + might_sleep(); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + return 1; + put_cpu(); + chain = kmem_cache_alloc(rmap_chain_cache, GFP_KERNEL); + cache = &per_cpu(rmap_chain, get_cpu()); + if (*cache) + kmem_cache_free(rmap_chain_cache, chain); + else if (chain) + *cache = chain; + else { + put_cpu(); + return 0; + } + return 1; +} + +static struct rmap_chain *get_rmap_chain(void) +{ + struct rmap_chain **cache, *chain; + int i; + + /* + * ->page_table_lock and rmap_lock are held, no need to get_cpu() + */ + cache = &per_cpu(rmap_chain, smp_processor_id()); + chain = *cache; + *cache = NULL; + for (i = 0; i < NRSLOT; ++i) + chain->slot[i] = NOADDR; + chain->next = NULL; + return chain; +} + +void add_rmap_address(struct page *page, unsigned long address) +{ + struct rmap_chain *chain = page->chain; + int i = 0; + + if (!chain) + page->chain = get_rmap_chain(); + else { + /* + * Check lest duplicates arise, and find a free slot at the end + */ + for (chain = page->chain; ; chain = chain->next) { + for (i = 0; i < NRSLOT; ++i) { + if (chain->slot[i] == NOADDR) + goto set; + else if (chain->slot[i] == address) + return; } + if (!chain->next) + chain->next = get_rmap_chain(); } - if (nr_chains == 1) { - pc = page->pte.chain; - page->pte.direct = pc->ptes[NRPTE-1]; - SetPageDirect(page); - pc->ptes[NRPTE-1] = 0; - __pte_chain_free(pc); + } +set: + chain->slot[i] = address; +} + +static int +next_rmap_address(struct page *page, struct vm_area_struct *vma, + struct addresser *addresser) +{ + /* bootstrap it */ + if (addresser->address == NOADDR) { + /* set chain and index for next call */ + addresser->chain = page->chain; + addresser->index = 0; + if (vma) { + addresser->address = vma_address(page, vma); + if (addresser->address != NOADDR) + return 1; + } else { + addresser->address = page->index; + return 1; } } - return referenced; + while (addresser->chain) { + if (addresser->index >= NRSLOT) + addresser->index = 0; + addresser->address = + addresser->chain->slot[addresser->index]; + if (addresser->address == NOADDR) + break; + addresser->index++; + if (addresser->index >= NRSLOT) + addresser->chain = addresser->chain->next; + if (!vma || addresser->address != vma_address(page, vma)) + return 1; + } + return 0; } -/** - * page_add_rmap - add reverse mapping entry to a page - * @page: the page to add the mapping to - * @ptep: the page table entry mapping this page - * - * Add a new pte reverse mapping to a page. - * The caller needs to hold the mm->page_table_lock. - */ -struct pte_chain * -page_add_rmap(struct page *page, pte_t *ptep, struct pte_chain *pte_chain) +void clear_page_chained(struct page *page) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *cur_pte_chain; + struct rmap_chain *chain = page->chain; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return pte_chain; + /* + * This is only called when mapcount goes to 0, which + * means it's possible for a page to accumulate a large + * chain of stale addresses. But normally try_to_unmap_one() + * will bring the count to 0 and free them all here. 
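Overflow addresses for a page are kept in rmap_chain blocks of NRSLOT slots each, filled front to back, NOADDR-terminated, and linked through ->next; add_rmap_address() above appends with duplicate suppression. A sketch of the matching membership test under those conventions:

/* sketch: does this page's chain already record the given address? */
static int chain_holds_address_sketch(struct page *page, unsigned long address)
{
	struct rmap_chain *chain;
	int i;

	for (chain = page->chain; chain; chain = chain->next)
		for (i = 0; i < NRSLOT; i++) {
			if (chain->slot[i] == NOADDR)
				return 0;	/* slots fill front to back */
			if (chain->slot[i] == address)
				return 1;
		}
	return 0;
}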
+ */ + do { + struct rmap_chain *next = chain->next; + rmap_chain_dtor(chain); + kmem_cache_free(rmap_chain_cache, chain); + chain = next; + } while (chain); +} - pte_chain_lock(page); +/** + ** Subfunctions of page_referenced(): page_referenced_one() called + ** repeatedly from page_referenced_obj(); + **/ - if (page->pte.direct == 0) { - page->pte.direct = pte_paddr; - SetPageDirect(page); - inc_page_state(nr_mapped); - goto out; - } +static inline int page_referenced_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + int referenced = 0; - if (PageDirect(page)) { - /* Convert a direct pointer into a pte_chain */ - ClearPageDirect(page); - pte_chain->ptes[NRPTE-1] = page->pte.direct; - pte_chain->ptes[NRPTE-2] = pte_paddr; - pte_chain->next_and_idx = pte_chain_encode(NULL, NRPTE-2); - page->pte.direct = 0; - page->pte.chain = pte_chain; - pte_chain = NULL; /* We consumed it */ + if (!spin_trylock(&mm->page_table_lock)) { + referenced = 1; goto out; } - cur_pte_chain = page->pte.chain; - if (cur_pte_chain->ptes[0]) { /* It's full */ - pte_chain->next_and_idx = pte_chain_encode(cur_pte_chain, - NRPTE - 1); - page->pte.chain = pte_chain; - pte_chain->ptes[NRPTE-1] = pte_paddr; - pte_chain = NULL; /* We consumed it */ + pgd = pgd_offset(mm, addresser->address); + if (!pgd_present(*pgd)) + goto out_unlock; + + pmd = pmd_offset_map(pgd, addresser->address); + if (!pmd) goto out; - } - cur_pte_chain->ptes[pte_chain_idx(cur_pte_chain) - 1] = pte_paddr; - cur_pte_chain->next_and_idx--; + + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + + pte = pte_offset_map(pmd, addresser->address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + referenced = ptep_test_and_clear_young(pte); + addresser->count--; + +out_unmap_pmd: + pmd_unmap(pmd); +out_unmap_pte: + pte_unmap(pte); +out_unlock: + spin_unlock(&mm->page_table_lock); out: - pte_chain_unlock(page); - return pte_chain; + return referenced; } -/** - * page_remove_rmap - take down reverse mapping to a page - * @page: page to remove mapping from - * @ptep: page table entry to remove - * - * Removes the reverse mapping from the pte_chain of the page, - * after that the caller can clear the page table entry and free - * the page. - * Caller needs to hold the mm->page_table_lock. - */ -void page_remove_rmap(struct page *page, pte_t *ptep) +static inline int +page_referenced_anon(struct page *page, struct addresser *addresser) { - pte_addr_t pte_paddr = ptep_to_paddr(ptep); - struct pte_chain *pc; + struct mm_struct *mm; + struct anon *anon; + int referenced = 0; - if (!pfn_valid(page_to_pfn(page)) || PageReserved(page)) - return; + rcu_read_lock(); /* anon->lock */ - pte_chain_lock(page); + anon = page_anon(page); + if (!anon) + goto out; - if (!page_mapped(page)) - goto out_unlock; /* remap_page_range() from a driver? 
*/ + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon || !mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + referenced += page_referenced_one(page, mm, addresser); + if (!addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* anon->lock */ + return referenced; +} - if (PageDirect(page)) { - if (page->pte.direct == pte_paddr) { - page->pte.direct = 0; - ClearPageDirect(page); - goto out; +static inline int page_referenced_obj(struct page *page, struct addresser *addresser) +{ + struct address_space *mapping = page_mapping(page); + struct vm_area_struct *vma; + int referenced = 0; + + /* bail if it's a Morton page */ + if (!mapping) + return 0; + + rcu_read_lock(); /* mapping->i_shared_lock */ + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) + goto out; } - } else { - struct pte_chain *start = page->pte.chain; - struct pte_chain *next; - int victim_i = -1; - - for (pc = start; pc; pc = next) { - int i; - - next = pte_chain_next(pc); - if (next) - prefetch(next); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pa = pc->ptes[i]; - - if (victim_i == -1) - victim_i = i; - if (pa != pte_paddr) - continue; - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - if (victim_i == NRPTE-1) { - /* Emptied a pte_chain */ - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - } else { - start->next_and_idx++; - } + } + + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + referenced += page_referenced_one(page, vma->vm_mm, addresser); + if (!addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); -out_unlock: - pte_chain_unlock(page); - return; + rcu_read_unlock(); /* mapping->i_shared_lock */ + return referenced; } /** - * try_to_unmap_one - worker function for try_to_unmap - * @page: page to unmap - * @ptep: page table entry to unmap from page + * page_referenced - test if the page was referenced + * @page: the page to test * - * Internal helper function for try_to_unmap, called for each page - * table entry mapping a page. Because locking order here is opposite - * to the locking order used by the page fault path, we use trylocks. - * Locking: - * page lock shrink_list(), trylock - * pte_chain_lock shrink_list() - * mm->page_table_lock try_to_unmap_one(), trylock + * returns the number of ptes which referenced the page. + * Caller needs to hold the rmap_lock. 
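Per the kerneldoc above, page_referenced() is called with the page's rmap_lock held, the PG_rmaplock bit lock that replaces PG_chainlock. A sketch of the reclaim-side idiom, written against the locking rules stated in this patch rather than any particular vmscan.c hunk:

/* sketch: aging test as the pageout path would issue it */
static int page_was_referenced_sketch(struct page *page)
{
	int referenced;

	rmap_lock(page);
	referenced = page_referenced(page);
	rmap_unlock(page);
	return referenced != 0;
}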
*/ -static int FASTCALL(try_to_unmap_one(struct page *, pte_addr_t)); -static int try_to_unmap_one(struct page * page, pte_addr_t paddr) +int page_referenced(struct page * page) { - pte_t *ptep = rmap_ptep_map(paddr); - unsigned long address = ptep_to_address(ptep); - struct mm_struct * mm = ptep_to_mm(ptep); - struct vm_area_struct * vma; - pte_t pte; - int ret; + int referenced = !!TestClearPageReferenced(page); + struct addresser addresser; + + addresser.count = atomic_read(&page->mapcount); + if (!addresser.count || !page->__mapping) + return 0; + else if (PageAnon(page)) + referenced += page_referenced_anon(page, &addresser); + else + referenced += page_referenced_obj(page, &addresser); + return referenced; +} + +void page_turn_rmap(struct page *page, struct vm_area_struct *vma) +{ + struct anon *old, *new; + old = page_anon(page); + new = vma->vm_mm->anon; + + BUG_ON(!PageAnon(page)); + BUG_ON(atomic_read(&page->mapcount) != 1); + + if (old == new) + return; + + rmap_lock(page); + set_page_mapping(page, new); + rmap_unlock(page); +} + +void page_move_rmap(struct page *page, struct vm_area_struct *vma, + unsigned long old, unsigned long new) +{ + if (!page_mapped(page) || !page->__mapping) + return; + + rmap_lock(page); + + if (PageAnon(page)) { + /* + * Don't check atomic_read(&page->mapcount) == 1 here + * because the mapcount could be 1 but the page + * could still have a chain, and our new address + * in that chain. + */ + if (atomic_read(&page->mapcount) == 1) + page->index = new; + else if (new != page->index) + add_rmap_address(page, new); + } else { + /* + * Just in case things are nonlinear. + */ + if (old != vma_address(page, vma)) + add_rmap_address(page, new); + } - if (!mm) - BUG(); + rmap_unlock(page); +} + +static int try_to_unmap_one(struct page *page, struct mm_struct *mm, + struct addresser *addresser, struct vm_area_struct *vma) +{ + pgd_t *pgd; + pmd_t *pmd; + pte_t *pte; + pte_t pteval; + unsigned long address = addresser->address; + int ret = SWAP_AGAIN; /* * We need the page_table_lock to protect us from page faults, * munmap, fork, etc... */ - if (!spin_trylock(&mm->page_table_lock)) { - rmap_ptep_unmap(ptep); - return SWAP_AGAIN; - } - + if (!spin_trylock(&mm->page_table_lock)) + goto out; - /* During mremap, it's possible pages are not in a VMA. */ - vma = find_vma(mm, address); - if (!vma) { + /* If the page is mlock()'d, we can't unmap it. */ + if (!vma) + vma = find_vma(mm, address); + if (!vma || (vma->vm_flags & VM_LOCKED)) { ret = SWAP_FAIL; goto out_unlock; } - /* The page is mlock()d, we cannot swap it out. */ - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_FAIL; + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) goto out_unlock; - } + pmd = pmd_offset_map(pgd, address); + if (!pmd_present(*pmd)) + goto out_unmap_pmd; + pte = pte_offset_map(pmd, address); + if (!pte_present(*pte)) + goto out_unmap_pte; + + if (page_to_pfn(page) != pte_pfn(*pte)) + goto out_unmap_pte; + + addresser->count--; /* Nuke the page table entry. */ flush_cache_page(vma, address); - pte = ptep_get_and_clear(ptep); + pteval = vm_ptep_get_and_clear(vma, pte, address); flush_tlb_page(vma, address); - if (PageSwapCache(page)) { + if (PageAnon(page)) { /* * Store the swap location in the pte. * See handle_pte_fault() ... 
*/ - swp_entry_t entry = { .val = page->index }; + swp_entry_t entry = { .val = page->private }; + BUG_ON(!PageSwapCache(page)); swap_duplicate(entry); - set_pte(ptep, swp_entry_to_pte(entry)); - BUG_ON(pte_file(*ptep)); + vm_set_pte(vma, pte, swp_entry_to_pte(entry), address); + BUG_ON(pte_file(*pte)); } else { - unsigned long pgidx; /* - * If a nonlinear mapping then store the file page offset - * in the pte. + * If a nonlinear mapping from sys_remap_file_pages(), + * then store the file page offset in the pte. */ - pgidx = (address - vma->vm_start) >> PAGE_SHIFT; - pgidx += vma->vm_pgoff; - pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT; - if (page->index != pgidx) { - set_pte(ptep, pgoff_to_pte(page->index)); - BUG_ON(!pte_file(*ptep)); + if (address != vma_address(page, vma)) { + vm_set_pte(vma, pte, pgoff_to_pte(page->index), address); + BUG_ON(!pte_file(*pte)); } } /* Move the dirty bit to the physical page now the pte is gone. */ - if (pte_dirty(pte)) + if (pte_dirty(pteval)) set_page_dirty(page); - mm->rss--; + BUG_ON(!atomic_read(&page->mapcount)); + if (atomic_dec_and_test(&page->mapcount)) + if (page->chain) + clear_page_chained(page); page_cache_release(page); - ret = SWAP_SUCCESS; + mm->rss--; +out_unmap_pmd: + pmd_unmap(pmd); +out_unmap_pte: + pte_unmap(pte); out_unlock: - rmap_ptep_unmap(ptep); spin_unlock(&mm->page_table_lock); +out: return ret; } -/** - * try_to_unmap - try to remove all page table mappings to a page - * @page: the page to get unmapped - * - * Tries to remove all the page table entries which are mapping this - * page, used in the pageout path. Caller must hold the page lock - * and its pte chain lock. Return values are: - * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a trylock, try again later - * SWAP_FAIL - the page is unswappable - */ -int try_to_unmap(struct page * page) +static inline int try_to_unmap_anon(struct page *page, struct addresser *addresser) { - struct pte_chain *pc, *next_pc, *start; - int ret = SWAP_SUCCESS; - int victim_i = -1; - - /* This page should not be on the pageout lists. */ - if (PageReserved(page)) - BUG(); - if (!PageLocked(page)) - BUG(); - /* We need backing store to swap out a page. */ - if (!page->mapping) - BUG(); - - if (PageDirect(page)) { - ret = try_to_unmap_one(page, page->pte.direct); - if (ret == SWAP_SUCCESS) { - page->pte.direct = 0; - ClearPageDirect(page); - } + struct mm_struct *mm; + struct anon *anon; + int ret = SWAP_AGAIN; + + rcu_read_lock(); /* anon->lock */ + + anon = page_anon(page); + if (!anon) goto out; - } - start = page->pte.chain; - for (pc = start; pc; pc = next_pc) { - int i; - - next_pc = pte_chain_next(pc); - if (next_pc) - prefetch(next_pc); - for (i = pte_chain_idx(pc); i < NRPTE; i++) { - pte_addr_t pte_paddr = pc->ptes[i]; - - if (!pte_paddr) - continue; - if (victim_i == -1) - victim_i = i; - - switch (try_to_unmap_one(page, pte_paddr)) { - case SWAP_SUCCESS: - /* - * Release a slot. If we're releasing the - * first pte in the first pte_chain then - * pc->ptes[i] and start->ptes[victim_i] both - * refer to the same thing. It works out. - */ - pc->ptes[i] = start->ptes[victim_i]; - start->ptes[victim_i] = 0; - victim_i++; - if (victim_i == NRPTE) { - page->pte.chain = pte_chain_next(start); - __pte_chain_free(start); - start = page->pte.chain; - victim_i = 0; - } else { - start->next_and_idx++; - } - break; - case SWAP_AGAIN: - /* Skip this pte, remembering status. 
*/ - ret = SWAP_AGAIN; - continue; - case SWAP_FAIL: - ret = SWAP_FAIL; + list_for_each_entry_rcu(mm, &anon->list, anon_list) { + if (!mm->anon) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, NULL, addresser)) { + ret = try_to_unmap_one(page, mm, addresser, NULL); + if (ret == SWAP_FAIL || !addresser->count) goto out; - } } } out: - if (!page_mapped(page)) - dec_page_state(nr_mapped); + rcu_read_unlock(); /* anon->lock */ return ret; } -/** - ** No more VM stuff below this comment, only pte_chain helper - ** functions. - **/ - -static void pte_chain_ctor(void *p, kmem_cache_t *cachep, unsigned long flags) +static inline int try_to_unmap_obj(struct page *page, struct addresser *addresser) { - struct pte_chain *pc = p; + struct address_space *mapping; + struct vm_area_struct *vma; + int ret = SWAP_AGAIN; + + mapping = page_mapping(page); + + /* bail if it's a Morton page */ + if (!mapping) + return SWAP_FAIL; + + rcu_read_lock(); /* mapping->i_shared_lock */ + + list_for_each_entry_rcu(vma, &mapping->i_mmap, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } - memset(pc, 0, sizeof(*pc)); + list_for_each_entry_rcu(vma, &mapping->i_mmap_shared, shared) { + if (vma->vm_flags & VM_DEAD) + continue; + if (!vma->vm_mm->rss) + continue; + addresser->address = NOADDR; + while (next_rmap_address(page, vma, addresser)) { + ret = try_to_unmap_one(page, vma->vm_mm, addresser, vma); + if (ret == SWAP_FAIL || !addresser->count) + goto out; + } + } +out: + rcu_read_unlock(); /* mapping->i_shared_lock */ + return ret; } -DEFINE_PER_CPU(struct pte_chain *, local_pte_chain) = 0; - /** - * __pte_chain_free - free pte_chain structure - * @pte_chain: pte_chain struct to free - */ -void __pte_chain_free(struct pte_chain *pte_chain) -{ - struct pte_chain **pte_chainp; - - pte_chainp = &get_cpu_var(local_pte_chain); - if (pte_chain->next_and_idx) - pte_chain->next_and_idx = 0; - if (*pte_chainp) - kmem_cache_free(pte_chain_cache, *pte_chainp); - *pte_chainp = pte_chain; - put_cpu_var(local_pte_chain); -} - -/* - * pte_chain_alloc(): allocate a pte_chain structure for use by page_add_rmap(). + * try_to_unmap - try to remove all page table mappings to a page + * @page: the page to get unmapped + * + * Tries to remove all the page table entries which are mapping this + * page, used in the pageout path. Caller must hold the page lock + * and its pte chain lock. Return values are: * - * The caller of page_add_rmap() must perform the allocation because - * page_add_rmap() is invariably called under spinlock. Often, page_add_rmap() - * will not actually use the pte_chain, because there is space available in one - * of the existing pte_chains which are attached to the page. So the case of - * allocating and then freeing a single pte_chain is specially optimised here, - * with a one-deep per-cpu cache. 
+ * SWAP_SUCCESS - we succeeded in removing all mappings + * SWAP_AGAIN - we missed a trylock, try again later + * SWAP_FAIL - the page is unswappable */ -struct pte_chain *pte_chain_alloc(int gfp_flags) +int try_to_unmap(struct page *page) { - struct pte_chain *ret; - struct pte_chain **pte_chainp; - - might_sleep_if(gfp_flags & __GFP_WAIT); + struct addresser addresser; + int ret; - pte_chainp = &get_cpu_var(local_pte_chain); - if (*pte_chainp) { - ret = *pte_chainp; - *pte_chainp = NULL; - put_cpu_var(local_pte_chain); - } else { - put_cpu_var(local_pte_chain); - ret = kmem_cache_alloc(pte_chain_cache, gfp_flags); + BUG_ON(PageReserved(page)); + BUG_ON(!PageLocked(page)); + BUG_ON(!page_mapped(page)); + + addresser.count = atomic_read(&page->mapcount); + if (PageAnon(page)) + ret = try_to_unmap_anon(page, &addresser); + else + ret = try_to_unmap_obj(page, &addresser); + if (!page_mapped(page)) { + dec_page_state(nr_mapped); + if (PageAnon(page)) + clear_page_anon(page); + ret = SWAP_SUCCESS; } return ret; } - -void __init pte_chain_init(void) -{ - pte_chain_cache = kmem_cache_create( "pte_chain", - sizeof(struct pte_chain), - 0, - SLAB_MUST_HWCACHE_ALIGN, - pte_chain_ctor, - NULL); - - if (!pte_chain_cache) - panic("failed to create pte_chain cache!\n"); -} diff -prauN linux-2.6.0-test7/mm/shmem.c wli-2.6.0-test7-bk1-29/mm/shmem.c --- linux-2.6.0-test7/mm/shmem.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/shmem.c 2003-10-09 19:42:26.000000000 -0700 @@ -694,7 +694,7 @@ static int shmem_writepage(struct page * BUG_ON(!PageLocked(page)); BUG_ON(page_mapped(page)); - mapping = page->mapping; + mapping = page_mapping(page); index = page->index; inode = mapping->host; info = SHMEM_I(inode); @@ -1123,7 +1123,7 @@ static struct inode_operations shmem_sym static int shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_mapping(page)->host; return shmem_getpage(inode, page->index, &page, SGP_WRITE); } @@ -1779,7 +1779,7 @@ static void destroy_inodecache(void) static struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_nobuffers, + .set_page_dirty = set_page_dirty_nobuffers, #ifdef CONFIG_TMPFS .prepare_write = shmem_prepare_write, .commit_write = simple_commit_write, diff -prauN linux-2.6.0-test7/mm/slab.c wli-2.6.0-test7-bk1-29/mm/slab.c --- linux-2.6.0-test7/mm/slab.c 2003-10-08 12:24:27.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/slab.c 2003-10-09 19:28:46.000000000 -0700 @@ -2781,7 +2781,7 @@ void ptrinfo(unsigned long addr) printk("No pgd.\n"); break; } - pmd = pmd_offset(pgd, addr); + pmd = pmd_offset_kernel(pgd, addr); if (pmd_none(*pmd)) { printk("No pmd.\n"); break; diff -prauN linux-2.6.0-test7/mm/swap_state.c wli-2.6.0-test7-bk1-29/mm/swap_state.c --- linux-2.6.0-test7/mm/swap_state.c 2003-10-08 12:24:04.000000000 -0700 +++ wli-2.6.0-test7-bk1-29/mm/swap_state.c 2003-10-09 19:57:40.000000000 -0700 @@ -21,23 +21,16 @@ static struct backing_dev_info swap_back .memory_backed = 1, /* Does not contribute to dirty memory */ }; -extern struct address_space_operations swap_aops; +static struct address_space_operations swap_aops = { + .writepage = swap_writepage, + .readpage = swap_readpage, +}; struct address_space swapper_space = { .page_tree = RADIX_TREE_INIT(GFP_ATOMIC), - .page_lock = SPIN_LOCK_UNLOCKED, - .clean_pages = LIST_HEAD_INIT(swapper_space.clean_pages), - .dirty_pages = 
diff -prauN linux-2.6.0-test7/mm/swap_state.c wli-2.6.0-test7-bk1-29/mm/swap_state.c
--- linux-2.6.0-test7/mm/swap_state.c	2003-10-08 12:24:04.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/mm/swap_state.c	2003-10-09 19:57:40.000000000 -0700
@@ -21,23 +21,16 @@ static struct backing_dev_info swap_back
 	.memory_backed	= 1,	/* Does not contribute to dirty memory */
 };
 
-extern struct address_space_operations swap_aops;
+static struct address_space_operations swap_aops = {
+	.writepage	= swap_writepage,
+	.readpage	= swap_readpage,
+};
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
-	.page_lock	= SPIN_LOCK_UNLOCKED,
-	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
-	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
-	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
-	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
+	.page_lock	= MAPPING_RW_LOCK_UNLOCKED,
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
-	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
-	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
-	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
-	.truncate_count	= ATOMIC_INIT(0),
-	.private_lock	= SPIN_LOCK_UNLOCKED,
-	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
@@ -59,30 +52,50 @@ void show_swap_cache_info(void)
 		swap_cache_info.noent_race, swap_cache_info.exist_race);
 }
 
+static int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+{
+	int error;
+
+	BUG_ON(PageSwapCache(page));
+	BUG_ON(PagePrivate(page));
+	error = radix_tree_preload(GFP_ATOMIC);
+	if (error)
+		return error;
+
+	page_cache_get(page);
+	mapping_wrlock(&swapper_space.page_lock);
+	error = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
+	if (error)
+		page_cache_release(page);
+	else {
+		SetPageLocked(page);
+		SetPageSwapCache(page);
+		page->private = entry.val;
+		inc_page_state(nr_swapcache);
+	}
+	mapping_wrunlock(&swapper_space.page_lock);
+	radix_tree_preload_end();
+	return error;
+}
+
 static int add_to_swap_cache(struct page *page, swp_entry_t entry)
 {
 	int error;
 
-	if (page->mapping)
-		BUG();
 	if (!swap_duplicate(entry)) {
 		INC_CACHE_INFO(noent_race);
 		return -ENOENT;
 	}
-	error = add_to_page_cache(page, &swapper_space, entry.val, GFP_KERNEL);
+	error = __add_to_swap_cache(page, entry);
 	/*
 	 * Anon pages are already on the LRU, we don't run lru_cache_add here.
 	 */
-	if (error != 0) {
+	if (error) {
 		swap_free(entry);
 		if (error == -EEXIST)
 			INC_CACHE_INFO(exist_race);
 		return error;
 	}
-	if (!PageLocked(page))
-		BUG();
-	if (!PageSwapCache(page))
-		BUG();
 	INC_CACHE_INFO(add_total);
 	return 0;
 }
@@ -96,7 +109,9 @@ void __delete_from_swap_cache(struct pag
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!PageSwapCache(page));
 	BUG_ON(PageWriteback(page));
-	__remove_from_page_cache(page);
+	radix_tree_delete(&swapper_space.page_tree, page->private);
+	ClearPageSwapCache(page);
+	dec_page_state(nr_swapcache);
 	INC_CACHE_INFO(del_total);
 }
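__add_to_swap_cache() above follows the standard radix-tree insertion discipline: preload the per-cpu node cache before taking the tree lock (a GFP_ATOMIC preload cannot sleep, so it may still fail), hold the write lock only around the insert and flag updates, and drop the page reference if the insert loses a race. The same discipline in isolation, using a bare rwlock_t where the patch uses its mapping_wrlock() wrappers:

    /* Sketch of the preload-then-insert discipline; error paths trimmed. */
    static int tree_insert_page(struct radix_tree_root *tree, rwlock_t *lock,
                                unsigned long index, struct page *page)
    {
            int error = radix_tree_preload(GFP_ATOMIC); /* fill node cache */
            if (error)
                    return error;
            page_cache_get(page);             /* reference owned by the tree */
            write_lock(lock);
            error = radix_tree_insert(tree, index, page);
            write_unlock(lock);
            if (error)
                    page_cache_release(page); /* lost the race: undo the ref */
            radix_tree_preload_end();         /* re-enable preemption */
            return error;
    }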
@@ -140,8 +155,7 @@ int add_to_swap(struct page * page)
 	/*
 	 * Add it to the swap cache and mark it dirty
 	 */
-	err = add_to_page_cache(page, &swapper_space,
-			entry.val, GFP_ATOMIC);
+	err = __add_to_swap_cache(page, entry);
 
 	if (pf_flags & PF_MEMALLOC)
 		current->flags |= PF_MEMALLOC;
@@ -149,8 +163,7 @@ int add_to_swap(struct page * page)
 	switch (err) {
 	case 0:				/* Success */
 		SetPageUptodate(page);
-		ClearPageDirty(page);
-		set_page_dirty(page);
+		SetPageDirty(page);
 		INC_CACHE_INFO(add_total);
 		return 1;
 	case -EEXIST:
@@ -176,15 +189,16 @@ void delete_from_swap_cache(struct page
 {
 	swp_entry_t entry;
 
+	BUG_ON(!PageSwapCache(page));
 	BUG_ON(!PageLocked(page));
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
-	entry.val = page->index;
+	entry.val = page->private;
 
-	spin_lock(&swapper_space.page_lock);
+	mapping_wrlock(&swapper_space.page_lock);
 	__delete_from_swap_cache(page);
-	spin_unlock(&swapper_space.page_lock);
+	mapping_wrunlock(&swapper_space.page_lock);
 
 	swap_free(entry);
 	page_cache_release(page);
@@ -192,27 +206,11 @@ void delete_from_swap_cache(struct page
 
 int move_to_swap_cache(struct page *page, swp_entry_t entry)
 {
-	struct address_space *mapping = page->mapping;
-	int err;
-
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
-
-	err = radix_tree_insert(&swapper_space.page_tree, entry.val, page);
-	if (!err) {
-		__remove_from_page_cache(page);
-		___add_to_page_cache(page, &swapper_space, entry.val);
-	}
-
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
-
+	int err = __add_to_swap_cache(page, entry);
 	if (!err) {
-		if (!swap_duplicate(entry))
-			BUG();
-		/* shift page from clean_pages to dirty_pages list */
-		BUG_ON(PageDirty(page));
-		set_page_dirty(page);
+		remove_from_page_cache(page);
+		BUG_ON(!swap_duplicate(entry));
+		SetPageDirty(page);
 		INC_CACHE_INFO(add_total);
 	} else if (err == -EEXIST)
 		INC_CACHE_INFO(exist_race);
@@ -222,29 +220,13 @@ int move_to_swap_cache(struct page *page
 int move_from_swap_cache(struct page *page, unsigned long index,
 		struct address_space *mapping)
 {
-	swp_entry_t entry;
-	int err;
-
-	BUG_ON(!PageLocked(page));
-	BUG_ON(PageWriteback(page));
-	BUG_ON(PagePrivate(page));
-
-	entry.val = page->index;
-
-	spin_lock(&swapper_space.page_lock);
-	spin_lock(&mapping->page_lock);
-
-	err = radix_tree_insert(&mapping->page_tree, index, page);
-	if (!err) {
-		__delete_from_swap_cache(page);
-		___add_to_page_cache(page, mapping, index);
+	int err = add_to_page_cache(page, mapping, index, GFP_ATOMIC);
+	if (err == -EEXIST) {
+		INC_CACHE_INFO(exist_race);
+		err = 0;
 	}
-
-	spin_unlock(&mapping->page_lock);
-	spin_unlock(&swapper_space.page_lock);
-
 	if (!err) {
-		swap_free(entry);
+		delete_from_swap_cache(page);
 		/* shift page from clean_pages to dirty_pages list */
 		ClearPageDirty(page);
 		set_page_dirty(page);
@@ -308,11 +290,17 @@ void free_pages_and_swap_cache(struct pa
  * lock getting page table operations atomic even if we drop the page
  * lock before returning.
  */
-struct page * lookup_swap_cache(swp_entry_t entry)
+struct page *lookup_swap_cache(swp_entry_t entry)
 {
-	struct page *found;
+	struct page *page;
 
-	found = find_get_page(&swapper_space, entry.val);
+	mapping_rdlock(&swapper_space.page_lock);
+	page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (page) {
+		page_cache_get(page);
+		INC_CACHE_INFO(find_success);
+	}
+	mapping_rdunlock(&swapper_space.page_lock);
 	/*
 	 * Unsafe to assert PageSwapCache and mapping on page found:
 	 * if SMP nothing prevents swapoff from deleting this page from
@@ -320,9 +308,7 @@ struct page * lookup_swap_cache(swp_entr
 	 * that, but no need to change: we _have_ got the right page.
 	 */
 	INC_CACHE_INFO(find_total);
-	if (found)
-		INC_CACHE_INFO(find_success);
-	return found;
+	return page;
 }
 
 /*
@@ -331,7 +317,7 @@ struct page * lookup_swap_cache(swp_entr
  * A failure return means that either the page allocation failed or that
  * the swap entry is no longer in use.
 */
-struct page * read_swap_cache_async(swp_entry_t entry)
+struct page *read_swap_cache_async(swp_entry_t entry)
 {
 	struct page *found_page, *new_page = NULL;
 	int err;
@@ -343,7 +329,11 @@ struct page * read_swap_cache_async(swp_
 	 * that would confuse statistics: use find_get_page()
 	 * directly.
 	 */
-	found_page = find_get_page(&swapper_space, entry.val);
+	mapping_rdlock(&swapper_space.page_lock);
+	found_page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+	if (found_page)
+		page_cache_get(found_page);
+	mapping_rdunlock(&swapper_space.page_lock);
 	if (found_page)
 		break;
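With swapper_space.page_lock now a reader/writer lock, pure lookups such as lookup_swap_cache() and the probe in read_swap_cache_async() take only the read side: find the page, take a reference while the lock still pins it, then unlock. Boiled down, assuming the mapping_rdlock()/mapping_rdunlock() wrappers this patch introduces:

    /* The read-side lookup pattern used above. */
    static struct page *swap_cache_probe(swp_entry_t entry)
    {
            struct page *page;

            mapping_rdlock(&swapper_space.page_lock);
            page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
            if (page)
                    page_cache_get(page);   /* must happen under the lock */
            mapping_rdunlock(&swapper_space.page_lock);
            return page;
    }

Taking the reference before dropping the lock is what stops a concurrent __delete_from_swap_cache() plus free from pulling the page out from under the caller.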
diff -prauN linux-2.6.0-test7/mm/swapfile.c wli-2.6.0-test7-bk1-29/mm/swapfile.c
--- linux-2.6.0-test7/mm/swapfile.c	2003-10-08 12:24:02.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/mm/swapfile.c	2003-10-09 19:47:30.000000000 -0700
@@ -21,7 +21,7 @@
 #include
 #include
 #include
-#include
+#include
 
 #include
 #include
@@ -247,16 +247,16 @@ static int exclusive_swap_page(struct pa
 	struct swap_info_struct * p;
 	swp_entry_t entry;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (p) {
 		/* Is the only swap cache user the cache itself? */
 		if (p->swap_map[swp_offset(entry)] == 1) {
 			/* Recheck the page count with the pagecache lock held.. */
-			spin_lock(&swapper_space.page_lock);
+			mapping_rdlock(&swapper_space.page_lock);
 			if (page_count(page) - !!PagePrivate(page) == 2)
 				retval = 1;
-			spin_unlock(&swapper_space.page_lock);
+			mapping_rdunlock(&swapper_space.page_lock);
 		}
 		swap_info_put(p);
 	}
@@ -315,7 +315,7 @@ int remove_exclusive_swap_page(struct pa
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
 
-	entry.val = page->index;
+	entry.val = page->private;
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -324,13 +324,13 @@ int remove_exclusive_swap_page(struct pa
 	retval = 0;
 	if (p->swap_map[swp_offset(entry)] == 1) {
 		/* Recheck the page count with the pagecache lock held.. */
-		spin_lock(&swapper_space.page_lock);
+		mapping_wrlock(&swapper_space.page_lock);
 		if ((page_count(page) == 2) && !PageWriteback(page)) {
 			__delete_from_swap_cache(page);
 			SetPageDirty(page);
 			retval = 1;
 		}
-		spin_unlock(&swapper_space.page_lock);
+		mapping_wrunlock(&swapper_space.page_lock);
 	}
 	swap_info_put(p);
@@ -353,8 +353,13 @@ void free_swap_and_cache(swp_entry_t ent
 	p = swap_info_get(entry);
 	if (p) {
-		if (swap_entry_free(p, swp_offset(entry)) == 1)
-			page = find_trylock_page(&swapper_space, entry.val);
+		if (swap_entry_free(p, swp_offset(entry)) == 1) {
+			mapping_rdlock(&swapper_space.page_lock);
+			page = radix_tree_lookup(&swapper_space.page_tree, entry.val);
+			if (page && TestSetPageLocked(page))
+				page = NULL;
+			mapping_rdunlock(&swapper_space.page_lock);
+		}
 		swap_info_put(p);
 	}
 	if (page) {
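From here on, a swap-cache page's swap entry lives in page->private rather than page->index (the entry value is still the radix-tree key, but it is read back from ->private). Call sites rebuild the entry with a designated initializer; the idiom, wrapped in a hypothetical helper that the patch itself open-codes:

    /* Hypothetical convenience helper for the page->private convention
     * adopted by the hunks above; illustrative, not from the patch. */
    static inline swp_entry_t page_swp_entry(struct page *page)
    {
            swp_entry_t entry;

            BUG_ON(!PageSwapCache(page));
            entry.val = page->private;
            return entry;
    }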
 	 */
 	if (unlikely(pte_same(*pte, swp_pte))) {
-		unuse_pte(vma, offset + address, pte,
-			entry, page, pte_chainp);
+		unuse_pte(vma, offset + address, pte, entry, page);
 		pte_unmap(pte);
 		return 1;
 	}
@@ -437,7 +441,7 @@ static int unuse_pmd(struct vm_area_stru
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
 	unsigned long address, unsigned long size,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	pmd_t * pmd;
 	unsigned long offset, end;
@@ -449,7 +453,7 @@ static int unuse_pgd(struct vm_area_stru
 		pgd_clear(dir);
 		return 0;
 	}
-	pmd = pmd_offset(dir, address);
+	pmd = pmd_offset_map(dir, address);
 	offset = address & PGDIR_MASK;
 	address &= ~PGDIR_MASK;
 	end = address + size;
@@ -458,26 +462,25 @@ static int unuse_pgd(struct vm_area_stru
 	if (address >= end)
 		BUG();
 	do {
-		if (unuse_pmd(vma, pmd, address, end - address,
-				offset, entry, page, pte_chainp))
+		if (unuse_pmd(vma, pmd, address, end - address, offset, entry, page))
 			return 1;
 		address = (address + PMD_SIZE) & PMD_MASK;
 		pmd++;
 	} while (address && (address < end));
+	pmd_unmap(pmd - 1);
 	return 0;
 }
 
 /* vma->vm_mm->page_table_lock is held */
 static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-	swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
+	swp_entry_t entry, struct page *page)
 {
 	unsigned long start = vma->vm_start, end = vma->vm_end;
 
 	if (start >= end)
 		BUG();
 	do {
-		if (unuse_pgd(vma, pgdir, start, end - start,
-				entry, page, pte_chainp))
+		if (unuse_pgd(vma, pgdir, start, end - start, entry, page))
 			return 1;
 		start = (start + PGDIR_SIZE) & PGDIR_MASK;
 		pgdir++;
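unuse_pgd() now pairs pmd_offset_map() with pmd_unmap(), this patch's accessors for pmd pages that may sit in highmem: the pmd page is mapped for the duration of the scan and must be unmapped when the loop ends, mirroring the existing pte_offset_map()/pte_unmap() discipline. Note the pmd_unmap(pmd - 1) above: the loop has already advanced pmd one past the last entry it touched. The discipline in miniature, assuming those accessors:

    /* Sketch of the map/unmap pairing for highmem pagetables. */
    static void scan_pgd_sketch(pgd_t *pgd, unsigned long address,
                                unsigned long end)
    {
            pmd_t *pmd = pmd_offset_map(pgd, address); /* kmaps the pmd page */

            do {
                    /* ... examine *pmd here ... */
                    address = (address + PMD_SIZE) & PMD_MASK;
                    pmd++;
            } while (address && address < end);
            pmd_unmap(pmd - 1);  /* balance the mapping; pmd overshot by one */
    }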
 	 */
+	if (!rmap_get_cpu())
+		return -ENOMEM;
 	spin_lock(&mm->page_table_lock);
+	put_cpu();
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
 		pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-		if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+		if (unuse_vma(vma, pgd, entry, page))
 			break;
 	}
 	spin_unlock(&mm->page_table_lock);
-	pte_chain_free(pte_chain);
 	return 0;
 }
@@ -653,8 +653,14 @@ static int try_to_unuse(unsigned int typ
 		if (swcount > 1) {
 			if (start_mm == &init_mm)
 				shmem = shmem_unuse(entry, page);
-			else
+			else {
 				retval = unuse_process(start_mm, entry, page);
+				if (retval) {
+					unlock_page(page);
+					page_cache_release(page);
+					break;
+				}
+			}
 		}
 		if (*swap_map > 1) {
 			int set_start_mm = (*swap_map >= swcount);
@@ -677,9 +683,7 @@ static int try_to_unuse(unsigned int typ
 				cond_resched();
 
 				swcount = *swap_map;
-				if (swcount <= 1)
-					;
-				else if (mm == &init_mm) {
+				if (mm == &init_mm) {
 					set_start_mm = 1;
 					shmem = shmem_unuse(entry, page);
 				} else
@@ -995,9 +999,10 @@ int page_queue_congested(struct page *pa
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 
-	bdi = page->mapping->backing_dev_info;
-	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->index };
+	if (!PageSwapCache(page))
+		bdi = page_mapping(page)->backing_dev_info;
+	else {
+		swp_entry_t entry = { .val = page->private };
 		struct swap_info_struct *sis;
 
 		sis = get_swap_info_struct(swp_type(entry));
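unuse_process() above trades the preallocated pte_chain for rmap_get_cpu(): presumably a per-cpu reservation (the helper is defined elsewhere in the patch) that guarantees the page_add_rmap() calls made under mm->page_table_lock cannot fail for lack of memory, with put_cpu() dropping the cpu pin once the spinlock itself disables preemption. The call shape, as an illustrative guess:

    /* Assumed usage of the patch's rmap_get_cpu(); illustrative only. */
    static int with_rmap_prealloc(struct mm_struct *mm)
    {
            if (!rmap_get_cpu())            /* reserve per-cpu rmap state */
                    return -ENOMEM;
            spin_lock(&mm->page_table_lock);
            put_cpu();                      /* spinlock now pins the cpu */
            /* ... walk page tables; page_add_rmap() cannot fail here ... */
            spin_unlock(&mm->page_table_lock);
            return 0;
    }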
diff -prauN linux-2.6.0-test7/mm/truncate.c wli-2.6.0-test7-bk1-29/mm/truncate.c
--- linux-2.6.0-test7/mm/truncate.c	2003-10-08 12:24:50.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/mm/truncate.c	2003-10-09 19:42:26.000000000 -0700
@@ -19,7 +19,7 @@
 static int do_invalidatepage(struct page *page, unsigned long offset)
 {
 	int (*invalidatepage)(struct page *, unsigned long);
-	invalidatepage = page->mapping->a_ops->invalidatepage;
+	invalidatepage = page_mapping(page)->a_ops->invalidatepage;
 	if (invalidatepage == NULL)
 		invalidatepage = block_invalidatepage;
 	return (*invalidatepage)(page, offset);
@@ -37,7 +37,7 @@ static inline void truncate_partial_page
  * becomes anonymous.  It will be left on the LRU and may even be mapped into
  * user pagetables if we're racing with filemap_nopage().
  *
- * We need to bale out if page->mapping is no longer equal to the original
+ * We need to bale out if page_mapping(page) is no longer equal to the original
  * mapping.  This happens a) when the VM reclaimed the page while we waited on
  * its lock,  b) when a concurrent invalidate_inode_pages got there first and
  * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
@@ -45,7 +45,7 @@ static inline void truncate_partial_page
 static void
 truncate_complete_page(struct address_space *mapping, struct page *page)
 {
-	if (page->mapping != mapping)
+	if (page_mapping(page) != mapping)
 		return;
 
 	if (PagePrivate(page))
@@ -55,32 +55,31 @@ truncate_complete_page(struct address_sp
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
-	page_cache_release(page);	/* pagecache ref */
 }
 
 /*
  * This is for invalidate_inode_pages().  That function can be called at
  * any time, and is not supposed to throw away dirty pages.  But pages can
  * be marked dirty at any time too.  So we re-check the dirtiness inside
- * ->page_lock.  That provides exclusion against the __set_page_dirty
+ * ->page_lock.  That provides exclusion against the set_page_dirty
  * functions.
 */
 static int
 invalidate_complete_page(struct address_space *mapping, struct page *page)
 {
-	if (page->mapping != mapping)
+	if (page_mapping(page) != mapping)
 		return 0;
 
 	if (PagePrivate(page) && !try_to_release_page(page, 0))
 		return 0;
 
-	spin_lock(&mapping->page_lock);
+	mapping_wrlock(&mapping->page_lock);
 	if (PageDirty(page)) {
-		spin_unlock(&mapping->page_lock);
+		mapping_wrunlock(&mapping->page_lock);
 		return 0;
 	}
 	__remove_from_page_cache(page);
-	spin_unlock(&mapping->page_lock);
+	mapping_wrunlock(&mapping->page_lock);
 	ClearPageUptodate(page);
 	page_cache_release(page);	/* pagecache ref */
 	return 1;
@@ -255,7 +254,7 @@ void invalidate_inode_pages2(struct addr
 			struct page *page = pvec.pages[i];
 
 			lock_page(page);
-			if (page->mapping == mapping) {	/* truncate race? */
+			if (page_mapping(page) == mapping) {	/* truncate race? */
 				wait_on_page_writeback(page);
 				next = page->index + 1;
 				if (page_mapped(page))
diff -prauN linux-2.6.0-test7/mm/vmalloc.c wli-2.6.0-test7-bk1-29/mm/vmalloc.c
--- linux-2.6.0-test7/mm/vmalloc.c	2003-10-08 12:24:14.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/mm/vmalloc.c	2003-10-09 19:28:46.000000000 -0700
@@ -71,7 +71,7 @@ static void unmap_area_pmd(pgd_t *dir, u
 		return;
 	}
 
-	pmd = pmd_offset(dir, address);
+	pmd = pmd_offset_kernel(dir, address);
 	address &= ~PGDIR_MASK;
 	end = address + size;
 	if (end > PGDIR_SIZE)
@@ -160,7 +160,7 @@ int map_vm_area(struct vm_struct *area,
 	dir = pgd_offset_k(address);
 	spin_lock(&init_mm.page_table_lock);
 	do {
-		pmd_t *pmd = pmd_alloc(&init_mm, dir, address);
+		pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address);
 		if (!pmd) {
 			err = -ENOMEM;
 			break;
 		}
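The pmd_offset()/pmd_alloc() to pmd_offset_kernel()/pmd_alloc_kernel() substitutions in mm/vmalloc.c and mm/slab.c (and arch/alpha/mm/remap.c earlier in the patch) reflect the split between user pagetables, which may live in highmem and need the map/unmap pairing shown above, and init_mm's kernel pagetables, which are always direct-mapped and can be dereferenced without a kmap. A kernel-range walker therefore looks like the stock code with the _kernel accessors swapped in; a sketch under that assumption:

    /* Sketch: populating pmds for a kernel virtual range under
     * init_mm.page_table_lock, using the patch's pmd_alloc_kernel(). */
    static int alloc_kernel_pmds(unsigned long address, unsigned long end)
    {
            pgd_t *dir = pgd_offset_k(address);
            int err = 0;

            spin_lock(&init_mm.page_table_lock);
            do {
                    pmd_t *pmd = pmd_alloc_kernel(&init_mm, dir, address);
                    if (!pmd) {
                            err = -ENOMEM;  /* no page for the pmd table */
                            break;
                    }
                    address = (address + PGDIR_SIZE) & PGDIR_MASK;
                    dir++;
            } while (address && address < end);
            spin_unlock(&init_mm.page_table_lock);
            return err;
    }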
diff -prauN linux-2.6.0-test7/mm/vmscan.c wli-2.6.0-test7-bk1-29/mm/vmscan.c
--- linux-2.6.0-test7/mm/vmscan.c	2003-10-08 12:24:02.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/mm/vmscan.c	2003-10-09 19:44:30.000000000 -0700
@@ -28,7 +28,7 @@
 #include
 #include
 #include
-#include
+#include
 
 #include
 #include
@@ -177,23 +177,23 @@ static int shrink_slab(long scanned, uns
 	return 0;
 }
 
-/* Must be called with page's pte_chain_lock held. */
+/* Must be called with page's rmap_lock held. */
 static inline int page_mapping_inuse(struct page *page)
 {
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping;
 
 	/* Page is in somebody's page tables. */
 	if (page_mapped(page))
 		return 1;
 
-	/* XXX: does this happen ? */
-	if (!mapping)
-		return 0;
-
 	/* Be more reluctant to reclaim swapcache than pagecache */
 	if (PageSwapCache(page))
 		return 1;
 
+	mapping = page_mapping(page);
+	if (!mapping)
+		return 0;
+
 	/* File is mmap'd by somebody. */
 	if (!list_empty(&mapping->i_mmap))
 		return 1;
@@ -237,7 +237,7 @@ static void handle_write_error(struct ad
 		struct page *page, int error)
 {
 	lock_page(page);
-	if (page->mapping == mapping) {
+	if (page_mapping(page) == mapping) {
 		if (error == -ENOSPC)
 			set_bit(AS_ENOSPC, &mapping->flags);
 		else
@@ -284,15 +284,15 @@ shrink_list(struct list_head *page_list,
 		if (PageWriteback(page))
 			goto keep_locked;
 
-		pte_chain_lock(page);
+		rmap_lock(page);
 		referenced = page_referenced(page);
 		if (referenced && page_mapping_inuse(page)) {
 			/* In active use or really unfreeable.  Activate it. */
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			goto activate_locked;
 		}
 
-		mapping = page->mapping;
+		mapping = page_mapping(page);
 
 #ifdef CONFIG_SWAP
 		/*
@@ -301,12 +301,14 @@ shrink_list(struct list_head *page_list,
 		 *
 		 * XXX: implement swap clustering ?
 		 */
-		if (page_mapped(page) && !mapping && !PagePrivate(page)) {
-			pte_chain_unlock(page);
+		if (PageSwapCache(page))
+			mapping = &swapper_space;
+		else if (PageAnon(page)) {
+			rmap_unlock(page);
 			if (!add_to_swap(page))
 				goto activate_locked;
-			pte_chain_lock(page);
-			mapping = page->mapping;
+			rmap_lock(page);
+			mapping = &swapper_space;
 		}
 #endif /* CONFIG_SWAP */
 
@@ -317,16 +319,16 @@ shrink_list(struct list_head *page_list,
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page)) {
 			case SWAP_FAIL:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto activate_locked;
 			case SWAP_AGAIN:
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				goto keep_locked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
 		}
-		pte_chain_unlock(page);
+		rmap_unlock(page);
 
 		/*
 		 * If the page is dirty, only perform writeback if that write
@@ -358,7 +360,7 @@ shrink_list(struct list_head *page_list,
 				goto keep_locked;
 			if (!may_write_to_queue(mapping->backing_dev_info))
 				goto keep_locked;
-			spin_lock(&mapping->page_lock);
+			mapping_wrlock(&mapping->page_lock);
 			if (test_clear_page_dirty(page)) {
 				int res;
 				struct writeback_control wbc = {
@@ -368,8 +370,9 @@ shrink_list(struct list_head *page_list,
 					.for_reclaim = 1,
 				};
 
-				list_move(&page->list, &mapping->locked_pages);
-				spin_unlock(&mapping->page_lock);
+				if (!PageSwapCache(page))
+					list_move(&page->list, &mapping->locked_pages);
+				mapping_wrunlock(&mapping->page_lock);
 
 				SetPageReclaim(page);
 				res = mapping->a_ops->writepage(page, &wbc);
@@ -385,7 +388,7 @@ shrink_list(struct list_head *page_list,
 				}
 				goto keep;
 			}
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 		}
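shrink_list() can no longer infer "anonymous page needing swap allocation" from a NULL ->mapping; the rewritten logic above tests the page flags directly and substitutes swapper_space by hand, since page->mapping no longer holds it for swap pages. The decision reduced to its skeleton (a condensed restatement of the hunk, not new behavior):

    /* How the rewritten shrink_list() picks the mapping. */
    if (PageSwapCache(page))
            mapping = &swapper_space;       /* already swap-backed */
    else if (PageAnon(page)) {
            rmap_unlock(page);              /* add_to_swap() may block */
            if (!add_to_swap(page))
                    goto activate_locked;   /* no swap slot: keep it active */
            rmap_lock(page);
            mapping = &swapper_space;       /* now swap-backed too */
    }

The same reasoning explains the !PageSwapCache(page) guard on the locked_pages list_move in the writeback path: the pared-down swapper_space no longer maintains the per-mapping page lists.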
 			 * not in use by anybody.  (pagecache + us == 2)
 			 */
 			if (page_count(page) != 2 || PageDirty(page)) {
-				spin_unlock(&mapping->page_lock);
+				mapping_wrunlock(&mapping->page_lock);
 				goto keep_locked;
 			}
 
 #ifdef CONFIG_SWAP
 			if (PageSwapCache(page)) {
-				swp_entry_t swap = { .val = page->index };
+				swp_entry_t swap = { .val = page->private };
 				__delete_from_swap_cache(page);
-				spin_unlock(&mapping->page_lock);
+				mapping_wrunlock(&mapping->page_lock);
 				swap_free(swap);
 				__put_page(page);	/* The pagecache ref */
 				goto free_it;
@@ -443,7 +446,7 @@ shrink_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
 			__remove_from_page_cache(page);
-			spin_unlock(&mapping->page_lock);
+			mapping_wrunlock(&mapping->page_lock);
 			__put_page(page);
 
 free_it:
@@ -663,13 +666,13 @@ refill_inactive_zone(struct zone *zone,
 		page = list_entry(l_hold.prev, struct page, lru);
 		list_del(&page->lru);
 		if (page_mapped(page)) {
-			pte_chain_lock(page);
+			rmap_lock(page);
 			if (page_mapped(page) && page_referenced(page)) {
-				pte_chain_unlock(page);
+				rmap_unlock(page);
 				list_add(&page->lru, &l_active);
 				continue;
 			}
-			pte_chain_unlock(page);
+			rmap_unlock(page);
 			if (!reclaim_mapped) {
 				list_add(&page->lru, &l_active);
 				continue;
 			}
@@ -679,7 +682,7 @@ refill_inactive_zone(struct zone *zone,
 		 * FIXME: need to consider page_count(page) here if/when we
 		 * reap orphaned pages via the LRU (Daniel's locking stuff)
 		 */
-		if (total_swap_pages == 0 && !page->mapping &&
+		if (total_swap_pages == 0 && !page_mapping(page) &&
 						!PagePrivate(page)) {
 			list_add(&page->lru, &l_active);
 			continue;
 		}
@@ -837,6 +840,10 @@ shrink_caches(struct zone *classzone, in
 	}
 	return ret;
 }
+
+#ifndef HAVE_ARCH_PAGETABLE_CACHE
+#define shrink_pagetable_cache(gfp_mask) do { } while (0)
+#endif
 
 /*
  * This is the main entry point to direct page reclaim.
@@ -890,6 +897,9 @@ int try_to_free_pages(struct zone *cz,
 		 */
 		wakeup_bdflush(total_scanned);
 
+		/* shoot down some pagetable caches before napping */
+		shrink_pagetable_cache(gfp_mask);
+
 		/* Take a nap, wait for some writeback to complete */
 		blk_congestion_wait(WRITE, HZ/10);
 		if (cz - cz->zone_pgdat->node_zones < ZONE_HIGHMEM) {
@@ -981,8 +991,10 @@ static int balance_pgdat(pg_data_t *pgda
 		}
 		if (all_zones_ok)
 			break;
-		if (to_free > 0)
+		if (to_free > 0) {
+			shrink_pagetable_cache(GFP_HIGHUSER);
 			blk_congestion_wait(WRITE, HZ/10);
+		}
 	}
 
 	for (i = 0; i < pgdat->nr_zones; i++) {
diff -prauN linux-2.6.0-test7/net/sunrpc/xdr.c wli-2.6.0-test7-bk1-29/net/sunrpc/xdr.c
--- linux-2.6.0-test7/net/sunrpc/xdr.c	2003-10-08 12:24:25.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/net/sunrpc/xdr.c	2003-10-09 19:27:23.000000000 -0700
@@ -715,6 +715,7 @@ xdr_read_pages(struct xdr_stream *xdr, u
 	struct xdr_buf *buf = xdr->buf;
 	struct iovec *iov;
 	ssize_t shift;
+	int padding;
 
 	/* Realign pages to current pointer position */
 	iov = buf->head;
@@ -723,10 +724,10 @@ xdr_read_pages(struct xdr_stream *xdr, u
 		xdr_shrink_bufhead(buf, shift);
 
 	/* Truncate page data and move it into the tail */
-	len = XDR_QUADLEN(len) << 2;
 	if (buf->page_len > len)
 		xdr_shrink_pagelen(buf, buf->page_len - len);
+	padding = (XDR_QUADLEN(len) << 2) - len;
 	xdr->iov = iov = buf->tail;
-	xdr->p = (uint32_t *)iov->iov_base;
+	xdr->p = (uint32_t *)((char *)iov->iov_base + padding);
 	xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len);
 }
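The xdr_read_pages() change separates the element length from its XDR padding. XDR rounds opaque data up to a four-byte boundary, so the next stream data begins padding = (XDR_QUADLEN(len) << 2) - len bytes into the tail iovec; previously the rounded value overwrote len itself and the tail pointer ignored the padding. The arithmetic is easy to check in a standalone program:

    #include <assert.h>
    #include <stdio.h>

    #define XDR_QUADLEN(l)  (((l) + 3) >> 2)   /* as in the kernel headers */

    int main(void)
    {
            unsigned int len;

            for (len = 0; len < 12; len++) {
                    int padding = (int)(XDR_QUADLEN(len) << 2) - (int)len;
                    assert(padding >= 0 && padding < 4);   /* 0..3 bytes */
                    assert((len + padding) % 4 == 0);      /* quad aligned */
                    printf("len=%2u padding=%d\n", len, padding);
            }
            return 0;
    }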
diff -prauN linux-2.6.0-test7/sound/oss/cs46xx.c wli-2.6.0-test7-bk1-29/sound/oss/cs46xx.c
--- linux-2.6.0-test7/sound/oss/cs46xx.c	2003-10-08 12:24:00.000000000 -0700
+++ wli-2.6.0-test7-bk1-29/sound/oss/cs46xx.c	2003-10-09 19:57:40.000000000 -0700
@@ -1890,7 +1890,6 @@ static int cs_midi_open(struct inode *in
 	spin_unlock_irqrestore(&card->midi.lock, flags);
 	card->midi.open_mode |= (file->f_mode & (FMODE_READ | FMODE_WRITE));
 	up(&card->midi.open_sem);
-	MOD_INC_USE_COUNT; /* for 2.2 */
 	return 0;
 }
@@ -1926,7 +1925,6 @@ static int cs_midi_release(struct inode
 	card->midi.open_mode &= (~(file->f_mode & (FMODE_READ | FMODE_WRITE)));
 	up(&card->midi.open_sem);
 	wake_up(&card->midi.open_wait);
-	MOD_DEC_USE_COUNT; /* for 2.2 */
 	return 0;
 }
@@ -3370,7 +3368,6 @@ static int cs_open(struct inode *inode,
 		if((ret = prog_dmabuf(state)))
 			return ret;
 	}
-	MOD_INC_USE_COUNT; /* for 2.2 */
 	CS_DBGOUT(CS_OPEN | CS_FUNCTION, 2, printk("cs46xx: cs_open()- 0\n") );
 	return 0;
 }
@@ -3457,7 +3454,6 @@ static int cs_release(struct inod
 	}
 	CS_DBGOUT(CS_FUNCTION | CS_RELEASE, 2, printk("cs46xx: cs_release()- 0\n") );
-	MOD_DEC_USE_COUNT; /* For 2.2 */
 	return 0;
 }
@@ -4105,7 +4101,6 @@ static int cs_open_mixdev(struct inode *
 	}
 	card->amplifier_ctrl(card, 1);
 	CS_INC_USE_COUNT(&card->mixer_use_cnt);
-	MOD_INC_USE_COUNT; /* for 2.2 */
 	CS_DBGOUT(CS_FUNCTION | CS_OPEN, 4,
 		printk(KERN_INFO "cs46xx: cs_open_mixdev()- 0\n"));
 	return 0;
@@ -4136,7 +4131,6 @@ static int cs_release_mixdev(struct inod
 		return -ENODEV;
 	}
match:
-	MOD_DEC_USE_COUNT; /* for 2.2 */
 	if(!CS_DEC_AND_TEST(&card->mixer_use_cnt)) {
 		CS_DBGOUT(CS_FUNCTION | CS_RELEASE, 4,
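The deleted MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT pairs are 2.4-era manual module refcounting. In 2.6 the VFS takes a module reference around every file_operations call through the structure's owner field, so open() and release() need no bookkeeping of their own. The 2.6 replacement is declarative; the field values here are illustrative, not quoted from cs46xx.c:

    static struct file_operations cs_midi_fops = {
            .owner          = THIS_MODULE,  /* VFS pins the module for us */
            .open           = cs_midi_open,
            .release        = cs_midi_release,
            /* ... remaining methods ... */
    };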