diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/CREDITS linuxppc64_2_4/CREDITS --- ../kernel.org/linux-2.4.19/CREDITS Wed May 15 08:29:33 2002 +++ linuxppc64_2_4/CREDITS Mon May 13 16:39:05 2002 @@ -986,6 +986,14 @@ S: 80050-430 - Curitiba - Paraná S: Brazil +N: Tom Gall +E: tom_gall@vnet.ibm.com +E: tgall@rochcivictheatre.org +D: ppc64, ppc +S: 710 Walnut St +S: Mantorville, MN 55955 +S: USA + N: Nigel Gamble E: nigel@nrg.org E: nigel@sgi.com diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Documentation/Configure.help linuxppc64_2_4/Documentation/Configure.help --- ../kernel.org/linux-2.4.19/Documentation/Configure.help Wed May 15 08:29:33 2002 +++ linuxppc64_2_4/Documentation/Configure.help Mon May 13 16:39:06 2002 @@ -232,6 +232,13 @@ CPU and the single-board computers built around it, targeted for network and embedded applications. For more information see the Axis Communication site, . +PowerPC64 processor +CONFIG_PPC64 + The PowerPC architecture was designed for both 32 bit and 64 bit + processor implementations. 64 bit PowerPC processors are in many + ways a superset of their 32 bit PowerPC cousins. Each 64 bit PowerPC + processor also has a 32 bit mode to allow for 32 bit compatibility. + The home of the PowerPC 64 Linux project is at Multiquad support for NUMA systems CONFIG_MULTIQUAD @@ -15358,6 +15365,20 @@ hard drives and ADFS-formatted floppy disks. This is experimental codes, so if you're unsure, say N. +JFS filesystem support +CONFIG_JFS_FS + This is a port of IBM's Journaled Filesystem . More information is + available in the file Documentation/filesystems/jfs.txt. + + If you do not intend to use the JFS filesystem, say N. + +JFS Debugging +CONFIG_JFS_DEBUG + If you are experiencing any problems with the JFS filesystem, say + Y here. This will result in additional debugging messages to be + written to the system log. Under normal circumstances, this + results in very little overhead. + /dev/pts file system for Unix98 PTYs CONFIG_DEVPTS_FS You should say Y here if you said Y to "Unix98 PTY support" above. @@ -16527,6 +16548,19 @@ . The module will be called isicom.o. +IBM Multiport Serial Adapter +CONFIG_ICOM + This driver is for a family of multiport serial adapters including + 2 port RVX (iSeries 2745), 2 port modem (iSeries + 2772) and 1 port RVX + 1 port modem (iSeries 2771). The + module is called iCom.o +CONFIG_ICOM_MODEM_CC + This field entry enables the device driver to configure the modem + for appropriate operations based on country code. If you do not + have an internal modem card then a blank entry is recommended. + If you do have an internal modem card, look for the comment in iCom.c + indicating which value relates to your country. + Unix98 PTY support CONFIG_UNIX98_PTYS A pseudo terminal (PTY) is a software device consisting of two @@ -21333,6 +21367,12 @@ Select APUS if configuring for a PowerUP Amiga. More information is available at: . +# Choice: i or p +Platform support +CONFIG_PPC_ISERIES + Linux runs on certain models of the IBM AS/400, now known as the + IBM iSeries. Generally if you can run LPAR (Logical Partitioning) + on your iSeries you can run Linux in a partition on your machine. AltiVec kernel support CONFIG_ALTIVEC @@ -21396,6 +21436,16 @@ You may also want to compile the dma sound driver as a module and have it autoloaded. The act of removing the module shuts down the sound hardware for more power savings. +Platform support +CONFIG_PPC_PSERIES + Linux runs on most models of IBM pSeries hardware. (pSeries used + to be known as the RS/6000) + + See for exact model information for the + 64 bit PowerPC kernel. + + pSeries Linux information from IBM can be found at: + APM emulation CONFIG_PMAC_APM_EMU @@ -21684,6 +21734,12 @@ Date of Release: early 2001 (?) End of life: - URL: +Support for Large Memory +CONFIG_MSCHUNKS + MsChunks stands for Main Store Chunks and specifically allows the + 64 bit PowerPC Linux kernel to optimize for machines with sparse + discontiguous memory. iSeries kernels need to have this on. + It is recommended that for pSeries hardware that you answer N. ADB raw keycode support CONFIG_MAC_ADBKEYCODES diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Documentation/cachetlb.txt linuxppc64_2_4/Documentation/cachetlb.txt --- ../kernel.org/linux-2.4.19/Documentation/cachetlb.txt Fri Apr 19 11:00:11 2002 +++ linuxppc64_2_4/Documentation/cachetlb.txt Mon Apr 22 10:24:05 2002 @@ -260,8 +260,9 @@ Here is the new interface: - void copy_user_page(void *to, void *from, unsigned long address) - void clear_user_page(void *to, unsigned long address) + void copy_user_page(struct page *to, struct page *from, + unsigned long address) + void clear_user_page(struct page *to, unsigned long address) These two routines store data in user anonymous or COW pages. It allows a port to efficiently avoid D-cache alias @@ -279,6 +280,11 @@ If D-cache aliasing is not an issue, these two routines may simply call memcpy/memset directly and do nothing more. + + There are default versions of these procedures supplied in + include/linux/highmem.h. If a port does not want to use the + default versions it should declare them and define the symbol + __HAVE_ARCH_USER_PAGE in include/asm/page.h. void flush_dcache_page(struct page *page) diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Documentation/filesystems/00-INDEX linuxppc64_2_4/Documentation/filesystems/00-INDEX --- ../kernel.org/linux-2.4.19/Documentation/filesystems/00-INDEX Fri Apr 19 10:30:50 2002 +++ linuxppc64_2_4/Documentation/filesystems/00-INDEX Thu Sep 13 14:29:38 2001 @@ -22,6 +22,8 @@ - info and mount options for the OS/2 HPFS. isofs.txt - info and mount options for the ISO 9660 (CDROM) filesystem. +jfs.txt + - info and mount options for the JFS filesystem. ncpfs.txt - info on Novell Netware(tm) filesystem using NCP protocol. ntfs.txt diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Documentation/filesystems/changelog.jfs linuxppc64_2_4/Documentation/filesystems/changelog.jfs --- ../kernel.org/linux-2.4.19/Documentation/filesystems/changelog.jfs Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/Documentation/filesystems/changelog.jfs Tue Apr 23 11:14:24 2002 @@ -0,0 +1,234 @@ +IBM's Journaled File System (JFS) for Linux version 1.0.17 +Team members +Steve Best sbest@us.ibm.com +Dave Kleikamp shaggy@austin.ibm.com +Barry Arndt barndt@us.ibm.com +Christoph Hellwig hch@infradead.org + + +Release April 2, 2002 (version 1.0.17) + +This is our fifty-fifth release of IBM's Enterprise JFS technology port to Linux. +Beta 1 was release 0.1.0 on 12/8/2000, Beta 2 was release 0.2.0 on 3/7/2001, +Beta 3 was release 0.3.0 on 4/30/2001, and release 1.0.0 on 6/28/2001. + + +Function and Fixes in drop 55 (1.0.17) + - Call sb_set_blocksize instead of set_blocksize in 2.5 (Christoph Hellwig) + - Replace strtok by strsep (Christoph Hellwig) + - Store entire device number in log superblock rather than just the minor. + - Include file clean (Christoph Hellwig) + - Fix race introduced by thread handling cleanups (Christoph Hellwig) + - Detect dtree corruption to avoid infinite loop + - JFS needs to include completion.h + - Support external log(journal) device file system work part 1 (Christoph Hellwig) + +Function and Fixes in drop 54 (1.0.16) + - Limit readdir offset to signed integer for NFSv2 (Christoph Hellwig) + - missing static in jfs_imap.c (Christoph Hellwig) + - Fix infinite loop in jfs_readdir + weren't updating the directory index table completely (bug # 2591) + - Sync up 2.4 tree with 2.5 -- (Christoph Hellwig & Shaggy) + move to completions, provide back-compact for pre-2.4.7 + remove dead code + add kdev_t conversion, that should have been in 2.4 anyway + move one-time inode initialization into slab constructor + - Remove non-core files from CVS + +Function and Fixes in drop 53 (1.0.15) + - Fix trap when appending to very large file + - Moving jfs headers into fs/jfs at Linus' request + - Move up to linux-2.5.4 + - Fix file size limit on 32-bit (Andi Kleen) + - make changelog more read-able and include only 1.0.0 and above (Christoph Hellwig) + - Don't allocate metadata pages from high memory. JFS keeps them kmapped too long causing deadlock. + - Fix xtree corruption when creating file with >= 64 GB of physically contiguous dasd + - Replace semaphore with struct completion for thread startup/shutdown (Benedikt Spranger) + - cleanup Tx alloc/free (Christoph Hellwig) + - Move up to linux-2.5.3 + - thread cleanups (Christoph Hellwig) + - First step toward making tblocks and tlocks dynamically allocated. Intro tid_t and lid_t to + insulate the majority of the code from future changes. Also hide TxBlock and TxLock arrays + by using macros to get from tids and lids to real structures. + - minor list-handling cleanup (Christoph Hellwig) + - Replace altnext and altprev with struct list_head + - Clean up the debugging code and add support for collecting statistics (Christoph Hellwig) + + +Function and Fixes in drop 52 (1.0.14) + - Fix hang in invalidate_metapages when jfs.o is built as a module + - Fix anon_list removal logic in txLock + +Function and Fixes in drop 51 (1.0.13) + - chmod changes on newly created directories are lost after umount (bug 2535) + - Page locking race fixes + - Improve metapage locking + - Fix timing window. Lock page while metapage is active to avoid page going + away before the metadata is released. (Fixed crash during mount/umount testing) + - Make changes for 2.5.2 kernel + - Fix race condition truncating large files + +Function and Fixes in drop50 (1.0.12) + - Add O_DIRECT support + - Add support for 2.4.17 kernel + - Make sure COMMIT_STALE gets reset before the inode is unlocked. Fixing + this gets rid of XT_GETPAGE errors + - Remove invalid __exit keyword from metapage_exit and txExit. + - fix assert(log->cqueue.head == NULL by waiting longer + +Function and Fixes in drop49 (1.0.11) + - Readdir was not handling multibyte codepages correctly. + - Make mount option parsing more robust. + - Add iocharset mount option. + - Journalling of symlinks incorrect, resulting in logredo failure of -265. + - Add jfsutils information to Changes file + - Improve recoverability of the file system when metadata corruption is detected. + - Fix kernel OOPS when root inode is corrupted + +Function and Fixes in drop48 (1.0.10) + - put inodes later on hash queues + - Fix boundary case in xtTruncate + - When invalidating metadata, try to flush the dirty buffers rather than sync them. + - Add another sanity check to avoid trapping when imap is corrupt + - Fix file truncate while removing large file (assert(cmp == 0)) + - read_cache_page returns ERR_PTR, not NULL on error + - Add dtSearchNode and dtRelocate + - JFS needs to use generic_file_open & generic_file_llseek + - Remove lazyQwait, etc. It created an unnecessary bottleneck in TxBegin. + +Function and Fixes in drop47 (1.0.9) + - Fix data corruption problem when creating files while deleting others. (jitterbug 183) + - Make sure all metadata is written before finalizing the log + - Fix serialization problem in shutdown by setting i_size of directory sooner. (bugzilla #334) + - JFS should quit whining when special files are marked dirty during read-only mount. + - Must always check rc after DT_GETPAGE + - Add diExtendFS + - Removing defconfig form JFS source - not really needed + +Function and Fixes in drop46 (1.0.8) + - Synclist was being built backwards causing logredo to quit too early + - jfs_compat.h needs to include module.h + - uncomment EXPORTS_NO_SYMBOLS in super.c + - Minor code cleanup + - xtree of zero-truncated file not being logged + - Fix logging on file truncate + - remove unused metapage fields + +Function and Fixes in drop45 (1.0.7) + - cleanup remove IS_KIOBUFIO define. + - cleanup remove TRUNC_NO_TOSS define. + - have jFYI's use the name directly from dentry + - Remove nul _ALLOC and _FREE macros and also make spinlocks static. + - cleanup add externs where needed in the header files + - jfs_write_inode is a bad place to call iput. Also limit warnings. + - More truncate cleanup + - Truncate cleanup + - Add missing statics in jfs_metapage.c + - fsync fixes + - Clean up symlink code - use page_symlink_inode_operations + - unicode handling cleanup + - cleanup replace UniChar with wchar_t + - Get rid of CDLL_* macros - use list.h instead + - 2.4.11-prex mount problem Call new_inode instead of get_empty_inode + - use kernel min/max macros + - Add MODULE_LICENSE stub for older kernels + - IA64/gcc3 fixes + - Log Manager fixes, introduce __SLEEP_COND macro + - Mark superblock dirty when some errors detected (forcing fsck to be run). + - More robust remounting from r/o to r/w. + - Misc. cleanup add static where appropriate + - small cleanup in jfs_umount_rw + - add MODULE_ stuff + - Set *dropped_lock in alloc_metapage + - Get rid of unused log list + - cleanup jfs_imap.c to remove _OLD_STUFF and _NO_MORE_MOUNT_INODE defines + - Log manager cleanup + - Transaction manager cleanup + - correct memory allocations flags + - Better handling of iterative truncation + - Change continue to break, otherwise we don't re-acquire LAZY_LOCK + +Function and Fixes in drop44 (1.0.6) + - Create jfs_incore.h which merges linux/jfs_fs.h, linux/jfs_fs_i.h, and jfs_fs_sb.h + - Create a configuration option to handle JFS_DEBUG define + - Fixed a few cases where positive error codes were returned to the VFS. + - Replace jfs_dir_read by generic_read_dir. + - jfs_fsync_inode is only called by jfs_fsync_file, merge the two and rename to jfs_fsync. + - Add a bunch of missing externs. + - jfs_rwlock_lock is unused, nuke it. + - Always use atomic set/test_bit operations to protect jfs_ip->cflag + - Combine jfs_ip->flag with jfs_ip->cflag + - Fixed minor format errors reported by fsck + - cflags should be long so bitops always works correctly + - Use GFP_NOFS for runtime memory allocations + - Support VM changes in 2.4.10 of the kernel + - Remove ifdefs supporting older 2.4 kernels. JFS now requires at least 2.4.3 or 2.4.2-ac2 + - Simplify and remove one use of IWRITE_TRYLOCK + - jfs_truncate was not passing tid to xtTruncate + - removed obsolete extent_page workaround + - correct recovery from failed diAlloc call (disk full) + - In write_metapage, don't call commit_write if prepare_write failed + +Function and Fixes in drop43 (1.0.5) + - Allow separate allocation of JFS-private superblock/inode data. + - Remove checks in namei.c that are already done by the VFS. + - Remove redundant mutex defines. + - Replace all occurrences of #include with #include + - Work around race condition in remount -fixes OOPS during shutdown + - Truncate large files incrementally ( affects directories too) + +Function and Fixes in drop42 (1.0.4) + - Fixed compiler warnings in the FS when building on 64 bits systems + - Fixed deadlock where jfsCommit hung in hold_metapage + - Fixed problems with remount + - Reserve metapages for jfsCommit thread + - Get rid of buggy invalidate_metapage & use discard_metapage + - Don't hand metapages to jfsIOthread (too many context switches) (jitterbug 125, bugzilla 238) + - Fix error message in jfs_strtoUCS + +Function and Fixes in drop41 (1.0.3) + - Patch to move from previous release to latest release needs to update the version number in super.c + - Jitterbug problems (134,140,152) removing files have been fixed + - Set rc=ENOSPC if ialloc fails in jfs_create and jfs_mkdir + - Fixed jfs_txnmgr.c 775! assert + - Fixed jfs_txnmgr.c 884! assert(mp->nohomeok==0) + - Fix hang - prevent tblocks from being exhausted + - Fix oops trying to mount reiserfs + - Fail more gracefully in jfs_imap.c + - Print more information when char2uni fails + - Fix timing problem between Block map and metapage cache - jitterbug 139 + - Code Cleanup (removed many ifdef's, obsolete code, ran code through indent) Mostly 2.4 tree + - Split source tree (Now have a separate source tree for 2.2, 2.4, and jfsutils) + +Function and Fixes in drop40 (1.0.2) + - Fixed multiple truncate hang + - Fixed hang on unlink a file and sync happening at the same time + - Improved handling of kmalloc error conditions + - Fixed hang in blk_get_queue and SMP deadlock: bh_end_io call generic_make_request + (jitterbug 145 and 146) + - stbl was not set correctly set in dtDelete + - changed trap to printk in dbAllocAG to avoid system hang + +Function and Fixes in drop 39 (1.0.1) + - Fixed hang during copying files on 2.2.x series + - Fixed TxLock compile problem + - Fixed to correctly update the number of blocks for directories (this was causing the FS + to show fsck error after compiling mozilla). + - Fixed to prevent old data from being written to disk from the page cache. + +Function and Fixes in drop 38 (1.0.0) + - Fixed some general log problems + +Please send bugs, comments, cards and letters to linuxjfs@us.ibm.com. + +The JFS mailing list can be subscribed to by using the link labeled "Mail list Subscribe" +at our web page http://oss.software.ibm.com/jfs/. + + + + + + + + + diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Documentation/filesystems/jfs.txt linuxppc64_2_4/Documentation/filesystems/jfs.txt --- ../kernel.org/linux-2.4.19/Documentation/filesystems/jfs.txt Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/Documentation/filesystems/jfs.txt Tue Apr 23 11:14:24 2002 @@ -0,0 +1,10 @@ +IBM's Journaled File System (JFS) for Linux + +The JFS utilities can be found at the JFS homepage at +http://oss.software.ibm.com/jfs + +Team members +Steve Best sbest@us.ibm.com +Dave Kleikamp shaggy@austin.ibm.com +Barry Arndt barndt@us.ibm.com +Christoph Hellwig hch@infradead.org diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/MAINTAINERS linuxppc64_2_4/MAINTAINERS --- ../kernel.org/linux-2.4.19/MAINTAINERS Wed May 15 08:29:34 2002 +++ linuxppc64_2_4/MAINTAINERS Mon May 13 16:39:05 2002 @@ -852,6 +852,13 @@ W: http://sources.redhat.com/jffs2/ S: Maintained +JFS FILESYSTEM +P: Dave Kleikamp +M: shaggy@austin.ibm.com +L: jfs-discussion@oss.software.ibm.com +W: http://oss.software.ibm.com/developerworks/opensource/jfs/ +S: Supported + JOYSTICK DRIVER P: Vojtech Pavlik M: vojtech@suse.cz @@ -925,6 +932,13 @@ W: http://www.linuxppc.org/ L: linuxppc-dev@lists.linuxppc.org S: Maintained + +LINUX FOR 64BIT POWERPC +P: David Engebretsen +M: engebret@us.ibm.com +W: http://linuxppc64.org +L: linuxppc64-dev@lists.linuxppc.org +S: Supported LINUX FOR 64BIT POWERPC P: David Engebretsen diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/Makefile linuxppc64_2_4/Makefile --- ../kernel.org/linux-2.4.19/Makefile Wed May 15 08:29:34 2002 +++ linuxppc64_2_4/Makefile Mon May 13 16:39:05 2002 @@ -5,7 +5,8 @@ KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) -ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) +#ARCH := $(shell uname -m | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ -e s/arm.*/arm/ -e s/sa110/arm/) +ARCH := ppc64 KERNELPATH=kernel-$(shell echo $(KERNELRELEASE) | sed -e "s/-//g") CONFIG_SHELL := $(shell if [ -x "$$BASH" ]; then echo $$BASH; \ @@ -19,7 +20,7 @@ HOSTCC = gcc HOSTCFLAGS = -Wall -Wstrict-prototypes -O2 -fomit-frame-pointer -CROSS_COMPILE = +CROSS_COMPILE = /usr/local/ppc64-current3.0/bin/powerpc64-linux- # # Include the make variables (CC, etc...) @@ -154,6 +155,7 @@ DRIVERS-$(CONFIG_SCSI) += drivers/scsi/scsidrv.o DRIVERS-$(CONFIG_FUSION_BOOT) += drivers/message/fusion/fusion.o DRIVERS-$(CONFIG_IEEE1394) += drivers/ieee1394/ieee1394drv.o +DRIVERS-$(CONFIG_PPC_ISERIES) += drivers/iseries/iseries.o ifneq ($(CONFIG_CD_NO_IDESCSI)$(CONFIG_BLK_DEV_IDECD)$(CONFIG_BLK_DEV_SR)$(CONFIG_PARIDE_PCD),) DRIVERS-y += drivers/cdrom/driver.o @@ -170,7 +172,6 @@ DRIVERS-$(CONFIG_SBUS) += drivers/sbus/sbus_all.o DRIVERS-$(CONFIG_ZORRO) += drivers/zorro/driver.o DRIVERS-$(CONFIG_FC4) += drivers/fc4/fc4.a -DRIVERS-$(CONFIG_PPC) += drivers/macintosh/macintosh.o DRIVERS-$(CONFIG_MAC) += drivers/macintosh/macintosh.o DRIVERS-$(CONFIG_ISAPNP) += drivers/pnp/pnp.o DRIVERS-$(CONFIG_SGI_IP22) += drivers/sgi/sgi.a diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/boot/zImage.c linuxppc64_2_4/arch/ppc64/boot/zImage.c --- ../kernel.org/linux-2.4.19/arch/ppc64/boot/zImage.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/boot/zImage.c Tue Apr 30 09:19:01 2002 @@ -225,7 +225,7 @@ rec = bi_rec_alloc(rec, 2); rec->tag = BI_MACHTYPE; - rec->data[0] = _MACH_pSeries; + rec->data[0] = PLATFORM_PSERIES; rec->data[1] = 1; if ( initrd.size > 0 ) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/config.in linuxppc64_2_4/arch/ppc64/config.in --- ../kernel.org/linux-2.4.19/arch/ppc64/config.in Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/config.in Mon May 6 14:29:20 2002 @@ -37,6 +37,7 @@ define_bool CONFIG_MSCHUNKS y else bool 'MsChunks Physical to Absolute address translation support' CONFIG_MSCHUNKS +tristate 'Firmware flash interface' CONFIG_RTAS_FLASH fi diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/configs/iSeries_nodevfs_ideemul_defconfig linuxppc64_2_4/arch/ppc64/configs/iSeries_nodevfs_ideemul_defconfig --- ../kernel.org/linux-2.4.19/arch/ppc64/configs/iSeries_nodevfs_ideemul_defconfig Fri Apr 19 11:00:33 2002 +++ linuxppc64_2_4/arch/ppc64/configs/iSeries_nodevfs_ideemul_defconfig Thu Apr 25 20:32:38 2002 @@ -67,7 +67,6 @@ # # CONFIG_PNP is not set # CONFIG_ISAPNP is not set -# CONFIG_PNPBIOS is not set # # Block devices @@ -77,6 +76,7 @@ # CONFIG_PARIDE is not set # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_CISS_SCSI_TAPE is not set # CONFIG_BLK_DEV_DAC960 is not set CONFIG_BLK_DEV_LOOP=y # CONFIG_BLK_DEV_NBD is not set @@ -101,8 +101,6 @@ # CONFIG_PACKET=y # CONFIG_PACKET_MMAP is not set -CONFIG_NETLINK=y -CONFIG_RTNETLINK=y # CONFIG_NETLINK_DEV is not set # CONFIG_NETFILTER is not set CONFIG_FILTER=y @@ -110,8 +108,6 @@ CONFIG_INET=y CONFIG_IP_MULTICAST=y CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_RTNETLINK=y -CONFIG_NETLINK=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_NAT=y CONFIG_IP_ROUTE_MULTIPATH=y @@ -134,12 +130,18 @@ # CONFIG_IPV6 is not set # CONFIG_KHTTPD is not set # CONFIG_ATM is not set +# CONFIG_VLAN_8021Q is not set # # # # CONFIG_IPX is not set # CONFIG_ATALK is not set + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set # CONFIG_DECNET is not set # CONFIG_BRIDGE is not set # CONFIG_X25 is not set @@ -155,6 +157,16 @@ # QoS and/or fair queueing # # CONFIG_NET_SCHED is not set + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# +# ATA/IDE/MFM/RLL support +# +# CONFIG_IDE is not set # CONFIG_BLK_DEV_IDE_MODES is not set # CONFIG_BLK_DEV_HD is not set @@ -192,6 +204,7 @@ # CONFIG_SCSI_AHA152X is not set # CONFIG_SCSI_AHA1542 is not set # CONFIG_SCSI_AHA1740 is not set +# CONFIG_SCSI_AACRAID is not set # CONFIG_SCSI_AIC7XXX is not set # CONFIG_SCSI_AIC7XXX_OLD is not set # CONFIG_SCSI_DPT_I2O is not set @@ -213,6 +226,7 @@ # CONFIG_SCSI_INIA100 is not set # CONFIG_SCSI_NCR53C406A is not set # CONFIG_SCSI_NCR53C7xx is not set +# CONFIG_SCSI_SYM53C8XX_2 is not set # CONFIG_SCSI_NCR53C8XX is not set # CONFIG_SCSI_SYM53C8XX is not set # CONFIG_SCSI_PAS16 is not set @@ -229,8 +243,6 @@ # CONFIG_SCSI_T128 is not set # CONFIG_SCSI_U14_34F is not set # CONFIG_SCSI_DEBUG is not set -# CONFIG_SCSI_MESH is not set -# CONFIG_SCSI_MAC53C94 is not set # # IEEE 1394 (FireWire) support (EXPERIMENTAL) @@ -259,12 +271,10 @@ # CONFIG_MACE is not set # CONFIG_BMAC is not set # CONFIG_GMAC is not set -# CONFIG_OAKNET is not set # CONFIG_SUNLANCE is not set # CONFIG_HAPPYMEAL is not set # CONFIG_SUNBMAC is not set # CONFIG_SUNQE is not set -# CONFIG_SUNLANCE is not set # CONFIG_SUNGEM is not set # CONFIG_NET_VENDOR_3COM is not set # CONFIG_LANCE is not set @@ -278,6 +288,7 @@ # CONFIG_APRICOT is not set # CONFIG_CS89x0 is not set # CONFIG_TULIP is not set +# CONFIG_TC35815 is not set # CONFIG_DE4X5 is not set # CONFIG_DGRS is not set # CONFIG_DM9102 is not set @@ -293,11 +304,13 @@ # CONFIG_8139TOO_PIO is not set # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set +# CONFIG_8139_NEW_RX_RESET is not set # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set # CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set +# CONFIG_VIA_RHINE_MMIO is not set # CONFIG_WINBOND_840 is not set # CONFIG_NET_POCKET is not set @@ -312,6 +325,7 @@ # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_SK98LIN is not set +# CONFIG_TIGON3 is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_PLIP is not set @@ -329,6 +343,7 @@ CONFIG_TR=y CONFIG_IBMOL=m # CONFIG_IBMLS is not set +# CONFIG_3C359 is not set # CONFIG_TMS380TR is not set # CONFIG_NET_FC is not set # CONFIG_RCPCI is not set @@ -380,6 +395,15 @@ # CONFIG_FB is not set # +# Input core support +# +# CONFIG_INPUT is not set +# CONFIG_INPUT_KEYBDEV is not set +# CONFIG_INPUT_MOUSEDEV is not set +# CONFIG_INPUT_JOYDEV is not set +# CONFIG_INPUT_EVDEV is not set + +# # iSeries device drivers # CONFIG_VIOCONS=y @@ -435,43 +459,20 @@ # Joysticks # # CONFIG_INPUT_GAMEPORT is not set -# CONFIG_INPUT_NS558 is not set -# CONFIG_INPUT_LIGHTNING is not set -# CONFIG_INPUT_PCIGAME is not set -# CONFIG_INPUT_CS461X is not set -# CONFIG_INPUT_EMU10K1 is not set -# CONFIG_INPUT_SERIO is not set -# CONFIG_INPUT_SERPORT is not set # -# Joysticks +# Input core support is needed for gameports +# + +# +# Input core support is needed for joysticks # -# CONFIG_INPUT_ANALOG is not set -# CONFIG_INPUT_A3D is not set -# CONFIG_INPUT_ADI is not set -# CONFIG_INPUT_COBRA is not set -# CONFIG_INPUT_GF2K is not set -# CONFIG_INPUT_GRIP is not set -# CONFIG_INPUT_INTERACT is not set -# CONFIG_INPUT_TMDC is not set -# CONFIG_INPUT_SIDEWINDER is not set -# CONFIG_INPUT_IFORCE_USB is not set -# CONFIG_INPUT_IFORCE_232 is not set -# CONFIG_INPUT_WARRIOR is not set -# CONFIG_INPUT_MAGELLAN is not set -# CONFIG_INPUT_SPACEORB is not set -# CONFIG_INPUT_SPACEBALL is not set -# CONFIG_INPUT_STINGER is not set -# CONFIG_INPUT_DB9 is not set -# CONFIG_INPUT_GAMECON is not set -# CONFIG_INPUT_TURBOGRAFX is not set # CONFIG_QIC02_TAPE is not set # # Watchdog Cards # # CONFIG_WATCHDOG is not set -# CONFIG_INTEL_RNG is not set # CONFIG_NVRAM is not set # CONFIG_RTC is not set # CONFIG_DTLK is not set @@ -484,7 +485,6 @@ # CONFIG_FTAPE is not set # CONFIG_AGP is not set # CONFIG_DRM is not set -# CONFIG_MWAVE is not set # # File systems @@ -494,11 +494,15 @@ CONFIG_AUTOFS4_FS=y CONFIG_REISERFS_FS=y # CONFIG_REISERFS_CHECK is not set +# CONFIG_REISERFS_PROC_INFO is not set # CONFIG_ADFS_FS is not set # CONFIG_ADFS_FS_RW is not set # CONFIG_AFFS_FS is not set # CONFIG_HFS_FS is not set # CONFIG_BFS_FS is not set +CONFIG_EXT3_FS=y +CONFIG_JBD=y +# CONFIG_JBD_DEBUG is not set # CONFIG_FAT_FS is not set # CONFIG_MSDOS_FS is not set # CONFIG_UMSDOS_FS is not set @@ -511,8 +515,10 @@ CONFIG_RAMFS=y CONFIG_ISO9660_FS=y CONFIG_JOLIET=y +# CONFIG_ZISOFS is not set # CONFIG_MINIX_FS is not set # CONFIG_JFS_FS is not set +# CONFIG_JFS_DEBUG is not set # CONFIG_VXFS_FS is not set # CONFIG_NTFS_FS is not set # CONFIG_NTFS_RW is not set @@ -536,6 +542,7 @@ # Network File Systems # # CONFIG_CODA_FS is not set +# CONFIG_INTERMEZZO_FS is not set CONFIG_NFS_FS=y CONFIG_NFS_V3=y # CONFIG_ROOT_NFS is not set @@ -555,6 +562,8 @@ CONFIG_NCPFS_SMALLDOS=y CONFIG_NCPFS_NLS=y CONFIG_NCPFS_EXTRAS=y +# CONFIG_ZISOFS_FS is not set +# CONFIG_ZLIB_FS_INFLATE is not set # # Partition Types @@ -589,6 +598,7 @@ # CONFIG_NLS_CODEPAGE_949 is not set # CONFIG_NLS_CODEPAGE_874 is not set # CONFIG_NLS_ISO8859_8 is not set +# CONFIG_NLS_CODEPAGE_1250 is not set # CONFIG_NLS_CODEPAGE_1251 is not set CONFIG_NLS_ISO8859_1=y # CONFIG_NLS_ISO8859_2 is not set @@ -614,106 +624,6 @@ # USB support # # CONFIG_USB is not set - -# -# USB Controllers -# -# CONFIG_USB_UHCI is not set -# CONFIG_USB_UHCI_ALT is not set -# CONFIG_USB_OHCI is not set - -# -# USB Device Class drivers -# -# CONFIG_USB_AUDIO is not set -# CONFIG_USB_BLUETOOTH is not set -# CONFIG_USB_STORAGE is not set -# CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_DATAFAB is not set -# CONFIG_USB_STORAGE_FREECOM is not set -# CONFIG_USB_STORAGE_ISD200 is not set -# CONFIG_USB_STORAGE_DPCM is not set -# CONFIG_USB_STORAGE_HP8200e is not set -# CONFIG_USB_STORAGE_SDDR09 is not set -# CONFIG_USB_STORAGE_JUMPSHOT is not set -# CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set - -# -# USB Human Interface Devices (HID) -# -# CONFIG_USB_HID is not set -# CONFIG_USB_HIDDEV is not set -# CONFIG_USB_KBD is not set -# CONFIG_USB_MOUSE is not set -# CONFIG_USB_WACOM is not set - -# -# USB Imaging devices -# -# CONFIG_USB_DC2XX is not set -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_SCANNER is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set - -# -# USB Multimedia devices -# -# CONFIG_USB_IBMCAM is not set -# CONFIG_USB_OV511 is not set -# CONFIG_USB_PWC is not set -# CONFIG_USB_SE401 is not set -# CONFIG_USB_DSBR is not set -# CONFIG_USB_DABUSB is not set - -# -# USB Network adaptors -# -# CONFIG_USB_PEGASUS is not set -# CONFIG_USB_KAWETH is not set -# CONFIG_USB_CATC is not set -# CONFIG_USB_CDCETHER is not set -# CONFIG_USB_USBNET is not set - -# -# USB port drivers -# -# CONFIG_USB_USS720 is not set - -# -# USB Serial Converter support -# -# CONFIG_USB_SERIAL is not set -# CONFIG_USB_SERIAL_GENERIC is not set -# CONFIG_USB_SERIAL_BELKIN is not set -# CONFIG_USB_SERIAL_WHITEHEAT is not set -# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -# CONFIG_USB_SERIAL_EMPEG is not set -# CONFIG_USB_SERIAL_FTDI_SIO is not set -# CONFIG_USB_SERIAL_VISOR is not set -# CONFIG_USB_SERIAL_IR is not set -# CONFIG_USB_SERIAL_EDGEPORT is not set -# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set -# CONFIG_USB_SERIAL_KEYSPAN is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XA is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XB is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set -# CONFIG_USB_SERIAL_MCT_U232 is not set -# CONFIG_USB_SERIAL_PL2303 is not set -# CONFIG_USB_SERIAL_CYBERJACK is not set -# CONFIG_USB_SERIAL_XIRCOM is not set -# CONFIG_USB_SERIAL_OMNINET is not set - -# -# USB Miscellaneous drivers -# -# CONFIG_USB_RIO500 is not set # # Kernel hacking diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/configs/pSeries_defconfig linuxppc64_2_4/arch/ppc64/configs/pSeries_defconfig --- ../kernel.org/linux-2.4.19/arch/ppc64/configs/pSeries_defconfig Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/configs/pSeries_defconfig Mon May 6 14:29:20 2002 @@ -25,6 +25,7 @@ CONFIG_IRQ_ALL_CPUS=y # CONFIG_HMT is not set # CONFIG_MSCHUNKS is not set +CONFIG_RTAS_FLASH=m # # Loadable module support @@ -80,6 +81,7 @@ # CONFIG_PARIDE is not set # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_CISS_SCSI_TAPE is not set # CONFIG_BLK_DEV_DAC960 is not set CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_NBD=y @@ -128,6 +130,11 @@ # # CONFIG_IPX is not set # CONFIG_ATALK is not set + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set # CONFIG_DECNET is not set # CONFIG_BRIDGE is not set # CONFIG_X25 is not set @@ -145,6 +152,11 @@ # CONFIG_NET_SCHED is not set # +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# # ATA/IDE/MFM/RLL support # CONFIG_IDE=y @@ -161,6 +173,7 @@ # CONFIG_BLK_DEV_HD is not set # CONFIG_BLK_DEV_IDEDISK is not set # CONFIG_IDEDISK_MULTI_MODE is not set +# CONFIG_IDEDISK_STROKE is not set # CONFIG_BLK_DEV_IDEDISK_VENDOR is not set # CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set # CONFIG_BLK_DEV_IDEDISK_IBM is not set @@ -175,6 +188,7 @@ # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set # # IDE chipset support/bugfixes @@ -186,12 +200,15 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_IDEPCI_SHARE_IRQ is not set CONFIG_BLK_DEV_IDEDMA_PCI=y -CONFIG_BLK_DEV_ADMA=y # CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set # CONFIG_IDEDMA_PCI_AUTO is not set +# CONFIG_IDEDMA_ONLYDISK is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_PCI_WIP is not set +# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set # CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set +CONFIG_BLK_DEV_ADMA=y # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_AEC62XX_TUNING is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -199,6 +216,7 @@ # CONFIG_BLK_DEV_AMD74XX is not set # CONFIG_AMD74XX_OVERRIDE is not set # CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_CMD680 is not set # CONFIG_BLK_DEV_CY82C693 is not set # CONFIG_BLK_DEV_CS5530 is not set # CONFIG_BLK_DEV_HPT34X is not set @@ -349,6 +367,7 @@ # CONFIG_APRICOT is not set # CONFIG_CS89x0 is not set # CONFIG_TULIP is not set +# CONFIG_TC35815 is not set # CONFIG_DE4X5 is not set # CONFIG_DGRS is not set # CONFIG_DM9102 is not set @@ -380,12 +399,12 @@ CONFIG_ACENIC=y # CONFIG_ACENIC_OMIT_TIGON_I is not set # CONFIG_DL2K is not set -CONFIG_E1000=y # CONFIG_MYRI_SBUS is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_SK98LIN is not set +# CONFIG_TIGON3 is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_PLIP is not set @@ -403,6 +422,7 @@ CONFIG_TR=y CONFIG_IBMOL=y # CONFIG_IBMLS is not set +# CONFIG_3C359 is not set # CONFIG_TMS380TR is not set # CONFIG_NET_FC is not set # CONFIG_RCPCI is not set @@ -445,6 +465,7 @@ # CONFIG_FB_RIVA is not set # CONFIG_FB_CLGEN is not set # CONFIG_FB_PM2 is not set +# CONFIG_FB_PM3 is not set # CONFIG_FB_CYBER2000 is not set CONFIG_FB_OF=y # CONFIG_FB_CONTROL is not set @@ -464,6 +485,7 @@ # CONFIG_FB_RADEON is not set # CONFIG_FB_ATY128 is not set # CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_TRIDENT is not set @@ -517,6 +539,7 @@ CONFIG_PSMOUSE=y # CONFIG_82C710_MOUSE is not set # CONFIG_PC110_PAD is not set +# CONFIG_MK712_MOUSE is not set # # Joysticks @@ -536,7 +559,6 @@ # Watchdog Cards # # CONFIG_WATCHDOG is not set -# CONFIG_INTEL_RNG is not set # CONFIG_NVRAM is not set # CONFIG_RTC is not set # CONFIG_RTC is not set @@ -577,7 +599,7 @@ # CONFIG_JFFS2_FS is not set # CONFIG_CRAMFS is not set # CONFIG_TMPFS is not set -# CONFIG_RAMFS is not set +CONFIG_RAMFS=y CONFIG_ISO9660_FS=y # CONFIG_JOLIET is not set # CONFIG_ZISOFS is not set @@ -689,109 +711,6 @@ # USB support # # CONFIG_USB is not set - -# -# USB Controllers -# -# CONFIG_USB_UHCI is not set -# CONFIG_USB_UHCI_ALT is not set -# CONFIG_USB_OHCI is not set - -# -# USB Device Class drivers -# -# CONFIG_USB_AUDIO is not set -# CONFIG_USB_BLUETOOTH is not set -# CONFIG_USB_STORAGE is not set -# CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_DATAFAB is not set -# CONFIG_USB_STORAGE_FREECOM is not set -# CONFIG_USB_STORAGE_ISD200 is not set -# CONFIG_USB_STORAGE_DPCM is not set -# CONFIG_USB_STORAGE_HP8200e is not set -# CONFIG_USB_STORAGE_SDDR09 is not set -# CONFIG_USB_STORAGE_JUMPSHOT is not set -# CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set - -# -# USB Human Interface Devices (HID) -# - -# -# Input core support is needed for USB HID -# - -# -# USB Imaging devices -# -# CONFIG_USB_DC2XX is not set -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_SCANNER is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set - -# -# USB Multimedia devices -# -# CONFIG_USB_IBMCAM is not set -# CONFIG_USB_OV511 is not set -# CONFIG_USB_PWC is not set -# CONFIG_USB_SE401 is not set -# CONFIG_USB_STV680 is not set -# CONFIG_USB_VICAM is not set -# CONFIG_USB_DSBR is not set -# CONFIG_USB_DABUSB is not set - -# -# USB Network adaptors -# -# CONFIG_USB_PEGASUS is not set -# CONFIG_USB_KAWETH is not set -# CONFIG_USB_CATC is not set -# CONFIG_USB_CDCETHER is not set -# CONFIG_USB_USBNET is not set - -# -# USB port drivers -# -# CONFIG_USB_USS720 is not set - -# -# USB Serial Converter support -# -# CONFIG_USB_SERIAL is not set -# CONFIG_USB_SERIAL_GENERIC is not set -# CONFIG_USB_SERIAL_BELKIN is not set -# CONFIG_USB_SERIAL_WHITEHEAT is not set -# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -# CONFIG_USB_SERIAL_EMPEG is not set -# CONFIG_USB_SERIAL_FTDI_SIO is not set -# CONFIG_USB_SERIAL_VISOR is not set -# CONFIG_USB_SERIAL_IPAQ is not set -# CONFIG_USB_SERIAL_IR is not set -# CONFIG_USB_SERIAL_EDGEPORT is not set -# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set -# CONFIG_USB_SERIAL_KEYSPAN is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XA is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XB is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set -# CONFIG_USB_SERIAL_MCT_U232 is not set -# CONFIG_USB_SERIAL_KLSI is not set -# CONFIG_USB_SERIAL_PL2303 is not set -# CONFIG_USB_SERIAL_CYBERJACK is not set -# CONFIG_USB_SERIAL_XIRCOM is not set -# CONFIG_USB_SERIAL_OMNINET is not set - -# -# USB Miscellaneous drivers -# -# CONFIG_USB_RIO500 is not set # # Kernel hacking diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/defconfig linuxppc64_2_4/arch/ppc64/defconfig --- ../kernel.org/linux-2.4.19/arch/ppc64/defconfig Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/defconfig Mon May 13 16:39:10 2002 @@ -25,6 +25,7 @@ CONFIG_IRQ_ALL_CPUS=y # CONFIG_HMT is not set # CONFIG_MSCHUNKS is not set +CONFIG_RTAS_FLASH=m # # Loadable module support @@ -80,7 +81,9 @@ # CONFIG_PARIDE is not set # CONFIG_BLK_CPQ_DA is not set # CONFIG_BLK_CPQ_CISS_DA is not set +# CONFIG_CISS_SCSI_TAPE is not set # CONFIG_BLK_DEV_DAC960 is not set +# CONFIG_BLK_DEV_UMEM is not set CONFIG_BLK_DEV_LOOP=y CONFIG_BLK_DEV_NBD=y CONFIG_BLK_DEV_RAM=y @@ -128,6 +131,11 @@ # # CONFIG_IPX is not set # CONFIG_ATALK is not set + +# +# Appletalk devices +# +# CONFIG_DEV_APPLETALK is not set # CONFIG_DECNET is not set # CONFIG_BRIDGE is not set # CONFIG_X25 is not set @@ -145,6 +153,11 @@ # CONFIG_NET_SCHED is not set # +# Network testing +# +# CONFIG_NET_PKTGEN is not set + +# # ATA/IDE/MFM/RLL support # CONFIG_IDE=y @@ -161,6 +174,7 @@ # CONFIG_BLK_DEV_HD is not set # CONFIG_BLK_DEV_IDEDISK is not set # CONFIG_IDEDISK_MULTI_MODE is not set +# CONFIG_IDEDISK_STROKE is not set # CONFIG_BLK_DEV_IDEDISK_VENDOR is not set # CONFIG_BLK_DEV_IDEDISK_FUJITSU is not set # CONFIG_BLK_DEV_IDEDISK_IBM is not set @@ -175,6 +189,7 @@ # CONFIG_BLK_DEV_IDETAPE is not set # CONFIG_BLK_DEV_IDEFLOPPY is not set # CONFIG_BLK_DEV_IDESCSI is not set +# CONFIG_IDE_TASK_IOCTL is not set # # IDE chipset support/bugfixes @@ -186,12 +201,15 @@ CONFIG_BLK_DEV_IDEPCI=y # CONFIG_IDEPCI_SHARE_IRQ is not set CONFIG_BLK_DEV_IDEDMA_PCI=y -CONFIG_BLK_DEV_ADMA=y # CONFIG_BLK_DEV_OFFBOARD is not set +# CONFIG_BLK_DEV_IDEDMA_FORCED is not set # CONFIG_IDEDMA_PCI_AUTO is not set +# CONFIG_IDEDMA_ONLYDISK is not set CONFIG_BLK_DEV_IDEDMA=y # CONFIG_IDEDMA_PCI_WIP is not set +# CONFIG_BLK_DEV_IDEDMA_TIMEOUT is not set # CONFIG_IDEDMA_NEW_DRIVE_LISTINGS is not set +CONFIG_BLK_DEV_ADMA=y # CONFIG_BLK_DEV_AEC62XX is not set # CONFIG_AEC62XX_TUNING is not set # CONFIG_BLK_DEV_ALI15X3 is not set @@ -199,6 +217,7 @@ # CONFIG_BLK_DEV_AMD74XX is not set # CONFIG_AMD74XX_OVERRIDE is not set # CONFIG_BLK_DEV_CMD64X is not set +# CONFIG_BLK_DEV_CMD680 is not set # CONFIG_BLK_DEV_CY82C693 is not set # CONFIG_BLK_DEV_CS5530 is not set # CONFIG_BLK_DEV_HPT34X is not set @@ -349,6 +368,7 @@ # CONFIG_APRICOT is not set # CONFIG_CS89x0 is not set # CONFIG_TULIP is not set +# CONFIG_TC35815 is not set # CONFIG_DE4X5 is not set # CONFIG_DGRS is not set # CONFIG_DM9102 is not set @@ -380,12 +400,12 @@ CONFIG_ACENIC=y # CONFIG_ACENIC_OMIT_TIGON_I is not set # CONFIG_DL2K is not set -CONFIG_E1000=y # CONFIG_MYRI_SBUS is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set # CONFIG_SK98LIN is not set +# CONFIG_TIGON3 is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_PLIP is not set @@ -403,6 +423,7 @@ CONFIG_TR=y CONFIG_IBMOL=y # CONFIG_IBMLS is not set +# CONFIG_3C359 is not set # CONFIG_TMS380TR is not set # CONFIG_NET_FC is not set # CONFIG_RCPCI is not set @@ -445,6 +466,7 @@ # CONFIG_FB_RIVA is not set # CONFIG_FB_CLGEN is not set # CONFIG_FB_PM2 is not set +# CONFIG_FB_PM3 is not set # CONFIG_FB_CYBER2000 is not set CONFIG_FB_OF=y # CONFIG_FB_CONTROL is not set @@ -464,6 +486,7 @@ # CONFIG_FB_RADEON is not set # CONFIG_FB_ATY128 is not set # CONFIG_FB_SIS is not set +# CONFIG_FB_NEOMAGIC is not set # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_TRIDENT is not set @@ -517,6 +540,7 @@ CONFIG_PSMOUSE=y # CONFIG_82C710_MOUSE is not set # CONFIG_PC110_PAD is not set +# CONFIG_MK712_MOUSE is not set # # Joysticks @@ -536,7 +560,6 @@ # Watchdog Cards # # CONFIG_WATCHDOG is not set -# CONFIG_INTEL_RNG is not set # CONFIG_NVRAM is not set # CONFIG_RTC is not set # CONFIG_RTC is not set @@ -577,7 +600,7 @@ # CONFIG_JFFS2_FS is not set # CONFIG_CRAMFS is not set # CONFIG_TMPFS is not set -# CONFIG_RAMFS is not set +CONFIG_RAMFS=y CONFIG_ISO9660_FS=y # CONFIG_JOLIET is not set # CONFIG_ZISOFS is not set @@ -689,109 +712,6 @@ # USB support # # CONFIG_USB is not set - -# -# USB Controllers -# -# CONFIG_USB_UHCI is not set -# CONFIG_USB_UHCI_ALT is not set -# CONFIG_USB_OHCI is not set - -# -# USB Device Class drivers -# -# CONFIG_USB_AUDIO is not set -# CONFIG_USB_BLUETOOTH is not set -# CONFIG_USB_STORAGE is not set -# CONFIG_USB_STORAGE_DEBUG is not set -# CONFIG_USB_STORAGE_DATAFAB is not set -# CONFIG_USB_STORAGE_FREECOM is not set -# CONFIG_USB_STORAGE_ISD200 is not set -# CONFIG_USB_STORAGE_DPCM is not set -# CONFIG_USB_STORAGE_HP8200e is not set -# CONFIG_USB_STORAGE_SDDR09 is not set -# CONFIG_USB_STORAGE_JUMPSHOT is not set -# CONFIG_USB_ACM is not set -# CONFIG_USB_PRINTER is not set - -# -# USB Human Interface Devices (HID) -# - -# -# Input core support is needed for USB HID -# - -# -# USB Imaging devices -# -# CONFIG_USB_DC2XX is not set -# CONFIG_USB_MDC800 is not set -# CONFIG_USB_SCANNER is not set -# CONFIG_USB_MICROTEK is not set -# CONFIG_USB_HPUSBSCSI is not set - -# -# USB Multimedia devices -# -# CONFIG_USB_IBMCAM is not set -# CONFIG_USB_OV511 is not set -# CONFIG_USB_PWC is not set -# CONFIG_USB_SE401 is not set -# CONFIG_USB_STV680 is not set -# CONFIG_USB_VICAM is not set -# CONFIG_USB_DSBR is not set -# CONFIG_USB_DABUSB is not set - -# -# USB Network adaptors -# -# CONFIG_USB_PEGASUS is not set -# CONFIG_USB_KAWETH is not set -# CONFIG_USB_CATC is not set -# CONFIG_USB_CDCETHER is not set -# CONFIG_USB_USBNET is not set - -# -# USB port drivers -# -# CONFIG_USB_USS720 is not set - -# -# USB Serial Converter support -# -# CONFIG_USB_SERIAL is not set -# CONFIG_USB_SERIAL_GENERIC is not set -# CONFIG_USB_SERIAL_BELKIN is not set -# CONFIG_USB_SERIAL_WHITEHEAT is not set -# CONFIG_USB_SERIAL_DIGI_ACCELEPORT is not set -# CONFIG_USB_SERIAL_EMPEG is not set -# CONFIG_USB_SERIAL_FTDI_SIO is not set -# CONFIG_USB_SERIAL_VISOR is not set -# CONFIG_USB_SERIAL_IPAQ is not set -# CONFIG_USB_SERIAL_IR is not set -# CONFIG_USB_SERIAL_EDGEPORT is not set -# CONFIG_USB_SERIAL_KEYSPAN_PDA is not set -# CONFIG_USB_SERIAL_KEYSPAN is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XA is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA28XB is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19 is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA18X is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA19W is not set -# CONFIG_USB_SERIAL_KEYSPAN_USA49W is not set -# CONFIG_USB_SERIAL_MCT_U232 is not set -# CONFIG_USB_SERIAL_KLSI is not set -# CONFIG_USB_SERIAL_PL2303 is not set -# CONFIG_USB_SERIAL_CYBERJACK is not set -# CONFIG_USB_SERIAL_XIRCOM is not set -# CONFIG_USB_SERIAL_OMNINET is not set - -# -# USB Miscellaneous drivers -# -# CONFIG_USB_RIO500 is not set # # Kernel hacking diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/Makefile linuxppc64_2_4/arch/ppc64/kernel/Makefile --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/Makefile Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/Makefile Mon May 6 14:29:20 2002 @@ -40,6 +40,8 @@ obj-y += rtasd.o nvram.o endif +obj-$(CONFIG_RTAS_FLASH) += rtas_flash.o + obj-$(CONFIG_KGDB) += ppc-stub.o obj-$(CONFIG_SMP) += smp.o diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/chrp_setup.c linuxppc64_2_4/arch/ppc64/kernel/chrp_setup.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/chrp_setup.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/chrp_setup.c Thu Apr 25 20:45:03 2002 @@ -259,11 +259,9 @@ if(naca->interrupt_controller == IC_OPEN_PIC) { ppc_md.init_IRQ = openpic_init_IRQ; ppc_md.get_irq = openpic_get_irq; - ppc_md.post_irq = NULL; } else { ppc_md.init_IRQ = xics_init_IRQ; ppc_md.get_irq = xics_get_irq; - ppc_md.post_irq = NULL; } ppc_md.init_ras_IRQ = init_ras_IRQ; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ioctl32.c linuxppc64_2_4/arch/ppc64/kernel/ioctl32.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ioctl32.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/ioctl32.c Thu Apr 25 22:03:30 2002 @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -102,6 +103,7 @@ #include #include #include +#include #include #include @@ -469,6 +471,7 @@ return -ENODEV; strcpy(ifr32.ifr_name, dev->name); + dev_put(dev); err = copy_to_user((struct ifreq32 *)arg, &ifr32, sizeof(struct ifreq32)); return (err ? -EFAULT : 0); @@ -1157,6 +1160,234 @@ return err; } +typedef struct sg_io_hdr32 { + s32 interface_id; /* [i] 'S' for SCSI generic (required) */ + s32 dxfer_direction; /* [i] data transfer direction */ + u8 cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ + u8 mx_sb_len; /* [i] max length to write to sbp */ + u16 iovec_count; /* [i] 0 implies no scatter gather */ + u32 dxfer_len; /* [i] byte count of data transfer */ + u32 dxferp; /* [i], [*io] points to data transfer memory + or scatter gather list */ + u32 cmdp; /* [i], [*i] points to command to perform */ + u32 sbp; /* [i], [*o] points to sense_buffer memory */ + u32 timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ + u32 flags; /* [i] 0 -> default, see SG_FLAG... */ + s32 pack_id; /* [i->o] unused internally (normally) */ + u32 usr_ptr; /* [i->o] unused internally */ + u8 status; /* [o] scsi status */ + u8 masked_status; /* [o] shifted, masked scsi status */ + u8 msg_status; /* [o] messaging level data (optional) */ + u8 sb_len_wr; /* [o] byte count actually written to sbp */ + u16 host_status; /* [o] errors from host adapter */ + u16 driver_status; /* [o] errors from software driver */ + s32 resid; /* [o] dxfer_len - actual_transferred */ + u32 duration; /* [o] time taken by cmd (unit: millisec) */ + u32 info; /* [o] auxiliary information */ +} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ + +typedef struct sg_iovec32 { + u32 iov_base; + u32 iov_len; +} sg_iovec32_t; + +static int alloc_sg_iovec(sg_io_hdr_t *sgp, u32 uptr32) +{ + sg_iovec32_t *uiov = (sg_iovec32_t *) A(uptr32); + sg_iovec_t *kiov; + int i; + + sgp->dxferp = kmalloc(sgp->iovec_count * + sizeof(sg_iovec_t), GFP_KERNEL); + if (!sgp->dxferp) + return -ENOMEM; + memset(sgp->dxferp, 0, + sgp->iovec_count * sizeof(sg_iovec_t)); + + kiov = (sg_iovec_t *) sgp->dxferp; + for (i = 0; i < sgp->iovec_count; i++) { + u32 iov_base32; + if (__get_user(iov_base32, &uiov->iov_base) || + __get_user(kiov->iov_len, &uiov->iov_len)) + return -EFAULT; + + kiov->iov_base = kmalloc(kiov->iov_len, GFP_KERNEL); + if (!kiov->iov_base) + return -ENOMEM; + if (copy_from_user(kiov->iov_base, + (void *) A(iov_base32), + kiov->iov_len)) + return -EFAULT; + + uiov++; + kiov++; + } + + return 0; +} + +static int copy_back_sg_iovec(sg_io_hdr_t *sgp, u32 uptr32) +{ + sg_iovec32_t *uiov = (sg_iovec32_t *) A(uptr32); + sg_iovec_t *kiov = (sg_iovec_t *) sgp->dxferp; + int i; + + for (i = 0; i < sgp->iovec_count; i++) { + u32 iov_base32; + + if (__get_user(iov_base32, &uiov->iov_base)) + return -EFAULT; + + if (copy_to_user((void *) A(iov_base32), + kiov->iov_base, + kiov->iov_len)) + return -EFAULT; + + uiov++; + kiov++; + } + + return 0; +} + +static void free_sg_iovec(sg_io_hdr_t *sgp) +{ + sg_iovec_t *kiov = (sg_iovec_t *) sgp->dxferp; + int i; + + for (i = 0; i < sgp->iovec_count; i++) { + if (kiov->iov_base) { + kfree(kiov->iov_base); + kiov->iov_base = NULL; + } + kiov++; + } + kfree(sgp->dxferp); + sgp->dxferp = NULL; +} + +static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) +{ + sg_io_hdr32_t *sg_io32; + sg_io_hdr_t sg_io64; + u32 dxferp32, cmdp32, sbp32; + mm_segment_t old_fs; + int err = 0; + + sg_io32 = (sg_io_hdr32_t *)arg; + err = __get_user(sg_io64.interface_id, &sg_io32->interface_id); + err |= __get_user(sg_io64.dxfer_direction, &sg_io32->dxfer_direction); + err |= __get_user(sg_io64.cmd_len, &sg_io32->cmd_len); + err |= __get_user(sg_io64.mx_sb_len, &sg_io32->mx_sb_len); + err |= __get_user(sg_io64.iovec_count, &sg_io32->iovec_count); + err |= __get_user(sg_io64.dxfer_len, &sg_io32->dxfer_len); + err |= __get_user(sg_io64.timeout, &sg_io32->timeout); + err |= __get_user(sg_io64.flags, &sg_io32->flags); + err |= __get_user(sg_io64.pack_id, &sg_io32->pack_id); + + sg_io64.dxferp = NULL; + sg_io64.cmdp = NULL; + sg_io64.sbp = NULL; + + err |= __get_user(cmdp32, &sg_io32->cmdp); + sg_io64.cmdp = kmalloc(sg_io64.cmd_len, GFP_KERNEL); + if (!sg_io64.cmdp) { + err = -ENOMEM; + goto out; + } + if (copy_from_user(sg_io64.cmdp, + (void *) A(cmdp32), + sg_io64.cmd_len)) { + err = -EFAULT; + goto out; + } + + err |= __get_user(sbp32, &sg_io32->sbp); + sg_io64.sbp = kmalloc(sg_io64.mx_sb_len, GFP_KERNEL); + if (!sg_io64.sbp) { + err = -ENOMEM; + goto out; + } + if (copy_from_user(sg_io64.sbp, + (void *) A(sbp32), + sg_io64.mx_sb_len)) { + err = -EFAULT; + goto out; + } + + err |= __get_user(dxferp32, &sg_io32->dxferp); + if (sg_io64.iovec_count) { + int ret; + + if ((ret = alloc_sg_iovec(&sg_io64, dxferp32))) { + err = ret; + goto out; + } + } else { + sg_io64.dxferp = kmalloc(sg_io64.dxfer_len, GFP_KERNEL); + if (!sg_io64.dxferp) { + err = -ENOMEM; + goto out; + } + if (copy_from_user(sg_io64.dxferp, + (void *) A(dxferp32), + sg_io64.dxfer_len)) { + err = -EFAULT; + goto out; + } + } + + /* Unused internally, do not even bother to copy it over. */ + sg_io64.usr_ptr = NULL; + + if (err) + return -EFAULT; + + old_fs = get_fs(); + set_fs (KERNEL_DS); + err = sys_ioctl (fd, cmd, (unsigned long) &sg_io64); + set_fs (old_fs); + + if (err < 0) + goto out; + + err = __put_user(sg_io64.pack_id, &sg_io32->pack_id); + err |= __put_user(sg_io64.status, &sg_io32->status); + err |= __put_user(sg_io64.masked_status, &sg_io32->masked_status); + err |= __put_user(sg_io64.msg_status, &sg_io32->msg_status); + err |= __put_user(sg_io64.sb_len_wr, &sg_io32->sb_len_wr); + err |= __put_user(sg_io64.host_status, &sg_io32->host_status); + err |= __put_user(sg_io64.driver_status, &sg_io32->driver_status); + err |= __put_user(sg_io64.resid, &sg_io32->resid); + err |= __put_user(sg_io64.duration, &sg_io32->duration); + err |= __put_user(sg_io64.info, &sg_io32->info); + err |= copy_to_user((void *)A(sbp32), sg_io64.sbp, sg_io64.mx_sb_len); + if (sg_io64.dxferp) { + if (sg_io64.iovec_count) + err |= copy_back_sg_iovec(&sg_io64, dxferp32); + else + err |= copy_to_user((void *)A(dxferp32), + sg_io64.dxferp, + sg_io64.dxfer_len); + } + if (err) + err = -EFAULT; + +out: + if (sg_io64.cmdp) + kfree(sg_io64.cmdp); + if (sg_io64.sbp) + kfree(sg_io64.sbp); + if (sg_io64.dxferp) { + if (sg_io64.iovec_count) { + free_sg_iovec(&sg_io64); + } else { + kfree(sg_io64.dxferp); + } + } + return err; +} + struct ppp_option_data32 { __kernel_caddr_t32 ptr; __u32 length; @@ -2161,12 +2392,11 @@ if (l->lv_block_exception) { lbe32 = (lv_block_exception32_t *)A(ptr2); memset(lbe, 0, size); - for (i = 0; i < l->lv_remap_end; i++, lbe++, lbe32++) { - err |= get_user(lbe->rsector_org, &lbe32->rsector_org); - err |= __get_user(lbe->rdev_org, &lbe32->rdev_org); - err |= __get_user(lbe->rsector_new, &lbe32->rsector_new); - err |= __get_user(lbe->rdev_new, &lbe32->rdev_new); - + for (i = 0; i < l->lv_remap_end; i++, lbe++, lbe32++) { + err |= get_user(lbe->rsector_org, &lbe32->rsector_org); + err |= __get_user(lbe->rdev_org, &lbe32->rdev_org); + err |= __get_user(lbe->rsector_new, &lbe32->rsector_new); + err |= __get_user(lbe->rdev_new, &lbe32->rdev_new); } } } @@ -2222,16 +2452,17 @@ switch (cmd) { case VG_STATUS: v = kmalloc(sizeof(vg_t), GFP_KERNEL); - if (!v) return -ENOMEM; + if (!v) + return -ENOMEM; karg = v; break; case VG_CREATE_OLD: case VG_CREATE: v = kmalloc(sizeof(vg_t), GFP_KERNEL); - if (!v) return -ENOMEM; - if (copy_from_user(v, (void *)arg, (long)&((vg32_t *)0)->proc) || - __get_user(v->proc, &((vg32_t *)arg)->proc)) { + if (!v) + return -ENOMEM; + if (copy_from_user(v, (void *)arg, (long)&((vg32_t *)0)->proc)) { kfree(v); return -EFAULT; } @@ -2248,39 +2479,46 @@ return -EPERM; for (i = 0; i < v->pv_max; i++) { err = __get_user(ptr, &((vg32_t *)arg)->pv[i]); - if (err) break; + if (err) + break; if (ptr) { v->pv[i] = kmalloc(sizeof(pv_t), GFP_KERNEL); if (!v->pv[i]) { err = -ENOMEM; break; } - err = copy_from_user(v->pv[i], (void *)A(ptr), sizeof(pv32_t) - 8 - UUID_LEN+1); + err = copy_from_user(v->pv[i], (void *)A(ptr), + sizeof(pv32_t) - 8 - UUID_LEN+1); if (err) { err = -EFAULT; break; } - err = copy_from_user(v->pv[i]->pv_uuid, ((pv32_t *)A(ptr))->pv_uuid, UUID_LEN+1); + err = copy_from_user(v->pv[i]->pv_uuid, + ((pv32_t *)A(ptr))->pv_uuid, + UUID_LEN+1); if (err) { err = -EFAULT; break; } - - v->pv[i]->pe = NULL; v->pv[i]->inode = NULL; + v->pv[i]->pe = NULL; + v->pv[i]->bd = NULL; } } if (!err) { for (i = 0; i < v->lv_max; i++) { err = __get_user(ptr, &((vg32_t *)arg)->lv[i]); - if (err) break; + if (err) + break; if (ptr) { v->lv[i] = get_lv_t(ptr, &err); - if (err) break; + if (err) + break; } } } break; + case LV_CREATE: case LV_EXTEND: case LV_REDUCE: @@ -2288,54 +2526,70 @@ case LV_RENAME: case LV_STATUS_BYNAME: err = copy_from_user(&u.pv_status, arg, sizeof(u.pv_status.pv_name)); - if (err) return -EFAULT; + if (err) + return -EFAULT; if (cmd != LV_REMOVE) { err = __get_user(ptr, &((lv_req32_t *)arg)->lv); - if (err) return err; + if (err) + return err; u.lv_req.lv = get_lv_t(ptr, &err); } else u.lv_req.lv = NULL; break; - case LV_STATUS_BYINDEX: - err = get_user(u.lv_byindex.lv_index, &((lv_status_byindex_req32_t *)arg)->lv_index); + err = get_user(u.lv_byindex.lv_index, + &((lv_status_byindex_req32_t *)arg)->lv_index); err |= __get_user(ptr, &((lv_status_byindex_req32_t *)arg)->lv); - if (err) return err; + if (err) + return err; u.lv_byindex.lv = get_lv_t(ptr, &err); break; + case LV_STATUS_BYDEV: err = get_user(u.lv_bydev.dev, &((lv_status_bydev_req32_t *)arg)->dev); + err |= __get_user(ptr, &((lv_status_bydev_req32_t *)arg)->lv); + if (err) + return err; u.lv_bydev.lv = get_lv_t(ptr, &err); - if (err) return err; - u.lv_bydev.lv = &p; - p.pe = NULL; p.inode = NULL; - break; + break; + case VG_EXTEND: err = copy_from_user(&p, (void *)arg, sizeof(pv32_t) - 8 - UUID_LEN+1); - if (err) return -EFAULT; + if (err) + return -EFAULT; err = copy_from_user(p.pv_uuid, ((pv32_t *)arg)->pv_uuid, UUID_LEN+1); - if (err) return -EFAULT; - p.pe = NULL; p.inode = NULL; + if (err) + return -EFAULT; + p.pe = NULL; + p.bd = NULL; karg = &p; break; + case PV_CHANGE: case PV_STATUS: err = copy_from_user(&u.pv_status, arg, sizeof(u.lv_req.lv_name)); - if (err) return -EFAULT; + if (err) + return -EFAULT; err = __get_user(ptr, &((pv_status_req32_t *)arg)->pv); - if (err) return err; + if (err) + return err; u.pv_status.pv = &p; if (cmd == PV_CHANGE) { - err = copy_from_user(&p, (void *)A(ptr), sizeof(pv32_t) - 8 - UUID_LEN+1); - if (err) return -EFAULT; - p.pe = NULL; p.inode = NULL; + err = copy_from_user(&p, (void *)A(ptr), + sizeof(pv32_t) - 8 - UUID_LEN+1); + if (err) + return -EFAULT; + p.pe = NULL; + p.bd = NULL; } break; - } + }; + old_fs = get_fs(); set_fs (KERNEL_DS); err = sys_ioctl (fd, cmd, (unsigned long)karg); set_fs (old_fs); + switch (cmd) { case VG_STATUS: if (!err) { @@ -2361,17 +2615,23 @@ } kfree(v); break; + case LV_STATUS_BYNAME: - if (!err && u.lv_req.lv) err = copy_lv_t(ptr, u.lv_req.lv); + if (!err && u.lv_req.lv) + err = copy_lv_t(ptr, u.lv_req.lv); /* Fall through */ + case LV_CREATE: case LV_EXTEND: case LV_REDUCE: - if (u.lv_req.lv) put_lv_t(u.lv_req.lv); + if (u.lv_req.lv) + put_lv_t(u.lv_req.lv); break; + case LV_STATUS_BYINDEX: if (u.lv_byindex.lv) { - if (!err) err = copy_lv_t(ptr, u.lv_byindex.lv); + if (!err) + err = copy_lv_t(ptr, u.lv_byindex.lv); put_lv_t(u.lv_byindex.lv); } break; @@ -2387,9 +2647,11 @@ case PV_STATUS: if (!err) { err = copy_to_user((void *)A(ptr), &p, sizeof(pv32_t) - 8 - UUID_LEN+1); - if (err) return -EFAULT; + if (err) + return -EFAULT; err = copy_to_user(((pv_t *)A(ptr))->pv_uuid, p.pv_uuid, UUID_LEN + 1); - if (err) return -EFAULT; + if (err) + return -EFAULT; } break; }; @@ -3435,119 +3697,6 @@ return ((0 == ret) ? 0 : -EFAULT); } -struct sg_io_hdr_32 -{ - int interface_id; - int dxfer_direction; - unsigned char cmd_len; - unsigned char mx_sb_len; - unsigned short iovec_count; - unsigned int dxfer_len; - u32 dxferp; - u32 cmdp; - u32 sbp; - unsigned int timeout; - unsigned int flags; - int pack_id; - u32 usr_ptr; - unsigned char status; - unsigned char masked_status; - unsigned char msg_status; - unsigned char sb_len_wr; - unsigned short host_status; - unsigned short driver_status; - int resid; - unsigned int duration; - unsigned int info; -}; - -static int do_sg_io(unsigned int fd, unsigned int cmd, unsigned long arg) -{ - struct sg_io_hdr *sg = kmalloc(sizeof(struct sg_io_hdr), GFP_KERNEL); - struct sg_io_hdr_32 *sg_32 = (struct sg_io_hdr_32 *)arg; - u32 dxferp_32; - u32 cmdp_32; - u32 sbp_32; - u32 usr_ptr_32; - int ret = -EFAULT; - int err; - mm_segment_t old_fs = get_fs(); - - if (!sg) - return -ENOMEM; - - memset(sg, 0, sizeof(*sg)); - - err = copy_from_user(sg, sg_32, offsetof(struct sg_io_hdr, dxferp)); - err |= __get_user(dxferp_32, &sg_32->dxferp); - err |= __get_user(cmdp_32, &sg_32->cmdp); - err |= __get_user(sbp_32, &sg_32->sbp); - - if (err) - goto error; - - sg->dxferp = (void *)A(dxferp_32); - sg->cmdp = (void *)A(cmdp_32); - sg->sbp = (void *)A(sbp_32); - - err = __copy_from_user(&sg->timeout, &sg_32->timeout, - (long)&sg->usr_ptr - (long)&sg->timeout); - - err |= __get_user(usr_ptr_32, &sg_32->usr_ptr); - - if (err) - goto error; - - sg->usr_ptr = (void *)A(usr_ptr_32); - - err = __copy_from_user(&sg->status, &sg_32->status, - sizeof(struct sg_io_hdr) - - offsetof(struct sg_io_hdr, status)); - - if (err) - goto error; - - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, cmd, (unsigned long)sg); - set_fs(old_fs); - - err = copy_to_user(sg_32, sg, offsetof(struct sg_io_hdr, dxferp)); - - dxferp_32 = (unsigned long)sg->dxferp; - cmdp_32 = (unsigned long)sg->cmdp; - sbp_32 = (unsigned long)sg->sbp; - err |= __put_user(dxferp_32, &sg_32->dxferp); - err |= __put_user(cmdp_32, &sg_32->cmdp); - err |= __put_user(sbp_32, &sg_32->sbp); - - if (err) { - ret = -EFAULT; - goto error; - } - - err = __copy_to_user(&sg_32->timeout, &sg->timeout, - (long)&sg->usr_ptr - (long)&sg->timeout); - - usr_ptr_32 = (unsigned long)sg->usr_ptr; - err |= __put_user(usr_ptr_32, &sg_32->usr_ptr); - - if (err) { - ret = -EFAULT; - goto error; - } - - err = __copy_to_user(&sg_32->status, &sg->status, - sizeof(struct sg_io_hdr) - - offsetof(struct sg_io_hdr, status)); - - if (err) - ret = -EFAULT; - -error: - kfree(sg); - return ret; -} - struct ioctl_trans { unsigned long cmd; unsigned long handler; @@ -3736,6 +3885,12 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_TAGGED_DISABLE), COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER), COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND), +/* Big T */ +COMPATIBLE_IOCTL(TUNSETNOCSUM), +COMPATIBLE_IOCTL(TUNSETDEBUG), +COMPATIBLE_IOCTL(TUNSETIFF), +COMPATIBLE_IOCTL(TUNSETPERSIST), +COMPATIBLE_IOCTL(TUNSETOWNER), /* Big V */ COMPATIBLE_IOCTL(VT_SETMODE), COMPATIBLE_IOCTL(VT_GETMODE), @@ -3812,6 +3967,8 @@ COMPATIBLE_IOCTL(SIOCDRARP), COMPATIBLE_IOCTL(SIOCADDDLCI), COMPATIBLE_IOCTL(SIOCDELDLCI), +COMPATIBLE_IOCTL(SIOCGIFVLAN), +COMPATIBLE_IOCTL(SIOCSIFVLAN), /* SG stuff */ COMPATIBLE_IOCTL(SG_SET_TIMEOUT), COMPATIBLE_IOCTL(SG_GET_TIMEOUT), @@ -4145,6 +4302,13 @@ COMPATIBLE_IOCTL(WDIOC_GETTEMP), COMPATIBLE_IOCTL(WDIOC_SETOPTIONS), COMPATIBLE_IOCTL(WDIOC_KEEPALIVE), +/* Big R */ +COMPATIBLE_IOCTL(RNDGETENTCNT), +COMPATIBLE_IOCTL(RNDADDTOENTCNT), +COMPATIBLE_IOCTL(RNDGETPOOL), +COMPATIBLE_IOCTL(RNDADDENTROPY), +COMPATIBLE_IOCTL(RNDZAPENTCNT), +COMPATIBLE_IOCTL(RNDCLEARPOOL), /* Bluetooth ioctls */ COMPATIBLE_IOCTL(HCIDEVUP), COMPATIBLE_IOCTL(HCIDEVDOWN), @@ -4264,6 +4428,7 @@ HANDLE_IOCTL(FDPOLLDRVSTAT32, fd_ioctl_trans), HANDLE_IOCTL(FDGETFDCSTAT32, fd_ioctl_trans), HANDLE_IOCTL(FDWERRORGET32, fd_ioctl_trans), +HANDLE_IOCTL(SG_IO,sg_ioctl_trans), HANDLE_IOCTL(PPPIOCGIDLE32, ppp_ioctl_trans), HANDLE_IOCTL(PPPIOCSCOMPRESS32, ppp_ioctl_trans), HANDLE_IOCTL(MTIOCGET32, mt_ioctl_trans), @@ -4358,7 +4523,6 @@ HANDLE_IOCTL(USBDEVFS_REAPURB32, do_usbdevfs_reapurb), HANDLE_IOCTL(USBDEVFS_REAPURBNDELAY32, do_usbdevfs_reapurb), HANDLE_IOCTL(USBDEVFS_DISCSIGNAL32, do_usbdevfs_discsignal), -HANDLE_IOCTL(SG_IO, do_sg_io), }; unsigned long ioctl32_hash_table[1024]; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/irq.c linuxppc64_2_4/arch/ppc64/kernel/irq.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/irq.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/irq.c Fri May 3 03:15:52 2002 @@ -99,7 +99,7 @@ * this needs to be removed. * -- Cort */ -#define IRQ_KMALLOC_ENTRIES 8 +#define IRQ_KMALLOC_ENTRIES 16 static int cache_bitmask = 0; static struct irqaction malloc_cache[IRQ_KMALLOC_ENTRIES]; extern int mem_init_done; @@ -554,58 +554,55 @@ spin_unlock(&desc->lock); } -int do_IRQ(struct pt_regs *regs, int isfake) +int do_IRQ(struct pt_regs *regs) { int cpu = smp_processor_id(); - int irq; + int irq, first = 1; +#ifdef CONFIG_PPC_ISERIES struct paca_struct *lpaca; - struct ItLpQueue * lpq; - - /* if(cpu) udbg_printf("Entering do_IRQ\n"); */ + struct ItLpQueue *lpq; +#endif - irq_enter(cpu); + irq_enter(cpu); - if (naca->platform != PLATFORM_ISERIES_LPAR) { - - /* every arch is required to have a get_irq -- Cort */ - irq = ppc_md.get_irq( regs ); - - if ( irq >= 0 ) { - ppc_irq_dispatch_handler( regs, irq ); - if (ppc_md.post_irq) - ppc_md.post_irq( regs, irq ); - } else { - /* -2 means ignore, already handled */ - if (irq != -2) { - printk(KERN_DEBUG "Bogus interrupt %d from PC = %lx\n", - irq, regs->nip); - ppc_spurious_interrupts++; - } - } - } - /* if on iSeries partition */ - else { - lpaca = get_paca(); +#ifdef CONFIG_PPC_ISERIES + lpaca = get_paca(); #ifdef CONFIG_SMP - if ( lpaca->xLpPaca.xIntDword.xFields.xIpiCnt ) { - lpaca->xLpPaca.xIntDword.xFields.xIpiCnt = 0; - iSeries_smp_message_recv( regs ); - } -#endif /* CONFIG_SMP */ - lpq = lpaca->lpQueuePtr; - if ( lpq && ItLpQueue_isLpIntPending( lpq ) ) - lpEvent_count += ItLpQueue_process( lpq, regs ); + if (lpaca->xLpPaca.xIntDword.xFields.xIpiCnt) { + lpaca->xLpPaca.xIntDword.xFields.xIpiCnt = 0; + iSeries_smp_message_recv(regs); } - +#endif /* CONFIG_SMP */ + lpq = lpaca->lpQueuePtr; + if (lpq && ItLpQueue_isLpIntPending(lpq)) + lpEvent_count += ItLpQueue_process(lpq, regs); +#else + /* + * Every arch is required to implement ppc_md.get_irq. + * This function will either return an irq number or -1 to + * indicate there are no more pending. But the first time + * through the loop this means there wasn't an IRQ pending. + * The value -2 is for buggy hardware and means that this IRQ + * has already been handled. -- Tom + */ + while ((irq = ppc_md.get_irq(regs)) >= 0) { + ppc_irq_dispatch_handler(regs, irq); + first = 0; + } + if (irq != -2 && first) + /* That's not SMP safe ... but who cares ? */ + ppc_spurious_interrupts++; +#endif + irq_exit(cpu); - if (naca->platform == PLATFORM_ISERIES_LPAR) { - if ( lpaca->xLpPaca.xIntDword.xFields.xDecrInt ) { - lpaca->xLpPaca.xIntDword.xFields.xDecrInt = 0; - /* Signal a fake decrementer interrupt */ - timer_interrupt( regs ); - } +#ifdef CONFIG_PPC_ISERIES + if (lpaca->xLpPaca.xIntDword.xFields.xDecrInt) { + lpaca->xLpPaca.xIntDword.xFields.xDecrInt = 0; + /* Signal a fake decrementer interrupt */ + timer_interrupt(regs); } +#endif if (softirq_pending(cpu)) do_softirq(); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/open_pic.c linuxppc64_2_4/arch/ppc64/kernel/open_pic.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/open_pic.c Fri Apr 19 11:00:33 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/open_pic.c Thu Apr 25 20:45:03 2002 @@ -47,7 +47,6 @@ OpenPIC_SourcePtr ISU[OPENPIC_MAX_ISU]; static void openpic_end_irq(unsigned int irq_nr); -static void openpic_ack_irq(unsigned int irq_nr); static void openpic_set_affinity(unsigned int irq_nr, unsigned long cpumask); struct hw_interrupt_type open_pic = { @@ -56,14 +55,13 @@ NULL, openpic_enable_irq, openpic_disable_irq, - openpic_ack_irq, + NULL, openpic_end_irq, openpic_set_affinity }; #ifdef CONFIG_SMP static void openpic_end_ipi(unsigned int irq_nr); -static void openpic_ack_ipi(unsigned int irq_nr); static void openpic_enable_ipi(unsigned int irq_nr); static void openpic_disable_ipi(unsigned int irq_nr); @@ -73,9 +71,9 @@ NULL, openpic_enable_ipi, openpic_disable_ipi, - openpic_ack_ipi, + NULL, openpic_end_ipi, - 0 + NULL }; #endif /* CONFIG_SMP */ @@ -756,13 +754,6 @@ (sense ? OPENPIC_SENSE_LEVEL : 0)); } -/* No spinlocks, should not be necessary with the OpenPIC - * (1 register = 1 interrupt and we have the desc lock). - */ -static void openpic_ack_irq(unsigned int irq_nr) -{ -} - static void openpic_end_irq(unsigned int irq_nr) { if ((irq_desc[irq_nr].status & IRQ_LEVEL) != 0) @@ -775,10 +766,6 @@ } #ifdef CONFIG_SMP -static void openpic_ack_ipi(unsigned int irq_nr) -{ -} - static void openpic_end_ipi(unsigned int irq_nr) { /* IPIs are marked IRQ_PER_CPU. This has the side effect of @@ -825,4 +812,3 @@ irq = -1; return irq; } - diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ppc_ksyms.c linuxppc64_2_4/arch/ppc64/kernel/ppc_ksyms.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ppc_ksyms.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/ppc_ksyms.c Mon May 6 14:29:20 2002 @@ -40,15 +40,15 @@ #include #include #include -#ifdef CONFIG_SMP #include -#endif /* CONFIG_SMP */ #ifdef CONFIG_PPC_ISERIES #include #include #include #include #include +#else +#include #endif /* Tell string.h we don't want memcpy etc. as cpp defines */ @@ -237,6 +237,13 @@ EXPORT_SYMBOL(machine_is_compatible); EXPORT_SYMBOL(find_all_nodes); EXPORT_SYMBOL(get_property); + +#ifdef CONFIG_PPC_PSERIES +EXPORT_SYMBOL(rtas_proc_dir); +EXPORT_SYMBOL(rtas_firmware_flash_list); +EXPORT_SYMBOL(rtas_token); +EXPORT_SYMBOL(rtas_call); +#endif #ifndef CONFIG_PPC_ISERIES EXPORT_SYMBOL(kd_mksound); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/proc_pmc.c linuxppc64_2_4/arch/ppc64/kernel/proc_pmc.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/proc_pmc.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/proc_pmc.c Mon May 6 14:29:20 2002 @@ -41,6 +41,7 @@ #include #include #include +#include /* pci Flight Recorder AHT */ extern void proc_pciFr_init(struct proc_dir_entry *proc_ppc64_root); @@ -115,6 +116,9 @@ for (i = 0; i < naca->processorCount; i++) proc_ppc64_create_paca(i, ent); } + + /* Placeholder for rtas interfaces. */ + rtas_proc_dir = proc_mkdir("rtas", proc_ppc64_root); /* Create the /proc/ppc64/pcifr for the Pci Flight Recorder. */ proc_pciFr_init(proc_ppc64_root); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/process.c linuxppc64_2_4/arch/ppc64/kernel/process.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/process.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/process.c Thu Apr 25 20:09:58 2002 @@ -278,87 +278,45 @@ current->thread.fpscr = 0; } -asmlinkage int sys_clone(int p1, int p2, int p3, int p4, int p5, int p6, - struct pt_regs *regs) +int sys_clone(int p1, int p2, int p3, int p4, int p5, int p6, + struct pt_regs *regs) { - unsigned long clone_flags = p1; - int res; - - PPCDBG(PPCDBG_SYS64, "sys_clone - entered - pid=%ld current=%lx comm=%s \n", current->pid, current, current->comm); - - res = do_fork(clone_flags, regs->gpr[1], regs, 0); -#ifdef CONFIG_SMP - /* When we clone the idle task we keep the same pid but - * the return value of 0 for both causes problems. - * -- Cort - */ - if ((current->pid == 0) && (current == &init_task)) - res = 1; -#endif /* CONFIG_SMP */ - - PPCDBG(PPCDBG_SYS64, "sys_clone - exited - pid=%ld current=%lx comm=%s \n", current->pid, current, current->comm); - - return res; + return do_fork(p1, regs->gpr[1], regs, 0); } -asmlinkage int sys_fork(int p1, int p2, int p3, int p4, int p5, int p6, - struct pt_regs *regs) +int sys_fork(int p1, int p2, int p3, int p4, int p5, int p6, + struct pt_regs *regs) { - int res; - - PPCDBG(PPCDBG_SYS64, "sys_fork - entered - pid=%ld comm=%s \n", current->pid, current->comm); - - res = do_fork(SIGCHLD, regs->gpr[1], regs, 0); - -#ifdef CONFIG_SMP - /* When we clone the idle task we keep the same pid but - * the return value of 0 for both causes problems. - * -- Cort - */ - if ((current->pid == 0) && (current == &init_task)) - res = 1; -#endif /* CONFIG_SMP */ - - PPCDBG(PPCDBG_SYS64, "sys_fork - exited - pid=%ld comm=%s \n", current->pid, current->comm); - - return res; + return do_fork(SIGCHLD, regs->gpr[1], regs, 0); } -asmlinkage int sys_vfork(int p1, int p2, int p3, int p4, int p5, int p6, +int sys_vfork(int p1, int p2, int p3, int p4, int p5, int p6, struct pt_regs *regs) { - PPCDBG(PPCDBG_SYS64, "sys_vfork - running - pid=%ld current=%lx comm=%s \n", current->pid, current, current->comm); - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->gpr[1], regs, 0); } -asmlinkage int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, - unsigned long a3, unsigned long a4, unsigned long a5, - struct pt_regs *regs) +int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4, unsigned long a5, + struct pt_regs *regs) { int error; char * filename; - - PPCDBG(PPCDBG_SYS64, "sys_execve - entered - pid=%ld current=%lx comm=%s \n", current->pid, current, current->comm); - + filename = getname((char *) a0); error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; if (regs->msr & MSR_FP) giveup_fpu(current); - - PPCDBG(PPCDBG_SYS64, "sys_execve - before do_execve : filename = %s\n", filename); - + error = do_execve(filename, (char **) a1, (char **) a2, regs); - + if (error == 0) current->ptrace &= ~PT_DTRACE; putname(filename); - out: - PPCDBG(PPCDBG_SYS64, "sys_execve - exited - pid=%ld current=%lx comm=%s error = %lx\n", current->pid, current, current->comm, error); - +out: return error; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/prom.c linuxppc64_2_4/arch/ppc64/kernel/prom.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/prom.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/prom.c Wed May 1 09:55:09 2002 @@ -912,8 +912,6 @@ (strstr(model, RELOC("peedwagon")) == NULL) && (strstr(model, RELOC("innipeg")) == NULL)) continue; - } else { - prom_print(RELOC("No known I/O bridge chip found.\n")); } if ((type[0] == 0) || (strstr(type, RELOC("pci")) == NULL)) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ptrace.c linuxppc64_2_4/arch/ppc64/kernel/ptrace.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ptrace.c Fri Apr 19 11:00:33 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/ptrace.c Thu Apr 25 19:05:58 2002 @@ -126,14 +126,9 @@ ret = ptrace_attach(child); goto out_tsk; } - ret = -ESRCH; - if (!(child->ptrace & PT_PTRACED)) - goto out_tsk; - if (child->state != TASK_STOPPED) { - if (request != PTRACE_KILL) - goto out_tsk; - } - if (child->p_pptr != current) + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) goto out_tsk; switch (request) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ptrace32.c linuxppc64_2_4/arch/ppc64/kernel/ptrace32.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/ptrace32.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/ptrace32.c Thu Apr 25 19:05:58 2002 @@ -115,14 +115,9 @@ ret = ptrace_attach(child); goto out_tsk; } - ret = -ESRCH; - if (!(child->ptrace & PT_PTRACED)) - goto out_tsk; - if (child->state != TASK_STOPPED) { - if (request != PTRACE_KILL) - goto out_tsk; - } - if (child->p_pptr != current) + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) goto out_tsk; switch (request) diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/rtas.c linuxppc64_2_4/arch/ppc64/kernel/rtas.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/rtas.c Wed May 15 08:29:35 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/rtas.c Mon May 6 14:29:20 2002 @@ -24,8 +24,12 @@ #include #include #include +#include #include +struct proc_dir_entry *rtas_proc_dir; /* /proc/ppc64/rtas dir */ +struct flash_block_list_header rtas_firmware_flash_list = {0, 0}; + /* * prom_init() is called very early on, before the kernel text * and data have been mapped to KERNELBASE. At this point the code @@ -185,9 +189,87 @@ return (ulong)((nret > 0) ? rtas_args->rets[0] : 0); } +#define FLASH_BLOCK_LIST_VERSION (1UL) +static void +rtas_flash_firmware(void) +{ + unsigned long image_size; + struct flash_block_list *f, *next, *flist; + unsigned long rtas_block_list; + int i, status, update_token; + + update_token = rtas_token("ibm,update-flash-64-and-reboot"); + if (update_token == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ALERT "FLASH: ibm,update-flash-64-and-reboot is not available -- not a service partition?\n"); + printk(KERN_ALERT "FLASH: firmware will not be flashed\n"); + return; + } + + /* NOTE: the "first" block list is a global var with no data + * blocks in the kernel data segment. We do this because + * we want to ensure this block_list addr is under 4GB. + */ + rtas_firmware_flash_list.num_blocks = 0; + flist = (struct flash_block_list *)&rtas_firmware_flash_list; + rtas_block_list = virt_to_absolute((unsigned long)flist); + if (rtas_block_list >= (4UL << 20)) { + printk(KERN_ALERT "FLASH: kernel bug...flash list header addr above 4GB\n"); + return; + } + + printk(KERN_ALERT "FLASH: preparing saved firmware image for flash\n"); + /* Update the block_list in place. */ + image_size = 0; + for (f = flist; f; f = next) { + /* Translate data addrs to absolute */ + for (i = 0; i < f->num_blocks; i++) { + f->blocks[i].data = (char *)virt_to_absolute((unsigned long)f->blocks[i].data); + image_size += f->blocks[i].length; + } + next = f->next; + f->next = (struct flash_block_list *)virt_to_absolute((unsigned long)f->next); + /* make num_blocks into the version/length field */ + f->num_blocks = (FLASH_BLOCK_LIST_VERSION << 56) | ((f->num_blocks+1)*16); + } + + printk(KERN_ALERT "FLASH: flash image is %ld bytes\n", image_size); + printk(KERN_ALERT "FLASH: performing flash and reboot\n"); + ppc_md.progress("Flashing \n", 0x0); + ppc_md.progress("Please Wait... ", 0x0); + printk(KERN_ALERT "FLASH: this will take several minutes. Do not power off!\n"); + status = rtas_call(update_token, 1, 1, NULL, rtas_block_list); + switch (status) { /* should only get "bad" status */ + case 0: + printk(KERN_ALERT "FLASH: success\n"); + break; + case -1: + printk(KERN_ALERT "FLASH: hardware error. Firmware may not be not flashed\n"); + break; + case -3: + printk(KERN_ALERT "FLASH: image is corrupt or not correct for this platform. Firmware not flashed\n"); + break; + case -4: + printk(KERN_ALERT "FLASH: flash failed when partially complete. System may not reboot\n"); + break; + default: + printk(KERN_ALERT "FLASH: unknown flash return code %d\n", status); + break; + } +} + +void rtas_flash_bypass_warning(void) +{ + printk(KERN_ALERT "FLASH: firmware flash requires a reboot\n"); + printk(KERN_ALERT "FLASH: the firmware image will NOT be flashed\n"); +} + + void __chrp rtas_restart(char *cmd) { + if (rtas_firmware_flash_list.next) + rtas_flash_firmware(); + printk("RTAS system-reboot returned %ld\n", rtas_call(rtas_token("system-reboot"), 0, 1, NULL)); for (;;); @@ -196,6 +278,8 @@ void __chrp rtas_power_off(void) { + if (rtas_firmware_flash_list.next) + rtas_flash_bypass_warning(); /* allow power on only with power button press */ printk("RTAS power-off returned %ld\n", rtas_call(rtas_token("power-off"), 2, 1, NULL,0xffffffff,0xffffffff)); @@ -205,5 +289,7 @@ void __chrp rtas_halt(void) { + if (rtas_firmware_flash_list.next) + rtas_flash_bypass_warning(); rtas_power_off(); } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/rtas_flash.c linuxppc64_2_4/arch/ppc64/kernel/rtas_flash.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/rtas_flash.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/arch/ppc64/kernel/rtas_flash.c Mon May 6 14:29:20 2002 @@ -0,0 +1,240 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * /proc/ppc64/rtas/firmware_flash interface + * + * This file implements a firmware_flash interface to pump a firmware + * image into the kernel. At reboot time rtas_restart() will see the + * firmware image and flash it as it reboots (see rtas.c). + */ + +#include + +#include +#include +#include +#include +#include + +#define MODULE_VERSION "1.0" +#define MODULE_NAME "rtas_flash" + +#define FIRMWARE_FLASH_NAME "firmware_flash" + +/* Local copy of the flash block list. + * We only allow one open of the flash proc file and create this + * list as we go. This list will be put in the kernel's + * rtas_firmware_flash_list global var once it is fully read. + * + * For convenience as we build the list we use virtual addrs, + * we do not fill in the version number, and the length field + * is treated as the number of entries currently in the block + * (i.e. not a byte count). This is all fixed on release. + */ +static struct flash_block_list *flist; +static char *flash_msg; +static int flash_possible; + +static int rtas_flash_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && flash_possible) { + if (flist) + return -EBUSY; + flist = (struct flash_block_list *)get_free_page(GFP_KERNEL); + if (!flist) + return -ENOMEM; + } + return 0; +} + +/* Do simple sanity checks on the flash image. */ +static int flash_list_valid(struct flash_block_list *flist) +{ + struct flash_block_list *f; + int i; + unsigned long block_size, image_size; + + flash_msg = NULL; + /* Paranoid self test here. We also collect the image size. */ + image_size = 0; + for (f = flist; f; f = f->next) { + for (i = 0; i < f->num_blocks; i++) { + if (f->blocks[i].data == NULL) { + flash_msg = "error: internal error null data\n"; + return 0; + } + block_size = f->blocks[i].length; + if (block_size <= 0 || block_size > PAGE_SIZE) { + flash_msg = "error: internal error bad length\n"; + return 0; + } + image_size += block_size; + } + } + if (image_size < (256 << 10)) { + if (image_size < 2) + flash_msg = NULL; /* allow "clear" of image */ + else + flash_msg = "error: flash image short\n"; + return 0; + } + printk(KERN_INFO "FLASH: flash image with %ld bytes stored for hardware flash on reboot\n", image_size); + return 1; +} + +static void free_flash_list(struct flash_block_list *f) +{ + struct flash_block_list *next; + int i; + + while (f) { + for (i = 0; i < f->num_blocks; i++) + free_page((unsigned long)(f->blocks[i].data)); + next = f->next; + free_page((unsigned long)f); + f = next; + } +} + +static int rtas_flash_release(struct inode *inode, struct file *file) +{ + if (flist) { + /* Always clear saved list on a new attempt. */ + if (rtas_firmware_flash_list.next) { + free_flash_list(rtas_firmware_flash_list.next); + rtas_firmware_flash_list.next = NULL; + } + + if (flash_list_valid(flist)) + rtas_firmware_flash_list.next = flist; + else + free_flash_list(flist); + flist = NULL; + } + return 0; +} + +/* Reading the proc file will show status (not the firmware contents) */ +static ssize_t rtas_flash_read(struct file *file, char *buf, + size_t count, loff_t *ppos) +{ + int error; + char *msg; + int msglen; + + if (!flash_possible) { + msg = "error: this partition does not have service authority\n"; + } else if (flist) { + msg = "info: this file is busy for write by some process\n"; + } else if (flash_msg) { + msg = flash_msg; /* message from last flash attempt */ + } else if (rtas_firmware_flash_list.next) { + msg = "ready: firmware image ready for flash on reboot\n"; + } else { + msg = "info: no firmware image for flash\n"; + } + msglen = strlen(msg); + if (msglen > count) + msglen = count; + + if (ppos && *ppos != 0) + return 0; /* be cheap */ + + error = verify_area(VERIFY_WRITE, buf, msglen); + if (error) + return -EINVAL; + + copy_to_user(buf, msg, msglen); + + if (ppos) + *ppos = msglen; + return msglen; +} + +/* We could be much more efficient here. But to keep this function + * simple we allocate a page to the block list no matter how small the + * count is. If the system is low on memory it will be just as well + * that we fail.... + */ +static ssize_t rtas_flash_write(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ + size_t len = count; + char *p; + int next_free; + struct flash_block_list *fl = flist; + + if (!flash_possible || len == 0) + return len; /* discard data */ + + while (fl->next) + fl = fl->next; /* seek to last block_list for append */ + next_free = fl->num_blocks; + if (next_free == FLASH_BLOCKS_PER_NODE) { + /* Need to allocate another block_list */ + fl->next = (struct flash_block_list *)get_free_page(GFP_KERNEL); + if (!fl->next) + return -ENOMEM; + fl = fl->next; + next_free = 0; + } + + if (len > PAGE_SIZE) + len = PAGE_SIZE; + p = (char *)get_free_page(GFP_KERNEL); + if (!p) + return -ENOMEM; + if(copy_from_user(p, buffer, len)) { + free_page((unsigned long)p); + return -EFAULT; + } + fl->blocks[next_free].data = p; + fl->blocks[next_free].length = len; + fl->num_blocks++; + + return len; +} + +static struct file_operations rtas_flash_operations = { + read: rtas_flash_read, + write: rtas_flash_write, + open: rtas_flash_open, + release: rtas_flash_release, +}; + + +int __init rtas_flash_init(void) +{ + struct proc_dir_entry *ent = NULL; + + if (!rtas_proc_dir) { + printk(KERN_WARNING "rtas proc dir does not already exist"); + return -ENOENT; + } + + if (rtas_token("ibm,update-flash-64-and-reboot") != RTAS_UNKNOWN_SERVICE) + flash_possible = 1; + + if ((ent = create_proc_entry(FIRMWARE_FLASH_NAME, S_IRUSR | S_IWUSR, rtas_proc_dir)) != NULL) { + ent->nlink = 1; + ent->proc_fops = &rtas_flash_operations; + ent->owner = THIS_MODULE; + } + return 0; +} + +void __exit rtas_flash_cleanup(void) +{ + if (!rtas_proc_dir) + return; + remove_proc_entry(FIRMWARE_FLASH_NAME, rtas_proc_dir); +} + +module_init(rtas_flash_init); +module_exit(rtas_flash_cleanup); +MODULE_LICENSE("GPL"); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/signal.c linuxppc64_2_4/arch/ppc64/kernel/signal.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/signal.c Wed May 15 08:29:36 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/signal.c Thu May 9 00:49:40 2002 @@ -637,11 +637,6 @@ */ if (current->thread.flags & PPC_FLAG_32BIT) return do_signal32(oldset, regs); - - PPCDBG(PPCDBG_SIGNAL, "do_signal - running - pid=%ld current=%lx comm=%s \n", - current->pid, current, current->comm); - - if (!oldset) oldset = ¤t->blocked; @@ -716,7 +711,7 @@ continue; switch (signr) { - case SIGCONT: case SIGCHLD: case SIGWINCH: + case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG: continue; case SIGTSTP: case SIGTTIN: case SIGTTOU: @@ -740,10 +735,7 @@ /* FALLTHRU */ default: - sigaddset(¤t->pending.signal, signr); - recalc_sigpending(current); - current->flags |= PF_SIGNALED; - do_exit(exit_code); + sig_exit(signr, exit_code, &info); /* NOTREACHED */ } } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/signal32.c linuxppc64_2_4/arch/ppc64/kernel/signal32.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/signal32.c Wed May 15 08:29:36 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/signal32.c Thu May 9 00:49:40 2002 @@ -1406,7 +1406,7 @@ continue; switch (signr) { - case SIGCONT: case SIGCHLD: case SIGWINCH: + case SIGCONT: case SIGCHLD: case SIGWINCH: case SIGURG: continue; case SIGTSTP: case SIGTTIN: case SIGTTOU: @@ -1430,10 +1430,7 @@ /* FALLTHRU */ default: - sigaddset(¤t->pending.signal, signr); - recalc_sigpending(current); - current->flags |= PF_SIGNALED; - do_exit(exit_code); + sig_exit(signr, exit_code, &info); /* NOTREACHED */ } } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/smp.c linuxppc64_2_4/arch/ppc64/kernel/smp.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/smp.c Wed May 15 08:29:36 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/smp.c Thu Apr 25 20:16:39 2002 @@ -346,7 +346,7 @@ set_bit(msg, &xics_ipi_message[i]); mb(); xics_cause_IPI(i); - } + } } } @@ -577,7 +577,7 @@ init_idle(); for (i = 0; i < NR_CPUS; i++) { - paca[i].prof_counter=1; + paca[i].prof_counter = 1; paca[i].prof_multiplier = 1; if(i != 0) { /* diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/kernel/xics.c linuxppc64_2_4/arch/ppc64/kernel/xics.c --- ../kernel.org/linux-2.4.19/arch/ppc64/kernel/xics.c Wed May 15 08:29:36 2002 +++ linuxppc64_2_4/arch/ppc64/kernel/xics.c Thu Apr 25 20:45:03 2002 @@ -231,9 +231,9 @@ } } else if( vec == XICS_IRQ_SPURIOUS ) { irq = -1; - printk("spurious PPC interrupt!\n"); - } else + } else { irq = real_irq_to_virt(vec) + XICS_IRQ_OFFSET; + } return irq; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/arch/ppc64/xmon/xmon.c linuxppc64_2_4/arch/ppc64/xmon/xmon.c --- ../kernel.org/linux-2.4.19/arch/ppc64/xmon/xmon.c Wed May 15 08:29:36 2002 +++ linuxppc64_2_4/arch/ppc64/xmon/xmon.c Thu Apr 25 19:17:44 2002 @@ -123,12 +123,10 @@ static void mem_check(void); static void mem_find_real(void); static void mem_find_vsid(void); -static void mem_check_full_group(void); static void mem_check_pagetable_vsids (void); static void mem_map_check_slab(void); static void mem_map_lock_pages(void); -static void mem_map_check_hash(void); static void mem_check_dup_rpn (void); static void debug_trace(void); @@ -634,15 +632,9 @@ case 'c': mem_check(); break; - case 'g': - mem_check_full_group(); - break; case 'j': mem_map_check_slab(); break; - case 'h': - mem_map_check_hash(); - break; case 'f': mem_find_real(); break; @@ -2490,34 +2482,6 @@ printf(" count of locked pages = %d \n", lock_count); } - - -void mem_map_check_hash() -{ - int i = max_mapnr; - - while (i-- > 0) { - /* skip the reserved */ - if (!PageReserved(mem_map+i)) { - if (((mem_map+i)->next_hash) != NULL) { - if ( REGION_ID((mem_map+i)->next_hash) != KERNEL_REGION_ID ) { - printf(" mem_map check hash - non c0 entry - " - "address/value = %p %lx\n", mem_map+i,(mem_map+i)->next_hash); - } - if ((unsigned long)((mem_map+i)->next_hash) == KERNELBASE){ - printf(" mem_map check hash - 0x%lx entry = %p \n", - KERNELBASE, mem_map+i); - } - } - } else { - if (page_count(mem_map+i) < 0) { - printf(" reserved page with negative count- entry = %lx \n", mem_map+i); - } - } - } - printf(" mem_map check hash completed \n"); -} - void mem_check_dup_rpn () { unsigned long htab_size_bytes; @@ -2725,70 +2689,6 @@ printf("\nDone -------------------\n"); -} - - -void mem_check_full_group() -{ - unsigned long htab_size_bytes; - unsigned count; - unsigned count_array[] = {0,0,0,0,0,0,0,0,0}; - unsigned i; - unsigned long htab_end; - HPTE *hpte1, *hpte2, *hpte3; - u64 rpn = 0; - - htab_size_bytes = htab_data.htab_num_ptegs * 128; // 128B / PTEG - htab_end = (unsigned long)htab_data.htab + htab_size_bytes; - - printf("\nHardware Page Find full groups \n-------------------\n"); - printf("htab base : %.16lx\n", htab_data.htab); - printf("htab size : %.16lx\n", htab_size_bytes); - - for (hpte1 = htab_data.htab; (unsigned long)hpte1 < htab_end; hpte1= hpte1 + 8) - { - count = 0; - hpte2 = hpte1; - for (i=0; i<8; ++i) - { - if ( hpte2->dw0.dw0.v != 0 ) - { - count++; - } - hpte2++; - } - if (count == 8 ) - { - printf(" full group starting with entry %lx \n", hpte1); - hpte3 = hpte1; - for (i=0; i<8; ++i) - { - if ( hpte3->dw0.dw0.v != 0 ) - { - printf(" entry number %d \n",i); - printf(" vsid: %.13lx api: %.2lx hash: %.1lx\n", - (hpte3->dw0.dw0.avpn)>>5, - (hpte3->dw0.dw0.avpn) & 0x1f, - (hpte3->dw0.dw0.h)); - printf(" rpn: %.13lx \n", (hpte3->dw1.dw1.rpn)); - // Dump out the memmap array entry address, corresponding virtual address, and reference count. - rpn = hpte3->dw1.dw1.rpn; - printf(" mem_map+rpn=%p, virtual@=%p, count=%lx \n", mem_map+rpn, (mem_map+rpn)->virtual, (mem_map+rpn)->count); - } - hpte3++; - } - if (xmon_interrupted()) - return; - } - - count_array[count]++; - } - for (i=1; i<9; ++i) - { - printf(" group count for size %i = %lx \n", i, count_array[i]); - } - - printf("\nDone -------------------\n"); } static void debug_trace(void) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/Makefile linuxppc64_2_4/drivers/Makefile --- ../kernel.org/linux-2.4.19/drivers/Makefile Wed May 15 08:29:37 2002 +++ linuxppc64_2_4/drivers/Makefile Mon May 13 16:39:12 2002 @@ -8,7 +8,7 @@ mod-subdirs := dio mtd sbus video macintosh usb input telephony sgi ide \ message/i2o message/fusion scsi md ieee1394 pnp isdn atm \ - fc4 net/hamradio i2c acpi bluetooth + fc4 net/hamradio i2c acpi bluetooth iseries subdir-y := parport char block net sound misc media cdrom hotplug subdir-m := $(subdir-y) @@ -24,7 +24,8 @@ subdir-$(CONFIG_TC) += tc subdir-$(CONFIG_VT) += video subdir-$(CONFIG_MAC) += macintosh -subdir-$(CONFIG_PPC) += macintosh +subdir-$(CONFIG_PPC) += macintosh +subdir-$(CONFIG_PPC_ISERIES) += iseries subdir-$(CONFIG_USB) += usb subdir-$(CONFIG_INPUT) += input subdir-$(CONFIG_PHONE) += telephony diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/block/genhd.c linuxppc64_2_4/drivers/block/genhd.c --- ../kernel.org/linux-2.4.19/drivers/block/genhd.c Wed May 15 08:29:37 2002 +++ linuxppc64_2_4/drivers/block/genhd.c Mon May 13 16:39:12 2002 @@ -220,6 +220,9 @@ #ifdef CONFIG_VT console_map_init(); #endif +#ifdef CONFIG_VIODASD + viodasd_init(); +#endif return 0; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/block/ll_rw_blk.c linuxppc64_2_4/drivers/block/ll_rw_blk.c --- ../kernel.org/linux-2.4.19/drivers/block/ll_rw_blk.c Fri Apr 19 11:00:44 2002 +++ linuxppc64_2_4/drivers/block/ll_rw_blk.c Mon Apr 22 10:32:49 2002 @@ -1345,6 +1345,9 @@ #ifdef CONFIG_BLK_DEV_XD xd_init(); #endif +#ifdef CONFIG_VIOCD + viocd_init(); +#endif #ifdef CONFIG_BLK_DEV_MFM mfm_init(); #endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/cdrom/Makefile linuxppc64_2_4/drivers/cdrom/Makefile --- ../kernel.org/linux-2.4.19/drivers/cdrom/Makefile Fri Apr 19 10:30:25 2002 +++ linuxppc64_2_4/drivers/cdrom/Makefile Thu Oct 11 11:10:49 2001 @@ -27,6 +27,7 @@ obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o obj-$(CONFIG_BLK_DEV_SR) += cdrom.o obj-$(CONFIG_PARIDE_PCD) += cdrom.o +obj-$(CONFIG_VIOCD) += cdrom.o obj-$(CONFIG_AZTCD) += aztcd.o obj-$(CONFIG_CDU31A) += cdu31a.o cdrom.o diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/Config.in linuxppc64_2_4/drivers/char/Config.in --- ../kernel.org/linux-2.4.19/drivers/char/Config.in Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/drivers/char/Config.in Tue Apr 23 09:37:26 2002 @@ -45,6 +45,10 @@ if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then dep_tristate ' Multi-Tech multiport card support (EXPERIMENTAL)' CONFIG_ISI m fi + tristate ' IBM Multiport Serial Adapter' CONFIG_ICOM + if [ "$CONFIG_ICOM" = "y" ]; then + string 'Modem Country Code (Internal Modem Users only)' CONFIG_ICOM_MODEM_CC "" + fi tristate ' Microgate SyncLink card support' CONFIG_SYNCLINK tristate ' HDLC line discipline support' CONFIG_N_HDLC tristate ' SDL RISCom/8 card support' CONFIG_RISCOM8 @@ -134,6 +138,7 @@ fi dep_tristate 'Support for user-space parallel port device drivers' CONFIG_PPDEV $CONFIG_PARPORT fi +dep_bool 'pSeries Hypervisor Virtual Console support' CONFIG_HVC_CONSOLE $CONFIG_PPC64 source drivers/i2c/Config.in @@ -228,6 +233,9 @@ dep_tristate 'Intel i8x0 Random Number Generator support' CONFIG_INTEL_RNG $CONFIG_PCI fi tristate '/dev/nvram support' CONFIG_NVRAM +if [ "$CONFIG_PPC_ISERIES" != "y" ]; then + tristate 'Enhanced Real Time Clock Support' CONFIG_RTC +fi tristate 'Enhanced Real Time Clock Support' CONFIG_RTC if [ "$CONFIG_IA64" = "y" ]; then bool 'EFI Real Time Clock Services' CONFIG_EFI_RTC diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/Makefile linuxppc64_2_4/drivers/char/Makefile --- ../kernel.org/linux-2.4.19/drivers/char/Makefile Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/drivers/char/Makefile Tue Apr 23 09:37:26 2002 @@ -164,6 +164,7 @@ obj-$(CONFIG_COMPUTONE) += ip2.o ip2main.o obj-$(CONFIG_RISCOM8) += riscom8.o obj-$(CONFIG_ISI) += isicom.o +obj-$(CONFIG_ICOM) += icom.o obj-$(CONFIG_ESPSERIAL) += esp.o obj-$(CONFIG_SYNCLINK) += synclink.o obj-$(CONFIG_N_HDLC) += n_hdlc.o @@ -177,6 +178,7 @@ obj-$(CONFIG_MVME147_SCC) += generic_serial.o vme_scc.o obj-$(CONFIG_MVME162_SCC) += generic_serial.o vme_scc.o obj-$(CONFIG_BVME6000_SCC) += generic_serial.o vme_scc.o +obj-$(CONFIG_HVC_CONSOLE) += hvc_console.o obj-$(CONFIG_SERIAL_TX3912) += generic_serial.o serial_tx3912.o obj-$(CONFIG_TXX927_SERIAL) += serial_txx927.o diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/hvc_console.c linuxppc64_2_4/drivers/char/hvc_console.c --- ../kernel.org/linux-2.4.19/drivers/char/hvc_console.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/char/hvc_console.c Mon Apr 22 10:32:50 2002 @@ -0,0 +1,355 @@ +/* + * Copyright (C) 2001 Anton Blanchard , IBM + * Copyright (C) 2001 Paul Mackerras , IBM + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int hvc_count(int *); +extern int hvc_get_chars(int index, char *buf, int count); +extern int hvc_put_chars(int index, const char *buf, int count); + +#define HVC_MAJOR 229 +#define HVC_MINOR 0 + +#define MAX_NR_HVC_CONSOLES 4 + +#define TIMEOUT ((HZ + 99) / 100) + +struct tty_driver hvc_driver; +static int hvc_refcount; +static struct tty_struct *hvc_table[MAX_NR_HVC_CONSOLES]; +static struct termios *hvc_termios[MAX_NR_HVC_CONSOLES]; +static struct termios *hvc_termios_locked[MAX_NR_HVC_CONSOLES]; +static int hvc_offset; +#ifdef CONFIG_MAGIC_SYSRQ +static int sysrq_pressed; +#endif + +#define N_OUTBUF 16 + +#define __ALIGNED__ __attribute__((__aligned__(8))) + +struct hvc_struct { + spinlock_t lock; + int index; + struct tty_struct *tty; + unsigned int count; + int do_wakeup; + char outbuf[N_OUTBUF] __ALIGNED__; + int n_outbuf; +}; + +struct hvc_struct hvc_struct[MAX_NR_HVC_CONSOLES]; + +static int hvc_open(struct tty_struct *tty, struct file * filp) +{ + int line = MINOR(tty->device) - tty->driver.minor_start; + struct hvc_struct *hp; + unsigned long flags; + + if (line < 0 || line >= MAX_NR_HVC_CONSOLES) + return -ENODEV; + hp = &hvc_struct[line]; + + tty->driver_data = hp; + spin_lock_irqsave(&hp->lock, flags); + hp->tty = tty; + hp->count++; + spin_unlock_irqrestore(&hp->lock, flags); + + return 0; +} + +static void hvc_close(struct tty_struct *tty, struct file * filp) +{ + struct hvc_struct *hp = tty->driver_data; + unsigned long flags; + + if (tty_hung_up_p(filp)) + return; + spin_lock_irqsave(&hp->lock, flags); + if (--hp->count == 0) + hp->tty = NULL; + else if (hp->count < 0) + printk(KERN_ERR "hvc_close %lu: oops, count is %d\n", + hp - hvc_struct, hp->count); + spin_unlock_irqrestore(&hp->lock, flags); +} + +/* called with hp->lock held */ +static void hvc_push(struct hvc_struct *hp) +{ + int n; + + n = hvc_put_chars(hp->index + hvc_offset, hp->outbuf, hp->n_outbuf); + if (n <= 0) { + if (n == 0) + return; + /* throw away output on error; this happens when + there is no session connected to the vterm. */ + hp->n_outbuf = 0; + } else + hp->n_outbuf -= n; + if (hp->n_outbuf > 0) + memmove(hp->outbuf, hp->outbuf + n, hp->n_outbuf); + else + hp->do_wakeup = 1; +} + +static int hvc_write(struct tty_struct *tty, int from_user, + const unsigned char *buf, int count) +{ + struct hvc_struct *hp = tty->driver_data; + char *p; + int todo, written = 0; + unsigned long flags; + + spin_lock_irqsave(&hp->lock, flags); + while (count > 0 && (todo = N_OUTBUF - hp->n_outbuf) > 0) { + if (todo > count) + todo = count; + p = hp->outbuf + hp->n_outbuf; + if (from_user) { + todo -= copy_from_user(p, buf, todo); + if (todo == 0) { + if (written == 0) + written = -EFAULT; + break; + } + } else + memcpy(p, buf, todo); + count -= todo; + buf += todo; + hp->n_outbuf += todo; + written += todo; + hvc_push(hp); + } + spin_unlock_irqrestore(&hp->lock, flags); + + return written; +} + +static int hvc_write_room(struct tty_struct *tty) +{ + struct hvc_struct *hp = tty->driver_data; + + return N_OUTBUF - hp->n_outbuf; +} + +static int hvc_chars_in_buffer(struct tty_struct *tty) +{ + struct hvc_struct *hp = tty->driver_data; + + return hp->n_outbuf; +} + +static void hvc_poll(int index) +{ + struct hvc_struct *hp = &hvc_struct[index]; + struct tty_struct *tty; + int i, n; + char buf[16] __ALIGNED__; + unsigned long flags; + + spin_lock_irqsave(&hp->lock, flags); + + if (hp->n_outbuf > 0) + hvc_push(hp); + + tty = hp->tty; + if (tty) { + for (;;) { + if (TTY_FLIPBUF_SIZE - tty->flip.count < sizeof(buf)) + break; + n = hvc_get_chars(index + hvc_offset, buf, sizeof(buf)); + if (n <= 0) + break; + for (i = 0; i < n; ++i) { +#ifdef CONFIG_MAGIC_SYSRQ /* Handle the SysRq Hack */ + if (buf[i] == '\x0f') { /* ^O -- should support a sequence */ + sysrq_pressed = 1; + continue; + } else if (sysrq_pressed) { + handle_sysrq(buf[i], NULL, NULL, tty); + sysrq_pressed = 0; + continue; + } +#endif + tty_insert_flip_char(tty, buf[i], 0); + } + } + if (tty->flip.count) + tty_schedule_flip(tty); + + if (hp->do_wakeup) { + hp->do_wakeup = 0; + if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) + && tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + wake_up_interruptible(&tty->write_wait); + } + } + + spin_unlock_irqrestore(&hp->lock, flags); +} + +int khvcd(void *unused) +{ + int i; + + daemonize(); + reparent_to_init(); + strcpy(current->comm, "khvcd"); + sigfillset(¤t->blocked); + + for (;;) { + for (i = 0; i < MAX_NR_HVC_CONSOLES; ++i) + hvc_poll(i); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(TIMEOUT); + } +} + +int __init hvc_init(void) +{ + int i; + + memset(&hvc_driver, 0, sizeof(struct tty_driver)); + + hvc_driver.magic = TTY_DRIVER_MAGIC; + hvc_driver.driver_name = "hvc"; + hvc_driver.name = "hvc/%d"; + hvc_driver.major = HVC_MAJOR; + hvc_driver.minor_start = HVC_MINOR; + hvc_driver.num = hvc_count(&hvc_offset); + if (hvc_driver.num > MAX_NR_HVC_CONSOLES) + hvc_driver.num = MAX_NR_HVC_CONSOLES; + hvc_driver.type = TTY_DRIVER_TYPE_SYSTEM; + hvc_driver.init_termios = tty_std_termios; + hvc_driver.flags = TTY_DRIVER_REAL_RAW; + hvc_driver.refcount = &hvc_refcount; + hvc_driver.table = hvc_table; + hvc_driver.termios = hvc_termios; + hvc_driver.termios_locked = hvc_termios_locked; + + hvc_driver.open = hvc_open; + hvc_driver.close = hvc_close; + hvc_driver.write = hvc_write; + hvc_driver.write_room = hvc_write_room; + hvc_driver.chars_in_buffer = hvc_chars_in_buffer; + + for (i = 0; i < hvc_driver.num; i++) { + hvc_struct[i].lock = SPIN_LOCK_UNLOCKED; + hvc_struct[i].index = i; + tty_register_devfs(&hvc_driver, 0, hvc_driver.minor_start + i); + } + + if (tty_register_driver(&hvc_driver)) + panic("Couldn't register hvc console driver\n"); + + if (hvc_driver.num > 0) + kernel_thread(khvcd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + + return 0; +} + +static void __exit hvc_exit(void) +{ +} + +void hvc_console_print(struct console *co, const char *b, unsigned count) +{ + char c[16] __ALIGNED__; + unsigned i, n; + int r, donecr = 0; + + i = n = 0; + while (count > 0 || i > 0) { + if (count > 0 && i < sizeof(c)) { + if (b[n] == '\n' && !donecr) { + c[i++] = '\r'; + donecr = 1; + } else { + c[i++] = b[n++]; + donecr = 0; + --count; + } + } else { + r = hvc_put_chars(co->index + hvc_offset, c, i); + if (r < 0) { + /* throw away chars on error */ + i = 0; + } else if (r > 0) { + i -= r; + if (i > 0) + memmove(c, c+r, i); + } + } + } +} + +static kdev_t hvc_console_device(struct console *c) +{ + return MKDEV(HVC_MAJOR, HVC_MINOR + c->index); +} + +int hvc_wait_for_keypress(struct console *co) +{ + char c[16] __ALIGNED__; + + while (hvc_get_chars(co->index, &c[0], 1) < 1) + ; + return 0; +} + +static int __init hvc_console_setup(struct console *co, char *options) +{ + if (co->index < 0 || co->index >= MAX_NR_HVC_CONSOLES + || co->index >= hvc_count(&hvc_offset)) + return -1; + return 0; +} + +struct console hvc_con_driver = { + name: "hvc", + write: hvc_console_print, + device: hvc_console_device, + setup: hvc_console_setup, + flags: CON_PRINTBUFFER, + index: -1, +}; + +int __init hvc_console_init(void) +{ + register_console(&hvc_con_driver); + return 0; +} + +module_init(hvc_init); +module_exit(hvc_exit); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/icom.c linuxppc64_2_4/drivers/char/icom.c --- ../kernel.org/linux-2.4.19/drivers/char/icom.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/char/icom.c Mon Apr 15 11:14:27 2002 @@ -0,0 +1,3311 @@ +/* + * iCom.c + * + * Copyright (C) 2001 Michael Anderson, IBM Corporation + * + * Serial device driver. + * + * Based on code from serial.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * NOTE: Only for users with internal modem cards (eg. iSeries 2771, 2772) + * find the decimal number associated with your country below + * and set country code operations using module parameter at install + * time, eg. for Argentina the command would be (be sure number is in "") + * insmod iCom.o iCom_country_code="52" + * + * Module paramter values for + * iCom_country_code: + 52 * AR Argentina * + 52 * AW Aruba * + 1 * AU Australia * + 52 * AT Austria * + 52 * BH Bahrain * + 52 * BE Belgium * + 52 * BR Brazil * + 52 * BN Brunei Darussalam * + 52 * CA Canada * + 52 * KY Cayman Islands * + 52 * CL Chile * + 52 * CN China * + 52 * CO Colombia * + 52 * CR Costa Rica * + 52 * HR Croatia * + 52 * CY Cyprus * + 37 * CZ Czech Republic * + 52 * DK Denmark * + 52 * EC Ecuador * + 52 * EG Egypt * + 52 * FI Finland * + 52 * FR France * + 52 * DE Germany * + 52 * GR Greece * + 52 * GT Guatemala * + 48 * HK China (Hong Kong S.A.R.) * + 48 * HU Hungary * + 52 * IS Iceland * + 48 * IN India * + 48 * ID Indonesia * + 52 * IE Ireland * + 48 * IL Israel * + 52 * IT Italy * + 52 * JM Jamaica * + 16 * JP Japan * + 52 * KR Korea, Republic of * + 52 * LU Luxembourg * + 52 * MO China (Macau S.A.R.) * + 48 * MY Malaysia * + 52 * MX Mexico * + 52 * MA Morocco * + 52 * NL Netherlands * + 52 * AN Netherlands Antilles * + 9 * NZ New Zealand * + 52 * NO Norway * + 52 * PK Pakistan * + 52 * PA Panama * + 52 * PE Peru * + 48 * PH Philippines * + 48 * PL Poland * + 52 * PT Portugal * + 52 * QA Qatar * + 52 * RO Romania * + 52 * RU Russia * + 52 * SA Saudi Arabia * + 48 * SG Singapore * + 52 * SK Slovakia * + 48 * SI Slovenia * + 53 * ZA South Africa * + 52 * ES Spain * + 52 * LK Sri Lanka * + 52 * SE Sweden * + 52 * CH Switzerland * + 52 * TW Taiwan * + 52 * TH Thailand * + 52 * TT Trinidad and Tobago * + 52 * TR Turkey * + 52 * UA Ukraine * + 52 * AE United Arab Emirates * + 52 * GB United Kingdom * + 52 * US United States of America* + 52 * UY Uruguay * + 52 * VE Venezuela * + 48 * VN Vietnam * +*/ +#define SERIAL_DO_RESTART +#ifdef MODVERSIONS +#include +#endif +#include + +MODULE_AUTHOR ("Michael Anderson "); +MODULE_DESCRIPTION ("IBM iSeries Serial IOA driver"); +MODULE_SUPPORTED_DEVICE("IBM iSeries 2745, 2771, 2772 Communications adapters"); +MODULE_LICENSE("GPL"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +/* adapter code loads */ +#include "icom.h" + +#ifdef MODULE +#define CONFIG_ICOM_MODEM_CC "" +#endif + +static char *iCom_country_code = CONFIG_ICOM_MODEM_CC; +MODULE_PARM(iCom_country_code, "s"); +MODULE_PARM_DESC(iCom_country_code, "Modem country code configuration"); + +#define ICOM_TRACE /* enable port trace capabalities */ + +#define DRIVER_NAME "iCom" +#define VENDOR_ID 0x1014 +#define DEVICE_ID 0x0031 +#define DEVICE_ID2 0x0219 +#define MAX_ADAPTERS 4 +#define NR_PORTS (active_adapters * 4) +#define MAX_PORTS (MAX_ADAPTERS * 4) +#define ASYNC_CLOSING 0x08000000 /* Serial port is closing */ +#define ASYNC_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes + on the callout port */ + +#ifdef MODULE +static const struct pci_device_id iCom_pci_table[] __initdata = +{ + { + vendor: VENDOR_ID, + device: DEVICE_ID, + subvendor: 0xFFFF, + subdevice: 0xFFFF, + }, + { + vendor: VENDOR_ID, + device: DEVICE_ID2, + subvendor: VENDOR_ID, + subdevice: 0x021a, + }, + { + vendor: VENDOR_ID, + device: DEVICE_ID2, + subvendor: VENDOR_ID, + subdevice: 0x0251, + }, + { + vendor: VENDOR_ID, + device: DEVICE_ID2, + subvendor: VENDOR_ID, + subdevice: 0x0252, + }, + { } +}; +MODULE_DEVICE_TABLE(pci, iCom_pci_table); +#endif + +/* + * adapter defines and structures + */ +#define ICOM_CONTROL_START_A 0x00000008 +#define ICOM_CONTROL_STOP_A 0x00000004 +#define ICOM_CONTROL_START_B 0x00000002 +#define ICOM_CONTROL_STOP_B 0x00000001 +#define ICOM_CONTROL_START_C 0x00000008 +#define ICOM_CONTROL_STOP_C 0x00000004 +#define ICOM_CONTROL_START_D 0x00000002 +#define ICOM_CONTROL_STOP_D 0x00000001 +#define ICOM_IRAM_OFFSET 0x1000 +#define ICOM_DCE_IRAM_OFFSET 0x0A00 +#define ICOM_CABLE_ID_VALID 0x01 +#define ICOM_CABLE_ID_MASK 0xF0 +#define ICOM_DISABLE 0x80 +#define CMD_XMIT_RCV_ENABLE 0xC0 +#define CMD_XMIT_ENABLE 0x40 +#define CMD_RCV_DISABLE 0x00 +#define CMD_RCV_ENABLE 0x80 +#define CMD_RESTART 0x01 +#define CMD_HOLD_XMIT 0x02 +#define CMD_SND_BREAK 0x04 +#define RS232_CABLE 0x06 +#define V24_CABLE 0x0E +#define V35_CABLE 0x0C +#define V36_CABLE 0x02 +#define START_DOWNLOAD 0x80 +#define ICOM_INT_MASK_PRC_A 0x00003FFF +#define ICOM_INT_MASK_PRC_B 0x3FFF0000 +#define ICOM_INT_MASK_PRC_C 0x00003FFF +#define ICOM_INT_MASK_PRC_D 0x3FFF0000 +#define INT_RCV_COMPLETED 0x1000 +#define INT_XMIT_COMPLETED 0x2000 +#define INT_IDLE_DETECT 0x0800 +#define INT_RCV_DISABLED 0x0400 +#define INT_XMIT_DISABLED 0x0200 +#define INT_RCV_XMIT_SHUTDOWN 0x0100 +#define INT_FATAL_ERROR 0x0080 +#define INT_CABLE_PULL 0x0020 +#define INT_SIGNAL_CHANGE 0x0010 +#define HDLC_PPP_PURE_ASYNC 0x02 +#define HDLC_FF_FILL 0x00 +#define HDLC_HDW_FLOW 0x01 +#define START_XMIT 0x80 +#define ICOM_ACFG_DRIVE1 0x20 +#define ICOM_ACFG_NO_PARITY 0x00 +#define ICOM_ACFG_PARITY_ENAB 0x02 +#define ICOM_ACFG_PARITY_ODD 0x01 +#define ICOM_ACFG_8BPC 0x00 +#define ICOM_ACFG_7BPC 0x04 +#define ICOM_ACFG_6BPC 0x08 +#define ICOM_ACFG_5BPC 0x0C +#define ICOM_ACFG_1STOP_BIT 0x00 +#define ICOM_ACFG_2STOP_BIT 0x10 +#define DTR 0x80 +#define RTS 0x40 +#define RI 0x08 +#define DSR 0x80 +#define DCD 0x20 +#define CTS 0x40 + +#define BAUD_TABLE_LIMIT 20 +static int icom_acfg_baud[] = { + 300, + 600, + 900, + 1200, + 1800, + 2400, + 3600, + 4800, + 7200, + 9600, + 14400, + 19200, + 28800, + 38400, + 57600, + 76800, + 115200, + 153600, + 230400, + 307200, + 460800}; + +static int active_adapters; +static struct tty_driver serial_driver; +static int serial_refcount = 0; +static struct tty_struct *serial_table[MAX_PORTS]; +static struct termios *serial_termios[MAX_PORTS]; +static struct termios *serial_termios_locked[MAX_PORTS]; + +struct iCom_regs { + u32 control; /* Adapter Control Register */ + u32 interrupt; /* Adapter Interrupt Register */ + u32 int_mask; /* Adapter Interrupt Mask Reg */ + u32 int_pri; /* Adapter Interrupt Priority r */ + u32 int_reg_b; /* Adapter non-masked Interrupt */ + u32 resvd01; + u32 resvd02; + u32 resvd03; + u32 control_2; /* Adapter Control Register 2 */ + u32 interrupt_2; /* Adapter Interrupt Register 2 */ + u32 int_mask_2; /* Adapter Interrupt Mask 2 */ + u32 int_pri_2; /* Adapter Interrupt Prior 2 */ + u32 int_reg_2b; /* Adapter non-masked 2 */ +}; + +struct func_dram { + u32 reserved[108]; /* 0-1B0 reserved by personality code */ + u32 RcvStatusAddr; /* 1B0-1B3 Status Address for Next rcv */ + u8 RcvStnAddr; /* 1B4 Receive Station Addr */ + u8 IdleState; /* 1B5 Idle State */ + u8 IdleMonitor; /* 1B6 Idle Monitor */ + u8 FlagFillIdleTimer; /* 1B7 Flag Fill Idle Timer */ + u32 XmitStatusAddr; /* 1B8-1BB Transmit Status Address */ + u8 StartXmitCmd; /* 1BC Start Xmit Command */ + u8 HDLCConfigReg; /* 1BD Reserved */ + u8 CauseCode; /* 1BE Cause code for fatal error */ + u8 xchar; /* 1BF High priority send */ + u32 reserved3; /* 1C0-1C3 Reserved */ + u8 PrevCmdReg; /* 1C4 Reserved */ + u8 CmdReg; /* 1C5 Command Register */ + u8 async_config2; /* 1C6 Async Config Byte 2*/ + u8 async_config3; /* 1C7 Async Config Byte 3*/ + u8 dce_resvd[20]; /* 1C8-1DB DCE Rsvd */ + u8 dce_resvd21; /* 1DC DCE Rsvd (21st byte*/ + u8 misc_flags; /* 1DD misc flags */ +#define V2_HARDWARE 0x40 + u8 call_length; /* 1DE Phone #/CFI buff ln*/ + u8 call_length2; /* 1DF Upper byte (unused)*/ + u32 call_addr; /* 1E0-1E3 Phn #/CFI buff addr*/ + u16 timer_value; /* 1E4-1E5 general timer value*/ + u8 timer_command; /* 1E6 general timer cmd */ + u8 dce_command; /* 1E7 dce command reg */ + u8 dce_cmd_status; /* 1E8 dce command stat */ + u8 x21_r1_ioff; /* 1E9 dce ready counter */ + u8 x21_r0_ioff; /* 1EA dce not ready ctr */ + u8 x21_ralt_ioff; /* 1EB dce CNR counter */ + u8 x21_r1_ion; /* 1EC dce ready I on ctr */ + u8 rsvd_ier; /* 1ED Rsvd for IER (if ne*/ + u8 ier; /* 1EE Interrupt Enable */ + u8 isr; /* 1EF Input Signal Reg */ + u8 osr; /* 1F0 Output Signal Reg */ + u8 reset; /* 1F1 Reset/Reload Reg */ + u8 disable; /* 1F2 Disable Reg */ + u8 sync; /* 1F3 Sync Reg */ + u8 error_stat; /* 1F4 Error Status */ + u8 cable_id; /* 1F5 Cable ID */ + u8 cs_length; /* 1F6 CS Load Length */ + u8 mac_length; /* 1F7 Mac Load Length */ + u32 cs_load_addr; /* 1F8-1FB Call Load PCI Addr */ + u32 mac_load_addr; /* 1FC-1FF Mac Load PCI Addr */ +}; + +#define NUM_XBUFFS 1 +#define NUM_RBUFFS 2 +#define RCV_BUFF_SZ 0x0200 +#define XMIT_BUFF_SZ 0x1000 +struct statusArea +{ + /**********************************************/ + /* Transmit Status Area */ + /**********************************************/ + struct { + u32 leNext; /* Next entry in Little Endian on Adapter */ + u32 leNextASD; + u32 leBuffer; /* Buffer for entry in LE for Adapter */ + u16 leLengthASD; + u16 leOffsetASD; + u16 leLength; /* Length of data in segment */ + u16 flags; +#define SA_FLAGS_DONE 0x0080 /* Done with Segment */ +#define SA_FLAGS_CONTINUED 0x8000 /* More Segments */ +#define SA_FLAGS_IDLE 0x4000 /* Mark IDLE after frm */ +#define SA_FLAGS_READY_TO_XMIT 0x0800 +#define SA_FLAGS_STAT_MASK 0x007F + } xmit[NUM_XBUFFS]; + + /**********************************************/ + /* Receive Status Area */ + /**********************************************/ + struct { + u32 leNext; /* Next entry in Little Endian on Adapter */ + u32 leNextASD; + u32 leBuffer; /* Buffer for entry in LE for Adapter */ + u16 WorkingLength; /* size of segment */ + u16 reserv01; + u16 leLength; /* Length of data in segment */ + u16 flags; +#define SA_FL_RCV_DONE 0x0010 /* Data ready */ +#define SA_FLAGS_OVERRUN 0x0040 +#define SA_FLAGS_PARITY_ERROR 0x0080 +#define SA_FLAGS_FRAME_ERROR 0x0001 +#define SA_FLAGS_FRAME_TRUNC 0x0002 +#define SA_FLAGS_BREAK_DET 0x0004 /* set conditionally by device driver, not hardware */ +#define SA_FLAGS_RCV_MASK 0xFFE6 + } rcv[NUM_RBUFFS]; +}; + +struct iCom_port { + int open_active_count; + struct tty_struct *tty; + unsigned long int event; + struct tq_struct tqueue; + int flags; + int xmit_fifo_size; + int baud_base; + wait_queue_head_t close_wait; + wait_queue_head_t open_wait; + wait_queue_head_t delta_msr_wait; + int blocked_open; + unsigned short close_delay; + unsigned short closing_wait; + unsigned long int timeout; + long session; /* Session of opening process */ + long pgrp; /* pgrp of opening process */ + unsigned char read_status_mask; + unsigned char ignore_status_mask; + struct async_icount icount; + struct termios normal_termios; + struct termios callout_termios; + unsigned long int int_reg; + struct iCom_regs *global_reg; + struct func_dram *dram; + int adapter; + int port; + struct statusArea *statStg; + dma_addr_t statStg_pci; + u32 *xmitRestart; + dma_addr_t xmitRestart_pci; + unsigned char *xmit_buf; + dma_addr_t xmit_buf_pci; + unsigned char *recv_buf; + dma_addr_t recv_buf_pci; + int next_rcv; + int put_length; + int status; +#define ICOM_PORT_ACTIVE 1 +#define ICOM_PORT_OFF 0 + unsigned long *trace_blk; +}; + +static struct iCom_adapter { + unsigned long int base_addr; + unsigned char irq_number; + struct pci_dev *pci_dev; + struct iCom_port port_info[4]; + int version; +#define ADAPTER_V1 0x0001 +#define ADAPTER_V2 0x0002 + unsigned long int subsystem_id; +#define FOUR_PORT_MODEL 0x02521014 + int numb_ports; +} *iCom_adapter_info; + + +static DECLARE_MUTEX(tmp_buf_sem); + +static spinlock_t iComlock; + +/* + Utility functions +*/ +static void return_port_memory(struct iCom_port *iCom_port_info); +static void iCom_wait_until_sent(struct tty_struct *tty, int timeout); +static void do_softint(void *); +static void iCom_start(struct tty_struct * tty); +static void iCom_flush_buffer(struct tty_struct * tty); +static void iCom_set_code(struct iCom_port *iCom_port_info); +#ifdef ICOM_TRACE +static void TRACE(struct iCom_port *,u32 , u32); +#else +#define TRACE(x,y,z) /* nub out calls to TRACE function */ +#endif + +#ifdef CONFIG_PPC64 +extern int register_ioctl32_conversion(unsigned int cmd, + int (*handler)(unsigned int, unsigned int, unsigned long, struct file *)); +extern int unregister_ioctl32_conversion(unsigned int cmd); +#else +static inline int register_ioctl32_conversion(unsigned int cmd, + int (*handler)(unsigned int, + unsigned int, unsigned long, struct file *)) +{ + return 0; +} +static inline int unregister_ioctl32_conversion(unsigned int cmd) +{ + return 0; +} +#endif + +static u8 iCom_readb(void *address) +{ + /* Issue a 'write memory barrier' prior to the mmio to ensure ordering */ + /* This translates to an eieio instruction on ppc */ + wmb(); + return readb(address); +} + +static void iCom_writeb(u8 value, void *address) +{ + /* Issue a 'memory barrier' prior to the mmio to ensure data is flushed + to memory. This translates to a sync instruction on ppc */ + mb(); + writeb(value, address); +} + +static u16 iCom_readw(void *address) +{ + /* Issue a 'write memory barrier' prior to the mmio to ensure ordering */ + /* This translates to an eieio instruction on ppc */ + wmb(); + return readw(address); +} + +static void iCom_writew(u16 value, void *address) +{ + /* Issue a 'memory barrier' prior to the mmio to ensure data is flushed + to memory. This translates to a sync instruction on ppc */ + mb(); + writew(value, address); +} + +static u32 iCom_readl(void *address) +{ + /* Issue a 'write memory barrier' prior to the mmio to ensure ordering */ + /* This translates to an eieio instruction on ppc */ + wmb(); + return readl(address); +} + +static void iCom_writel(u32 value, void *address) +{ + /* Issue a 'memory barrier' prior to the mmio to ensure data is flushed + to memory. This translates to a sync instruction on ppc */ + mb(); + writel(value, address); +} + +static int get_port_memory(struct iCom_port *iCom_port_info) +{ + int index; + int number_of_buffs; + unsigned long int stgAddr; + unsigned long int startStgAddr; + unsigned long int offset; + + TRACE(iCom_port_info,TRACE_GET_PORT_MEM,0); + + iCom_port_info->xmit_buf = (unsigned char *)kmalloc(4096,GFP_KERNEL | GFP_DMA); + iCom_port_info->xmit_buf_pci = pci_map_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + (void *)iCom_port_info->xmit_buf, + 4096, + PCI_DMA_BIDIRECTIONAL); + + if (!iCom_port_info->xmit_buf) { + printk("iCom: ERROR, Can not allocate Transmit buffer\n"); + return -ENOMEM; + } + TRACE(iCom_port_info,TRACE_GET_PORT_MEM,(unsigned long)iCom_port_info->xmit_buf); + + iCom_port_info->recv_buf = (unsigned char *)kmalloc(4096,GFP_KERNEL | GFP_DMA); + iCom_port_info->recv_buf_pci = pci_map_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + (void *)iCom_port_info->recv_buf, + 4096, + PCI_DMA_BIDIRECTIONAL); + + if (!iCom_port_info->recv_buf) { + printk("iCom: ERROR, Can not allocate Receive buffer\n"); + return_port_memory(iCom_port_info); + return -ENOMEM; + } + TRACE(iCom_port_info,TRACE_GET_PORT_MEM,(unsigned long)iCom_port_info->recv_buf); + + iCom_port_info->statStg = (struct statusArea *)kmalloc(4096,GFP_KERNEL | GFP_DMA); + iCom_port_info->statStg_pci = pci_map_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + (void *)iCom_port_info->statStg, + 4096, + PCI_DMA_BIDIRECTIONAL); + + if (!iCom_port_info->statStg) { + printk("iCom: ERROR, Can not allocate Status buffer\n"); + return_port_memory(iCom_port_info); + return -ENOMEM; + } + TRACE(iCom_port_info,TRACE_GET_PORT_MEM,(unsigned long)iCom_port_info->statStg); + + iCom_port_info->xmitRestart = (u32 *)kmalloc(sizeof(u32),GFP_KERNEL | GFP_DMA); + iCom_port_info->xmitRestart_pci = pci_map_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + iCom_port_info->xmitRestart, + 4, + PCI_DMA_BIDIRECTIONAL); + + if (!iCom_port_info->xmitRestart) { + printk("iCom: ERROR, Can not allocate xmit Restart buffer\n"); + return_port_memory(iCom_port_info); + return -ENOMEM; + } + + memset(iCom_port_info->statStg, 0,4096); + + /* FODs */ + number_of_buffs = NUM_XBUFFS; + stgAddr = (unsigned long int)iCom_port_info->statStg; + startStgAddr = stgAddr; + for (index = 0; index < number_of_buffs; index++) + { + TRACE(iCom_port_info,TRACE_FOD_ADDR,stgAddr); + stgAddr = stgAddr + sizeof(iCom_port_info->statStg->xmit[0]); + if (index < (number_of_buffs - 1)) + { + iCom_port_info->statStg->xmit[index].flags = 0; + iCom_port_info->statStg->xmit[index].leNext = 0; + iCom_port_info->statStg->xmit[index].leNextASD = 0; + iCom_port_info->statStg->xmit[index].leLengthASD = (unsigned short int)cpu_to_le16(XMIT_BUFF_SZ); + iCom_port_info->statStg->xmit[index].leOffsetASD = 0; + TRACE(iCom_port_info,TRACE_FOD_ADDR,stgAddr); + TRACE(iCom_port_info,TRACE_FOD_XBUFF,(unsigned long)iCom_port_info->xmit_buf); + iCom_port_info->statStg->xmit[index].leBuffer = cpu_to_le32(iCom_port_info->xmit_buf_pci); + } + else if (index == (number_of_buffs - 1)) + { + iCom_port_info->statStg->xmit[index].flags = 0; + iCom_port_info->statStg->xmit[index].leNext = 0; + iCom_port_info->statStg->xmit[index].leNextASD = 0; + iCom_port_info->statStg->xmit[index].leLengthASD = (unsigned short int)cpu_to_le16(XMIT_BUFF_SZ); + iCom_port_info->statStg->xmit[index].leOffsetASD = 0; + TRACE(iCom_port_info,TRACE_FOD_XBUFF,(unsigned long)iCom_port_info->xmit_buf); + iCom_port_info->statStg->xmit[index].leBuffer = cpu_to_le32(iCom_port_info->xmit_buf_pci); + } + else + { + iCom_port_info->statStg->xmit[index].flags = 0; + iCom_port_info->statStg->xmit[index].leNext = 0; + iCom_port_info->statStg->xmit[index].leNextASD = 0; + iCom_port_info->statStg->xmit[index].leLengthASD = 0; + iCom_port_info->statStg->xmit[index].leOffsetASD = 0; + iCom_port_info->statStg->xmit[index].leBuffer = 0; + } + } + /* FIDs */ + startStgAddr = stgAddr; + + /* fill in every entry, even if no buffer */ + number_of_buffs = NUM_RBUFFS; + for (index = 0; index < number_of_buffs; index++) + { + TRACE(iCom_port_info,TRACE_FID_ADDR,stgAddr); + stgAddr = stgAddr + sizeof(iCom_port_info->statStg->rcv[0]); + iCom_port_info->statStg->rcv[index].leLength = 0; + iCom_port_info->statStg->rcv[index].WorkingLength = (unsigned short int)cpu_to_le16(RCV_BUFF_SZ); + if (index < (number_of_buffs - 1)) + { + offset = stgAddr - (unsigned long)iCom_port_info->statStg; + iCom_port_info->statStg->rcv[index].leNext = (unsigned long)cpu_to_le32(iCom_port_info->statStg_pci + offset); + TRACE(iCom_port_info,TRACE_FID_RBUFF,(unsigned long)iCom_port_info->recv_buf); + iCom_port_info->statStg->rcv[index].leBuffer = cpu_to_le32(iCom_port_info->recv_buf_pci); + } + else if (index == (number_of_buffs - 1)) + { + offset = startStgAddr - (unsigned long)iCom_port_info->statStg; + iCom_port_info->statStg->rcv[index].leNext = (unsigned long)cpu_to_le32(iCom_port_info->statStg_pci + offset); + TRACE(iCom_port_info,TRACE_FID_RBUFF,(unsigned long)iCom_port_info->recv_buf + 2048); + iCom_port_info->statStg->rcv[index].leBuffer = cpu_to_le32(iCom_port_info->recv_buf_pci + 2048); + } + else + { + iCom_port_info->statStg->rcv[index].leNext = 0; + iCom_port_info->statStg->rcv[index].leBuffer = 0; + } + } + + return 0; +} + +static void return_port_memory(struct iCom_port *iCom_port_info) +{ + TRACE(iCom_port_info, TRACE_RET_PORT_MEM,0); + if (iCom_port_info->recv_buf) { + pci_unmap_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + iCom_port_info->recv_buf_pci, + 4096, + PCI_DMA_BIDIRECTIONAL); + kfree((void *)iCom_port_info->recv_buf); + iCom_port_info->recv_buf = 0; + } + if (iCom_port_info->xmit_buf) { + pci_unmap_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + iCom_port_info->xmit_buf_pci, + 4096, + PCI_DMA_BIDIRECTIONAL); + kfree((void *)iCom_port_info->xmit_buf); + iCom_port_info->xmit_buf = 0; + } + if (iCom_port_info->statStg) { + pci_unmap_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + iCom_port_info->statStg_pci, + 4096, + PCI_DMA_BIDIRECTIONAL); + kfree((void *)iCom_port_info->statStg); + iCom_port_info->statStg = 0; + } + + if (iCom_port_info->xmitRestart) { + pci_unmap_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev, + iCom_port_info->xmitRestart_pci, + 4, + PCI_DMA_BIDIRECTIONAL); + kfree(iCom_port_info->xmitRestart); + iCom_port_info->xmitRestart = 0; + } + TRACE(iCom_port_info,TRACE_RET_MEM,0); +} + +static void stop_processor(struct iCom_port *iCom_port_info) +{ + unsigned long temp; + + switch (iCom_port_info->port) { + case 0: + temp = iCom_readl(&iCom_port_info->global_reg->control); + temp = (temp & ~ICOM_CONTROL_START_A) | ICOM_CONTROL_STOP_A; + iCom_writel(temp,&iCom_port_info->global_reg->control); + TRACE(iCom_port_info,TRACE_STOP_PROC_A,0); + break; + case 1: + temp = iCom_readl(&iCom_port_info->global_reg->control); + temp = (temp & ~ICOM_CONTROL_START_B) | ICOM_CONTROL_STOP_B; + iCom_writel(temp,&iCom_port_info->global_reg->control); + TRACE(iCom_port_info,TRACE_STOP_PROC_B,0); + break; + case 2: + temp = iCom_readl(&iCom_port_info->global_reg->control_2); + temp = (temp & ~ICOM_CONTROL_START_C) | ICOM_CONTROL_STOP_C; + iCom_writel(temp,&iCom_port_info->global_reg->control_2); + TRACE(iCom_port_info,TRACE_STOP_PROC_C,0); + break; + case 3: + temp = iCom_readl(&iCom_port_info->global_reg->control_2); + temp = (temp & ~ICOM_CONTROL_START_D) | ICOM_CONTROL_STOP_D; + iCom_writel(temp,&iCom_port_info->global_reg->control_2); + TRACE(iCom_port_info,TRACE_STOP_PROC_D,0); + break; + default: + printk("iCom: ERROR: invalid port assignment\n"); + } +} + +static void start_processor(struct iCom_port *iCom_port_info) +{ + unsigned long temp; + + switch (iCom_port_info->port) { + case 0: + temp = iCom_readl(&iCom_port_info->global_reg->control); + temp = (temp & ~ICOM_CONTROL_STOP_A) | ICOM_CONTROL_START_A; + iCom_writel(temp,&iCom_port_info->global_reg->control); + TRACE(iCom_port_info,TRACE_START_PROC_A,0); + break; + case 1: + temp = iCom_readl(&iCom_port_info->global_reg->control); + temp = (temp & ~ICOM_CONTROL_STOP_B) | ICOM_CONTROL_START_B; + iCom_writel(temp,&iCom_port_info->global_reg->control); + TRACE(iCom_port_info,TRACE_START_PROC_B,0); + break; + case 2: + temp = iCom_readl(&iCom_port_info->global_reg->control_2); + temp = (temp & ~ICOM_CONTROL_STOP_C) | ICOM_CONTROL_START_C; + iCom_writel(temp,&iCom_port_info->global_reg->control_2); + TRACE(iCom_port_info,TRACE_START_PROC_C,0); + break; + case 3: + temp = iCom_readl(&iCom_port_info->global_reg->control_2); + temp = (temp & ~ICOM_CONTROL_STOP_D) | ICOM_CONTROL_START_D; + iCom_writel(temp,&iCom_port_info->global_reg->control_2); + TRACE(iCom_port_info,TRACE_START_PROC_D,0); + break; + default: + printk("iCom: ERROR: invalid port assignment\n"); + } +} + +/* + * irq = no lock + */ +static int loadCode (struct iCom_port *iCom_port_info) +{ + char *iram_ptr; + int index; + int status = 0; + char *dram_ptr = (char *)iCom_port_info->dram; + unsigned long int temp; + unsigned char *new_page; + + TRACE(iCom_port_info,TRACE_GET_MEM,0); /* this really gets memory for trace */ + TRACE(iCom_port_info,TRACE_LOAD_MEM,0); + + /* Clear out any pending interrupts */ + iCom_writew(0x3FFF,(void *)iCom_port_info->int_reg); + + TRACE(iCom_port_info,TRACE_CLEAR_INTERRUPTS,0); + + /* Stop processor */ + stop_processor(iCom_port_info); + + /* Zero out DRAM */ + for (index = 0; index < 512; index++) + { + iCom_writeb(0x00,&dram_ptr[index]); + } + + /* Load Call Setup into Adapter */ + iram_ptr = (char *)iCom_port_info->dram + ICOM_IRAM_OFFSET; + for (index = 0; index < sizeof(callSetup); index++) + { + iCom_writeb(callSetup[index],&iram_ptr[index]); + } + + /* Load Resident DCE portion of Adapter */ + iram_ptr = (char *) iCom_port_info->dram + ICOM_IRAM_OFFSET + + ICOM_DCE_IRAM_OFFSET; + + /* Load the RV dce code */ + for (index = 0; index < sizeof(resRVdce); index++) { + iCom_writeb(resRVdce[index],&iram_ptr[index]); + } + + /* Set Hardware level */ + if ((iCom_adapter_info[iCom_port_info->adapter].version | ADAPTER_V2) == ADAPTER_V2) { + iCom_writeb(V2_HARDWARE,&(iCom_port_info->dram->misc_flags)); + } + + /* Start the processor in Adapter */ + start_processor(iCom_port_info); + + /* Wait 0.1 Sec for simple Init to complete */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + /* + * Verify Code is running + */ + status = 0; + + iCom_writeb((HDLC_PPP_PURE_ASYNC | HDLC_FF_FILL),&(iCom_port_info->dram->HDLCConfigReg)); + iCom_writeb(0x04,&(iCom_port_info->dram->FlagFillIdleTimer)); /* 0.5 seconds */ + iCom_writeb(0x00,&(iCom_port_info->dram->CmdReg)); + iCom_writeb(0x10,&(iCom_port_info->dram->async_config3)); + iCom_writeb((ICOM_ACFG_DRIVE1 | ICOM_ACFG_NO_PARITY | ICOM_ACFG_8BPC | ICOM_ACFG_1STOP_BIT),&(iCom_port_info->dram->async_config2)); + + /*Set up data in iCom DRAM to indicate where personality + *code is located and its length. + */ + new_page = (unsigned char *)kmalloc(4096,GFP_KERNEL | GFP_DMA); + for (index = 0; index < sizeof(funcLoad); index++) { + new_page[index] = funcLoad[index]; + } + temp = pci_map_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev,new_page,4096,PCI_DMA_BIDIRECTIONAL); + + iCom_writeb((char)(sizeof(funcLoad)/16),&iCom_port_info->dram->mac_length); + iCom_writel(temp,&iCom_port_info->dram->mac_load_addr); + + /*Setting the syncReg to 0x80 causes adapter to start downloading + the personality code into adapter instruction RAM. + Once code is loaded, it will begin executing and, based on + information provided above, will start DMAing data from + shared memory to adapter DRAM. + */ + iCom_writeb(START_DOWNLOAD,&iCom_port_info->dram->sync); + + /* Wait 1 Sec for data download */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ); + pci_unmap_single(iCom_adapter_info[iCom_port_info->adapter].pci_dev,temp,4096,PCI_DMA_BIDIRECTIONAL); + kfree(new_page); + + if (status != 0) + { + /* Clear out any pending interrupts */ + iCom_writew(0x3FFF,(void *)iCom_port_info->int_reg); + + /* Turn off port */ + iCom_writeb(ICOM_DISABLE,&iCom_port_info->dram->disable); + } + + return status; +} + +/* + * This routine is called to set the port to match + * the specified baud rate for a serial port. + * irq = locked + */ +static void change_speed(struct iCom_port *iCom_port_info, + struct termios *old_termios, unsigned long flags) +{ + int baud; + unsigned cflag; + int bits; + char new_config2; + char new_config3; + char tmp_byte; + int index; + int rcv_buff,xmit_buff; + unsigned long int offset; + + TRACE(iCom_port_info,TRACE_CHANGE_SPEED | TRACE_TIME,jiffies); + + if (!iCom_port_info->tty || !iCom_port_info->tty->termios) + return; + cflag = iCom_port_info->tty->termios->c_cflag; + + new_config2 = ICOM_ACFG_DRIVE1; + + /* byte size and parity */ + switch (cflag & CSIZE) { + case CS5: /* 5 bits/char */ + new_config2 |= ICOM_ACFG_5BPC; + bits = 7; + break; + case CS6: /* 6 bits/char */ + new_config2 |= ICOM_ACFG_6BPC; + bits = 8; + break; + case CS7: /* 7 bits/char */ + new_config2 |= ICOM_ACFG_7BPC; + bits = 9; + break; + case CS8: /* 8 bits/char */ + new_config2 |= ICOM_ACFG_8BPC; + bits = 10; + break; + default: bits = 10; break; + } + if (cflag & CSTOPB) { + /* 2 stop bits */ + new_config2 |= ICOM_ACFG_2STOP_BIT; + bits++; + } + if (cflag & PARENB) { + /* parity bit enabled */ + new_config2 |= ICOM_ACFG_PARITY_ENAB; + TRACE(iCom_port_info, TRACE_PARENB,0); + bits++; + } + if (cflag & PARODD) { + /* odd parity */ + new_config2 |= ICOM_ACFG_PARITY_ODD; + TRACE(iCom_port_info, TRACE_PARODD,0); + } + + /* Determine divisor based on baud rate */ + baud = tty_get_baud_rate(iCom_port_info->tty); + if (!baud) + baud = 9600; /* B0 transition handled in rs_set_termios */ + + for (index = 0; index < BAUD_TABLE_LIMIT; index++) { + if (icom_acfg_baud[index] == baud) { + new_config3 = index; + break; + } + } + + iCom_port_info->timeout = XMIT_BUFF_SZ*HZ*bits/baud; + iCom_port_info->timeout += HZ/50; /* Add .02 seconds of slop */ + + /* CTS flow control flag and modem status interrupts */ + if (cflag & CRTSCTS) { + iCom_port_info->flags |= ASYNC_CTS_FLOW; + tmp_byte = iCom_readb(&(iCom_port_info->dram->HDLCConfigReg)); + tmp_byte |= HDLC_HDW_FLOW; + iCom_writeb(tmp_byte, &(iCom_port_info->dram->HDLCConfigReg)); + } else { + iCom_port_info->flags &= ~ASYNC_CTS_FLOW; + tmp_byte = iCom_readb(&(iCom_port_info->dram->HDLCConfigReg)); + tmp_byte &= ~HDLC_HDW_FLOW; + iCom_writeb(tmp_byte, &(iCom_port_info->dram->HDLCConfigReg)); + } + if (cflag & CLOCAL) + iCom_port_info->flags &= ~ASYNC_CHECK_CD; + else { + iCom_port_info->flags |= ASYNC_CHECK_CD; + } + + /* + * Set up parity check flag + */ + iCom_port_info->read_status_mask = SA_FLAGS_OVERRUN | SA_FL_RCV_DONE; + if (I_INPCK(iCom_port_info->tty)) + iCom_port_info->read_status_mask |= SA_FLAGS_FRAME_ERROR | SA_FLAGS_PARITY_ERROR; + + if (I_BRKINT(iCom_port_info->tty) || I_PARMRK(iCom_port_info->tty)) + iCom_port_info->read_status_mask |= SA_FLAGS_BREAK_DET; + + /* + * Characters to ignore + */ + iCom_port_info->ignore_status_mask = 0; + if (I_IGNPAR(iCom_port_info->tty)) + iCom_port_info->ignore_status_mask |= SA_FLAGS_PARITY_ERROR | SA_FLAGS_FRAME_ERROR; + if (I_IGNBRK(iCom_port_info->tty)) { + iCom_port_info->ignore_status_mask |= SA_FLAGS_BREAK_DET; + /* + * If we're ignore parity and break indicators, ignore + * overruns too. (For real raw support). + */ + if (I_IGNPAR(iCom_port_info->tty)) + iCom_port_info->ignore_status_mask |= SA_FLAGS_OVERRUN; + } + + /* + * !!! ignore all characters if CREAD is not set + */ + if ((cflag & CREAD) == 0) + iCom_port_info->ignore_status_mask |= SA_FL_RCV_DONE; + + /* Turn off Receiver to prepare for reset */ + iCom_writeb(CMD_RCV_DISABLE,&iCom_port_info->dram->CmdReg); + + spin_unlock_irqrestore(&iComlock,flags); + for (index = 0; index < 10; index++) { + /* Wait 0.1 Sec for receive operations to complete*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + if (iCom_readb(&iCom_port_info->dram->PrevCmdReg) == 0x00) { + break; + } + } + spin_lock_irqsave(&iComlock, flags); + + /* clear all current buffers of data */ + for (rcv_buff = 0; rcv_buff < NUM_RBUFFS; rcv_buff++) { + iCom_port_info->statStg->rcv[rcv_buff].flags = 0; + iCom_port_info->statStg->rcv[rcv_buff].leLength = 0; + iCom_port_info->statStg->rcv[rcv_buff].WorkingLength = (unsigned short int)cpu_to_le16(RCV_BUFF_SZ); + } + + for (xmit_buff = 0; xmit_buff < NUM_XBUFFS; xmit_buff++) { + iCom_port_info->statStg->xmit[xmit_buff].flags = 0; + } + + /* activate changes and start xmit and receiver here */ + /* Enable the receiver */ + iCom_writeb(new_config3,&(iCom_port_info->dram->async_config3)); + iCom_writeb(new_config2,&(iCom_port_info->dram->async_config2)); + tmp_byte = iCom_readb(&(iCom_port_info->dram->HDLCConfigReg)); + tmp_byte |= HDLC_PPP_PURE_ASYNC | HDLC_FF_FILL; + iCom_writeb(tmp_byte,&(iCom_port_info->dram->HDLCConfigReg)); + iCom_writeb(0x04, &(iCom_port_info->dram->FlagFillIdleTimer)); /* 0.5 seconds */ + iCom_writeb(0xFF, &(iCom_port_info->dram->ier)); /* enable modem signal interrupts */ + + /* reset processor */ + iCom_writeb(CMD_RESTART,&iCom_port_info->dram->CmdReg); + spin_unlock_irqrestore(&iComlock, flags); + for (index = 0; index < 10; index++) { + /* Wait for reset operation */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + if (iCom_readb(&iCom_port_info->dram->CmdReg) == 0x00) { + break; + } + } + spin_lock_irqsave(&iComlock, flags); + + /* Enable Transmitter and Reciever */ + offset = (unsigned long int)&iCom_port_info->statStg->rcv[0] - (unsigned long int)iCom_port_info->statStg; + iCom_writel(iCom_port_info->statStg_pci + offset,&iCom_port_info->dram->RcvStatusAddr); + iCom_port_info->next_rcv = 0; + iCom_port_info->put_length = 0; + *iCom_port_info->xmitRestart = 0; + iCom_writel(iCom_port_info->xmitRestart_pci,&iCom_port_info->dram->XmitStatusAddr); + TRACE(iCom_port_info,TRACE_XR_ENAB,0); + iCom_writeb(CMD_XMIT_RCV_ENABLE,&iCom_port_info->dram->CmdReg); +} + +static int block_til_ready(struct tty_struct *tty, struct file * filp, + struct iCom_port *iCom_port_info, long int flags) +{ + DECLARE_WAITQUEUE(wait, current); + int retval; + int do_clocal = 0, extra_count = 0; + + /* + * If the device is in the middle of being closed, then block + * until it's done, and then try again. + */ + if (tty_hung_up_p(filp) || + (iCom_port_info->flags & ASYNC_CLOSING)) { + if (iCom_port_info->flags & ASYNC_CLOSING) { + spin_unlock_irqrestore(&iComlock,flags); + interruptible_sleep_on(&iCom_port_info->close_wait); + spin_lock_irqsave(&iComlock,flags); + } +#ifdef SERIAL_DO_RESTART + return ((iCom_port_info->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS); +#else + return -EAGAIN; +#endif + } + + /* + * If this is a callout device, then just make sure the normal + * device isn't being used. + */ + if (tty->driver.subtype == SERIAL_TYPE_CALLOUT) { + if (iCom_port_info->flags & ASYNC_NORMAL_ACTIVE) + return -EBUSY; + if ((iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) && + (iCom_port_info->flags & ASYNC_SESSION_LOCKOUT) && + (iCom_port_info->session != current->session)) + return -EBUSY; + if ((iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) && + (iCom_port_info->flags & ASYNC_PGRP_LOCKOUT) && + (iCom_port_info->pgrp != current->pgrp)) + return -EBUSY; + iCom_port_info->flags |= ASYNC_CALLOUT_ACTIVE; + return 0; + } + + /* + * If non-blocking mode is set, or the port is not enabled, + * then make the check up front and then exit. + */ + if ((filp->f_flags & O_NONBLOCK) || + (tty->flags & (1 << TTY_IO_ERROR))) { + if (iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) + return -EBUSY; + iCom_port_info->flags |= ASYNC_NORMAL_ACTIVE; + return 0; + } + + if (iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) { + if (iCom_port_info->normal_termios.c_cflag & CLOCAL) + do_clocal = 1; + } else { + if (tty->termios->c_cflag & CLOCAL) + do_clocal = 1; + } + + /* + * Block waiting for the carrier detect and the line to become + * free (i.e., not in use by the callout). While we are in + * this loop, open_active_count is dropped by one, so that + * rs_close() knows when to free things. We restore it upon + * exit, either normal or abnormal. + */ + retval = 0; + add_wait_queue(&iCom_port_info->open_wait, &wait); + + if (!tty_hung_up_p(filp)) { + extra_count = 1; + iCom_port_info->open_active_count--; + } + iCom_port_info->blocked_open++; + while (1) { + if (!(iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) && + (tty->termios->c_cflag & CBAUD)) { + /* raise DTR and RTS */ + TRACE(iCom_port_info,TRACE_RAISE_DTR_RTS,0); + iCom_writeb(0xC0,&iCom_port_info->dram->osr); + } + current->state = TASK_INTERRUPTIBLE; + if (tty_hung_up_p(filp) || + !(iCom_port_info->flags & ASYNC_INITIALIZED)) { +#ifdef SERIAL_DO_RESTART + if (iCom_port_info->flags & ASYNC_HUP_NOTIFY) + retval = -EAGAIN; + else + retval = -ERESTARTSYS; +#else + retval = -EAGAIN; +#endif + break; + } + + if (!(iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) && + !(iCom_port_info->flags & ASYNC_CLOSING) && + (do_clocal || (iCom_readb(&iCom_port_info->dram->isr) & 0x20))) /* 0x20 = Carrier Detect */ + break; + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + spin_unlock_irqrestore(&iComlock,flags); + printk("iCom: WAIT for CD\n"); + schedule(); + spin_lock_irqsave(&iComlock,flags); + } + current->state = TASK_RUNNING; + remove_wait_queue(&iCom_port_info->open_wait, &wait); + if (extra_count) + iCom_port_info->open_active_count++; + iCom_port_info->blocked_open--; + + if (retval) + return retval; + iCom_port_info->flags |= ASYNC_NORMAL_ACTIVE; + + return 0; +} + +static int startup(struct iCom_port *iCom_port_info, unsigned long flags) +{ + int retval=0; + unsigned long int temp; + unsigned char cable_id; + + TRACE(iCom_port_info,TRACE_STARTUP,0); + + if (iCom_port_info->flags & ASYNC_INITIALIZED) { + goto errout; + } + + /* + * check Cable ID + */ + cable_id = iCom_readb(&iCom_port_info->dram->cable_id); + TRACE(iCom_port_info,TRACE_CABLE_ID,cable_id); + if (cable_id & ICOM_CABLE_ID_VALID) + { + /* Get cable ID into the lower 4 bits (standard form) */ + cable_id = (cable_id & ICOM_CABLE_ID_MASK) >> 4; + + /* Check Cable ID valid */ + if ((cable_id == RS232_CABLE) || (cable_id == V24_CABLE) || + (cable_id == V35_CABLE) || (cable_id == V36_CABLE)) + { + ; + } + } + + /* + * set appropriate modem signals + */ + if (iCom_port_info->tty->termios->c_cflag & CBAUD) { + /* raise DTR and RTS */ + TRACE(iCom_port_info,TRACE_RAISE_DTR_RTS,0); + iCom_writeb(0xC0,&iCom_port_info->dram->osr); + } + + /* + * Finally, clear and enable interrupts + */ + switch (iCom_port_info->port) { + case 0: + /* Clear out any pending interrupts */ + iCom_writew(0x00FF,(void *)iCom_port_info->int_reg); + + /* Enable interrupts for first port */ + TRACE(iCom_port_info,TRACE_ENABLE_INTERRUPTS_PA,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask); + iCom_writel((temp & ~ICOM_INT_MASK_PRC_A),&iCom_port_info->global_reg->int_mask); + break; + case 1: + /* Clear out any pending interrupts */ + iCom_writew(0x3F00,(void *)iCom_port_info->int_reg); + + /* Enable interrupts for second port */ + TRACE(iCom_port_info,TRACE_ENABLE_INTERRUPTS_PB,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask); + iCom_writel((temp & ~ICOM_INT_MASK_PRC_B),&iCom_port_info->global_reg->int_mask); + break; + case 2: + /* Clear out any pending interrupts */ + iCom_writew(0x00FF,(void *)iCom_port_info->int_reg); + + /* Enable interrupts for first port */ + TRACE(iCom_port_info,TRACE_ENABLE_INTERRUPTS_PC,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask_2); + iCom_writel((temp & ~ICOM_INT_MASK_PRC_C),&iCom_port_info->global_reg->int_mask_2); + break; + case 3: + /* Clear out any pending interrupts */ + iCom_writew(0x3F00,(void *)iCom_port_info->int_reg); + + /* Enable interrupts for second port */ + TRACE(iCom_port_info,TRACE_ENABLE_INTERRUPTS_PD,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask_2); + iCom_writel((temp & ~ICOM_INT_MASK_PRC_D),&iCom_port_info->global_reg->int_mask_2); + break; + default: + printk("iCom: ERROR: Invalid port defined\n"); + } + + if (iCom_port_info->tty) + clear_bit(TTY_IO_ERROR, &iCom_port_info->tty->flags); + + /* + * Set up the tty->alt_speed kludge + */ + if (iCom_port_info->tty) { + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) + iCom_port_info->tty->alt_speed = 57600; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) + iCom_port_info->tty->alt_speed = 115200; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) + iCom_port_info->tty->alt_speed = 230400; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) + iCom_port_info->tty->alt_speed = 460800; + } + + /* + * and set the speed of the serial port + */ + change_speed(iCom_port_info, 0, flags); + + iCom_port_info->flags |= ASYNC_INITIALIZED; + return 0; + + errout: + return retval; +} + +/* + * This routine will shutdown a serial port; interrupts are disabled, and + * DTR is dropped if the hangup on close termio flag is on. + * + * irq = locked + */ +static void shutdown(struct iCom_port * iCom_port_info) +{ + unsigned long int temp; + unsigned char cmdReg; + + TRACE(iCom_port_info,TRACE_SHUTDOWN | TRACE_TIME,jiffies); + + if (!(iCom_port_info->flags & ASYNC_INITIALIZED)) + return; + + /* + * clear delta_msr_wait queue to avoid mem leaks: we may free the irq + * here so the queue might never be waken up + */ + wake_up_interruptible(&iCom_port_info->delta_msr_wait); + + /* + * disable all interrupts + */ + switch (iCom_port_info->port) { + case 0: + TRACE(iCom_port_info,TRACE_DIS_INTERRUPTS_PA,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask); + iCom_writel((temp | ICOM_INT_MASK_PRC_A),&iCom_port_info->global_reg->int_mask); + break; + case 1: + TRACE(iCom_port_info,TRACE_DIS_INTERRUPTS_PB,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask); + iCom_writel((temp | ICOM_INT_MASK_PRC_B),&iCom_port_info->global_reg->int_mask); + break; + case 2: + TRACE(iCom_port_info,TRACE_DIS_INTERRUPTS_PC,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask_2); + iCom_writel((temp | ICOM_INT_MASK_PRC_C),&iCom_port_info->global_reg->int_mask_2); + break; + case 3: + TRACE(iCom_port_info,TRACE_DIS_INTERRUPTS_PD,0); + temp = iCom_readl(&iCom_port_info->global_reg->int_mask_2); + iCom_writel((temp | ICOM_INT_MASK_PRC_D),&iCom_port_info->global_reg->int_mask_2); + break; + default: + printk("iCom: ERROR: Invalid port assignment\n"); + } + + /* + * disable break condition + */ + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + if ((cmdReg | CMD_SND_BREAK) == CMD_SND_BREAK) { + iCom_writeb(cmdReg & ~CMD_SND_BREAK,&iCom_port_info->dram->CmdReg); + } + + if (!iCom_port_info->tty || (iCom_port_info->tty->termios->c_cflag & HUPCL)) { + /* drop DTR and RTS */ + TRACE(iCom_port_info,TRACE_DROP_DTR_RTS,0); + iCom_writeb(0x00,&iCom_port_info->dram->osr); + } + + if (iCom_port_info->tty) + set_bit(TTY_IO_ERROR, &iCom_port_info->tty->flags); + + iCom_port_info->flags &= ~ASYNC_INITIALIZED; +} + +/* + Primary interface routines to iCom Driver +*/ +static int iCom_open(struct tty_struct * tty, struct file * filp) +{ + int line; + int adapter_entry; + int port_entry; + struct iCom_port *iCom_port_info; + int retval; + unsigned long flags; + + /* + Minor Number + _ _ _ _ b (lower nibble) + ___ ___ + | | + | - port number (lowest 2 bits is port identifier) + - adapter number (remaining higher order bits identify adapter #) + */ + + MOD_INC_USE_COUNT; + line = MINOR(tty->device) - tty->driver.minor_start; + if ((line < 0) || (line >= NR_PORTS)) { + MOD_DEC_USE_COUNT; + return -ENODEV; + } + + adapter_entry = (line & 0xFFFE) >> 2; /* shift adapter # into position */ + port_entry = line & 0x0003; /* mask of port number */ + + if ((port_entry == 1) && + (iCom_adapter_info[adapter_entry].version == ADAPTER_V2) && + (iCom_adapter_info[adapter_entry].subsystem_id != FOUR_PORT_MODEL)) { + port_entry = 2; + } + iCom_port_info = &iCom_adapter_info[adapter_entry].port_info[port_entry]; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_DEVICE_NUMB,tty->device); + tty->driver_data = iCom_port_info; + iCom_port_info->tty = tty; + iCom_port_info->open_active_count++; + + /* + * If the port is the middle of closing, bail out now + */ + if (tty_hung_up_p(filp) || + (iCom_port_info->flags & ASYNC_CLOSING)) { + + spin_unlock_irqrestore(&iComlock,flags); + if (iCom_port_info->flags & ASYNC_CLOSING) + interruptible_sleep_on(&iCom_port_info->close_wait); +#ifdef SERIAL_DO_RESTART + return ((iCom_port_info->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS); +#else + return -EAGAIN; +#endif + } + + /* + * Start up serial port + */ + retval = startup(iCom_port_info, flags); + if (retval) { + /* reset open variables */ + TRACE(iCom_port_info,TRACE_STARTUP_ERROR,0); + spin_unlock_irqrestore(&iComlock,flags); + return retval; + } + + retval = block_til_ready(tty, filp, iCom_port_info, flags); + if (retval) { + spin_unlock_irqrestore(&iComlock,flags); + return retval; + } + + if ((iCom_port_info->open_active_count == 1) && + (iCom_port_info->flags & ASYNC_SPLIT_TERMIOS)) { + if (tty->driver.subtype == SERIAL_TYPE_NORMAL) + *tty->termios = iCom_port_info->normal_termios; + else + *tty->termios = iCom_port_info->callout_termios; + change_speed(iCom_port_info, 0, flags); + } + + iCom_port_info->session = current->session; + iCom_port_info->pgrp = current->pgrp; + + spin_unlock_irqrestore(&iComlock,flags); + return 0; +} + +/* + * ------------------------------------------------------------ + * iCom_close() + * + * This routine is called when the serial port gets closed. First, we + * wait for the last remaining data to be sent. Then, we unlink its + * async structure from the interrupt chain if necessary. + * ------------------------------------------------------------ + */ +static void iCom_close(struct tty_struct * tty, struct file * filp) +{ + struct iCom_port *iCom_port_info; + unsigned long flags; + unsigned char cmdReg; + + + if (!tty) { + printk("iCom: iCom_close - no tty\n"); + return; + } + + iCom_port_info = (struct iCom_port *)tty->driver_data; + if (!iCom_port_info) { + printk("iCom: iCom_close - no tty->driver_data\n"); + return; + } + + TRACE(iCom_port_info,TRACE_CLOSE,0); + spin_lock_irqsave(&iComlock,flags); + + if (tty_hung_up_p(filp)) { + TRACE(iCom_port_info,TRACE_CLOSE_HANGUP,0); + MOD_DEC_USE_COUNT; + spin_unlock_irqrestore(&iComlock,flags); + return; + } + + if ((tty->count == 1) && (iCom_port_info->open_active_count != 1)) { + /* + * Uh, oh. tty->count is 1, which means that the tty + * structure will be freed. open_active_count should always + * be one in these conditions. If it's greater than + * one, we've got real problems, since it means the + * serial port won't be shutdown. + */ + iCom_port_info->open_active_count = 1; + } + + if (--iCom_port_info->open_active_count < 0) { + iCom_port_info->open_active_count = 0; + } + + if (iCom_port_info->open_active_count) { + TRACE(iCom_port_info,TRACE_OPEN_ACTIVE,0); + MOD_DEC_USE_COUNT; + spin_unlock_irqrestore(&iComlock,flags); + return; + } + iCom_port_info->flags |= ASYNC_CLOSING; + + /* + * Save the termios structure, since this port may have + * separate termios for callout and dialin. + */ + if (iCom_port_info->flags & ASYNC_NORMAL_ACTIVE) + iCom_port_info->normal_termios = *tty->termios; + if (iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) + iCom_port_info->callout_termios = *tty->termios; + + /* + * Now we wait for the transmit buffer to clear; and we notify + * the line discipline to only process XON/XOFF characters. + */ + tty->closing = 1; + if (iCom_port_info->closing_wait != ASYNC_CLOSING_WAIT_NONE) { + spin_unlock_irqrestore(&iComlock,flags); + tty_wait_until_sent(tty, iCom_port_info->closing_wait); + spin_lock_irqsave(&iComlock,flags); + } + + /* + * At this point we stop accepting input. To do this, we + * disable the receive line status interrupts, and tell the + * interrupt driver to stop checking the data ready bit in the + * line status register. + */ + if (iCom_port_info->flags & ASYNC_INITIALIZED) { + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg & (unsigned char)~CMD_RCV_ENABLE,&iCom_port_info->dram->CmdReg); + + /* + * Before we drop DTR, make sure the UART transmitter + * has completely drained; this is especially + * important if there is a transmit FIFO! + */ + spin_unlock_irqrestore(&iComlock,flags); + iCom_wait_until_sent(tty, iCom_port_info->timeout); + spin_lock_irqsave(&iComlock,flags); + } + + shutdown(iCom_port_info); + + spin_unlock_irqrestore(&iComlock,flags); + if (tty->driver.flush_buffer) + tty->driver.flush_buffer(tty); + if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + spin_lock_irqsave(&iComlock,flags); + tty->closing = 0; + iCom_port_info->event = 0; + iCom_port_info->tty = 0; + + if (iCom_port_info->blocked_open) { + if (iCom_port_info->close_delay) { + current->state = TASK_INTERRUPTIBLE; + spin_unlock_irqrestore(&iComlock,flags); + schedule_timeout(iCom_port_info->close_delay); + spin_lock_irqsave(&iComlock,flags); + } + wake_up_interruptible(&iCom_port_info->open_wait); + } + iCom_port_info->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CALLOUT_ACTIVE| + ASYNC_CLOSING); + + wake_up_interruptible(&iCom_port_info->close_wait); + + MOD_DEC_USE_COUNT; + spin_unlock_irqrestore(&iComlock,flags); +} + +static int iCom_write(struct tty_struct * tty, int from_user, + const unsigned char * buf, int count) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned long data_count = count; + unsigned char *data; + unsigned char cmdReg; + unsigned long int offset; + unsigned long int flags; + + + if (!tty) { + printk("iCom: iCom_write - no tty\n"); + return 0; + } + + spin_lock_irqsave(&iComlock,flags); + + iCom_port_info = (struct iCom_port *)tty->driver_data; + TRACE(iCom_port_info,TRACE_WRITE | TRACE_TIME,jiffies); + + down(&tmp_buf_sem); + + if (cpu_to_le16(iCom_port_info->statStg->xmit[0].flags) & SA_FLAGS_READY_TO_XMIT) { + TRACE(iCom_port_info,TRACE_WRITE_FULL,0); + up(&tmp_buf_sem); + spin_unlock_irqrestore(&iComlock,flags); + return 0; + } + + if (data_count > XMIT_BUFF_SZ) + data_count = XMIT_BUFF_SZ; + + if (from_user) { + data_count -= copy_from_user(iCom_port_info->xmit_buf, buf, data_count); + if (!data_count) { + TRACE(iCom_port_info,TRACE_WRITE_NODATA,0); + up(&tmp_buf_sem); + spin_unlock_irqrestore(&iComlock,flags); + return -EFAULT; + } + } else { + memcpy(iCom_port_info->xmit_buf, buf, data_count); + } + + data = iCom_port_info->xmit_buf; + + if (data_count) { + iCom_port_info->statStg->xmit[0].flags = (unsigned short int)cpu_to_le16(SA_FLAGS_READY_TO_XMIT); + iCom_port_info->statStg->xmit[0].leLength = (unsigned short int)cpu_to_le16(data_count); + offset = (unsigned long int)&iCom_port_info->statStg->xmit[0] - (unsigned long int)iCom_port_info->statStg; + *iCom_port_info->xmitRestart = cpu_to_le32(iCom_port_info->statStg_pci + offset); + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg | CMD_XMIT_RCV_ENABLE,&iCom_port_info->dram->CmdReg); + iCom_writeb(START_XMIT,&iCom_port_info->dram->StartXmitCmd); + TRACE(iCom_port_info,TRACE_WRITE_START,data_count); + } + + up(&tmp_buf_sem); + spin_unlock_irqrestore(&iComlock,flags); + + return data_count; +} + +static void iCom_put_char(struct tty_struct * tty, unsigned char ch) +{ + /* iCom_put_char adds the character to the current buffer, the + * data is not actually sent until iCom_flush_chars is called. + * Per definition iCom_flush_chars MUST be called after + * iCom_put_char + */ + + unsigned char *data; + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned long int flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_PUT_CHAR, ch); + + down(&tmp_buf_sem); + + if (cpu_to_le16(iCom_port_info->statStg->xmit[0].flags) & SA_FLAGS_READY_TO_XMIT) { + TRACE(iCom_port_info,TRACE_PUT_FULL,0); + up(&tmp_buf_sem); + spin_unlock_irqrestore(&iComlock,flags); + return; + } + + data = iCom_port_info->xmit_buf; + data[iCom_port_info->put_length] = ch; + + if (!tty->stopped && !tty->hw_stopped) { + iCom_port_info->put_length++; + } + + up(&tmp_buf_sem); + spin_unlock_irqrestore(&iComlock,flags); +} + +static void iCom_flush_chars(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char cmdReg; + unsigned long int offset; + unsigned long int flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_FLUSH_CHAR | TRACE_TIME,jiffies); + if (iCom_port_info->put_length) { + TRACE(iCom_port_info,TRACE_START_FLUSH,iCom_port_info->put_length); + iCom_port_info->statStg->xmit[0].flags = (unsigned short int)cpu_to_le16(SA_FLAGS_READY_TO_XMIT); + iCom_port_info->statStg->xmit[0].leLength = (unsigned short int)cpu_to_le16(iCom_port_info->put_length); + offset = (unsigned long int)&iCom_port_info->statStg->xmit[0] - (unsigned long int)iCom_port_info->statStg; + *iCom_port_info->xmitRestart = cpu_to_le32(iCom_port_info->statStg_pci + offset); + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg | CMD_XMIT_RCV_ENABLE,&iCom_port_info->dram->CmdReg); + iCom_writeb(START_XMIT,&iCom_port_info->dram->StartXmitCmd); + } + iCom_port_info->put_length = 0; + spin_unlock_irqrestore(&iComlock,flags); +} + +static int iCom_write_room(struct tty_struct * tty) +{ + int bytes_avail; + struct iCom_port *iCom_port_info = tty->driver_data; + + if (cpu_to_le16(iCom_port_info->statStg->xmit[0].flags) & SA_FLAGS_READY_TO_XMIT) + bytes_avail = 0; + else + bytes_avail = XMIT_BUFF_SZ; + + TRACE(iCom_port_info,TRACE_WRITE_ROOM,bytes_avail); + return bytes_avail; +} + +static int iCom_chars_in_buffer(struct tty_struct * tty) +{ + unsigned long int dram; + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + int number_remaining = 0; + + TRACE(iCom_port_info,TRACE_CHARS_IN_BUFF,0); + if (cpu_to_le16(iCom_port_info->statStg->xmit[0].flags) & SA_FLAGS_READY_TO_XMIT) { + dram = (unsigned long int)iCom_port_info->dram; + number_remaining = iCom_readw((void *)(dram + 0x168)); + TRACE(iCom_port_info,TRACE_CHARS_REMAIN,number_remaining); + } + return number_remaining; +} + +static int get_modem_info(struct iCom_port * iCom_port_info, unsigned int *value) +{ + unsigned char status,control; + unsigned int result; + + TRACE(iCom_port_info,TRACE_GET_MODEM,0); + + status = iCom_readb(&iCom_port_info->dram->isr); + control = iCom_readb(&iCom_port_info->dram->osr); + + result = ((control & 0x40) ? TIOCM_RTS : 0) + | ((control & DTR) ? TIOCM_DTR : 0) + | ((status & DCD) ? TIOCM_CAR : 0) + | ((status & RI ) ? TIOCM_RNG : 0) + | ((status & DSR) ? TIOCM_DSR : 0) + | ((status & CTS) ? TIOCM_CTS : 0); + return put_user(result,value); +} + +static int set_modem_info(struct iCom_port * iCom_port_info, unsigned int cmd, + unsigned int *value) +{ + int error; + unsigned int arg; + unsigned char local_osr; + + TRACE(iCom_port_info,TRACE_SET_MODEM,0); + local_osr = iCom_readb(&iCom_port_info->dram->osr); + + error = get_user(arg, value); + if (error) + return error; + switch (cmd) { + case TIOCMBIS: + if (arg & TIOCM_RTS) { + TRACE(iCom_port_info,TRACE_RAISE_RTS,0); + local_osr |= RTS; + } + if (arg & TIOCM_DTR) { + TRACE(iCom_port_info,TRACE_RAISE_DTR,0); + local_osr |= DTR; + } + break; + case TIOCMBIC: + if (arg & TIOCM_RTS) { + TRACE(iCom_port_info,TRACE_LOWER_RTS,0); + local_osr &= ~RTS; + } + if (arg & TIOCM_DTR) { + TRACE(iCom_port_info,TRACE_LOWER_DTR,0); + local_osr &= ~DTR; + } + break; + case TIOCMSET: + local_osr = ((local_osr & ~(RTS | DTR)) + | ((arg & TIOCM_RTS) ? RTS : 0) + | ((arg & TIOCM_DTR) ? DTR : 0)); + break; + default: + return -EINVAL; + } + + iCom_writeb(local_osr,&iCom_port_info->dram->osr); + return 0; +} + +static int get_serial_info(struct iCom_port * iCom_port_info, + struct serial_struct * retinfo) +{ + struct serial_struct tmp; + + TRACE(iCom_port_info,TRACE_GET_SERIAL,0); + + if (!retinfo) + return -EFAULT; + memset(&tmp, 0, sizeof(tmp)); + tmp.type = 0x00; /* device specific, PORT_UNKNOWN */ + tmp.line = iCom_port_info->adapter; /* adapter number */ + tmp.port = iCom_port_info->port; /* port number on adapter */ + tmp.irq = iCom_adapter_info[iCom_port_info->adapter].irq_number; + tmp.flags = iCom_port_info->flags; + tmp.xmit_fifo_size = XMIT_BUFF_SZ; + tmp.baud_base = 0x00; /* device specific */ + tmp.close_delay = iCom_port_info->close_delay; + tmp.closing_wait = iCom_port_info->closing_wait; + tmp.custom_divisor = 0x00; /* device specific */ + tmp.hub6 = 0x00; /* device specific */ + if (copy_to_user(retinfo,&tmp,sizeof(*retinfo))) + return -EFAULT; + return 0; +} + +static int set_serial_info(struct iCom_port * iCom_port_info, + struct serial_struct * new_info) +{ + struct serial_struct new_serial; + int old_flags; + int retval = 0; + unsigned long flags; + + TRACE(iCom_port_info,TRACE_SET_SERIAL,0); + + if (copy_from_user(&new_serial,new_info,sizeof(new_serial))) + return -EFAULT; + + old_flags = iCom_port_info->flags; + /* new_serial.irq --- irq of adapter will not change, PCI only */ + /* new_serial.xmit_fifo_size -- can not change on this device */ + /* new_serial.baud_base -- ??? */ + /* new_serial.custom_divisor -- device specific */ + /* new_serial.hub6 -- device specific */ + /* new_serial.type -- device specific */ + /* new_serial.port -- address of port will not change, PCI only */ + + if (!capable(CAP_SYS_ADMIN)) { + if ((new_serial.baud_base != iCom_port_info->baud_base) || + (new_serial.close_delay != iCom_port_info->close_delay) || + ((new_serial.flags & ~ASYNC_USR_MASK) != + (iCom_port_info->flags & ~ASYNC_USR_MASK))) + return -EPERM; + iCom_port_info->flags = ((iCom_port_info->flags & ~ASYNC_USR_MASK) | + (new_serial.flags & ASYNC_USR_MASK)); + goto check_and_exit; + } + + if (new_serial.baud_base < 9600) { + return -EINVAL; + } + + /* + * OK, past this point, all the error checking has been done. + * At this point, we start making changes..... + */ + iCom_port_info->baud_base = new_serial.baud_base; + iCom_port_info->flags = ((iCom_port_info->flags & ~ASYNC_FLAGS) | + (new_serial.flags & ASYNC_FLAGS)); + iCom_port_info->close_delay = new_serial.close_delay * HZ/100; + iCom_port_info->closing_wait = new_serial.closing_wait * HZ/100; + iCom_port_info->tty->low_latency = (iCom_port_info->flags & ASYNC_LOW_LATENCY) ? 1 : 0; + + check_and_exit: + spin_lock_irqsave(&iComlock,flags); + if (iCom_port_info->flags & ASYNC_INITIALIZED) { + if (((iCom_port_info->flags & ASYNC_SPD_MASK) != + (old_flags & ASYNC_SPD_MASK))) { + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) + iCom_port_info->tty->alt_speed = 57600; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) + iCom_port_info->tty->alt_speed = 115200; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) + iCom_port_info->tty->alt_speed = 230400; + if ((iCom_port_info->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) + iCom_port_info->tty->alt_speed = 460800; + change_speed(iCom_port_info, 0, flags); + } + } else + retval = startup(iCom_port_info, flags); + + spin_unlock_irqrestore(&iComlock,flags); + + return retval; +} + +/* + * get_lsr_info - get line status register info + * + * Purpose: Let user call ioctl() to get info when the UART physically + * is emptied. On bus types like RS485, the transmitter must + * release the bus after transmitting. This must be done when + * the transmit shift register is empty, not be done when the + * transmit holding register is empty. This functionality + * allows an RS485 driver to be written in user space. + */ +static int get_lsr_info(struct iCom_port * info, unsigned int *value) +{ + unsigned char status; + unsigned int result; + + TRACE(info,TRACE_SET_LSR,0); + + status = cpu_to_le16(info->statStg->xmit[0].flags); + result = ((status & SA_FLAGS_DONE) ? TIOCSER_TEMT : 0); + return put_user(result,value); +} + +static int iCom_ioctl(struct tty_struct * tty, struct file * filp, + unsigned int cmd, unsigned long arg) +{ + int error; + struct iCom_port * iCom_port_info = (struct iCom_port *)tty->driver_data; + struct async_icount cprev, cnow; /* kernel counter temps */ + struct serial_icounter_struct *p_cuser; /* user space */ + unsigned long flags; + + TRACE(iCom_port_info,TRACE_IOCTL | TRACE_TIME,jiffies); + if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && + (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && + (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + } + + switch (cmd) { + case 0x4300: + if (copy_to_user((void *)arg,iCom_port_info->trace_blk,TRACE_BLK_SZ)) + return -EFAULT; + return 0; + case TIOCMGET: + return get_modem_info(iCom_port_info, (unsigned int *) arg); + case TIOCMBIS: + case TIOCMBIC: + case TIOCMSET: + return set_modem_info(iCom_port_info, cmd, (unsigned int *) arg); + case TIOCGSERIAL: + return get_serial_info(iCom_port_info, + (struct serial_struct *) arg); + case TIOCSSERIAL: + return set_serial_info(iCom_port_info, + (struct serial_struct *) arg); + + case TIOCSERGETLSR: /* Get line status register */ + return get_lsr_info(iCom_port_info, (unsigned int *) arg); + + /* + * Wait for any of the 4 modem inputs (DCD,RI,DSR,CTS) to change + * - mask passed in arg for lines of interest + * (use |'ed TIOCM_RNG/DSR/CD/CTS for masking) + * Caller should use TIOCGICOUNT to see which one it was + */ + case TIOCMIWAIT: + spin_lock_irqsave(&iComlock,flags); + /* note the counters on entry */ + cprev = iCom_port_info->icount; + spin_unlock_irqrestore(&iComlock,flags); + while (1) { + interruptible_sleep_on(&iCom_port_info->delta_msr_wait); + /* see if a signal did it */ + if (signal_pending(current)) + return -ERESTARTSYS; + spin_lock_irqsave(&iComlock,flags); + cnow = iCom_port_info->icount; /* atomic copy */ + spin_unlock_irqrestore(&iComlock,flags); + if (cnow.rng == cprev.rng && cnow.dsr == cprev.dsr && + cnow.dcd == cprev.dcd && cnow.cts == cprev.cts) + return -EIO; /* no change => error */ + if ( ((arg & TIOCM_RNG) && (cnow.rng != cprev.rng)) || + ((arg & TIOCM_DSR) && (cnow.dsr != cprev.dsr)) || + ((arg & TIOCM_CD) && (cnow.dcd != cprev.dcd)) || + ((arg & TIOCM_CTS) && (cnow.cts != cprev.cts)) ) { + return 0; + } + cprev = cnow; + } + /* NOTREACHED */ + + /* + * Get counter of input serial line interrupts (DCD,RI,DSR,CTS) + * Return: write counters to the user passed counter struct + * NB: both 1->0 and 0->1 transitions are counted except for + * RI where only 0->1 is counted. + */ + case TIOCGICOUNT: + spin_lock_irqsave(&iComlock,flags); + cnow = iCom_port_info->icount; + spin_unlock_irqrestore(&iComlock,flags); + p_cuser = (struct serial_icounter_struct *) arg; + error = put_user(cnow.cts, &p_cuser->cts); + if (error) return error; + error = put_user(cnow.dsr, &p_cuser->dsr); + if (error) return error; + error = put_user(cnow.rng, &p_cuser->rng); + if (error) return error; + error = put_user(cnow.dcd, &p_cuser->dcd); + if (error) return error; + error = put_user(cnow.rx, &p_cuser->rx); + if (error) return error; + error = put_user(cnow.tx, &p_cuser->tx); + if (error) return error; + error = put_user(cnow.frame, &p_cuser->frame); + if (error) return error; + error = put_user(cnow.overrun, &p_cuser->overrun); + if (error) return error; + error = put_user(cnow.parity, &p_cuser->parity); + if (error) return error; + error = put_user(cnow.brk, &p_cuser->brk); + if (error) return error; + error = put_user(cnow.buf_overrun, &p_cuser->buf_overrun); + if (error) return error; + return 0; + + case TIOCSERGWILD: + case TIOCSERSWILD: + /* "setserial -W" is called in Debian boot */ + printk ("TIOCSER?WILD ioctl obsolete, ignored.\n"); + return 0; + + default: + TRACE(iCom_port_info,TRACE_IOCTL_IGNORE,cmd); + return -ENOIOCTLCMD; + } + return 0; +} + +static void iCom_send_xchar(struct tty_struct * tty, char ch) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char xdata; + int index; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_SEND_XCHAR,ch); + /* attempt sending char for a period of .1 second */ + for (index = 0; index < 10; index++ ) { + xdata = iCom_readb(&iCom_port_info->dram->xchar); + if (xdata == 0x00) { + TRACE(iCom_port_info,TRACE_QUICK_WRITE,0); + iCom_writeb(ch,&iCom_port_info->dram->xchar); + break; + } + current->state = TASK_INTERRUPTIBLE; + spin_unlock_irqrestore(&iComlock,flags); + schedule_timeout(HZ/100); + spin_lock_irqsave(&iComlock,flags); + } + spin_unlock_irqrestore(&iComlock,flags); +} + +static void iCom_throttle(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char osr; + + TRACE(iCom_port_info,TRACE_THROTTLE,0); + if (I_IXOFF(tty)) + iCom_send_xchar(tty, STOP_CHAR(tty)); + + if (tty->termios->c_cflag & CRTSCTS) { + osr = iCom_readb(&iCom_port_info->dram->osr); + iCom_writeb(osr & ~RTS,&iCom_port_info->dram->osr); + } +} + +static void iCom_unthrottle(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char osr; + + TRACE(iCom_port_info,TRACE_UNTHROTTLE,0); + if (I_IXOFF(tty)) { + iCom_send_xchar(tty, START_CHAR(tty)); + } + if (tty->termios->c_cflag & CRTSCTS) { + osr = iCom_readb(&iCom_port_info->dram->osr); + iCom_writeb(osr | RTS,&iCom_port_info->dram->osr); + } +} + +static void iCom_set_termios(struct tty_struct * tty, struct termios * old_termios) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned int cflag = tty->termios->c_cflag; + unsigned char osr; + unsigned long flags; +#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_SET_TERMIOS,0); + if ((cflag == old_termios->c_cflag) + && (RELEVANT_IFLAG(tty->termios->c_iflag) + == RELEVANT_IFLAG(old_termios->c_iflag))) { + spin_unlock_irqrestore(&iComlock,flags); + return; + } + + change_speed(iCom_port_info, old_termios, flags); + + /* Handle transition to B0 status */ + if ((old_termios->c_cflag & CBAUD) && + !(cflag & CBAUD)) { + osr = iCom_readb(&iCom_port_info->dram->osr); + TRACE(iCom_port_info,TRACE_DROP_DTR_RTS,0); + iCom_writeb(osr & ~(DTR|RTS),&iCom_port_info->dram->osr); + } + + /* Handle transition away from B0 status */ + if (!(old_termios->c_cflag & CBAUD) && + (cflag & CBAUD)) { + osr = iCom_readb(&iCom_port_info->dram->osr); + TRACE(iCom_port_info,TRACE_RAISE_DTR,0); + osr |= DTR; + if (!(tty->termios->c_cflag & CRTSCTS) || + !test_bit(TTY_THROTTLED, &tty->flags)) { + TRACE(iCom_port_info,TRACE_RAISE_RTS,0); + osr |= RTS; + } + iCom_writeb(osr,&iCom_port_info->dram->osr); + } + + spin_unlock_irqrestore(&iComlock,flags); + + /* Handle turning off CRTSCTS */ + if ((old_termios->c_cflag & CRTSCTS) && + !(tty->termios->c_cflag & CRTSCTS)) { + tty->hw_stopped = 0; + iCom_start(tty); + } + +#if 0 + /* + * No need to wake up processes in open wait, since they + * sample the CLOCAL flag once, and don't recheck it. + * XXX It's not clear whether the current behavior is correct + * or not. Hence, this may change..... + */ + if (!(old_termios->c_cflag & CLOCAL) && + (tty->termios->c_cflag & CLOCAL)) + wake_up_interruptible(&iCom_port_info->open_wait); +#endif +} + +static void iCom_stop(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char cmdReg; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_STOP,0); + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg | CMD_HOLD_XMIT,&iCom_port_info->dram->CmdReg); + spin_unlock_irqrestore(&iComlock,flags); +} + +static void iCom_start(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char cmdReg; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_START,0); + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg & ~CMD_HOLD_XMIT,&iCom_port_info->dram->CmdReg); + spin_unlock_irqrestore(&iComlock,flags); +} + +static void iCom_hangup(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned long flags; + + TRACE(iCom_port_info,TRACE_HANGUP,0); + iCom_flush_buffer(tty); + spin_lock_irqsave(&iComlock,flags); + shutdown(iCom_port_info); + iCom_port_info->open_active_count = 0; + iCom_port_info->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CALLOUT_ACTIVE); + iCom_port_info->tty = 0; + wake_up_interruptible(&iCom_port_info->open_wait); + spin_unlock_irqrestore(&iComlock,flags); +} + +static void iCom_break(struct tty_struct *tty, int break_state) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char cmdReg; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_BREAK,0); + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + if (break_state == -1) { + iCom_writeb(cmdReg | CMD_SND_BREAK,&iCom_port_info->dram->CmdReg); + } + else{ + iCom_writeb(cmdReg & ~CMD_SND_BREAK,&iCom_port_info->dram->CmdReg); + } + spin_unlock_irqrestore(&iComlock,flags); +} + +/* + * iCom_wait_until_sent() --- wait until the transmitter is empty + */ +static void iCom_wait_until_sent(struct tty_struct *tty, int timeout) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned long orig_jiffies, char_time; + int status; + + TRACE(iCom_port_info,TRACE_WAIT_UNTIL_SENT,0); + + orig_jiffies = jiffies; + /* + * Set the check interval to be 1/5 of the estimated time to + * send a single character, and make it at least 1. The check + * interval should also be less than the timeout. + * + * Note: we have to use pretty tight timings here to satisfy + * the NIST-PCTS. + */ + char_time = (iCom_port_info->timeout - HZ/50) / iCom_port_info->xmit_fifo_size; + char_time = char_time / 5; + if (char_time == 0) + char_time = 1; + if (timeout) { + if (timeout < char_time) + char_time = timeout; + } + /* + * If the transmitter hasn't cleared in twice the approximate + * amount of time to send the entire FIFO, it probably won't + * ever clear. This assumes the UART isn't doing flow + * control, which is currently the case. Hence, if it ever + * takes longer than iCom_port_info->timeout, this is probably due to a + * UART bug of some kind. So, we clamp the timeout parameter at + * 2*iCom_port_info->timeout. + */ + if (!timeout || timeout > 2*iCom_port_info->timeout) + timeout = 2*iCom_port_info->timeout; + + status = cpu_to_le16(iCom_port_info->statStg->xmit[0].flags); + while (status & SA_FLAGS_DONE ) { /*data still transmitting*/ + + current->state = TASK_INTERRUPTIBLE; + current->counter = 0; /* make us low-priority */ + schedule_timeout(char_time); + if (signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + status = cpu_to_le16(iCom_port_info->statStg->xmit[0].flags); + } + current->state = TASK_RUNNING; +} + +/* + * /proc fs routines.... + */ +static inline int line_info(char *buf, struct iCom_port *iCom_port_info) +{ + char stat_buf[30], control, status; + int ret, baud_index; + int port; + + if ((iCom_port_info->port == 2) && + (iCom_adapter_info[iCom_port_info->adapter].subsystem_id != FOUR_PORT_MODEL)) + port = 1; + else + port = iCom_port_info->port; + + ret = sprintf(buf, "%d: port:%X irq:%d", + iCom_port_info->adapter, + port, + iCom_adapter_info[iCom_port_info->adapter].irq_number); + + status = iCom_readb(&iCom_port_info->dram->isr); + control = iCom_readb(&iCom_port_info->dram->osr); + + stat_buf[0] = 0; + stat_buf[1] = 0; + if (control & RTS) + strcat(stat_buf, "|RTS"); + if (status & CTS) + strcat(stat_buf, "|CTS"); + if (control & DTR) + strcat(stat_buf, "|DTR"); + if (status & DSR) + strcat(stat_buf, "|DSR"); + if (status & DCD) + strcat(stat_buf, "|CD"); + if (status & RI) + strcat(stat_buf, "|RI"); + + baud_index = iCom_readb(&iCom_port_info->dram->async_config3); + ret += sprintf(buf+ret, " baud:%d",icom_acfg_baud[baud_index]); + + ret += sprintf(buf+ret, " tx:%d rx:%d", + iCom_port_info->icount.tx, iCom_port_info->icount.rx); + + if (iCom_port_info->icount.frame) + ret += sprintf(buf+ret, " fe:%d", iCom_port_info->icount.frame); + + if (iCom_port_info->icount.parity) + ret += sprintf(buf+ret, " pe:%d", iCom_port_info->icount.parity); + + if (iCom_port_info->icount.brk) + ret += sprintf(buf+ret, " brk:%d", iCom_port_info->icount.brk); + + if (iCom_port_info->icount.overrun) + ret += sprintf(buf+ret, " oe:%d", iCom_port_info->icount.overrun); + + /* + * Last thing is the RS-232 status lines + */ + ret += sprintf(buf+ret, " %s\n", stat_buf+1); + return ret; +} + +int iCom_read_proc(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + int i, j, len = 0, l; + off_t begin = 0; + + len += sprintf(page, "iCom driver: %s\n", "1.0"); + for (i = 0; i < active_adapters && len < 4000; i++) { + for (j= 0; j < 4 && len < 4000; j++) { + if (iCom_adapter_info[i].port_info[j].status == ICOM_PORT_ACTIVE) { + l = line_info(page + len, &iCom_adapter_info[i].port_info[j]); + len += l; + if (len+begin > off+count) + goto done; + if (len+begin < off) { + begin += len; + len = 0; + } + } + } + } + *eof = 1; + done: + if (off >= len+begin) + return 0; + *start = page + (begin-off); + return ((count < begin+len-off) ? count : begin+len-off); +} + +static void iCom_flush_buffer(struct tty_struct * tty) +{ + struct iCom_port *iCom_port_info = (struct iCom_port *)tty->driver_data; + unsigned char cmdReg; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + TRACE(iCom_port_info,TRACE_FLUSH_BUFFER,0); + /* + * with no CMD_XMIT_ENABLE is same as disabling xmitter. This should + * result in an interrupt if currently transmitting + */ + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg & ~CMD_XMIT_ENABLE,&iCom_port_info->dram->CmdReg); + spin_unlock_irqrestore(&iComlock,flags); +} + +/* + * This routine is used by the interrupt handler to schedule + * processing in the software interrupt portion of the driver. + */ +static inline void rs_sched_event(struct iCom_port *info, + int event) +{ + info->event |= 1 << event; + queue_task(&info->tqueue, &tq_immediate); + mark_bh(IMMEDIATE_BH); +} + +static inline void check_modem_status(struct iCom_port *iCom_port_info) +{ + static char old_status = 0; + char delta_status; + unsigned char status; + + /*modem input register */ + status = iCom_readb(&iCom_port_info->dram->isr); + TRACE(iCom_port_info,TRACE_CHECK_MODEM,status); + delta_status = status ^ old_status; + if (delta_status) { + if (delta_status & RI) + iCom_port_info->icount.rng++; + if (delta_status & DSR) + iCom_port_info->icount.dsr++; + if (delta_status & DCD) + iCom_port_info->icount.dcd++; + if (delta_status & CTS) + iCom_port_info->icount.cts++; + + wake_up_interruptible(&iCom_port_info->delta_msr_wait); + old_status = status; + } + + if ((iCom_port_info->flags & ASYNC_CHECK_CD) && (status & 0x20)) { + if (status & 0x20) /* Carrier Detect up */ + wake_up_interruptible(&iCom_port_info->open_wait); + else if (!((iCom_port_info->flags & ASYNC_CALLOUT_ACTIVE) && + (iCom_port_info->flags & ASYNC_CALLOUT_NOHUP))) { + if (iCom_port_info->tty) + tty_hangup(iCom_port_info->tty); + } + } + + if (iCom_port_info->flags & ASYNC_CTS_FLOW) { + if (iCom_port_info->tty->hw_stopped) { + if (status & 0x40) { /* CTS up */ + iCom_port_info->tty->hw_stopped = 0; + TRACE(iCom_port_info,TRACE_CTS_UP,0); + rs_sched_event(iCom_port_info, 0); + return; + } + } else { + if (!(status & 0x40)) { /* CTS down */ + iCom_port_info->tty->hw_stopped = 1; + TRACE(iCom_port_info,TRACE_CTS_DOWN,0); + } + } + } +} + +static void process_interrupt(u16 port_int_reg, struct iCom_port *iCom_port_info) +{ + short int count, rcv_buff; + struct tty_struct *tty = iCom_port_info->tty; + unsigned char *data; + unsigned short int status; + struct async_icount *icount; + unsigned long int offset; + + + TRACE(iCom_port_info,TRACE_INTERRUPT | TRACE_TIME,jiffies); + + if (port_int_reg & (INT_XMIT_COMPLETED | INT_XMIT_DISABLED)) { + if (port_int_reg & (INT_XMIT_COMPLETED)) + TRACE(iCom_port_info,TRACE_XMIT_COMPLETE,0); + else + TRACE(iCom_port_info,TRACE_XMIT_DISABLED,0); + + /* clear buffer in use bit */ + iCom_port_info->statStg->xmit[0].flags &= cpu_to_le16(~SA_FLAGS_READY_TO_XMIT); + iCom_port_info->icount.tx += (unsigned short int)cpu_to_le16(iCom_port_info->statStg->xmit[0].leLength); + + /* activate write queue */ + rs_sched_event(iCom_port_info, 0); + } + + if (port_int_reg & INT_RCV_COMPLETED) { + + TRACE(iCom_port_info,TRACE_RCV_COMPLETE,0); + rcv_buff = iCom_port_info->next_rcv; + + status = cpu_to_le16(iCom_port_info->statStg->rcv[rcv_buff].flags); + while (status & SA_FL_RCV_DONE) { + + TRACE(iCom_port_info,TRACE_FID_STATUS,status); + + count = cpu_to_le16(iCom_port_info->statStg->rcv[rcv_buff].leLength); + + TRACE(iCom_port_info,TRACE_RCV_COUNT,count); + if (count > (TTY_FLIPBUF_SIZE - tty->flip.count)) + count = TTY_FLIPBUF_SIZE - tty->flip.count; + + TRACE(iCom_port_info,TRACE_REAL_COUNT,count); + + offset = cpu_to_le32(iCom_port_info->statStg->rcv[rcv_buff].leBuffer) - iCom_port_info->recv_buf_pci; + + memcpy(tty->flip.char_buf_ptr,(unsigned char *)((unsigned long int)iCom_port_info->recv_buf + offset),count); + + data = (unsigned char *)tty->flip.char_buf_ptr; + + if (count > 0) { + tty->flip.count += count - 1; + tty->flip.char_buf_ptr += count - 1; + + memset(tty->flip.flag_buf_ptr, 0, count); + tty->flip.flag_buf_ptr += count - 1; + } + + icount = &iCom_port_info->icount; + icount->rx += count; + + /* Break detect logic */ + if ((status & SA_FLAGS_FRAME_ERROR) && (tty->flip.char_buf_ptr[0] == 0x00)) { + status &= ~SA_FLAGS_FRAME_ERROR; + status |= SA_FLAGS_BREAK_DET; + TRACE(iCom_port_info,TRACE_BREAK_DET,0); + } + + if (status & (SA_FLAGS_BREAK_DET | SA_FLAGS_PARITY_ERROR | + SA_FLAGS_FRAME_ERROR | SA_FLAGS_OVERRUN)) { + + if (status & SA_FLAGS_BREAK_DET) + icount->brk++; + if (status & SA_FLAGS_PARITY_ERROR) + icount->parity++; + if (status & SA_FLAGS_FRAME_ERROR) + icount->frame++; + if (status & SA_FLAGS_OVERRUN) + icount->overrun++; + + /* + * Now check to see if character should be + * ignored, and mask off conditions which + * should be ignored. + */ + if (status & iCom_port_info->ignore_status_mask) { + TRACE(iCom_port_info,TRACE_IGNORE_CHAR,0); + goto ignore_char; + } + + status &= iCom_port_info->read_status_mask; + + if (status & SA_FLAGS_BREAK_DET) { + *tty->flip.flag_buf_ptr = TTY_BREAK; + if (iCom_port_info->flags & ASYNC_SAK) + do_SAK(tty); + } else if (status & SA_FLAGS_PARITY_ERROR) { + TRACE(iCom_port_info,TRACE_PARITY_ERROR,0); + *tty->flip.flag_buf_ptr = TTY_PARITY; + } + else if (status & SA_FLAGS_FRAME_ERROR) + *tty->flip.flag_buf_ptr = TTY_FRAME; + if (status & SA_FLAGS_OVERRUN) { + /* + * Overrun is special, since it's + * reported immediately, and doesn't + * affect the current character + */ + if (tty->flip.count < TTY_FLIPBUF_SIZE) { + tty->flip.count++; + tty->flip.flag_buf_ptr++; + tty->flip.char_buf_ptr++; + *tty->flip.flag_buf_ptr = TTY_OVERRUN; + } + } + } + + tty->flip.flag_buf_ptr++; + tty->flip.char_buf_ptr++; + tty->flip.count++; + ignore_char: + iCom_port_info->statStg->rcv[rcv_buff].flags = 0; + iCom_port_info->statStg->rcv[rcv_buff].leLength = 0; + iCom_port_info->statStg->rcv[rcv_buff].WorkingLength = (unsigned short int)cpu_to_le16(RCV_BUFF_SZ); + + rcv_buff++; + if (rcv_buff == NUM_RBUFFS) rcv_buff = 0; + + status = cpu_to_le16(iCom_port_info->statStg->rcv[rcv_buff].flags); + } + iCom_port_info->next_rcv = rcv_buff; + tty_flip_buffer_push(tty); + } +} + +static void iCom_interrupt(int irq, void * dev_id, struct pt_regs * regs) +{ + unsigned long int int_reg; + u32 adapter_interrupts; + u16 port_int_reg; + struct iCom_adapter *iCom_adapter_ptr; + struct iCom_port *iCom_port_info; + unsigned long flags; + + spin_lock_irqsave(&iComlock,flags); + + /* find iCom_port_info for this interrupt */ + iCom_adapter_ptr = (struct iCom_adapter *)dev_id; + + if ((iCom_adapter_ptr->version | ADAPTER_V2) == ADAPTER_V2) { + int_reg = iCom_adapter_ptr->base_addr + 0x8024; + + adapter_interrupts = iCom_readl((void *)int_reg); + + if (adapter_interrupts & 0x00003FFF) { + /* port 2 interrupt, NOTE: for all ADAPTER_V2, port 2 will be active */ + iCom_port_info = &iCom_adapter_ptr->port_info[2]; + port_int_reg = (u16)adapter_interrupts; + process_interrupt(port_int_reg, iCom_port_info); + check_modem_status(iCom_port_info); + } + if (adapter_interrupts & 0x3FFF0000) { + /* port 3 interrupt */ + iCom_port_info = &iCom_adapter_ptr->port_info[3]; + if (iCom_port_info->status == ICOM_PORT_ACTIVE) { + port_int_reg = (u16)(adapter_interrupts >> 16); + process_interrupt(port_int_reg, iCom_port_info); + check_modem_status(iCom_port_info); + } + } + + /* Clear out any pending interrupts */ + iCom_writel(adapter_interrupts,(void *)int_reg); + + int_reg = iCom_adapter_ptr->base_addr + 0x8004; + } + else { + int_reg = iCom_adapter_ptr->base_addr + 0x4004; + } + + adapter_interrupts = iCom_readl((void *)int_reg); + + if (adapter_interrupts & 0x00003FFF) { + /* port 0 interrupt, NOTE: for all adapters, port 0 will be active */ + iCom_port_info = &iCom_adapter_ptr->port_info[0]; + port_int_reg = (u16)adapter_interrupts; + process_interrupt(port_int_reg, iCom_port_info); + check_modem_status(iCom_port_info); + } + if (adapter_interrupts & 0x3FFF0000) { + /* port 1 interrupt */ + iCom_port_info = &iCom_adapter_ptr->port_info[1]; + if (iCom_port_info->status == ICOM_PORT_ACTIVE) { + port_int_reg = (u16)(adapter_interrupts >> 16); + process_interrupt(port_int_reg, iCom_port_info); + check_modem_status(iCom_port_info); + } + } + + /* Clear out any pending interrupts */ + iCom_writel(adapter_interrupts,(void *)int_reg); + spin_unlock_irqrestore(&iComlock,flags); +} + +/* + * ------------------------------------------------------------------- + * Here ends the serial interrupt routines. + * ------------------------------------------------------------------- + */ + +/* + * This routine is used to handle the "bottom half" processing for the + * serial driver, known also the "software interrupt" processing. + * This processing is done at the kernel interrupt level, after the + * iCom_interrupt() has returned, BUT WITH INTERRUPTS TURNED ON. This + * is where time-consuming activities which can not be done in the + * interrupt driver proper are done; the interrupt driver schedules + * them using rs_sched_event(), and they get done here. + */ +static void do_softint(void *private_) +{ + struct iCom_port *info = (struct iCom_port *) private_; + struct tty_struct *tty; + + tty = info->tty; + if (!tty) + return; + + if (test_and_clear_bit(0, &info->event)) { + if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + wake_up_interruptible(&tty->write_wait); + TRACE(info,TRACE_WAKEUP,0); + } +} + +/* + Module operations +*/ +int iCom_init(void) +{ + int index, + index_v2, + index2, + scan_index; + struct pci_dev *dev[MAX_ADAPTERS]; + unsigned int irq_number[MAX_ADAPTERS]; + unsigned long int base_addr[MAX_ADAPTERS]; + unsigned char valid_indices[MAX_ADAPTERS]; +#define VALID 1 +#define INVALID 0 + unsigned int command_reg; + struct iCom_port *iCom_port_info; + int retval; + int status; + int port_num; + int adapter_count = 0; + int duplicate; + unsigned int subsystem_id; + + + /* + * Find base addresses and IRQs for any/all installed cards + */ + for (index=0; index < MAX_ADAPTERS; index++) { + valid_indices[index] = INVALID; + dev[index] = NULL; + } + + /* check for Version 1 Adapters */ + for (index = 0; index < MAX_ADAPTERS; index++){ + if (index == 0) { + if (!(dev[index] = pci_find_device(VENDOR_ID, DEVICE_ID, dev[index]))) + break; + } + else { + if (!(dev[index] = pci_find_device(VENDOR_ID, DEVICE_ID, dev[index-1]))) + break; + } + + adapter_count++; + + if (pci_enable_device(dev[index])) { + printk("iCom: Device enable FAILED\n"); + continue; + } + + if (pci_read_config_dword(dev[index], PCI_COMMAND, &command_reg)) { + printk("iCom: PCI Config read FAILED\n"); + continue; + } + + pci_write_config_dword(dev[index],PCI_COMMAND, command_reg | 0x00000146); + pci_write_config_dword(dev[index],0x44, 0x8300830A); + + base_addr[index] = pci_resource_start(dev[index],0); + base_addr[index] &= PCI_BASE_ADDRESS_MEM_MASK; + + duplicate = 0; + for (index2 = 0; index2 < index; index2++) { + if (base_addr[index] == base_addr[index2]) + duplicate = 1; + } + if (duplicate) continue; + + irq_number[index] = dev[index]->irq; + + valid_indices[index] = ADAPTER_V1; + } + + /* check for version 2 Adapters */ + for (index_v2=0; index_v2 < (MAX_ADAPTERS - adapter_count); index_v2++){ + if (index_v2 == 0) { + if (!(dev[index] = pci_find_device(VENDOR_ID, DEVICE_ID2, NULL))) + break; + } + else { + if (!(dev[index] = pci_find_device(VENDOR_ID, DEVICE_ID2, dev[index-1]))) + break; + } + + adapter_count++; + + if (pci_enable_device(dev[index])) { + printk("iCom: Device enable FAILED\n"); + continue; + } + + if (pci_read_config_dword(dev[index], PCI_COMMAND, &command_reg)) { + printk("iCom: PCI Config read FAILED\n"); + continue; + } + + pci_write_config_dword(dev[index],PCI_COMMAND, command_reg | 0x00000146); + pci_write_config_dword(dev[index],0x44, 0x42004200); + pci_write_config_dword(dev[index],0x48, 0x42004200); + + base_addr[index] = pci_resource_start(dev[index],0); + base_addr[index] &= PCI_BASE_ADDRESS_MEM_MASK; + + duplicate = 0; + for (index2 = 0; index2 < index; index2++) { + if (base_addr[index] == base_addr[index2]) + duplicate = 1; + } + if (duplicate) continue; + + irq_number[index] = dev[index]->irq; + + valid_indices[index++] = ADAPTER_V2; + } + + /* allocate memory for control blocks representing each adapter */ + iCom_adapter_info = (struct iCom_adapter *) + kmalloc(adapter_count*sizeof(struct iCom_adapter),GFP_KERNEL); + + if (!iCom_adapter_info) { + return -ENOMEM; + } + + memset(iCom_adapter_info, 0,adapter_count*sizeof(struct iCom_adapter)); + + /* store information just obtained on base_addr and irq */ + for (index = scan_index = 0; (scan_index < MAX_ADAPTERS) & + (index < adapter_count); scan_index++) { + + if (valid_indices[scan_index]) { + iCom_adapter_info[index].base_addr = base_addr[scan_index]; + iCom_adapter_info[index].irq_number = irq_number[scan_index]; + iCom_adapter_info[index].pci_dev = dev[scan_index]; + iCom_adapter_info[index].version = valid_indices[scan_index]; + pci_read_config_dword(dev[index], PCI_SUBSYSTEM_VENDOR_ID, &subsystem_id); + iCom_adapter_info[index].subsystem_id = subsystem_id; + + /* save off irq and request irq line */ + if (request_irq(irq_number[scan_index], iCom_interrupt, SA_INTERRUPT | + SA_SHIRQ, DRIVER_NAME, (void *)&iCom_adapter_info[index])) { + printk("iCom: request_irq FAILED\n"); + continue; + } + + if (iCom_adapter_info[index].version == ADAPTER_V1) { + iCom_adapter_info[index].numb_ports = 2; + iCom_adapter_info[index].port_info[0].port = 0; + iCom_adapter_info[index].port_info[0].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[1].port = 1; + iCom_adapter_info[index].port_info[1].status = ICOM_PORT_ACTIVE; + } + else { + if (subsystem_id == FOUR_PORT_MODEL) { + iCom_adapter_info[index].numb_ports = 4; + iCom_adapter_info[index].port_info[0].port = 0; + iCom_adapter_info[index].port_info[0].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[1].port = 1; + iCom_adapter_info[index].port_info[1].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[2].port = 2; + iCom_adapter_info[index].port_info[2].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[3].port = 3; + iCom_adapter_info[index].port_info[3].status = ICOM_PORT_ACTIVE; + } + else { + iCom_adapter_info[index].numb_ports = 4; + iCom_adapter_info[index].port_info[0].port = 0; + iCom_adapter_info[index].port_info[0].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[1].status = ICOM_PORT_OFF; + iCom_adapter_info[index].port_info[2].port = 2; + iCom_adapter_info[index].port_info[2].status = ICOM_PORT_ACTIVE; + iCom_adapter_info[index].port_info[3].status = ICOM_PORT_OFF; + } + } + + if (!request_mem_region(iCom_adapter_info[index].base_addr, + pci_resource_len(iCom_adapter_info[index].pci_dev,0), + "iCom")) { + printk("iCom: request_mem_region FAILED\n"); + } + + for (port_num = 0; port_num < iCom_adapter_info[index].numb_ports; port_num++) { + iCom_port_info = &iCom_adapter_info[index].port_info[port_num]; + + if (iCom_port_info->status == ICOM_PORT_ACTIVE) { + /* initialize wait queues */ + init_waitqueue_head(&iCom_port_info->open_wait); + init_waitqueue_head(&iCom_port_info->close_wait); + init_waitqueue_head(&iCom_port_info->delta_msr_wait); + + /* initialize port specific variables */ + iCom_port_info->tqueue.routine = do_softint; + iCom_port_info->tqueue.data = iCom_port_info; + if (iCom_adapter_info[index].version == ADAPTER_V1) { + iCom_port_info->global_reg = (struct iCom_regs *)((char *)iCom_adapter_info[index].base_addr + 0x4000); + iCom_port_info->int_reg = (unsigned long)iCom_adapter_info[index].base_addr + 0x4004 + 2 - 2 * port_num; + } + else { + iCom_port_info->global_reg = (struct iCom_regs *)((char *)iCom_adapter_info[index].base_addr + 0x8000); + if (iCom_port_info->port < 2) + iCom_port_info->int_reg = (unsigned long)iCom_adapter_info[index].base_addr + 0x8004 + 2 - 2 * iCom_port_info->port; + else + iCom_port_info->int_reg = (unsigned long)iCom_adapter_info[index].base_addr + 0x8024 + 2 - 2 * (iCom_port_info->port - 2); + } + iCom_port_info->dram = (struct func_dram*)((char*)iCom_adapter_info[index].base_addr + 0x2000 * iCom_port_info->port); + iCom_port_info->close_delay = 5*HZ/10; + iCom_port_info->closing_wait = 30*HZ; + iCom_port_info->adapter = index; + + /* + * Load and start processor + */ + retval = loadCode(iCom_port_info); + if (retval != 0) { + printk("iCom%d: pico-code load of adapter FAILED\n",iCom_port_info->adapter); + return -ENODEV; + } + + /* get port memory */ + if ((status = get_port_memory(iCom_port_info)) != 0) { + return -ENODEV; + /* return status; *** -ENOMEM didn't work right for me */ + } + + /* Set Country Code */ + iCom_set_code(iCom_port_info); + } + } + index++; + } + } + active_adapters = index; + + printk("iCom: Adapter detection complete, %d adapters found with %d valid\n",adapter_count,active_adapters); + + if (active_adapters > 0) { + + /* Initialize the tty_driver structure */ + memset(&serial_driver, 0, sizeof(struct tty_driver)); + serial_driver.magic = TTY_DRIVER_MAGIC; + serial_driver.driver_name = DRIVER_NAME; +#if defined(CONFIG_DEVFS_FS) + serial_driver.name = "ttyA%d"; +#else + serial_driver.name = "ttyA"; +#endif + serial_driver.major = 243; + serial_driver.minor_start = 0; + serial_driver.num = NR_PORTS; + serial_driver.type = TTY_DRIVER_TYPE_SERIAL; + serial_driver.subtype = SERIAL_TYPE_NORMAL; + serial_driver.init_termios = tty_std_termios; + serial_driver.init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + serial_driver.flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_NO_DEVFS; + serial_driver.refcount = &serial_refcount; + serial_driver.table = serial_table; + serial_driver.termios = serial_termios; + serial_driver.termios_locked = serial_termios_locked; + + serial_driver.open = iCom_open; + serial_driver.close = iCom_close; + serial_driver.write = iCom_write; + serial_driver.put_char = iCom_put_char; + serial_driver.flush_chars = iCom_flush_chars; + serial_driver.write_room = iCom_write_room; + serial_driver.chars_in_buffer = iCom_chars_in_buffer; + serial_driver.flush_buffer = iCom_flush_buffer; + serial_driver.ioctl = iCom_ioctl; + serial_driver.throttle = iCom_throttle; + serial_driver.unthrottle = iCom_unthrottle; + serial_driver.send_xchar = iCom_send_xchar; + serial_driver.set_termios = iCom_set_termios; + serial_driver.stop = iCom_stop; + serial_driver.start = iCom_start; + serial_driver.hangup = iCom_hangup; + serial_driver.break_ctl = iCom_break; + serial_driver.wait_until_sent = iCom_wait_until_sent; + serial_driver.read_proc = iCom_read_proc; + + + for (index=0; index < active_adapters; index++) { + iCom_adapter_info[index].port_info[0].callout_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[0].normal_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[1].callout_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[1].normal_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[2].callout_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[2].normal_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[3].callout_termios = serial_driver.init_termios; + iCom_adapter_info[index].port_info[3].normal_termios = serial_driver.init_termios; + } + + if (tty_register_driver(&serial_driver)) { + for (index=0; index < active_adapters; index++) { + free_irq(iCom_adapter_info[index].irq_number, (void *)&iCom_adapter_info[index]); + } + kfree(iCom_adapter_info); + panic("Couldn't register serial driver\n"); + } + +#if defined(CONFIG_DEVFS_FS) + for (index = 0; index < active_adapters; index++) { + tty_register_devfs(&serial_driver, + 0, index*4 + serial_driver.minor_start); + tty_register_devfs(&serial_driver, + 0, index*4 + serial_driver.minor_start + 1); + + if ((iCom_adapter_info[index].version == ADAPTER_V2) && + (iCom_adapter_info[index].subsystem_id == FOUR_PORT_MODEL)) { + tty_register_devfs(&serial_driver, + 0, index*4 + serial_driver.minor_start + 2); + tty_register_devfs(&serial_driver, + 0, index*4 + serial_driver.minor_start + 3); + } + } +#endif + + /* lastly, register unique ioctl */ + register_ioctl32_conversion(0x4300,NULL); + + return 0; + } + else { + if (adapter_count > 0) { + kfree(iCom_adapter_info); + } + } + + return -ENODEV; +} + +int init_module(void) +{ + return iCom_init(); +} + +void cleanup_module(void) +{ + unsigned long flags; + int e1; + int index; + int port_num; + struct iCom_port *iCom_port_info; + + /* remove registered ioctl */ + unregister_ioctl32_conversion(0x4300); + + spin_lock_irqsave(&iComlock,flags); + if ((e1 = tty_unregister_driver(&serial_driver))) + printk("iCom: failed to unregister serial driver (%d)\n",e1); + +#if defined(CONFIG_DEVFS_FS) + for (index = 0; index < active_adapters; index++) { + tty_unregister_devfs(&serial_driver, + index*4 + serial_driver.minor_start); + tty_unregister_devfs(&serial_driver, + index*4 + serial_driver.minor_start + 1); + + if ((iCom_adapter_info[index].version == ADAPTER_V2) && + (iCom_adapter_info[index].subsystem_id == FOUR_PORT_MODEL)) { + tty_unregister_devfs(&serial_driver, + index*4 + serial_driver.minor_start + 2); + tty_unregister_devfs(&serial_driver, + index*4 + serial_driver.minor_start + 3); + } + } +#endif + + for (index=0; index < active_adapters; index++) { + + for (port_num = 0; port_num < iCom_adapter_info[index].numb_ports; port_num++) { + iCom_port_info = &iCom_adapter_info[index].port_info[port_num]; + + if (iCom_port_info->status == ICOM_PORT_ACTIVE) { + + /* be sure that DTR and RTS are dropped */ + iCom_writeb(0x00,&iCom_port_info->dram->osr); + + /* Wait 0.1 Sec for simple Init to complete */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + /* Stop proccessor */ + stop_processor(iCom_port_info); + + return_port_memory(iCom_port_info); + } + } + + free_irq(iCom_adapter_info[index].irq_number, (void *)&iCom_adapter_info[index]); + release_mem_region(iCom_adapter_info[index].base_addr, + pci_resource_len(iCom_adapter_info[index].pci_dev,0)); + } + spin_unlock_irqrestore(&iComlock,flags); + + kfree(iCom_adapter_info); + printk("iCom: Driver removed\n"); +} + +/* the interrupts should be disabled here so that no hardware + interrupts should occur, polling will be done to check received + data to avoid interrupt level processing */ +static int mdm_rcv(struct iCom_port *iCom_port_info, char *exp_str) { + int status = 0; + int loop_count = 0; + char *start_str; + int rcv_buff; + + /* search expected string in received data */ + while (!status && (loop_count++ < 10)) { + /* check buffer 1 */ + start_str = (char *)iCom_port_info->recv_buf; + if (strstr(start_str, exp_str)) { + /* string found! */ + status = 1; + break; + } + + /* check buffer 2 */ + start_str = (char *)iCom_port_info->recv_buf + 2048; + if (strstr(start_str, exp_str)) { + /* string found! */ + status = 1; + break; + } + + /* wait .5 seconds */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/2); + + /* free up buffers if they had been used */ + for (rcv_buff = 0; rcv_buff < NUM_RBUFFS; rcv_buff++) { + iCom_port_info->statStg->rcv[rcv_buff].flags = 0; + iCom_port_info->statStg->rcv[rcv_buff].leLength = 0; + iCom_port_info->statStg->rcv[rcv_buff].WorkingLength = (unsigned short int)cpu_to_le16(RCV_BUFF_SZ); + } + } + + /* clear interrupts */ + iCom_writew(0x3FFF,(void *)iCom_port_info->int_reg); + + return status; +} + +static void mdm_send(struct iCom_port *iCom_port_info, char *mdm_cmnd, + int cmnd_length) { + + unsigned char cmdReg; + unsigned long int offset; + + + /* initialize transmit and receive operations */ + offset = (unsigned long int)&iCom_port_info->statStg->rcv[0] - (unsigned long int)iCom_port_info->statStg; + iCom_writel(iCom_port_info->statStg_pci + offset,&iCom_port_info->dram->RcvStatusAddr); + iCom_port_info->next_rcv = 0; + iCom_port_info->put_length = 0; + *iCom_port_info->xmitRestart = 0; + iCom_writel(iCom_port_info->xmitRestart_pci,&iCom_port_info->dram->XmitStatusAddr); + iCom_writeb(CMD_XMIT_RCV_ENABLE,&iCom_port_info->dram->CmdReg); + + /* clear target receive buffers 1 and 2 */ + memset(iCom_port_info->recv_buf,0,4096); + + memcpy(iCom_port_info->xmit_buf, mdm_cmnd, cmnd_length); + + iCom_port_info->statStg->xmit[0].flags = (unsigned short int)cpu_to_le16(SA_FLAGS_READY_TO_XMIT); + iCom_port_info->statStg->xmit[0].leLength = (unsigned short int)cpu_to_le16(cmnd_length); + offset = (unsigned long int)&iCom_port_info->statStg->xmit[0] - (unsigned long int)iCom_port_info->statStg; + *iCom_port_info->xmitRestart = cpu_to_le32(iCom_port_info->statStg_pci + offset); + + cmdReg = iCom_readb(&iCom_port_info->dram->CmdReg); + iCom_writeb(cmdReg | CMD_XMIT_RCV_ENABLE,&iCom_port_info->dram->CmdReg); + iCom_writeb(START_XMIT,&iCom_port_info->dram->StartXmitCmd); + TRACE(iCom_port_info,TRACE_WRITE_START,cmnd_length); +} + +static void iCom_set_code(struct iCom_port *iCom_port_info) { + char mdm_cmnd[15]; + int index; + + /* check if country code should be set at all */ + if (strlen(iCom_country_code) == 0) return; + + printk("iCom: Checking for country code on serial adapter %d port %d...",iCom_port_info->adapter, iCom_port_info->port); + + /* sync up modems (if present) */ + mdm_send(iCom_port_info,"AT\r\n",4); + mdm_rcv(iCom_port_info,"OK"); + + printk("."); + + /* send ATI0 to check for internal modem */ + mdm_send(iCom_port_info,"ATI0\r\n",6); + + /* check returned data for internal modem identification */ + if (!mdm_rcv(iCom_port_info,"SMI")) { + /* must not be internal modem - return */ + printk("\n\tno internal modem on this port\n"); + return; + } + + /* send ATE0S0=0 to turn off command Echo and turn off Auto Answer */ + mdm_send(iCom_port_info,"ATE0S0=0\r\n",10); + + /* wait for OK */ + if (!mdm_rcv(iCom_port_info,"OK")) { + /* unable to send command to modem - error */ + printk("\niCom: Error, unable to set modem Country Code\n"); + return; + } + + printk("."); + + /* Send new country code AT%T19,0, */ + sprintf(mdm_cmnd,"AT%%T19,0,%s\r\n",iCom_country_code); + mdm_send(iCom_port_info,mdm_cmnd,strlen(mdm_cmnd)); + + /* wait for OK */ + if (!mdm_rcv(iCom_port_info,"OK")) { + /* unable to set country code */ + printk("\niCom: Error, unable to set modem Country Code\n"); + return; + } + + printk(".\n"); + + /* send ATE1S0=2 to enable command Echo and auto answer */ + mdm_send(iCom_port_info,"ATE1S0=2\r\n",10); + + /* wait for OK */ + if (!mdm_rcv(iCom_port_info,"OK")) { + /* modem state unknown */ + printk("iCom: Warning, modem state unknown\n"); + } + + /* print message that Country Code set appropriately */ + printk("iCom: Modem country code for adapter %d port %d has been set to %s\n",iCom_port_info->adapter, iCom_port_info->port,iCom_country_code); + + /* disable xmitter/recvr */ + iCom_writeb(CMD_RCV_DISABLE,&iCom_port_info->dram->CmdReg); + for (index = 0; index < 10; index++) { + /* Wait 0.1 Sec for receive operations to complete*/ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/10); + + if (iCom_readb(&iCom_port_info->dram->PrevCmdReg) == 0x00) { + break; + } + } + + /* clear interrupts */ + iCom_writew(0x3FFF,(void *)iCom_port_info->int_reg); +} + +#ifdef ICOM_TRACE +void TRACE(struct iCom_port *iCom_port_info, u32 trace_pt, + u32 trace_data) { + + u32 *tp_start, *tp_end, **tp_next; + + if (trace_pt == TRACE_GET_MEM) { + if (iCom_port_info->trace_blk != 0) return; + iCom_port_info->trace_blk = kmalloc(TRACE_BLK_SZ,GFP_KERNEL); + memset(iCom_port_info->trace_blk, 0,TRACE_BLK_SZ); + iCom_port_info->trace_blk[0] = (unsigned long)iCom_port_info->trace_blk + 3*sizeof(unsigned long); + iCom_port_info->trace_blk[1] = (unsigned long)iCom_port_info->trace_blk + TRACE_BLK_SZ; + iCom_port_info->trace_blk[2] = iCom_port_info->trace_blk[0]; + } + if (iCom_port_info->trace_blk == 0) return; + + if (trace_pt == TRACE_RET_MEM) { + kfree(iCom_port_info->trace_blk); + iCom_port_info->trace_blk = 0; + return; + } + + tp_start = (u32 *)iCom_port_info->trace_blk[0]; + tp_end = (u32 *)iCom_port_info->trace_blk[1]; + tp_next = (u32 **)&iCom_port_info->trace_blk[2]; + + if (trace_data != 0) { + **tp_next = trace_data; + *tp_next = *tp_next + 1; + if (*tp_next == tp_end) *tp_next = tp_start; + **tp_next = TRACE_WITH_DATA | trace_pt; + } + else + **tp_next = trace_pt; + + *tp_next = *tp_next + 1; + if (*tp_next == tp_end) *tp_next = tp_start; +} +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/icom.h linuxppc64_2_4/drivers/char/icom.h --- ../kernel.org/linux-2.4.19/drivers/char/icom.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/char/icom.h Fri Dec 14 09:07:47 2001 @@ -0,0 +1,364 @@ +/* + * iCom.h + * + * Copyright (C) 2001 Michael Anderson, IBM Corporation + * + * Serial device driver include file. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#define TRACE_BLK_SZ 1024 +#define TRACE_WITH_DATA 0x80000000 +#define TRACE_TIME 0x40000000 +#define TRACE_GET_MEM 0x20000000 +#define TRACE_RET_MEM 0x10000000 +#define TRACE_GET_PORT_MEM 0x00000001 +#define TRACE_FOD_ADDR 0x00000005 +#define TRACE_FOD_XBUFF 0x00000006 +#define TRACE_FID_ADDR 0x00000007 +#define TRACE_FID_RBUFF 0x00000008 +#define TRACE_RET_PORT_MEM 0x00000100 +#define TRACE_LOAD_MEM 0x00000200 +#define TRACE_CHANGE_SPEED 0x00000300 +#define TRACE_PARENB 0x00000301 +#define TRACE_PARODD 0x00000302 +#define TRACE_XR_ENAB 0x00000303 +#define TRACE_STARTUP 0x00000400 +#define TRACE_CABLE_ID 0x00000401 +#define TRACE_SHUTDOWN 0x00000500 +#define TRACE_DEVICE_NUMB 0x00000600 +#define TRACE_STARTUP_ERROR 0x00000601 +#define TRACE_CLOSE 0x00000700 +#define TRACE_CLOSE_HANGUP 0x00000701 +#define TRACE_OPEN_ACTIVE 0x00000702 +#define TRACE_WRITE 0x00000800 +#define TRACE_WRITE_FULL 0x00000801 +#define TRACE_WRITE_NODATA 0x00000802 +#define TRACE_WRITE_START 0x00000803 +#define TRACE_PUT_CHAR 0x00000900 +#define TRACE_PUT_FULL 0x00000901 +#define TRACE_FLUSH_CHAR 0x00000a00 +#define TRACE_START_FLUSH 0x00000a01 +#define TRACE_WRITE_ROOM 0x00000b00 +#define TRACE_CHARS_IN_BUFF 0x00000c00 +#define TRACE_CHARS_REMAIN 0x00000c01 +#define TRACE_GET_MODEM 0x00000d00 +#define TRACE_SET_MODEM 0x00000e00 +#define TRACE_RAISE_RTS 0x00000e01 +#define TRACE_RAISE_DTR 0x00000e02 +#define TRACE_LOWER_RTS 0x00000e03 +#define TRACE_LOWER_DTR 0x00000e04 +#define TRACE_GET_SERIAL 0x00000f00 +#define TRACE_SET_SERIAL 0x00001000 +#define TRACE_SET_LSR 0x00001100 +#define TRACE_IOCTL 0x00001200 +#define TRACE_IOCTL_IGNORE 0x00001201 +#define TRACE_SEND_XCHAR 0x00001300 +#define TRACE_QUICK_WRITE 0x00001301 +#define TRACE_THROTTLE 0x00001400 +#define TRACE_UNTHROTTLE 0x00001500 +#define TRACE_SET_TERMIOS 0x00001600 +#define TRACE_STOP 0x00001700 +#define TRACE_START 0x00001800 +#define TRACE_HANGUP 0x00001900 +#define TRACE_BREAK 0x00001a00 +#define TRACE_WAIT_UNTIL_SENT 0x00001b00 +#define TRACE_FLUSH_BUFFER 0x00001c00 +#define TRACE_CHECK_MODEM 0x00001d00 +#define TRACE_CTS_UP 0x00001d01 +#define TRACE_CTS_DOWN 0x00001d02 +#define TRACE_INTERRUPT 0x00001e00 +#define TRACE_XMIT_COMPLETE 0x00001e01 +#define TRACE_RCV_COMPLETE 0x00001e02 +#define TRACE_FID_STATUS 0x00001e03 +#define TRACE_RCV_COUNT 0x00001e04 +#define TRACE_REAL_COUNT 0x00001e05 +#define TRACE_BREAK_DET 0x00001e06 +#define TRACE_IGNORE_CHAR 0x00001e07 +#define TRACE_PARITY_ERROR 0x00001e08 +#define TRACE_XMIT_DISABLED 0x00001e09 +#define TRACE_WAKEUP 0x00001f00 +#define TRACE_CLEAR_INTERRUPTS 0x0000ff00 +#define TRACE_START_PROC_A 0x0000ff01 +#define TRACE_START_PROC_B 0x0000ff02 +#define TRACE_STOP_PROC_A 0x0000ff03 +#define TRACE_STOP_PROC_B 0x0000ff04 +#define TRACE_RAISE_DTR_RTS 0x0000ff05 +#define TRACE_START_PROC_C 0x0000ff06 +#define TRACE_START_PROC_D 0x0000ff07 +#define TRACE_STOP_PROC_C 0x0000ff08 +#define TRACE_STOP_PROC_D 0x0000ff09 +#define TRACE_ENABLE_INTERRUPTS_PA 0x0000ff0a +#define TRACE_ENABLE_INTERRUPTS_PB 0x0000ff0b +#define TRACE_ENABLE_INTERRUPTS_PC 0x0000ff0c +#define TRACE_ENABLE_INTERRUPTS_PD 0x0000ff0d +#define TRACE_DIS_INTERRUPTS_PA 0x0000ff0e +#define TRACE_DIS_INTERRUPTS_PB 0x0000ff0f +#define TRACE_DIS_INTERRUPTS_PC 0x0000ff10 +#define TRACE_DIS_INTERRUPTS_PD 0x0000ff11 +#define TRACE_DROP_DTR_RTS 0x0000ff12 + +#ifndef TRACE_ONLY + +static unsigned char callSetup[1936] = + {0xBD,0xD9,0x23,0x00,0xDD,0xDD,0x18,0x05,0x23,0x80,0x3E,0x7F,0x23,0x00,0x3E,0x7C, + 0x23,0x10,0x3E,0xB7,0x3E,0xB5,0x23,0x20,0x3E,0xB6,0x3E,0xB4,0xA2,0x0A,0x86,0x0A, + 0xAE,0x0A,0x23,0x80,0x3E,0x01,0x23,0x2A,0x3E,0x02,0x3D,0xDA,0x23,0xFF,0x3E,0x03, + 0x22,0x86,0xD9,0xDD,0x10,0x1C,0x22,0x08,0x2F,0xF0,0xD9,0xF1,0x18,0x48,0xD9,0xDD, + 0x18,0x32,0x23,0x00,0x3E,0x06,0x3E,0x46,0x3E,0x86,0x3E,0xC6,0x21,0xF0,0x37,0xFA, + 0x2F,0xFA,0x3E,0x08,0x23,0x00,0x3E,0x07,0x3E,0x87,0x3E,0xC7,0x23,0xFF,0x3E,0x09, + 0x3E,0x47,0x00,0x39,0x23,0x00,0x3E,0x06,0x3E,0x08,0x23,0x13,0x3E,0x07,0x23,0x01, + 0x3E,0x09,0xA2,0xFC,0xA2,0xF6,0xCD,0xF0,0x10,0x3E,0x82,0xFC,0x23,0xFF,0x68,0x69, + 0x3B,0x00,0x10,0x3F,0x22,0x86,0xD9,0xDD,0x10,0x46,0x22,0x08,0x2F,0xF0,0x3D,0xF5, + 0x3B,0xB0,0x10,0x60,0x23,0x00,0x3E,0x02,0x23,0x01,0x3E,0x7C,0x23,0x14,0x3E,0x03, + 0x23,0x00,0x3E,0x7C,0x23,0x80,0xDD,0xF2,0x1B,0x95,0x68,0x69,0x3B,0x00,0x10,0x53, + 0x23,0x09,0x3E,0x02,0x3D,0xDA,0x23,0x08,0xDD,0xDD,0x18,0x5F,0x23,0x88,0x3E,0x7F, + 0x21,0xDA,0xC9,0xF0,0x10,0x64,0x33,0x04,0x3E,0x02,0x3D,0xDA,0xD9,0xF1,0x10,0x6B, + 0x23,0x00,0x3D,0xF1,0x00,0x93,0x21,0xF5,0x2F,0xF0,0x3B,0xE0,0x10,0x71,0x23,0x22, + 0x00,0x8A,0x3B,0x60,0x10,0x75,0x23,0x22,0x00,0x8A,0x3B,0xC0,0x10,0x79,0x23,0xEE, + 0x00,0x8A,0x3B,0xB0,0x10,0x7D,0x23,0x44,0x00,0x8A,0x3B,0x20,0x10,0x81,0x23,0xC4, + 0x00,0x8A,0x3B,0x30,0x10,0x85,0x23,0x22,0x00,0x8A,0x3B,0xF0,0x1B,0x99,0x00,0xB0, + 0xD9,0xDD,0x18,0x8B,0x3E,0x46,0x23,0x11,0x68,0x69,0x3B,0x00,0x10,0x8C,0x22,0x06, + 0x37,0xFF,0x3D,0xEF,0x81,0xF5,0x0A,0xD2,0x08,0xB6,0x08,0xD5,0x0A,0x4D,0x0B,0x4F, + 0xDD,0xF3,0x10,0x93,0xBD,0xF3,0x9D,0xD9,0x21,0xDE,0x3B,0x00,0x18,0xA7,0x5D,0x00, + 0x4B,0xC8,0x41,0xE0,0x22,0x7F,0x33,0x40,0x3E,0x7F,0x21,0xDE,0x4D,0xC8,0x58,0x00, + 0x4B,0xC8,0x41,0xFC,0x22,0x7F,0x2F,0xBF,0x3E,0x7F,0x21,0xF7,0x4D,0xC8,0x00,0xAF, + 0x23,0xDD,0x3D,0xF4,0x0B,0xB8,0x96,0xF4,0x9E,0xF6,0x00,0xB5,0xDD,0xE6,0x68,0x6E, + 0xC5,0xE6,0x68,0x6F,0xC1,0xE6,0x18,0xC6,0x81,0xE6,0x68,0x70,0x43,0xC8,0x45,0xCA, + 0x21,0xE4,0x68,0x62,0x21,0xE5,0x68,0x74,0x47,0xD4,0x68,0x6D,0x41,0xD2,0x43,0xC8, + 0x50,0x00,0x47,0xC8,0x41,0xC8,0x68,0x71,0x68,0x6E,0xC1,0xE6,0x68,0x6E,0x85,0xE6, + 0x8A,0xF4,0xC1,0xE6,0x68,0x6F,0xA5,0xE6,0x68,0x6D,0x21,0xE7,0x2F,0x0F,0x3D,0xCA, + 0x2F,0x0C,0x3B,0x0C,0x68,0x6E,0x21,0xE7,0xDE,0xF0,0x1A,0x41,0xDA,0xF0,0x18,0xFE, + 0xA2,0x0A,0x99,0xE7,0x23,0x00,0x3D,0xD6,0x3D,0xE8,0x21,0xCA,0x3B,0x0F,0x10,0xF0, + 0x23,0x81,0x3E,0x01,0x21,0xDA,0x3E,0x02,0x23,0x7E,0x3E,0x03,0x92,0xF1,0x00,0xFC, + 0x23,0x40,0x3E,0x01,0x21,0xC6,0x3E,0x02,0x21,0xC7,0x3E,0x03,0x21,0xCA,0x3B,0x0C, + 0x23,0x00,0x10,0xFB,0x23,0x01,0x3D,0xD7,0x82,0x0A,0x68,0x6D,0x21,0xCA,0x3B,0x0F, + 0x19,0x3C,0x21,0xE7,0xDE,0xF0,0x1A,0x41,0xD6,0xF0,0x11,0x22,0x68,0x70,0x43,0xC8, + 0x21,0xCA,0xC2,0xF0,0x21,0xC9,0x68,0x64,0xDD,0xD7,0x11,0x18,0x3D,0xC8,0x21,0xCA, + 0xC6,0xF0,0x21,0xC8,0x68,0x64,0x3D,0xC8,0x21,0xCA,0xCA,0xF0,0x21,0xC8,0x68,0x64, + 0x68,0x6B,0x29,0xD8,0x3B,0x7F,0xDA,0xF1,0x21,0xE7,0x2F,0x0F,0x3D,0xCA,0x11,0x22, + 0x91,0xE7,0x02,0x06,0xDE,0x04,0x68,0x6E,0x68,0x70,0x43,0xC8,0x21,0xCA,0xC2,0xF0, + 0x21,0xC9,0x68,0x64,0x3D,0xD8,0x21,0xCA,0x3D,0xCB,0xDD,0xD7,0x11,0x39,0x21,0xCA, + 0xC6,0xF0,0x21,0xD8,0x68,0x64,0x3D,0xD8,0x21,0xCA,0xCA,0xF0,0x21,0xD8,0x68,0x64, + 0x3D,0xD8,0x21,0xE7,0x2F,0x0F,0x3D,0xCA,0xDE,0x04,0x68,0x6E,0x22,0x00,0xCA,0x0A, + 0x11,0x48,0xA2,0x0A,0x99,0xE8,0x21,0xCA,0x3B,0x0F,0x1A,0x48,0x82,0x0A,0x68,0x6D, + 0x3D,0xC8,0x22,0x04,0x3D,0xC9,0x21,0xCA,0x3B,0x0F,0x11,0x58,0x21,0xC9,0x2F,0x38, + 0x3B,0x00,0x11,0x55,0x21,0xC8,0x3E,0xB0,0x01,0xEC,0xD5,0xE7,0x68,0x6E,0x02,0x06, + 0x21,0xC9,0xD6,0xF0,0x11,0x5C,0x95,0xE8,0xCE,0xF0,0x11,0x5F,0x9D,0xE8,0x21,0xD7, + 0x3B,0x00,0x19,0xEB,0xDE,0xF0,0x11,0x6C,0xDA,0xF0,0x19,0xAE,0x21,0xC8,0x3B,0x54, + 0x11,0xEC,0x23,0xC2,0x3D,0xD7,0x01,0xEC,0x3B,0x01,0x11,0x72,0x21,0xC8,0x3B,0x52, + 0x11,0xE9,0x01,0xE5,0x3B,0x02,0x11,0x78,0x21,0xC8,0x3B,0x4F,0x11,0xE9,0x01,0xE5, + 0x3B,0x03,0x11,0x7E,0x21,0xC8,0x3B,0x42,0x11,0xE9,0x01,0xE5,0x3B,0x04,0x11,0x84, + 0x21,0xC8,0x3B,0x47,0x11,0xE9,0x01,0xE5,0x3B,0x05,0x11,0x8A,0x21,0xC8,0x3B,0x20, + 0x11,0xE9,0x01,0xE5,0x3B,0x06,0x11,0x90,0x21,0xC8,0x3B,0x54,0x11,0xE9,0x01,0xE5, + 0x3B,0x07,0x11,0x96,0x21,0xC8,0x3B,0x4F,0x11,0xE9,0x01,0xE5,0x3B,0x08,0x11,0x9C, + 0x21,0xC8,0x3B,0x4E,0x11,0xE9,0x01,0xE5,0x3B,0x09,0x11,0xE9,0x21,0xC8,0x3B,0x59, + 0x11,0xE9,0x23,0x80,0x3D,0xD7,0x21,0xCB,0xC6,0xF0,0x21,0xD8,0x68,0x64,0x3D,0xD8, + 0x21,0xCB,0xCA,0xF0,0x21,0xD8,0x68,0x64,0x3D,0xD8,0x01,0xEC,0x2F,0x3F,0x3B,0x02, + 0x11,0xB5,0x21,0xC8,0x3B,0x4F,0x11,0xE2,0x01,0xE5,0x3B,0x03,0x11,0xBB,0x21,0xC8, + 0x3B,0x4E,0x11,0xE2,0x01,0xE5,0x3B,0x04,0x11,0xC1,0x21,0xC8,0x3B,0x59,0x11,0xE2, + 0x01,0xE5,0x3B,0x05,0x11,0xC7,0x21,0xC8,0x3B,0x20,0x11,0xE2,0x01,0xE5,0x3B,0x06, + 0x11,0xCD,0x21,0xC8,0x3B,0x52,0x11,0xE2,0x01,0xE5,0x3B,0x07,0x11,0xD3,0x21,0xC8, + 0x3B,0x4F,0x11,0xE2,0x01,0xE5,0x3B,0x08,0x11,0xD9,0x21,0xC8,0x3B,0x42,0x11,0xE2, + 0x01,0xE5,0x3B,0x09,0x11,0xE2,0x21,0xC8,0x3B,0x47,0x11,0xE2,0x21,0xE8,0x33,0x01, + 0x3D,0xE8,0x01,0xEC,0x23,0x80,0x3D,0xD7,0x01,0x66,0x21,0xD7,0x68,0x68,0x3D,0xD7, + 0x01,0xEB,0x23,0x00,0x3D,0xD7,0x95,0xE7,0xD1,0xE8,0x1A,0x15,0x45,0xDE,0xDE,0xF1, + 0x11,0xF3,0x91,0xE8,0x02,0x08,0x51,0xCC,0x21,0xD6,0x68,0x62,0x21,0xC8,0x68,0x67, + 0x45,0xDE,0x23,0x01,0x68,0x63,0x47,0xDE,0x21,0xD6,0x68,0x68,0x3D,0xD6,0x21,0xE8, + 0x2F,0x07,0x3B,0x01,0x1A,0x06,0x21,0xD6,0x3B,0x08,0x12,0x15,0xD1,0xE8,0x1A,0x15, + 0x21,0xD6,0x3B,0x00,0x1A,0x15,0x41,0xE0,0x57,0xCC,0xDD,0xF2,0x1B,0x95,0xC2,0xF1, + 0x12,0x0D,0x68,0x62,0x43,0xE0,0x23,0x00,0x3D,0xD6,0x21,0xCA,0x3B,0x0F,0x12,0x3A, + 0x21,0xC9,0x2F,0x38,0x3B,0x00,0x12,0x1E,0x95,0xE7,0x68,0x6D,0x3B,0x10,0x12,0x35, + 0x21,0xC8,0x3B,0x7E,0x12,0x32,0x22,0x74,0x3B,0x47,0x12,0x2F,0x22,0x75,0x3B,0x0F, + 0x12,0x2F,0x21,0xE7,0xDE,0xF0,0x1A,0x41,0xD6,0xF0,0x68,0x6E,0x02,0x48,0x23,0x04, + 0x3D,0xE8,0x02,0x48,0x23,0x03,0x3D,0xE8,0x02,0x48,0xCE,0xF0,0x68,0x6F,0x23,0x02, + 0x3D,0xE8,0x02,0x48,0x21,0xE8,0x2F,0x07,0x3B,0x01,0x1A,0x48,0xD1,0xE7,0x68,0x6E, + 0x02,0x48,0xA2,0x0A,0xCE,0xF4,0x68,0x6F,0xDD,0xE7,0x68,0x6E,0x23,0x07,0x3D,0xE8, + 0xA2,0x0A,0x23,0x00,0x3D,0xE7,0x8E,0xF4,0x68,0x6D,0x21,0xE7,0x2F,0x0F,0x3D,0xCA, + 0x2F,0x0C,0x3D,0xCB,0x3B,0x08,0x68,0x6E,0x21,0xE7,0xDE,0xF0,0x1A,0xC6,0xDA,0xF0, + 0x1A,0x72,0x99,0xE7,0x23,0x00,0x3D,0xD6,0x3D,0xE8,0x21,0xCA,0x3B,0x0B,0x12,0x69, + 0x23,0x81,0x3E,0x01,0x21,0xDA,0x3E,0x02,0x23,0x7E,0x3E,0x03,0x86,0x0A,0xAE,0x0A, + 0x68,0x6D,0x23,0x40,0x3E,0x01,0x21,0xC6,0x3E,0x02,0x21,0xC7,0x3E,0x03,0x86,0x0A, + 0xAE,0x0A,0x68,0x6D,0x21,0xD6,0x3B,0x00,0x12,0x96,0x45,0xDE,0xDE,0xF1,0x12,0x80, + 0x21,0xCA,0x3B,0x0B,0x12,0xCE,0xD1,0xE7,0x12,0xB2,0xC6,0x04,0x68,0x6E,0x02,0xCE, + 0x21,0xDF,0x3B,0x00,0x12,0x87,0x21,0xDE,0x3B,0x08,0xDA,0xF1,0x12,0x88,0x23,0x08, + 0x41,0xE0,0x55,0xCC,0xDD,0xF2,0x1B,0x95,0xC2,0xF1,0x12,0x8A,0x68,0x62,0x43,0xE0, + 0x45,0xDE,0x68,0x63,0x47,0xDE,0x3D,0xD6,0x23,0x00,0x3D,0xD8,0xC2,0x04,0x68,0x6E, + 0xD5,0xE7,0x1A,0xA3,0x21,0xCA,0x3B,0x0B,0x12,0xA3,0x23,0x7E,0x68,0x6A,0x3E,0x00, + 0x95,0xE7,0x96,0xF1,0xAE,0x0A,0x51,0xCC,0x21,0xD8,0x68,0x62,0x68,0x68,0x3D,0xD8, + 0x68,0x66,0x68,0x6B,0x3E,0x00,0x3E,0xB1,0xCE,0x0A,0x1A,0xC2,0x21,0xD6,0x68,0x69, + 0x3D,0xD6,0x02,0x72,0x22,0x05,0x2F,0x0F,0x3B,0x0C,0xDA,0xF1,0x68,0x6F,0x22,0x78, + 0x68,0x6B,0x3E,0x00,0x22,0x79,0x3E,0x00,0x23,0x7E,0x68,0x6A,0x3E,0x00,0x91,0xE7, + 0xCE,0x0A,0x68,0x6E,0x86,0x0A,0xAE,0x0A,0x99,0xE8,0x02,0xCE,0x86,0x0A,0xAE,0x0A, + 0xCE,0xF4,0x68,0x6F,0xDD,0xE7,0x68,0x6E,0x23,0x07,0x3D,0xE8,0x23,0x00,0x3D,0xE7, + 0x8E,0xF4,0x68,0x6D,0x22,0x86,0xD9,0xDD,0x12,0xD6,0x22,0x08,0x2F,0xF0,0x3B,0xB0, + 0x1B,0x05,0x22,0x86,0xD9,0xDD,0x12,0xDD,0x22,0x08,0x2F,0xF0,0x3B,0xF0,0x1B,0x99, + 0xD9,0xDD,0x1A,0xE7,0x21,0xF0,0x37,0xFA,0x2F,0xFA,0x3E,0x08,0x02,0xF0,0xDD,0xF0, + 0x68,0x6C,0xF2,0x06,0xD9,0xF0,0x68,0x6C,0xE6,0x06,0xC5,0xF0,0x68,0x6C,0xE2,0x06, + 0xDD,0xF2,0x1B,0x95,0xDD,0xF1,0x12,0xF7,0xDD,0xD9,0x1B,0x9F,0xBD,0xF1,0x22,0x06, + 0x37,0xFF,0x39,0xEF,0x12,0xFC,0x68,0x6D,0x3D,0xC8,0x35,0xEF,0x2D,0xEE,0x3B,0x00, + 0x21,0xC8,0x3D,0xEF,0x68,0x6F,0x92,0xF4,0x68,0x6D,0x22,0x86,0xD9,0xDD,0x13,0x09, + 0x22,0x08,0x2F,0xF0,0x3B,0xF0,0x1B,0x99,0xD9,0xDD,0x1B,0x12,0xDD,0xF0,0x68,0x6C, + 0xFE,0x08,0x03,0x15,0xDD,0xF0,0x68,0x6C,0xF2,0x06,0xDD,0xE9,0x13,0x1E,0xD9,0xE9, + 0x1B,0x1E,0x22,0xBC,0x3B,0x10,0x13,0x1F,0x99,0xE9,0x82,0xF4,0x3E,0xBC,0xDD,0xEA, + 0x13,0x28,0xD9,0xEA,0x1B,0x28,0x22,0xBD,0x3B,0x10,0x13,0x29,0x99,0xEA,0x82,0xF4, + 0x3E,0xBD,0xDD,0xEB,0x13,0x32,0xD9,0xEB,0x1B,0x32,0x22,0xBE,0x3B,0x10,0x13,0x33, + 0x99,0xEB,0x82,0xF4,0x3E,0xBE,0xDD,0xEC,0x13,0x3C,0xD9,0xEC,0x1B,0x3C,0x22,0xBF, + 0x3B,0x10,0x13,0x3D,0x99,0xEC,0x82,0xF4,0x3E,0xBF,0xDD,0xF2,0x1B,0x95,0xDD,0xF1, + 0x13,0x44,0xDD,0xD9,0x1B,0x9F,0xBD,0xF1,0x22,0x06,0x37,0xFF,0x2F,0x80,0x39,0xEF, + 0x13,0x4A,0x68,0x6D,0x3D,0xEF,0xDD,0xEE,0x68,0x6E,0x92,0xF4,0x68,0x6D,0x21,0xE7, + 0x2F,0x0F,0x3B,0x04,0x68,0x6E,0x21,0xE7,0xDE,0xF0,0x1B,0x75,0xDA,0xF0,0x1B,0x6A, + 0x68,0x70,0x43,0xC8,0x21,0xCA,0xC2,0xF0,0x21,0xC9,0x68,0x64,0x3D,0xD8,0x23,0x81, + 0x3E,0x01,0x21,0xDA,0x33,0x20,0x3E,0x02,0x23,0x80,0x3E,0x01,0x23,0x00,0x3E,0xF5, + 0x99,0xE7,0x68,0x6D,0x68,0x70,0x43,0xC8,0x21,0xCA,0xC2,0xF0,0x21,0xC9,0x68,0x64, + 0x68,0x6B,0x29,0xD8,0x3B,0x3B,0xDA,0xF1,0x13,0x80,0xCE,0xF4,0x68,0x6F,0xDD,0xE7, + 0x68,0x6E,0x21,0xDA,0x2F,0xDF,0x3E,0x02,0x68,0x6A,0xFE,0x08,0xBD,0xF0,0x03,0x91, + 0xD6,0xF0,0x1B,0x8D,0x22,0xF5,0x3B,0x08,0xDA,0xF1,0x68,0x6E,0x21,0xDA,0x2F,0xDF, + 0x3E,0x02,0x68,0x6A,0xFE,0x08,0xBD,0xF0,0x95,0xE7,0x22,0xF5,0x3B,0x22,0xDA,0xF1, + 0x68,0x6E,0x23,0x00,0x3D,0xE7,0x8E,0xF4,0x68,0x6D,0x0B,0xB8,0xBD,0xF2,0x9E,0xF6, + 0x03,0x96,0x23,0xCC,0x3D,0xF4,0x0B,0xB8,0x96,0xF4,0x9E,0xF6,0x03,0x9E,0xBD,0xD9, + 0xA2,0x0A,0x86,0x0A,0xAE,0x0A,0xBD,0xF1,0x0B,0x4F,0xD9,0xF1,0x1B,0xAF,0x0A,0xD2, + 0xDD,0xF3,0x13,0xA3,0x23,0x00,0x3D,0xF1,0xBD,0xF3,0x9D,0xD9,0x00,0x00,0x58,0x00, + 0x4B,0xC8,0x41,0xF8,0x22,0x7F,0x2F,0xBF,0x3E,0x7F,0x21,0xF6,0x4D,0xC8,0x03,0xB7, + 0xA2,0x0A,0x86,0x0A,0xAE,0x0A,0xD9,0xDD,0x1B,0xBF,0x23,0x00,0x3E,0x46,0x23,0x00, + 0x3D,0xF5,0x3D,0xF0,0x3D,0xEF,0x23,0x12,0x3E,0x06,0x23,0xC0,0x3E,0x08,0x68,0x6D}; + +static unsigned char resRVdce[192] = + {0x22,0x86,0xD9,0xDD,0x15,0x04,0x22,0x08,0x2F,0xF0,0x3B,0xF0,0x1D,0x30,0xD9,0xDD, + 0x1D,0x0E,0x21,0xF0,0x37,0xFA,0x2F,0xFA,0x3E,0x08,0x05,0x17,0xDD,0xF0,0x68,0x6C, + 0xF2,0x06,0xD9,0xF0,0x68,0x6C,0xE6,0x06,0xC5,0xF0,0x68,0x6C,0xE2,0x06,0xDD,0xF2, + 0x1D,0x2C,0xDD,0xF1,0x15,0x1E,0xDD,0xD9,0x1D,0x36,0xBD,0xF1,0x22,0x06,0x37,0xFF, + 0x39,0xEF,0x15,0x23,0x68,0x6D,0x3D,0xC8,0x35,0xEF,0x2D,0xEE,0x3B,0x00,0x21,0xC8, + 0x3D,0xEF,0x68,0x6F,0x92,0xF4,0x68,0x6D,0x0D,0x4F,0xBD,0xF2,0x9E,0xF6,0x05,0x2D, + 0x23,0xCC,0x3D,0xF4,0x0D,0x4F,0x96,0xF4,0x9E,0xF6,0x05,0x35,0xBD,0xD9,0xA2,0x0A, + 0x86,0x0A,0xAE,0x0A,0xBD,0xF1,0x0D,0x5F,0xD9,0xF1,0x1D,0x46,0x0D,0x00,0xDD,0xF3, + 0x15,0x3A,0x23,0x00,0x3D,0xF1,0xBD,0xF3,0x9D,0xD9,0x00,0x00,0x58,0x00,0x4B,0xC8, + 0x41,0xF8,0x22,0x7F,0x2F,0xBF,0x3E,0x7F,0x21,0xF6,0x4D,0xC8,0x05,0x4E,0xA2,0x0A, + 0x86,0x0A,0xAE,0x0A,0xD9,0xDD,0x1D,0x56,0x23,0x00,0x3E,0x46,0x23,0x00,0x3D,0xF5, + 0x3D,0xF0,0x3D,0xEF,0x23,0x12,0x3E,0x06,0x23,0xC0,0x3E,0x08,0x68,0x6D,0x68,0x6D}; + +static unsigned char funcLoad[1760] = + {0xA1,0xC5,0x23,0x40,0x3E,0x01,0x21,0xC6,0x3E,0x02,0x21,0xC7,0x3E,0x03,0x23,0x10, + 0x3E,0xB7,0x3E,0xB5,0x23,0x20,0x3E,0xB6,0x3E,0xB4,0xCD,0xBD,0x18,0x12,0x23,0x7E, + 0x3D,0x9F,0x00,0x14,0x23,0xC0,0x3D,0x9F,0x50,0x00,0x43,0x84,0x23,0x00,0x3D,0x98, + 0x0B,0x1E,0x09,0xAE,0xC1,0xC5,0x18,0x00,0x49,0x8C,0x68,0x6D,0x21,0xBF,0x3B,0x00, + 0x18,0x26,0xC2,0x04,0x10,0x32,0x3E,0x00,0x23,0x00,0x3D,0xBF,0xC5,0xC5,0x18,0x32, + 0xC9,0xC5,0x10,0x30,0xC2,0x04,0x10,0x32,0x23,0x00,0x68,0x6A,0x3E,0x00,0x00,0x32, + 0x49,0x8E,0x68,0x6D,0x21,0xC5,0x39,0xC4,0x13,0x3A,0xC2,0xF1,0x10,0x59,0xD5,0x98, + 0x18,0x44,0x95,0x98,0x45,0x8A,0xDE,0xF1,0x10,0x4F,0x21,0x42,0x39,0x40,0x10,0x54, + 0x45,0x88,0xDE,0xF1,0x10,0x55,0x00,0x59,0xB5,0x98,0x45,0x88,0xDE,0xF1,0x10,0x55, + 0x45,0x8A,0xDE,0xF1,0x10,0x4F,0x21,0x42,0x39,0x40,0x10,0x54,0x00,0x59,0x49,0x8A, + 0x45,0x84,0x47,0x8A,0x68,0x6D,0x00,0x59,0x00,0xD9,0x49,0x88,0x45,0x84,0x47,0x88, + 0x68,0x6D,0x0D,0x00,0x00,0x1A,0x00,0x1E,0xDE,0x04,0x10,0x1E,0x78,0x60,0x67,0x8C, + 0xD9,0x97,0x10,0x64,0xB9,0x97,0x00,0x97,0xDE,0x04,0x10,0xB6,0x68,0x70,0x43,0x80, + 0x45,0x82,0x21,0xB7,0x68,0x62,0x47,0x82,0x22,0x00,0x3D,0x90,0xCA,0x0A,0x18,0x97, + 0x22,0x04,0x2F,0x28,0x3D,0x91,0x3B,0x00,0x18,0x7E,0xD6,0xF0,0x10,0x7A,0x95,0xA4, + 0x8D,0xA4,0x00,0x7E,0xCE,0xF0,0x10,0x7E,0x9D,0xA4,0x8D,0xA4,0x45,0x70,0x21,0x90, + 0x68,0x67,0x68,0x60,0x47,0x70,0x21,0x99,0x68,0x68,0x3D,0x99,0x53,0x74,0x18,0xD7, + 0xCD,0xA4,0xAD,0xA4,0x18,0x97,0x3B,0x20,0x68,0x6E,0x21,0x70,0x3B,0x60,0xDA,0xF1, + 0x10,0x95,0x23,0x00,0x3D,0x70,0x39,0x74,0x18,0xD7,0x23,0x00,0x00,0xBA,0x78,0x5C, + 0x67,0x8C,0x21,0x70,0x3B,0x60,0xDA,0xF1,0x10,0xA1,0x23,0x00,0x3D,0x70,0x39,0x74, + 0x18,0xA5,0xCA,0x0A,0x18,0xA5,0xC1,0x97,0x10,0xAA,0xAA,0x0A,0xA1,0x97,0xA2,0x0A, + 0x23,0x50,0x00,0xB3,0xDD,0xA4,0x10,0xAE,0x23,0x90,0x00,0xB3,0xD5,0xA4,0x10,0xB2, + 0x23,0x11,0x00,0xB3,0x23,0x10,0xBD,0xA4,0xB5,0xA4,0x00,0xBA,0x41,0x80,0x68,0x71, + 0x68,0x6E,0x00,0x97,0x45,0x40,0x68,0x67,0x68,0x60,0x21,0x99,0x68,0x67,0x23,0x00, + 0x3D,0x99,0x21,0x40,0x68,0x6B,0x27,0x02,0x3B,0x40,0x10,0xC7,0x23,0x00,0x39,0x42, + 0x18,0xCB,0x3D,0x40,0x68,0x6D,0x45,0x40,0x3D,0x40,0x23,0x50,0x68,0x67,0x68,0x60, + 0x68,0x66,0x33,0x80,0x68,0x67,0xA2,0x0A,0x78,0x5C,0x67,0x8C,0x00,0x1E,0x81,0x97, + 0x00,0x97,0x45,0x42,0x68,0x66,0x3D,0x94,0x68,0x60,0x68,0x66,0x2F,0x3F,0x3D,0x95, + 0x3B,0x00,0x19,0x1E,0xDD,0x97,0x18,0xFE,0xD9,0x97,0x10,0xE8,0x9D,0x97,0x00,0xFE, + 0x45,0x50,0x43,0x90,0x50,0x00,0x21,0x95,0x68,0x62,0x53,0x90,0xDA,0xF1,0x19,0x59, + 0x45,0x50,0x68,0x63,0x47,0x50,0x41,0x4C,0x21,0x95,0x4F,0x74,0x68,0x62,0x43,0x4C, + 0x45,0x54,0x68,0x62,0x47,0x54,0x79,0x02,0x67,0x8A,0x00,0x59,0x21,0x95,0x45,0x54, + 0x68,0x62,0x47,0x54,0x21,0x74,0x68,0x6B,0x25,0x95,0x3B,0x60,0xDA,0xF1,0x11,0x09, + 0x23,0x00,0x3D,0x74,0x21,0x94,0xDA,0xF0,0x11,0x0E,0x82,0x0A,0x21,0x94,0xD2,0xF0, + 0x19,0x1E,0x45,0x50,0xDE,0xF1,0x11,0x86,0xDD,0x97,0x19,0x86,0x41,0x48,0xDE,0xF1, + 0x19,0x1C,0x23,0x0C,0x55,0x48,0x01,0x86,0x99,0x97,0x01,0x86,0xCD,0x94,0x11,0x22, + 0x8E,0xF2,0x01,0x86,0x41,0x44,0xDE,0xF1,0x19,0x43,0x45,0x42,0x68,0x60,0x68,0x66, + 0xDE,0xF0,0x11,0x2B,0x82,0x0A,0x21,0x94,0xDD,0x97,0x11,0x30,0x33,0x02,0xBD,0x97, + 0xB9,0x97,0x3D,0x56,0x41,0xB0,0x23,0x10,0x68,0x62,0x23,0x04,0x57,0x54,0x79,0x3A, + 0x67,0x8A,0x00,0x59,0x92,0xF2,0x23,0x00,0x3D,0x54,0x3D,0x55,0x41,0x44,0x43,0xB0, + 0x23,0x10,0x55,0x44,0x01,0x86,0x41,0xB0,0x23,0x10,0x55,0x44,0x79,0x49,0x67,0x8A, + 0x00,0x59,0x41,0x44,0xDE,0xF1,0x11,0x25,0x68,0x70,0x23,0x0E,0x68,0x62,0x43,0x78, + 0x79,0x53,0x67,0x8A,0x00,0x59,0x41,0x78,0x68,0x71,0x19,0x43,0x79,0x53,0x67,0x8A, + 0x00,0x59,0x21,0x95,0x68,0x6B,0x29,0x50,0x3D,0x95,0x41,0x4C,0x21,0x50,0x4F,0x74, + 0x45,0x54,0x68,0x62,0x47,0x54,0x79,0x66,0x67,0x8A,0x00,0x59,0x21,0x74,0x68,0x6B, + 0x25,0x50,0x3D,0x74,0x41,0x48,0xDE,0xF1,0x11,0x75,0x9D,0x97,0x21,0x95,0x45,0x54, + 0x68,0x62,0x47,0x54,0x50,0x00,0x47,0x50,0x01,0x02,0x23,0x0C,0x55,0x48,0x79,0x7A, + 0x67,0x8A,0x00,0x59,0x45,0x50,0x43,0x90,0x50,0x00,0x21,0x95,0x68,0x62,0x53,0x90, + 0xDA,0xF1,0x19,0x59,0x45,0x50,0x68,0x63,0x47,0x50,0x00,0xF3,0x21,0x42,0x68,0x6B, + 0x27,0x02,0x3B,0x40,0x11,0x8C,0x23,0x00,0x3D,0x42,0x00,0x59,0x4B,0x92,0x09,0xAE, + 0x41,0xB0,0xDE,0xF1,0x11,0x99,0x79,0x97,0x67,0x8A,0x49,0x92,0x68,0x6D,0x4B,0x92, + 0x01,0x90,0x23,0x10,0x55,0x44,0x79,0x9F,0x67,0x8A,0x49,0x92,0x68,0x6D,0x82,0x0A, + 0x78,0x5C,0x67,0x8C,0x50,0x00,0x43,0x70,0x43,0x74,0x23,0x00,0x3D,0xB5,0x3D,0x99, + 0x3D,0x54,0x3D,0x55,0x3D,0x97,0x00,0x53,0x81,0x98,0x01,0xAF,0xA1,0x98,0xA2,0x0A, + 0x51,0x00,0x47,0x40,0x47,0x42,0x45,0x84,0x47,0x8A,0x78,0x5B,0x67,0x8C,0xC1,0x98, + 0x68,0x6E,0x8A,0xF2,0x68,0x6D,0x00,0x32,0x96,0xF2,0xB1,0x98,0xDD,0xBC,0x10,0x32, + 0xBD,0xBC,0x79,0xC6,0x67,0x8E,0x7A,0x5D,0x67,0x88,0x00,0x32,0x00,0x32,0xC2,0x04, + 0x10,0x32,0x4B,0x92,0x0A,0x4D,0x49,0x92,0xC1,0x9D,0x11,0xCF,0x00,0x32,0x79,0xD2, + 0x67,0x8E,0x68,0x6D,0xC2,0x04,0x10,0x32,0x4B,0x92,0x0A,0x4D,0x49,0x92,0xC1,0x9D, + 0x11,0xDA,0x00,0x32,0x45,0x6E,0x68,0x66,0x68,0x6B,0x3E,0x00,0x21,0x6E,0x68,0x68, + 0x3B,0x00,0x11,0xE3,0x23,0x80,0x3D,0x6E,0x68,0x6B,0x29,0x6C,0x11,0xEA,0xC9,0x98, + 0x19,0xF7,0x01,0xEC,0xC9,0x98,0x19,0xF1,0x3B,0x00,0x68,0x6E,0x7A,0x0E,0x67,0x8E, + 0x68,0x6D,0x3B,0x1F,0xDA,0xF1,0x19,0xF7,0x3B,0x00,0x19,0xFD,0x68,0x6D,0x45,0x88, + 0xDE,0xF1,0x68,0x6E,0x7A,0x80,0x67,0x88,0x68,0x6D,0x7A,0x00,0x67,0x8E,0x68,0x6D, + 0xC2,0x04,0x10,0x32,0x4B,0x92,0x0A,0x4D,0x49,0x92,0xC1,0x9D,0x12,0x08,0x00,0x32, + 0x21,0x6C,0x39,0x6E,0x18,0x32,0x79,0xD2,0x67,0x8E,0x01,0xDA,0xC2,0x04,0x10,0x32, + 0x4B,0x92,0x0A,0x4D,0x49,0x92,0xC1,0x9D,0x12,0x16,0x00,0x32,0xBD,0x98,0x91,0x98, + 0xAD,0x98,0x7A,0x3D,0x67,0x8E,0x7A,0x1E,0x67,0x88,0x68,0x6D,0x41,0x58,0xDE,0xF1, + 0x12,0x27,0x41,0x7C,0x23,0x14,0x55,0x58,0x7A,0x27,0x67,0x88,0x00,0x59,0x23,0x80, + 0x3D,0x6A,0x41,0x7C,0x23,0x12,0x68,0x62,0x23,0x01,0x57,0x6A,0x7A,0x31,0x67,0x88, + 0x00,0x59,0x8D,0x98,0x41,0x58,0xDE,0xF1,0x12,0x37,0x9D,0x98,0x68,0x6D,0x96,0xF2, + 0xBD,0xBC,0xB1,0x98,0x79,0xC6,0x67,0x8E,0x02,0x6B,0xDD,0x98,0x10,0x32,0xDD,0xBC, + 0x19,0xBC,0xD5,0x6B,0x1A,0x48,0xD1,0x6B,0x1A,0x46,0x91,0x6B,0xC6,0x04,0x10,0x32, + 0x96,0xF2,0xB1,0x98,0x79,0xBE,0x67,0x8E,0x00,0x32,0xC1,0xBD,0x12,0x54,0x22,0x06, + 0xDA,0xF0,0x1A,0x55,0xBE,0x0A,0xA1,0x9D,0x68,0x6D,0x81,0x9D,0x22,0x05,0x2F,0x0F, + 0x3B,0x01,0xDA,0xF1,0x68,0x6E,0x9E,0x0A,0x68,0x6D,0x41,0xB8,0x23,0x04,0x55,0x7C, + 0x7A,0x63,0x67,0x88,0x00,0x59,0x41,0x7C,0xDE,0xF1,0x12,0x6B,0x23,0x88,0x3D,0xBE, + 0x79,0xBE,0x67,0x8E,0x00,0x59,0x23,0x14,0x55,0x58,0x43,0x7C,0x7A,0x72,0x67,0x88, + 0x99,0xB6,0x00,0x59,0x41,0x58,0xDE,0xF1,0x12,0x7B,0x41,0xB8,0x23,0x04,0x57,0x84, + 0x7A,0x7B,0x67,0x88,0x00,0x59,0x99,0x98,0x89,0x98,0x50,0x80,0x47,0x6C,0x47,0x6E, + 0x45,0x68,0xDE,0xF1,0x12,0xA5,0x23,0x80,0xDD,0x6B,0x13,0x17,0x23,0x10,0x3D,0x6A, + 0x41,0x7C,0x23,0x12,0x68,0x62,0x23,0x01,0x57,0x6A,0x7A,0x90,0x67,0x88,0x00,0x59, + 0x41,0x58,0x23,0x82,0xDE,0xF1,0x1B,0x17,0x23,0x14,0x55,0x58,0x43,0x7C,0x7A,0x9A, + 0x67,0x88,0x00,0x59,0xDD,0x6B,0x1A,0xA5,0x41,0x58,0xDE,0xF1,0x12,0xA5,0x41,0xB8, + 0x23,0x04,0x57,0x84,0x7A,0xA5,0x67,0x88,0x00,0x59,0x45,0x66,0xDE,0xF1,0x1A,0xC0, + 0x65,0x66,0x63,0x90,0x45,0x64,0x53,0x90,0xDA,0xF1,0x1A,0xB9,0x68,0x5D,0x67,0x66, + 0x41,0x5C,0x23,0x84,0xDE,0xF1,0x1B,0x17,0x23,0x0A,0x55,0x5C,0x7A,0xA8,0x67,0x88, + 0x00,0x59,0x68,0x7D,0x47,0x64,0x41,0x60,0x68,0x7C,0x43,0x60,0x70,0x00,0x67,0x66, + 0x45,0x64,0xDE,0xF1,0x12,0xCC,0x41,0x5C,0x23,0x86,0xDE,0xF1,0x1B,0x17,0x23,0x0A, + 0x55,0x5C,0x7A,0xCC,0x67,0x88,0x00,0x59,0x21,0x6E,0x68,0x6B,0x29,0x6C,0x1A,0xD3, + 0x3B,0x00,0x1A,0xD3,0x02,0xD6,0x23,0x00,0x68,0x6B,0x29,0x6C,0x3D,0x96,0x45,0x68, + 0x43,0x90,0x70,0x00,0x68,0x42,0x73,0x90,0xDA,0xF1,0x12,0xE5,0x45,0x64,0x53,0x90, + 0xDA,0xF1,0x21,0x64,0x12,0xEB,0x21,0x68,0x02,0xEB,0x63,0x90,0x45,0x64,0x53,0x90, + 0xDA,0xF1,0x1A,0xEC,0x21,0x64,0x3D,0x96,0x21,0x96,0x41,0x60,0x4D,0x6C,0x68,0x62, + 0x43,0x60,0x45,0x64,0x68,0x63,0x47,0x64,0x45,0x68,0x68,0x63,0x47,0x68,0x7A,0xFA, + 0x67,0x88,0x00,0x59,0x21,0x6C,0x68,0x6B,0x25,0x96,0x3B,0x00,0x13,0x00,0x23,0x80, + 0x3D,0x6C,0x45,0x68,0xDE,0xF1,0x13,0x0A,0xDD,0x6B,0x1B,0x0A,0xA9,0x98,0xD9,0x98, + 0x10,0x59,0x03,0x0F,0xD9,0x98,0x10,0x59,0x21,0x6C,0x3B,0x80,0x12,0x80,0xB9,0x98, + 0x79,0xC7,0x67,0x8E,0xC2,0x04,0x10,0x59,0x49,0x8E,0x68,0x6D,0x03,0x12,0x3D,0xBE, + 0x0B,0x1E,0x09,0xAE,0x9E,0xF4,0x00,0x1A,0x81,0x98,0x03,0x1F,0xA1,0x98,0x86,0x0A, + 0xAE,0x0A,0xC5,0x98,0x13,0x26,0x22,0x01,0x2F,0xFC,0x3E,0x01,0x45,0x84,0x47,0x88, + 0x79,0xBB,0x67,0x8E,0xC1,0x98,0x68,0x6E,0x86,0xF2,0x68,0x6D,0x79,0xBE,0x67,0x8E, + 0xB1,0x98,0xC5,0x98,0x13,0x5D,0x22,0x01,0x2F,0xFD,0xD5,0xBD,0x13,0x38,0x33,0x03, + 0x3E,0x01,0x03,0x5D,0x3D,0x90,0x35,0xC4,0x3D,0x91,0xD1,0x90,0x1B,0x60,0xD5,0x90, + 0x13,0x48,0xD5,0x91,0x10,0x35,0x09,0xAE,0x0B,0x1E,0x82,0xF2,0x95,0xC4,0x00,0x35, + 0xDD,0x91,0x13,0x4F,0xDD,0x90,0x13,0x4E,0x09,0x8E,0x03,0x4F,0x09,0xAC,0xD9,0x91, + 0x13,0x5D,0xD9,0x90,0x1B,0x2E,0xD1,0x98,0x13,0x5C,0xCD,0x98,0x1B,0x5A,0x21,0x90, + 0x33,0x40,0x03,0x5E,0x96,0xF2,0xB1,0x98,0x0B,0x1C,0x21,0x90,0x3D,0xC4,0x00,0x35, + 0x91,0xC4,0x58,0x00,0x4B,0xC8,0x41,0xFC,0x22,0x7F,0x2F,0xBF,0x3E,0x7F,0x21,0xF7, + 0x4D,0xC8,0x03,0x69,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00}; + +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/lp.c linuxppc64_2_4/drivers/char/lp.c --- ../kernel.org/linux-2.4.19/drivers/char/lp.c Fri Apr 19 10:59:54 2002 +++ linuxppc64_2_4/drivers/char/lp.c Mon Apr 22 10:32:50 2002 @@ -611,10 +611,13 @@ return -EFAULT; break; case LPGETSTATUS: - lp_claim_parport_or_block (&lp_table[minor]); + if (down_interruptible (&lp_table[minor].port_mutex)) + return -EINTR; + parport_claim_or_block (lp_table[minor].dev); status = r_str(minor); - lp_release_parport (&lp_table[minor]); - + parport_release (lp_table[minor].dev); + + up (&lp_table[minor].port_mutex); if (copy_to_user((int *) arg, &status, sizeof(int))) return -EFAULT; break; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/char/tty_io.c linuxppc64_2_4/drivers/char/tty_io.c --- ../kernel.org/linux-2.4.19/drivers/char/tty_io.c Wed May 15 08:29:38 2002 +++ linuxppc64_2_4/drivers/char/tty_io.c Mon May 13 16:39:12 2002 @@ -2184,6 +2184,11 @@ * set up the console device so that later boot sequences can * inform about problems etc.. */ + +#ifdef CONFIG_VIOCONS + viocons_init(); +#endif + #ifdef CONFIG_VT con_init(); #endif @@ -2245,6 +2250,9 @@ #ifdef CONFIG_SERIAL_TX3912_CONSOLE tx3912_console_init(); #endif +#ifdef CONFIG_HVC_CONSOLE + hvc_console_init(); +#endif #ifdef CONFIG_TXX927_SERIAL_CONSOLE txx927_console_init(); #endif @@ -2301,6 +2309,10 @@ /* console calls tty_register_driver() before kmalloc() works. * Thus, we can't devfs_register() then. Do so now, instead. */ +#ifdef CONFIG_VIOCONS + viocons_init2(); +#endif + #ifdef CONFIG_VT con_init_devfs(); #endif @@ -2389,5 +2401,8 @@ #endif #ifdef CONFIG_A2232 a2232board_init(); +#endif +#ifdef CONFIG_ICOM + iCom_init(); #endif } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/ide/sl82c105.c linuxppc64_2_4/drivers/ide/sl82c105.c --- ../kernel.org/linux-2.4.19/drivers/ide/sl82c105.c Fri Apr 19 10:30:27 2002 +++ linuxppc64_2_4/drivers/ide/sl82c105.c Mon Dec 17 15:04:57 2001 @@ -156,6 +156,29 @@ } /* + * Reset the controller. + * If we are using INTC under a w83c553 we need to use a magic test + * bit to do this. Return zero if successful (or applicable). + * + */ +static int sl82c105_hard_reset(ide_drive_t *drive) +{ + ide_hwif_t *hwif = HWIF(drive); + struct pci_dev *dev = hwif->pci_dev; + unsigned int reg; + + pci_read_config_dword(dev, 0x40, ®); /* LEGIRQ register */ + if (reg & (1<<11)) { /* Using INTC? */ + printk("sl82c105: resetting device\n"); + pci_read_config_dword(dev, 0x7e, ®); + pci_write_config_word(dev, 0x7e, reg | (1<<2)); + pci_write_config_word(dev, 0x7e, reg & (~(1<<2))); + return 0; + } + return 1; +} + +/* * Our own dmaproc, only to intercept ide_dma_check */ static int sl82c105_dmaproc(ide_dma_action_t func, ide_drive_t *drive) @@ -171,6 +194,11 @@ case ide_dma_off: config_for_pio(drive, 4, 0); break; + case ide_dma_lostirq: + case ide_dma_timeout: + if (sl82c105_hard_reset(drive) == 0) + return 0; + break; default: break; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/Makefile linuxppc64_2_4/drivers/iseries/Makefile --- ../kernel.org/linux-2.4.19/drivers/iseries/Makefile Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/Makefile Thu Oct 11 11:10:49 2001 @@ -0,0 +1,43 @@ +# +# Makefile for the iSeries-specific device drivers. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (ie not a .c file). +# +# Note 2! The CFLAGS definitions are now inherited from the +# parent makes.. +# + +# The target object and module list name. + +# O_TARGET := macintosh.o + +O_TARGET := iseries.o + +# Objects that export symbols. + +# export-objs := adb.o rtc.o mac_hid.o via-pmu.o + +export-objs := veth.o viocons.o viotape.o viodasd.o viocd.o viopath.o + +# Object file lists. + +obj-y := +obj-m := +obj-n := +obj- := + +# Each configuration option enables a list of files. + +obj-$(CONFIG_VETH) += veth.o +obj-$(CONFIG_VIOCONS) += viocons.o +obj-$(CONFIG_VIOPATH) += viopath.o +obj-$(CONFIG_VIOTAPE) += viotape.o +obj-$(CONFIG_VIODASD) += viodasd.o +obj-$(CONFIG_VIOCD) += viocd.o + +# The global Rules.make. + +include $(TOPDIR)/Rules.make + diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/veth.c linuxppc64_2_4/drivers/iseries/veth.c --- ../kernel.org/linux-2.4.19/drivers/iseries/veth.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/veth.c Mon May 6 10:47:06 2002 @@ -0,0 +1,1731 @@ +/* File veth.c created by Kyle A. Lucke on Mon Aug 7 2000. */ + +/**************************************************************************/ +/* */ +/* IBM eServer iSeries Virtual Ethernet Device Driver */ +/* Copyright (C) 2001 Kyle A. Lucke (klucke@us.ibm.com), IBM Corp. */ +/* */ +/* This program is free software; you can redistribute it and/or modify */ +/* it under the terms of the GNU General Public License as published by */ +/* the Free Software Foundation; either version 2 of the License, or */ +/* (at your option) any later version. */ +/* */ +/* This program is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */ +/* GNU General Public License for more details. */ +/* */ +/* You should have received a copy of the GNU General Public License */ +/* along with this program; if not, write to the Free Software */ +/* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 */ +/* USA */ +/* */ +/* This module contains the implementation of a virtual ethernet device */ +/* for use with iSeries LPAR Linux. It utilizes low-level message passing*/ +/* provided by the hypervisor to enable an ethernet-like network device */ +/* that can be used to enable inter-partition communications on the same */ +/* physical iSeries. */ +/* */ +/* The iSeries LPAR hypervisor has currently defined the ability for a */ +/* partition to communicate on up to 16 different virtual ethernets, all */ +/* dynamically configurable, at least for an OS/400 partition. The */ +/* dynamic nature is not supported for Linux yet. */ +/* */ +/* Each virtual ethernet a given Linux partition participates in will */ +/* cause a network device with the form ethXX to be created, */ +/* */ +/* The virtual ethernet a given ethXX virtual ethernet device talks on */ +/* can be determined either by dumping /proc/iSeries/veth/vethX, where */ +/* X is the virtual ethernet number, and the netdevice name will be */ +/* printed out. The virtual ethernet a given ethX device communicates on */ +/* is also printed to the printk() buffer at module load time. */ +/* */ +/* This driver (and others like it on other partitions) is responsible for*/ +/* routing packets to and from other partitions. The MAC addresses used */ +/* by the virtual ethernets contain meaning, and should not be modified. */ +/* Doing so could disable the ability of your Linux partition to */ +/* communicate with the other OS/400 partitions on your physical iSeries. */ +/* Similarly, setting the MAC address to something other than the */ +/* "virtual burned-in" address is not allowed, for the same reason. */ +/* */ +/* Notes: */ +/* */ +/* 1. Although there is the capability to talk on multiple shared */ +/* ethernets to communicate to the same partition, each shared */ +/* ethernet to a given partition X will use a finite, shared amount */ +/* of hypervisor messages to do the communication. So having 2 shared */ +/* ethernets to the same remote partition DOES NOT double the */ +/* available bandwidth. Each of the 2 shared ethernets will share the */ +/* same bandwidth available to another. */ +/* */ +/* 2. It is allowed to have a virtual ethernet that does not communicate */ +/* with any other partition. It won't do anything, but it's allowed. */ +/* */ +/* 3. There is no "loopback" mode for a virtual ethernet device. If you */ +/* send a packet to your own mac address, it will just be dropped, you */ +/* won't get it on the receive side. Such a thing could be done, */ +/* but my default driver DOES NOT do so. */ +/* */ +/* 4. Multicast addressing is implemented via broadcasting the multicast */ +/* frames to other partitions. It is the responsibility of the */ +/* receiving partition to filter the addresses desired. */ +/* */ +/* 5. This module utilizes several different bottom half handlers for */ +/* non-high-use path function (setup, error handling, etc.). Multiple */ +/* bottom halves were used because only one would not keep up to the */ +/* much faster iSeries device drivers this Linux driver is talking to. */ +/* All hi-priority work (receiving frames, handling frame acks) is done*/ +/* in the interrupt handler for maximum performance. */ +/* */ +/* Tunable parameters: */ +/* */ +/* VethBuffersToAllocate: This compile time option defaults to 120. It can*/ +/* be safely changed to something greater or less than the default. It */ +/* controls how much memory Linux will allocate per remote partition it is*/ +/* communicating with. The user can play with this to see how it affects */ +/* performance, packets dropped, etc. Without trying to understand the */ +/* complete driver, it can be thought of as the maximum number of packets */ +/* outstanding to a remote partition at a time. */ +/* */ +/**************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef SIOCETHTOOL +#include +#endif +#include +#include + +#ifndef _VETH_H +#include "veth.h" +#endif +#ifndef _HVLPCONFIG_H +#include +#endif +#ifndef _VETH_PROC_H +#include +#endif +#ifndef _HVTYPES_H +#include +#endif +#ifndef _ISERIES_PROC_H +#include +#endif +#include +#include + + +#define veth_printk(fmt, args...) \ +printk(KERN_INFO "%s: " fmt, __FILE__, ## args) + +#define veth_error_printk(fmt, args...) \ +printk(KERN_ERR "(%s:%3.3d) ERROR: " fmt, __FILE__, __LINE__ , ## args) + +static const char *version __initdata = "v1.0 03/11/2002 Kyle Lucke, klucke@us.ibm.com\n"; + +static int probed __initdata = 0; +#define VethBuffersToAllocate 120 + +static struct VethFabricMgr *mFabricMgr = NULL; +static struct proc_dir_entry *veth_proc_root = NULL; + +DECLARE_MUTEX_LOCKED(VethProcSemaphore); + +static int veth_open(struct net_device *dev); +static int veth_close(struct net_device *dev); +static int veth_start_xmit(struct sk_buff *skb, struct net_device *dev); +static int veth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); +static void veth_handleEvent(struct HvLpEvent *, struct pt_regs *); +static void veth_handleAck(struct HvLpEvent *); +static void veth_handleInt(struct HvLpEvent *); +static void veth_openConnections(void); +static void veth_openConnection(u8, int lockMe); +static void veth_closeConnection(u8, int lockMe); +static void veth_intFinishOpeningConnections(void *, int number); +static void veth_finishOpeningConnections(void *); +static void veth_finishOpeningConnectionsLocked(struct VethLpConnection *); +static int veth_multicast_wanted(struct VethPort *port, u64 dest); +static void veth_set_multicast_list(struct net_device *dev); + +static void veth_sendCap(struct VethLpConnection *); +static void veth_sendMonitor(struct VethLpConnection *); +static void veth_takeCap(struct VethLpConnection *, struct VethLpEvent *); +static void veth_takeCapAck(struct VethLpConnection *, struct VethLpEvent *); +static void veth_takeMonitorAck(struct VethLpConnection *, struct VethLpEvent *); +static void veth_msgsInit(struct VethLpConnection *connection); +static void veth_recycleMsg(struct VethLpConnection *, u16); +static void veth_capBh(struct VethLpConnection *); +static void veth_capAckBh(struct VethLpConnection *); +static void veth_monitorAckBh(struct VethLpConnection *); +static void veth_takeFrames(struct VethLpConnection *, struct VethLpEvent *); +static void veth_pTransmit(struct sk_buff *skb, HvLpIndex remoteLp, struct net_device *dev); +static struct net_device_stats *veth_get_stats(struct net_device *dev); +static void veth_intFinishMsgsInit(void *, int); +static void veth_finishMsgsInit(struct VethLpConnection *connection); +static void veth_intFinishCapBh(void *, int); +static void veth_finishCapBh(struct VethLpConnection *connection); +static void veth_finishCapBhLocked(struct VethLpConnection *connection); +static void veth_finishSendCap(struct VethLpConnection *connection); +static void veth_timedAck(unsigned long connectionPtr); +#ifdef MODULE +static void veth_waitForEnd(void); +#endif +static void veth_failMe(struct VethLpConnection *connection); + +extern struct pci_dev *iSeries_veth_dev; + +int __init veth_probe(void) +{ + struct net_device *dev = NULL; + struct VethPort *port = NULL; + int vlansFound = 0; + int displayVersion = 0; + + u16 vlanMap = HvLpConfig_getVirtualLanIndexMap(); + int vlanIndex = 0; + + if (probed) + return -ENODEV; + probed = 1; + + while (vlanMap != 0) { + int bitOn = vlanMap & 0x8000; + + if (bitOn) { + vlansFound++; + + dev = init_etherdev(NULL, sizeof(struct VethPort)); + + if (dev == NULL) { + veth_error_printk("Unable to allocate net_device structure!\n"); + break; + } + + if (!dev->priv) + dev->priv = kmalloc(sizeof(struct VethPort), GFP_KERNEL); + if (!dev->priv) { + veth_error_printk("Unable to allocate memory\n"); + return -ENOMEM; + } + + veth_printk("Found an ethernet device %s (veth=%d) (addr=%p)\n", dev->name, vlanIndex, dev); + port = mFabricMgr->mPorts[vlanIndex] = (struct VethPort *) dev->priv; + memset(port, 0, sizeof(struct VethPort)); + rwlock_init(&(port->mMcastGate)); + mFabricMgr->mPorts[vlanIndex]->mDev = dev; + + dev->dev_addr[0] = 0x02; + dev->dev_addr[1] = 0x01; + dev->dev_addr[2] = 0xFF; + dev->dev_addr[3] = vlanIndex; + dev->dev_addr[4] = 0xFF; + dev->dev_addr[5] = HvLpConfig_getLpIndex_outline(); + dev->mtu = 9000; + + memcpy(&(port->mMyAddress), dev->dev_addr, 6); + + dev->open = &veth_open; + dev->hard_start_xmit = &veth_start_xmit; + dev->stop = &veth_close; + dev->get_stats = veth_get_stats; + dev->set_multicast_list = &veth_set_multicast_list; + dev->do_ioctl = &veth_ioctl; + dev->features |= NETIF_F_SG; + + /* display version info if adapter is found */ + if (!displayVersion) { + /* set display flag to TRUE so that */ + /* we only display this string ONCE */ + displayVersion = 1; + veth_printk("%s", version); + } + + } + + ++vlanIndex; + vlanMap = vlanMap << 1; + } + + if (vlansFound > 0) + return 0; + else + return -ENODEV; +} + +#ifdef MODULE +MODULE_AUTHOR("Kyle Lucke "); +MODULE_DESCRIPTION("iSeries Virtual ethernet driver"); +MODULE_LICENSE("GPL"); + +DECLARE_MUTEX_LOCKED(VethModuleBhDone); +int VethModuleReopen = 1; + +void veth_proc_delete(struct proc_dir_entry *iSeries_proc) +{ + int i = 0; + HvLpIndex thisLp = HvLpConfig_getLpIndex_outline(); + u16 vlanMap = HvLpConfig_getVirtualLanIndexMap(); + int vlanIndex = 0; + + for (i = 0; i < HvMaxArchitectedLps; ++i) { + if (i != thisLp) { + if (HvLpConfig_doLpsCommunicateOnVirtualLan(thisLp, i)) { + char name[10] = ""; + sprintf(name, "lpar%d", i); + remove_proc_entry(name, veth_proc_root); + } + } + } + + while (vlanMap != 0) { + int bitOn = vlanMap & 0x8000; + + if (bitOn) { + char name[10] = ""; + sprintf(name, "veth%d", vlanIndex); + remove_proc_entry(name, veth_proc_root); + } + + ++vlanIndex; + vlanMap = vlanMap << 1; + } + + remove_proc_entry("veth", iSeries_proc); + + up(&VethProcSemaphore); +} + +void veth_waitForEnd(void) +{ + up(&VethModuleBhDone); +} + +void __exit veth_module_cleanup(void) +{ + int i; + struct VethFabricMgr *myFm = mFabricMgr; + struct tq_struct myBottomHalf; + struct net_device *thisOne = NULL; + + VethModuleReopen = 0; + + for (i = 0; i < HvMaxArchitectedLps; ++i) { + veth_closeConnection(i, 1); + } + + myBottomHalf.routine = (void *) (void *) veth_waitForEnd; + + queue_task(&myBottomHalf, &tq_immediate); + mark_bh(IMMEDIATE_BH); + + down(&VethModuleBhDone); + + HvLpEvent_unregisterHandler(HvLpEvent_Type_VirtualLan); + + mb(); + mFabricMgr = NULL; + mb(); + + down(&VethProcSemaphore); + + iSeries_proc_callback(&veth_proc_delete); + + down(&VethProcSemaphore); + + for (i = 0; i < HvMaxArchitectedLps; ++i) { + if (myFm->mConnection[i].mNumberAllocated + myFm->mConnection[i].mNumberRcvMsgs > 0) { + mf_deallocateLpEvents(myFm->mConnection[i].mRemoteLp, + HvLpEvent_Type_VirtualLan, + myFm->mConnection[i].mNumberAllocated + myFm->mConnection[i].mNumberRcvMsgs, + NULL, NULL); + } + + if (myFm->mConnection[i].mMsgs != NULL) { + kfree(myFm->mConnection[i].mMsgs); + } + } + + for (i = 0; i < HvMaxArchitectedVirtualLans; ++i) { + if (myFm->mPorts[i] != NULL) { + thisOne = myFm->mPorts[i]->mDev; + myFm->mPorts[i] = NULL; + + mb(); + + if (thisOne != NULL) { + veth_printk("Unregistering %s (veth=%d)\n", thisOne->name, i); + unregister_netdev(thisOne); + } + } + } + + kfree(myFm); +} + +module_exit(veth_module_cleanup); +#endif + + +void veth_proc_init(struct proc_dir_entry *iSeries_proc) +{ + long i = 0; + HvLpIndex thisLp = HvLpConfig_getLpIndex_outline(); + u16 vlanMap = HvLpConfig_getVirtualLanIndexMap(); + long vlanIndex = 0; + + + veth_proc_root = proc_mkdir("veth", iSeries_proc); + if (!veth_proc_root) + return; + + for (i = 0; i < HvMaxArchitectedLps; ++i) { + if (i != thisLp) { + if (HvLpConfig_doLpsCommunicateOnVirtualLan(thisLp, i)) { + struct proc_dir_entry *ent; + char name[10] = ""; + sprintf(name, "lpar%d", (int) i); + ent = create_proc_entry(name, S_IFREG | S_IRUSR, veth_proc_root); + if (!ent) + return; + ent->nlink = 1; + ent->data = (void *) i; + ent->read_proc = proc_veth_dump_connection; + ent->write_proc = NULL; + } + } + } + + while (vlanMap != 0) { + int bitOn = vlanMap & 0x8000; + + if (bitOn) { + struct proc_dir_entry *ent; + char name[10] = ""; + sprintf(name, "veth%d", (int) vlanIndex); + ent = create_proc_entry(name, S_IFREG | S_IRUSR, veth_proc_root); + if (!ent) + return; + ent->nlink = 1; + ent->data = (void *) vlanIndex; + ent->read_proc = proc_veth_dump_port; + ent->write_proc = NULL; + } + + ++vlanIndex; + vlanMap = vlanMap << 1; + } + + up(&VethProcSemaphore); +} + +int __init veth_module_init(void) +{ + int status; + int i; + + mFabricMgr = kmalloc(sizeof(struct VethFabricMgr), GFP_KERNEL); + memset(mFabricMgr, 0, sizeof(struct VethFabricMgr)); + veth_printk("Initializing veth module, fabric mgr (address=%p)\n", mFabricMgr); + + mFabricMgr->mEyecatcher = 0x56455448464D4752ULL; + mFabricMgr->mThisLp = HvLpConfig_getLpIndex_outline(); + + for (i = 0; i < HvMaxArchitectedLps; ++i) { + mFabricMgr->mConnection[i].mEyecatcher = 0x564554484C50434EULL; + veth_failMe(mFabricMgr->mConnection + i); + spin_lock_init(&mFabricMgr->mConnection[i].mAckGate); + spin_lock_init(&mFabricMgr->mConnection[i].mStatusGate); + } + + status = veth_probe(); + + if (status == 0) { + veth_openConnections(); + iSeries_proc_callback(&veth_proc_init); + } + + return status; +} + +module_init(veth_module_init); + +static void veth_failMe(struct VethLpConnection *connection) +{ + connection->mConnectionStatus.mSentCap = 0; + connection->mConnectionStatus.mCapAcked = 0; + connection->mConnectionStatus.mGotCap = 0; + connection->mConnectionStatus.mGotCapAcked = 0; + connection->mConnectionStatus.mSentMonitor = 0; + connection->mConnectionStatus.mFailed = 1; +} + +static int veth_open(struct net_device *dev) +{ + struct VethPort *port = (struct VethPort *) dev->priv; + + memset(&port->mStats, 0, sizeof(port->mStats)); + MOD_INC_USE_COUNT; + + netif_start_queue(dev); + + return 0; +} + +static int veth_close(struct net_device *dev) +{ + netif_stop_queue(dev); + + MOD_DEC_USE_COUNT; + + return 0; +} + +static struct net_device_stats *veth_get_stats(struct net_device *dev) +{ + struct VethPort *port = (struct VethPort *) dev->priv; + + return (&port->mStats); +} + + +static int veth_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + unsigned char *frame = skb->data; + HvLpIndex remoteLp = frame[5]; + int i = 0; + int clone = 0; + + if (mFabricMgr == NULL) { + veth_error_printk("NULL fabric manager with active ports!\n"); + netif_stop_queue(dev); + return 1; + } + + mb(); + + if ((*frame & 0x01) != 0x01) { /* broadcast or multicast */ + if ((remoteLp != mFabricMgr->mThisLp) && (HvLpConfig_doLpsCommunicateOnVirtualLan(mFabricMgr->mThisLp, remoteLp))) + veth_pTransmit(skb, remoteLp, dev); + } else { + for (i = 0; i < HvMaxArchitectedLps; ++i) { + if (i != mFabricMgr->mThisLp) { + if (HvLpConfig_doLpsCommunicateOnVirtualLan(mFabricMgr->mThisLp, i)) { + if (clone) + skb = skb_clone(skb, GFP_ATOMIC); + else + clone = 1; + + /* the ack handles deleting the skb */ + veth_pTransmit(skb, i, dev); + } + } + } + } + + return 0; +} + +static void veth_pTransmit(struct sk_buff *skb, HvLpIndex remoteLp, struct net_device *dev) +{ + struct VethLpConnection *connection = mFabricMgr->mConnection + remoteLp; + HvLpEvent_Rc returnCode; + struct scatterlist sg[VethMaxFramesPerMsg]; + int nfrags = 0; + int nsg; + + if (connection->mConnectionStatus.mFailed != 1) { + int rc = 0; + struct VethMsg *msg = NULL; + VETHSTACKPOP(&(connection->mMsgStack), msg); + + /* We can't handle a fragmented frame if it has + more than VethMaxFramesPerMsg fragments. + Attempt to coalesce the fragments if possible, + otherwise drop the frame */ + + if ((skb_shinfo(skb)->nr_frags + 1) > VethMaxFramesPerMsg) { + veth_printk("Linearizing frame to handle > 6 frags\n"); + rc = skb_linearize(skb, GFP_ATOMIC); + } + + if (msg != NULL && rc == 0 && ((skb->len - 14) <= 9000)) { + /* Use a scatterlist for both the fraged un-fraged + case. pci_map_sg has a fastpast for the single + case so we can simplify this code and not take + too big of a perf hit. + */ + + if (skb_shinfo(skb)->nr_frags) { /* fragmented frame */ + int i = 0; + + sg[nfrags].address = skb->data; + sg[nfrags].length = skb->len - skb->data_len; + ++nfrags; + + do { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + sg[nfrags].address = page_address(frag->page) + frag->page_offset; + sg[nfrags].length = frag->size; + ++nfrags; + ++i; + } while (i < skb_shinfo(skb)->nr_frags); + } else { /* unfragmented frame */ + sg[nfrags].address = skb->data; + sg[nfrags].length = skb->len; + ++nfrags; + } + /* + Note: nsg may be less than nfrags. Each frag entry + in the skb can only be a page in size, so they can + be contigous in memory but be spread over multiple + frag entries in the skb. pci_map_sg will coalesce + contiguous fragments and into a single dma address. + */ + + nsg = pci_map_sg(iSeries_veth_dev, sg, nfrags, PCI_DMA_TODEVICE); + + /* Is it really necessary to check the length and address fields of the + first entry here? */ + if (nsg) { + int i = 0; + msg->mSkb = skb; + do { + msg->mEvent.mSendData.mAddress[i] = sg[i].dma_address; + msg->mEvent.mSendData.mLength[i] = sg[i].dma_length; + ++i; + } while (i < nsg); + + msg->mEvent.mSendData.mEofMask = (1 << (nsg - 1)); + test_and_set_bit(0, &(msg->mInUse)); + + returnCode = HvCallEvent_signalLpEventFast(remoteLp, + HvLpEvent_Type_VirtualLan, + VethEventTypeFrames, + HvLpEvent_AckInd_NoAck, + HvLpEvent_AckType_ImmediateAck, + connection->mSourceInst, + connection->mTargetInst, + msg->mIndex, + msg->mEvent.mFpData.mData1, + msg->mEvent.mFpData.mData2, + msg->mEvent.mFpData.mData3, + msg->mEvent.mFpData.mData4, + msg->mEvent.mFpData.mData5); + } else { + returnCode = -1; /* Bad return code */ + } + + if (returnCode != HvLpEvent_Rc_Good) { + struct VethPort *port = (struct VethPort *) dev->priv; + if (nsg) + pci_unmap_sg(iSeries_veth_dev, sg, nsg, PCI_DMA_TODEVICE); + + dev_kfree_skb_any(skb); + + msg->mSkb = NULL; + memset(&(msg->mEvent.mSendData), 0, sizeof(struct VethFramesData)); + VETHSTACKPUSH(&(connection->mMsgStack), msg); + port->mStats.tx_dropped++; + } else { + struct VethPort *port = (struct VethPort *) dev->priv; + port->mStats.tx_packets++; + port->mStats.tx_bytes += skb->len; + } + } else { + struct VethPort *port = (struct VethPort *) dev->priv; + port->mStats.tx_dropped++; + if (rc) + port->mLinearized++; + dev_kfree_skb_any(skb); + } + } else { + struct VethPort *port = (struct VethPort *) dev->priv; + port->mStats.tx_dropped++; + dev_kfree_skb_any(skb); + } +} + +static int veth_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ +#ifdef SIOCETHTOOL + struct ethtool_cmd ecmd; + u32 link, speed; + + if (cmd != SIOCETHTOOL) + return -EOPNOTSUPP; + if (copy_from_user(&ecmd, ifr->ifr_data, sizeof(ecmd))) + return -EFAULT; + switch (ecmd.cmd) { + case ETHTOOL_GSET: + ecmd.supported = + (SUPPORTED_1000baseT_Full | + SUPPORTED_Autoneg | SUPPORTED_FIBRE); + ecmd.advertising = + (SUPPORTED_1000baseT_Full | + SUPPORTED_Autoneg | SUPPORTED_FIBRE); + + ecmd.port = PORT_FIBRE; + ecmd.transceiver = XCVR_INTERNAL; + ecmd.phy_address = 0; + ecmd.speed = SPEED_1000; + ecmd.duplex = DUPLEX_FULL; + ecmd.autoneg = AUTONEG_ENABLE; + ecmd.maxtxpkt = 120; + ecmd.maxrxpkt = 120; + if(copy_to_user(ifr->ifr_data, &ecmd, sizeof(ecmd))) + return -EFAULT; + return 0; + + case ETHTOOL_GDRVINFO: { + struct ethtool_drvinfo info = {ETHTOOL_GDRVINFO}; + strncpy(info.driver, "veth", sizeof(info.driver) - 1); + info.driver[sizeof(info.driver) - 1] = '\0'; + strncpy(info.version, "1.0", sizeof(info.version) - 1); + if (copy_to_user(ifr->ifr_data, &info, sizeof(info))) + return -EFAULT; + return 0; + } + /* get link status */ + case ETHTOOL_GLINK: { + struct ethtool_value edata = {ETHTOOL_GLINK}; + edata.data = 1; + if (copy_to_user(ifr->ifr_data, &edata, sizeof(edata))) + return -EFAULT; + return 0; + } + + default: + break; + } + +#endif + return -EOPNOTSUPP; +} + +static void veth_set_multicast_list(struct net_device *dev) +{ + char *addrs; + struct VethPort *port = (struct VethPort *) dev->priv; + u64 newAddress = 0; + unsigned long flags; + + write_lock_irqsave(&port->mMcastGate, flags); + + if (dev->flags & IFF_PROMISC) { /* set promiscuous mode */ + port->mPromiscuous = 1; + } else { + struct dev_mc_list *dmi = dev->mc_list; + + if (dev->flags & IFF_ALLMULTI) { + port->mAllMcast = 1; + } else { + int i; + /* Update table */ + port->mNumAddrs = 0; + + for (i = 0; ((i < dev->mc_count) && (i < 12)); i++) { /* for each address in the list */ + addrs = dmi->dmi_addr; + dmi = dmi->next; + if ((*addrs & 0x01) == 1) { /* multicast address? */ + memcpy(&newAddress, addrs, 6); + newAddress &= 0xFFFFFFFFFFFF0000; + + port->mMcasts[port->mNumAddrs] = newAddress; + mb(); + port->mNumAddrs = port->mNumAddrs + 1; + } + } + } + } + + write_unlock_irqrestore(&port->mMcastGate, flags); +} + + +static void veth_handleEvent(struct HvLpEvent *event, struct pt_regs *regs) +{ + if (event->xFlags.xFunction == HvLpEvent_Function_Ack) { + veth_handleAck(event); + } else if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + veth_handleInt(event); + } +} + +static void veth_handleAck(struct HvLpEvent *event) +{ + struct VethLpConnection *connection = &(mFabricMgr->mConnection[event->xTargetLp]); + struct VethLpEvent *vethEvent = (struct VethLpEvent *) event; + + switch (event->xSubtype) { + case VethEventTypeCap: + { + veth_takeCapAck(connection, vethEvent); + break; + } + case VethEventTypeMonitor: + { + veth_takeMonitorAck(connection, vethEvent); + break; + } + default: + { + veth_error_printk("Unknown ack type %d from lpar %d\n", event->xSubtype, connection->mRemoteLp); + } + }; +} + +static void veth_handleInt(struct HvLpEvent *event) +{ + int i = 0; + struct VethLpConnection *connection = &(mFabricMgr->mConnection[event->xSourceLp]); + struct VethLpEvent *vethEvent = (struct VethLpEvent *) event; + + switch (event->xSubtype) { + case VethEventTypeCap: + { + veth_takeCap(connection, vethEvent); + break; + } + case VethEventTypeMonitor: + { + /* do nothing... this'll hang out here til we're dead, and the hypervisor will return it for us. */ + break; + } + case VethEventTypeFramesAck: + { + for (i = 0; i < VethMaxFramesMsgsAcked; ++i) { + u16 msg = vethEvent->mDerivedData.mFramesAckData.mToken[i]; + veth_recycleMsg(connection, msg); + } + break; + } + case VethEventTypeFrames: + { + veth_takeFrames(connection, vethEvent); + break; + } + default: + { + veth_error_printk("Unknown interrupt type %d from lpar %d\n", event->xSubtype, connection->mRemoteLp); + } + }; +} + +static void veth_openConnections() +{ + int i = 0; + + HvLpEvent_registerHandler(HvLpEvent_Type_VirtualLan, &veth_handleEvent); + + /* Now I need to run through the active lps and open connections to the ones I'm supposed to + open to. */ + + for (i = HvMaxArchitectedLps - 1; i >= 0; --i) { + if (i != mFabricMgr->mThisLp) { + if (HvLpConfig_doLpsCommunicateOnVirtualLan(mFabricMgr->mThisLp, i)) { + veth_openConnection(i, 1); + } else { + veth_closeConnection(i, 1); + } + } + } +} + +static void veth_intFinishOpeningConnections(void *parm, int number) +{ + struct VethLpConnection *connection = (struct VethLpConnection *) parm; + connection->mAllocBhTq.data = parm; + connection->mNumberAllocated = number; + queue_task(&connection->mAllocBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); +} + +static void veth_finishOpeningConnections(void *parm) +{ + unsigned long flags; + struct VethLpConnection *connection = (struct VethLpConnection *) parm; + spin_lock_irqsave(&connection->mStatusGate, flags); + veth_finishOpeningConnectionsLocked(connection); + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_finishOpeningConnectionsLocked(struct VethLpConnection *connection) +{ + if (connection->mNumberAllocated >= 2) { + connection->mConnectionStatus.mCapMonAlloced = 1; + veth_sendCap(connection); + } else { + veth_error_printk("Couldn't allocate base msgs for lpar %d, only got %d\n", connection->mRemoteLp, + connection->mNumberAllocated); + veth_failMe(connection); + } +} + +static void veth_openConnection(u8 remoteLp, int lockMe) +{ + unsigned long flags; + unsigned long flags2; + HvLpInstanceId source; + HvLpInstanceId target; + u64 i = 0; + struct VethLpConnection *connection = &(mFabricMgr->mConnection[remoteLp]); + + memset(&connection->mCapBhTq, 0, sizeof(connection->mCapBhTq)); + connection->mCapBhTq.routine = (void *) (void *) veth_capBh; + + memset(&connection->mCapAckBhTq, 0, sizeof(connection->mCapAckBhTq)); + connection->mCapAckBhTq.routine = (void *) (void *) veth_capAckBh; + + memset(&connection->mMonitorAckBhTq, 0, sizeof(connection->mMonitorAckBhTq)); + connection->mMonitorAckBhTq.routine = (void *) (void *) veth_monitorAckBh; + + memset(&connection->mAllocBhTq, 0, sizeof(connection->mAllocBhTq)); + connection->mAllocBhTq.routine = (void *) (void *) veth_finishOpeningConnections; + + if (lockMe) + spin_lock_irqsave(&connection->mStatusGate, flags); + + connection->mRemoteLp = remoteLp; + + spin_lock_irqsave(&connection->mAckGate, flags2); + + memset(&connection->mEventData, 0xFF, sizeof(connection->mEventData)); + connection->mNumAcks = 0; + + HvCallEvent_openLpEventPath(remoteLp, HvLpEvent_Type_VirtualLan); + + /* clean up non-acked msgs */ + for (i = 0; i < connection->mNumMsgs; ++i) { + veth_recycleMsg(connection, i); + } + + connection->mConnectionStatus.mOpen = 1; + + source = connection->mSourceInst = HvCallEvent_getSourceLpInstanceId(remoteLp, HvLpEvent_Type_VirtualLan); + target = connection->mTargetInst = HvCallEvent_getTargetLpInstanceId(remoteLp, HvLpEvent_Type_VirtualLan); + + if (connection->mConnectionStatus.mCapMonAlloced != 1) { + connection->mAllocBhTq.routine = (void *) (void *) veth_finishOpeningConnections; + mf_allocateLpEvents(remoteLp, + HvLpEvent_Type_VirtualLan, + sizeof(struct VethLpEvent), 2, &veth_intFinishOpeningConnections, connection); + } else { + veth_finishOpeningConnectionsLocked(connection); + } + + spin_unlock_irqrestore(&connection->mAckGate, flags2); + + if (lockMe) + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_closeConnection(u8 remoteLp, int lockMe) +{ + struct VethLpConnection *connection = &(mFabricMgr->mConnection[remoteLp]); + unsigned long flags; + unsigned long flags2; + if (lockMe) + spin_lock_irqsave(&connection->mStatusGate, flags); + + del_timer(&connection->mAckTimer); + + if (connection->mConnectionStatus.mOpen == 1) { + HvCallEvent_closeLpEventPath(remoteLp, HvLpEvent_Type_VirtualLan); + connection->mConnectionStatus.mOpen = 0; + veth_failMe(connection); + + /* reset ack data */ + spin_lock_irqsave(&connection->mAckGate, flags2); + + memset(&connection->mEventData, 0xFF, sizeof(connection->mEventData)); + connection->mNumAcks = 0; + + spin_unlock_irqrestore(&connection->mAckGate, flags2); + } + + if (lockMe) + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_msgsInit(struct VethLpConnection *connection) +{ + connection->mAllocBhTq.routine = (void *) (void *) veth_finishMsgsInit; + mf_allocateLpEvents(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + sizeof(struct VethLpEvent), + connection->mMyCap.mUnionData.mFields.mNumberBuffers, &veth_intFinishMsgsInit, connection); +} + +static void veth_intFinishMsgsInit(void *parm, int number) +{ + struct VethLpConnection *connection = (struct VethLpConnection *) parm; + connection->mAllocBhTq.data = parm; + connection->mNumberRcvMsgs = number; + queue_task(&connection->mAllocBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); +} + +static void veth_intFinishCapBh(void *parm, int number) +{ + struct VethLpConnection *connection = (struct VethLpConnection *) parm; + connection->mAllocBhTq.data = parm; + if (number > 0) + connection->mNumberLpAcksAlloced += number; + + queue_task(&connection->mAllocBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); +} + +static void veth_finishMsgsInit(struct VethLpConnection *connection) +{ + int i = 0; + unsigned int numberGotten = 0; + u64 amountOfHeapToGet = connection->mMyCap.mUnionData.mFields.mNumberBuffers * sizeof(struct VethMsg); + char *msgs = NULL; + unsigned long flags; + spin_lock_irqsave(&connection->mStatusGate, flags); + + if (connection->mNumberRcvMsgs >= connection->mMyCap.mUnionData.mFields.mNumberBuffers) { + msgs = kmalloc(amountOfHeapToGet, GFP_ATOMIC); + + connection->mMsgs = (struct VethMsg *) msgs; + + if (msgs != NULL) { + memset(msgs, 0, amountOfHeapToGet); + + for (i = 0; i < connection->mMyCap.mUnionData.mFields.mNumberBuffers; ++i) { + connection->mMsgs[i].mIndex = i; + ++numberGotten; + VETHSTACKPUSH(&(connection->mMsgStack), (connection->mMsgs + i)); + } + if (numberGotten > 0) { + connection->mNumMsgs = numberGotten; + } + } else { + kfree(msgs); + connection->mMsgs = NULL; + } + } + + connection->mMyCap.mUnionData.mFields.mNumberBuffers = connection->mNumMsgs; + + if (connection->mNumMsgs < 10) + connection->mMyCap.mUnionData.mFields.mThreshold = 1; + else if (connection->mNumMsgs < 20) + connection->mMyCap.mUnionData.mFields.mThreshold = 4; + else if (connection->mNumMsgs < 40) + connection->mMyCap.mUnionData.mFields.mThreshold = 10; + else + connection->mMyCap.mUnionData.mFields.mThreshold = 20; + + connection->mMyCap.mUnionData.mFields.mTimer = VethAckTimeoutUsec; + + veth_finishSendCap(connection); + + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_sendCap(struct VethLpConnection *connection) +{ + if (connection->mMsgs == NULL) { + connection->mMyCap.mUnionData.mFields.mNumberBuffers = VethBuffersToAllocate; + veth_msgsInit(connection); + } else { + veth_finishSendCap(connection); + } +} + +static void veth_finishSendCap(struct VethLpConnection *connection) +{ + HvLpEvent_Rc returnCode = HvCallEvent_signalLpEventFast(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + VethEventTypeCap, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + connection->mSourceInst, + connection->mTargetInst, + 0, + connection->mMyCap.mUnionData.mNoFields.mReserved1, + connection->mMyCap.mUnionData.mNoFields.mReserved2, + connection->mMyCap.mUnionData.mNoFields.mReserved3, + connection->mMyCap.mUnionData.mNoFields.mReserved4, + connection->mMyCap.mUnionData.mNoFields.mReserved5); + + if ((returnCode == HvLpEvent_Rc_PartitionDead) || (returnCode == HvLpEvent_Rc_PathClosed)) { + connection->mConnectionStatus.mSentCap = 0; + } else if (returnCode != HvLpEvent_Rc_Good) { + veth_error_printk("Couldn't send cap to lpar %d, rc %x\n", connection->mRemoteLp, (int) returnCode); + veth_failMe(connection); + } else { + connection->mConnectionStatus.mSentCap = 1; + } +} + +static void veth_takeCap(struct VethLpConnection *connection, struct VethLpEvent *event) +{ + if (!test_and_set_bit(0, &(connection->mCapBhPending))) { + connection->mCapBhTq.data = connection; + memcpy(&connection->mCapEvent, event, sizeof(connection->mCapEvent)); + queue_task(&connection->mCapBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); + } else { + veth_error_printk("Received a capabilities from lpar %d while already processing one\n", connection->mRemoteLp); + event->mBaseEvent.xRc = HvLpEvent_Rc_BufferNotAvailable; + HvCallEvent_ackLpEvent((struct HvLpEvent *) event); + } +} + +static void veth_takeCapAck(struct VethLpConnection *connection, struct VethLpEvent *event) +{ + if (!test_and_set_bit(0, &(connection->mCapAckBhPending))) { + connection->mCapAckBhTq.data = connection; + memcpy(&connection->mCapAckEvent, event, sizeof(connection->mCapAckEvent)); + queue_task(&connection->mCapAckBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); + } else { + veth_error_printk("Received a capabilities ack from lpar %d while already processing one\n", + connection->mRemoteLp); + } +} + +static void veth_takeMonitorAck(struct VethLpConnection *connection, struct VethLpEvent *event) +{ + if (!test_and_set_bit(0, &(connection->mMonitorAckBhPending))) { + connection->mMonitorAckBhTq.data = connection; + memcpy(&connection->mMonitorAckEvent, event, sizeof(connection->mMonitorAckEvent)); + queue_task(&connection->mMonitorAckBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); + } else { + veth_error_printk("Received a monitor ack from lpar %d while already processing one\n", connection->mRemoteLp); + } +} + +static void veth_recycleMsg(struct VethLpConnection *connection, u16 msg) +{ + struct scatterlist sg[VethMaxFramesPerMsg]; + if (msg < connection->mNumMsgs) { + struct VethMsg *myMsg = connection->mMsgs + msg; + if (test_and_clear_bit(0, &(myMsg->mInUse))) { + int i; + int nsg = 0; + for (i = 0; i < VethMaxFramesPerMsg; i++) { + if (myMsg->mEvent.mSendData.mAddress[i] != 0) { + sg[nsg].dma_address = myMsg->mEvent.mSendData.mAddress[i]; + sg[nsg].dma_length = myMsg->mEvent.mSendData.mLength[i]; + nsg++; + } + } + pci_unmap_sg(iSeries_veth_dev, sg, nsg, PCI_DMA_TODEVICE); + + dev_kfree_skb_any(myMsg->mSkb); + + myMsg->mSkb = NULL; + memset(&(myMsg->mEvent.mSendData), 0, sizeof(struct VethFramesData)); + VETHSTACKPUSH(&connection->mMsgStack, myMsg); + } else { + if (connection->mConnectionStatus.mOpen) { + veth_error_printk("Received a frames ack for msg %d from lpar %d while not outstanding\n", msg, + connection->mRemoteLp); + } + } + } +} + +static void veth_capBh(struct VethLpConnection *connection) +{ + struct VethLpEvent *event = &connection->mCapEvent; + unsigned long flags; + struct VethCapData *remoteCap = &(connection->mRemoteCap); + u64 numAcks = 0; + spin_lock_irqsave(&connection->mStatusGate, flags); + connection->mConnectionStatus.mGotCap = 1; + + memcpy(remoteCap, &(event->mDerivedData.mCapabilitiesData), sizeof(connection->mRemoteCap)); + + if ((remoteCap->mUnionData.mFields.mNumberBuffers <= VethMaxFramesMsgs) && + (remoteCap->mUnionData.mFields.mNumberBuffers != 0) && + (remoteCap->mUnionData.mFields.mThreshold <= VethMaxFramesMsgsAcked) && + (remoteCap->mUnionData.mFields.mThreshold != 0)) { + numAcks = (remoteCap->mUnionData.mFields.mNumberBuffers / remoteCap->mUnionData.mFields.mThreshold) + 1; + + if (connection->mNumberLpAcksAlloced < numAcks) { + numAcks = numAcks - connection->mNumberLpAcksAlloced; + connection->mAllocBhTq.routine = (void *) (void *) veth_finishCapBh; + mf_allocateLpEvents(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + sizeof(struct VethLpEvent), numAcks, &veth_intFinishCapBh, connection); + } else + veth_finishCapBhLocked(connection); + } else { + veth_error_printk("Received incompatible capabilities from lpar %d\n", connection->mRemoteLp); + event->mBaseEvent.xRc = HvLpEvent_Rc_InvalidSubtypeData; + HvCallEvent_ackLpEvent((struct HvLpEvent *) event); + } + + clear_bit(0, &(connection->mCapBhPending)); + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_capAckBh(struct VethLpConnection *connection) +{ + struct VethLpEvent *event = &connection->mCapAckEvent; + unsigned long flags; + + spin_lock_irqsave(&connection->mStatusGate, flags); + + if (event->mBaseEvent.xRc == HvLpEvent_Rc_Good) { + connection->mConnectionStatus.mCapAcked = 1; + + if ((connection->mConnectionStatus.mGotCap == 1) && (connection->mConnectionStatus.mGotCapAcked == 1)) { + if (connection->mConnectionStatus.mSentMonitor != 1) + veth_sendMonitor(connection); + } + } else { + veth_error_printk("Bad rc(%d) from lpar %d on capabilities\n", event->mBaseEvent.xRc, connection->mRemoteLp); + veth_failMe(connection); + } + + clear_bit(0, &(connection->mCapAckBhPending)); + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_monitorAckBh(struct VethLpConnection *connection) +{ + unsigned long flags; + + spin_lock_irqsave(&connection->mStatusGate, flags); + + veth_failMe(connection); + + veth_printk("Monitor ack returned for lpar %d\n", connection->mRemoteLp); + + if (connection->mConnectionStatus.mOpen) { + veth_closeConnection(connection->mRemoteLp, 0); + + udelay(100); + + queue_task(&connection->mMonitorAckBhTq, &tq_immediate); + mark_bh(IMMEDIATE_BH); + } else { +#ifdef MODULE + if (VethModuleReopen) +#endif + veth_openConnection(connection->mRemoteLp, 0); +#ifdef MODULE + else { + int i = 0; + + for (i = 0; i < connection->mNumMsgs; ++i) { + veth_recycleMsg(connection, i); + } + } +#endif + clear_bit(0, &(connection->mMonitorAckBhPending)); + } + + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +#define number_of_pages(v, l) ((((unsigned long)(v) & ((1 << 12) - 1)) + (l) + 4096 - 1) / 4096) +#define page_offset(v) ((unsigned long)(v) & ((1 << 12) - 1)) + +static void veth_takeFrames(struct VethLpConnection *connection, struct VethLpEvent *event) +{ + int i = 0; + struct VethPort *port = NULL; + struct BufList { + union { + struct { + u32 token2; + u32 garbage; + } token1; + u64 address; + } addr; + u64 size; + }; + + struct BufList myBufList[4]; /* max pages per frame */ + struct BufList remoteList[VethMaxFramesPerMsg]; /* max frags per frame */ + + do { + int nfrags = 0; + u16 length = 0; + + /* a 0 address marks the end of the valid entries */ + if (event->mDerivedData.mSendData.mAddress[i] == 0) + break; + + /* make sure that we have at least 1 EOF entry in the remaining entries */ + if (!(event->mDerivedData.mSendData.mEofMask >> i)) { + veth_printk("bad lp event: missing EOF frag in event mEofMask 0x%x i %d\n", + event->mDerivedData.mSendData.mEofMask, i); + break; + } + + /* add up length of non-EOF frags */ + do { + remoteList[nfrags].addr.token1.token2 = event->mDerivedData.mSendData.mAddress[i + nfrags]; + remoteList[nfrags].addr.token1.garbage = 0; + length += remoteList[nfrags].size = event->mDerivedData.mSendData.mLength[i + nfrags]; + } + while (!(event->mDerivedData.mSendData.mEofMask & (1 << (i + nfrags++)))); + + + /* length == total length of all framgents */ + /* nfrags == # of fragments in this frame */ + + if ((length - 14) <= 9000) { /* save as 13 < length <= 9014 */ + struct sk_buff *skb = alloc_skb(length, GFP_ATOMIC); + if (skb != NULL) { + HvLpDma_Rc returnCode = HvLpDma_Rc_Good; + + /* build the buffer list for the dma operation */ + int numPages = number_of_pages((skb->data), length); /* number of pages in this fragment of the complete buffer */ + myBufList[0].addr.address = + (0x8000000000000000LL | (virt_to_absolute((unsigned long) skb->data))); + myBufList[0].size = (numPages > 1) ? (4096 - page_offset(skb->data)) : length; + if (numPages > 1) { + myBufList[1].addr.address = + (0x8000000000000000LL | + (virt_to_absolute((unsigned long) skb->data + myBufList[0].size))); + myBufList[1].size = (numPages > 2) ? 4096 : length - myBufList[0].size; + if (numPages > 2) { + myBufList[2].addr.address = + (0x8000000000000000LL | + (virt_to_absolute + ((unsigned long) skb->data + myBufList[0].size + myBufList[1].size))); + myBufList[2].size = + (numPages > 3) ? 4096 : length - myBufList[0].size - myBufList[1].size; + if (numPages > 3) { + myBufList[3].addr.address = + 0x8000000000000000LL | + (virt_to_absolute + ((unsigned long) skb->data + myBufList[0].size + myBufList[1].size + + myBufList[2].size)); + myBufList[3].size = + length - myBufList[0].size - myBufList[1].size - myBufList[2].size; + } + } + } + returnCode = HvCallEvent_dmaBufList(HvLpEvent_Type_VirtualLan, + event->mBaseEvent.xSourceLp, + HvLpDma_Direction_RemoteToLocal, + connection->mSourceInst, + connection->mTargetInst, + HvLpDma_AddressType_RealAddress, + HvLpDma_AddressType_TceIndex, + 0x8000000000000000LL | + (virt_to_absolute((unsigned long) &myBufList)), + 0x8000000000000000LL | + (virt_to_absolute((unsigned long) &remoteList)), length); + + if (returnCode == HvLpDma_Rc_Good) { + HvLpVirtualLanIndex vlan = skb->data[9]; + u64 dest = *((u64 *) skb->data) & 0xFFFFFFFFFFFF0000; + + if (((vlan < HvMaxArchitectedVirtualLans) && ((port = mFabricMgr->mPorts[vlan]) != NULL)) && ((dest == port->mMyAddress) || /* it's for me */ + (dest == 0xFFFFFFFFFFFF0000) || /* it's a broadcast */ + (veth_multicast_wanted(port, dest)) || /* it's one of my multicasts */ + (port->mPromiscuous == 1))) { /* I'm promiscuous */ + skb_put(skb, length); + skb->dev = port->mDev; + skb->protocol = eth_type_trans(skb, port->mDev); + skb->ip_summed = CHECKSUM_NONE; + netif_rx(skb); /* send it up */ + port->mStats.rx_packets++; + port->mStats.rx_bytes += length; + + } else { + dev_kfree_skb_irq(skb); + } + } else { + dev_kfree_skb_irq(skb); + } + } + } else { + break; + } + i += nfrags; + } while (i < VethMaxFramesPerMsg); + + /* Ack it */ + + { + unsigned long flags; + spin_lock_irqsave(&connection->mAckGate, flags); + + if (connection->mNumAcks < VethMaxFramesMsgsAcked) { + connection->mEventData.mAckData.mToken[connection->mNumAcks] = event->mBaseEvent.xCorrelationToken; + ++connection->mNumAcks; + + if (connection->mNumAcks == connection->mRemoteCap.mUnionData.mFields.mThreshold) { + HvLpEvent_Rc rc = HvCallEvent_signalLpEventFast(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + VethEventTypeFramesAck, + HvLpEvent_AckInd_NoAck, + HvLpEvent_AckType_ImmediateAck, + connection->mSourceInst, + connection->mTargetInst, + 0, + connection->mEventData.mFpData.mData1, + connection->mEventData.mFpData.mData2, + connection->mEventData.mFpData.mData3, + connection->mEventData.mFpData.mData4, + connection->mEventData.mFpData.mData5); + + if (rc != HvLpEvent_Rc_Good) { + veth_error_printk("Bad lp event return code(%x) acking frames from lpar %d\n", (int) rc, + connection->mRemoteLp); + } + + connection->mNumAcks = 0; + + memset(&connection->mEventData, 0xFF, sizeof(connection->mEventData)); + } + + } + + spin_unlock_irqrestore(&connection->mAckGate, flags); + } +} + +#undef number_of_pages +#undef page_offset + +static void veth_timedAck(unsigned long connectionPtr) +{ + unsigned long flags; + HvLpEvent_Rc rc; + struct VethLpConnection *connection = (struct VethLpConnection *) connectionPtr; + /* Ack all the events */ + spin_lock_irqsave(&connection->mAckGate, flags); + + if (connection->mNumAcks > 0) { + rc = HvCallEvent_signalLpEventFast(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + VethEventTypeFramesAck, + HvLpEvent_AckInd_NoAck, + HvLpEvent_AckType_ImmediateAck, + connection->mSourceInst, + connection->mTargetInst, + 0, + connection->mEventData.mFpData.mData1, + connection->mEventData.mFpData.mData2, + connection->mEventData.mFpData.mData3, + connection->mEventData.mFpData.mData4, connection->mEventData.mFpData.mData5); + + if (rc != HvLpEvent_Rc_Good) { + veth_error_printk("Bad lp event return code(%x) acking frames from lpar %d!\n", (int) rc, + connection->mRemoteLp); + } + + connection->mNumAcks = 0; + + memset(&connection->mEventData, 0xFF, sizeof(connection->mEventData)); + } + + spin_unlock_irqrestore(&connection->mAckGate, flags); + + /* Reschedule the timer */ + connection->mAckTimer.expires = jiffies + connection->mTimeout; + add_timer(&connection->mAckTimer); +} + +static int veth_multicast_wanted(struct VethPort *port, u64 thatAddr) +{ + int returnParm = 0; + int i; + unsigned long flags; + + if ((*((char *) &thatAddr) & 0x01) != 1) + return 0; + + read_lock_irqsave(&port->mMcastGate, flags); + if (port->mAllMcast) { + read_unlock_irqrestore(&port->mMcastGate, flags); + return 1; + } + + for (i = 0; i < port->mNumAddrs; ++i) { + u64 thisAddr = port->mMcasts[i]; + + if (thisAddr == thatAddr) { + returnParm = 1; + break; + } + } + read_unlock_irqrestore(&port->mMcastGate, flags); + + return returnParm; +} + +static void veth_sendMonitor(struct VethLpConnection *connection) +{ + HvLpEvent_Rc returnCode = HvCallEvent_signalLpEventFast(connection->mRemoteLp, + HvLpEvent_Type_VirtualLan, + VethEventTypeMonitor, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_DeferredAck, + connection->mSourceInst, + connection->mTargetInst, + 0, 0, 0, 0, 0, 0); + + if (returnCode == HvLpEvent_Rc_Good) { + connection->mConnectionStatus.mSentMonitor = 1; + connection->mConnectionStatus.mFailed = 0; + + /* Start the ACK timer */ + init_timer(&connection->mAckTimer); + connection->mAckTimer.function = veth_timedAck; + connection->mAckTimer.data = (unsigned long) connection; + connection->mAckTimer.expires = jiffies + connection->mTimeout; + add_timer(&connection->mAckTimer); + + } else { + veth_error_printk("Monitor send to lpar %d failed with rc %x\n", connection->mRemoteLp, (int) returnCode); + veth_failMe(connection); + } +} + +static void veth_finishCapBh(struct VethLpConnection *connection) +{ + unsigned long flags; + spin_lock_irqsave(&connection->mStatusGate, flags); + veth_finishCapBhLocked(connection); + spin_unlock_irqrestore(&connection->mStatusGate, flags); +} + +static void veth_finishCapBhLocked(struct VethLpConnection *connection) +{ + struct VethLpEvent *event = &connection->mCapEvent; + struct VethCapData *remoteCap = &(connection->mRemoteCap); + int numAcks = (remoteCap->mUnionData.mFields.mNumberBuffers / remoteCap->mUnionData.mFields.mThreshold) + 1; + + /* Convert timer to jiffies */ + if (connection->mMyCap.mUnionData.mFields.mTimer) + connection->mTimeout = remoteCap->mUnionData.mFields.mTimer * HZ / 1000000; + else + connection->mTimeout = VethAckTimeoutUsec * HZ / 1000000; + + if (connection->mNumberLpAcksAlloced >= numAcks) { + HvLpEvent_Rc returnCode = HvCallEvent_ackLpEvent((struct HvLpEvent *) event); + + if (returnCode == HvLpEvent_Rc_Good) { + connection->mConnectionStatus.mGotCapAcked = 1; + + if (connection->mConnectionStatus.mSentCap != 1) { + connection->mTargetInst = + HvCallEvent_getTargetLpInstanceId(connection->mRemoteLp, HvLpEvent_Type_VirtualLan); + + veth_sendCap(connection); + } else if (connection->mConnectionStatus.mCapAcked == 1) { + if (connection->mConnectionStatus.mSentMonitor != 1) + veth_sendMonitor(connection); + } + } else { + veth_error_printk("Failed to ack remote cap for lpar %d with rc %x\n", connection->mRemoteLp, + (int) returnCode); + veth_failMe(connection); + } + } else { + veth_error_printk("Couldn't allocate all the frames ack events for lpar %d\n", connection->mRemoteLp); + event->mBaseEvent.xRc = HvLpEvent_Rc_BufferNotAvailable; + HvCallEvent_ackLpEvent((struct HvLpEvent *) event); + } +} + +int proc_veth_dump_connection(char *page, char **start, off_t off, int count, int *eof, void *data) { + char *out = page; + long whichConnection = (long) data; + int len = 0; + struct VethLpConnection *connection = NULL; + + if ((whichConnection < 0) || (whichConnection > HvMaxArchitectedLps) || (mFabricMgr == NULL)) { + veth_error_printk("Got bad data from /proc file system\n"); + len = sprintf(page, "ERROR\n"); + } else { + int thereWasStuffBefore = 0; + connection = &(mFabricMgr->mConnection[whichConnection]); + + out += sprintf(out, "Remote Lp:\t%d\n", connection->mRemoteLp); + out += sprintf(out, "Source Inst:\t%04X\n", connection->mSourceInst); + out += sprintf(out, "Target Inst:\t%04X\n", connection->mTargetInst); + out += sprintf(out, "Num Msgs:\t%d\n", connection->mNumMsgs); + out += sprintf(out, "Num Lp Acks:\t%d\n", connection->mNumberLpAcksAlloced); + out += sprintf(out, "Num Acks:\t%d\n", connection->mNumAcks); + + if (connection->mConnectionStatus.mOpen) { + out += sprintf(out, "mConnectionStatus.mCapMonAlloced) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "CapMonAlloced"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mBaseMsgsAlloced) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "BaseMsgsAlloced"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mSentCap) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "SentCap"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mCapAcked) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "CapAcked"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mGotCap) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "GotCap"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mGotCapAcked) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "GotCapAcked"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mSentMonitor) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "SentMonitor"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mPopulatedRings) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "PopulatedRings"); + thereWasStuffBefore = 1; + } + + if (connection->mConnectionStatus.mFailed) { + if (thereWasStuffBefore) + out += sprintf(out, "/"); + else + out += sprintf(out, "<"); + out += sprintf(out, "Failed"); + thereWasStuffBefore = 1; + } + + if (thereWasStuffBefore) + out += sprintf(out, ">"); + + out += sprintf(out, "\n"); + + out += sprintf(out, "Capabilities (System:):\n"); + out += sprintf(out, "\tLocal:<"); + out += sprintf(out, "%d/%d/%d/%d>\n", + connection->mMyCap.mUnionData.mFields.mVersion, + connection->mMyCap.mUnionData.mFields.mNumberBuffers, + connection->mMyCap.mUnionData.mFields.mThreshold, connection->mMyCap.mUnionData.mFields.mTimer); + out += sprintf(out, "\tRemote:<"); + out += sprintf(out, "%d/%d/%d/%d>\n", + connection->mRemoteCap.mUnionData.mFields.mVersion, + connection->mRemoteCap.mUnionData.mFields.mNumberBuffers, + connection->mRemoteCap.mUnionData.mFields.mThreshold, + connection->mRemoteCap.mUnionData.mFields.mTimer); + len = out - page; + } + len -= off; + if (len < count) { + *eof = 1; + if (len <= 0) + return 0; + } else + len = count; + *start = page + off; + return len; +} + +int proc_veth_dump_port(char *page, char **start, off_t off, int count, int *eof, void *data) { + char *out = page; + long whichPort = (long) data; + int len = 0; + struct VethPort *port = NULL; + + if ((whichPort < 0) || (whichPort > HvMaxArchitectedVirtualLans) || (mFabricMgr == NULL)) + len = sprintf(page, "Virtual ethernet is not configured.\n"); + else { + int i = 0; + u32 *myAddr; + u16 *myEndAddr; + port = mFabricMgr->mPorts[whichPort]; + + if (port != NULL) { + myAddr = (u32 *) & (port->mMyAddress); + myEndAddr = (u16 *) (myAddr + 1); + out += sprintf(out, "Net device:\t%p\n", port->mDev); + out += sprintf(out, "Net device name:\t%s\n", port->mDev->name); + out += sprintf(out, "Address:\t%08X%04X\n", myAddr[0], myEndAddr[0]); + out += sprintf(out, "Promiscuous:\t%d\n", port->mPromiscuous); + out += sprintf(out, "All multicast:\t%d\n", port->mAllMcast); + out += sprintf(out, "Number sk_buffs linearized:\t%u\n", port->mLinearized); + out += sprintf(out, "Number multicast:\t%d\n", port->mNumAddrs); + + for (i = 0; i < port->mNumAddrs; ++i) { + u32 *multi = (u32 *) & (port->mMcasts[i]); + u16 *multiEnd = (u16 *) (multi + 1); + out += sprintf(out, " %08X%04X\n", multi[0], multiEnd[0]); + } + } else { + out += sprintf(page, "veth%d is not configured.\n", (int) whichPort); + } + + len = out - page; + } + len -= off; + if (len < count) { + *eof = 1; + if (len <= 0) + return 0; + } else + len = count; + *start = page + off; + return len; +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/veth.h linuxppc64_2_4/drivers/iseries/veth.h --- ../kernel.org/linux-2.4.19/drivers/iseries/veth.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/veth.h Tue Mar 12 08:51:03 2002 @@ -0,0 +1,242 @@ +/* File veth.h created by Kyle A. Lucke on Mon Aug 7 2000. */ + +/* Change Activity: */ +/* End Change Activity */ + +#ifndef _VETH_H +#define _VETH_H + +#ifndef _HVTYPES_H +#include +#endif +#ifndef _HVLPEVENT_H +#include +#endif +#include + +#define VethEventNumTypes (4) +#define VethEventTypeCap (0) +#define VethEventTypeFrames (1) +#define VethEventTypeMonitor (2) +#define VethEventTypeFramesAck (3) + +#define VethMaxFramesMsgsAcked (20) +#define VethMaxFramesMsgs (0xFFFF) +#define VethMaxFramesPerMsg (6) +#define VethAckTimeoutUsec (1000000) + +#define VETHSTACKTYPE(T) struct VethStack##T +#define VETHSTACK(T) \ +VETHSTACKTYPE(T) \ +{ \ +struct T *head; \ +spinlock_t lock; \ +} +#define VETHSTACKCTOR(s) do { (s)->head = NULL; spin_lock_init(&(s)->lock); } while(0) +#define VETHSTACKPUSH(s, p) \ +do { \ +unsigned long flags; \ +spin_lock_irqsave(&(s)->lock,flags); \ +(p)->next = (s)->head; \ +(s)->head = (p); \ +spin_unlock_irqrestore(&(s)->lock, flags); \ +} while(0) + +#define VETHSTACKPOP(s,p) \ +do { \ +unsigned long flags; \ +spin_lock_irqsave(&(s)->lock,flags); \ +(p) = (s)->head; \ +if ((s)->head != NULL) \ +{ \ +(s)->head = (s)->head->next; \ +} \ +spin_unlock_irqrestore(&(s)->lock, flags); \ +} while(0) + +#define VETHQUEUE(T) \ +struct VethQueue##T \ +{ \ +T *head; \ +T *tail; \ +spinlock_t lock; \ +} +#define VETHQUEUECTOR(q) do { (q)->head = NULL; (q)->tail = NULL; spin_lock_init(&(q)->lock); } while(0) +#define VETHQUEUEENQ(q, p) \ +do { \ +unsigned long flags; \ +spin_lock_irqsave(&(q)->lock,flags); \ +(p)->next = NULL; \ +if ((q)->head != NULL) \ +{ \ +(q)->head->next = (p); \ +(q)->head = (p); \ +} \ +else \ +{ \ +(q)->tail = (q)->head = (p); \ +} \ +spin_unlock_irqrestore(&(q)->lock, flags); \ +} while(0) + +#define VETHQUEUEDEQ(q,p) \ +do { \ +unsigned long flags; \ +spin_lock_irqsave(&(q)->lock,flags); \ +(p) = (q)->tail; \ +if ((p) != NULL) \ +{ \ +(q)->tail = (p)->next; \ +(p)->next = NULL; \ +} \ +if ((q)->tail == NULL) \ +(q)->head = NULL; \ +spin_unlock_irqrestore(&(q)->lock, flags); \ +} while(0) + +struct VethFramesData { + u32 mAddress[6]; + u16 mLength[6]; + u32 mEofMask:6; + u32 mReserved:26; +}; + +struct VethFramesAckData { + u16 mToken[VethMaxFramesMsgsAcked]; +}; + +struct VethCapData { + union { + struct Fields { + u8 mVersion; + u8 mReserved1; + u16 mNumberBuffers; + u16 mThreshold; + u16 mReserved2; + u32 mTimer; + u32 mReserved3; + u64 mReserved4; + u64 mReserved5; + u64 mReserved6; + } mFields; + struct NoFields { + u64 mReserved1; + u64 mReserved2; + u64 mReserved3; + u64 mReserved4; + u64 mReserved5; + } mNoFields; + } mUnionData; +}; + +struct VethFastPathData { + u64 mData1; + u64 mData2; + u64 mData3; + u64 mData4; + u64 mData5; +}; + +struct VethLpEvent { + struct HvLpEvent mBaseEvent; + union { + struct VethFramesData mSendData; + struct VethCapData mCapabilitiesData; + struct VethFramesAckData mFramesAckData; + struct VethFastPathData mFastPathData; + } mDerivedData; + +}; + +struct VethMsg { + struct VethMsg *next; + union { + struct VethFramesData mSendData; + struct VethFastPathData mFpData; + } mEvent; + int mIndex; + unsigned long mInUse; + struct sk_buff *mSkb; +}; + + +struct VethControlBlock { + struct net_device *mDev; + struct VethControlBlock *mNext; + HvLpVirtualLanIndex mVlanId; +}; + +struct VethLpConnection { + u64 mEyecatcher; + HvLpIndex mRemoteLp; + HvLpInstanceId mSourceInst; + HvLpInstanceId mTargetInst; + u32 mNumMsgs; + struct VethMsg *mMsgs; + int mNumberRcvMsgs; + int mNumberLpAcksAlloced; + union { + struct VethFramesAckData mAckData; + struct VethFastPathData mFpData; + } mEventData; + spinlock_t mAckGate; + u32 mNumAcks; + spinlock_t mStatusGate; + struct { + u64 mOpen:1; + u64 mCapMonAlloced:1; + u64 mBaseMsgsAlloced:1; + u64 mSentCap:1; + u64 mCapAcked:1; + u64 mGotCap:1; + u64 mGotCapAcked:1; + u64 mSentMonitor:1; + u64 mPopulatedRings:1; + u64 mReserved:54; + u64 mFailed:1; + } mConnectionStatus; + struct VethCapData mMyCap; + struct VethCapData mRemoteCap; + unsigned long mCapAckBhPending; + struct tq_struct mCapAckBhTq; + struct VethLpEvent mCapAckEvent; + unsigned long mCapBhPending; + struct tq_struct mCapBhTq; + struct VethLpEvent mCapEvent; + unsigned long mMonitorAckBhPending; + struct tq_struct mMonitorAckBhTq; + struct VethLpEvent mMonitorAckEvent; + unsigned long mAllocBhPending; + struct tq_struct mAllocBhTq; + int mNumberAllocated; + struct timer_list mAckTimer; + u32 mTimeout; + VETHSTACK(VethMsg) mMsgStack; +}; +#define HVMAXARCHITECTEDVIRTUALLANS 16 +struct VethPort { + struct net_device *mDev; + struct net_device_stats mStats; + int mLock; + u64 mMyAddress; + int mPromiscuous; + int mAllMcast; + rwlock_t mMcastGate; + int mNumAddrs; + u64 mMcasts[12]; + u32 mLinearized; +}; + +struct VethFabricMgr { + u64 mEyecatcher; + HvLpIndex mThisLp; + struct VethLpConnection mConnection[HVMAXARCHITECTEDLPS]; + spinlock_t mPortListGate; + u64 mNumPorts; + struct VethPort *mPorts[HVMAXARCHITECTEDVIRTUALLANS]; +}; + +int proc_veth_dump_connection(char *page, char **start, off_t off, int count, int *eof, void *data); +int proc_veth_dump_port(char *page, char **start, off_t off, int count, int *eof, void *data); + +#endif /* _VETH_H */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/vio.h linuxppc64_2_4/drivers/iseries/vio.h --- ../kernel.org/linux-2.4.19/drivers/iseries/vio.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/vio.h Wed Feb 27 11:09:05 2002 @@ -0,0 +1,130 @@ +/* -*- linux-c -*- + * drivers/char/vio.h + * + * iSeries Virtual I/O Message Path header + * + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This header file is used by the iSeries virtual I/O device + * drivers. It defines the interfaces to the common functions + * (implemented in drivers/char/viopath.h) as well as defining + * common functions and structures. Currently (at the time I + * wrote this comment) the iSeries virtual I/O device drivers + * that use this are + * drivers/block/viodasd.c + * drivers/char/viocons.c + * drivers/char/viotape.c + * drivers/cdrom/viocd.c + * + * The iSeries virtual ethernet support (veth.c) uses a whole + * different set of functions. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) anyu later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _VIO_H +#define _VIO_H + +#include +#include + +/* iSeries virtual I/O events use the subtype field in + * HvLpEvent to figure out what kind of vio event is coming + * in. We use a table to route these, and this defines + * the maximum number of distinct subtypes + */ +#define VIO_MAX_SUBTYPES 7 + +/* Each subtype can register a handler to process their events. + * The handler must have this interface. + */ +typedef void (vio_event_handler_t) (struct HvLpEvent * event); + +int viopath_open(HvLpIndex remoteLp, int subtype, int numReq); +int viopath_close(HvLpIndex remoteLp, int subtype, int numReq); +int vio_setHandler(int subtype, vio_event_handler_t * beh); +int vio_clearHandler(int subtype); +int viopath_isactive(HvLpIndex lp); +HvLpInstanceId viopath_sourceinst(HvLpIndex lp); +HvLpInstanceId viopath_targetinst(HvLpIndex lp); +void vio_set_hostlp(void); +void *vio_get_event_buffer(int subtype); +void vio_free_event_buffer(int subtype, void *buffer); + +extern HvLpIndex viopath_hostLp; +extern HvLpIndex viopath_ourLp; + +#define VIO_MESSAGE "iSeries virtual I/O: " +#define KERN_DEBUG_VIO KERN_DEBUG VIO_MESSAGE +#define KERN_INFO_VIO KERN_INFO VIO_MESSAGE +#define KERN_WARNING_VIO KERN_WARNING VIO_MESSAGE + +#define VIOCHAR_MAX_DATA 200 + +#define VIOMAJOR_SUBTYPE_MASK 0xff00 +#define VIOMINOR_SUBTYPE_MASK 0x00ff +#define VIOMAJOR_SUBTYPE_SHIFT 8 + +#define VIOVERSION 0x0101 + +/* +This is the general structure for VIO errors; each module should have a table +of them, and each table should be terminated by an entry of { 0, 0, NULL }. +Then, to find a specific error message, a module should pass its local table +and the return code. +*/ +struct vio_error_entry { + u16 rc; + int errno; + const char *msg; +}; +const struct vio_error_entry *vio_lookup_rc(const struct vio_error_entry + *local_table, u16 rc); + +enum viosubtypes { + viomajorsubtype_monitor = 0x0100, + viomajorsubtype_blockio = 0x0200, + viomajorsubtype_chario = 0x0300, + viomajorsubtype_config = 0x0400, + viomajorsubtype_cdio = 0x0500, + viomajorsubtype_tape = 0x0600 +}; + + +enum vioconfigsubtype { + vioconfigget = 0x0001, +}; + +enum viorc { + viorc_good = 0x0000, + viorc_noConnection = 0x0001, + viorc_noReceiver = 0x0002, + viorc_noBufferAvailable = 0x0003, + viorc_invalidMessageType = 0x0004, + viorc_invalidRange = 0x0201, + viorc_invalidToken = 0x0202, + viorc_DMAError = 0x0203, + viorc_useError = 0x0204, + viorc_releaseError = 0x0205, + viorc_invalidDisk = 0x0206, + viorc_openRejected = 0x0301 +}; + + +#endif /* _VIO_H */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/viocd.c linuxppc64_2_4/drivers/iseries/viocd.c --- ../kernel.org/linux-2.4.19/drivers/iseries/viocd.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/viocd.c Fri Mar 29 11:51:14 2002 @@ -0,0 +1,818 @@ +/* -*- linux-c -*- + * drivers/cdrom/viocd.c + * + *************************************************************************** + * iSeries Virtual CD Rom + * + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) anyu later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + *************************************************************************** + * This routine provides access to CD ROM drives owned and managed by an + * OS/400 partition running on the same box as this Linux partition. + * + * All operations are performed by sending messages back and forth to + * the OS/400 partition. + * + * + * This device driver can either use it's own major number, or it can + * pretend to be an AZTECH drive. This is controlled with a + * CONFIG option. You can either call this an elegant solution to the + * fact that a lot of software doesn't recognize a new CD major number... + * or you can call this a really ugly hack. Your choice. + * + */ + +#include +#include + +/* Decide on the proper naming convention to use for our device */ +#ifdef CONFIG_DEVFS_FS +#define VIOCD_DEVICE "cdroms/cdrom%d" +#define VIOCD_DEVICE_OFFSET 0 +#else +#ifdef CONFIG_VIOCD_AZTECH +#define VIOCD_DEVICE "aztcd" +#define VIOCD_DEVICE_OFFSET 0 +#else +#define VIOCD_DEVICE "iseries/vcd%c" +#define VIOCD_DEVICE_OFFSET 'a' +#endif +#endif + +/*************************************************************************** + * Decide if we are using our own major or pretending to be an AZTECH drive + ***************************************************************************/ +#ifdef CONFIG_VIOCD_AZTECH +#define MAJOR_NR AZTECH_CDROM_MAJOR +#define do_viocd_request do_aztcd_request +#else +#define MAJOR_NR VIOCD_MAJOR +#endif + +#define VIOCD_VERS "1.04" + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "vio.h" +#include + +extern struct pci_dev * iSeries_vio_dev; + +#define signalLpEventFast HvCallEvent_signalLpEventFast + +struct viocdlpevent { + struct HvLpEvent event; + u32 mReserved1; + u16 mVersion; + u16 mSubTypeRc; + u16 mDisk; + u16 mFlags; + u32 mToken; + u64 mOffset; // On open, the max number of disks + u64 mLen; // On open, the size of the disk + u32 mBlockSize; // Only set on open + u32 mMediaSize; // Only set on open +}; + +enum viocdsubtype { + viocdopen = 0x0001, + viocdclose = 0x0002, + viocdread = 0x0003, + viocdwrite = 0x0004, + viocdlockdoor = 0x0005, + viocdgetinfo = 0x0006, + viocdcheck = 0x0007 +}; + +/* Should probably make this a module parameter....sigh + */ +#define VIOCD_MAX_CD 8 +int viocd_blocksizes[VIOCD_MAX_CD]; +static u64 viocd_size_in_bytes[VIOCD_MAX_CD]; + +static const struct vio_error_entry viocd_err_table[] = { + {0x0201, EINVAL, "Invalid Range"}, + {0x0202, EINVAL, "Invalid Token"}, + {0x0203, EIO, "DMA Error"}, + {0x0204, EIO, "Use Error"}, + {0x0205, EIO, "Release Error"}, + {0x0206, EINVAL, "Invalid CD"}, + {0x020C, EROFS, "Read Only Device"}, + {0x020D, EIO, "Changed or Missing Volume (or Varied Off?)"}, + {0x020E, EIO, "Optical System Error (Varied Off?)"}, + {0x02FF, EIO, "Internal Error"}, + {0x3010, EIO, "Changed Volume"}, + {0xC100, EIO, "Optical System Error"}, + {0x0000, 0, NULL}, +}; + +/* This is the structure we use to exchange info between driver and interrupt + * handler + */ +struct viocd_waitevent { + struct semaphore *sem; + int rc; + u16 subtypeRc; + int changed; +}; + +/* this is a lookup table for the true capabilities of a device */ +struct capability_entry { + char *type; + int capability; +}; + +static struct capability_entry capability_table[] = { + { "6330", CDC_LOCK | CDC_DVD_RAM }, + { "6321", CDC_LOCK }, + { "632B", 0 }, + { NULL , CDC_LOCK }, +}; + +struct block_device_operations viocd_fops = +{ + owner: THIS_MODULE, + open: cdrom_open, + release: cdrom_release, + ioctl: cdrom_ioctl, + check_media_change: cdrom_media_changed, +}; + +/* These are our internal structures for keeping track of devices + */ +static int viocd_numdev; + +struct cdrom_info { + char rsrcname[10]; + char type[4]; + char model[3]; +}; +static struct cdrom_info *viocd_unitinfo = NULL; + +struct disk_info{ + u32 useCount; + u32 blocksize; + u32 mediasize; +}; +static struct disk_info viocd_diskinfo[VIOCD_MAX_CD]; + +static struct cdrom_device_info viocd_info[VIOCD_MAX_CD]; + +static spinlock_t viocd_lock = SPIN_LOCK_UNLOCKED; + +#define MAX_CD_REQ 1 +static LIST_HEAD(reqlist); + +/* End a request + */ +static int viocd_end_request(struct request *req, int uptodate) +{ + if (end_that_request_first(req, uptodate, DEVICE_NAME)) + return 0; + end_that_request_last(req); + return 1; +} + + +/* Get info on CD devices from OS/400 + */ +static void get_viocd_info(void) +{ + dma_addr_t dmaaddr; + HvLpEvent_Rc hvrc; + int i; + DECLARE_MUTEX_LOCKED(Semaphore); + struct viocd_waitevent we; + + // If we don't have a host, bail out + if (viopath_hostLp == HvLpIndexInvalid) + return; + + if (viocd_unitinfo == NULL) + viocd_unitinfo = + kmalloc(sizeof(struct cdrom_info) * VIOCD_MAX_CD, + GFP_KERNEL); + + memset(viocd_unitinfo, 0x00, + sizeof(struct cdrom_info) * VIOCD_MAX_CD); + + dmaaddr = pci_map_single(iSeries_vio_dev, viocd_unitinfo, + sizeof(struct cdrom_info) * VIOCD_MAX_CD, + PCI_DMA_FROMDEVICE); + if (dmaaddr == 0xFFFFFFFF) { + printk(KERN_WARNING_VIO "error allocating tce\n"); + return; + } + + we.sem = &Semaphore; + + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | viocdgetinfo, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + dmaaddr, + 0, + sizeof(struct cdrom_info) * VIOCD_MAX_CD, + 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk(KERN_WARNING_VIO "cdrom error sending event. rc %d\n", (int) hvrc); + return; + } + + down(&Semaphore); + + if (we.rc) { + const struct vio_error_entry *err = vio_lookup_rc(viocd_err_table, we.subtypeRc); + printk(KERN_WARNING_VIO "bad rc %d:0x%04X on getinfo: %s\n", we.rc, we.subtypeRc, err->msg); + return; + } + + + for (i = 0; (i < VIOCD_MAX_CD) && (viocd_unitinfo[i].rsrcname[0]); i++) { + viocd_numdev++; + } +} + +/* Open a device + */ +static int viocd_open(struct cdrom_device_info *cdi, int purpose) +{ + DECLARE_MUTEX_LOCKED(Semaphore); + int device_no = MINOR(cdi->dev); + HvLpEvent_Rc hvrc; + struct viocd_waitevent we; + struct disk_info *diskinfo = &viocd_diskinfo[device_no]; + + // If we don't have a host, bail out + if (viopath_hostLp == HvLpIndexInvalid || device_no >= viocd_numdev) + return -ENODEV; + + we.sem = &Semaphore; + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | viocdopen, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + ((u64) device_no << 48), + 0, 0, 0); + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEventFast %d\n", + (int) hvrc); + return -EIO; + } + + down(&Semaphore); + + if (we.rc) { + const struct vio_error_entry *err = vio_lookup_rc(viocd_err_table, we.subtypeRc); + printk(KERN_WARNING_VIO "bad rc %d:0x%04X on open: %s\n", we.rc, we.subtypeRc, err->msg); + return -err->errno; + } + + if (diskinfo->useCount == 0) { + if(diskinfo->blocksize > 0) { + viocd_blocksizes[device_no] = diskinfo->blocksize; + viocd_size_in_bytes[device_no] = diskinfo->blocksize * diskinfo->mediasize; + } else { + viocd_size_in_bytes[device_no] = 0xFFFFFFFFFFFFFFFF; + } + } + MOD_INC_USE_COUNT; + return 0; +} + +/* Release a device + */ +static void viocd_release(struct cdrom_device_info *cdi) +{ + int device_no = MINOR(cdi->dev); + HvLpEvent_Rc hvrc; + + /* If we don't have a host, bail out */ + if (viopath_hostLp == HvLpIndexInvalid + || device_no >= viocd_numdev) + return; + + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | viocdclose, + HvLpEvent_AckInd_NoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + 0, + VIOVERSION << 16, + ((u64) device_no << 48), + 0, 0, 0); + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEventFast %d\n", (int) hvrc); + return; + } + + MOD_DEC_USE_COUNT; +} + +/* Send a read or write request to OS/400 + */ +static int send_request(struct request *req) +{ + HvLpEvent_Rc hvrc; + dma_addr_t dmaaddr; + int device_no = DEVICE_NR(req->rq_dev); + u64 start = req->sector * 512, + len = req->current_nr_sectors * 512; + char reading = req->cmd == READ; + u16 command = reading ? viocdread : viocdwrite; + + + if(start + len > viocd_size_in_bytes[device_no]) { + printk(KERN_WARNING_VIO "viocd%d; access position %lx, past size %lx\n", + device_no, start + len, viocd_size_in_bytes[device_no]); + return -1; + } + + dmaaddr = pci_map_single(iSeries_vio_dev, req->buffer, len, + reading ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + if (dmaaddr == 0xFFFFFFFF) { + printk(KERN_WARNING_VIO "error allocating tce for address %p len %ld\n", + req->buffer, len); + return -1; + } + + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | command, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + (u64) (unsigned long) req->buffer, + VIOVERSION << 16, + ((u64) device_no << 48) | dmaaddr, + start, len, 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk(KERN_WARNING_VIO "hv error on op %d\n", (int) hvrc); + return -1; + } + + return 0; +} + + +/* Do a request + */ +static int rwreq; +static void do_viocd_request(request_queue_t * q) +{ + for (;;) { + struct request *req; + char err_str[80] = ""; + int device_no; + + INIT_REQUEST; + if (rwreq >= MAX_CD_REQ) { + return; + } + + device_no = CURRENT_DEV; + + /* remove the current request from the queue */ + req = CURRENT; + blkdev_dequeue_request(req); + + /* check for any kind of error */ + if (device_no > viocd_numdev) + sprintf(err_str, "Invalid device number %d", device_no); + else if (send_request(req) < 0) + strcpy(err_str, "unable to send message to OS/400!"); + + /* if we had any sort of error, log it and cancel the request */ + if (*err_str) { + printk(KERN_WARNING_VIO "%s\n", err_str); + viocd_end_request(req, 0); + } else { + spin_lock(&viocd_lock); + list_add_tail(&req->queue, &reqlist); + ++rwreq; + spin_unlock(&viocd_lock); + } + } +} + +/* Check if the CD changed + */ +static int viocd_media_changed(struct cdrom_device_info *cdi, int disc_nr) +{ + struct viocd_waitevent we; + HvLpEvent_Rc hvrc; + int device_no = MINOR(cdi->dev); + + /* This semaphore is raised in the interrupt handler */ + DECLARE_MUTEX_LOCKED(Semaphore); + + /* Check that we are dealing with a valid hosting partition */ + if (viopath_hostLp == HvLpIndexInvalid) { + printk(KERN_WARNING_VIO "Invalid hosting partition\n"); + return -EIO; + } + + we.sem = &Semaphore; + + /* Send the open event to OS/400 */ + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | viocdcheck, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + ((u64) device_no << 48), + 0, 0, 0); + + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEventFast %d\n", (int) hvrc); + return -EIO; + } + + /* Wait for the interrupt handler to get the response */ + down(&Semaphore); + + /* Check the return code. If bad, assume no change */ + if (we.rc) { + const struct vio_error_entry *err = vio_lookup_rc(viocd_err_table, we.subtypeRc); + printk(KERN_WARNING_VIO "bad rc %d:0x%04X on check_change: %s; Assuming no change\n", we.rc, we.subtypeRc, err->msg); + return 0; + } + + return we.changed; +} + +static int viocd_lock_door(struct cdrom_device_info *cdi, int locking) +{ + HvLpEvent_Rc hvrc; + u64 device_no = MINOR(cdi->dev); + /* NOTE: flags is 1 or 0 so it won't overwrite the device_no */ + u64 flags = !!locking; + /* This semaphore is raised in the interrupt handler */ + DECLARE_MUTEX_LOCKED(Semaphore); + struct viocd_waitevent we = { sem:&Semaphore }; + + /* Check that we are dealing with a valid hosting partition */ + if (viopath_hostLp == HvLpIndexInvalid) { + printk(KERN_WARNING_VIO "Invalid hosting partition\n"); + return -EIO; + } + + we.sem = &Semaphore; + + /* Send the lockdoor event to OS/400 */ + hvrc = signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_cdio | viocdlockdoor, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst(viopath_hostLp), + viopath_targetinst(viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + (device_no << 48) | (flags << 32), + 0, 0, 0); + + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEventFast %d\n", (int) hvrc); + return -EIO; + } + + /* Wait for the interrupt handler to get the response */ + down(&Semaphore); + + /* Check the return code. If bad, assume no change */ + if (we.rc != 0) { + return -EIO; + } + + return 0; +} + +/* This routine handles incoming CD LP events + */ +static void vioHandleCDEvent(struct HvLpEvent *event) +{ + struct viocdlpevent *bevent = (struct viocdlpevent *) event; + struct viocd_waitevent *pwe; + + if (event == NULL) { + /* Notification that a partition went away! */ + return; + } + /* First, we should NEVER get an int here...only acks */ + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + printk(KERN_WARNING_VIO "Yikes! got an int in viocd event handler!\n"); + if (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + } + + switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) { + case viocdopen: + viocd_diskinfo[bevent->mDisk].blocksize = bevent->mBlockSize; + viocd_diskinfo[bevent->mDisk].mediasize = bevent->mMediaSize; + /* FALLTHROUGH !! */ + case viocdgetinfo: + case viocdlockdoor: + pwe = (struct viocd_waitevent *) (unsigned long) event->xCorrelationToken; + pwe->rc = event->xRc; + pwe->subtypeRc = bevent->mSubTypeRc; + up(pwe->sem); + break; + + case viocdclose: + break; + + case viocdwrite: + case viocdread:{ + unsigned long flags; + int reading = ((event->xSubtype & VIOMINOR_SUBTYPE_MASK) == viocdread); + struct request *req = blkdev_entry_to_request(reqlist.next); + /* Since this is running in interrupt mode, we need to make sure we're not + * stepping on any global I/O operations + */ + spin_lock_irqsave(&io_request_lock, flags); + + pci_unmap_single(iSeries_vio_dev, + bevent->mToken, + bevent->mLen, + reading ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); + + /* find the event to which this is a response */ + while ((&req->queue != &reqlist) && + ((u64) (unsigned long) req->buffer != bevent->event.xCorrelationToken)) + req = blkdev_entry_to_request(req->queue.next); + + /* if the event was not there, then what are we responding to?? */ + if (&req->queue == &reqlist) { + printk(KERN_WARNING_VIO "Yikes! we never enqueued this guy!\n"); + spin_unlock_irqrestore(&io_request_lock, + flags); + break; + } + + /* we don't need to keep it around anymore... */ + spin_lock(&viocd_lock); + list_del(&req->queue); + --rwreq; + spin_unlock(&viocd_lock); + { + char stat = event->xRc == HvLpEvent_Rc_Good; + int nsect = bevent->mLen >> 9; + + if (!stat) { + const struct vio_error_entry *err = + vio_lookup_rc(viocd_err_table, bevent->mSubTypeRc); + printk(KERN_WARNING_VIO "request %p failed with rc %d:0x%04X: %s\n", + req->buffer, event->xRc, bevent->mSubTypeRc, err->msg); + } + while ((nsect > 0) && (req->bh)) { + nsect -= req->current_nr_sectors; + viocd_end_request(req, stat); + } + /* we weren't done yet */ + if (req->bh) { + if (send_request(req) < 0) { + printk(KERN_WARNING_VIO + "couldn't re-submit req %p\n", req->buffer); + viocd_end_request(req, 0); + } else { + spin_lock(&viocd_lock); + list_add_tail(&req->queue, &reqlist); + ++rwreq; + spin_unlock(&viocd_lock); + } + } + } + + /* restart handling of incoming requests */ + do_viocd_request(NULL); + spin_unlock_irqrestore(&io_request_lock, flags); + break; + } + case viocdcheck: + pwe = (struct viocd_waitevent *) (unsigned long) event->xCorrelationToken; + pwe->rc = event->xRc; + pwe->subtypeRc = bevent->mSubTypeRc; + pwe->changed = bevent->mFlags; + up(pwe->sem); + break; + + default: + printk(KERN_WARNING_VIO "message with invalid subtype %0x04X!\n", event->xSubtype & VIOMINOR_SUBTYPE_MASK); + if (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + } +} + +/* Our file operations table + */ +static struct cdrom_device_ops viocd_dops = { + open:viocd_open, + release:viocd_release, + media_changed:viocd_media_changed, + lock_door:viocd_lock_door, + capability:CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK | CDC_SELECT_SPEED | CDC_SELECT_DISC | CDC_MULTI_SESSION | CDC_MCN | CDC_MEDIA_CHANGED | CDC_PLAY_AUDIO | CDC_RESET | CDC_IOCTLS | CDC_DRIVE_STATUS | CDC_GENERIC_PACKET | CDC_CD_R | CDC_CD_RW | CDC_DVD | CDC_DVD_R | CDC_DVD_RAM +}; + +/* Handle reads from the proc file system + */ +static int proc_read(char *buf, char **start, off_t offset, + int blen, int *eof, void *data) +{ + int len = 0; + int i; + + for (i = 0; i < viocd_numdev; i++) { + len += + sprintf(buf + len, + "viocd device %d is iSeries resource %10.10s type %4.4s, model %3.3s\n", + i, viocd_unitinfo[i].rsrcname, + viocd_unitinfo[i].type, + viocd_unitinfo[i].model); + } + *eof = 1; + return len; +} + + +/* setup our proc file system entries + */ +void viocd_proc_init(struct proc_dir_entry *iSeries_proc) +{ + struct proc_dir_entry *ent; + ent = create_proc_entry("viocd", S_IFREG | S_IRUSR, iSeries_proc); + if (!ent) + return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_read; +} + +/* clean up our proc file system entries + */ +void viocd_proc_delete(struct proc_dir_entry *iSeries_proc) +{ + remove_proc_entry("viocd", iSeries_proc); +} + +static int find_capability(const char *type) +{ + struct capability_entry *entry; + for(entry = capability_table; entry->type; ++entry) + if(!strncmp(entry->type, type, 4)) + break; + return entry->capability; +} + +/* Initialize the whole device driver. Handle module and non-module + * versions + */ +__init int viocd_init(void) +{ + int i, rc; + + if (viopath_hostLp == HvLpIndexInvalid) + vio_set_hostlp(); + + /* If we don't have a host, bail out */ + if (viopath_hostLp == HvLpIndexInvalid) + return -ENODEV; + + rc = viopath_open(viopath_hostLp, viomajorsubtype_cdio, MAX_CD_REQ+2); + if (rc) { + printk(KERN_WARNING_VIO "error opening path to host partition %d\n", + viopath_hostLp); + return rc; + } + + /* Initialize our request handler + */ + rwreq = 0; + vio_setHandler(viomajorsubtype_cdio, vioHandleCDEvent); + + memset(&viocd_diskinfo, 0x00, sizeof(viocd_diskinfo)); + + get_viocd_info(); + + if (viocd_numdev == 0) { + vio_clearHandler(viomajorsubtype_cdio); + viopath_close(viopath_hostLp, viomajorsubtype_cdio, MAX_CD_REQ+2); + return 0; + } + + printk(KERN_INFO_VIO + "%s: iSeries Virtual CD vers %s, major %d, max disks %d, hosting partition %d\n", + DEVICE_NAME, VIOCD_VERS, MAJOR_NR, VIOCD_MAX_CD, viopath_hostLp); + + if (devfs_register_blkdev(MAJOR_NR, "viocd", &viocd_fops) != 0) { + printk(KERN_WARNING_VIO "Unable to get major %d for viocd CD-ROM\n", MAJOR_NR); + return -EIO; + } + + blksize_size[MAJOR_NR] = viocd_blocksizes; + blk_init_queue(BLK_DEFAULT_QUEUE(MAJOR_NR), DEVICE_REQUEST); + read_ahead[MAJOR_NR] = 4; + + memset(&viocd_info, 0x00, sizeof(viocd_info)); + for (i = 0; i < viocd_numdev; i++) { + viocd_info[i].dev = MKDEV(MAJOR_NR, i); + viocd_info[i].ops = &viocd_dops; + viocd_info[i].speed = 4; + viocd_info[i].capacity = 1; + viocd_info[i].mask = ~find_capability(viocd_unitinfo[i].type); + sprintf(viocd_info[i].name, VIOCD_DEVICE, VIOCD_DEVICE_OFFSET + i); + if (register_cdrom(&viocd_info[i]) != 0) { + printk(KERN_WARNING_VIO "Cannot register viocd CD-ROM %s!\n", viocd_info[i].name); + } else { + printk(KERN_INFO_VIO + "cd %s is iSeries resource %10.10s type %4.4s, model %3.3s\n", + viocd_info[i].name, + viocd_unitinfo[i].rsrcname, + viocd_unitinfo[i].type, + viocd_unitinfo[i].model); + } + } + + /* + * Create the proc entry + */ + iSeries_proc_callback(&viocd_proc_init); + + return 0; +} + +#ifdef MODULE +void viocd_exit(void) +{ + int i; + for (i = 0; i < viocd_numdev; i++) { + if (unregister_cdrom(&viocd_info[i]) != 0) { + printk(KERN_WARNING_VIO "Cannot unregister viocd CD-ROM %s!\n", viocd_info[i].name); + } + } + if ((devfs_unregister_blkdev(MAJOR_NR, "viocd") == -EINVAL)) { + printk(KERN_WARNING_VIO "can't unregister viocd\n"); + return; + } + blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR)); + if (viocd_unitinfo) + kfree(viocd_unitinfo); + + iSeries_proc_callback(&viocd_proc_delete); + + viopath_close(viopath_hostLp, viomajorsubtype_cdio, MAX_CD_REQ+2); + vio_clearHandler(viomajorsubtype_cdio); +} +#endif + +#ifdef MODULE +module_init(viocd_init); +module_exit(viocd_exit); +MODULE_LICENSE("GPL"); +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/viocons.c linuxppc64_2_4/drivers/iseries/viocons.c --- ../kernel.org/linux-2.4.19/drivers/iseries/viocons.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/viocons.c Thu Apr 25 19:59:10 2002 @@ -0,0 +1,1390 @@ +/* -*- linux-c -*- + * drivers/char/viocons.c + * + * iSeries Virtual Terminal + * + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) anyu later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vio.h" + +#include +#include "asm/iSeries/HvCallEvent.h" +#include "asm/iSeries/HvLpConfig.h" +#include "asm/iSeries/HvCall.h" +#include + +/* Check that the tty_driver_data actually points to our stuff + */ +#define VIOTTY_PARANOIA_CHECK 1 +#define VIOTTY_MAGIC (0x0DCB) + +static int debug; + +static DECLARE_WAIT_QUEUE_HEAD(viocons_wait_queue); + +#define VTTY_PORTS 10 +#define VIOTTY_SERIAL_START 65 + +static u64 sndMsgSeq[VTTY_PORTS]; +static u64 sndMsgAck[VTTY_PORTS]; + +static spinlock_t consolelock = SPIN_LOCK_UNLOCKED; + +/* THe structure of the events that flow between us and OS/400. You can't + * mess with this unless the OS/400 side changes too + */ +struct viocharlpevent { + struct HvLpEvent event; + u32 mReserved1; + u16 mVersion; + u16 mSubTypeRc; + u8 virtualDevice; + u8 immediateDataLen; + u8 immediateData[VIOCHAR_MAX_DATA]; +}; + +#define viochar_window (10) +#define viochar_highwatermark (3) + +enum viocharsubtype { + viocharopen = 0x0001, + viocharclose = 0x0002, + viochardata = 0x0003, + viocharack = 0x0004, + viocharconfig = 0x0005 +}; + +enum viochar_rc { + viochar_rc_ebusy = 1 +}; + +/* When we get writes faster than we can send it to the partition, + * buffer the data here. There is one set of buffers for each virtual + * port. + * Note that bufferUsed is a bit map of used buffers. + * It had better have enough bits to hold NUM_BUF + * the bitops assume it is a multiple of unsigned long + */ +#define NUM_BUF (8) +#define OVERFLOW_SIZE VIOCHAR_MAX_DATA + +static struct overflowBuffers { + unsigned long bufferUsed; + u8 *buffer[NUM_BUF]; + int bufferBytes[NUM_BUF]; + int curbuf; + int bufferOverflow; + int overflowMessage; +} overflow[VTTY_PORTS]; + +static void initDataEvent(struct viocharlpevent *viochar, HvLpIndex lp); + +static struct tty_driver viotty_driver; +static struct tty_driver viottyS_driver; +static int viotty_refcount; + +static struct tty_struct *viotty_table[VTTY_PORTS]; +static struct tty_struct *viottyS_table[VTTY_PORTS]; +static struct termios *viotty_termios[VTTY_PORTS]; +static struct termios *viottyS_termios[VTTY_PORTS]; +static struct termios *viotty_termios_locked[VTTY_PORTS]; +static struct termios *viottyS_termios_locked[VTTY_PORTS]; + +void hvlog(char *fmt, ...) +{ + int i; + static char buf[256]; + va_list args; + va_start(args, fmt); + i = vsprintf(buf, fmt, args); + va_end(args); + HvCall_writeLogBuffer(buf, i); + HvCall_writeLogBuffer("\r", 1); + +} + +/* Our port information. We store a pointer to one entry in the + * tty_driver_data + */ +static struct port_info_tag { + int magic; + struct tty_struct *tty; + HvLpIndex lp; + u8 vcons; + u8 port; +} port_info[VTTY_PORTS]; + +/* Make sure we're pointing to a valid port_info structure. Shamelessly + * plagerized from serial.c + */ +static inline int viotty_paranoia_check(struct port_info_tag *pi, + kdev_t device, const char *routine) +{ +#ifdef VIOTTY_PARANOIA_CHECK + static const char *badmagic = + "%s Warning: bad magic number for port_info struct (%s) in %s\n"; + static const char *badinfo = + "%s Warning: null port_info for (%s) in %s\n"; + + if (!pi) { + printk(badinfo, KERN_WARNING_VIO, kdevname(device), + routine); + return 1; + } + if (pi->magic != VIOTTY_MAGIC) { + printk(badmagic, KERN_WARNING_VIO, kdevname(device), + routine); + return 1; + } +#endif + return 0; +} + +/* + * Handle reads from the proc file system. Right now we just dump the + * state of the first TTY + */ +static int proc_read(char *buf, char **start, off_t offset, + int blen, int *eof, void *data) +{ + int len = 0; + struct tty_struct *tty = viotty_table[0]; + struct termios *termios; + if (tty == NULL) { + len += sprintf(buf + len, "no tty\n"); + *eof = 1; + return len; + } + + len += + sprintf(buf + len, + "tty info: COOK_OUT %ld COOK_IN %ld, NO_WRITE_SPLIT %ld\n", + tty->flags & TTY_HW_COOK_OUT, + tty->flags & TTY_HW_COOK_IN, + tty->flags & TTY_NO_WRITE_SPLIT); + + termios = tty->termios; + if (termios == NULL) { + len += sprintf(buf + len, "no termios\n"); + *eof = 1; + return len; + } + len += sprintf(buf + len, "INTR_CHAR %2.2x\n", INTR_CHAR(tty)); + len += sprintf(buf + len, "QUIT_CHAR %2.2x\n", QUIT_CHAR(tty)); + len += + sprintf(buf + len, "ERASE_CHAR %2.2x\n", ERASE_CHAR(tty)); + len += sprintf(buf + len, "KILL_CHAR %2.2x\n", KILL_CHAR(tty)); + len += sprintf(buf + len, "EOF_CHAR %2.2x\n", EOF_CHAR(tty)); + len += sprintf(buf + len, "TIME_CHAR %2.2x\n", TIME_CHAR(tty)); + len += sprintf(buf + len, "MIN_CHAR %2.2x\n", MIN_CHAR(tty)); + len += sprintf(buf + len, "SWTC_CHAR %2.2x\n", SWTC_CHAR(tty)); + len += + sprintf(buf + len, "START_CHAR %2.2x\n", START_CHAR(tty)); + len += sprintf(buf + len, "STOP_CHAR %2.2x\n", STOP_CHAR(tty)); + len += sprintf(buf + len, "SUSP_CHAR %2.2x\n", SUSP_CHAR(tty)); + len += sprintf(buf + len, "EOL_CHAR %2.2x\n", EOL_CHAR(tty)); + len += + sprintf(buf + len, "REPRINT_CHAR %2.2x\n", REPRINT_CHAR(tty)); + len += + sprintf(buf + len, "DISCARD_CHAR %2.2x\n", DISCARD_CHAR(tty)); + len += + sprintf(buf + len, "WERASE_CHAR %2.2x\n", WERASE_CHAR(tty)); + len += + sprintf(buf + len, "LNEXT_CHAR %2.2x\n", LNEXT_CHAR(tty)); + len += sprintf(buf + len, "EOL2_CHAR %2.2x\n", EOL2_CHAR(tty)); + + len += sprintf(buf + len, "I_IGNBRK %4.4x\n", I_IGNBRK(tty)); + len += sprintf(buf + len, "I_BRKINT %4.4x\n", I_BRKINT(tty)); + len += sprintf(buf + len, "I_IGNPAR %4.4x\n", I_IGNPAR(tty)); + len += sprintf(buf + len, "I_PARMRK %4.4x\n", I_PARMRK(tty)); + len += sprintf(buf + len, "I_INPCK %4.4x\n", I_INPCK(tty)); + len += sprintf(buf + len, "I_ISTRIP %4.4x\n", I_ISTRIP(tty)); + len += sprintf(buf + len, "I_INLCR %4.4x\n", I_INLCR(tty)); + len += sprintf(buf + len, "I_IGNCR %4.4x\n", I_IGNCR(tty)); + len += sprintf(buf + len, "I_ICRNL %4.4x\n", I_ICRNL(tty)); + len += sprintf(buf + len, "I_IUCLC %4.4x\n", I_IUCLC(tty)); + len += sprintf(buf + len, "I_IXON %4.4x\n", I_IXON(tty)); + len += sprintf(buf + len, "I_IXANY %4.4x\n", I_IXANY(tty)); + len += sprintf(buf + len, "I_IXOFF %4.4x\n", I_IXOFF(tty)); + len += sprintf(buf + len, "I_IMAXBEL %4.4x\n", I_IMAXBEL(tty)); + + len += sprintf(buf + len, "O_OPOST %4.4x\n", O_OPOST(tty)); + len += sprintf(buf + len, "O_OLCUC %4.4x\n", O_OLCUC(tty)); + len += sprintf(buf + len, "O_ONLCR %4.4x\n", O_ONLCR(tty)); + len += sprintf(buf + len, "O_OCRNL %4.4x\n", O_OCRNL(tty)); + len += sprintf(buf + len, "O_ONOCR %4.4x\n", O_ONOCR(tty)); + len += sprintf(buf + len, "O_ONLRET %4.4x\n", O_ONLRET(tty)); + len += sprintf(buf + len, "O_OFILL %4.4x\n", O_OFILL(tty)); + len += sprintf(buf + len, "O_OFDEL %4.4x\n", O_OFDEL(tty)); + len += sprintf(buf + len, "O_NLDLY %4.4x\n", O_NLDLY(tty)); + len += sprintf(buf + len, "O_CRDLY %4.4x\n", O_CRDLY(tty)); + len += sprintf(buf + len, "O_TABDLY %4.4x\n", O_TABDLY(tty)); + len += sprintf(buf + len, "O_BSDLY %4.4x\n", O_BSDLY(tty)); + len += sprintf(buf + len, "O_VTDLY %4.4x\n", O_VTDLY(tty)); + len += sprintf(buf + len, "O_FFDLY %4.4x\n", O_FFDLY(tty)); + + len += sprintf(buf + len, "C_BAUD %4.4x\n", C_BAUD(tty)); + len += sprintf(buf + len, "C_CSIZE %4.4x\n", C_CSIZE(tty)); + len += sprintf(buf + len, "C_CSTOPB %4.4x\n", C_CSTOPB(tty)); + len += sprintf(buf + len, "C_CREAD %4.4x\n", C_CREAD(tty)); + len += sprintf(buf + len, "C_PARENB %4.4x\n", C_PARENB(tty)); + len += sprintf(buf + len, "C_PARODD %4.4x\n", C_PARODD(tty)); + len += sprintf(buf + len, "C_HUPCL %4.4x\n", C_HUPCL(tty)); + len += sprintf(buf + len, "C_CLOCAL %4.4x\n", C_CLOCAL(tty)); + len += sprintf(buf + len, "C_CRTSCTS %4.4x\n", C_CRTSCTS(tty)); + + len += sprintf(buf + len, "L_ISIG %4.4x\n", L_ISIG(tty)); + len += sprintf(buf + len, "L_ICANON %4.4x\n", L_ICANON(tty)); + len += sprintf(buf + len, "L_XCASE %4.4x\n", L_XCASE(tty)); + len += sprintf(buf + len, "L_ECHO %4.4x\n", L_ECHO(tty)); + len += sprintf(buf + len, "L_ECHOE %4.4x\n", L_ECHOE(tty)); + len += sprintf(buf + len, "L_ECHOK %4.4x\n", L_ECHOK(tty)); + len += sprintf(buf + len, "L_ECHONL %4.4x\n", L_ECHONL(tty)); + len += sprintf(buf + len, "L_NOFLSH %4.4x\n", L_NOFLSH(tty)); + len += sprintf(buf + len, "L_TOSTOP %4.4x\n", L_TOSTOP(tty)); + len += sprintf(buf + len, "L_ECHOCTL %4.4x\n", L_ECHOCTL(tty)); + len += sprintf(buf + len, "L_ECHOPRT %4.4x\n", L_ECHOPRT(tty)); + len += sprintf(buf + len, "L_ECHOKE %4.4x\n", L_ECHOKE(tty)); + len += sprintf(buf + len, "L_FLUSHO %4.4x\n", L_FLUSHO(tty)); + len += sprintf(buf + len, "L_PENDIN %4.4x\n", L_PENDIN(tty)); + len += sprintf(buf + len, "L_IEXTEN %4.4x\n", L_IEXTEN(tty)); + + *eof = 1; + return len; +} + +/* + * Handle writes to our proc file system. Right now just turns on and off + * our debug flag + */ +static int proc_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + if (count) { + if (buffer[0] == '1') { + printk("viocons: debugging on\n"); + debug = 1; + } else { + printk("viocons: debugging off\n"); + debug = 0; + } + } + return count; +} + +/* + * setup our proc file system entries + */ +void viocons_proc_init(struct proc_dir_entry *iSeries_proc) +{ + struct proc_dir_entry *ent; + ent = + create_proc_entry("viocons", S_IFREG | S_IRUSR, iSeries_proc); + if (!ent) + return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_read; + ent->write_proc = proc_write; +} + +/* + * clean up our proc file system entries + */ +void viocons_proc_delete(struct proc_dir_entry *iSeries_proc) +{ + remove_proc_entry("viocons", iSeries_proc); +} + +/* + * Add data to our pending-send buffers. + * + * NOTE: Don't use printk in here because it gets nastily recursive. hvlog can be + * used to log to the hypervisor buffer + */ +static int bufferAdd(u8 port, const char *buf, size_t len, int userFlag) +{ + size_t bleft = len; + size_t curlen; + char *cbuf = (char *) buf; + int nextbuf; + struct overflowBuffers *pov = &overflow[port]; + while (bleft > 0) { + /* If there is no space left in the current buffer, we have + * filled everything up, so return. If we filled the previous + * buffer we would already have moved to the next one. + */ + if (pov->bufferBytes[pov->curbuf] == OVERFLOW_SIZE) { + hvlog("buffer %d full. no more space\n", + pov->curbuf); + pov->bufferOverflow++; + pov->overflowMessage = 1; + return len - bleft; + } + + /* Turn on the "used" bit for this buffer. If it's already on, that's + * fine. + */ + set_bit(pov->curbuf, &pov->bufferUsed); + + /* + * See if this buffer has been allocated. If not, allocate it + */ + if (pov->buffer[pov->curbuf] == NULL) + pov->buffer[pov->curbuf] = + kmalloc(OVERFLOW_SIZE, GFP_ATOMIC); + + /* + * Figure out how much we can copy into this buffer + */ + if (bleft < + (OVERFLOW_SIZE - pov->bufferBytes[pov->curbuf])) + curlen = bleft; + else + curlen = + OVERFLOW_SIZE - pov->bufferBytes[pov->curbuf]; + + /* + * Copy the data into the buffer + */ + if (userFlag) + copy_from_user(pov->buffer[pov->curbuf] + + pov->bufferBytes[pov->curbuf], cbuf, + curlen); + else + memcpy(pov->buffer[pov->curbuf] + + pov->bufferBytes[pov->curbuf], cbuf, + curlen); + + pov->bufferBytes[pov->curbuf] += curlen; + cbuf += curlen; + bleft -= curlen; + + /* + * Now see if we've filled this buffer + */ + if (pov->bufferBytes[pov->curbuf] == OVERFLOW_SIZE) { + nextbuf = (pov->curbuf + 1) % NUM_BUF; + + /* + * Move to the next buffer if it hasn't been used yet + */ + if (test_bit(nextbuf, &pov->bufferUsed) == 0) { + pov->curbuf = nextbuf; + } + } + } + return len; +} + +/* Send pending data + * + * NOTE: Don't use printk in here because it gets nastily recursive. hvlog can be + * used to log to the hypervisor buffer + */ +void sendBuffers(u8 port, HvLpIndex lp) +{ + HvLpEvent_Rc hvrc; + int nextbuf; + struct viocharlpevent *viochar; + unsigned long flags; + struct overflowBuffers *pov = &overflow[port]; + + spin_lock_irqsave(&consolelock, flags); + + viochar = (struct viocharlpevent *) + vio_get_event_buffer(viomajorsubtype_chario); + + /* Make sure we got a buffer + */ + if (viochar == NULL) { + hvlog("Yikes...can't get viochar buffer"); + spin_unlock_irqrestore(&consolelock, flags); + return; + } + + if (pov->bufferUsed == 0) { + hvlog("in sendbuffers, but no buffers used\n"); + vio_free_event_buffer(viomajorsubtype_chario, viochar); + spin_unlock_irqrestore(&consolelock, flags); + return; + } + + /* + * curbuf points to the buffer we're filling. We want to start sending AFTER + * this one. + */ + nextbuf = (pov->curbuf + 1) % NUM_BUF; + + /* + * Loop until we find a buffer with the bufferUsed bit on + */ + while (test_bit(nextbuf, &pov->bufferUsed) == 0) + nextbuf = (nextbuf + 1) % NUM_BUF; + + initDataEvent(viochar, lp); + + /* + * While we have buffers with data, and our send window is open, send them + */ + while ((test_bit(nextbuf, &pov->bufferUsed)) && + ((sndMsgSeq[port] - sndMsgAck[port]) < viochar_window)) { + viochar->immediateDataLen = pov->bufferBytes[nextbuf]; + viochar->event.xCorrelationToken = sndMsgSeq[port]++; + viochar->event.xSizeMinus1 = + offsetof(struct viocharlpevent, + immediateData) + viochar->immediateDataLen; + + memcpy(viochar->immediateData, pov->buffer[nextbuf], + viochar->immediateDataLen); + + hvrc = HvCallEvent_signalLpEvent(&viochar->event); + if (hvrc) { + /* + * MUST unlock the spinlock before doing a printk + */ + vio_free_event_buffer(viomajorsubtype_chario, + viochar); + spin_unlock_irqrestore(&consolelock, flags); + + printk(KERN_WARNING_VIO + "console error sending event! return code %d\n", + (int) hvrc); + return; + } + + /* + * clear the bufferUsed bit, zero the number of bytes in this buffer, + * and move to the next buffer + */ + clear_bit(nextbuf, &pov->bufferUsed); + pov->bufferBytes[nextbuf] = 0; + nextbuf = (nextbuf + 1) % NUM_BUF; + } + + + /* + * If we have emptied all the buffers, start at 0 again. + * this will re-use any allocated buffers + */ + if (pov->bufferUsed == 0) { + pov->curbuf = 0; + + if (pov->overflowMessage) + pov->overflowMessage = 0; + + if (port_info[port].tty) { + if ((port_info[port].tty-> + flags & (1 << TTY_DO_WRITE_WAKEUP)) + && (port_info[port].tty->ldisc.write_wakeup)) + (port_info[port].tty->ldisc. + write_wakeup) (port_info[port].tty); + wake_up_interruptible(&port_info[port].tty-> + write_wait); + } + } + + vio_free_event_buffer(viomajorsubtype_chario, viochar); + spin_unlock_irqrestore(&consolelock, flags); + +} + +/* Our internal writer. Gets called both from the console device and + * the tty device. the tty pointer will be NULL if called from the console. + * + * NOTE: Don't use printk in here because it gets nastily recursive. hvlog can be + * used to log to the hypervisor buffer + */ +static int internal_write(struct tty_struct *tty, const char *buf, + size_t len, int userFlag) +{ + HvLpEvent_Rc hvrc; + size_t bleft = len; + size_t curlen; + const char *curbuf = buf; + struct viocharlpevent *viochar; + unsigned long flags; + struct port_info_tag *pi = NULL; + HvLpIndex lp; + u8 port; + + if (tty) { + pi = (struct port_info_tag *) tty->driver_data; + + if (!pi + || viotty_paranoia_check(pi, tty->device, + "viotty_internal_write")) + return -ENODEV; + + lp = pi->lp; + port = pi->port; + } else { + /* If this is the console device, use the lp from the first port entry + */ + port = 0; + lp = port_info[0].lp; + } + + /* Always put console output in the hypervisor console log + */ + if (port == 0) + HvCall_writeLogBuffer(buf, len); + + /* If the path to this LP is closed, don't bother doing anything more. + * just dump the data on the floor + */ + if (!viopath_isactive(lp)) + return len; + + /* + * If there is already data queued for this port, send it + */ + if (overflow[port].bufferUsed) + sendBuffers(port, lp); + + spin_lock_irqsave(&consolelock, flags); + + viochar = (struct viocharlpevent *) + vio_get_event_buffer(viomajorsubtype_chario); + /* Make sure we got a buffer + */ + if (viochar == NULL) { + hvlog("Yikes...can't get viochar buffer"); + spin_unlock_irqrestore(&consolelock, flags); + return -1; + } + + initDataEvent(viochar, lp); + + /* Got the lock, don't cause console output */ + while ((bleft > 0) && + (overflow[port].bufferUsed == 0) && + ((sndMsgSeq[port] - sndMsgAck[port]) < viochar_window)) { + if (bleft > VIOCHAR_MAX_DATA) + curlen = VIOCHAR_MAX_DATA; + else + curlen = bleft; + + viochar->immediateDataLen = curlen; + viochar->event.xCorrelationToken = sndMsgSeq[port]++; + + if (userFlag) + copy_from_user(viochar->immediateData, curbuf, + curlen); + else + memcpy(viochar->immediateData, curbuf, curlen); + + viochar->event.xSizeMinus1 = + offsetof(struct viocharlpevent, + immediateData) + curlen; + + hvrc = HvCallEvent_signalLpEvent(&viochar->event); + if (hvrc) { + /* + * MUST unlock the spinlock before doing a printk + */ + vio_free_event_buffer(viomajorsubtype_chario, + viochar); + spin_unlock_irqrestore(&consolelock, flags); + + hvlog("viocons: error sending event! %d\n", + (int) hvrc); + return len - bleft; + } + + curbuf += curlen; + bleft -= curlen; + } + + /* + * If we didn't send it all, buffer it + */ + if (bleft > 0) { + bleft -= bufferAdd(port, curbuf, bleft, userFlag); + } + vio_free_event_buffer(viomajorsubtype_chario, viochar); + spin_unlock_irqrestore(&consolelock, flags); + + return len - bleft; +} + +/* Initialize the common fields in a charLpEvent + */ +static void initDataEvent(struct viocharlpevent *viochar, HvLpIndex lp) +{ + memset(viochar, 0x00, sizeof(struct viocharlpevent)); + + viochar->event.xFlags.xValid = 1; + viochar->event.xFlags.xFunction = HvLpEvent_Function_Int; + viochar->event.xFlags.xAckInd = HvLpEvent_AckInd_NoAck; + viochar->event.xFlags.xAckType = HvLpEvent_AckType_DeferredAck; + viochar->event.xType = HvLpEvent_Type_VirtualIo; + viochar->event.xSubtype = viomajorsubtype_chario | viochardata; + viochar->event.xSourceLp = HvLpConfig_getLpIndex(); + viochar->event.xTargetLp = lp; + viochar->event.xSizeMinus1 = sizeof(struct viocharlpevent); + viochar->event.xSourceInstanceId = viopath_sourceinst(lp); + viochar->event.xTargetInstanceId = viopath_targetinst(lp); +} + + +/* console device write + */ +static void viocons_write(struct console *co, const char *s, + unsigned count) +{ + /* This parser will ensure that all single instances of either \n or \r are + * matched into carriage return/line feed combinations. It also allows for + * instances where there already exist \n\r combinations as well as the + * reverse, \r\n combinations. + */ + + int index; + char charptr[1]; + int foundcr; + int slicebegin; + int sliceend; + + foundcr = 0; + slicebegin = 0; + sliceend = 0; + + for (index = 0; index < count; index++) { + if (!foundcr && s[index] == 0x0a) { + if ((slicebegin - sliceend > 0) + && sliceend < count) { + internal_write(NULL, &s[slicebegin], + sliceend - slicebegin, 0); + slicebegin = sliceend; + } + charptr[0] = '\r'; + internal_write(NULL, charptr, 1, 0); + } + if (foundcr && s[index] != 0x0a) { + if ((index - 2) >= 0) { + if (s[index - 2] != 0x0a) { + internal_write(NULL, + &s[slicebegin], + sliceend - + slicebegin, 0); + slicebegin = sliceend; + charptr[0] = '\n'; + internal_write(NULL, charptr, 1, + 0); + } + } + } + sliceend++; + + if (s[index] == 0x0d) + foundcr = 1; + else + foundcr = 0; + } + + internal_write(NULL, &s[slicebegin], sliceend - slicebegin, 0); + + if (count > 1) { + if (foundcr == 1 && s[count - 1] != 0x0a) { + charptr[0] = '\n'; + internal_write(NULL, charptr, 1, 0); + } else if (s[count - 1] == 0x0a && s[count - 2] != 0x0d) { + + charptr[0] = '\r'; + internal_write(NULL, charptr, 1, 0); + } + } +} + +/* Work out a the device associate with this console + */ +static kdev_t viocons_device(struct console *c) +{ + return MKDEV(TTY_MAJOR, c->index + viotty_driver.minor_start); +} + +/* console device read method + */ +static int viocons_read(struct console *co, const char *s, unsigned count) +{ + printk(KERN_DEBUG_VIO "viocons_read\n"); + // Implement me + interruptible_sleep_on(&viocons_wait_queue); + return 0; +} + +/* Do console device setup + */ +static int __init viocons_setup(struct console *co, char *options) +{ + return 0; +} + +/* console device I/O methods + */ +static struct console viocons = { + name:"ttyS", + write:viocons_write, + read:viocons_read, + device:viocons_device, + setup:viocons_setup, + flags:CON_PRINTBUFFER, +}; + + +/* TTY Open method + */ +static int viotty_open(struct tty_struct *tty, struct file *filp) +{ + int port; + unsigned long flags; + MOD_INC_USE_COUNT; + port = MINOR(tty->device) - tty->driver.minor_start; + + if (port >= VIOTTY_SERIAL_START) + port -= VIOTTY_SERIAL_START; + + if ((port < 0) || (port >= VTTY_PORTS)) { + MOD_DEC_USE_COUNT; + return -ENODEV; + } + + spin_lock_irqsave(&consolelock, flags); + + /* + * If some other TTY is already connected here, reject the open + */ + if ((port_info[port].tty) && (port_info[port].tty != tty)) { + spin_unlock_irqrestore(&consolelock, flags); + MOD_DEC_USE_COUNT; + printk(KERN_WARNING_VIO + "console attempt to open device twice from different ttys\n"); + return -EBUSY; + } + tty->driver_data = &port_info[port]; + port_info[port].tty = tty; + spin_unlock_irqrestore(&consolelock, flags); + + return 0; +} + +/* TTY Close method + */ +static void viotty_close(struct tty_struct *tty, struct file *filp) +{ + unsigned long flags; + struct port_info_tag *pi = + (struct port_info_tag *) tty->driver_data; + + if (!pi || viotty_paranoia_check(pi, tty->device, "viotty_close")) + return; + + spin_lock_irqsave(&consolelock, flags); + if (tty->count == 1) { + pi->tty = NULL; + } + + spin_unlock_irqrestore(&consolelock, flags); + + MOD_DEC_USE_COUNT; +} + +/* TTY Write method + */ +static int viotty_write(struct tty_struct *tty, int from_user, + const unsigned char *buf, int count) +{ + return internal_write(tty, buf, count, from_user); +} + +/* TTY put_char method + */ +static void viotty_put_char(struct tty_struct *tty, unsigned char ch) +{ + internal_write(tty, &ch, 1, 0); +} + +/* TTY flush_chars method + */ +static void viotty_flush_chars(struct tty_struct *tty) +{ +} + +/* TTY write_room method + */ +static int viotty_write_room(struct tty_struct *tty) +{ + int i; + int room = 0; + struct port_info_tag *pi = + (struct port_info_tag *) tty->driver_data; + + if (!pi + || viotty_paranoia_check(pi, tty->device, + "viotty_sendbuffers")) + return 0; + + // If no buffers are used, return the max size + if (overflow[pi->port].bufferUsed == 0) + return VIOCHAR_MAX_DATA * NUM_BUF; + + for (i = 0; ((i < NUM_BUF) && (room < VIOCHAR_MAX_DATA)); i++) { + room += + (OVERFLOW_SIZE - overflow[pi->port].bufferBytes[i]); + } + + if (room > VIOCHAR_MAX_DATA) + return VIOCHAR_MAX_DATA; + else + return room; +} + +/* TTY chars_in_buffer_room method + */ +static int viotty_chars_in_buffer(struct tty_struct *tty) +{ + return 0; +} + +static void viotty_flush_buffer(struct tty_struct *tty) +{ +} + +static int viotty_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + /* the ioctls below read/set the flags usually shown in the leds */ + /* don't use them - they will go away without warning */ + case KDGETLED: + case KDGKBLED: + return put_user(0, (char *) arg); + + case KDSKBLED: + return 0; + } + + return n_tty_ioctl(tty, file, cmd, arg); +} + +static void viotty_throttle(struct tty_struct *tty) +{ +} + +static void viotty_unthrottle(struct tty_struct *tty) +{ +} + +static void viotty_set_termios(struct tty_struct *tty, + struct termios *old_termios) +{ +} + +static void viotty_stop(struct tty_struct *tty) +{ +} + +static void viotty_start(struct tty_struct *tty) +{ +} + +static void viotty_hangup(struct tty_struct *tty) +{ +} + +static void viotty_break(struct tty_struct *tty, int break_state) +{ +} + +static void viotty_send_xchar(struct tty_struct *tty, char ch) +{ +} + +static void viotty_wait_until_sent(struct tty_struct *tty, int timeout) +{ +} + +/* Handle an open charLpEvent. Could be either interrupt or ack + */ +static void vioHandleOpenEvent(struct HvLpEvent *event) +{ + unsigned long flags; + u8 eventRc; + u16 eventSubtypeRc; + struct viocharlpevent *cevent = (struct viocharlpevent *) event; + u8 port = cevent->virtualDevice; + + if (event->xFlags.xFunction == HvLpEvent_Function_Ack) { + if (port >= VTTY_PORTS) + return; + + spin_lock_irqsave(&consolelock, flags); + /* Got the lock, don't cause console output */ + + if (event->xRc == HvLpEvent_Rc_Good) { + sndMsgSeq[port] = sndMsgAck[port] = 0; + } + + port_info[port].lp = event->xTargetLp; + + spin_unlock_irqrestore(&consolelock, flags); + + if (event->xCorrelationToken != 0) { + unsigned long semptr = event->xCorrelationToken; + up((struct semaphore *) semptr); + } else + printk(KERN_WARNING_VIO + "console: wierd...got open ack without semaphore\n"); + } else { + /* This had better require an ack, otherwise complain + */ + if (event->xFlags.xAckInd != HvLpEvent_AckInd_DoAck) { + printk(KERN_WARNING_VIO + "console: viocharopen without ack bit!\n"); + return; + } + + spin_lock_irqsave(&consolelock, flags); + /* Got the lock, don't cause console output */ + + /* Make sure this is a good virtual tty */ + if (port >= VTTY_PORTS) { + eventRc = HvLpEvent_Rc_SubtypeError; + eventSubtypeRc = viorc_openRejected; + } + + /* If this is tty is already connected to a different + partition, fail */ + else if ((port_info[port].lp != HvLpIndexInvalid) && + (port_info[port].lp != event->xSourceLp)) { + eventRc = HvLpEvent_Rc_SubtypeError; + eventSubtypeRc = viorc_openRejected; + } else { + port_info[port].lp = event->xSourceLp; + eventRc = HvLpEvent_Rc_Good; + eventSubtypeRc = viorc_good; + sndMsgSeq[port] = sndMsgAck[port] = 0; + } + + spin_unlock_irqrestore(&consolelock, flags); + + /* Return the acknowledgement */ + HvCallEvent_ackLpEvent(event); + } +} + +/* Handle a close open charLpEvent. Could be either interrupt or ack + */ +static void vioHandleCloseEvent(struct HvLpEvent *event) +{ + unsigned long flags; + struct viocharlpevent *cevent = (struct viocharlpevent *) event; + u8 port = cevent->virtualDevice; + + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + if (port >= VTTY_PORTS) + return; + + /* For closes, just mark the console partition invalid */ + spin_lock_irqsave(&consolelock, flags); + /* Got the lock, don't cause console output */ + + if (port_info[port].lp == event->xSourceLp) + port_info[port].lp = HvLpIndexInvalid; + + spin_unlock_irqrestore(&consolelock, flags); + printk(KERN_INFO_VIO + "console close from %d\n", event->xSourceLp); + } else { + printk(KERN_WARNING_VIO + "console got unexpected close acknowlegement\n"); + } +} + +/* Handle a config charLpEvent. Could be either interrupt or ack + */ +static void vioHandleConfig(struct HvLpEvent *event) +{ + struct viocharlpevent *cevent = (struct viocharlpevent *) event; + int len; + + len = cevent->immediateDataLen; + HvCall_writeLogBuffer(cevent->immediateData, + cevent->immediateDataLen); + + if (cevent->immediateData[0] == 0x01) { + printk(KERN_INFO_VIO + "console window resized to %d: %d: %d: %d\n", + cevent->immediateData[1], + cevent->immediateData[2], + cevent->immediateData[3], cevent->immediateData[4]); + } else { + printk(KERN_WARNING_VIO "console unknown config event\n"); + } + return; +} + +/* Handle a data charLpEvent. + */ +static void vioHandleData(struct HvLpEvent *event) +{ + struct tty_struct *tty; + struct viocharlpevent *cevent = (struct viocharlpevent *) event; + struct port_info_tag *pi; + int len; + u8 port = cevent->virtualDevice; + + if (port >= VTTY_PORTS) { + printk(KERN_WARNING_VIO + "console data on invalid virtual device %d\n", + port); + return; + } + + tty = port_info[port].tty; + + if (tty == NULL) { + printk(KERN_WARNING_VIO + "no tty for virtual device %d\n", port); + return; + } + + if (tty->magic != TTY_MAGIC) { + printk(KERN_WARNING_VIO "tty bad magic\n"); + return; + } + + /* + * Just to be paranoid, make sure the tty points back to this port + */ + pi = (struct port_info_tag *) tty->driver_data; + + if (!pi || viotty_paranoia_check(pi, tty->device, "vioHandleData")) + return; + + len = cevent->immediateDataLen; + + if (len == 0) + return; + + /* + * Log port 0 data to the hypervisor log + */ + if (port == 0) + HvCall_writeLogBuffer(cevent->immediateData, + cevent->immediateDataLen); + + /* Don't copy more bytes than there is room for in the buffer */ + if (tty->flip.count + len > TTY_FLIPBUF_SIZE) { + len = TTY_FLIPBUF_SIZE - tty->flip.count; + printk(KERN_WARNING_VIO + "console input buffer overflow!\n"); + } + + memcpy(tty->flip.char_buf_ptr, cevent->immediateData, len); + memset(tty->flip.flag_buf_ptr, TTY_NORMAL, len); + + /* Update the kernel buffer end */ + tty->flip.count += len; + tty->flip.char_buf_ptr += len; + + tty->flip.flag_buf_ptr += len; + + tty_flip_buffer_push(tty); +} + +/* Handle an ack charLpEvent. + */ +static void vioHandleAck(struct HvLpEvent *event) +{ + struct viocharlpevent *cevent = (struct viocharlpevent *) event; + unsigned long flags; + u8 port = cevent->virtualDevice; + + if (port >= VTTY_PORTS) { + printk(KERN_WARNING_VIO + "viocons: data on invalid virtual device\n"); + return; + } + + spin_lock_irqsave(&consolelock, flags); + sndMsgAck[port] = event->xCorrelationToken; + spin_unlock_irqrestore(&consolelock, flags); + + if (overflow[port].bufferUsed) + sendBuffers(port, port_info[port].lp); +} + +/* Handle charLpEvents and route to the appropriate routine + */ +static void vioHandleCharEvent(struct HvLpEvent *event) +{ + int charminor; + + if (event == NULL) { + return; + } + charminor = event->xSubtype & VIOMINOR_SUBTYPE_MASK; + switch (charminor) { + case viocharopen: + vioHandleOpenEvent(event); + break; + case viocharclose: + vioHandleCloseEvent(event); + break; + case viochardata: + vioHandleData(event); + break; + case viocharack: + vioHandleAck(event); + break; + case viocharconfig: + vioHandleConfig(event); + break; + default: + if ((event->xFlags.xFunction == HvLpEvent_Function_Int) && + (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck)) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + } +} + +/* Send an open event + */ +static int viocons_sendOpen(HvLpIndex remoteLp, u8 port, void *sem) +{ + return HvCallEvent_signalLpEventFast(remoteLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_chario + | viocharopen, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (remoteLp), + viopath_targetinst + (remoteLp), + (u64) (unsigned long) + sem, VIOVERSION << 16, + ((u64) port << 48), 0, 0, 0); + +} + +int __init viocons_init2(void) +{ + DECLARE_MUTEX_LOCKED(Semaphore); + int rc; + + /* + * Now open to the primary LP + */ + printk(KERN_INFO_VIO "console open path to primary\n"); + rc = viopath_open(HvLpConfig_getPrimaryLpIndex(), viomajorsubtype_chario, viochar_window + 2); /* +2 for fudge */ + if (rc) { + printk(KERN_WARNING_VIO + "console error opening to primary %d\n", rc); + } + + if (viopath_hostLp == HvLpIndexInvalid) { + vio_set_hostlp(); + } + + /* + * And if the primary is not the same as the hosting LP, open to the + * hosting lp + */ + if ((viopath_hostLp != HvLpIndexInvalid) && + (viopath_hostLp != HvLpConfig_getPrimaryLpIndex())) { + printk(KERN_INFO_VIO + "console open path to hosting (%d)\n", + viopath_hostLp); + rc = viopath_open(viopath_hostLp, viomajorsubtype_chario, viochar_window + 2); /* +2 for fudge */ + if (rc) { + printk(KERN_WARNING_VIO + "console error opening to partition %d: %d\n", + viopath_hostLp, rc); + } + } + + if (vio_setHandler(viomajorsubtype_chario, vioHandleCharEvent) < 0) { + printk(KERN_WARNING_VIO + "Error seting handler for console events!\n"); + } + + printk(KERN_INFO_VIO "console major number is %d\n", TTY_MAJOR); + + /* First, try to open the console to the hosting lp. + * Wait on a semaphore for the response. + */ + if ((viopath_isactive(viopath_hostLp)) && + (viocons_sendOpen(viopath_hostLp, 0, &Semaphore) == 0)) { + printk(KERN_INFO_VIO + "opening console to hosting partition %d\n", + viopath_hostLp); + down(&Semaphore); + } + + /* + * If we don't have an active console, try the primary + */ + if ((!viopath_isactive(port_info[0].lp)) && + (viopath_isactive(HvLpConfig_getPrimaryLpIndex())) && + (viocons_sendOpen + (HvLpConfig_getPrimaryLpIndex(), 0, &Semaphore) == 0)) { + printk(KERN_INFO_VIO + "opening console to primary partition\n"); + down(&Semaphore); + } + + /* Initialize the tty_driver structure */ + memset(&viotty_driver, 0, sizeof(struct tty_driver)); + viotty_driver.magic = TTY_DRIVER_MAGIC; + viotty_driver.driver_name = "vioconsole"; +#if defined(CONFIG_DEVFS_FS) + viotty_driver.name = "tty%d"; +#else + viotty_driver.name = "tty"; +#endif + viotty_driver.major = TTY_MAJOR; + viotty_driver.minor_start = 1; + viotty_driver.name_base = 1; + viotty_driver.num = VTTY_PORTS; + viotty_driver.type = TTY_DRIVER_TYPE_CONSOLE; + viotty_driver.subtype = 1; + viotty_driver.init_termios = tty_std_termios; + viotty_driver.flags = + TTY_DRIVER_REAL_RAW | TTY_DRIVER_RESET_TERMIOS; + viotty_driver.refcount = &viotty_refcount; + viotty_driver.table = viotty_table; + viotty_driver.termios = viotty_termios; + viotty_driver.termios_locked = viotty_termios_locked; + + viotty_driver.open = viotty_open; + viotty_driver.close = viotty_close; + viotty_driver.write = viotty_write; + viotty_driver.put_char = viotty_put_char; + viotty_driver.flush_chars = viotty_flush_chars; + viotty_driver.write_room = viotty_write_room; + viotty_driver.chars_in_buffer = viotty_chars_in_buffer; + viotty_driver.flush_buffer = viotty_flush_buffer; + viotty_driver.ioctl = viotty_ioctl; + viotty_driver.throttle = viotty_throttle; + viotty_driver.unthrottle = viotty_unthrottle; + viotty_driver.set_termios = viotty_set_termios; + viotty_driver.stop = viotty_stop; + viotty_driver.start = viotty_start; + viotty_driver.hangup = viotty_hangup; + viotty_driver.break_ctl = viotty_break; + viotty_driver.send_xchar = viotty_send_xchar; + viotty_driver.wait_until_sent = viotty_wait_until_sent; + + viottyS_driver = viotty_driver; +#if defined(CONFIG_DEVFS_FS) + viottyS_driver.name = "ttyS%d"; +#else + viottyS_driver.name = "ttyS"; +#endif + viottyS_driver.major = TTY_MAJOR; + viottyS_driver.minor_start = VIOTTY_SERIAL_START; + viottyS_driver.type = TTY_DRIVER_TYPE_SERIAL; + viottyS_driver.table = viottyS_table; + viottyS_driver.termios = viottyS_termios; + viottyS_driver.termios_locked = viottyS_termios_locked; + + if (tty_register_driver(&viotty_driver)) { + printk(KERN_WARNING_VIO + "Couldn't register console driver\n"); + } + + if (tty_register_driver(&viottyS_driver)) { + printk(KERN_WARNING_VIO + "Couldn't register console S driver\n"); + } + /* Now create the vcs and vcsa devfs entries so mingetty works */ +#if defined(CONFIG_DEVFS_FS) + { + struct tty_driver temp_driver = viotty_driver; + int i; + + temp_driver.name = "vcs%d"; + for (i = 0; i < VTTY_PORTS; i++) + tty_register_devfs(&temp_driver, + 0, i + temp_driver.minor_start); + + temp_driver.name = "vcsa%d"; + for (i = 0; i < VTTY_PORTS; i++) + tty_register_devfs(&temp_driver, + 0, i + temp_driver.minor_start); + + // For compatibility with some earlier code only! + // This will go away!!! + temp_driver.name = "viocons/%d"; + temp_driver.name_base = 0; + for (i = 0; i < VTTY_PORTS; i++) + tty_register_devfs(&temp_driver, + 0, i + temp_driver.minor_start); + } +#endif + + /* + * Create the proc entry + */ + iSeries_proc_callback(&viocons_proc_init); + + return 0; +} + +void __init viocons_init(void) +{ + int i; + printk(KERN_INFO_VIO "registering console\n"); + + memset(&port_info, 0x00, sizeof(port_info)); + for (i = 0; i < VTTY_PORTS; i++) { + sndMsgSeq[i] = sndMsgAck[i] = 0; + port_info[i].port = i; + port_info[i].lp = HvLpIndexInvalid; + port_info[i].magic = VIOTTY_MAGIC; + } + + register_console(&viocons); + memset(overflow, 0x00, sizeof(overflow)); + debug = 0; + + HvCall_setLogBufferFormatAndCodepage(HvCall_LogBuffer_ASCII, 437); +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/viodasd.c linuxppc64_2_4/drivers/iseries/viodasd.c --- ../kernel.org/linux-2.4.19/drivers/iseries/viodasd.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/viodasd.c Wed Apr 3 12:27:16 2002 @@ -0,0 +1,1623 @@ +/* -*- linux-c -*- + * viodasd.c + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + *************************************************************************** + * This routine provides access to disk space (termed "DASD" in historical + * IBM terms) owned and managed by an OS/400 partition running on the + * same box as this Linux partition. + * + * All disk operations are performed by sending messages back and forth to + * the OS/400 partition. + * + * This device driver can either use its own major number, or it can + * pretend to be an IDE drive (grep 'IDE[0-9]_MAJOR' ../../include/linux/major.h). + * This is controlled with a CONFIG option. You can either call this an + * elegant solution to the fact that a lot of software doesn't recognize + * a new disk major number...or you can call this a really ugly hack. + * Your choice. + */ + +#include +#include + +/* Changelog: + 2001-11-27 devilbis Added first pass at complete IDE emulation + */ + +/* Decide if we are using our own major or pretending to be an IDE drive + * + * If we are using our own major, we only support 7 partitions per physical + * disk....so with minor numbers 0-255 we get a maximum of 32 disks. If we + * are emulating IDE, we get 63 partitions per disk, with a maximum of 4 + * disks per major, but common practice is to place only 2 devices in /dev + * for each IDE major, for a total of 20 (since there are 10 IDE majors). + */ + +#ifdef CONFIG_VIODASD_IDE +static const int major_table[] = { + IDE0_MAJOR, + IDE1_MAJOR, + IDE2_MAJOR, + IDE3_MAJOR, + IDE4_MAJOR, + IDE5_MAJOR, + IDE6_MAJOR, + IDE7_MAJOR, + IDE8_MAJOR, + IDE9_MAJOR, +}; +enum { + DEV_PER_MAJOR = 2, + PARTITION_SHIFT = 6, +}; +static int major_to_index(int major) +{ + switch(major) { + case IDE0_MAJOR: return 0; + case IDE1_MAJOR: return 1; + case IDE2_MAJOR: return 2; + case IDE3_MAJOR: return 3; + case IDE4_MAJOR: return 4; + case IDE5_MAJOR: return 5; + case IDE6_MAJOR: return 6; + case IDE7_MAJOR: return 7; + case IDE8_MAJOR: return 8; + case IDE9_MAJOR: return 9; + default: + return -1; + } +} +#define do_viodasd_request do_hd_request +#define VIOD_DEVICE_NAME "hd" +#define VIOD_GENHD_NAME "hd" +#else /* !CONFIG_VIODASD_IDE */ +static const int major_table[] = { + VIODASD_MAJOR, +}; +enum { + DEV_PER_MAJOR = 32, + PARTITION_SHIFT = 3, +}; +static int major_to_index(int major) +{ + if(major != VIODASD_MAJOR) + return -1; + return 0; +} +#define VIOD_DEVICE_NAME "viod" +#ifdef CONFIG_DEVFS_FS +#define VIOD_GENHD_NAME "viod" +#else +#define VIOD_GENHD_NAME "iSeries/vd" +#endif +#endif /* CONFIG_VIODASD_IDE */ + +#define DEVICE_NR(dev) (devt_to_diskno(dev)) +#define LOCAL_END_REQUEST + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include "vio.h" +#include + +MODULE_DESCRIPTION("iSeries Virtual DASD"); +MODULE_AUTHOR("Dave Boutcher"); +MODULE_LICENSE("GPL"); + +#define VIODASD_VERS "1.50" + +enum { + NUM_MAJORS = sizeof(major_table) / sizeof(major_table[0]), + MAX_DISKNO = DEV_PER_MAJOR * NUM_MAJORS, + MAX_MAJOR_NAME = 4 + 1, /* maximum length of a gendisk->name */ +}; + +static volatile int viodasd_max_disk = MAX_DISKNO - 1; + +static int diskno_to_major(int diskno) +{ + if (diskno >= MAX_DISKNO) + return -1; + return major_table[diskno / DEV_PER_MAJOR]; +} +static int devt_to_diskno(kdev_t dev) +{ + return major_to_index(MAJOR(dev)) * DEV_PER_MAJOR + + (MINOR(dev) >> PARTITION_SHIFT); +} +static int diskno_to_devt(int diskno, int partition) +{ + return MKDEV(diskno_to_major(diskno), + ((diskno % DEV_PER_MAJOR) << PARTITION_SHIFT) + + partition); +} + +#define VIOMAXREQ 16 +#define VIOMAXBLOCKDMA 12 + +extern struct pci_dev *iSeries_vio_dev; + +struct openData { + u64 mDiskLen; + u16 mMaxDisks; + u16 mCylinders; + u16 mTracks; + u16 mSectors; + u16 mBytesPerSector; +}; + +struct rwData { // Used during rw + u64 mOffset; + struct { + u32 mToken; + u32 reserved; + u64 mLen; + } dmaInfo[VIOMAXBLOCKDMA]; +}; + +struct vioblocklpevent { + struct HvLpEvent event; + u32 mReserved1; + u16 mVersion; + u16 mSubTypeRc; + u16 mDisk; + u16 mFlags; + union { + struct openData openData; + struct rwData rwData; + struct { + u64 changed; + } check; + } u; +}; + +#define vioblockflags_ro 0x0001 + +enum vioblocksubtype { + vioblockopen = 0x0001, + vioblockclose = 0x0002, + vioblockread = 0x0003, + vioblockwrite = 0x0004, + vioblockflush = 0x0005, + vioblockcheck = 0x0007 +}; + +/* In a perfect world we will perform better if we get page-aligned I/O + * requests, in multiples of pages. At least peg our block size to the + * actual page size. + */ +static int blksize = HVPAGESIZE; /* in bytes */ + +static DECLARE_WAIT_QUEUE_HEAD(viodasd_wait); +struct viodasd_waitevent { + struct semaphore *sem; + int rc; + union { + int changed; /* Used only for check_change */ + u16 subRC; + } data; +}; + +static const struct vio_error_entry viodasd_err_table[] = { + {0x0201, EINVAL, "Invalid Range"}, + {0x0202, EINVAL, "Invalid Token"}, + {0x0203, EIO, "DMA Error"}, + {0x0204, EIO, "Use Error"}, + {0x0205, EIO, "Release Error"}, + {0x0206, EINVAL, "Invalid Disk"}, + {0x0207, EBUSY, "Cant Lock"}, + {0x0208, EIO, "Already Locked"}, + {0x0209, EIO, "Already Unlocked"}, + {0x020A, EIO, "Invalid Arg"}, + {0x020B, EIO, "Bad IFS File"}, + {0x020C, EROFS, "Read Only Device"}, + {0x02FF, EIO, "Internal Error"}, + {0x0000, 0, NULL}, +}; + +/* Our gendisk table + */ +static struct gendisk viodasd_gendisk[NUM_MAJORS]; + +static struct gendisk *major_to_gendisk(int major) +{ + int index = major_to_index(major); + return index < 0 ? NULL : &viodasd_gendisk[index]; +} +static struct hd_struct *devt_to_partition(kdev_t dev) +{ + return &major_to_gendisk(MAJOR(dev))->part[MINOR(dev)]; +} + +/* Figure out the biggest I/O request (in sectors) we can accept + */ +#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA) + +/* Keep some statistics on what's happening for the PROC file system + */ +static struct { + long tot; + long nobh; + long ntce[VIOMAXBLOCKDMA]; +} viod_stats[MAX_DISKNO][2]; + +/* Number of disk I/O requests we've sent to OS/400 + */ +static int num_req_outstanding; + +/* This is our internal structure for keeping track of disk devices + */ +struct viodasd_device { + int useCount; + u16 cylinders; + u16 tracks; + u16 sectors; + u16 bytesPerSector; + u64 size; + int readOnly; +} *viodasd_devices; + +/* When we get a disk I/O request we take it off the general request queue + * and put it here. + */ +static LIST_HEAD(reqlist); + +/* Handle reads from the proc file system + */ +static int proc_read(char *buf, char **start, off_t offset, + int blen, int *eof, void *data) +{ + int len = 0; + int i; + int j; + +#if defined(MODULE) + len += + sprintf(buf + len, + "viod Module opened %d times. Major number %d\n", + MOD_IN_USE, major_table[0]); +#endif + len += + sprintf(buf + len, "viod %d possible devices\n", MAX_DISKNO); + + for (i = 0; i < 16; i++) { + if (viod_stats[i][0].tot || viod_stats[i][1].tot) { + len += + sprintf(buf + len, + "DISK %2.2d: rd %-10.10ld wr %-10.10ld (no buffer list rd %-10.10ld wr %-10.10ld\n", + i, viod_stats[i][0].tot, + viod_stats[i][1].tot, + viod_stats[i][0].nobh, + viod_stats[i][1].nobh); + + len += sprintf(buf + len, "rd DMA: "); + + for (j = 0; j < VIOMAXBLOCKDMA; j++) + len += sprintf(buf + len, " [%2.2d] %ld", + j, + viod_stats[i][0].ntce[j]); + + len += sprintf(buf + len, "\nwr DMA: "); + + for (j = 0; j < VIOMAXBLOCKDMA; j++) + len += sprintf(buf + len, " [%2.2d] %ld", + j, + viod_stats[i][1].ntce[j]); + len += sprintf(buf + len, "\n"); + } + } + + *eof = 1; + return len; +} + +/* Handle writes to our proc file system + */ +static int proc_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + return count; +} + +/* setup our proc file system entries + */ +void viodasd_proc_init(struct proc_dir_entry *iSeries_proc) +{ + struct proc_dir_entry *ent; + ent = + create_proc_entry("viodasd", S_IFREG | S_IRUSR, iSeries_proc); + if (!ent) + return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_read; + ent->write_proc = proc_write; +} + +/* clean up our proc file system entries + */ +void viodasd_proc_delete(struct proc_dir_entry *iSeries_proc) +{ + remove_proc_entry("viodasd", iSeries_proc); +} + +/* End a request + */ +static void viodasd_end_request(struct request *req, int uptodate) +{ + if (end_that_request_first(req, uptodate, VIOD_DEVICE_NAME)) + return; + + end_that_request_last(req); +} + +/* This rebuilds the partition information for a single disk device + */ +static int viodasd_revalidate(kdev_t dev) +{ + int i; + int device_no = DEVICE_NR(dev); + int dev_within_major = device_no % DEV_PER_MAJOR; + int part0 = (dev_within_major << PARTITION_SHIFT); + int npart = (1 << PARTITION_SHIFT); + int major = MAJOR(dev); + struct gendisk *gendisk = major_to_gendisk(major); + + if (viodasd_devices[device_no].size == 0) + return 0; + + for (i = npart - 1; i >= 0; i--) { + int minor = part0 + i; + struct hd_struct *partition = &gendisk->part[minor]; + + if (partition->nr_sects != 0) { + kdev_t devp = MKDEV(major, minor); + struct super_block *sb; + fsync_dev(devp); + + sb = get_super(devp); + if (sb) + invalidate_inodes(sb); + + invalidate_buffers(devp); + } + + partition->start_sect = 0; + partition->nr_sects = 0; + } + + grok_partitions(gendisk, dev_within_major, npart, + viodasd_devices[device_no].size >> 9); + + return 0; +} + + +static u16 access_flags(mode_t mode) +{ + u16 flags = 0; + if (!(mode & FMODE_WRITE)) + flags |= vioblockflags_ro; + return flags; +} + +/* This is the actual open code. It gets called from the external + * open entry point, as well as from the init code when we're figuring + * out what disks we have + */ +static int internal_open(int device_no, u16 flags) +{ + int i; + const int dev_within_major = device_no % DEV_PER_MAJOR; + struct gendisk *gendisk = + major_to_gendisk(diskno_to_major(device_no)); + HvLpEvent_Rc hvrc; + /* This semaphore is raised in the interrupt handler */ + DECLARE_MUTEX_LOCKED(Semaphore); + struct viodasd_waitevent we = { sem:&Semaphore }; + + /* Check that we are dealing with a valid hosting partition */ + if (viopath_hostLp == HvLpIndexInvalid) { + printk(KERN_WARNING_VIO "Invalid hosting partition\n"); + return -EIO; + } + + /* Send the open event to OS/400 */ + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_blockio | + vioblockopen, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + ((u64) device_no << 48) | + ((u64) flags << 32), 0, 0, 0); + + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEvent %d\n", + (int) hvrc); + return -EIO; + } + + /* Wait for the interrupt handler to get the response */ + down(&Semaphore); + + /* Check the return code */ + if (we.rc != 0) { + const struct vio_error_entry *err = + vio_lookup_rc(viodasd_err_table, we.data.subRC); + printk(KERN_WARNING_VIO + "bad rc opening disk: %d:0x%04x (%s)\n", + (int) we.rc, we.data.subRC, err->msg); + return -err->errno; + } + + /* If this is the first open of this device, update the device information */ + /* If this is NOT the first open, assume that it isn't changing */ + if (viodasd_devices[device_no].useCount == 0) { + if (viodasd_devices[device_no].size > 0) { + /* divide by 512 */ + u64 tmpint = viodasd_devices[device_no].size >> 9; + gendisk->part[dev_within_major << PARTITION_SHIFT].nr_sects = tmpint; + /* Now the value divided by 1024 */ + tmpint = tmpint >> 1; + gendisk->sizes[dev_within_major << PARTITION_SHIFT] = tmpint; + + for (i = dev_within_major << PARTITION_SHIFT; + i < ((dev_within_major + 1) << PARTITION_SHIFT); + i++) + { + hardsect_size[diskno_to_major(device_no)][i] = + viodasd_devices[device_no].bytesPerSector; + } + } + } else { + /* If the size of the device changed, weird things are happening! */ + if (gendisk->sizes[dev_within_major << PARTITION_SHIFT] != + viodasd_devices[device_no].size >> 10) { + printk(KERN_WARNING_VIO + "disk size change (%dK to %dK) for device %d\n", + gendisk->sizes[dev_within_major << PARTITION_SHIFT], + (int) viodasd_devices[device_no].size >> 10, device_no); + } + } + + /* Bump the use count */ + viodasd_devices[device_no].useCount++; + return 0; +} + +/* This is the actual release code. It gets called from the external + * release entry point, as well as from the init code when we're figuring + * out what disks we have + */ +static int internal_release(int device_no, u16 flags) +{ + /* Send the event to OS/400. We DON'T expect a response */ + HvLpEvent_Rc hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_blockio + | vioblockclose, + HvLpEvent_AckInd_NoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + 0, + VIOVERSION << 16, + ((u64) device_no + << 48) | ((u64) + flags + << + 32), + 0, 0, 0); + + viodasd_devices[device_no].useCount--; + + if (hvrc != 0) { + printk(KERN_WARNING_VIO + "bad rc sending event to OS/400 %d\n", (int) hvrc); + return -EIO; + } + return 0; +} + + +static void internal_register_new_disk(int diskno); + +/* External open entry point. + */ +static int viodasd_open(struct inode *ino, struct file *fil) +{ + int device_no; + int old_max_disk = viodasd_max_disk; + + /* Do a bunch of sanity checks */ + if (!ino) { + printk(KERN_WARNING_VIO "no inode provided in open\n"); + return -ENODEV; + } + + if (major_to_index(MAJOR(ino->i_rdev)) < 0) { + printk(KERN_WARNING_VIO + "Weird error...wrong major number on open\n"); + return -ENODEV; + } + + device_no = DEVICE_NR(ino->i_rdev); + if (device_no > MAX_DISKNO || device_no < 0) { + printk(KERN_WARNING_VIO + "Invalid device number %d in open\n", device_no); + return -ENODEV; + } + + /* Call the actual open code */ + if (internal_open(device_no, access_flags(fil ? fil->f_mode : 0)) == 0) { + int i; + MOD_INC_USE_COUNT; + /* For each new disk: */ + /* update the disk's geometry via internal_open and register it */ + for (i = old_max_disk + 1; i <= viodasd_max_disk; ++i) { + internal_open(i, vioblockflags_ro); + internal_register_new_disk(i); + internal_release(i, vioblockflags_ro); + } + return 0; + } else { + return -EIO; + } +} + +/* External release entry point. + */ +static int viodasd_release(struct inode *ino, struct file *fil) +{ + int device_no; + + /* Do a bunch of sanity checks */ + if (!ino) { + printk(KERN_WARNING_VIO "no inode provided in release\n"); + return -ENODEV; + } + + if (major_to_index(MAJOR(ino->i_rdev)) < 0) { + printk(KERN_WARNING_VIO + "Weird error...wrong major number on release\n"); + return -ENODEV; + } + + device_no = DEVICE_NR(ino->i_rdev); + + if (device_no > MAX_DISKNO || device_no < 0) { + printk("Tried to release invalid disk number %d\n", + device_no); + return -ENODEV; + } + + /* Call the actual release code */ + internal_release(device_no, access_flags(fil ? fil->f_mode : 0)); + + MOD_DEC_USE_COUNT; + return 0; +} + +/* External ioctl entry point. + */ +static int viodasd_ioctl(struct inode *ino, struct file *fil, + unsigned int cmd, unsigned long arg) +{ + int device_no; + int err; + HvLpEvent_Rc hvrc; + struct hd_struct *partition; + DECLARE_MUTEX_LOCKED(Semaphore); + + /* Sanity checks */ + if (!ino) { + printk(KERN_WARNING_VIO "no inode provided in ioctl\n"); + return -ENODEV; + } + + if (major_to_index(MAJOR(ino->i_rdev)) < 0) { + printk(KERN_WARNING_VIO + "Weird error...wrong major number on ioctl\n"); + return -ENODEV; + } + + partition = devt_to_partition(ino->i_rdev); + + device_no = DEVICE_NR(ino->i_rdev); + if (device_no > viodasd_max_disk) { + printk(KERN_WARNING_VIO + "Invalid device number %d in ioctl\n", device_no); + return -ENODEV; + } + + switch (cmd) { + case BLKGETSIZE: + /* return the device size in sectors */ + if (!arg) + return -EINVAL; + err = + verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if (err) + return err; + + put_user(partition->nr_sects, (long *) arg); + return 0; + + case FDFLUSH: + case BLKFLSBUF: + if (!suser()) + return -EACCES; + fsync_dev(ino->i_rdev); + invalidate_buffers(ino->i_rdev); + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_blockio + | vioblockflush, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) + &Semaphore, + VIOVERSION << 16, + ((u64) device_no << + 48), 0, 0, 0); + + + if (hvrc != 0) { + printk(KERN_WARNING_VIO + "bad rc on sync signalLpEvent %d\n", + (int) hvrc); + return -EIO; + } + + down(&Semaphore); + + return 0; + + case BLKRAGET: + if (!arg) + return -EINVAL; + err = + verify_area(VERIFY_WRITE, (long *) arg, sizeof(long)); + if (err) + return err; + put_user(read_ahead[MAJOR(ino->i_rdev)], (long *) arg); + return 0; + + case BLKRASET: + if (!suser()) + return -EACCES; + if (arg > 0x00ff) + return -EINVAL; + read_ahead[MAJOR(ino->i_rdev)] = arg; + return 0; + + case BLKRRPART: + viodasd_revalidate(ino->i_rdev); + return 0; + + case HDIO_GETGEO: + { + unsigned char sectors; + unsigned char heads; + unsigned short cylinders; + + struct hd_geometry *geo = + (struct hd_geometry *) arg; + if (geo == NULL) + return -EINVAL; + + err = verify_area(VERIFY_WRITE, geo, sizeof(*geo)); + if (err) + return err; + + sectors = viodasd_devices[device_no].sectors; + if (sectors == 0) + sectors = 32; + + heads = viodasd_devices[device_no].tracks; + if (heads == 0) + heads = 64; + + cylinders = viodasd_devices[device_no].cylinders; + if (cylinders == 0) + cylinders = + partition->nr_sects / (sectors * + heads); + + put_user(sectors, &geo->sectors); + put_user(heads, &geo->heads); + put_user(cylinders, &geo->cylinders); + + put_user(partition->start_sect, + (long *) &geo->start); + + return 0; + } + +#define PRTIOC(x) case x: printk(KERN_WARNING_VIO "got unsupported FD ioctl " #x "\n"); \ + return -EINVAL; + + PRTIOC(FDCLRPRM); + PRTIOC(FDSETPRM); + PRTIOC(FDDEFPRM); + PRTIOC(FDGETPRM); + PRTIOC(FDMSGON); + PRTIOC(FDMSGOFF); + PRTIOC(FDFMTBEG); + PRTIOC(FDFMTTRK); + PRTIOC(FDFMTEND); + PRTIOC(FDSETEMSGTRESH); + PRTIOC(FDSETMAXERRS); + PRTIOC(FDGETMAXERRS); + PRTIOC(FDGETDRVTYP); + PRTIOC(FDSETDRVPRM); + PRTIOC(FDGETDRVPRM); + PRTIOC(FDGETDRVSTAT); + PRTIOC(FDPOLLDRVSTAT); + PRTIOC(FDRESET); + PRTIOC(FDGETFDCSTAT); + PRTIOC(FDWERRORCLR); + PRTIOC(FDWERRORGET); + PRTIOC(FDRAWCMD); + PRTIOC(FDEJECT); + PRTIOC(FDTWADDLE); + + } + + return -EINVAL; +} + +/* Send an actual I/O request to OS/400 + */ +static int send_request(struct request *req) +{ + u64 sect_size; + u64 start; + u64 len; + int direction; + int nsg; + u16 viocmd; + HvLpEvent_Rc hvrc; + struct vioblocklpevent *bevent; + struct scatterlist sg[VIOMAXBLOCKDMA]; + struct buffer_head *bh; + int sgindex; + int device_no = DEVICE_NR(req->rq_dev); + int dev_within_major = device_no % DEV_PER_MAJOR; + int statindex; + struct hd_struct *partition = devt_to_partition(req->rq_dev); + + if (device_no > viodasd_max_disk || device_no < 0) { + printk + ("yikes! sending a request to device %d of %d possible?\n", + device_no, viodasd_max_disk + 1); + } + + /* Note that this SHOULD always be 512...but lets be architecturally correct */ + sect_size = hardsect_size[MAJOR(req->rq_dev)][dev_within_major]; + + /* Figure out the starting sector and length */ + start = (req->sector + partition->start_sect) * sect_size; + len = req->nr_sectors * sect_size; + + /* More paranoia checks */ + if ((req->sector + req->nr_sectors) > + (partition->start_sect + partition->nr_sects)) { + printk(KERN_WARNING_VIO + "Invalid request offset & length\n"); + printk(KERN_WARNING_VIO + "req->sector: %ld, req->nr_sectors: %ld\n", + req->sector, req->nr_sectors); + printk(KERN_WARNING_VIO "major: %d, minor: %d\n", + MAJOR(req->rq_dev), MINOR(req->rq_dev)); + return -1; + } + + if (req->cmd == READ) { + direction = PCI_DMA_FROMDEVICE; + viocmd = viomajorsubtype_blockio | vioblockread; + statindex = 0; + } else { + direction = PCI_DMA_TODEVICE; + viocmd = viomajorsubtype_blockio | vioblockwrite; + statindex = 1; + } + + /* Update totals */ + viod_stats[device_no][statindex].tot++; + + /* Now build the scatter-gather list */ + memset(&sg, 0x00, sizeof(sg)); + sgindex = 0; + + /* See if this is a swap I/O (without a bh pointer) or a regular I/O */ + if (req->bh) { + /* OK...this loop takes buffers from the request and adds them to the SG + until we're done, or until we hit a maximum. If we hit a maximum we'll + just finish this request later */ + bh = req->bh; + while ((bh) && (sgindex < VIOMAXBLOCKDMA)) { + sg[sgindex].address = bh->b_data; + sg[sgindex].length = bh->b_size; + + sgindex++; + bh = bh->b_reqnext; + } + nsg = pci_map_sg(iSeries_vio_dev, sg, sgindex, direction); + if ((nsg == 0) || (sg[0].dma_length == 0) + || (sg[0].dma_address == 0xFFFFFFFF)) { + printk(KERN_WARNING_VIO "error getting sg tces\n"); + return -1; + } + + } else { + /* Update stats */ + viod_stats[device_no][statindex].nobh++; + + sg[0].dma_address = + pci_map_single(iSeries_vio_dev, req->buffer, len, + direction); + if (sg[0].dma_address == 0xFFFFFFFF) { + printk(KERN_WARNING_VIO + "error allocating tce for address %p len %ld\n", + req->buffer, (long) len); + return -1; + } + sg[0].dma_length = len; + nsg = 1; + } + + /* Update stats */ + viod_stats[device_no][statindex].ntce[sgindex]++; + + /* This optimization handles a single DMA block */ + if (sgindex == 1) { + /* Send the open event to OS/400 */ + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_blockio + | viocmd, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) + req->buffer, + VIOVERSION << 16, + ((u64) device_no << + 48), start, + ((u64) sg[0]. + dma_address) << 32, + sg[0].dma_length); + } else { + bevent = + (struct vioblocklpevent *) + vio_get_event_buffer(viomajorsubtype_blockio); + if (bevent == NULL) { + printk(KERN_WARNING_VIO + "error allocating disk event buffer\n"); + return -1; + } + + /* Now build up the actual request. Note that we store the pointer */ + /* to the request buffer in the correlation token so we can match */ + /* this response up later */ + memset(bevent, 0x00, sizeof(struct vioblocklpevent)); + bevent->event.xFlags.xValid = 1; + bevent->event.xFlags.xFunction = HvLpEvent_Function_Int; + bevent->event.xFlags.xAckInd = HvLpEvent_AckInd_DoAck; + bevent->event.xFlags.xAckType = + HvLpEvent_AckType_ImmediateAck; + bevent->event.xType = HvLpEvent_Type_VirtualIo; + bevent->event.xSubtype = viocmd; + bevent->event.xSourceLp = HvLpConfig_getLpIndex(); + bevent->event.xTargetLp = viopath_hostLp; + bevent->event.xSizeMinus1 = + offsetof(struct vioblocklpevent, + u.rwData.dmaInfo) + + (sizeof(bevent->u.rwData.dmaInfo[0]) * (sgindex)) - 1; + bevent->event.xSizeMinus1 = + sizeof(struct vioblocklpevent) - 1; + bevent->event.xSourceInstanceId = + viopath_sourceinst(viopath_hostLp); + bevent->event.xTargetInstanceId = + viopath_targetinst(viopath_hostLp); + bevent->event.xCorrelationToken = + (u64) (unsigned long) req->buffer; + bevent->mVersion = VIOVERSION; + bevent->mDisk = device_no; + bevent->u.rwData.mOffset = start; + + /* Copy just the dma information from the sg list into the request */ + for (sgindex = 0; sgindex < nsg; sgindex++) { + bevent->u.rwData.dmaInfo[sgindex].mToken = + sg[sgindex].dma_address; + bevent->u.rwData.dmaInfo[sgindex].mLen = + sg[sgindex].dma_length; + } + + /* Send the request */ + hvrc = HvCallEvent_signalLpEvent(&bevent->event); + vio_free_event_buffer(viomajorsubtype_blockio, bevent); + } + + if (hvrc != HvLpEvent_Rc_Good) { + printk(KERN_WARNING_VIO + "error sending disk event to OS/400 (rc %d)\n", + (int) hvrc); + return -1; + } else { + /* If the request was successful, bump the number of outstanding */ + num_req_outstanding++; + } + return 0; +} + +/* This is the external request processing routine + */ +static void do_viodasd_request(request_queue_t * q) +{ + int device_no; + for (;;) { + struct request *req; + struct gendisk *gendisk; + + /* inlined INIT_REQUEST here because we don't define MAJOR_NR before blk.h */ + if (list_empty(&q->queue_head)) + return; + req = blkdev_entry_next_request(&q->queue_head); + if (major_to_index(MAJOR(req->rq_dev)) < 0) + panic(VIOD_DEVICE_NAME ": request list destroyed"); + if (req->bh) { + if (!buffer_locked(req->bh)) + panic(VIOD_DEVICE_NAME + ": block not locked"); + } + + gendisk = major_to_gendisk(MAJOR(req->rq_dev)); + + device_no = DEVICE_NR(req->rq_dev); + if (device_no > MAX_DISKNO || device_no < 0) { + printk(KERN_WARNING_VIO "Invalid device # %d\n", + device_no); + viodasd_end_request(req, 0); + continue; + } + + if (gendisk->sizes == NULL) { + printk(KERN_WARNING_VIO + "Ouch! gendisk->sizes is NULL\n"); + viodasd_end_request(req, 0); + continue; + } + + /* If the queue is plugged, don't dequeue anything right now */ + if ((q) && (q->plugged)) { + return; + } + + /* If we already have the maximum number of requests outstanding to OS/400 + just bail out. We'll come back later */ + if (num_req_outstanding >= VIOMAXREQ) { + return; + } + + /* get the current request, then dequeue it from the queue */ + blkdev_dequeue_request(req); + + /* Try sending the request */ + if (send_request(req) == 0) { + list_add_tail(&req->queue, &reqlist); + } else { + viodasd_end_request(req, 0); + } + } +} + +/* Check for changed disks + */ +static int viodasd_check_change(kdev_t dev) +{ + struct viodasd_waitevent we; + HvLpEvent_Rc hvrc; + int device_no = DEVICE_NR(dev); + + /* This semaphore is raised in the interrupt handler */ + DECLARE_MUTEX_LOCKED(Semaphore); + + /* Check that we are dealing with a valid hosting partition */ + if (viopath_hostLp == HvLpIndexInvalid) { + printk(KERN_WARNING_VIO "Invalid hosting partition\n"); + return -EIO; + } + + we.sem = &Semaphore; + + /* Send the open event to OS/400 */ + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_blockio | + vioblockcheck, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) &we, + VIOVERSION << 16, + ((u64) device_no << 48), 0, 0, + 0); + + if (hvrc != 0) { + printk(KERN_WARNING_VIO "bad rc on signalLpEvent %d\n", + (int) hvrc); + return -EIO; + } + + /* Wait for the interrupt handler to get the response */ + down(&Semaphore); + + /* Check the return code. If bad, assume no change */ + if (we.rc != 0) { + printk(KERN_WARNING_VIO + "bad rc %d on check_change. Assuming no change\n", + (int) we.rc); + return 0; + } + + return we.data.changed; +} + +/* Our file operations table + */ +static struct block_device_operations viodasd_fops = { + open:viodasd_open, + release:viodasd_release, + ioctl:viodasd_ioctl, + check_media_change:viodasd_check_change, + revalidate:viodasd_revalidate +}; + +/* returns the total number of scatterlist elements converted */ +static int block_event_to_scatterlist(const struct vioblocklpevent *bevent, + struct scatterlist *sg, + int *total_len) +{ + int i, numsg; + const struct rwData *rwData = &bevent->u.rwData; + static const int offset = + offsetof(struct vioblocklpevent, u.rwData.dmaInfo); + static const int element_size = sizeof(rwData->dmaInfo[0]); + + numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size; + if (numsg > VIOMAXBLOCKDMA) + numsg = VIOMAXBLOCKDMA; + + *total_len = 0; + memset(sg, 0x00, sizeof(sg[0]) * VIOMAXBLOCKDMA); + + for (i = 0; (i < numsg) && (rwData->dmaInfo[i].mLen > 0); ++i) { + sg[i].dma_address = rwData->dmaInfo[i].mToken; + sg[i].dma_length = rwData->dmaInfo[i].mLen; + *total_len += rwData->dmaInfo[i].mLen; + } + return i; +} + +static struct request *find_request_with_token(u64 token) +{ + struct request *req = blkdev_entry_to_request(reqlist.next); + while ((&req->queue != &reqlist) && + ((u64) (unsigned long) req->buffer != token)) + req = blkdev_entry_to_request(req->queue.next); + if (&req->queue == &reqlist) { + return NULL; + } + return req; +} + +/* Restart all queues, starting with the one _after_ the major given, */ +/* thus reducing the chance of starvation of disks with late majors. */ +static void viodasd_restart_all_queues_starting_from(int first_major) +{ + int i, first_index = major_to_index(first_major); + for(i = first_index + 1; i < NUM_MAJORS; ++i) + do_viodasd_request(BLK_DEFAULT_QUEUE(major_table[i])); + for(i = 0; i <= first_index; ++i) + do_viodasd_request(BLK_DEFAULT_QUEUE(major_table[i])); +} + +/* For read and write requests, decrement the number of outstanding requests, + * Free the DMA buffers we allocated, and find the matching request by + * using the buffer pointer we stored in the correlation token. + */ +static int viodasd_handleReadWrite(struct vioblocklpevent *bevent) +{ + int num_sg, num_sect, pci_direction, total_len, major; + struct request *req; + struct scatterlist sg[VIOMAXBLOCKDMA]; + struct HvLpEvent *event = &bevent->event; + unsigned long irq_flags; + + num_sg = block_event_to_scatterlist(bevent, sg, &total_len); + num_sect = total_len >> 9; + if (event->xSubtype == (viomajorsubtype_blockio | vioblockread)) + pci_direction = PCI_DMA_FROMDEVICE; + else + pci_direction = PCI_DMA_TODEVICE; + pci_unmap_sg(iSeries_vio_dev, sg, num_sg, pci_direction); + + + /* Since this is running in interrupt mode, we need to make sure we're not + * stepping on any global I/O operations + */ + spin_lock_irqsave(&io_request_lock, irq_flags); + + num_req_outstanding--; + + /* Now find the matching request in OUR list (remember we moved the request + * from the global list to our list when we got it) + */ + req = find_request_with_token(bevent->event.xCorrelationToken); + if (req == NULL) { + printk(KERN_WARNING_VIO + "Yikes! No request matching 0x%lx found\n", + bevent->event.xCorrelationToken); + spin_unlock_irqrestore(&io_request_lock, irq_flags); + return -1; + } + + /* Remove the request from our list */ + list_del(&req->queue); + /* Record this event's major number so we can check that queue again */ + major = MAJOR(req->rq_dev); + + if (!req->bh) { + if (event->xRc != HvLpEvent_Rc_Good) { + const struct vio_error_entry *err = + vio_lookup_rc(viodasd_err_table, + bevent->mSubTypeRc); + printk(KERN_WARNING_VIO + "read/write error %d:0x%04x (%s)\n", + event->xRc, bevent->mSubTypeRc, err->msg); + viodasd_end_request(req, 0); + } else { + if (num_sect != req->current_nr_sectors) { + printk(KERN_WARNING_VIO + "Yikes...non bh i/o # sect doesn't match!!!\n"); + } + viodasd_end_request(req, 1); + } + } else { + /* record having received the answers we did */ + while ((num_sect > 0) && (req->bh)) { + num_sect -= req->current_nr_sectors; + viodasd_end_request(req, 1); + } + /* if they somehow answered _more_ than we asked for...something weird happened */ + if (num_sect) + printk(KERN_WARNING_VIO + "Yikes...sectors left over on a request!!!\n"); + + /* if they didn't answer the whole request this time, re-submit the request */ + if (req->bh) { + if (send_request(req) == 0) { + list_add_tail(&req->queue, &reqlist); + } else { + viodasd_end_request(req, 0); + } + } + } + + /* Finally, try to get more requests off of this device's queue */ + viodasd_restart_all_queues_starting_from(major); + + spin_unlock_irqrestore(&io_request_lock, irq_flags); + + return 0; +} + +/* This routine handles incoming block LP events */ +static void vioHandleBlockEvent(struct HvLpEvent *event) +{ + struct vioblocklpevent *bevent = (struct vioblocklpevent *) event; + struct viodasd_waitevent *pwe; + + if (event == NULL) { + /* Notification that a partition went away! */ + return; + } + // First, we should NEVER get an int here...only acks + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + printk(KERN_WARNING_VIO + "Yikes! got an int in viodasd event handler!\n"); + if (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + } + + switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) { + + /* Handle a response to an open request. We get all the disk information + * in the response, so update it. The correlation token contains a pointer to + * a waitevent structure that has a semaphore in it. update the return code + * in the waitevent structure and post the semaphore to wake up the guy who + * sent the request */ + case vioblockopen: + pwe = + (struct viodasd_waitevent *) (unsigned long) event-> + xCorrelationToken; + pwe->rc = event->xRc; + pwe->data.subRC = bevent->mSubTypeRc; + if (event->xRc == HvLpEvent_Rc_Good) { + const struct openData *data = &bevent->u.openData; + struct viodasd_device *device = + &viodasd_devices[bevent->mDisk]; + device->readOnly = + bevent->mFlags & vioblockflags_ro; + device->size = data->mDiskLen; + device->cylinders = data->mCylinders; + device->tracks = data->mTracks; + device->sectors = data->mSectors; + device->bytesPerSector = data->mBytesPerSector; + viodasd_max_disk = data->mMaxDisks; + } + up(pwe->sem); + break; + case vioblockclose: + break; + case vioblockcheck: + pwe = + (struct viodasd_waitevent *) (unsigned long) event-> + xCorrelationToken; + pwe->rc = event->xRc; + pwe->data.changed = bevent->u.check.changed; + up(pwe->sem); + break; + case vioblockflush: + up((void *) (unsigned long) event->xCorrelationToken); + break; + case vioblockread: + case vioblockwrite: + viodasd_handleReadWrite(bevent); + break; + + default: + printk(KERN_WARNING_VIO "invalid subtype!"); + if (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + } +} + +/* This routine tries to clean up anything we allocated/registered + */ +static void viodasd_cleanup_major(int major) +{ + const int num_partitions = DEV_PER_MAJOR << PARTITION_SHIFT; + int minor; + +#define CLEANIT(x) if (x) {kfree(x); x=NULL;} + + for (minor = 0; minor < num_partitions; minor++) + fsync_dev(MKDEV(major, minor)); + + blk_cleanup_queue(BLK_DEFAULT_QUEUE(major)); + + read_ahead[major] = 0; + + CLEANIT(blk_size[major]); + CLEANIT(blksize_size[major]); + CLEANIT(hardsect_size[major]); + CLEANIT(max_sectors[major]); + CLEANIT(major_to_gendisk(major)->part); + + blk_cleanup_queue(BLK_DEFAULT_QUEUE(major)); + + devfs_unregister_blkdev(major, VIOD_DEVICE_NAME); +} + +static const char *major_name(int major) +{ + static char major_names[NUM_MAJORS][MAX_MAJOR_NAME]; + int index = major_to_index(major); + + if(index < 0) + return NULL; + else if(index == 0) + strcpy(major_names[index], VIOD_GENHD_NAME); + else + sprintf(major_names[index], VIOD_GENHD_NAME"%d", index); + + return major_names[index]; +} + +/* in case of bad return code, caller must cleanup2() for this major */ +static int viodasd_init_major(int major) +{ + int i; + const int numpart = DEV_PER_MAJOR << PARTITION_SHIFT; + int *sizes, *sectsizes, *blksizes, *maxsectors; + struct hd_struct *partitions; + struct gendisk *gendisk = major_to_gendisk(major); + + /* + * Do the devfs_register. This works even if devfs is not + * configured + */ + if (devfs_register_blkdev(major, VIOD_DEVICE_NAME, &viodasd_fops)) { + printk(KERN_WARNING_VIO + "%s: can't register major number %d\n", + VIOD_DEVICE_NAME, major); + return -1; + } + + blk_init_queue(BLK_DEFAULT_QUEUE(major), do_viodasd_request); + + read_ahead[major] = 8; /* 8 sector (4kB) read ahead */ + + /* initialize the struct */ + gendisk->major = major; + gendisk->major_name = major_name(major); + gendisk->minor_shift = PARTITION_SHIFT; + gendisk->max_p = 1 << PARTITION_SHIFT; + gendisk->nr_real = DEV_PER_MAJOR; + gendisk->fops = &viodasd_fops; + + /* to be assigned later */ + gendisk->next = NULL; + gendisk->part = NULL; + gendisk->sizes = NULL; + gendisk->de_arr = NULL; + gendisk->flags = NULL; + + /* register us in the global list */ + add_gendisk(gendisk); + + /* + * Now fill in all the device driver info + */ + sizes = kmalloc(numpart * sizeof(int), GFP_KERNEL); + if (!sizes) + return -ENOMEM; + memset(sizes, 0x00, numpart * sizeof(int)); + blk_size[major] = gendisk->sizes = sizes; + + partitions = + kmalloc(numpart * sizeof(struct hd_struct), GFP_KERNEL); + if (!partitions) + return -ENOMEM; + memset(partitions, 0x00, numpart * sizeof(struct hd_struct)); + gendisk->part = partitions; + + blksizes = kmalloc(numpart * sizeof(int), GFP_KERNEL); + if (!blksizes) + return -ENOMEM; + for (i = 0; i < numpart; i++) + blksizes[i] = blksize; + blksize_size[major] = blksizes; + + sectsizes = kmalloc(numpart * sizeof(int), GFP_KERNEL); + if (!sectsizes) + return -ENOMEM; + for (i = 0; i < numpart; i++) + sectsizes[i] = 0; + hardsect_size[major] = sectsizes; + + maxsectors = kmalloc(numpart * sizeof(int), GFP_KERNEL); + if (!maxsectors) + return -ENOMEM; + for (i = 0; i < numpart; i++) + maxsectors[i] = VIODASD_MAXSECTORS; + max_sectors[major] = maxsectors; + + return 0; +} + +static void internal_register_new_disk(int diskno) +{ + int major = diskno_to_major(diskno); + int dev_within_major = diskno % DEV_PER_MAJOR; + struct gendisk *gendisk = major_to_gendisk(major); + int i; + + if (diskno == 0) { + printk(KERN_INFO_VIO + "%s: Currently %d disks connected\n", + VIOD_DEVICE_NAME, (int) viodasd_max_disk + 1); + if (viodasd_max_disk > MAX_DISKNO - 1) + printk(KERN_INFO_VIO + "Only examining the first %d\n", + MAX_DISKNO); + } + + register_disk(gendisk, + MKDEV(major, + dev_within_major << + PARTITION_SHIFT), + 1 << PARTITION_SHIFT, &viodasd_fops, + gendisk-> + part[dev_within_major << PARTITION_SHIFT].nr_sects); + + printk(KERN_INFO_VIO + "%s: Disk %2.2d size %dM, sectors %d, heads %d, cylinders %d, sectsize %d\n", + VIOD_DEVICE_NAME, + diskno, + (int) (viodasd_devices[diskno].size / + (1024 * 1024)), + (int) viodasd_devices[diskno].sectors, + (int) viodasd_devices[diskno].tracks, + (int) viodasd_devices[diskno].cylinders, + (int) hardsect_size[major][dev_within_major << + PARTITION_SHIFT]); + + for (i = 1; i < (1 << PARTITION_SHIFT); ++i) { + int minor = (dev_within_major << PARTITION_SHIFT) + i; + struct hd_struct *partition = &gendisk->part[minor]; + if (partition->nr_sects) + printk(KERN_INFO_VIO + "%s: Disk %2.2d partition %2.2d start sector %ld, # sector %ld\n", + VIOD_DEVICE_NAME, diskno, i, + partition->start_sect, partition->nr_sects); + } +} + +/* Initialize the whole device driver. Handle module and non-module + * versions + */ +__init int viodasd_init(void) +{ + int i, j; + int rc; + + /* Try to open to our host lp + */ + if (viopath_hostLp == HvLpIndexInvalid) { + vio_set_hostlp(); + } + + if (viopath_hostLp == HvLpIndexInvalid) { + printk(KERN_WARNING_VIO "%s: invalid hosting partition\n", + VIOD_DEVICE_NAME); + return -EIO; + } + + printk(KERN_INFO_VIO + "%s: Disk vers %s, major %d, max disks %d, hosting partition %d\n", + VIOD_DEVICE_NAME, VIODASD_VERS, major_table[0], MAX_DISKNO, + viopath_hostLp); + + if (ROOT_DEV == NODEV) { + /* first disk, first partition */ + ROOT_DEV = diskno_to_devt(0, 1); + + printk(KERN_INFO_VIO + "Claiming root file system as first partition of first virtual disk"); + } + + /* Actually open the path to the hosting partition */ + rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, + VIOMAXREQ + 2); + if (rc) { + printk(KERN_WARNING_VIO + "error opening path to host partition %d\n", + viopath_hostLp); + return -EIO; + } else { + printk("%s: opened path to hosting partition %d\n", + VIOD_DEVICE_NAME, viopath_hostLp); + } + + viodasd_devices = + kmalloc(MAX_DISKNO * sizeof(struct viodasd_device), + GFP_KERNEL); + if (!viodasd_devices) + return -ENOMEM; + memset(viodasd_devices, 0x00, + MAX_DISKNO * sizeof(struct viodasd_device)); + + /* + * Initialize our request handler + */ + vio_setHandler(viomajorsubtype_blockio, vioHandleBlockEvent); + + for (i = 0; i < NUM_MAJORS; ++i) { + int init_rc = viodasd_init_major(major_table[i]); + if (init_rc < 0) { + for (j = 0; j <= i; ++j) + viodasd_cleanup_major(major_table[j]); + return init_rc; + } + } + + viodasd_max_disk = MAX_DISKNO - 1; + for (i = 0; i <= viodasd_max_disk && i < MAX_DISKNO; i++) { + // Note that internal_open has two side effects: + // a) it updates the size of the disk + // b) it updates viodasd_max_disk + if (internal_open(i, vioblockflags_ro) == 0) { + internal_register_new_disk(i); + internal_release(i, vioblockflags_ro); + } + } + + /* + * Create the proc entry + */ + iSeries_proc_callback(&viodasd_proc_init); + + return 0; +} + +#ifdef MODULE +void viodasd_exit(void) +{ + int i; + for(i = 0; i < NUM_MAJORS; ++i) + viodasd_cleanup_major(major_table[i]); + + CLEANIT(viodasd_devices); + + viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2); + iSeries_proc_callback(&viodasd_proc_delete); + +} +#endif + +#ifdef MODULE +module_init(viodasd_init); +module_exit(viodasd_exit); +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/viopath.c linuxppc64_2_4/drivers/iseries/viopath.c --- ../kernel.org/linux-2.4.19/drivers/iseries/viopath.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/viopath.c Tue Apr 16 10:47:52 2002 @@ -0,0 +1,661 @@ +/* -*- linux-c -*- + * arch/ppc64/viopath.c + * + * iSeries Virtual I/O Message Path code + * + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This code is used by the iSeries virtual disk, cd, + * tape, and console to communicate with OS/400 in another + * partition. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) anyu later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "vio.h" + +EXPORT_SYMBOL(viopath_hostLp); +EXPORT_SYMBOL(viopath_ourLp); +EXPORT_SYMBOL(vio_set_hostlp); +EXPORT_SYMBOL(vio_lookup_rc); +EXPORT_SYMBOL(viopath_open); +EXPORT_SYMBOL(viopath_close); +EXPORT_SYMBOL(viopath_isactive); +EXPORT_SYMBOL(viopath_sourceinst); +EXPORT_SYMBOL(viopath_targetinst); +EXPORT_SYMBOL(vio_setHandler); +EXPORT_SYMBOL(vio_clearHandler); +EXPORT_SYMBOL(vio_get_event_buffer); +EXPORT_SYMBOL(vio_free_event_buffer); + +extern struct pci_dev * iSeries_vio_dev; + +/* Status of the path to each other partition in the system. + * This is overkill, since we will only ever establish connections + * to our hosting partition and the primary partition on the system. + * But this allows for other support in the future. + */ +static struct viopathStatus { + int isOpen:1; /* Did we open the path? */ + int isActive:1; /* Do we have a mon msg outstanding */ + int users[VIO_MAX_SUBTYPES]; + HvLpInstanceId mSourceInst; + HvLpInstanceId mTargetInst; + int numberAllocated; +} viopathStatus[HVMAXARCHITECTEDLPS]; + +static spinlock_t statuslock = SPIN_LOCK_UNLOCKED; + +/* + * For each kind of event we allocate a buffer that is + * guaranteed not to cross a page boundary + */ +static void *event_buffer[VIO_MAX_SUBTYPES]; +static atomic_t event_buffer_available[VIO_MAX_SUBTYPES]; + +static void handleMonitorEvent(struct HvLpEvent *event); + +/* We use this structure to handle asynchronous responses. The caller + * blocks on the semaphore and the handler posts the semaphore. + */ +struct doneAllocParms_t { + struct semaphore *sem; + int number; +}; + +/* Put a sequence number in each mon msg. The value is not + * important. Start at something other than 0 just for + * readability. wrapping this is ok. + */ +static u8 viomonseq = 22; + +/* Our hosting logical partition. We get this at startup + * time, and different modules access this variable directly. + */ +HvLpIndex viopath_hostLp = 0xff; /* HvLpIndexInvalid */ +HvLpIndex viopath_ourLp = 0xff; + +/* For each kind of incoming event we set a pointer to a + * routine to call. + */ +static vio_event_handler_t *vio_handler[VIO_MAX_SUBTYPES]; + +/* Handle reads from the proc file system + */ +static int proc_read(char *buf, char **start, off_t offset, + int blen, int *eof, void *data) +{ + HvLpEvent_Rc hvrc; + DECLARE_MUTEX_LOCKED(Semaphore); + dma_addr_t dmaa = + pci_map_single(iSeries_vio_dev, buf, PAGE_SIZE, PCI_DMA_FROMDEVICE); + int len = PAGE_SIZE; + + if (len > blen) + len = blen; + + memset(buf, 0x00, len); + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_config | + vioconfigget, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) + &Semaphore, VIOVERSION << 16, + ((u64) dmaa) << 32, len, 0, + 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk("viopath hv error on op %d\n", (int) hvrc); + } + + down(&Semaphore); + + pci_unmap_single(iSeries_vio_dev, dmaa, PAGE_SIZE, PCI_DMA_FROMDEVICE); + + *eof = 1; + return strlen(buf); +} + +/* Handle writes to our proc file system + */ +static int proc_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + /* Doesn't do anything today!!! + */ + return count; +} + +/* setup our proc file system entries + */ +static void vio_proc_init(struct proc_dir_entry *iSeries_proc) +{ + struct proc_dir_entry *ent; + ent = create_proc_entry("config", S_IFREG | S_IRUSR, iSeries_proc); + if (!ent) + return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_read; + ent->write_proc = proc_write; +} + +/* See if a given LP is active. Allow for invalid lps to be passed in + * and just return invalid + */ +int viopath_isactive(HvLpIndex lp) +{ + if (lp == HvLpIndexInvalid) + return 0; + if (lp < HVMAXARCHITECTEDLPS) + return viopathStatus[lp].isActive; + else + return 0; +} + +/* We cache the source and target instance ids for each + * partition. + */ +HvLpInstanceId viopath_sourceinst(HvLpIndex lp) +{ + return viopathStatus[lp].mSourceInst; +} + +HvLpInstanceId viopath_targetinst(HvLpIndex lp) +{ + return viopathStatus[lp].mTargetInst; +} + +/* Send a monitor message. This is a message with the acknowledge + * bit on that the other side will NOT explicitly acknowledge. When + * the other side goes down, the hypervisor will acknowledge any + * outstanding messages....so we will know when the other side dies. + */ +static void sendMonMsg(HvLpIndex remoteLp) +{ + HvLpEvent_Rc hvrc; + + viopathStatus[remoteLp].mSourceInst = + HvCallEvent_getSourceLpInstanceId(remoteLp, + HvLpEvent_Type_VirtualIo); + viopathStatus[remoteLp].mTargetInst = + HvCallEvent_getTargetLpInstanceId(remoteLp, + HvLpEvent_Type_VirtualIo); + + /* Deliberately ignore the return code here. if we call this + * more than once, we don't care. + */ + vio_setHandler(viomajorsubtype_monitor, handleMonitorEvent); + + hvrc = HvCallEvent_signalLpEventFast(remoteLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_monitor, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_DeferredAck, + viopathStatus[remoteLp]. + mSourceInst, + viopathStatus[remoteLp]. + mTargetInst, viomonseq++, + 0, 0, 0, 0, 0); + + if (hvrc == HvLpEvent_Rc_Good) { + viopathStatus[remoteLp].isActive = 1; + } else { + printk(KERN_WARNING_VIO + "could not connect to partition %d\n", remoteLp); + viopathStatus[remoteLp].isActive = 0; + } +} + +static void handleMonitorEvent(struct HvLpEvent *event) +{ + HvLpIndex remoteLp; + int i; + + /* This handler is _also_ called as part of the loop + * at the end of this routine, so it must be able to + * ignore NULL events... + */ + if(!event) + return; + + /* First see if this is just a normal monitor message from the + * other partition + */ + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + remoteLp = event->xSourceLp; + if (!viopathStatus[remoteLp].isActive) + sendMonMsg(remoteLp); + return; + } + + /* This path is for an acknowledgement; the other partition + * died + */ + remoteLp = event->xTargetLp; + if ((event->xSourceInstanceId != + viopathStatus[remoteLp].mSourceInst) + || (event->xTargetInstanceId != + viopathStatus[remoteLp].mTargetInst)) { + printk(KERN_WARNING_VIO + "ignoring ack....mismatched instances\n"); + return; + } + + printk(KERN_WARNING_VIO "partition %d ended\n", remoteLp); + + viopathStatus[remoteLp].isActive = 0; + + /* For each active handler, pass them a NULL + * message to indicate that the other partition + * died + */ + for (i = 0; i < VIO_MAX_SUBTYPES; i++) { + if (vio_handler[i] != NULL) + (*vio_handler[i]) (NULL); + } +} + +int vio_setHandler(int subtype, vio_event_handler_t * beh) +{ + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) + return -EINVAL; + + if (vio_handler[subtype] != NULL) + return -EBUSY; + + vio_handler[subtype] = beh; + return 0; +} + +int vio_clearHandler(int subtype) +{ + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) + return -EINVAL; + + if (vio_handler[subtype] == NULL) + return -EAGAIN; + + vio_handler[subtype] = NULL; + return 0; +} + +static void handleConfig(struct HvLpEvent *event) +{ + if(!event) + return; + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + printk(KERN_WARNING_VIO + "unexpected config request from partition %d", + event->xSourceLp); + + if ((event->xFlags.xFunction == HvLpEvent_Function_Int) && + (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck)) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + return; + } + + up((struct semaphore *) event->xCorrelationToken); +} + +/* Initialization of the hosting partition + */ +void vio_set_hostlp(void) +{ + /* If this has already been set then we DON'T want to either change + * it or re-register the proc file system + */ + if (viopath_hostLp != HvLpIndexInvalid) + return; + + /* Figure out our hosting partition. This isn't allowed to change + * while we're active + */ + viopath_ourLp = HvLpConfig_getLpIndex(); + viopath_hostLp = HvCallCfg_getHostingLpIndex(viopath_ourLp); + + /* If we have a valid hosting LP, create a proc file system entry + * for config information + */ + if (viopath_hostLp != HvLpIndexInvalid) { + iSeries_proc_callback(&vio_proc_init); + vio_setHandler(viomajorsubtype_config, handleConfig); + } +} + +static void vio_handleEvent(struct HvLpEvent *event, struct pt_regs *regs) +{ + HvLpIndex remoteLp; + int subtype = + (event-> + xSubtype & VIOMAJOR_SUBTYPE_MASK) >> VIOMAJOR_SUBTYPE_SHIFT; + + if (event->xFlags.xFunction == HvLpEvent_Function_Int) { + remoteLp = event->xSourceLp; + if (event->xSourceInstanceId != + viopathStatus[remoteLp].mTargetInst) { + printk(KERN_WARNING_VIO + "message from invalid partition. " + "int msg rcvd, source inst (%d) doesnt match (%d)\n", + viopathStatus[remoteLp].mTargetInst, + event->xSourceInstanceId); + return; + } + + if (event->xTargetInstanceId != + viopathStatus[remoteLp].mSourceInst) { + printk(KERN_WARNING_VIO + "message from invalid partition. " + "int msg rcvd, target inst (%d) doesnt match (%d)\n", + viopathStatus[remoteLp].mSourceInst, + event->xTargetInstanceId); + return; + } + } else { + remoteLp = event->xTargetLp; + if (event->xSourceInstanceId != + viopathStatus[remoteLp].mSourceInst) { + printk(KERN_WARNING_VIO + "message from invalid partition. " + "ack msg rcvd, source inst (%d) doesnt match (%d)\n", + viopathStatus[remoteLp].mSourceInst, + event->xSourceInstanceId); + return; + } + + if (event->xTargetInstanceId != + viopathStatus[remoteLp].mTargetInst) { + printk(KERN_WARNING_VIO + "message from invalid partition. " + "viopath: ack msg rcvd, target inst (%d) doesnt match (%d)\n", + viopathStatus[remoteLp].mTargetInst, + event->xTargetInstanceId); + return; + } + } + + if (vio_handler[subtype] == NULL) { + printk(KERN_WARNING_VIO + "unexpected virtual io event subtype %d from partition %d\n", + event->xSubtype, remoteLp); + /* No handler. Ack if necessary + */ + if ((event->xFlags.xFunction == HvLpEvent_Function_Int) && + (event->xFlags.xAckInd == HvLpEvent_AckInd_DoAck)) { + event->xRc = HvLpEvent_Rc_InvalidSubtype; + HvCallEvent_ackLpEvent(event); + } + return; + } + + /* This innocuous little line is where all the real work happens + */ + (*vio_handler[subtype]) (event); +} + +static void viopath_donealloc(void *parm, int number) +{ + struct doneAllocParms_t *doneAllocParmsp = + (struct doneAllocParms_t *) parm; + doneAllocParmsp->number = number; + up(doneAllocParmsp->sem); +} + +static int allocateEvents(HvLpIndex remoteLp, int numEvents) +{ + struct doneAllocParms_t doneAllocParms; + DECLARE_MUTEX_LOCKED(Semaphore); + doneAllocParms.sem = &Semaphore; + + mf_allocateLpEvents(remoteLp, HvLpEvent_Type_VirtualIo, 250, /* It would be nice to put a real number here! */ + numEvents, + &viopath_donealloc, &doneAllocParms); + + down(&Semaphore); + + return doneAllocParms.number; +} + +int viopath_open(HvLpIndex remoteLp, int subtype, int numReq) +{ + int i; + unsigned long flags; + + if ((remoteLp >= HvMaxArchitectedLps) + || (remoteLp == HvLpIndexInvalid)) + return -EINVAL; + + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) + return -EINVAL; + + spin_lock_irqsave(&statuslock, flags); + + /* OK...we can fit 4 maximum-sized events (256 bytes) in + * each page (4096). Get a new page every 4 + */ + if (event_buffer[0] == NULL) { + for (i = 0; i < VIO_MAX_SUBTYPES; i++) { + if ((i % 4) == 0) { + event_buffer[i] = + (void *) get_free_page(GFP_KERNEL); + if (event_buffer[i] == NULL) { + spin_unlock_irqrestore(&statuslock, flags); + return -ENOMEM; + } + } else { + event_buffer[i] = + event_buffer[i - 1] + 256; + } + atomic_set(&event_buffer_available[i], 1); + } + } + + viopathStatus[remoteLp].users[subtype]++; + + if (!viopathStatus[remoteLp].isOpen) { + HvCallEvent_openLpEventPath(remoteLp, + HvLpEvent_Type_VirtualIo); + + viopathStatus[remoteLp].numberAllocated += + allocateEvents(remoteLp, 1); + + if (viopathStatus[remoteLp].numberAllocated == 0) { + HvCallEvent_closeLpEventPath(remoteLp, + HvLpEvent_Type_VirtualIo); + + spin_unlock_irqrestore(&statuslock, flags); + return -ENOMEM; + } + + viopathStatus[remoteLp].mSourceInst = + HvCallEvent_getSourceLpInstanceId(remoteLp, + HvLpEvent_Type_VirtualIo); + viopathStatus[remoteLp].mTargetInst = + HvCallEvent_getTargetLpInstanceId(remoteLp, + HvLpEvent_Type_VirtualIo); + + HvLpEvent_registerHandler(HvLpEvent_Type_VirtualIo, + &vio_handleEvent); + + viopathStatus[remoteLp].isOpen = 1; + + sendMonMsg(remoteLp); + + printk(KERN_INFO_VIO + "Opening connection to partition %d, setting sinst %d, tinst %d\n", + remoteLp, + viopathStatus[remoteLp].mSourceInst, + viopathStatus[remoteLp].mTargetInst); + } + + viopathStatus[remoteLp].numberAllocated += + allocateEvents(remoteLp, numReq); + spin_unlock_irqrestore(&statuslock, flags); + + return 0; +} + +int viopath_close(HvLpIndex remoteLp, int subtype, int numReq) +{ + unsigned long flags; + int i; + int numOpen; + struct doneAllocParms_t doneAllocParms; + DECLARE_MUTEX_LOCKED(Semaphore); + doneAllocParms.sem = &Semaphore; + + if ((remoteLp >= HvMaxArchitectedLps) + || (remoteLp == HvLpIndexInvalid)) + return -EINVAL; + + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) + return -EINVAL; + + spin_lock_irqsave(&statuslock, flags); + + viopathStatus[remoteLp].users[subtype]--; + + mf_deallocateLpEvents( remoteLp,HvLpEvent_Type_VirtualIo, + numReq, + &viopath_donealloc, + &doneAllocParms ); + down(&Semaphore); + + for (i = 0, numOpen = 0; i < VIO_MAX_SUBTYPES; i++) { + numOpen += viopathStatus[remoteLp].users[i]; + } + + if ((viopathStatus[remoteLp].isOpen) && (numOpen == 0)) { + printk(KERN_INFO_VIO + "Closing connection to partition %d", remoteLp); + + HvCallEvent_closeLpEventPath(remoteLp, + HvLpEvent_Type_VirtualIo); + viopathStatus[remoteLp].isOpen = 0; + viopathStatus[remoteLp].isActive = 0; + + for (i = 0; i < VIO_MAX_SUBTYPES; i++) { + atomic_set(&event_buffer_available[i], 0); + + for (i = 0; i < VIO_MAX_SUBTYPES; i += 4) { + free_page((unsigned long) event_buffer[i]); + } + } + + } + spin_unlock_irqrestore(&statuslock, flags); + return 0; +} + +void *vio_get_event_buffer(int subtype) +{ + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) + return NULL; + + if (atomic_dec_if_positive(&event_buffer_available[subtype]) == 0) + return event_buffer[subtype]; + else + return NULL; +} + +void vio_free_event_buffer(int subtype, void *buffer) +{ + subtype = subtype >> VIOMAJOR_SUBTYPE_SHIFT; + if ((subtype < 0) || (subtype >= VIO_MAX_SUBTYPES)) { + printk(KERN_WARNING_VIO + "unexpected subtype %d freeing event buffer\n", + subtype); + return; + } + + if (atomic_read(&event_buffer_available[subtype]) != 0) { + printk(KERN_WARNING_VIO + "freeing unallocated event buffer, subtype %d\n", + subtype); + return; + } + + if (buffer != event_buffer[subtype]) { + printk(KERN_WARNING_VIO + "freeing invalid event buffer, subtype %d\n", + subtype); + } + + atomic_set(&event_buffer_available[subtype], 1); +} + +static const struct vio_error_entry vio_no_error = + { 0, 0, "Non-VIO Error" }; +static const struct vio_error_entry vio_unknown_error = + { 0, EIO, "Unknown Error" }; + +static const struct vio_error_entry vio_default_errors[] = { + {0x0001, EIO, "No Connection"}, + {0x0002, EIO, "No Receiver"}, + {0x0003, EIO, "No Buffer Available"}, + {0x0004, EBADRQC, "Invalid Message Type"}, + {0x0000, 0, NULL}, +}; + +const struct vio_error_entry *vio_lookup_rc(const struct vio_error_entry + *local_table, u16 rc) +{ + const struct vio_error_entry *cur; + if (!rc) + return &vio_no_error; + if (local_table) + for (cur = local_table; cur->rc; ++cur) + if (cur->rc == rc) + return cur; + for (cur = vio_default_errors; cur->rc; ++cur) + if (cur->rc == rc) + return cur; + return &vio_unknown_error; +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/iseries/viotape.c linuxppc64_2_4/drivers/iseries/viotape.c --- ../kernel.org/linux-2.4.19/drivers/iseries/viotape.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/drivers/iseries/viotape.c Wed Dec 12 13:48:34 2001 @@ -0,0 +1,1185 @@ +/* -*- linux-c -*- + * drivers/char/viotape.c + * + * iSeries Virtual Tape + *************************************************************************** + * + * Authors: Dave Boutcher + * Ryan Arnold + * Colin Devilbiss + * + * (C) Copyright 2000 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) anyu later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software Foundation, + * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + *************************************************************************** + * This routine provides access to tape drives owned and managed by an OS/400 + * partition running on the same box as this Linux partition. + * + * All tape operations are performed by sending messages back and forth to + * the OS/400 partition. The format of the messages is defined in + * iSeries/vio.h + * + */ + + +#undef VIOT_DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vio.h" +#include +#include "asm/iSeries/HvCallEvent.h" +#include "asm/iSeries/HvLpConfig.h" +#include + +extern struct pci_dev * iSeries_vio_dev; + +static int viotape_major = 230; +static int viotape_numdev = 0; + +#define VIOTAPE_MAXREQ 1 + +/* version number for viotape driver */ +static unsigned int version_major = 1; +static unsigned int version_minor = 0; + +static u64 sndMsgSeq; +static u64 sndMsgAck; +static u64 rcvMsgSeq; +static u64 rcvMsgAck; + +/*************************************************************************** + * The minor number follows the conventions of the SCSI tape drives. The + * rewind and mode are encoded in the minor #. We use this struct to break + * them out + ***************************************************************************/ +struct viot_devinfo_struct { + int major; + int minor; + int devno; + int mode; + int rewind; +}; + +#define VIOTAPOP_RESET 0 +#define VIOTAPOP_FSF 1 +#define VIOTAPOP_BSF 2 +#define VIOTAPOP_FSR 3 +#define VIOTAPOP_BSR 4 +#define VIOTAPOP_WEOF 5 +#define VIOTAPOP_REW 6 +#define VIOTAPOP_NOP 7 +#define VIOTAPOP_EOM 8 +#define VIOTAPOP_ERASE 9 +#define VIOTAPOP_SETBLK 10 +#define VIOTAPOP_SETDENSITY 11 +#define VIOTAPOP_SETPOS 12 +#define VIOTAPOP_GETPOS 13 +#define VIOTAPOP_SETPART 14 + +struct viotapelpevent { + struct HvLpEvent event; + u32 mReserved1; + u16 mVersion; + u16 mSubTypeRc; + u16 mTape; + u16 mFlags; + u32 mToken; + u64 mLen; + union { + struct { + u32 mTapeOp; + u32 mCount; + } tapeOp; + struct { + u32 mType; + u32 mResid; + u32 mDsreg; + u32 mGstat; + u32 mErreg; + u32 mFileNo; + u32 mBlkNo; + } getStatus; + struct { + u32 mBlkNo; + } getPos; + } u; +}; +enum viotapesubtype { + viotapeopen = 0x0001, + viotapeclose = 0x0002, + viotaperead = 0x0003, + viotapewrite = 0x0004, + viotapegetinfo = 0x0005, + viotapeop = 0x0006, + viotapegetpos = 0x0007, + viotapesetpos = 0x0008, + viotapegetstatus = 0x0009 +}; + +enum viotapeRc { + viotape_InvalidRange = 0x0601, + viotape_InvalidToken = 0x0602, + viotape_DMAError = 0x0603, + viotape_UseError = 0x0604, + viotape_ReleaseError = 0x0605, + viotape_InvalidTape = 0x0606, + viotape_InvalidOp = 0x0607, + viotape_TapeErr = 0x0608, + + viotape_AllocTimedOut = 0x0640, + viotape_BOTEnc = 0x0641, + viotape_BlankTape = 0x0642, + viotape_BufferEmpty = 0x0643, + viotape_CleanCartFound = 0x0644, + viotape_CmdNotAllowed = 0x0645, + viotape_CmdNotSupported = 0x0646, + viotape_DataCheck = 0x0647, + viotape_DecompressErr = 0x0648, + viotape_DeviceTimeout = 0x0649, + viotape_DeviceUnavail = 0x064a, + viotape_DeviceBusy = 0x064b, + viotape_EndOfMedia = 0x064c, + viotape_EndOfTape = 0x064d, + viotape_EquipCheck = 0x064e, + viotape_InsufficientRs = 0x064f, + viotape_InvalidLogBlk = 0x0650, + viotape_LengthError = 0x0651, + viotape_LibDoorOpen = 0x0652, + viotape_LoadFailure = 0x0653, + viotape_NotCapable = 0x0654, + viotape_NotOperational = 0x0655, + viotape_NotReady = 0x0656, + viotape_OpCancelled = 0x0657, + viotape_PhyLinkErr = 0x0658, + viotape_RdyNotBOT = 0x0659, + viotape_TapeMark = 0x065a, + viotape_WriteProt = 0x065b +}; + +static const struct vio_error_entry viotape_err_table[] = { + {viotape_InvalidRange, EIO, "Internal error"}, + {viotape_InvalidToken, EIO, "Internal error"}, + {viotape_DMAError, EIO, "DMA error"}, + {viotape_UseError, EIO, "Internal error"}, + {viotape_ReleaseError, EIO, "Internal error"}, + {viotape_InvalidTape, EIO, "Invalid tape device"}, + {viotape_InvalidOp, EIO, "Invalid operation"}, + {viotape_TapeErr, EIO, "Tape error"}, + {viotape_AllocTimedOut, EBUSY, "Allocate timed out"}, + {viotape_BOTEnc, EIO, "Beginning of tape encountered"}, + {viotape_BlankTape, EIO, "Blank tape"}, + {viotape_BufferEmpty, EIO, "Buffer empty"}, + {viotape_CleanCartFound, ENOMEDIUM, "Cleaning cartridge found"}, + {viotape_CmdNotAllowed, EIO, "Command not allowed"}, + {viotape_CmdNotSupported, EIO, "Command not supported"}, + {viotape_DataCheck, EIO, "Data check"}, + {viotape_DecompressErr, EIO, "Decompression error"}, + {viotape_DeviceTimeout, EBUSY, "Device timeout"}, + {viotape_DeviceUnavail, EIO, "Device unavailable"}, + {viotape_DeviceBusy, EBUSY, "Device busy"}, + {viotape_EndOfMedia, ENOSPC, "End of media"}, + {viotape_EndOfTape, ENOSPC, "End of tape"}, + {viotape_EquipCheck, EIO, "Equipment check"}, + {viotape_InsufficientRs, EOVERFLOW, "Insufficient tape resources"}, + {viotape_InvalidLogBlk, EIO, "Invalid logical block location"}, + {viotape_LengthError, EOVERFLOW, "Length error"}, + {viotape_LibDoorOpen, EBUSY, "Door open"}, + {viotape_LoadFailure, ENOMEDIUM, "Load failure"}, + {viotape_NotCapable, EIO, "Not capable"}, + {viotape_NotOperational, EIO, "Not operational"}, + {viotape_NotReady, EIO, "Not ready"}, + {viotape_OpCancelled, EIO, "Operation cancelled"}, + {viotape_PhyLinkErr, EIO, "Physical link error"}, + {viotape_RdyNotBOT, EIO, "Ready but not beginning of tape"}, + {viotape_TapeMark, EIO, "Tape mark"}, + {viotape_WriteProt, EROFS, "Write protection error"}, + {0, 0, NULL}, +}; + +/* Maximum # tapes we support + */ +#define VIOTAPE_MAX_TAPE 8 +#define MAX_PARTITIONS 4 + +/* defines for current tape state */ +#define VIOT_IDLE 0 +#define VIOT_READING 1 +#define VIOT_WRITING 2 + +/* Our info on the tapes + */ +struct tape_descr { + char rsrcname[10]; + char type[4]; + char model[3]; +}; + +static struct tape_descr *viotape_unitinfo = NULL; + +static const char *lasterr[VIOTAPE_MAX_TAPE]; + +static struct mtget viomtget[VIOTAPE_MAX_TAPE]; + +/* maintain the current state of each tape (and partition) + so that we know when to write EOF marks. +*/ +static struct { + unsigned char cur_part; + devfs_handle_t dev_handle; + struct { + unsigned char rwi; + } part_stat[MAX_PARTITIONS]; +} state[VIOTAPE_MAX_TAPE]; + +/* We single-thread + */ +static struct semaphore reqSem; + +/* When we send a request, we use this struct to get the response back + * from the interrupt handler + */ +struct opStruct { + void *buffer; + dma_addr_t dmaaddr; + size_t count; + int rc; + struct semaphore *sem; + struct opStruct *free; +}; + +static spinlock_t opStructListLock; +static struct opStruct *opStructList; + +/* forward declaration to resolve interdependence */ +static int chg_state(int index, unsigned char new_state, + struct file *file); + +/* Decode the kdev_t into its parts + */ +void getDevInfo(kdev_t dev, struct viot_devinfo_struct *devi) +{ + devi->major = MAJOR(dev); + devi->minor = MINOR(dev); + devi->devno = devi->minor & 0x1F; + devi->mode = (devi->minor & 0x60) >> 5; + /* if bit is set in the minor, do _not_ rewind automatically */ + devi->rewind = !(devi->minor & 0x80); +} + + +/* Allocate an op structure from our pool + */ +static struct opStruct *getOpStruct(void) +{ + struct opStruct *newOpStruct; + spin_lock(&opStructListLock); + + if (opStructList == NULL) { + newOpStruct = kmalloc(sizeof(struct opStruct), GFP_KERNEL); + } else { + newOpStruct = opStructList; + opStructList = opStructList->free; + } + + if (newOpStruct) + memset(newOpStruct, 0x00, sizeof(struct opStruct)); + + spin_unlock(&opStructListLock); + + return newOpStruct; +} + +/* Return an op structure to our pool + */ +static void freeOpStruct(struct opStruct *opStruct) +{ + spin_lock(&opStructListLock); + opStruct->free = opStructList; + opStructList = opStruct; + spin_unlock(&opStructListLock); +} + +/* Map our tape return codes to errno values + */ +int tapeRcToErrno(int tapeRc, char *operation, int tapeno) +{ + const struct vio_error_entry *err; + if(tapeRc == 0) + return 0; + err = vio_lookup_rc(viotape_err_table, tapeRc); + + printk(KERN_WARNING_VIO "tape error 0x%04x on Device %d (%-10s): %s\n", + tapeRc, tapeno, viotape_unitinfo[tapeno].rsrcname, err->msg); + + lasterr[tapeno] = err->msg; + + return -err->errno; +} + +/* Handle reads from the proc file system. + */ +static int proc_read(char *buf, char **start, off_t offset, + int blen, int *eof, void *data) +{ + int len = 0; + int i; + + len += sprintf(buf + len, "viotape driver version %d.%d\n", + version_major, version_minor); + + for (i = 0; i < viotape_numdev; i++) { + + len += + sprintf(buf + len, + "viotape device %d is iSeries resource %10.10s type %4.4s, model %3.3s\n", + i, viotape_unitinfo[i].rsrcname, + viotape_unitinfo[i].type, + viotape_unitinfo[i].model); + if (lasterr[i]) + len += + sprintf(buf + len, " last error: %s\n", + lasterr[i]); + } + + *eof = 1; + return len; +} + +/* setup our proc file system entries + */ +void viotape_proc_init(struct proc_dir_entry *iSeries_proc) +{ + struct proc_dir_entry *ent; + ent = + create_proc_entry("viotape", S_IFREG | S_IRUSR, iSeries_proc); + if (!ent) + return; + ent->nlink = 1; + ent->data = NULL; + ent->read_proc = proc_read; +} + +/* clean up our proc file system entries + */ +void viotape_proc_delete(struct proc_dir_entry *iSeries_proc) +{ + remove_proc_entry("viotape", iSeries_proc); +} + + +/* Get info on all tapes from OS/400 + */ +static void get_viotape_info(void) +{ + dma_addr_t dmaaddr; + HvLpEvent_Rc hvrc; + int i; + struct opStruct *op = getOpStruct(); + DECLARE_MUTEX_LOCKED(Semaphore); + if (op == NULL) + return; + + if (viotape_unitinfo == NULL) { + viotape_unitinfo = + kmalloc(sizeof(struct tape_descr) * VIOTAPE_MAX_TAPE, + GFP_KERNEL); + } + memset(viotape_unitinfo, 0x00, + sizeof(struct tape_descr) * VIOTAPE_MAX_TAPE); + memset(lasterr, 0x00, sizeof(lasterr)); + + op->sem = &Semaphore; + + dmaaddr = pci_map_single(iSeries_vio_dev, viotape_unitinfo, + sizeof(struct tape_descr) * + VIOTAPE_MAX_TAPE, PCI_DMA_FROMDEVICE); + if (dmaaddr == 0xFFFFFFFF) { + printk(KERN_WARNING_VIO "viotape error allocating tce\n"); + return; + } + + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapegetinfo, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) op, + VIOVERSION << 16, dmaaddr, + sizeof(struct tape_descr) * + VIOTAPE_MAX_TAPE, 0, 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk("viotape hv error on op %d\n", (int) hvrc); + } + + down(&Semaphore); + + freeOpStruct(op); + + + for (i = 0; + ((i < VIOTAPE_MAX_TAPE) && (viotape_unitinfo[i].rsrcname[0])); + i++) { + printk("found a tape %10.10s\n", + viotape_unitinfo[i].rsrcname); + viotape_numdev++; + } +} + + +/* Write + */ +static ssize_t viotap_write(struct file *file, const char *buf, + size_t count, loff_t * ppos) +{ + HvLpEvent_Rc hvrc; + kdev_t dev = file->f_dentry->d_inode->i_rdev; + unsigned short flags = file->f_flags; + struct opStruct *op = getOpStruct(); + int noblock = ((flags & O_NONBLOCK) != 0); + int err; + struct viot_devinfo_struct devi; + DECLARE_MUTEX_LOCKED(Semaphore); + + if (op == NULL) + return -ENOMEM; + + getDevInfo(dev, &devi); + + /* We need to make sure we can send a request. We use + * a semaphore to keep track of # requests in use. If + * we are non-blocking, make sure we don't block on the + * semaphore + */ + if (noblock) { + if (down_trylock(&reqSem)) { + freeOpStruct(op); + return -EWOULDBLOCK; + } + } else { + down(&reqSem); + } + + /* Allocate a DMA buffer */ + op->buffer = pci_alloc_consistent(iSeries_vio_dev, count, &op->dmaaddr); + + if ((op->dmaaddr == 0xFFFFFFFF) || (op->buffer == NULL)) { + printk(KERN_WARNING_VIO + "tape error allocating dma buffer for len %ld\n", + count); + freeOpStruct(op); + up(&reqSem); + return -EFAULT; + } + + op->count = count; + + /* Copy the data into the buffer */ + err = copy_from_user(op->buffer, (const void *) buf, count); + if (err) { + printk(KERN_WARNING_VIO + "tape: error on copy from user\n"); + pci_free_consistent(iSeries_vio_dev, count, op->buffer, op->dmaaddr); + freeOpStruct(op); + up(&reqSem); + return -EFAULT; + } + + if (noblock) { + op->sem = NULL; + } else { + op->sem = &Semaphore; + } + + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapewrite, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) op, + VIOVERSION << 16, + ((u64) devi. + devno << 48) | op->dmaaddr, + count, 0, 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk("viotape hv error on op %d\n", (int) hvrc); + pci_free_consistent(iSeries_vio_dev, count, op->buffer, op->dmaaddr); + freeOpStruct(op); + up(&reqSem); + return -EIO; + } + + if (noblock) + return count; + + down(&Semaphore); + + err = op->rc; + + /* Free the buffer */ + pci_free_consistent(iSeries_vio_dev, count, op->buffer, op->dmaaddr); + + count = op->count; + + freeOpStruct(op); + up(&reqSem); + if (err) + return tapeRcToErrno(err, "write", devi.devno); + else { + chg_state(devi.devno, VIOT_WRITING, file); + return count; + } +} + +/* read + */ +static ssize_t viotap_read(struct file *file, char *buf, size_t count, + loff_t * ptr) +{ + HvLpEvent_Rc hvrc; + kdev_t dev = file->f_dentry->d_inode->i_rdev; + unsigned short flags = file->f_flags; + struct opStruct *op = getOpStruct(); + int noblock = ((flags & O_NONBLOCK) != 0); + int err; + struct viot_devinfo_struct devi; + DECLARE_MUTEX_LOCKED(Semaphore); + + if (op == NULL) + return -ENOMEM; + + getDevInfo(dev, &devi); + + /* We need to make sure we can send a request. We use + * a semaphore to keep track of # requests in use. If + * we are non-blocking, make sure we don't block on the + * semaphore + */ + if (noblock) { + if (down_trylock(&reqSem)) { + freeOpStruct(op); + return -EWOULDBLOCK; + } + } else { + down(&reqSem); + } + + chg_state(devi.devno, VIOT_READING, file); + + /* Allocate a DMA buffer */ + op->buffer = pci_alloc_consistent(iSeries_vio_dev, count, &op->dmaaddr); + + if ((op->dmaaddr == 0xFFFFFFFF) || (op->buffer == NULL)) { + freeOpStruct(op); + up(&reqSem); + return -EFAULT; + } + + op->count = count; + + op->sem = &Semaphore; + + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotaperead, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) op, + VIOVERSION << 16, + ((u64) devi. + devno << 48) | op->dmaaddr, + count, 0, 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk(KERN_WARNING_VIO + "tape hv error on op %d\n", (int) hvrc); + pci_free_consistent(iSeries_vio_dev, count, op->buffer, op->dmaaddr); + freeOpStruct(op); + up(&reqSem); + return -EIO; + } + + down(&Semaphore); + + if (op->rc == 0) { + /* If we got data back */ + if (op->count) { + /* Copy the data into the buffer */ + err = copy_to_user(buf, op->buffer, count); + if (err) { + printk("error on copy_to_user\n"); + pci_free_consistent(iSeries_vio_dev, count, + op->buffer, + op->dmaaddr); + freeOpStruct(op); + up(&reqSem); + return -EFAULT; + } + } + } + + err = op->rc; + + /* Free the buffer */ + pci_free_consistent(iSeries_vio_dev, count, op->buffer, op->dmaaddr); + count = op->count; + + freeOpStruct(op); + up(&reqSem); + if (err) + return tapeRcToErrno(err, "read", devi.devno); + else + return count; +} + +/* read + */ +static int viotap_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + HvLpEvent_Rc hvrc; + int err; + DECLARE_MUTEX_LOCKED(Semaphore); + kdev_t dev = file->f_dentry->d_inode->i_rdev; + struct opStruct *op = getOpStruct(); + struct viot_devinfo_struct devi; + if (op == NULL) + return -ENOMEM; + + getDevInfo(dev, &devi); + + down(&reqSem); + + switch (cmd) { + case MTIOCTOP:{ + struct mtop mtc; + u32 myOp; + + /* inode is null if and only if we (the kernel) made the request */ + if (inode == NULL) + memcpy(&mtc, (void *) arg, + sizeof(struct mtop)); + else if (copy_from_user + ((char *) &mtc, (char *) arg, + sizeof(struct mtop))) { + freeOpStruct(op); + up(&reqSem); + return -EFAULT; + } + + switch (mtc.mt_op) { + case MTRESET: + myOp = VIOTAPOP_RESET; + break; + case MTFSF: + myOp = VIOTAPOP_FSF; + break; + case MTBSF: + myOp = VIOTAPOP_BSF; + break; + case MTFSR: + myOp = VIOTAPOP_FSR; + break; + case MTBSR: + myOp = VIOTAPOP_BSR; + break; + case MTWEOF: + myOp = VIOTAPOP_WEOF; + break; + case MTREW: + myOp = VIOTAPOP_REW; + break; + case MTNOP: + myOp = VIOTAPOP_NOP; + break; + case MTEOM: + myOp = VIOTAPOP_EOM; + break; + case MTERASE: + myOp = VIOTAPOP_ERASE; + break; + case MTSETBLK: + myOp = VIOTAPOP_SETBLK; + break; + case MTSETDENSITY: + myOp = VIOTAPOP_SETDENSITY; + break; + case MTTELL: + myOp = VIOTAPOP_GETPOS; + break; + case MTSEEK: + myOp = VIOTAPOP_SETPOS; + break; + case MTSETPART: + myOp = VIOTAPOP_SETPART; + break; + default: + return -EIO; + } + +/* if we moved the head, we are no longer reading or writing */ + switch (mtc.mt_op) { + case MTFSF: + case MTBSF: + case MTFSR: + case MTBSR: + case MTTELL: + case MTSEEK: + case MTREW: + chg_state(devi.devno, VIOT_IDLE, file); + } + + op->sem = &Semaphore; + hvrc = + HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape + | viotapeop, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned + long) op, + VIOVERSION << 16, + ((u64) devi. + devno << 48), 0, + (((u64) myOp) << + 32) | mtc. + mt_count, 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk("viotape hv error on op %d\n", + (int) hvrc); + freeOpStruct(op); + up(&reqSem); + return -EIO; + } + down(&Semaphore); + if (op->rc) { + freeOpStruct(op); + up(&reqSem); + return tapeRcToErrno(op->rc, + "tape operation", + devi.devno); + } else { + freeOpStruct(op); + up(&reqSem); + return 0; + } + break; + } + + case MTIOCGET: + op->sem = &Semaphore; + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapegetstatus, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) + op, VIOVERSION << 16, + ((u64) devi. + devno << 48), 0, 0, + 0); + if (hvrc != HvLpEvent_Rc_Good) { + printk("viotape hv error on op %d\n", (int) hvrc); + freeOpStruct(op); + up(&reqSem); + return -EIO; + } + down(&Semaphore); + up(&reqSem); + if (op->rc) { + freeOpStruct(op); + return tapeRcToErrno(op->rc, "get status", + devi.devno); + } else { + freeOpStruct(op); + err = + copy_to_user((void *) arg, &viomtget[dev], + sizeof(viomtget[0])); + if (err) { + freeOpStruct(op); + return -EFAULT; + } + return 0; + } + break; + case MTIOCPOS: + printk("Got an MTIOCPOS\n"); + default: + return -ENOSYS; + } + return 0; +} + +/* Open + */ +static int viotap_open(struct inode *inode, struct file *file) +{ + DECLARE_MUTEX_LOCKED(Semaphore); + kdev_t dev = file->f_dentry->d_inode->i_rdev; + HvLpEvent_Rc hvrc; + struct opStruct *op = getOpStruct(); + struct viot_devinfo_struct devi; + if (op == NULL) + return -ENOMEM; + + getDevInfo(dev, &devi); + +// Note: We currently only support one mode! + if ((devi.devno >= viotape_numdev) || (devi.mode)) { + freeOpStruct(op); + return -ENODEV; + } + + op->sem = &Semaphore; + + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapeopen, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) op, + VIOVERSION << 16, + ((u64) devi.devno << 48), 0, + 0, 0); + + + if (hvrc != 0) { + printk("viotape bad rc on signalLpEvent %d\n", (int) hvrc); + freeOpStruct(op); + return -EIO; + } + + down(&Semaphore); + + if (op->rc) { + freeOpStruct(op); + return tapeRcToErrno(op->rc, "open", devi.devno); + } else { + freeOpStruct(op); + MOD_INC_USE_COUNT; + return 0; + } +} + + +/* Release + */ +static int viotap_release(struct inode *inode, struct file *file) +{ + DECLARE_MUTEX_LOCKED(Semaphore); + kdev_t dev = file->f_dentry->d_inode->i_rdev; + HvLpEvent_Rc hvrc; + struct viot_devinfo_struct devi; + struct opStruct *op = getOpStruct(); + + if (op == NULL) + return -ENOMEM; + op->sem = &Semaphore; + + getDevInfo(dev, &devi); + + if (devi.devno >= viotape_numdev) { + freeOpStruct(op); + return -ENODEV; + } + + chg_state(devi.devno, VIOT_IDLE, file); + + if (devi.rewind) { + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapeop, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) + op, VIOVERSION << 16, + ((u64) devi. + devno << 48), 0, + ((u64) VIOTAPOP_REW) + << 32, 0); + down(&Semaphore); + + if (op->rc) { + tapeRcToErrno(op->rc, "rewind", devi.devno); + } + } + + hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, + HvLpEvent_Type_VirtualIo, + viomajorsubtype_tape | + viotapeclose, + HvLpEvent_AckInd_DoAck, + HvLpEvent_AckType_ImmediateAck, + viopath_sourceinst + (viopath_hostLp), + viopath_targetinst + (viopath_hostLp), + (u64) (unsigned long) op, + VIOVERSION << 16, + ((u64) devi.devno << 48), 0, + 0, 0); + + + if (hvrc != 0) { + printk("viotape: bad rc on signalLpEvent %d\n", + (int) hvrc); + return -EIO; + } + + down(&Semaphore); + + if (op->rc) { + printk("viotape: close failed\n"); + } + MOD_DEC_USE_COUNT; + return 0; +} + +struct file_operations viotap_fops = { + owner:THIS_MODULE, + read:viotap_read, + write:viotap_write, + ioctl:viotap_ioctl, + open:viotap_open, + release:viotap_release, +}; + +/* Handle interrupt events for tape + */ +static void vioHandleTapeEvent(struct HvLpEvent *event) +{ + int tapeminor; + struct opStruct *op; + struct viotapelpevent *tevent = (struct viotapelpevent *) event; + + if (event == NULL) { + /* Notification that a partition went away! */ + if (!viopath_isactive(viopath_hostLp)) { + /* TODO! Clean up */ + } + return; + } + + tapeminor = event->xSubtype & VIOMINOR_SUBTYPE_MASK; + switch (tapeminor) { + case viotapegetinfo: + case viotapeopen: + case viotapeclose: + op = (struct opStruct *) (unsigned long) event-> + xCorrelationToken; + op->rc = tevent->mSubTypeRc; + up(op->sem); + break; + case viotaperead: + case viotapewrite: + op = (struct opStruct *) (unsigned long) event-> + xCorrelationToken; + op->rc = tevent->mSubTypeRc; + op->count = tevent->mLen; + + if (op->sem) { + up(op->sem); + } else { + freeOpStruct(op); + up(&reqSem); + } + break; + case viotapeop: + case viotapegetpos: + case viotapesetpos: + case viotapegetstatus: + op = (struct opStruct *) (unsigned long) event-> + xCorrelationToken; + if (op) { + op->count = tevent->u.tapeOp.mCount; + op->rc = tevent->mSubTypeRc; + + if (op->sem) { + up(op->sem); + } + } + break; + default: + printk("viotape: wierd ack\n"); + } +} + + +/* Do initialization + */ +int __init viotap_init(void) +{ + DECLARE_MUTEX_LOCKED(Semaphore); + int rc; + char tapename[32]; + int i; + + printk("viotape driver version %d.%d\n", version_major, + version_minor); + + sndMsgSeq = sndMsgAck = 0; + rcvMsgSeq = rcvMsgAck = 0; + opStructList = NULL; + spin_lock_init(&opStructListLock); + + sema_init(&reqSem, VIOTAPE_MAXREQ); + + if (viopath_hostLp == HvLpIndexInvalid) + vio_set_hostlp(); + + /* + * Open to our hosting lp + */ + if (viopath_hostLp == HvLpIndexInvalid) + return -1; + + printk("viotape: init - open path to hosting (%d)\n", + viopath_hostLp); + + rc = viopath_open(viopath_hostLp, viomajorsubtype_tape, VIOTAPE_MAXREQ + 2); + if (rc) { + printk("viotape: error on viopath_open to hostlp %d\n", + rc); + } + + vio_setHandler(viomajorsubtype_tape, vioHandleTapeEvent); + + printk("viotape major is %d\n", viotape_major); + + get_viotape_info(); + + if (devfs_register_chrdev(viotape_major, "viotape", &viotap_fops)) { + printk("Error registering viotape device\n"); + return -1; + } + + for (i = 0; i < viotape_numdev; i++) { + int j; + state[i].cur_part = 0; + for (j = 0; j < MAX_PARTITIONS; ++j) + state[i].part_stat[j].rwi = VIOT_IDLE; + sprintf(tapename, "viotape%d", i); + state[i].dev_handle = + devfs_register(NULL, tapename, DEVFS_FL_DEFAULT, + viotape_major, i, + S_IFCHR | S_IRUSR | S_IWUSR | S_IRGRP | + S_IWGRP, &viotap_fops, NULL); + printk + ("viotape device %s is iSeries resource %10.10s type %4.4s, model %3.3s\n", + tapename, viotape_unitinfo[i].rsrcname, + viotape_unitinfo[i].type, viotape_unitinfo[i].model); + } + + /* + * Create the proc entry + */ + iSeries_proc_callback(&viotape_proc_init); + + return 0; +} + +/* Give a new state to the tape object + */ +static int chg_state(int index, unsigned char new_state, struct file *file) +{ + unsigned char *cur_state = + &state[index].part_stat[state[index].cur_part].rwi; + int rc = 0; + + /* if the same state, don't bother */ + if (*cur_state == new_state) + return 0; + + /* write an EOF if changing from writing to some other state */ + if (*cur_state == VIOT_WRITING) { + struct mtop write_eof = { MTWEOF, 1 }; + rc = viotap_ioctl(NULL, file, MTIOCTOP, + (unsigned long) &write_eof); + } + *cur_state = new_state; + return rc; +} + +/* Cleanup + */ +static void __exit viotap_exit(void) +{ + int i, ret; + for (i = 0; i < viotape_numdev; ++i) + devfs_unregister(state[i].dev_handle); + ret = devfs_unregister_chrdev(viotape_major, "viotape"); + if (ret < 0) + printk("Error unregistering device: %d\n", ret); + iSeries_proc_callback(&viotape_proc_delete); + if (viotape_unitinfo != NULL) { + kfree(viotape_unitinfo); + viotape_unitinfo = NULL; + } + viopath_close(viopath_hostLp, viomajorsubtype_tape, VIOTAPE_MAXREQ + 2); + vio_clearHandler(viomajorsubtype_tape); +} + +MODULE_LICENSE("GPL"); +module_init(viotap_init); +module_exit(viotap_exit); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/Config.in linuxppc64_2_4/drivers/net/Config.in --- ../kernel.org/linux-2.4.19/drivers/net/Config.in Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/drivers/net/Config.in Tue Apr 23 09:37:28 2002 @@ -250,10 +250,6 @@ endmenu -if [ "$CONFIG_PPC_ISERIES" = "y" ]; then - dep_tristate 'iSeries Virtual Ethernet driver support' CONFIG_VETH $CONFIG_PPC_ISERIES -fi - bool 'FDDI driver support' CONFIG_FDDI if [ "$CONFIG_FDDI" = "y" ]; then if [ "$CONFIG_PCI" = "y" -o "$CONFIG_EISA" = "y" ]; then diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/Makefile linuxppc64_2_4/drivers/net/Makefile --- ../kernel.org/linux-2.4.19/drivers/net/Makefile Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/drivers/net/Makefile Tue Apr 23 09:37:28 2002 @@ -73,7 +73,6 @@ obj-$(CONFIG_DM9102) += dmfe.o obj-$(CONFIG_YELLOWFIN) += yellowfin.o obj-$(CONFIG_ACENIC) += acenic.o -obj-$(CONFIG_VETH) += veth.o obj-$(CONFIG_NATSEMI) += natsemi.o obj-$(CONFIG_NS83820) += ns83820.o obj-$(CONFIG_STNIC) += stnic.o 8390.o diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/Space.c linuxppc64_2_4/drivers/net/Space.c --- ../kernel.org/linux-2.4.19/drivers/net/Space.c Wed May 15 08:29:38 2002 +++ linuxppc64_2_4/drivers/net/Space.c Mon May 13 16:39:13 2002 @@ -541,9 +541,10 @@ -#ifdef CONFIG_TR +#if 0 /* ifdef CONFIG_TR */ /* Token-ring device probe */ extern int ibmtr_probe(struct net_device *); +extern int olympic_probe(struct net_device *); extern int smctr_probe(struct net_device *); static int @@ -552,6 +553,9 @@ if (1 #ifdef CONFIG_IBMTR && ibmtr_probe(dev) +#endif +#ifdef CONFIG_IBMOL + && olympic_probe(dev) #endif #ifdef CONFIG_SMCTR && smctr_probe(dev) diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/acenic.c linuxppc64_2_4/drivers/net/acenic.c --- ../kernel.org/linux-2.4.19/drivers/net/acenic.c Fri Apr 19 11:00:36 2002 +++ linuxppc64_2_4/drivers/net/acenic.c Mon Apr 22 10:32:58 2002 @@ -1917,6 +1917,7 @@ atomic_add(i, &ap->cur_rx_bufs); ap->rx_std_skbprd = idx; + mb(); // DRENG if (ACE_IS_TIGON_I(ap)) { struct cmd cmd; cmd.evt = C_SET_RX_PRD_IDX; @@ -2289,7 +2290,7 @@ writel(idx, ®s->RxRetCsm); } ap->cur_rx = idx; - + mb(); // DRENG return; error: idx = rxretprd; @@ -2815,6 +2816,7 @@ wmb(); ap->tx_prd = idx; + mb(); // DRENG prd must be visible before telling HW to advance ace_set_txprd(regs, ap, idx); if (flagsize & BD_FLG_COAL_NOW) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/pcnet32.c linuxppc64_2_4/drivers/net/pcnet32.c --- ../kernel.org/linux-2.4.19/drivers/net/pcnet32.c Fri Apr 19 11:00:36 2002 +++ linuxppc64_2_4/drivers/net/pcnet32.c Mon Apr 22 10:33:07 2002 @@ -55,6 +55,8 @@ #include #include +#define DO_DXSUFLO + /* * PCI device identifiers for "new style" Linux PCI Device Drivers */ @@ -221,8 +223,8 @@ * That translates to 2 (4 == 2^^2) and 4 (16 == 2^^4). */ #ifndef PCNET32_LOG_TX_BUFFERS -#define PCNET32_LOG_TX_BUFFERS 4 -#define PCNET32_LOG_RX_BUFFERS 5 +#define PCNET32_LOG_TX_BUFFERS 6 +#define PCNET32_LOG_RX_BUFFERS 7 #endif #define TX_RING_SIZE (1 << (PCNET32_LOG_TX_BUFFERS)) @@ -233,7 +235,7 @@ #define RX_RING_MOD_MASK (RX_RING_SIZE - 1) #define RX_RING_LEN_BITS ((PCNET32_LOG_RX_BUFFERS) << 4) -#define PKT_BUF_SZ 1544 +#define PKT_BUF_SZ 2048 /* Offsets from base I/O address. */ #define PCNET32_WIO_RDP 0x10 @@ -294,34 +296,34 @@ */ struct pcnet32_private { /* The Tx and Rx ring entries must be aligned on 16-byte boundaries in 32bit mode. */ - struct pcnet32_rx_head rx_ring[RX_RING_SIZE]; - struct pcnet32_tx_head tx_ring[TX_RING_SIZE]; - struct pcnet32_init_block init_block; - dma_addr_t dma_addr; /* DMA address of beginning of this object, - returned by pci_alloc_consistent */ - struct pci_dev *pci_dev; /* Pointer to the associated pci device structure */ - const char *name; + struct pcnet32_rx_head rx_ring[RX_RING_SIZE]; + struct pcnet32_tx_head tx_ring[TX_RING_SIZE]; + struct pcnet32_init_block init_block; + dma_addr_t dma_addr; /* DMA address of beginning of this object, + returned by pci_alloc_consistent */ + struct pci_dev *pci_dev; /* Pointer to the associated pci device structure */ + const char *name; /* The saved address of a sent-in-place packet/buffer, for skfree(). */ - struct sk_buff *tx_skbuff[TX_RING_SIZE]; - struct sk_buff *rx_skbuff[RX_RING_SIZE]; - dma_addr_t tx_dma_addr[TX_RING_SIZE]; - dma_addr_t rx_dma_addr[RX_RING_SIZE]; + struct sk_buff *tx_skbuff[TX_RING_SIZE]; + struct sk_buff *rx_skbuff[RX_RING_SIZE]; + dma_addr_t tx_dma_addr[TX_RING_SIZE]; + dma_addr_t rx_dma_addr[RX_RING_SIZE]; struct pcnet32_access a; - spinlock_t lock; /* Guard lock */ - unsigned int cur_rx, cur_tx; /* The next free ring entry */ - unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ + spinlock_t lock; /* Guard lock */ + unsigned int cur_rx, cur_tx; /* The next free ring entry */ + unsigned int dirty_rx, dirty_tx; /* The ring entries to be free()ed. */ struct net_device_stats stats; - char tx_full; - int options; - int shared_irq:1, /* shared irq possible */ - ltint:1, /* enable TxDone-intr inhibitor */ - dxsuflo:1, /* disable transmit stop on uflo */ - mii:1; /* mii port available */ - struct net_device *next; + char tx_full; + int options; + int shared_irq:1, /* shared irq possible */ + ltint:1, + dxsuflo:1, /* disable transmit stop on uflo */ + mii:1; /* mii port available */ + struct net_device *next; struct mii_if_info mii_if; }; -static void pcnet32_probe_vlbus(void); +static void pcnet32_probe_vlbus(void); static int pcnet32_probe_pci(struct pci_dev *, const struct pci_device_id *); static int pcnet32_probe1(unsigned long, unsigned int, int, struct pci_dev *); static int pcnet32_open(struct net_device *); @@ -342,7 +344,6 @@ PCI_ADDR0=0x10<<0, PCI_ADDR1=0x10<<1, PCI_ADDR2=0x10<<2, PCI_ADDR3=0x10<<3, }; - static u16 pcnet32_wio_read_csr (unsigned long addr, int index) { outw (index, addr+PCNET32_WIO_RAP); @@ -444,15 +445,15 @@ } static struct pcnet32_access pcnet32_dwio = { - read_csr: pcnet32_dwio_read_csr, - write_csr: pcnet32_dwio_write_csr, - read_bcr: pcnet32_dwio_read_bcr, - write_bcr: pcnet32_dwio_write_bcr, - read_rap: pcnet32_dwio_read_rap, - write_rap: pcnet32_dwio_write_rap, + read_csr: pcnet32_dwio_read_csr, + write_csr: pcnet32_dwio_write_csr, + read_bcr: pcnet32_dwio_read_bcr, + write_bcr: pcnet32_dwio_write_bcr, + read_rap: pcnet32_dwio_read_rap, + write_rap: pcnet32_dwio_write_rap, reset: pcnet32_dwio_reset -}; +}; /* only probes for non-PCI devices, the rest are handled by @@ -461,40 +462,44 @@ static void __devinit pcnet32_probe_vlbus(void) { - unsigned int *port, ioaddr; + unsigned long ioaddr = 0; // FIXME dev ? dev->base_addr: 0; + int *port; + + printk(KERN_INFO "pcnet32_probe_vlbus: cards_found=%d\n", cards_found); - /* search for PCnet32 VLB cards at known addresses */ + /* now look for PCnet32 VLB cards */ for (port = pcnet32_portlist; (ioaddr = *port); port++) { if (!check_region(ioaddr, PCNET32_TOTAL_SIZE)) { /* check if there is really a pcnet chip on that ioaddr */ - if ((inb(ioaddr + 14) == 0x57) && (inb(ioaddr + 15) == 0x57)) + if ((inb(ioaddr + 14) == 0x57) && (inb(ioaddr + 15) == 0x57)) pcnet32_probe1(ioaddr, 0, 0, NULL); } } } + static int __devinit pcnet32_probe_pci(struct pci_dev *pdev, const struct pci_device_id *ent) { unsigned long ioaddr; - int err; + int err = 0; err = pci_enable_device(pdev); if (err < 0) { - printk(KERN_ERR PFX "failed to enable device -- err=%d\n", err); - return err; + printk(KERN_ERR PFX "failed to enable device -- err=%d\n", err); + return err; } pci_set_master(pdev); ioaddr = pci_resource_start (pdev, 0); if (!ioaddr) { - printk (KERN_ERR PFX "card has no PCI IO resources, aborting\n"); + printk (KERN_ERR "card has no PCI IO resources, aborting\n"); return -ENODEV; } - + if (!pci_dma_supported(pdev, PCNET32_DMA_MASK)) { - printk(KERN_ERR PFX "architecture does not support 32bit PCI busmaster DMA\n"); + printk(KERN_ERR "pcnet32.c: architecture does not support 32bit PCI busmaster DMA\n"); return -ENODEV; } @@ -508,13 +513,13 @@ */ static int __devinit pcnet32_probe1(unsigned long ioaddr, unsigned int irq_line, int shared, - struct pci_dev *pdev) + struct pci_dev *pdev) { struct pcnet32_private *lp; struct resource *res; dma_addr_t lp_dma_addr; - int i, media; - int fdx, mii, fset, dxsuflo, ltint; + int i,media; + int fdx = 0, mii = 0, fset = 0, dxsuflo=0, ltint=0; int chip_version; char *chipname; struct net_device *dev; @@ -522,25 +527,27 @@ u8 promaddr[6]; /* reset the chip */ + pcnet32_dwio_reset(ioaddr); + udelay (100); pcnet32_wio_reset(ioaddr); - /* NOTE: 16-bit check is first, otherwise some older PCnet chips fail */ - if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && pcnet32_wio_check(ioaddr)) { - a = &pcnet32_wio; + /* Important to do the check for dwio mode first. */ + if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { + a = &pcnet32_dwio; } else { - pcnet32_dwio_reset(ioaddr); - if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { - a = &pcnet32_dwio; + if (pcnet32_wio_read_csr(ioaddr, 0) == 4 && + pcnet32_wio_check(ioaddr)) { + a = &pcnet32_wio; } else return -ENODEV; } - chip_version = a->read_csr(ioaddr, 88) | (a->read_csr(ioaddr,89) << 16); + chip_version = a->read_csr(ioaddr, 88) | (a->read_csr (ioaddr,89) << 16); if (pcnet32_debug > 2) printk(KERN_INFO " PCnet chip version is %#x.\n", chip_version); if ((chip_version & 0xfff) != 0x003) return -ENODEV; - + /* initialize variables */ fdx = mii = fset = dxsuflo = ltint = 0; chip_version = (chip_version >> 12) & 0xffff; @@ -610,20 +617,27 @@ * one for latency - although on PCI this isnt a big loss. Older chips * have FIFO's smaller than a packet, so you can't do this. */ - + /* + * UPDATE + * Got to make sure that BCR18:MEMCMD, BCR18:BREADE, BCR18:BWRITE are + * set on a PCI + */ if(fset) { - a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0x0800)); - a->write_csr(ioaddr, 80, (a->read_csr(ioaddr, 80) & 0x0C00) | 0x0c00); - dxsuflo = 1; - ltint = 1; + a->write_bcr(ioaddr, 18, (a->read_bcr(ioaddr, 18) | 0xA60)); + a->write_csr(ioaddr, 3, 0x2eb7); + a->write_csr(ioaddr, 4, 0x32ea); + a->write_csr(ioaddr, 80, 0x3f00); + + dxsuflo = 1; + ltint = 1; } dev = alloc_etherdev(0); if(!dev) return -ENOMEM; - printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); + printk(KERN_INFO "%s at %#3lx,", chipname, ioaddr); /* In most chips, after a chip reset, the ethernet address is read from the * station address PROM at the base address and programmed into the @@ -632,6 +646,7 @@ * they disagree with the CSRs. Either way, we use the CSR values, and * double check that they are valid. */ +#ifndef CONFIG_PPC for (i = 0; i < 3; i++) { unsigned int val; val = a->read_csr(ioaddr, i+12) & 0x0ffff; @@ -639,28 +654,29 @@ dev->dev_addr[2*i] = val & 0x0ff; dev->dev_addr[2*i+1] = (val >> 8) & 0x0ff; } +#endif /* read PROM address and compare with CSR address */ - for (i = 0; i < 6; i++) + for (i = 0; i < 6; i++) { promaddr[i] = inb(ioaddr + i); - + if( memcmp( promaddr, dev->dev_addr, 6) - || !is_valid_ether_addr(dev->dev_addr) ) { -#ifndef __powerpc__ + || !is_valid_ether_addr(dev->dev_addr) ) { +#ifndef __powerpc__ if( is_valid_ether_addr(promaddr) ){ #else - if( !is_valid_ether_addr(dev->dev_addr) - && is_valid_ether_addr(promaddr)) { + if (!is_valid_ether_addr(dev->dev_addr) + && is_valid_ether_addr(promaddr)) { #endif - printk(" warning: CSR address invalid,\n"); - printk(KERN_INFO " using instead PROM address of"); - memcpy(dev->dev_addr, promaddr, 6); + printk(" warning: CSR address invalid,\n"); + printk(KERN_INFO " using instead PROM address of"); + memcpy(dev->dev_addr, promaddr, 6); } - } + } /* if the ethernet address is not valid, force to 00:00:00:00:00:00 */ if( !is_valid_ether_addr(dev->dev_addr) ) - memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); + memset(dev->dev_addr, 0, sizeof(dev->dev_addr)); for (i = 0; i < 6; i++) printk(" %2.2x", dev->dev_addr[i] ); @@ -800,6 +816,7 @@ cards_found++; return 0; } +} static int @@ -893,7 +910,7 @@ lp->init_block.filter[1] = 0x00000000; if (pcnet32_init_ring(dev)) return -ENOMEM; - + /* Re-initialize the PCNET32, and start it when done. */ lp->a.write_csr (ioaddr, 1, (lp->dma_addr + offsetof(struct pcnet32_private, init_block)) &0xffff); lp->a.write_csr (ioaddr, 2, (lp->dma_addr + offsetof(struct pcnet32_private, init_block)) >> 16); @@ -975,7 +992,10 @@ } skb_reserve (rx_skbuff, 2); } - lp->rx_dma_addr[i] = pci_map_single(lp->pci_dev, rx_skbuff->tail, rx_skbuff->len, PCI_DMA_FROMDEVICE); + + if (lp->rx_dma_addr[i] == NULL) + lp->rx_dma_addr[i] = pci_map_single(lp->pci_dev, rx_skbuff->tail, PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); + lp->rx_ring[i].base = (u32)le32_to_cpu(lp->rx_dma_addr[i]); lp->rx_ring[i].buf_length = le16_to_cpu(-PKT_BUF_SZ); lp->rx_ring[i].status = le16_to_cpu(0x8000); @@ -1010,7 +1030,7 @@ /* ReInit Ring */ lp->a.write_csr (ioaddr, 0, 1); i = 0; - while (i++ < 100) + while (1) if (lp->a.read_csr (ioaddr, 0) & 0x0100) break; @@ -1024,10 +1044,10 @@ struct pcnet32_private *lp = dev->priv; unsigned long ioaddr = dev->base_addr, flags; - spin_lock_irqsave(&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); /* Transmitter timeout, serious problems. */ printk(KERN_ERR "%s: transmit timed out, status %4.4x, resetting.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); + dev->name, lp->a.read_csr (ioaddr, 0)); lp->a.write_csr (ioaddr, 0, 0x0004); lp->stats.tx_errors++; if (pcnet32_debug > 2) { @@ -1050,7 +1070,7 @@ dev->trans_start = jiffies; netif_start_queue(dev); - spin_unlock_irqrestore(&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); } @@ -1065,7 +1085,7 @@ if (pcnet32_debug > 3) { printk(KERN_DEBUG "%s: pcnet32_start_xmit() called, csr0 %4.4x.\n", - dev->name, lp->a.read_csr(ioaddr, 0)); + dev->name, lp->a.read_csr (ioaddr, 0)); } spin_lock_irqsave(&lp->lock, flags); @@ -1310,12 +1330,12 @@ if ((newskb = dev_alloc_skb (PKT_BUF_SZ))) { skb_reserve (newskb, 2); skb = lp->rx_skbuff[entry]; - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[entry], skb->len, PCI_DMA_FROMDEVICE); skb_put (skb, pkt_len); lp->rx_skbuff[entry] = newskb; newskb->dev = dev; + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[entry], PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); lp->rx_dma_addr[entry] = - pci_map_single(lp->pci_dev, newskb->tail, + pci_map_single(lp->pci_dev, newskb->tail, newskb->len, PCI_DMA_FROMDEVICE); lp->rx_ring[entry].base = le32_to_cpu(lp->rx_dma_addr[entry]); rx_in_place = 1; @@ -1369,7 +1389,7 @@ static int pcnet32_close(struct net_device *dev) { - unsigned long ioaddr = dev->base_addr; + unsigned long ioaddr = dev->base_addr, flags; struct pcnet32_private *lp = dev->priv; int i; @@ -1390,13 +1410,23 @@ */ lp->a.write_bcr (ioaddr, 20, 4); + /* + * FIXME: What happens if the bcr write is posted, the buffers are + * freed and there is still incoming DMA traffic + */ + +#warning "PCI posting bug" + free_irq(dev->irq, dev); - + + /* Lock after free_irq to avoid deadlock with interrupt handler. */ + spin_lock_irqsave(&lp->lock, flags); + /* free all allocated skbuffs */ for (i = 0; i < RX_RING_SIZE; i++) { lp->rx_ring[i].status = 0; if (lp->rx_skbuff[i]) { - pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], lp->rx_skbuff[i]->len, PCI_DMA_FROMDEVICE); + pci_unmap_single(lp->pci_dev, lp->rx_dma_addr[i], PKT_BUF_SZ-2, PCI_DMA_FROMDEVICE); dev_kfree_skb(lp->rx_skbuff[i]); } lp->rx_skbuff[i] = NULL; @@ -1412,6 +1442,8 @@ lp->tx_dma_addr[i] = 0; } + spin_unlock_irqrestore(&lp->lock, flags); + MOD_DEC_USE_COUNT; return 0; @@ -1505,13 +1537,13 @@ if (!lp->mii) return 0; - + phyaddr = lp->a.read_bcr(ioaddr, 33); - lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); + lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); val_out = lp->a.read_bcr(ioaddr, 34); lp->a.write_bcr(ioaddr, 33, phyaddr); - + return val_out; } @@ -1523,7 +1555,7 @@ if (!lp->mii) return; - + phyaddr = lp->a.read_bcr(ioaddr, 33); lp->a.write_bcr(ioaddr, 33, ((phy_id & 0x1f) << 5) | (reg_num & 0x1f)); @@ -1549,76 +1581,76 @@ return -EFAULT; switch (ethcmd) { - case ETHTOOL_GDRVINFO: { - struct ethtool_drvinfo info = { ETHTOOL_GDRVINFO }; - strcpy (info.driver, DRV_NAME); - strcpy (info.version, DRV_VERSION); - if (lp->pci_dev) - strcpy (info.bus_info, lp->pci_dev->slot_name); - else - sprintf(info.bus_info, "VLB 0x%lx", dev->base_addr); - if (copy_to_user (useraddr, &info, sizeof (info))) - return -EFAULT; - return 0; - } - - /* get settings */ - case ETHTOOL_GSET: { - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; - spin_lock_irq(&lp->lock); - mii_ethtool_gset(&lp->mii_if, &ecmd); - spin_unlock_irq(&lp->lock); - if (copy_to_user(useraddr, &ecmd, sizeof(ecmd))) - return -EFAULT; - return 0; - } - /* set settings */ - case ETHTOOL_SSET: { - int r; - struct ethtool_cmd ecmd; - if (copy_from_user(&ecmd, useraddr, sizeof(ecmd))) - return -EFAULT; - spin_lock_irq(&lp->lock); - r = mii_ethtool_sset(&lp->mii_if, &ecmd); - spin_unlock_irq(&lp->lock); - return r; - } - /* restart autonegotiation */ - case ETHTOOL_NWAY_RST: { - return mii_nway_restart(&lp->mii_if); - } - /* get link status */ - case ETHTOOL_GLINK: { - struct ethtool_value edata = {ETHTOOL_GLINK}; - edata.data = mii_link_ok(&lp->mii_if); - if (copy_to_user(useraddr, &edata, sizeof(edata))) + case ETHTOOL_GDRVINFO: { + struct ethtool_drvinfo info = { ETHTOOL_GDRVINFO }; + strcpy (info.driver, DRV_NAME); + strcpy (info.version, DRV_VERSION); + if (lp->pci_dev) + strcpy (info.bus_info, lp->pci_dev->slot_name); + else + sprintf(info.bus_info, "VLB 0x%lx", dev->base_addr); + if (copy_to_user (useraddr, &info, sizeof (info))) + return -EFAULT; + return 0; + } + /* get settings */ + case ETHTOOL_GSET: { + struct ethtool_cmd ecmd = { ETHTOOL_GSET }; + spin_lock_irq(&lp->lock); + mii_ethtool_gset(&lp->mii_if, &ecmd); + spin_unlock_irq(&lp->lock); + if (copy_to_user(useraddr, &ecmd, sizeof(ecmd))) + return -EFAULT; + return 0; + } + /* set settings */ + case ETHTOOL_SSET: { + int r; + struct ethtool_cmd ecmd; + if (copy_from_user(&ecmd, useraddr, sizeof(ecmd))) + return -EFAULT; + spin_lock_irq(&lp->lock); + r = mii_ethtool_sset(&lp->mii_if, &ecmd); + spin_unlock_irq(&lp->lock); + return r; + } + /* restart autonegotiation */ + case ETHTOOL_NWAY_RST: { + return mii_nway_restart(&lp->mii_if); + } + /* get link status */ + case ETHTOOL_GLINK: { + struct ethtool_value edata = {ETHTOOL_GLINK}; + edata.data = mii_link_ok(&lp->mii_if); + if (copy_to_user(useraddr, &edata, sizeof(edata))) return -EFAULT; - return 0; - } + return 0; + } - /* get message-level */ - case ETHTOOL_GMSGLVL: { - struct ethtool_value edata = {ETHTOOL_GMSGLVL}; - edata.data = pcnet32_debug; - if (copy_to_user(useraddr, &edata, sizeof(edata))) - return -EFAULT; - return 0; - } - /* set message-level */ - case ETHTOOL_SMSGLVL: { - struct ethtool_value edata; - if (copy_from_user(&edata, useraddr, sizeof(edata))) - return -EFAULT; - pcnet32_debug = edata.data; - return 0; - } - default: - break; + /* get message-level */ + case ETHTOOL_GMSGLVL: { + struct ethtool_value edata = {ETHTOOL_GMSGLVL}; + edata.data = pcnet32_debug; + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; + } + /* set message-level */ + case ETHTOOL_SMSGLVL: { + struct ethtool_value edata; + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + pcnet32_debug = edata.data; + return 0; + } + default: + break; } - return -EOPNOTSUPP; +return -EOPNOTSUPP; } + static int pcnet32_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) { unsigned long ioaddr = dev->base_addr; @@ -1626,26 +1658,29 @@ struct mii_ioctl_data *data = (struct mii_ioctl_data *)&rq->ifr_data; int phyaddr = lp->a.read_bcr (ioaddr, 33); - if (cmd == SIOCETHTOOL) - return pcnet32_ethtool_ioctl(dev, (void *) rq->ifr_data); + if (cmd == SIOCETHTOOL) + return pcnet32_ethtool_ioctl(dev, (void *) rq->ifr_data); if (lp->mii) { switch(cmd) { - case SIOCGMIIPHY: /* Get address of MII PHY in use. */ + case SIOCGMIIPHY: /* Get the address of the PHY in use. */ data->phy_id = (phyaddr >> 5) & 0x1f; /* Fall Through */ - case SIOCGMIIREG: /* Read MII PHY register. */ + + case SIOCGMIIREG: /* Read the specified MII register. */ lp->a.write_bcr (ioaddr, 33, ((data->phy_id & 0x1f) << 5) | (data->reg_num & 0x1f)); data->val_out = lp->a.read_bcr (ioaddr, 34); lp->a.write_bcr (ioaddr, 33, phyaddr); return 0; - case SIOCSMIIREG: /* Write MII PHY register. */ + + case SIOCSMIIREG: /* Write the specified MII register */ if (!capable(CAP_NET_ADMIN)) return -EPERM; lp->a.write_bcr (ioaddr, 33, ((data->phy_id & 0x1f) << 5) | (data->reg_num & 0x1f)); lp->a.write_bcr (ioaddr, 34, data->val_in); lp->a.write_bcr (ioaddr, 33, phyaddr); return 0; + default: return -EOPNOTSUPP; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/net/tokenring/olympic.c linuxppc64_2_4/drivers/net/tokenring/olympic.c --- ../kernel.org/linux-2.4.19/drivers/net/tokenring/olympic.c Wed May 15 08:29:39 2002 +++ linuxppc64_2_4/drivers/net/tokenring/olympic.c Mon May 13 16:39:14 2002 @@ -311,20 +311,20 @@ writel(readl(olympic_mmio+BCTL)|BCTL_MIMREB,olympic_mmio+BCTL); if (olympic_priv->olympic_ring_speed == 0) { /* Autosense */ - writel(readl(olympic_mmio+GPR)|GPR_AUTOSENSE,olympic_mmio+GPR); + writew(readw(olympic_mmio+GPR)|GPR_AUTOSENSE,olympic_mmio+GPR); if (olympic_priv->olympic_message_level) printk(KERN_INFO "%s: Ringspeed autosense mode on\n",olympic_priv->olympic_card_name); } else if (olympic_priv->olympic_ring_speed == 16) { if (olympic_priv->olympic_message_level) printk(KERN_INFO "%s: Trying to open at 16 Mbps as requested\n", olympic_priv->olympic_card_name); - writel(GPR_16MBPS, olympic_mmio+GPR); + writew(GPR_16MBPS, olympic_mmio+GPR); } else if (olympic_priv->olympic_ring_speed == 4) { if (olympic_priv->olympic_message_level) printk(KERN_INFO "%s: Trying to open at 4 Mbps as requested\n", olympic_priv->olympic_card_name) ; - writel(0, olympic_mmio+GPR); + writew(0, olympic_mmio+GPR); } - writel(readl(olympic_mmio+GPR)|GPR_NEPTUNE_BF,olympic_mmio+GPR); + writew(readw(olympic_mmio+GPR)|GPR_NEPTUNE_BF,olympic_mmio+GPR); #if OLYMPIC_DEBUG printk("GPR = %x\n",readw(olympic_mmio + GPR) ) ; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/pci/pci.c linuxppc64_2_4/drivers/pci/pci.c --- ../kernel.org/linux-2.4.19/drivers/pci/pci.c Fri Apr 19 11:00:45 2002 +++ linuxppc64_2_4/drivers/pci/pci.c Mon Apr 22 10:33:13 2002 @@ -1079,8 +1079,8 @@ res = child->resource[0]; pci_read_config_byte(dev, PCI_IO_BASE, &io_base_lo); pci_read_config_byte(dev, PCI_IO_LIMIT, &io_limit_lo); - base = (io_base_lo & PCI_IO_RANGE_MASK) << 8; - limit = (io_limit_lo & PCI_IO_RANGE_MASK) << 8; + base = (unsigned long)(io_base_lo & PCI_IO_RANGE_MASK) << 8; + limit = (unsigned long)(io_limit_lo & PCI_IO_RANGE_MASK) << 8; if ((io_base_lo & PCI_IO_RANGE_TYPE_MASK) == PCI_IO_RANGE_TYPE_32) { u16 io_base_hi, io_limit_hi; @@ -1107,8 +1107,8 @@ res = child->resource[1]; pci_read_config_word(dev, PCI_MEMORY_BASE, &mem_base_lo); pci_read_config_word(dev, PCI_MEMORY_LIMIT, &mem_limit_lo); - base = (mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; - limit = (mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16; + base = (unsigned long)(mem_base_lo & PCI_MEMORY_RANGE_MASK) << 16; + limit = (unsigned long)(mem_limit_lo & PCI_MEMORY_RANGE_MASK) << 16; if (base && base <= limit) { res->flags = (mem_base_lo & PCI_MEMORY_RANGE_TYPE_MASK) | IORESOURCE_MEM; res->start = base; @@ -1123,16 +1123,16 @@ res = child->resource[2]; pci_read_config_word(dev, PCI_PREF_MEMORY_BASE, &mem_base_lo); pci_read_config_word(dev, PCI_PREF_MEMORY_LIMIT, &mem_limit_lo); - base = (mem_base_lo & PCI_PREF_RANGE_MASK) << 16; - limit = (mem_limit_lo & PCI_PREF_RANGE_MASK) << 16; + base = (unsigned long)(mem_base_lo & PCI_PREF_RANGE_MASK) << 16; + limit = (unsigned long)(mem_limit_lo & PCI_PREF_RANGE_MASK) << 16; if ((mem_base_lo & PCI_PREF_RANGE_TYPE_MASK) == PCI_PREF_RANGE_TYPE_64) { u32 mem_base_hi, mem_limit_hi; pci_read_config_dword(dev, PCI_PREF_BASE_UPPER32, &mem_base_hi); pci_read_config_dword(dev, PCI_PREF_LIMIT_UPPER32, &mem_limit_hi); #if BITS_PER_LONG == 64 - base |= ((long) mem_base_hi) << 32; - limit |= ((long) mem_limit_hi) << 32; + base |= ((unsigned long) mem_base_hi) << 32; + limit |= ((unsigned long) mem_limit_hi) << 32; #else if (mem_base_hi || mem_limit_hi) { printk(KERN_ERR "PCI: Unable to handle 64-bit address space for %s\n", child->name); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/pci/pci.ids linuxppc64_2_4/drivers/pci/pci.ids --- ../kernel.org/linux-2.4.19/drivers/pci/pci.ids Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/drivers/pci/pci.ids Tue Apr 23 09:37:29 2002 @@ -556,6 +556,7 @@ 0022 IBM27-82351 002d Python 002e ServeRAID-3x + 0031 Serial Adapter 0036 Miami 003a CPU to PCI Bridge 003e 16/4 Token ring UTP/STP controller diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/scsi/sr_ioctl.c linuxppc64_2_4/drivers/scsi/sr_ioctl.c --- ../kernel.org/linux-2.4.19/drivers/scsi/sr_ioctl.c Fri Apr 19 10:59:57 2002 +++ linuxppc64_2_4/drivers/scsi/sr_ioctl.c Mon Apr 22 10:33:14 2002 @@ -334,7 +334,12 @@ { u_char sr_cmd[10]; int result, target = MINOR(cdi->dev); - unsigned char buffer[32]; + unsigned char *buffer = scsi_malloc(512); + + if (buffer == NULL) { + printk("SCSI DMA pool exhausted."); + return -ENOMEM; + } memset(sr_cmd, 0, sizeof(sr_cmd)); @@ -407,6 +412,7 @@ return -EINVAL; } + scsi_free(buffer, 512); #if 0 if (result) printk("DEBUG: sr_audio: result for ioctl %x: %x\n", cmd, result); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/sound/dmabuf.c linuxppc64_2_4/drivers/sound/dmabuf.c --- ../kernel.org/linux-2.4.19/drivers/sound/dmabuf.c Fri Apr 19 10:30:24 2002 +++ linuxppc64_2_4/drivers/sound/dmabuf.c Mon Feb 25 08:44:33 2002 @@ -113,7 +113,7 @@ } } dmap->raw_buf = start_addr; - dmap->raw_buf_phys = virt_to_bus(start_addr); + dmap->raw_buf_phys = pci_map_single(NULL, start_addr, dmap->buffsize, PCI_DMA_BIDIRECTIONAL); for (page = virt_to_page(start_addr); page <= virt_to_page(end_addr); page++) mem_map_reserve(page); @@ -134,6 +134,8 @@ start_addr = (unsigned long) dmap->raw_buf; end_addr = start_addr + dmap->buffsize; + + pci_unmap_single(NULL, dmap->raw_buf_phys, dmap->buffsize, PCI_DMA_BIDIRECTIONAL); for (page = virt_to_page(start_addr); page <= virt_to_page(end_addr); page++) mem_map_unreserve(page); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/drivers/video/offb.c linuxppc64_2_4/drivers/video/offb.c --- ../kernel.org/linux-2.4.19/drivers/video/offb.c Fri Apr 19 10:30:28 2002 +++ linuxppc64_2_4/drivers/video/offb.c Wed Nov 14 21:23:38 2001 @@ -430,7 +430,7 @@ info->cmap_type = cmap_unknown; if (depth == 8) { - /* XXX kludge for ati's */ + /* XXX kludge for ati */ if (dp && !strncmp(name, "ATY,Rage128", 11)) { unsigned long regbase = dp->addrs[2].address; info->cmap_adr = ioremap(regbase, 0x1FFF); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/Config.in linuxppc64_2_4/fs/Config.in --- ../kernel.org/linux-2.4.19/fs/Config.in Fri Apr 19 11:00:46 2002 +++ linuxppc64_2_4/fs/Config.in Mon Apr 22 10:35:08 2002 @@ -52,6 +52,8 @@ dep_mbool ' Transparent decompression extension' CONFIG_ZISOFS $CONFIG_ISO9660_FS tristate 'Minix fs support' CONFIG_MINIX_FS +tristate 'JFS filesystem support' CONFIG_JFS_FS +dep_mbool ' JFS debugging' CONFIG_JFS_DEBUG $CONFIG_JFS_FS tristate 'FreeVxFS file system support (VERITAS VxFS(TM) compatible)' CONFIG_VXFS_FS tristate 'NTFS file system support (read only)' CONFIG_NTFS_FS diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/Makefile linuxppc64_2_4/fs/Makefile --- ../kernel.org/linux-2.4.19/fs/Makefile Fri Apr 19 10:29:59 2002 +++ linuxppc64_2_4/fs/Makefile Thu Feb 21 20:57:39 2002 @@ -67,6 +67,7 @@ subdir-$(CONFIG_REISERFS_FS) += reiserfs subdir-$(CONFIG_DEVPTS_FS) += devpts subdir-$(CONFIG_SUN_OPENPROMFS) += openpromfs +subdir-$(CONFIG_JFS_FS) += jfs obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/binfmt_elf.c linuxppc64_2_4/fs/binfmt_elf.c --- ../kernel.org/linux-2.4.19/fs/binfmt_elf.c Wed May 15 08:29:42 2002 +++ linuxppc64_2_4/fs/binfmt_elf.c Mon May 13 16:41:07 2002 @@ -440,6 +440,7 @@ unsigned int size; unsigned long elf_entry, interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; + unsigned long reloc_func_desc = 0; struct elfhdr elf_ex; struct elfhdr interp_elf_ex; struct exec interp_ex; @@ -536,8 +537,6 @@ interp_ex = *((struct exec *) bprm->buf); interp_elf_ex = *((struct elfhdr *) bprm->buf); break; - } else { - SET_PERSONALITY(elf_ex, ibcs2_interpreter); } elf_ppnt++; } @@ -669,6 +668,7 @@ load_bias += error - ELF_PAGESTART(load_bias + vaddr); load_addr += load_bias; + reloc_func_desc = load_addr; } } k = elf_ppnt->p_vaddr; @@ -715,6 +715,7 @@ send_sig(SIGSEGV, current, 0); return 0; } + reloc_func_desc = interp_load_addr; } kfree(elf_phdata); @@ -777,10 +778,14 @@ /* * The ABI may specify that certain registers be set up in special * ways (on i386 %edx is the address of a DT_FINI function, for - * example. This macro performs whatever initialization to - * the regs structure is required. + * example. In addition, it may also specify (eg, PowerPC64 ELF) + * that the e_entry field is the address of the function descriptor + * for the startup routine, rather than the address of the startup + * routine itself. This macro performs whatever initialization to + * the regs structure is required as well as any relocations to the + * function descriptor entries when executing dynamically links apps. */ - ELF_PLAT_INIT(regs); + ELF_PLAT_INIT(regs, reloc_func_desc); #endif start_thread(regs, elf_entry, bprm->p); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/buffer.c linuxppc64_2_4/fs/buffer.c --- ../kernel.org/linux-2.4.19/fs/buffer.c Wed May 15 08:29:42 2002 +++ linuxppc64_2_4/fs/buffer.c Mon May 13 16:41:08 2002 @@ -2761,7 +2761,7 @@ bh_hash_shift++; hash_table = (struct buffer_head **) - __get_free_pages(GFP_ATOMIC, order); + vmalloc(PAGE_SIZE << order); } while (hash_table == NULL && --order > 0); printk("Buffer-cache hash table entries: %d (order: %d, %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/dcache.c linuxppc64_2_4/fs/dcache.c --- ../kernel.org/linux-2.4.19/fs/dcache.c Wed May 15 08:29:42 2002 +++ linuxppc64_2_4/fs/dcache.c Mon May 13 16:41:08 2002 @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -1206,7 +1207,7 @@ d_hash_shift++; dentry_hashtable = (struct list_head *) - __get_free_pages(GFP_ATOMIC, order); + vmalloc(PAGE_SIZE << order); } while (dentry_hashtable == NULL && --order >= 0); printk(KERN_INFO "Dentry cache hash table entries: %d (order: %ld, %ld bytes)\n", diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/exec.c linuxppc64_2_4/fs/exec.c --- ../kernel.org/linux-2.4.19/fs/exec.c Fri Apr 19 11:00:23 2002 +++ linuxppc64_2_4/fs/exec.c Mon Apr 22 10:35:08 2002 @@ -313,7 +313,7 @@ mpnt->vm_mm = current->mm; mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; mpnt->vm_end = STACK_TOP; - mpnt->vm_page_prot = PAGE_COPY; + mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0xf]; mpnt->vm_flags = VM_STACK_FLAGS; mpnt->vm_ops = NULL; mpnt->vm_pgoff = 0; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/inode.c linuxppc64_2_4/fs/inode.c --- ../kernel.org/linux-2.4.19/fs/inode.c Wed May 15 08:29:42 2002 +++ linuxppc64_2_4/fs/inode.c Mon May 13 16:41:08 2002 @@ -17,6 +17,7 @@ #include #include #include +#include /* * New inode.c implementation. @@ -1148,7 +1149,7 @@ i_hash_shift++; inode_hashtable = (struct list_head *) - __get_free_pages(GFP_ATOMIC, order); + vmalloc(PAGE_SIZE << order); } while (inode_hashtable == NULL && --order >= 0); printk(KERN_INFO "Inode cache hash table entries: %d (order: %ld, %ld bytes)\n", diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/Makefile linuxppc64_2_4/fs/jfs/Makefile --- ../kernel.org/linux-2.4.19/fs/jfs/Makefile Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/Makefile Wed Nov 14 10:19:35 2001 @@ -0,0 +1,20 @@ +# +# Makefile for the Linux JFS filesystem routines. +# +# Note! Dependencies are done automagically by 'make dep', which also +# removes any old dependencies. DON'T put your own dependencies here +# unless it's something special (not a .c file). +# +# Note 2! The CFLAGS definitions are now in the main makefile. + +O_TARGET := jfs.o +obj-y := super.o file.o inode.o namei.o jfs_mount.o jfs_umount.o \ + jfs_xtree.o jfs_imap.o jfs_debug.o jfs_dmap.o \ + jfs_unicode.o jfs_dtree.o jfs_inode.o \ + jfs_extent.o symlink.o jfs_metapage.o \ + jfs_logmgr.o jfs_txnmgr.o jfs_uniupr.o +obj-m := $(O_TARGET) + +EXTRA_CFLAGS += -D_JFS_4K + +include $(TOPDIR)/Rules.make diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/dir.c linuxppc64_2_4/fs/jfs/dir.c --- ../kernel.org/linux-2.4.19/fs/jfs/dir.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/dir.c Wed Nov 14 10:19:35 2001 @@ -0,0 +1,112 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int jfs_create(struct inode *, struct dentry *, int); +extern int jfs_mkdir(struct inode *, struct dentry *, int); +extern int jfs_unlink(struct inode *, struct dentry *); +extern int jfs_rmdir(struct inode *, struct dentry *); +extern int jfs_link(struct dentry *, struct inode *, struct dentry *); +extern int jfs_symlink(struct inode *, struct dentry *, const char *); +extern int jfs_rename(struct inode *, struct dentry *, struct inode *, + struct dentry *); +extern int jfs_mknod(struct inode *, struct dentry *, int, int); +extern int jfs_fsync_file(struct file *, struct dentry *, int); + +static ssize_t jfs_dir_read(struct file *filp, + char *buf, size_t count, loff_t * ppos) +{ + return -EISDIR; +} + +struct file_operations jfs_dir_operations = { + fsync: jfs_fsync_file, + read: jfs_dir_read, + readdir: jfs_readdir, +}; + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry) +{ + btstack_t btstack; + ino_t inum; + struct inode *ip; + component_t key; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + int rc; + + jFYI(1, ("jfs_lookup: name = %s\n", name)); + + + if ((name[0] == '.') && (len == 1)) + inum = dip->i_ino; + else if (strcmp(name, "..") == 0) + inum = PARENT(dip); + else { + if ((rc = + get_UCSname(&key, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + return ERR_PTR(-rc); + IREAD_LOCK(dip); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + IREAD_UNLOCK(dip); + free_UCSname(&key); + if (rc == ENOENT) { + d_add(dentry, NULL); + return ERR_PTR(0); + } else if (rc) { + jERROR(1, + ("jfs_lookup: dtSearch returned %d\n", rc)); + return ERR_PTR(-rc); + } + } + + ip = iget(dip->i_sb, inum); + if (ip == NULL) { + jERROR(1, + ("jfs_lookup: iget failed on inum %d\n", + (uint) inum)); + return ERR_PTR(-EACCES); + } + + d_add(dentry, ip); + + return ERR_PTR(0); +} + +struct inode_operations jfs_dir_inode_operations = { + create: jfs_create, + lookup: jfs_lookup, + link: jfs_link, + unlink: jfs_unlink, + symlink: jfs_symlink, + mkdir: jfs_mkdir, + rmdir: jfs_rmdir, + mknod: jfs_mknod, + rename: jfs_rename, +}; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/endian24.h linuxppc64_2_4/fs/jfs/endian24.h --- ../kernel.org/linux-2.4.19/fs/jfs/endian24.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/endian24.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,50 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _H_ENDIAN24 +#define _H_ENDIAN24 + +/* + * fs/jfs/endian24.h: + * + * Endian conversion for 24-byte data + * + */ +#define __swab24(x) \ +({ \ + __u32 __x = (x); \ + ((__u32)( \ + ((__x & (__u32)0x000000ffUL) << 16) | \ + (__x & (__u32)0x0000ff00UL) | \ + ((__x & (__u32)0x00ff0000UL) >> 16) )); \ +}) + +#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN)) + #define __cpu_to_le24(x) ((__u32)(x)) + #define __le24_to_cpu(x) ((__u32)(x)) +#else + #define __cpu_to_le24(x) __swab24(x) + #define __le24_to_cpu(x) __swab24(x) +#endif + +#ifdef __KERNEL__ + #define cpu_to_le24 __cpu_to_le24 + #define le24_to_cpu __le24_to_cpu +#endif + +#endif /* !_H_ENDIAN24 */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/file.c linuxppc64_2_4/fs/jfs/file.c --- ../kernel.org/linux-2.4.19/fs/jfs/file.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/file.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,105 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +extern int generic_file_open(struct inode *, struct file *) __weak; +extern loff_t generic_file_llseek(struct file *, loff_t, int origin) __weak; + +extern int jfs_commit_inode(struct inode *, int); + +int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + int rc = 0; + + rc = fsync_inode_data_buffers(inode); + + if (!(inode->i_state & I_DIRTY)) + return rc; + if (datasync || !(inode->i_state & I_DIRTY_DATASYNC)) + return rc; + + IWRITE_LOCK(inode); + rc |= jfs_commit_inode(inode, 1); + IWRITE_UNLOCK(inode); + + return rc ? -EIO : 0; +} + +struct file_operations jfs_file_operations = { + open: generic_file_open, + llseek: generic_file_llseek, + write: generic_file_write, + read: generic_file_read, + mmap: generic_file_mmap, + fsync: jfs_fsync, +}; + +/* + * Guts of jfs_truncate. Called with locks already held. Can be called + * with directory for truncating directory index table. + */ +void jfs_truncate_nolock(struct inode *ip, loff_t length) +{ + loff_t newsize; + tid_t tid; + + ASSERT(length >= 0); + + if (test_cflag(COMMIT_Nolink, ip)) { + xtTruncate(0, ip, length, COMMIT_WMAP); + return; + } + + do { + tid = txBegin(ip->i_sb, 0); + + newsize = xtTruncate(tid, ip, length, + COMMIT_TRUNCATE | COMMIT_PWMAP); + if (newsize < 0) { + txEnd(tid); + break; + } + + ip->i_mtime = ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(ip); + + txCommit(tid, 1, &ip, 0); + txEnd(tid); + } while (newsize > length); /* Truncate isn't always atomic */ +} + +static void jfs_truncate(struct inode *ip) +{ + jFYI(1, ("jfs_truncate: size = 0x%lx\n", (ulong) ip->i_size)); + + IWRITE_LOCK(ip); + jfs_truncate_nolock(ip, ip->i_size); + IWRITE_UNLOCK(ip); +} + +struct inode_operations jfs_file_inode_operations = { + truncate: jfs_truncate, +}; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/inode.c linuxppc64_2_4/fs/jfs/inode.c --- ../kernel.org/linux-2.4.19/fs/jfs/inode.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/inode.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,329 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_extent.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + + +extern struct inode_operations jfs_dir_inode_operations; +extern struct inode_operations jfs_file_inode_operations; +extern struct inode_operations jfs_symlink_inode_operations; +extern struct file_operations jfs_dir_operations; +extern struct file_operations jfs_file_operations; +struct address_space_operations jfs_aops; +extern int freeZeroLink(struct inode *); + +void jfs_put_inode(struct inode *inode) +{ + jFYI(1, ("In jfs_put_inode, inode = 0x%p\n", inode)); +} + +void jfs_read_inode(struct inode *inode) +{ + int rc; + + rc = alloc_jfs_inode(inode); + if (rc) { + printk(__FUNCTION__ ": failed."); + goto bad_inode; + } + jFYI(1, ("In jfs_read_inode, inode = 0x%p\n", inode)); + + if (diRead(inode)) + goto bad_inode_free; + + if (S_ISREG(inode->i_mode)) { + inode->i_op = &jfs_file_inode_operations; + inode->i_fop = &jfs_file_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else if (S_ISDIR(inode->i_mode)) { + inode->i_op = &jfs_dir_inode_operations; + inode->i_fop = &jfs_dir_operations; + inode->i_mapping->a_ops = &jfs_aops; + inode->i_mapping->gfp_mask = GFP_NOFS; + } else if (S_ISLNK(inode->i_mode)) { + if (inode->i_size > IDATASIZE) { + inode->i_op = &page_symlink_inode_operations; + inode->i_mapping->a_ops = &jfs_aops; + } else + inode->i_op = &jfs_symlink_inode_operations; + } else { + init_special_inode(inode, inode->i_mode, + kdev_t_to_nr(inode->i_rdev)); + } + + return; + + bad_inode_free: + free_jfs_inode(inode); + bad_inode: + make_bad_inode(inode); +} + +/* This define is from fs/open.c */ +#define special_file(m) (S_ISCHR(m)||S_ISBLK(m)||S_ISFIFO(m)||S_ISSOCK(m)) + +/* + * Workhorse of both fsync & write_inode + */ +int jfs_commit_inode(struct inode *inode, int wait) +{ + int rc = 0; + tid_t tid; + static int noisy = 5; + + jFYI(1, ("In jfs_commit_inode, inode = 0x%p\n", inode)); + + /* + * Don't commit if inode has been committed since last being + * marked dirty, or if it has been deleted. + */ + if (test_cflag(COMMIT_Nolink, inode) || + !test_cflag(COMMIT_Dirty, inode)) + return 0; + + if (isReadOnly(inode)) { + /* kernel allows writes to devices on read-only + * partitions and may think inode is dirty + */ + if (!special_file(inode->i_mode) && noisy) { + jERROR(1, ("jfs_commit_inode(0x%p) called on " + "read-only volume\n", inode)); + jERROR(1, ("Is remount racy?\n")); + noisy--; + } + return 0; + } + + tid = txBegin(inode->i_sb, COMMIT_INODE); + rc = txCommit(tid, 1, &inode, wait ? COMMIT_SYNC : 0); + txEnd(tid); + return -rc; +} + +void jfs_write_inode(struct inode *inode, int wait) +{ + /* + * If COMMIT_DIRTY is not set, the inode isn't really dirty. + * It has been committed since the last change, but was still + * on the dirty inode list + */ + if (test_cflag(COMMIT_Nolink, inode) || + !test_cflag(COMMIT_Dirty, inode)) + return; + + IWRITE_LOCK(inode); + + if (jfs_commit_inode(inode, wait)) { + jERROR(1, ("jfs_write_inode: jfs_commit_inode failed!\n")); + } + + IWRITE_UNLOCK(inode); +} + +void jfs_delete_inode(struct inode *inode) +{ + jFYI(1, ("In jfs_delete_inode, inode = 0x%p\n", inode)); + + IWRITE_LOCK(inode); + if (test_cflag(COMMIT_Freewmap, inode)) + freeZeroLink(inode); + + diFree(inode); + IWRITE_UNLOCK(inode); + + clear_inode(inode); +} + +void jfs_dirty_inode(struct inode *inode) +{ + static int noisy = 5; + + if (isReadOnly(inode)) { + if (!special_file(inode->i_mode) && noisy) { + /* kernel allows writes to devices on read-only + * partitions and may try to mark inode dirty + */ + jERROR(1, ("jfs_dirty_inode called on " + "read-only volume\n")); + jERROR(1, ("Is remount racy?\n")); + noisy--; + } + return; + } + + set_cflag(COMMIT_Dirty, inode); +} + +static int jfs_get_block(struct inode *ip, long lblock, + struct buffer_head *bh_result, int create) +{ + s64 lblock64 = lblock; + int no_size_check = 0; + int rc = 0; + int take_locks; + xad_t xad; + s64 xaddr; + int xflag; + s32 xlen; + + /* + * If this is a special inode (imap, dmap) or directory, + * the lock should already be taken + */ + take_locks = ((JFS_IP(ip)->fileset != AGGREGATE_I) && + !S_ISDIR(ip->i_mode)); + /* + * Take appropriate lock on inode + */ + if (take_locks) { + if (create) + IWRITE_LOCK(ip); + else + IREAD_LOCK(ip); + } + + /* + * A directory's "data" is the inode index table, but i_size is the + * size of the d-tree, so don't check the offset against i_size + */ + if (S_ISDIR(ip->i_mode)) + no_size_check = 1; + + if ((no_size_check || + ((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size)) && + (xtLookup + (ip, lblock64, 1, &xflag, &xaddr, &xlen, no_size_check) + == 0) && xlen) { + if (xflag & XAD_NOTRECORDED) { + if (!create) + /* + * Allocated but not recorded, read treats + * this as a hole + */ + goto unlock; +#ifdef _JFS_4K + XADoffset(&xad, lblock64); + XADlength(&xad, xlen); + XADaddress(&xad, xaddr); +#else /* _JFS_4K */ + /* + * As long as block size = 4K, this isn't a problem. + * We should mark the whole page not ABNR, but how + * will we know to mark the other blocks BH_New? + */ + BUG(); +#endif /* _JFS_4K */ + rc = extRecord(ip, &xad); + if (rc) + goto unlock; + bh_result->b_state |= (1UL << BH_New); + } + + bh_result->b_dev = ip->i_dev; + bh_result->b_blocknr = xaddr; + bh_result->b_state |= (1UL << BH_Mapped); + goto unlock; + } + if (!create) + goto unlock; + + /* + * Allocate a new block + */ +#ifdef _JFS_4K + if ((rc = + extHint(ip, lblock64 << ip->i_sb->s_blocksize_bits, &xad))) + goto unlock; + rc = extAlloc(ip, 1, lblock64, &xad, FALSE); + if (rc) + goto unlock; + + bh_result->b_dev = ip->i_dev; + bh_result->b_blocknr = addressXAD(&xad); + bh_result->b_state |= ((1UL << BH_Mapped) | (1UL << BH_New)); + +#else /* _JFS_4K */ + /* + * We need to do whatever it takes to keep all but the last buffers + * in 4K pages - see jfs_write.c + */ + BUG(); +#endif /* _JFS_4K */ + + unlock: + /* + * Release lock on inode + */ + if (take_locks) { + if (create) + IWRITE_UNLOCK(ip); + else + IREAD_UNLOCK(ip); + } + return -rc; +} + +static int jfs_writepage(struct page *page) +{ + return block_write_full_page(page, jfs_get_block); +} + +static int jfs_readpage(struct file *file, struct page *page) +{ + return block_read_full_page(page, jfs_get_block); +} + +static int jfs_prepare_write(struct file *file, + struct page *page, unsigned from, unsigned to) +{ + return block_prepare_write(page, from, to, jfs_get_block); +} + +static int jfs_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping, block, jfs_get_block); +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,15)) +static int jfs_direct_IO(int rw, struct inode *inode, struct kiobuf *iobuf, + unsigned long blocknr, int blocksize) +{ + return generic_direct_IO(rw, inode, iobuf, blocknr, + blocksize, jfs_get_block); +} +#endif /* Kernel >= 2.4.15 */ + +struct address_space_operations jfs_aops = { + readpage: jfs_readpage, + writepage: jfs_writepage, + sync_page: block_sync_page, + prepare_write: jfs_prepare_write, + commit_write: generic_commit_write, + bmap: jfs_bmap, +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,15)) + direct_IO: jfs_direct_IO, +#endif +}; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_btree.h linuxppc64_2_4/fs/jfs/jfs_btree.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_btree.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_btree.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,163 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _H_JFS_BTREE +#define _H_JFS_BTREE +/* + * jfs_btree.h: B+-tree + * + * JFS B+-tree (dtree and xtree) common definitions + */ + +/* + * basic btree page - btpage_t + */ +typedef struct { + s64 next; /* 8: right sibling bn */ + s64 prev; /* 8: left sibling bn */ + + u8 flag; /* 1: */ + u8 rsrvd[7]; /* 7: type specific */ + s64 self; /* 8: self address */ + + u8 entry[4064]; /* 4064: */ +} btpage_t; /* (4096) */ + +/* btpaget_t flag */ +#define BT_TYPE 0x07 /* B+-tree index */ +#define BT_ROOT 0x01 /* root page */ +#define BT_LEAF 0x02 /* leaf page */ +#define BT_INTERNAL 0x04 /* internal page */ +#define BT_RIGHTMOST 0x10 /* rightmost page */ +#define BT_LEFTMOST 0x20 /* leftmost page */ +#define BT_SWAPPED 0x80 /* used by fsck for endian swapping */ + +/* btorder (in inode) */ +#define BT_RANDOM 0x0000 +#define BT_SEQUENTIAL 0x0001 +#define BT_LOOKUP 0x0010 +#define BT_INSERT 0x0020 +#define BT_DELETE 0x0040 + +/* + * btree page buffer cache access + */ +#define BT_IS_ROOT(MP) (((MP)->xflag & COMMIT_PAGE) == 0) + +/* get page from buffer page */ +#define BT_PAGE(IP, MP, TYPE, ROOT)\ + (BT_IS_ROOT(MP) ? (TYPE *)&JFS_IP(IP)->ROOT : (TYPE *)(MP)->data) + +/* get the page buffer and the page for specified block address */ +#define BT_GETPAGE(IP, BN, MP, TYPE, SIZE, P, RC, ROOT)\ +{\ + if ((BN) == 0)\ + {\ + MP = (metapage_t *)&JFS_IP(IP)->bxflag;\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + RC = 0;\ + jEVENT(0,("%d BT_GETPAGE returning root\n", __LINE__));\ + }\ + else\ + {\ + jEVENT(0,("%d BT_GETPAGE reading block %d\n", __LINE__,\ + (int)BN));\ + MP = read_metapage((IP), BN, SIZE, 1);\ + if (MP) {\ + RC = 0;\ + P = (MP)->data;\ + } else {\ + P = NULL;\ + jERROR(1,("bread failed!\n"));\ + RC = EIO;\ + }\ + }\ +} + +#define BT_MARK_DIRTY(MP, IP)\ +{\ + if (BT_IS_ROOT(MP))\ + mark_inode_dirty(IP);\ + else\ + mark_metapage_dirty(MP);\ +} + +/* put the page buffer */ +#define BT_PUTPAGE(MP)\ +{\ + if (! BT_IS_ROOT(MP)) \ + release_metapage(MP); \ +} + + +/* + * btree traversal stack + * + * record the path traversed during the search; + * top frame record the leaf page/entry selected. + */ +#define MAXTREEHEIGHT 8 +typedef struct btframe { /* stack frame */ + s64 bn; /* 8: */ + s16 index; /* 2: */ + s16 lastindex; /* 2: */ + struct metapage *mp; /* 4: */ +} btframe_t; /* (16) */ + +typedef struct btstack { + btframe_t *top; /* 4: */ + int nsplit; /* 4: */ + btframe_t stack[MAXTREEHEIGHT]; +} btstack_t; + +#define BT_CLR(btstack)\ + (btstack)->top = (btstack)->stack + +#define BT_PUSH(BTSTACK, BN, INDEX)\ +{\ + (BTSTACK)->top->bn = BN;\ + (BTSTACK)->top->index = INDEX;\ + ++(BTSTACK)->top;\ + assert((BTSTACK)->top != &((BTSTACK)->stack[MAXTREEHEIGHT]));\ +} + +#define BT_POP(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : --(btstack)->top ) + +#define BT_STACK(btstack)\ + ( (btstack)->top == (btstack)->stack ? NULL : (btstack)->top ) + +/* retrieve search results */ +#define BT_GETSEARCH(IP, LEAF, BN, MP, TYPE, P, INDEX, ROOT)\ +{\ + BN = (LEAF)->bn;\ + MP = (LEAF)->mp;\ + if (BN)\ + P = (TYPE *)MP->data;\ + else\ + P = (TYPE *)&JFS_IP(IP)->ROOT;\ + INDEX = (LEAF)->index;\ +} + +/* put the page buffer of search */ +#define BT_PUTSEARCH(BTSTACK)\ +{\ + if (! BT_IS_ROOT((BTSTACK)->top->mp))\ + release_metapage((BTSTACK)->top->mp);\ +} +#endif /* _H_JFS_BTREE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_compat.h linuxppc64_2_4/fs/jfs/jfs_compat.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_compat.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_compat.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,87 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _H_JFS_COMPAT +#define _H_JFS_COMPAT + +/* + * jfs_compat.h: + * + * Definitions to allow JFS to build on older kernels. + * + * This file should be removed when JFS is merged with linux kernel + * + */ + +#include +#include +#include + +#ifndef __weak +#define __weak __attribute__((weak)); +#endif + +#ifndef MODULE_LICENSE +#define MODULE_LICENSE(x) +#endif + +#ifndef GFP_NOFS +#define GFP_NOFS GFP_BUFFER +#endif + +#if !defined(KERNEL_HAS_O_DIRECT) +#define fsync_inode_data_buffers fsync_inode_buffers +#endif + +/* + * Linux 2.4.9 has broken min/max macros. + * Linux < 2.4.9 doesn't have min/max at all. + */ +#if (LINUX_VERSION_CODE == KERNEL_VERSION(2,4,9)) +#undef min +#undef max +#endif + +/* + * Completions are new in 2.4.7. + */ +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,6)) +#define DECLARE_COMPLETION(c) DECLARE_MUTEX_LOCKED(c) +#define complete(c) up(c) +#define wait_for_completion(c) down(c) +/* must be last to not mess up the namespace */ +#define completion semaphore +#else +#include +#endif + +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,4,9)) +#define min(x,y) ({ \ + const typeof(x) _x = (x); \ + const typeof(y) _y = (y); \ + (void) (&_x == &_y); \ + _x < _y ? _x : _y; }) + +#define max(x,y) ({ \ + const typeof(x) _x = (x); \ + const typeof(y) _y = (y); \ + (void) (&_x == &_y); \ + _x > _y ? _x : _y; }) +#endif + +#endif /* !_H_JFS_COMPAT */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_debug.c linuxppc64_2_4/fs/jfs/jfs_debug.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_debug.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_debug.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,145 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_debug.h" + +#ifdef CONFIG_JFS_DEBUG +void dump_mem(char *label, void *data, int length) +{ + int i, j; + int *intptr = data; + char *charptr = data; + char buf[10], line[80]; + + printk("%s: dump of %d bytes of data at 0x%p\n\n", label, length, + data); + for (i = 0; i < length; i += 16) { + line[0] = 0; + for (j = 0; (j < 4) && (i + j * 4 < length); j++) { + sprintf(buf, " %08x", intptr[i / 4 + j]); + strcat(line, buf); + } + buf[0] = ' '; + buf[2] = 0; + for (j = 0; (j < 16) && (i + j < length); j++) { + buf[1] = + isprint(charptr[i + j]) ? charptr[i + j] : '.'; + strcat(line, buf); + } + printk("%s\n", line); + } +} + +#ifdef CONFIG_PROC_FS +static int loglevel_read(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + int len; + + len = sprintf(page, "%d\n", jfsloglevel); + + len -= off; + *start = page + off; + + if (len > count) + len = count; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} + +static int loglevel_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + char c; + + if (get_user(c, buffer)) + return -EFAULT; + + /* yes, I know this is an ASCIIism. --hch */ + if (c < '0' || c > '9') + return -EINVAL; + jfsloglevel = c - '0'; + return count; +} + + +extern read_proc_t jfs_txanchor_read; +#ifdef CONFIG_JFS_STATISTICS +extern read_proc_t jfs_lmstats_read; +extern read_proc_t jfs_xtstat_read; +extern read_proc_t jfs_mpstat_read; +#endif +static struct proc_dir_entry *base; + +static struct { + const char *name; + read_proc_t *read_fn; + write_proc_t *write_fn; +} Entries[] = { + { "TxAnchor", jfs_txanchor_read, }, +#ifdef CONFIG_JFS_STATISTICS + { "lmstats", jfs_lmstats_read, }, + { "xtstat", jfs_xtstat_read, }, + { "mpstat", jfs_mpstat_read, }, +#endif + { "loglevel", loglevel_read, loglevel_write } +}; +#define NPROCENT (sizeof(Entries)/sizeof(Entries[0])) + +void jfs_proc_init(void) +{ + int i; + + if (!(base = proc_mkdir("jfs", proc_root_fs))) + return; + base->owner = THIS_MODULE; + + for (i = 0; i < NPROCENT; i++) { + struct proc_dir_entry *p; + if ((p = create_proc_entry(Entries[i].name, 0, base))) { + p->read_proc = Entries[i].read_fn; + p->write_proc = Entries[i].write_fn; + } + } +} + +void jfs_proc_clean(void) +{ + int i; + + if (base) { + for (i = 0; i < NPROCENT; i++) + remove_proc_entry(Entries[i].name, base); + remove_proc_entry("jfs", base); + } +} + +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_JFS_DEBUG */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_debug.h linuxppc64_2_4/fs/jfs/jfs_debug.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_debug.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_debug.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,96 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#ifndef _H_JFS_DEBUG +#define _H_JFS_DEBUG + +/* + * jfs_debug.h + * + * global debug message, data structure/macro definitions + * under control of CONFIG_JFS_DEBUG, CONFIG_JFS_STATISTICS; + */ + +/* + * assert with traditional printf/panic + */ +#ifdef CONFIG_KERNEL_ASSERTS +/* kgdb stuff */ +#define assert(p) KERNEL_ASSERT(#p, p) +#else +#define assert(p) {\ +if (!(p))\ + {\ + printk("assert(%s)\n",#p);\ + BUG();\ + }\ +} +#endif + +/* + * debug ON + * -------- + */ +#ifdef CONFIG_JFS_DEBUG +#define ASSERT(p) assert(p) + +/* dump memory contents */ +extern void dump_mem(char *label, void *data, int length); +extern int jfsloglevel; + +/* information message: e.g., configuration, major event */ +#define jFYI(button, prspec) \ + do { if (button && jfsloglevel > 1) printk prspec; } while (0) + +/* error event message: e.g., i/o error */ +extern int jfsERROR; +#define jERROR(button, prspec) \ + do { if (button && jfsloglevel > 0) { printk prspec; } } while (0) + +/* debug event message: */ +#define jEVENT(button,prspec) \ + do { if (button) printk prspec; } while (0) + +/* + * debug OFF + * --------- + */ +#else /* CONFIG_JFS_DEBUG */ +#define dump_mem(label,data,length) +#define ASSERT(p) +#define jEVENT(button,prspec) +#define jERROR(button,prspec) +#define jFYI(button,prspec) +#endif /* CONFIG_JFS_DEBUG */ + +/* + * statistics + * ---------- + */ +#ifdef CONFIG_JFS_STATISTICS +#define INCREMENT(x) ((x)++) +#define DECREMENT(x) ((x)--) +#define HIGHWATERMARK(x,y) ((x) = max((x), (y))) +#else +#define INCREMENT(x) +#define DECREMENT(x) +#define HIGHWATERMARK(x,y) +#endif /* CONFIG_JFS_STATISTICS */ + +#endif /* _H_JFS_DEBUG */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_defragfs.h linuxppc64_2_4/fs/jfs/jfs_defragfs.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_defragfs.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_defragfs.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,55 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _H_JFS_DEFRAGFS +#define _H_JFS_DEFRAGFS + +/* + * jfs_defragfs.h + */ +/* + * defragfs parameter list + */ +typedef struct { + uint flag; /* 4: */ + u8 dev; /* 1: */ + u8 pad[3]; /* 3: */ + s32 fileset; /* 4: */ + u32 inostamp; /* 4: */ + u32 ino; /* 4: */ + u32 gen; /* 4: */ + s64 xoff; /* 8: */ + s64 old_xaddr; /* 8: */ + s64 new_xaddr; /* 8: */ + s32 xlen; /* 4: */ +} defragfs_t; /* (52) */ + +/* plist flag */ +#define DEFRAGFS_SYNC 0x80000000 +#define DEFRAGFS_COMMIT 0x40000000 +#define DEFRAGFS_RELOCATE 0x10000000 + +#define INODE_TYPE 0x0000F000 /* IFREG or IFDIR */ + +#define EXTENT_TYPE 0x000000ff +#define DTPAGE 0x00000001 +#define XTPAGE 0x00000002 +#define DATAEXT 0x00000004 +#define EAEXT 0x00000008 + +#endif /* _H_JFS_DEFRAGFS */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_dinode.h linuxppc64_2_4/fs/jfs/jfs_dinode.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_dinode.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_dinode.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,157 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#ifndef _H_JFS_DINODE +#define _H_JFS_DINODE + +/* + * jfs_dinode.h: on-disk inode manager + * + */ + +#define INODESLOTSIZE 128 +#define L2INODESLOTSIZE 7 +#define log2INODESIZE 9 /* log2(bytes per dinode) */ + + +/* + * on-disk inode (dinode_t): 512 bytes + * + * note: align 64-bit fields on 8-byte boundary. + */ +struct dinode { + /* + * I. base area (128 bytes) + * ------------------------ + * + * define generic/POSIX attributes + */ + u32 di_inostamp; /* 4: stamp to show inode belongs to fileset */ + s32 di_fileset; /* 4: fileset number */ + u32 di_number; /* 4: inode number, aka file serial number */ + u32 di_gen; /* 4: inode generation number */ + + pxd_t di_ixpxd; /* 8: inode extent descriptor */ + + s64 di_size; /* 8: size */ + s64 di_nblocks; /* 8: number of blocks allocated */ + + u32 di_nlink; /* 4: number of links to the object */ + + u32 di_uid; /* 4: user id of owner */ + u32 di_gid; /* 4: group id of owner */ + + u32 di_mode; /* 4: attribute, format and permission */ + + struct timestruc_t di_atime; /* 8: time last data accessed */ + struct timestruc_t di_ctime; /* 8: time last status changed */ + struct timestruc_t di_mtime; /* 8: time last data modified */ + struct timestruc_t di_otime; /* 8: time created */ + + dxd_t di_acl; /* 16: acl descriptor */ + + dxd_t di_ea; /* 16: ea descriptor */ + + u32 di_next_index; /* 4: Next available dir_table index */ + + s32 di_acltype; /* 4: Type of ACL */ + + /* + * Extension Areas. + * + * Historically, the inode was partitioned into 4 128-byte areas, + * the last 3 being defined as unions which could have multiple + * uses. The first 96 bytes had been completely unused until + * an index table was added to the directory. It is now more + * useful to describe the last 3/4 of the inode as a single + * union. We would probably be better off redesigning the + * entire structure from scratch, but we don't want to break + * commonality with OS/2's JFS at this time. + */ + union { + struct { + /* + * This table contains the information needed to + * find a directory entry from a 32-bit index. + * If the index is small enough, the table is inline, + * otherwise, an x-tree root overlays this table + */ + dir_table_slot_t _table[12]; /* 96: inline */ + + dtroot_t _dtroot; /* 288: dtree root */ + } _dir; /* (384) */ +#define di_dirtable u._dir._table +#define di_dtroot u._dir._dtroot +#define di_parent di_dtroot.header.idotdot +#define di_DASD di_dtroot.header.DASD + + struct { + union { + u8 _data[96]; /* 96: unused */ + struct { + void *_imap; /* 4: unused */ + u32 _gengen; /* 4: generator */ + } _imap; + } _u1; /* 96: */ +#define di_gengen u._file._u1._imap._gengen + + union { + xtpage_t _xtroot; + struct { + u8 unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + union { + u32 _rdev; /* 4: */ + u8 _fastsymlink[128]; + } _u; + u8 _inlineea[128]; + } _special; + } _u2; + } _file; +#define di_xtroot u._file._u2._xtroot +#define di_dxd u._file._u2._special._dxd +#define di_btroot di_xtroot +#define di_inlinedata u._file._u2._special._u +#define di_rdev u._file._u2._special._u._rdev +#define di_fastsymlink u._file._u2._special._u._fastsymlink +#define di_inlineea u._file._u2._special._inlineea + } u; +}; + +typedef struct dinode dinode_t; + + +/* extended mode bits (on-disk inode di_mode) */ +#define IFJOURNAL 0x00010000 /* journalled file */ +#define ISPARSE 0x00020000 /* sparse file enabled */ +#define INLINEEA 0x00040000 /* inline EA area free */ +#define ISWAPFILE 0x00800000 /* file open for pager swap space */ + +/* more extended mode bits: attributes for OS/2 */ +#define IREADONLY 0x02000000 /* no write access to file */ +#define IARCHIVE 0x40000000 /* file archive bit */ +#define ISYSTEM 0x08000000 /* system file */ +#define IHIDDEN 0x04000000 /* hidden file */ +#define IRASH 0x4E000000 /* mask for changeable attributes */ +#define INEWNAME 0x80000000 /* non-8.3 filename format */ +#define IDIRECTORY 0x20000000 /* directory (shadow of real bit) */ +#define ATTRSHIFT 25 /* bits to shift to move attribute + specification to mode position */ + +#endif /*_H_JFS_DINODE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_dmap.c linuxppc64_2_4/fs/jfs/jfs_dmap.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_dmap.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_dmap.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,4189 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * MODULE_NAME: jfs_dmap.c + * + * COMPONENT_NAME: sysjfs + * + * FUNCTION: block allocation map manager + * +*/ + +/* + * Change History : + * + */ + +#include +#include "jfs_incore.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_lock.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * Debug code for double-checking block map + */ +/* #define _JFS_DEBUG_DMAP 1 */ + +#ifdef _JFS_DEBUG_DMAP +#define DBINITMAP(size,ipbmap,results) \ + DBinitmap(size,ipbmap,results) +#define DBALLOC(dbmap,mapsize,blkno,nblocks) \ + DBAlloc(dbmap,mapsize,blkno,nblocks) +#define DBFREE(dbmap,mapsize,blkno,nblocks) \ + DBFree(dbmap,mapsize,blkno,nblocks) +#define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \ + DBAllocCK(dbmap,mapsize,blkno,nblocks) +#define DBFREECK(dbmap,mapsize,blkno,nblocks) \ + DBFreeCK(dbmap,mapsize,blkno,nblocks) + +static void DBinitmap(s64, struct inode *, u32 **); +static void DBAlloc(uint *, s64, s64, s64); +static void DBFree(uint *, s64, s64, s64); +static void DBAllocCK(uint *, s64, s64, s64); +static void DBFreeCK(uint *, s64, s64, s64); +#else +#define DBINITMAP(size,ipbmap,results) +#define DBALLOC(dbmap, mapsize, blkno, nblocks) +#define DBFREE(dbmap, mapsize, blkno, nblocks) +#define DBALLOCCK(dbmap, mapsize, blkno, nblocks) +#define DBFREECK(dbmap, mapsize, blkno, nblocks) +#endif /* _JFS_DEBUG_DMAP */ + +/* + * SERIALIZATION of the Block Allocation Map. + * + * the working state of the block allocation map is accessed in + * two directions: + * + * 1) allocation and free requests that start at the dmap + * level and move up through the dmap control pages (i.e. + * the vast majority of requests). + * + * 2) allocation requests that start at dmap control page + * level and work down towards the dmaps. + * + * the serialization scheme used here is as follows. + * + * requests which start at the bottom are serialized against each + * other through buffers and each requests holds onto its buffers + * as it works it way up from a single dmap to the required level + * of dmap control page. + * requests that start at the top are serialized against each other + * and request that start from the bottom by the multiple read/single + * write inode lock of the bmap inode. requests starting at the top + * take this lock in write mode while request starting at the bottom + * take the lock in read mode. a single top-down request may proceed + * exclusively while multiple bottoms-up requests may proceed + * simultaneously (under the protection of busy buffers). + * + * in addition to information found in dmaps and dmap control pages, + * the working state of the block allocation map also includes read/ + * write information maintained in the bmap descriptor (i.e. total + * free block count, allocation group level free block counts). + * a single exclusive lock (BMAP_LOCK) is used to guard this information + * in the face of multiple-bottoms up requests. + * (lock ordering: IREAD_LOCK, BMAP_LOCK); + * + * accesses to the persistent state of the block allocation map (limited + * to the persistent bitmaps in dmaps) is guarded by (busy) buffers. + */ + +#define BMAP_LOCK_INIT(bmp) init_MUTEX(&bmp->db_bmaplock) +#define BMAP_LOCK(bmp) down(&bmp->db_bmaplock) +#define BMAP_UNLOCK(bmp) up(&bmp->db_bmaplock) + +/* + * forward references + */ +static void dbAllocBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks); +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval); +static void dbBackSplit(dmtree_t * tp, int leafno); +static void dbJoin(dmtree_t * tp, int leafno, int newval); +static void dbAdjTree(dmtree_t * tp, int leafno, int newval); +static int dbAdjCtl(bmap_t * bmp, s64 blkno, int newval, int alloc, + int level); +static int dbAllocAny(bmap_t * bmp, s64 nblocks, int l2nb, s64 * results); +static int dbAllocNext(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks); +static int dbAllocNear(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks, + int l2nb, s64 * results); +static int dbAllocDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks); +static int dbAllocDmapLev(bmap_t * bmp, dmap_t * dp, int nblocks, int l2nb, + s64 * results); +static int dbAllocAG(bmap_t * bmp, int agno, s64 nblocks, int l2nb, + s64 * results); +static int dbAllocCtl(bmap_t * bmp, s64 nblocks, int l2nb, s64 blkno, + s64 * results); +int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks); +static int dbFindBits(u32 word, int l2nb); +static int dbFindCtl(bmap_t * bmp, int l2nb, int level, s64 * blkno); +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx); +static void dbFreeBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks); +static int dbFreeDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks); +static int dbMaxBud(u8 * cp); +s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +int blkstol2(s64 nb); +void fsDirty(void); + +int cntlz(u32 value); +int cnttz(u32 word); + +static int dbAllocDmapBU(bmap_t * bmp, dmap_t * dp, s64 blkno, + int nblocks); +static int dbInitDmap(dmap_t * dp, s64 blkno, int nblocks); +static int dbInitDmapTree(dmap_t * dp); +static int dbInitTree(dmaptree_t * dtp); +static int dbInitDmapCtl(dmapctl_t * dcp, int level, int i); +static int dbGetL2AGSize(s64 nblocks); + +/* + * buddy table + * + * table used for determining buddy sizes within characters of + * dmap bitmap words. the characters themselves serve as indexes + * into the table, with the table elements yielding the maximum + * binary buddy of free bits within the character. + */ +signed char budtab[256] = { + 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, + 2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1 +}; + + +/* + * NAME: dbMount() + * + * FUNCTION: initializate the block allocation map. + * + * memory is allocated for the in-core bmap descriptor and + * the in-core descriptor is initialized from disk. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * ENOMEM - insufficient memory + * EIO - i/o error + */ +int dbMount(struct inode *ipbmap) +{ + bmap_t *bmp; + dbmap_t *dbmp_le; + metapage_t *mp; + int i; + + /* + * allocate/initialize the in-memory bmap descriptor + */ + /* allocate memory for the in-memory bmap descriptor */ + bmp = kmalloc(sizeof(bmap_t), GFP_KERNEL); + if (bmp == NULL) + return (ENOMEM); + + /* read the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(bmp); + return (EIO); + } + + /* copy the on-disk bmap descriptor to its in-memory version. */ + dbmp_le = (dbmap_t *) mp->data; + bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); + bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); + bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); + bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); + bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); + bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); + bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); + bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); + bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); + bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); + bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); + bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); + for (i = 0; i < MAXAG; i++) + bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]); + bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize); + bmp->db_maxfreebud = dbmp_le->dn_maxfreebud; + + /* release the buffer. */ + release_metapage(mp); + + /* bind the bmap inode and the bmap descriptor to each other. */ + bmp->db_ipbmap = ipbmap; + JFS_SBI(ipbmap->i_sb)->bmap = bmp; + + DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap); + + /* + * allocate/initialize the bmap lock + */ + BMAP_LOCK_INIT(bmp); + + return (0); +} + + +/* + * NAME: dbUnmount() + * + * FUNCTION: terminate the block allocation map in preparation for + * file system unmount. + * + * the in-core bmap descriptor is written to disk and + * the memory for this descriptor is freed. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + */ +int dbUnmount(struct inode *ipbmap, int mounterror) +{ + bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + if (!(mounterror || isReadOnly(ipbmap))) + dbSync(ipbmap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipbmap->i_mapping, 0); + + /* free the memory for the in-memory bmap. */ + kfree(bmp); + + return (0); +} + +/* + * dbSync() + */ +int dbSync(struct inode *ipbmap) +{ + dbmap_t *dbmp_le; + bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + metapage_t *mp; + int i; + + /* + * write bmap global control page + */ + /* get the buffer for the on-disk bmap descriptor. */ + mp = read_metapage(ipbmap, + BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jERROR(1,("dbSync: read_metapage failed!\n")); + return (EIO); + } + /* copy the in-memory version of the bmap to the on-disk version */ + dbmp_le = (dbmap_t *) mp->data; + dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize); + dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree); + dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage); + dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag); + dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel); + dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); + dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); + dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); + dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); + dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); + dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); + dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); + for (i = 0; i < MAXAG; i++) + dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]); + dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize); + dbmp_le->dn_maxfreebud = bmp->db_maxfreebud; + + /* write the buffer */ + write_metapage(mp); + + /* + * write out dirty pages of bmap + */ + fsync_inode_data_buffers(ipbmap); + + ipbmap->i_state |= I_DIRTY; + diWriteSpecial(ipbmap); + + return (0); +} + + +/* + * NAME: dbFree() + * + * FUNCTION: free the specified block range from the working block + * allocation map. + * + * the blocks will be free from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + */ +int dbFree(struct inode *ip, s64 blkno, s64 nblocks) +{ + metapage_t *mp; + dmap_t *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap); + + /* block to be freed better be within the mapsize. */ + assert(blkno + nblocks <= bmp->db_mapsize); + + /* + * free the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return (EIO); + } + dp = (dmap_t *) mp->data; + + /* determine the number of blocks to be freed from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + + /* free the blocks. */ + if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + + DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +/* + * NAME: dbUpdatePMap() + * + * FUNCTION: update the allocation state (free or allocate) of the + * specified block range in the persistent block allocation map. + * + * the blocks will be updated in the persistent map one + * dmap at a time. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * free - TRUE if block range is to be freed from the persistent + * map; FALSE if it is to be allocated. + * blkno - starting block number of the range. + * nblocks - number of contiguous blocks in the range. + * tblk - transaction block; + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + */ +int +dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, tblock_t * tblk) +{ + int nblks, dbitno, wbitno, rbits; + int word, nbits, nwords; + bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + s64 lblkno, rem, lastlblkno; + u32 mask; + dmap_t *dp; + metapage_t *mp; + log_t *log; + int lsn, difft, diffp; + + /* the blocks better be within the mapsize. */ + assert(blkno + nblocks <= bmp->db_mapsize); + + /* compute delta of transaction lsn from log syncpt */ + lsn = tblk->lsn; + log = (log_t *) JFS_SBI(tblk->sb)->log; + logdiff(difft, lsn, log); + + /* + * update the block state a dmap at a time. + */ + mp = NULL; + lastlblkno = 0; + for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) { + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + if (lblkno != lastlblkno) { + if (mp) { + write_metapage(mp); + } + + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, + 0); + if (mp == NULL) + return (EIO); + } + dp = (dmap_t *) mp->data; + + /* determine the bit number and word within the dmap of + * the starting block. also determine how many blocks + * are to be updated within this dmap. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + nblks = min(rem, (s64)BPERDMAP - dbitno); + + /* update the bits of the dmap words. the first and last + * words may only have a subset of their bits updated. if + * this is the case, we'll work against that word (i.e. + * partial first and/or last) only in a single pass. a + * single pass will also be used to update all words that + * are to have all their bits updated. + */ + for (rbits = nblks; rbits > 0; + rbits -= nbits, dbitno += nbits) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nbits = min(rbits, DBWORD - wbitno); + + /* check if only part of the word is to be updated. */ + if (nbits < DBWORD) { + /* update (free or allocate) the bits + * in this word. + */ + mask = + (ONES << (DBWORD - nbits) >> wbitno); + if (free) + dp->pmap[word] &= + cpu_to_le32(~mask); + else + dp->pmap[word] |= + cpu_to_le32(mask); + + word += 1; + } else { + /* one or more words are to have all + * their bits updated. determine how + * many words and how many bits. + */ + nwords = rbits >> L2DBWORD; + nbits = nwords << L2DBWORD; + + /* update (free or allocate) the bits + * in these words. + */ + if (free) + memset(&dp->pmap[word], 0, + nwords * 4); + else + memset(&dp->pmap[word], (int) ONES, + nwords * 4); + + word += nwords; + } + } + + /* + * update dmap lsn + */ + if (lblkno == lastlblkno) + continue; + + lastlblkno = lblkno; + + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + + /* move bp after tblock in logsync list */ + LOGSYNC_LOCK(log); + list_del(&mp->synclist); + list_add(&mp->synclist, &tblk->synclist); + LOGSYNC_UNLOCK(log); + } + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } else { + mp->log = log; + mp->lsn = lsn; + + /* insert bp after tblock in logsync list */ + LOGSYNC_LOCK(log); + + log->count++; + list_add(&mp->synclist, &tblk->synclist); + + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } + } + + /* write the last buffer. */ + if (mp) { + write_metapage(mp); + } + + return (0); +} + + +/* + * NAME: dbNextAG() + * + * FUNCTION: find the preferred allocation group for new allocations. + * + * we try to keep the trailing (rightmost) allocation groups + * free for large allocations. we try to do this by targeting + * new inode allocations towards the leftmost or 'active' + * allocation groups while keeping the rightmost or 'inactive' + * allocation groups free. once the active allocation groups + * have dropped to a certain percentage of free space, we add + * the leftmost inactive allocation group to the active set. + * + * within the active allocation groups, we maintain a preferred + * allocation group which consists of a group with at least + * average free space over the active set. it is the preferred + * group that we target new inode allocation towards. the + * tie-in between inode allocation and block allocation occurs + * as we allocate the first (data) block of an inode and specify + * the inode (block) as the allocation hint for this block. + * + * PARAMETERS: + * ipbmap - pointer to in-core inode for the block map. + * + * RETURN VALUES: + * the preferred allocation group number. + * + * note: only called by dbAlloc(); + */ +int dbNextAG(struct inode *ipbmap) +{ + s64 avgfree, inactfree, actfree, rem; + int actags, inactags, l2agsize; + bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + + BMAP_LOCK(bmp); + + /* determine the number of active allocation groups (i.e. + * the number of allocation groups up to and including + * the rightmost allocation group with blocks allocated + * in it. + */ + actags = bmp->db_maxag + 1; + assert(actags <= bmp->db_numag); + + /* get the number of inactive allocation groups (i.e. the + * number of allocation group following the rightmost group + * with allocation in it. + */ + inactags = bmp->db_numag - actags; + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + l2agsize = bmp->db_agl2size; + rem = bmp->db_mapsize & (bmp->db_agsize - 1); + inactfree = (inactags + && rem) ? ((inactags - 1) << l2agsize) + + rem : inactags << l2agsize; + + /* now determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* check if not all of the allocation groups are active. + */ + if (actags < bmp->db_numag) { + /* not all of the allocation groups are active. determine + * if we should extend the active set by 1 (i.e. add the + * group following the current active set). we do so if + * the number of free blocks within the active set is less + * than the allocation group set and average free within + * the active set is less than 60%. we activate a new group + * by setting the allocation group preference to the new + * group. + */ + if (actfree < bmp->db_agsize && + ((avgfree * 100) >> l2agsize) < 60) + bmp->db_agpref = actags; + } else { + /* all allocation groups are in the active set. check if + * the preferred allocation group has average free space. + * if not, re-establish the preferred group as the leftmost + * group with average free space. + */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] <= + avgfree) + break; + } + assert(bmp->db_agpref < bmp->db_numag); + } + } + + BMAP_UNLOCK(bmp); + + /* return the preferred group. + */ + return (bmp->db_agpref); +} + + +/* + * NAME: dbAlloc() + * + * FUNCTION: attempt to allocate a specified number of contiguous free + * blocks from the working allocation block map. + * + * the block allocation policy uses hints and a multi-step + * approach. + * + * for allocation requests smaller than the number of blocks + * per dmap, we first try to allocate the new blocks + * immediately following the hint. if these blocks are not + * available, we try to allocate blocks near the hint. if + * no blocks near the hint are available, we next try to + * allocate within the same dmap as contains the hint. + * + * if no blocks are available in the dmap or the allocation + * request is larger than the dmap size, we try to allocate + * within the same allocation group as contains the hint. if + * this does not succeed, we finally try to allocate anywhere + * within the aggregate. + * + * we also try to allocate anywhere within the aggregate for + * for allocation requests larger than the allocation group + * size or requests that specify no hint value. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * hint - allocation hint. + * nblocks - number of contiguous blocks in the range. + * results - on successful return, set to the starting block number + * of the newly allocated contiguous range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + */ +int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results) +{ + int rc, agno; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + bmap_t *bmp; + metapage_t *mp; + s64 lblkno, blkno; + dmap_t *dp; + int l2nb; + s64 mapSize; + + /* assert that nblocks is valid */ + assert(nblocks > 0); + +#ifdef _STILL_TO_PORT + /* DASD limit check F226941 */ + if (OVER_LIMIT(ip, nblocks)) + return ENOSPC; +#endif /* _STILL_TO_PORT */ + + /* get the log2 number of blocks to be allocated. + * if the number of blocks is not a log2 multiple, + * it will be rounded up to the next log2 multiple. + */ + l2nb = BLKSTOL2(nblocks); + + bmp = JFS_SBI(ip->i_sb)->bmap; + +//retry: /* serialize w.r.t.extendfs() */ + mapSize = bmp->db_mapsize; + + /* the hint should be within the map */ + assert(hint < mapSize); + + /* if no hint was specified or the number of blocks to be + * allocated is greater than the allocation group size, try + * to allocate anywhere. + */ + if (hint == 0 || l2nb > bmp->db_agl2size) { + IWRITE_LOCK(ipbmap); + + rc = dbAllocAny(bmp, nblocks, l2nb, results); + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, + nblocks); + } + + IWRITE_UNLOCK(ipbmap); + return (rc); + } + + /* we would like to allocate close to the hint. adjust the + * hint to the block following the hint since the allocators + * will start looking for free space starting at this point. + * if the hint was the last block of the file system, try to + * allocate in the same allocation group as the hint. + */ + blkno = hint + 1; + if (blkno >= bmp->db_mapsize) { + blkno--; + goto tryag; + } + + /* check if blkno crosses over into a new allocation group. + * if so, check if we should allow allocations within this + * allocation group. we try to keep the trailing (rightmost) + * allocation groups of the file system free for large + * allocations and may want to prevent this allocation from + * spilling over into this space. + */ + if ((blkno & (bmp->db_agsize - 1)) == 0) { + /* check if the AG is beyond the rightmost AG with + * allocations in it. if so, call dbNextAG() to + * determine if the allocation should be allowed + * to proceed within this AG or should be targeted + * to another AG. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) { + agno = dbNextAG(ipbmap); + blkno = (s64) agno << bmp->db_agl2size; + goto tryag; + } + } + + /* check if the allocation request size can be satisfied from a + * single dmap. if so, try to allocate from the dmap containing + * the hint using a tiered strategy. + */ + if (nblocks <= BPERDMAP) { + IREAD_LOCK(ipbmap); + + /* get the buffer for the dmap containing the hint. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return (EIO); + } + dp = (dmap_t *) mp->data; + + /* first, try to satisfy the allocation request with the + * blocks beginning at the hint. + */ + if ((rc = + dbAllocNext(bmp, dp, blkno, + (int) nblocks)) != ENOSPC) { + if (rc == 0) { + *results = blkno; + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + write_metapage(mp); + } else { + assert(rc == EIO); + release_metapage(mp); + } + + IREAD_UNLOCK(ipbmap); + return (rc); + } + + /* next, try to satisfy the allocation request with blocks + * near the hint. + */ + if ((rc = + dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, + results)) + != ENOSPC) { + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + IREAD_UNLOCK(ipbmap); + return (rc); + } + + /* try to satisfy the allocation request with blocks within + * the same allocation group as the hint. + */ + if ((rc = + dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results)) + != ENOSPC) { + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, + *results, nblocks); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + IREAD_UNLOCK(ipbmap); + return (rc); + } + + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + } + + tryag: + IWRITE_LOCK(ipbmap); + + /* determine the allocation group number of the hint and try to + * allocate within this allocation group. if that fails, try to + * allocate anywhere in the map. + */ + agno = blkno >> bmp->db_agl2size; + if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == ENOSPC) + rc = dbAllocAny(bmp, nblocks, l2nb, results); + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks); + } + + IWRITE_UNLOCK(ipbmap); + + return (rc); +} + + +/* + * NAME: dbAllocExact() + * + * FUNCTION: try to allocate the requested extent; + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - extent address; + * nblocks - extent length; + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + */ +int dbAllocExact(struct inode *ip, s64 blkno, int nblocks) +{ + int rc; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap; + dmap_t *dp; + s64 lblkno; + metapage_t *mp; + + IREAD_LOCK(ipbmap); + + /* + * validate extent request: + * + * note: defragfs policy: + * max 64 blocks will be moved. + * allocation request size must be satisfied from a single dmap. + */ + if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) { + IREAD_UNLOCK(ipbmap); + return EINVAL; + } + + if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) { + /* the free space is no longer available */ + IREAD_UNLOCK(ipbmap); + return ENOSPC; + } + + /* read in the dmap covering the extent */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return (EIO); + } + dp = (dmap_t *) mp->data; + + /* try to allocate the requested extent */ + rc = dbAllocNext(bmp, dp, blkno, nblocks); + + IREAD_UNLOCK(ipbmap); + + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + return (rc); +} + + +/* + * NAME: dbReAlloc() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. if these + * blocks are not available, this routine will attempt to + * allocate a new set of contiguous blocks large enough + * to cover the existing allocation plus the additional + * number of blocks required. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * results - on successful return, set to the starting block number + * of the existing allocation if the existing allocation + * was extended in place or to a newly allocated contiguous + * range if the existing allocation could not be extended + * in place. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + */ +int +dbReAlloc(struct inode *ip, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results) +{ + int rc; + + /* try to extend the allocation in place. + */ + if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) { + *results = blkno; + return (0); + } else { + if (rc != ENOSPC) + return (rc); + } + + /* could not extend the allocation in place, so allocate a + * new set of blocks for the entire request (i.e. try to get + * a range of contiguous blocks large enough to cover the + * existing allocation plus the additional blocks.) + */ + return (dbAlloc + (ip, blkno + nblocks - 1, addnblocks + nblocks, results)); +} + + +/* + * NAME: dbExtend() + * + * FUNCTION: attempt to extend a current allocation by a specified + * number of blocks. + * + * this routine attempts to satisfy the allocation request + * by first trying to extend the existing allocation in + * place by allocating the additional blocks as the blocks + * immediately following the current allocation. + * + * PARAMETERS: + * ip - pointer to in-core inode requiring allocation. + * blkno - starting block of the current allocation. + * nblocks - number of contiguous blocks within the current + * allocation. + * addnblocks - number of blocks to add to the allocation. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + */ +int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 lblkno, lastblkno, extblkno; + uint rel_block; + metapage_t *mp; + dmap_t *dp; + int rc; + struct inode *ipbmap = sbi->ipbmap; + bmap_t *bmp; + + /* + * We don't want a non-aligned extent to cross a page boundary + */ + if (((rel_block = blkno & (sbi->nbperpage - 1))) && + (rel_block + nblocks + addnblocks > sbi->nbperpage)) + return (ENOSPC); + + /* get the last block of the current allocation */ + lastblkno = blkno + nblocks - 1; + + /* determine the block number of the block following + * the existing allocation. + */ + extblkno = lastblkno + 1; + + IREAD_LOCK(ipbmap); + + /* better be within the file system */ + bmp = sbi->bmap; + assert(lastblkno >= 0 && lastblkno < bmp->db_mapsize); + + /* we'll attempt to extend the current allocation in place by + * allocating the additional blocks as the blocks immediately + * following the current allocation. we only try to extend the + * current allocation in place if the number of additional blocks + * can fit into a dmap, the last block of the current allocation + * is not the last block of the file system, and the start of the + * inplace extension is not on an allocation group boundry. + */ + if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize || + (extblkno & (bmp->db_agsize - 1)) == 0) { + IREAD_UNLOCK(ipbmap); + return (ENOSPC); + } + + /* get the buffer for the dmap containing the first block + * of the extension. + */ + lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return (EIO); + } + + DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks); + dp = (dmap_t *) mp->data; + + /* try to allocate the blocks immediately following the + * current allocation. + */ + rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks); + + IREAD_UNLOCK(ipbmap); + + /* were we successful ? */ + if (rc == 0) { + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno, + addnblocks); + write_metapage(mp); + } else { + /* we were not successful */ + release_metapage(mp); + assert(rc == ENOSPC || rc == EIO); + } + + return (rc); +} + + +/* + * NAME: dbAllocNext() + * + * FUNCTION: attempt to allocate the blocks of the specified block + * range within a dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - starting block number of the range. + * nblocks - number of contiguous free blocks of the range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocNext(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw; + int l2size; + s8 *leaf; + u32 mask; + + /* pick up a pointer to the leaves of the dmap tree. + */ + leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* check if the specified block range is contained within + * this dmap. + */ + if (dbitno + nblocks > BPERDMAP) + return (ENOSPC); + + /* check if the starting leaf indicates that anything + * is free. + */ + if (leaf[word] == NOFREE) + return (ENOSPC); + + /* check the dmaps words corresponding to block range to see + * if the block range is free. not all bits of the first and + * last words may be contained within the block range. if this + * is the case, we'll work against those words (i.e. partial first + * and/or last) on an individual basis (a single pass) and examine + * the actual bits to determine if they are free. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the leaves of the dmap + * tree will be examined to determine if the blocks are free. a + * single leaf may describe the free space of multiple dmap + * words, so we may visit only a subset of the actual leaves + * corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of the word is to be examined. + */ + if (nb < DBWORD) { + /* check if the bits are free. + */ + mask = (ONES << (DBWORD - nb) >> wbitno); + if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask) + return (ENOSPC); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and how many bits. + */ + nwords = rembits >> L2DBWORD; + nb = nwords << L2DBWORD; + + /* now examine the appropriate leaves to determine + * if the blocks are free. + */ + while (nwords > 0) { + /* does the leaf describe any free space ? + */ + if (leaf[word] < BUDMIN) + return (ENOSPC); + + /* determine the l2 number of bits provided + * by this leaf. + */ + l2size = + min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* determine how many words were handled. + */ + nw = BUDSIZE(l2size, BUDMIN); + + nwords -= nw; + word += nw; + } + } + } + + /* allocate the blocks. + */ + return (dbAllocDmap(bmp, dp, blkno, nblocks)); +} + + +/* + * NAME: dbAllocNear() + * + * FUNCTION: attempt to allocate a number of contiguous free blocks near + * a specified block (hint) within a dmap. + * + * starting with the dmap leaf that covers the hint, we'll + * check the next four contiguous leaves for sufficient free + * space. if sufficient free space is found, we'll allocate + * the desired free space. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap. + * blkno - block number to allocate near. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocNear(bmap_t * bmp, + dmap_t * dp, s64 blkno, int nblocks, int l2nb, s64 * results) +{ + int word, lword, rc; + s8 *leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx); + + /* determine the word within the dmap that holds the hint + * (i.e. blkno). also, determine the last word in the dmap + * that we'll include in our examination. + */ + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + lword = min(word + 4, LPERDMAP); + + /* examine the leaves for sufficient free space. + */ + for (; word < lword; word++) { + /* does the leaf describe sufficient free space ? + */ + if (leaf[word] < l2nb) + continue; + + /* determine the block number within the file system + * of the first block described by this dmap word. + */ + blkno = le64_to_cpu(dp->start) + (word << L2DBWORD); + + /* if not all bits of the dmap word are free, get the + * starting bit number within the dmap word of the required + * string of free bits and adjust the block number with the + * value. + */ + if (leaf[word] < BUDMIN) + blkno += + dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb); + + /* allocate the blocks. + */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); + } + + return (ENOSPC); +} + + +/* + * NAME: dbAllocAG() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks within the specified allocation group. + * + * unless the allocation group size is equal to the number + * of blocks per dmap, the dmap control pages will be used to + * find the required free space, if available. we start the + * search at the highest dmap control page level which + * distinctly describes the allocation group's free space + * (i.e. the highest level at which the allocation group's + * free space is not mixed in with that of any other group). + * in addition, we start the search within this level at a + * height of the dmapctl dmtree at which the nodes distinctly + * describe the allocation group's free space. at this height, + * the allocation group's free space may be represented by 1 + * or two sub-trees, depending on the allocation group size. + * we search the top nodes of these subtrees left to right for + * sufficient free space. if sufficient free space is found, + * the subtree is searched to find the leftmost leaf that + * has free space. once we have made it to the leaf, we + * move the search to the next lower level dmap control page + * corresponding to this leaf. we continue down the dmap control + * pages until we find the dmap that contains or starts the + * sufficient free space and we allocate at this dmap. + * + * if the allocation group size is equal to the dmap size, + * we'll start at the dmap corresponding to the allocation + * group and attempt the allocation at this level. + * + * the dmap control page search is also not performed if the + * allocation group is completely free and we go to the first + * dmap of the allocation group to do the allocation. this is + * done because the allocation group may be part (not the first + * part) of a larger binary buddy system, causing the dmap + * control pages to indicate no free space (NOFREE) within + * the allocation group. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * agno - allocation group number. + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * note: IWRITE_LOCK(ipmap) held on entry/exit; + */ +static int +dbAllocAG(bmap_t * bmp, int agno, s64 nblocks, int l2nb, s64 * results) +{ + metapage_t *mp; + dmapctl_t *dcp; + int rc, ti, i, k, m, n, agperlev; + s64 blkno, lblkno; + int budmin; + + /* allocation request should not be for more than the + * allocation group size. + */ + assert(l2nb <= bmp->db_agl2size); + + /* determine the starting block number of the allocation + * group. + */ + blkno = (s64) agno << bmp->db_agl2size; + + /* check if the allocation group size is the minimum allocation + * group size or if the allocation group is completely free. if + * the allocation group size is the minimum size of BPERDMAP (i.e. + * 1 dmap), there is no need to search the dmap control page (below) + * that fully describes the allocation group since the allocation + * group is already fully described by a dmap. in this case, we + * just call dbAllocCtl() to search the dmap tree and allocate the + * required space if available. + * + * if the allocation group is completely free, dbAllocCtl() is + * also called to allocate the required space. this is done for + * two reasons. first, it makes no sense searching the dmap control + * pages for free space when we know that free space exists. second, + * the dmap control pages may indicate that the allocation group + * has no free space if the allocation group is part (not the first + * part) of a larger binary buddy system. + */ + if (bmp->db_agsize == BPERDMAP + || bmp->db_agfree[agno] == bmp->db_agsize) { + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + /* assert(!(rc == ENOSPC && bmp->db_agfree[agno] == bmp->db_agsize)); */ + if ((rc == ENOSPC) && + (bmp->db_agfree[agno] == bmp->db_agsize)) { + jERROR(1, + ("dbAllocAG: removed assert, but still need to debug here\nblkno = 0x%Lx, nblocks = 0x%Lx\n", + (unsigned long long) blkno, + (unsigned long long) nblocks)); + } + return (rc); + } + + /* the buffer for the dmap control page that fully describes the + * allocation group. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return (EIO); + dcp = (dmapctl_t *) mp->data; + budmin = dcp->budmin; + + /* search the subtree(s) of the dmap control page that describes + * the allocation group, looking for sufficient free space. to begin, + * determine how many allocation groups are represented in a dmap + * control page at the control page level (i.e. L0, L1, L2) that + * fully describes an allocation group. next, determine the starting + * tree index of this allocation group within the control page. + */ + agperlev = + (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; + ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); + + /* dmap control page trees fan-out by 4 and a single allocation + * group may be described by 1 or 2 subtrees within the ag level + * dmap control page, depending upon the ag size. examine the ag's + * subtrees for sufficient free space, starting with the leftmost + * subtree. + */ + for (i = 0; i < bmp->db_agwidth; i++, ti++) { + /* is there sufficient free space ? + */ + if (l2nb > dcp->stree[ti]) + continue; + + /* sufficient free space found in a subtree. now search down + * the subtree to find the leftmost leaf that describes this + * free space. + */ + for (k = bmp->db_agheigth; k > 0; k--) { + for (n = 0, m = (ti << 2) + 1; n < 4; n++) { + if (l2nb <= dcp->stree[m + n]) { + ti = m + n; + break; + } + } + assert(n < 4); + } + + /* determine the block number within the file system + * that corresponds to this leaf. + */ + if (bmp->db_aglevel == 2) + blkno = 0; + else if (bmp->db_aglevel == 1) + blkno &= ~(MAXL1SIZE - 1); + else /* bmp->db_aglevel == 0 */ + blkno &= ~(MAXL0SIZE - 1); + + blkno += + ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin; + + /* release the buffer in preparation for going down + * the next level of dmap control pages. + */ + release_metapage(mp); + + /* check if we need to continue to search down the lower + * level dmap control pages. we need to if the number of + * blocks required is less than maximum number of blocks + * described at the next lower level. + */ + if (l2nb < budmin) { + + /* search the lower level dmap control pages to get + * the starting block number of the the dmap that + * contains or starts off the free space. + */ + if ((rc = + dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1, + &blkno))) { + assert(rc != ENOSPC); + return (rc); + } + } + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + assert(rc != ENOSPC); + return (rc); + } + + /* no space in the allocation group. release the buffer and + * return ENOSPC. + */ + release_metapage(mp); + + return (ENOSPC); +} + + +/* + * NAME: dbAllocAny() + * + * FUNCTION: attempt to allocate the specified number of contiguous + * free blocks anywhere in the file system. + * + * dbAllocAny() attempts to find the sufficient free space by + * searching down the dmap control pages, starting with the + * highest level (i.e. L0, L1, L2) control page. if free space + * large enough to satisfy the desired free space is found, the + * desired free space is allocated. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks desired. + * l2nb - log2 number of contiguous free blocks desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocAny(bmap_t * bmp, s64 nblocks, int l2nb, s64 * results) +{ + int rc; + s64 blkno = 0; + + /* starting with the top level dmap control page, search + * down the dmap control levels for sufficient free space. + * if free space is found, dbFindCtl() returns the starting + * block number of the dmap that contains or starts off the + * range of free space. + */ + if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno))) + return (rc); + + /* allocate the blocks. + */ + rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results); + assert(rc != ENOSPC); + return (rc); +} + + +/* + * NAME: dbFindCtl() + * + * FUNCTION: starting at a specified dmap control page level and block + * number, search down the dmap control levels for a range of + * contiguous free blocks large enough to satisfy an allocation + * request for the specified number of free blocks. + * + * if sufficient contiguous free blocks are found, this routine + * returns the starting block number within a dmap page that + * contains or starts a range of contiqious free blocks that + * is sufficient in size. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * level - starting dmap control page level. + * l2nb - log2 number of contiguous free blocks desired. + * *blkno - on entry, starting block number for conducting the search. + * on successful return, the first block within a dmap page + * that contains or starts a range of contiguous free blocks. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFindCtl(bmap_t * bmp, int l2nb, int level, s64 * blkno) +{ + int rc, leafidx, lev; + s64 b, lblkno; + dmapctl_t *dcp; + int budmin; + metapage_t *mp; + + /* starting at the specified dmap control page level and block + * number, search down the dmap control levels for the starting + * block number of a dmap page that contains or starts off + * sufficient free blocks. + */ + for (lev = level, b = *blkno; lev >= 0; lev--) { + /* get the buffer of the dmap control page for the block + * number and level (i.e. L0, L1, L2). + */ + lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return (EIO); + dcp = (dmapctl_t *) mp->data; + budmin = dcp->budmin; + + /* search the tree within the dmap control page for + * sufficent free space. if sufficient free space is found, + * dbFindLeaf() returns the index of the leaf at which + * free space was found. + */ + rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx); + + /* release the buffer. + */ + release_metapage(mp); + + /* space found ? + */ + if (rc) { + assert(lev == level); + return (ENOSPC); + } + + /* adjust the block number to reflect the location within + * the dmap control page (i.e. the leaf) at which free + * space was found. + */ + b += (((s64) leafidx) << budmin); + + /* we stop the search at this dmap control page level if + * the number of blocks required is greater than or equal + * to the maximum number of blocks described at the next + * (lower) level. + */ + if (l2nb >= budmin) + break; + } + + *blkno = b; + return (0); +} + + +/* + * NAME: dbAllocCtl() + * + * FUNCTION: attempt to allocate a specified number of contiguous + * blocks starting within a specific dmap. + * + * this routine is called by higher level routines that search + * the dmap control pages above the actual dmaps for contiguous + * free space. the result of successful searches by these + * routines are the starting block numbers within dmaps, with + * the dmaps themselves containing the desired contiguous free + * space or starting a contiguous free space of desired size + * that is made up of the blocks of one or more dmaps. these + * calls should not fail due to insufficent resources. + * + * this routine is called in some cases where it is not known + * whether it will fail due to insufficient resources. more + * specifically, this occurs when allocating from an allocation + * group whose size is equal to the number of blocks per dmap. + * in this case, the dmap control pages are not examined prior + * to calling this routine (to save pathlength) and the call + * might fail. + * + * for a request size that fits within a dmap, this routine relies + * upon the dmap's dmtree to find the requested contiguous free + * space. for request sizes that are larger than a dmap, the + * requested free space will start at the first block of the + * first dmap (i.e. blkno). + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * nblocks - actual number of contiguous free blocks to allocate. + * l2nb - log2 number of contiguous free blocks to allocate. + * blkno - starting block number of the dmap to start the allocation + * from. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAllocCtl(bmap_t * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results) +{ + int rc, nb; + s64 b, lblkno, n; + metapage_t *mp; + dmap_t *dp; + + /* check if the allocation request is confined to a single dmap. + */ + if (l2nb <= L2BPERDMAP) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return (EIO); + dp = (dmap_t *) mp->data; + + /* try to allocate the blocks. + */ + rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results); + if (rc == 0) + mark_metapage_dirty(mp); + + release_metapage(mp); + + return (rc); + } + + /* allocation request involving multiple dmaps. it must start on + * a dmap boundary. + */ + assert((blkno & (BPERDMAP - 1)) == 0); + + /* allocate the blocks dmap by dmap. + */ + for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) { + /* get the buffer for the dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + rc = EIO; + goto backout; + } + dp = (dmap_t *) mp->data; + + /* the dmap better be all free. + */ + assert(dp->tree.stree[ROOT] == L2BPERDMAP); + + /* determine how many blocks to allocate from this dmap. + */ + nb = min(n, (s64)BPERDMAP); + + /* allocate the blocks from the dmap. + */ + if ((rc = dbAllocDmap(bmp, dp, b, nb))) { + release_metapage(mp); + goto backout; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + /* set the results (starting block number) and return. + */ + *results = blkno; + return (0); + + /* something failed in handling an allocation request involving + * multiple dmaps. we'll try to clean up by backing out any + * allocation that has already happened for this request. if + * we fail in backing out the allocation, we'll mark the file + * system to indicate that blocks have been leaked. + */ + backout: + + /* try to backout the allocations dmap by dmap. + */ + for (n = nblocks - n, b = blkno; n > 0; + n -= BPERDMAP, b += BPERDMAP) { + /* get the buffer for this dmap. + */ + lblkno = BLKTODMAP(b, bmp->db_l2nbperpage); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + fsDirty(); /* !!! */ + jERROR(1, + ("dbAllocCtl: I/O Error: Block Leakage.\n")); + continue; + } + dp = (dmap_t *) mp->data; + + /* free the blocks is this dmap. + */ + if (dbFreeDmap(bmp, dp, b, BPERDMAP)) { + /* could not back out. mark the file system + * to indicate that we have leaked blocks. + */ + release_metapage(mp); + fsDirty(); /* !!! */ + jERROR(1, ("dbAllocCtl: Block Leakage.\n")); + continue; + } + + /* write the buffer. + */ + write_metapage(mp); + } + + return (rc); +} + + +/* + * NAME: dbAllocDmapLev() + * + * FUNCTION: attempt to allocate a specified number of contiguous blocks + * from a specified dmap. + * + * this routine checks if the contiguous blocks are available. + * if so, nblocks of blocks are allocated; otherwise, ENOSPC is + * returned. + * + * PARAMETERS: + * mp - pointer to bmap descriptor + * dp - pointer to dmap to attempt to allocate blocks from. + * l2nb - log2 number of contiguous block desired. + * nblocks - actual number of contiguous block desired. + * results - on successful return, set to the starting block number + * of the newly allocated range. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient disk resources + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or + * IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit; + */ +static int +dbAllocDmapLev(bmap_t * bmp, + dmap_t * dp, int nblocks, int l2nb, s64 * results) +{ + s64 blkno; + int leafidx, rc; + + /* can't be more than a dmaps worth of blocks */ + assert(l2nb <= L2BPERDMAP); + + /* search the tree within the dmap page for sufficient + * free space. if sufficient free space is found, dbFindLeaf() + * returns the index of the leaf at which free space was found. + */ + if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx)) + return (ENOSPC); + + /* determine the block number within the file system corresponding + * to the leaf at which free space was found. + */ + blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD); + + /* if not all bits of the dmap word are free, get the starting + * bit number within the dmap word of the required string of free + * bits and adjust the block number with this value. + */ + if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN) + blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb); + + /* allocate the blocks */ + if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0) + *results = blkno; + + return (rc); +} + + +/* + * NAME: dbAllocDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine allocates the specified blocks from the dmap + * through a call to dbAllocBits(). if the allocation of the + * block range causes the maximum string of free blocks within + * the dmap to change (i.e. the value of the root of the dmap's + * dmtree), this routine will cause this change to be reflected + * up through the appropriate levels of the dmap control pages + * by a call to dbAdjCtl() for the L0 dmap control page that + * covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate the block range from. + * blkno - starting block number of the block to be allocated. + * nblocks - number of blocks to be allocated. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbAllocDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + s8 oldroot; + int rc; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* allocate the specified (blocks) bits */ + dbAllocBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbFreeDmap() + * + * FUNCTION: adjust the disk allocation map to reflect the allocation + * of a specified block range within a dmap. + * + * this routine frees the specified blocks from the dmap through + * a call to dbFreeBits(). if the deallocation of the block range + * causes the maximum string of free blocks within the dmap to + * change (i.e. the value of the root of the dmap's dmtree), this + * routine will cause this change to be reflected up through the + * appropriate levels of the dmap control pages by a call to + * dbAdjCtl() for the L0 dmap control page that covers this dmap. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free the block range from. + * blkno - starting block number of the block to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int dbFreeDmap(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + s8 oldroot; + int rc, word; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = dp->tree.stree[ROOT]; + + /* free the specified (blocks) bits */ + dbFreeBits(bmp, dp, blkno, nblocks); + + /* if the root has not changed, done. */ + if (dp->tree.stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the deallocation. + */ + if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) { + word = (blkno & (BPERDMAP - 1)) >> L2DBWORD; + + /* as part of backing out the deallocation, we will have + * to back split the dmap tree if the deallocation caused + * the freed blocks to become part of a larger binary buddy + * system. + */ + if (dp->tree.stree[word] == NOFREE) + dbBackSplit((dmtree_t *) & dp->tree, word); + + dbAllocBits(bmp, dp, blkno, nblocks); + } + + return (rc); +} + + +/* + * NAME: dbAllocBits() + * + * FUNCTION: allocate a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits allocated. it also causes the + * dmap's dmtree, as a whole, to reflect the allocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to allocate bits from. + * blkno - starting block number of the bits to be allocated. + * nblocks - number of bits to be allocated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbAllocBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + s8 *leaf; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = dp->tree.stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + /* update the leaf for this dmap word. in addition + * to setting the leaf value to the binary buddy max + * of the updated dmap word, dbSplit() will split + * the binary system of the leaves if need be. + */ + dbSplit(tp, word, BUDMIN, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the allocated words. + */ + for (; nwords > 0; nwords -= nw) { + assert(leaf[word] >= BUDMIN); + + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being allocated and the l2 number + * of bits currently described by this leaf. + */ + size = min((int)leaf[word], NLSTOL2BSZ(nwords)); + + /* update the leaf to reflect the allocation. + * in addition to setting the leaf value to + * NOFREE, dbSplit() will split the binary + * system of the leaves to reflect the current + * allocation (size). + */ + dbSplit(tp, word, size, NOFREE); + + /* get the number of dmap words handled */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap */ + dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the maximum allocation group number if this allocation + * group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbFreeBits() + * + * FUNCTION: free a specified block range from a dmap. + * + * this routine updates the dmap to reflect the working + * state allocation of the specified block range. it directly + * updates the bits of the working map and causes the adjustment + * of the binary buddy system described by the dmap's dmtree + * leaves to reflect the bits freed. it also causes the dmap's + * dmtree, as a whole, to reflect the deallocated range. + * + * PARAMETERS: + * bmp - pointer to bmap descriptor + * dp - pointer to dmap to free bits from. + * blkno - starting block number of the bits to be freed. + * nblocks - number of bits to be freed. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbFreeBits(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + int dbitno, word, rembits, nb, nwords, wbitno, nw, agno; + dmtree_t *tp = (dmtree_t *) & dp->tree; + int size; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap. + */ + assert(dbitno + nblocks <= BPERDMAP); + + /* free the bits of the dmaps words corresponding to the block range. + * not all bits of the first and last words may be contained within + * the block range. if this is the case, we'll work against those + * words (i.e. partial first and/or last) on an individual basis + * (a single pass), freeing the bits of interest by hand and updating + * the leaf corresponding to the dmap word. a single pass will be used + * for all dmap words fully contained within the specified range. + * within this pass, the bits of all fully contained dmap words will + * be marked as free in a single shot and the leaves will be updated. a + * single leaf may describe the free space of multiple dmap words, + * so we may update only a subset of the actual leaves corresponding + * to the dmap words of the block range. + * + * dbJoin() is used to update leaf values and will join the binary + * buddy system of the leaves if the new leaf values indicate this + * should be done. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be freed. + */ + if (nb < DBWORD) { + /* free (zero) the appropriate bits within this + * dmap word. + */ + dp->wmap[word] &= + cpu_to_le32(~(ONES << (DBWORD - nb) + >> wbitno)); + + /* update the leaf for this dmap word. + */ + dbJoin(tp, word, + dbMaxBud((u8 *) & dp->wmap[word])); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and free (zero) the bits of these words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], 0, nwords * 4); + + /* determine how many bits. + */ + nb = nwords << L2DBWORD; + + /* now update the appropriate leaves to reflect + * the freed words. + */ + for (; nwords > 0; nwords -= nw) { + /* determine what the leaf value should be + * updated to as the minimum of the l2 number + * of bits being freed and the l2 (max) number + * of bits that can be described by this leaf. + */ + size = + min(LITOL2BSZ + (word, L2LPERDMAP, BUDMIN), + NLSTOL2BSZ(nwords)); + + /* update the leaf. + */ + dbJoin(tp, word, size); + + /* get the number of dmap words handled. + */ + nw = BUDSIZE(size, BUDMIN); + word += nw; + } + } + } + + /* update the free count for this dmap. + */ + dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); + + BMAP_LOCK(bmp); + + /* update the free count for the allocation group and + * map. + */ + agno = blkno >> bmp->db_agl2size; + bmp->db_nfree += nblocks; + bmp->db_agfree[agno] += nblocks; + + /* check if this allocation group is not completely free and + * if it is currently the maximum (rightmost) allocation group. + * if so, establish the new maximum allocation group number by + * searching left for the first allocation group with allocation. + */ + if ((bmp->db_agfree[agno] == bmp->db_agsize + && agno == bmp->db_maxag) || (agno == bmp->db_numag - 1 + && bmp->db_agfree[agno] == + (bmp-> db_mapsize & + (BPERDMAP - 1)))) { + while (bmp->db_maxag > 0) { + bmp->db_maxag -= 1; + if (bmp->db_agfree[bmp->db_maxag] != + bmp->db_agsize) + break; + } + + /* re-establish the allocation group preference if the + * current preference is right of the maximum allocation + * group. + */ + if (bmp->db_agpref > bmp->db_maxag) + bmp->db_agpref = bmp->db_maxag; + } + + BMAP_UNLOCK(bmp); +} + + +/* + * NAME: dbAdjCtl() + * + * FUNCTION: adjust a dmap control page at a specified level to reflect + * the change in a lower level dmap or dmap control page's + * maximum string of free blocks (i.e. a change in the root + * of the lower level object's dmtree) due to the allocation + * or deallocation of a range of blocks with a single dmap. + * + * on entry, this routine is provided with the new value of + * the lower level dmap or dmap control page root and the + * starting block number of the block range whose allocation + * or deallocation resulted in the root change. this range + * is respresented by a single leaf of the current dmapctl + * and the leaf will be updated with this value, possibly + * causing a binary buddy system within the leaves to be + * split or joined. the update may also cause the dmapctl's + * dmtree to be updated. + * + * if the adjustment of the dmap control page, itself, causes its + * root to change, this change will be bubbled up to the next dmap + * control level by a recursive call to this routine, specifying + * the new root value and the next dmap control page level to + * be adjusted. + * PARAMETERS: + * bmp - pointer to bmap descriptor + * blkno - the first block of a block range within a dmap. it is + * the allocation or deallocation of this block range that + * requires the dmap control page to be adjusted. + * newval - the new value of the lower level dmap or dmap control + * page root. + * alloc - TRUE if adjustment is due to an allocation. + * level - current level of dmap control page (i.e. L0, L1, L2) to + * be adjusted. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static int +dbAdjCtl(bmap_t * bmp, s64 blkno, int newval, int alloc, int level) +{ + metapage_t *mp; + s8 oldroot; + int oldval; + s64 lblkno; + dmapctl_t *dcp; + int rc, leafno, ti; + + /* get the buffer for the dmap control page for the specified + * block number and control page level. + */ + lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level); + mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) + return (EIO); + dcp = (dmapctl_t *) mp->data; + + /* determine the leaf number corresponding to the block and + * the index within the dmap control tree. + */ + leafno = BLKTOCTLLEAF(blkno, dcp->budmin); + ti = leafno + le32_to_cpu(dcp->leafidx); + + /* save the current leaf value and the current root level (i.e. + * maximum l2 free string described by this dmapctl). + */ + oldval = dcp->stree[ti]; + oldroot = dcp->stree[ROOT]; + + /* check if this is a control page update for an allocation. + * if so, update the leaf to reflect the new leaf value using + * dbSplit(); otherwise (deallocation), use dbJoin() to udpate + * the leaf with the new value. in addition to updating the + * leaf, dbSplit() will also split the binary buddy system of + * the leaves, if required, and bubble new values within the + * dmapctl tree, if required. similarly, dbJoin() will join + * the binary buddy system of leaves and bubble new values up + * the dmapctl tree as required by the new leaf value. + */ + if (alloc) { + /* check if we are in the middle of a binary buddy + * system. this happens when we are performing the + * first allocation out of an allocation group that + * is part (not the first part) of a larger binary + * buddy system. if we are in the middle, back split + * the system prior to calling dbSplit() which assumes + * that it is at the front of a binary buddy system. + */ + if (oldval == NOFREE) { + dbBackSplit((dmtree_t *) dcp, leafno); + oldval = dcp->stree[ti]; + } + dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval); + } else { + dbJoin((dmtree_t *) dcp, leafno, newval); + } + + /* check if the root of the current dmap control page changed due + * to the update and if the current dmap control page is not at + * the current top level (i.e. L0, L1, L2) of the map. if so (i.e. + * root changed and this is not the top level), call this routine + * again (recursion) for the next higher level of the mapping to + * reflect the change in root for the current dmap control page. + */ + if (dcp->stree[ROOT] != oldroot) { + /* are we below the top level of the map. if so, + * bubble the root up to the next higher level. + */ + if (level < bmp->db_maxlevel) { + /* bubble up the new root of this dmap control page to + * the next level. + */ + if ((rc = + dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc, + level + 1))) { + /* something went wrong in bubbling up the new + * root value, so backout the changes to the + * current dmap control page. + */ + if (alloc) { + dbJoin((dmtree_t *) dcp, leafno, + oldval); + } else { + /* the dbJoin() above might have + * caused a larger binary buddy system + * to form and we may now be in the + * middle of it. if this is the case, + * back split the buddies. + */ + if (dcp->stree[ti] == NOFREE) + dbBackSplit((dmtree_t *) + dcp, leafno); + dbSplit((dmtree_t *) dcp, leafno, + dcp->budmin, oldval); + } + + /* release the buffer and return the error. + */ + release_metapage(mp); + return (rc); + } + } else { + /* we're at the top level of the map. update + * the bmap control page to reflect the size + * of the maximum free buddy system. + */ + assert(level == bmp->db_maxlevel); + assert(bmp->db_maxfreebud == oldroot); + bmp->db_maxfreebud = dcp->stree[ROOT]; + } + } + + /* write the buffer. + */ + write_metapage(mp); + + return (0); +} + + +/* + * NAME: dbSplit() + * + * FUNCTION: update the leaf of a dmtree with a new value, splitting + * the leaf from the binary buddy system of the dmtree's + * leaves, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * splitsz - the size the binary buddy system starting at the leaf + * must be split to, specified as the log2 number of blocks. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval) +{ + int budsz; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* check if the leaf needs to be split. + */ + if (leaf[leafno] > tp->dmt_budmin) { + /* the split occurs by cutting the buddy system in half + * at the specified leaf until we reach the specified + * size. pick up the starting split size (current size + * - 1 in l2) and the corresponding buddy size. + */ + cursz = leaf[leafno] - 1; + budsz = BUDSIZE(cursz, tp->dmt_budmin); + + /* split until we reach the specified size. + */ + while (cursz >= splitsz) { + /* update the buddy's leaf with its new value. + */ + dbAdjTree(tp, leafno ^ budsz, cursz); + + /* on to the next size and buddy. + */ + cursz -= 1; + budsz >>= 1; + } + } + + /* adjust the dmap tree to reflect the specified leaf's new + * value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbBackSplit() + * + * FUNCTION: back split the binary buddy system of dmtree leaves + * that hold a specified leaf until the specified leaf + * starts its own binary buddy system. + * + * the allocators typically perform allocations at the start + * of binary buddy systems and dbSplit() is used to accomplish + * any required splits. in some cases, however, allocation + * may occur in the middle of a binary system and requires a + * back split, with the split proceeding out from the middle of + * the system (less efficient) rather than the start of the + * system (more efficient). the cases in which a back split + * is required are rare and are limited to the first allocation + * within an allocation group which is a part (not first part) + * of a larger binary buddy system and a few exception cases + * in which a previous join operation must be backed out. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * + * RETURN VALUES: none + * + * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit; + */ +static void dbBackSplit(dmtree_t * tp, int leafno) +{ + int budsz, bud, w, bsz, size; + int cursz; + s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* leaf should be part (not first part) of a binary + * buddy system. + */ + assert(leaf[leafno] == NOFREE); + + /* the back split is accomplished by iteratively finding the leaf + * that starts the buddy system that contains the specified leaf and + * splitting that system in two. this iteration continues until + * the specified leaf becomes the start of a buddy system. + * + * determine maximum possible l2 size for the specified leaf. + */ + size = + LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs), + tp->dmt_budmin); + + /* determine the number of leaves covered by this size. this + * is the buddy size that we will start with as we search for + * the buddy system that contains the specified leaf. + */ + budsz = BUDSIZE(size, tp->dmt_budmin); + + /* back split. + */ + while (leaf[leafno] == NOFREE) { + /* find the leftmost buddy leaf. + */ + for (w = leafno, bsz = budsz;; bsz <<= 1, + w = (w < bud) ? w : bud) { + assert(bsz < le32_to_cpu(tp->dmt_nleafs)); + + /* determine the buddy. + */ + bud = w ^ bsz; + + /* check if this buddy is the start of the system. + */ + if (leaf[bud] != NOFREE) { + /* split the leaf at the start of the + * system in two. + */ + cursz = leaf[bud] - 1; + dbSplit(tp, bud, cursz, cursz); + break; + } + } + } + + assert(leaf[leafno] == size); +} + + +/* + * NAME: dbJoin() + * + * FUNCTION: update the leaf of a dmtree with a new value, joining + * the leaf with other leaves of the dmtree into a multi-leaf + * binary buddy system, as required. + * + * PARAMETERS: + * tp - pointer to the tree containing the leaf. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbJoin(dmtree_t * tp, int leafno, int newval) +{ + int budsz, buddy; + s8 *leaf; + + /* can the new leaf value require a join with other leaves ? + */ + if (newval >= tp->dmt_budmin) { + /* pickup a pointer to the leaves of the tree. + */ + leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx); + + /* try to join the specified leaf into a large binary + * buddy system. the join proceeds by attempting to join + * the specified leafno with its buddy (leaf) at new value. + * if the join occurs, we attempt to join the left leaf + * of the joined buddies with its buddy at new value + 1. + * we continue to join until we find a buddy that cannot be + * joined (does not have a value equal to the size of the + * last join) or until all leaves have been joined into a + * single system. + * + * get the buddy size (number of words covered) of + * the new value. + */ + budsz = BUDSIZE(newval, tp->dmt_budmin); + + /* try to join. + */ + while (budsz < le32_to_cpu(tp->dmt_nleafs)) { + /* get the buddy leaf. + */ + buddy = leafno ^ budsz; + + /* if the leaf's new value is greater than its + * buddy's value, we join no more. + */ + if (newval > leaf[buddy]) + break; + + assert(newval == leaf[buddy]); + + /* check which (leafno or buddy) is the left buddy. + * the left buddy gets to claim the blocks resulting + * from the join while the right gets to claim none. + * the left buddy is also eligable to participate in + * a join at the next higher level while the right + * is not. + * + */ + if (leafno < buddy) { + /* leafno is the left buddy. + */ + dbAdjTree(tp, buddy, NOFREE); + } else { + /* buddy is the left buddy and becomes + * leafno. + */ + dbAdjTree(tp, leafno, NOFREE); + leafno = buddy; + } + + /* on to try the next join. + */ + newval += 1; + budsz <<= 1; + } + } + + /* update the leaf value. + */ + dbAdjTree(tp, leafno, newval); +} + + +/* + * NAME: dbAdjTree() + * + * FUNCTION: update a leaf of a dmtree with a new value, adjusting + * the dmtree, as required, to reflect the new leaf value. + * the combination of any buddies must already be done before + * this is called. + * + * PARAMETERS: + * tp - pointer to the tree to be adjusted. + * leafno - the number of the leaf to be updated. + * newval - the new value for the leaf. + * + * RETURN VALUES: none + */ +static void dbAdjTree(dmtree_t * tp, int leafno, int newval) +{ + int lp, pp, k; + int max; + + /* pick up the index of the leaf for this leafno. + */ + lp = leafno + le32_to_cpu(tp->dmt_leafidx); + + /* is the current value the same as the old value ? if so, + * there is nothing to do. + */ + if (tp->dmt_stree[lp] == newval) + return; + + /* set the new value. + */ + tp->dmt_stree[lp] = newval; + + /* bubble the new value up the tree as required. + */ + for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) { + /* get the index of the first leaf of the 4 leaf + * group containing the specified leaf (leafno). + */ + lp = ((lp - 1) & ~0x03) + 1; + + /* get the index of the parent of this 4 leaf group. + */ + pp = (lp - 1) >> 2; + + /* determine the maximum of the 4 leaves. + */ + max = TREEMAX(&tp->dmt_stree[lp]); + + /* if the maximum of the 4 is the same as the + * parent's value, we're done. + */ + if (tp->dmt_stree[pp] == max) + break; + + /* parent gets new value. + */ + tp->dmt_stree[pp] = max; + + /* parent becomes leaf for next go-round. + */ + lp = pp; + } +} + + +/* + * NAME: dbFindLeaf() + * + * FUNCTION: search a dmtree_t for sufficient free blocks, returning + * the index of a leaf describing the free blocks if + * sufficient free blocks are found. + * + * the search starts at the top of the dmtree_t tree and + * proceeds down the tree to the leftmost leaf with sufficient + * free space. + * + * PARAMETERS: + * tp - pointer to the tree to be searched. + * l2nb - log2 number of free blocks to search for. + * leafidx - return pointer to be set to the index of the leaf + * describing at least l2nb free blocks if sufficient + * free blocks are found. + * + * RETURN VALUES: + * 0 - success + * ENOSPC - insufficient free blocks. + */ +static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx) +{ + int ti, n = 0, k, x = 0; + + /* first check the root of the tree to see if there is + * sufficient free space. + */ + if (l2nb > tp->dmt_stree[ROOT]) + return (ENOSPC); + + /* sufficient free space available. now search down the tree + * starting at the next level for the leftmost leaf that + * describes sufficient free space. + */ + for (k = le32_to_cpu(tp->dmt_height), ti = 1; + k > 0; k--, ti = ((ti + n) << 2) + 1) { + /* search the four nodes at this level, starting from + * the left. + */ + for (x = ti, n = 0; n < 4; n++) { + /* sufficient free space found. move to the next + * level (or quit if this is the last level). + */ + if (l2nb <= tp->dmt_stree[x + n]) + break; + } + + /* better have found something since the higher + * levels of the tree said it was here. + */ + assert(n < 4); + } + + /* set the return to the leftmost leaf describing sufficient + * free space. + */ + *leafidx = x + n - le32_to_cpu(tp->dmt_leafidx); + + return (0); +} + + +/* + * NAME: dbFindBits() + * + * FUNCTION: find a specified number of binary buddy free bits within a + * dmap bitmap word value. + * + * this routine searches the bitmap value for (1 << l2nb) free + * bits at (1 << l2nb) alignments within the value. + * + * PARAMETERS: + * word - dmap bitmap word value. + * l2nb - number of free bits specified as a log2 number. + * + * RETURN VALUES: + * starting bit number of free bits. + */ +static int dbFindBits(u32 word, int l2nb) +{ + int bitno, nb; + u32 mask; + + /* get the number of bits. + */ + nb = 1 << l2nb; + assert(nb <= DBWORD); + + /* complement the word so we can use a mask (i.e. 0s represent + * free bits) and compute the mask. + */ + word = ~word; + mask = ONES << (DBWORD - nb); + + /* scan the word for nb free bits at nb alignments. + */ + for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) { + if ((mask & word) == mask) + break; + } + + ASSERT(bitno < 32); + + /* return the bit number. + */ + return (bitno); +} + + +/* + * NAME: dbMaxBud(u8 *cp) + * + * FUNCTION: determine the largest binary buddy string of free + * bits within 32-bits of the map. + * + * PARAMETERS: + * cp - pointer to the 32-bit value. + * + * RETURN VALUES: + * largest binary buddy of free bits within a dmap word. + */ +static int dbMaxBud(u8 * cp) +{ + signed char tmp1, tmp2; + + /* check if the wmap word is all free. if so, the + * free buddy size is BUDMIN. + */ + if (*((uint *) cp) == 0) + return (BUDMIN); + + /* check if the wmap word is half free. if so, the + * free buddy size is BUDMIN-1. + */ + if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0) + return (BUDMIN - 1); + + /* not all free or half free. determine the free buddy + * size thru table lookup using quarters of the wmap word. + */ + tmp1 = max(budtab[cp[2]], budtab[cp[3]]); + tmp2 = max(budtab[cp[0]], budtab[cp[1]]); + return (max(tmp1, tmp2)); +} + + +/* + * NAME: cnttz(uint word) + * + * FUNCTION: determine the number of trailing zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of trailing zeros + */ +int cnttz(u32 word) +{ + int n; + + for (n = 0; n < 32; n++, word >>= 1) { + if (word & 0x01) + break; + } + + return (n); +} + + +/* + * NAME: cntlz(u32 value) + * + * FUNCTION: determine the number of leading zeros within a 32-bit + * value. + * + * PARAMETERS: + * value - 32-bit value to be examined. + * + * RETURN VALUES: + * count of leading zeros + */ +int cntlz(u32 value) +{ + int n; + + for (n = 0; n < 32; n++, value <<= 1) { + if (value & HIGHORDER) + break; + } + return (n); +} + + +/* + * NAME: blkstol2(s64 nb) + * + * FUNCTION: convert a block count to its log2 value. if the block + * count is not a l2 multiple, it is rounded up to the next + * larger l2 multiple. + * + * PARAMETERS: + * nb - number of blocks + * + * RETURN VALUES: + * log2 number of blocks + */ +int blkstol2(s64 nb) +{ + int l2nb; + s64 mask; /* meant to be signed */ + + mask = (s64) 1 << (64 - 1); + + /* count the leading bits. + */ + for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) { + /* leading bit found. + */ + if (nb & mask) { + /* determine the l2 value. + */ + l2nb = (64 - 1) - l2nb; + + /* check if we need to round up. + */ + if (~mask & nb) + l2nb++; + + return (l2nb); + } + } + assert(0); + return 0; /* fix compiler warning */ +} + + +/* + * NAME: fsDirty() + * + * FUNCTION: xxx + * + * PARAMETERS: + * ipmnt - mount inode + * + * RETURN VALUES: + * none + */ +void fsDirty() +{ + printk("fsDirty(): bye-bye\n"); + assert(0); +} + + +/* + * NAME: dbAllocBottomUp() + * + * FUNCTION: alloc the specified block range from the working block + * allocation map. + * + * the blocks will be alloc from the working map one dmap + * at a time. + * + * PARAMETERS: + * ip - pointer to in-core inode; + * blkno - starting block number to be freed. + * nblocks - number of blocks to be freed. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error + */ +int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks) +{ + metapage_t *mp; + dmap_t *dp; + int nb, rc; + s64 lblkno, rem; + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + bmap_t *bmp = JFS_SBI(ip->i_sb)->bmap; + + IREAD_LOCK(ipbmap); + + /* block to be allocated better be within the mapsize. */ + ASSERT(nblocks <= bmp->db_mapsize - blkno); + + /* + * allocate the blocks a dmap at a time. + */ + mp = NULL; + for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) { + /* release previous dmap if any */ + if (mp) { + write_metapage(mp); + } + + /* get the buffer for the current dmap. */ + lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + IREAD_UNLOCK(ipbmap); + return (EIO); + } + dp = (dmap_t *) mp->data; + + /* determine the number of blocks to be allocated from + * this dmap. + */ + nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1))); + + DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + + /* allocate the blocks. */ + if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) { + release_metapage(mp); + IREAD_UNLOCK(ipbmap); + return (rc); + } + + DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb); + } + + /* write the last buffer. */ + write_metapage(mp); + + IREAD_UNLOCK(ipbmap); + + return (0); +} + + +static int dbAllocDmapBU(bmap_t * bmp, dmap_t * dp, s64 blkno, int nblocks) +{ + int rc; + int dbitno, word, rembits, nb, nwords, wbitno, agno; + s8 oldroot, *leaf; + dmaptree_t *tp = (dmaptree_t *) & dp->tree; + + /* save the current value of the root (i.e. maximum free string) + * of the dmap tree. + */ + oldroot = tp->stree[ROOT]; + + /* pick up a pointer to the leaves of the dmap tree */ + leaf = tp->stree + LEAFIND; + + /* determine the bit number and word within the dmap of the + * starting block. + */ + dbitno = blkno & (BPERDMAP - 1); + word = dbitno >> L2DBWORD; + + /* block range better be within the dmap */ + assert(dbitno + nblocks <= BPERDMAP); + + /* allocate the bits of the dmap's words corresponding to the block + * range. not all bits of the first and last words may be contained + * within the block range. if this is the case, we'll work against + * those words (i.e. partial first and/or last) on an individual basis + * (a single pass), allocating the bits of interest by hand and + * updating the leaf corresponding to the dmap word. a single pass + * will be used for all dmap words fully contained within the + * specified range. within this pass, the bits of all fully contained + * dmap words will be marked as free in a single shot and the leaves + * will be updated. a single leaf may describe the free space of + * multiple dmap words, so we may update only a subset of the actual + * leaves corresponding to the dmap words of the block range. + */ + for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) { + /* determine the bit number within the word and + * the number of bits within the word. + */ + wbitno = dbitno & (DBWORD - 1); + nb = min(rembits, DBWORD - wbitno); + + /* check if only part of a word is to be allocated. + */ + if (nb < DBWORD) { + /* allocate (set to 1) the appropriate bits within + * this dmap word. + */ + dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb) + >> wbitno); + + word += 1; + } else { + /* one or more dmap words are fully contained + * within the block range. determine how many + * words and allocate (set to 1) the bits of these + * words. + */ + nwords = rembits >> L2DBWORD; + memset(&dp->wmap[word], (int) ONES, nwords * 4); + + /* determine how many bits */ + nb = nwords << L2DBWORD; + } + } + + /* update the free count for this dmap */ + dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); + + /* reconstruct summary tree */ + dbInitDmapTree(dp); + + BMAP_LOCK(bmp); + + /* if this allocation group is completely free, + * update the highest active allocation group number + * if this allocation group is the new max. + */ + agno = blkno >> bmp->db_agl2size; + if (agno > bmp->db_maxag) + bmp->db_maxag = agno; + + /* update the free count for the allocation group and map */ + bmp->db_agfree[agno] -= nblocks; + bmp->db_nfree -= nblocks; + + BMAP_UNLOCK(bmp); + + /* if the root has not changed, done. */ + if (tp->stree[ROOT] == oldroot) + return (0); + + /* root changed. bubble the change up to the dmap control pages. + * if the adjustment of the upper level control pages fails, + * backout the bit allocation (thus making everything consistent). + */ + if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0))) + dbFreeBits(bmp, dp, blkno, nblocks); + + return (rc); +} + + +/* + * NAME: dbExtendFS() + * + * FUNCTION: extend bmap from blkno for nblocks; + * dbExtendFS() updates bmap ready for dbAllocBottomUp(); + * + * L2 + * | + * L1---------------------------------L1 + * | | + * L0---------L0---------L0 L0---------L0---------L0 + * | | | | | | + * d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,...,dn d0,.,dm; + * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm + * + * <---old---><----------------------------extend-----------------------> + */ +int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) +{ + struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb); + int nbperpage = sbi->nbperpage; + int i, i0 = TRUE, j, j0 = TRUE, k, n; + s64 newsize; + s64 p; + metapage_t *mp, *l2mp, *l1mp, *l0mp; + dmapctl_t *l2dcp, *l1dcp, *l0dcp; + dmap_t *dp; + s8 *l0leaf, *l1leaf, *l2leaf; + bmap_t *bmp = sbi->bmap; + int agno, l2agsize, oldl2agsize; + s64 ag_rem; + + newsize = blkno + nblocks; + + jEVENT(0, ("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld\n", + (long long) blkno, (long long) nblocks, + (long long) newsize)); + + /* + * initialize bmap control page. + * + * all the data in bmap control page should exclude + * the mkfs hidden dmap page. + */ + + /* update mapsize */ + bmp->db_mapsize = newsize; + bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize); + + /* compute new AG size */ + l2agsize = dbGetL2AGSize(newsize); + oldl2agsize = bmp->db_agl2size; + + bmp->db_agl2size = l2agsize; + bmp->db_agsize = 1 << l2agsize; + + /* compute new number of AG */ + agno = bmp->db_numag; + bmp->db_numag = newsize >> l2agsize; + bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0; + + /* + * reconfigure db_agfree[] + * from old AG configuration to new AG configuration; + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + if (l2agsize == oldl2agsize) + goto extend; + k = 1 << (l2agsize - oldl2agsize); + ag_rem = bmp->db_agfree[0]; /* save agfree[0] */ + for (i = 0, n = 0; i < agno; n++) { + bmp->db_agfree[n] = 0; /* init collection point */ + + /* coalesce cotiguous k AGs; */ + for (j = 0; j < k && i < agno; j++, i++) { + /* merge AGi to AGn */ + bmp->db_agfree[n] += bmp->db_agfree[i]; + } + } + bmp->db_agfree[0] += ag_rem; /* restore agfree[0] */ + + for (; n < MAXAG; n++) + bmp->db_agfree[n] = 0; + + /* + * update highest active ag number + */ + + bmp->db_maxag = bmp->db_maxag / k; + + /* + * extend bmap + * + * update bit maps and corresponding level control pages; + * global control page db_nfree, db_agfree[agno], db_maxfreebud; + */ + extend: + /* get L2 page */ + p = BMAPBLKNO + nbperpage; /* L2 page */ + l2mp = read_metapage(ipbmap, p, PSIZE, 0); + assert(l2mp); + l2dcp = (dmapctl_t *) l2mp->data; + + /* compute start L1 */ + k = blkno >> L2MAXL1SIZE; + l2leaf = l2dcp->stree + CTLLEAFIND + k; + p = BLKTOL1(blkno, sbi->l2nbperpage); /* L1 page */ + + /* + * extend each L1 in L2 + */ + for (; k < LPERCTL; k++, p += nbperpage) { + /* get L1 page */ + if (j0) { + /* read in L1 page: (blkno & (MAXL1SIZE - 1)) */ + l1mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + l1dcp = (dmapctl_t *) l1mp->data; + + /* compute start L0 */ + j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE; + l1leaf = l1dcp->stree + CTLLEAFIND + j; + p = BLKTOL0(blkno, sbi->l2nbperpage); + j0 = FALSE; + } else { + /* assign/init L1 page */ + l1mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l1mp == NULL) + goto errout; + + l1dcp = (dmapctl_t *) l1mp->data; + + /* compute start L0 */ + j = 0; + l1leaf = l1dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st L0 of L1.k */ + } + + /* + * extend each L0 in L1 + */ + for (; j < LPERCTL; j++) { + /* get L0 page */ + if (i0) { + /* read in L0 page: (blkno & (MAXL0SIZE - 1)) */ + + l0mp = read_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + l0dcp = (dmapctl_t *) l0mp->data; + + /* compute start dmap */ + i = (blkno & (MAXL0SIZE - 1)) >> + L2BPERDMAP; + l0leaf = l0dcp->stree + CTLLEAFIND + i; + p = BLKTODMAP(blkno, + sbi->l2nbperpage); + i0 = FALSE; + } else { + /* assign/init L0 page */ + l0mp = get_metapage(ipbmap, p, PSIZE, 0); + if (l0mp == NULL) + goto errout; + + l0dcp = (dmapctl_t *) l0mp->data; + + /* compute start dmap */ + i = 0; + l0leaf = l0dcp->stree + CTLLEAFIND; + p += nbperpage; /* 1st dmap of L0.j */ + } + + /* + * extend each dmap in L0 + */ + for (; i < LPERCTL; i++) { + /* + * reconstruct the dmap page, and + * initialize corresponding parent L0 leaf + */ + if ((n = blkno & (BPERDMAP - 1))) { + /* read in dmap page: */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + n = min(nblocks, (s64)BPERDMAP - n); + } else { + /* assign/init dmap page */ + mp = read_metapage(ipbmap, p, + PSIZE, 0); + if (mp == NULL) + goto errout; + + n = min(nblocks, (s64)BPERDMAP); + } + + dp = (dmap_t *) mp->data; + *l0leaf = dbInitDmap(dp, blkno, n); + + bmp->db_nfree += n; + agno = le64_to_cpu(dp->start) >> l2agsize; + bmp->db_agfree[agno] += n; + + write_metapage(mp); + + l0leaf++; + p += nbperpage; + + blkno += n; + nblocks -= n; + if (nblocks == 0) + break; + } /* for each dmap in a L0 */ + + /* + * build current L0 page from its leaves, and + * initialize corresponding parent L1 leaf + */ + *l1leaf = dbInitDmapCtl(l0dcp, 0, ++i); + write_metapage(l0mp); + + if (nblocks) + l1leaf++; /* continue for next L0 */ + else { + /* more than 1 L0 ? */ + if (j > 0) + break; /* build L1 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l1leaf; + release_metapage(l1mp); + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L0 in a L1 */ + + /* + * build current L1 page from its leaves, and + * initialize corresponding parent L2 leaf + */ + *l2leaf = dbInitDmapCtl(l1dcp, 1, ++j); + write_metapage(l1mp); + + if (nblocks) + l2leaf++; /* continue for next L1 */ + else { + /* more than 1 L1 ? */ + if (k > 0) + break; /* build L2 page */ + else { + /* summarize in global bmap page */ + bmp->db_maxfreebud = *l2leaf; + release_metapage(l2mp); + goto finalize; + } + } + } /* for each L1 in a L2 */ + + assert(0); + + /* + * finalize bmap control page + */ + finalize: + + return 0; + + errout: + return EIO; +} + + +/* + * dbFinalizeBmap() + */ +void dbFinalizeBmap(struct inode *ipbmap) +{ + bmap_t *bmp = JFS_SBI(ipbmap->i_sb)->bmap; + int actags, inactags, l2nl; + s64 ag_rem, actfree, inactfree, avgfree; + int i, n; + + /* + * finalize bmap control page + */ +//finalize: + /* + * compute db_agpref: preferred ag to allocate from + * (the leftmost ag with average free space in it); + */ +//agpref: + /* get the number of active ags and inacitve ags */ + actags = bmp->db_maxag + 1; + inactags = bmp->db_numag - actags; + ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1); /* ??? */ + + /* determine how many blocks are in the inactive allocation + * groups. in doing this, we must account for the fact that + * the rightmost group might be a partial group (i.e. file + * system size is not a multiple of the group size). + */ + inactfree = (inactags && ag_rem) ? + ((inactags - 1) << bmp->db_agl2size) + ag_rem + : inactags << bmp->db_agl2size; + + /* determine how many free blocks are in the active + * allocation groups plus the average number of free blocks + * within the active ags. + */ + actfree = bmp->db_nfree - inactfree; + avgfree = (u32) actfree / (u32) actags; + + /* if the preferred allocation group has not average free space. + * re-establish the preferred group as the leftmost + * group with average free space. + */ + if (bmp->db_agfree[bmp->db_agpref] < avgfree) { + for (bmp->db_agpref = 0; bmp->db_agpref < actags; + bmp->db_agpref++) { + if (bmp->db_agfree[bmp->db_agpref] >= avgfree) + break; + } + assert(bmp->db_agpref < bmp->db_numag); + } + + /* + * compute db_aglevel, db_agheigth, db_width, db_agstart: + * an ag is covered in aglevel dmapctl summary tree, + * at agheight level height (from leaf) with agwidth number of nodes + * each, which starts at agstart index node of the smmary tree node + * array; + */ + bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); + l2nl = + bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); + bmp->db_agheigth = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); + for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; + i--) { + bmp->db_agstart += n; + n <<= 2; + } + +/* +printk("bmap: agpref:%d aglevel:%d agheigth:%d agwidth:%d\n", + bmp->db_agpref, bmp->db_aglevel, bmp->db_agheigth, bmp->db_agwidth); +*/ +} + + +/* + * NAME: dbInitDmap()/ujfs_idmap_page() + * + * FUNCTION: initialize working/persistent bitmap of the dmap page + * for the specified number of blocks: + * + * at entry, the bitmaps had been initialized as free (ZEROS); + * The number of blocks will only account for the actually + * existing blocks. Blocks which don't actually exist in + * the aggregate will be marked as allocated (ONES); + * + * PARAMETERS: + * dp - pointer to page of map + * nblocks - number of blocks this page + * + * RETURNS: NONE + */ +static int dbInitDmap(dmap_t * dp, s64 Blkno, int nblocks) +{ + int blkno, w, b, r, nw, nb, i; +/* +printk("sbh_dmap: in dbInitDmap blkno:%Ld nblocks:%ld\n", Blkno, nblocks); +*/ + + /* starting block number within the dmap */ + blkno = Blkno & (BPERDMAP - 1); + + if (blkno == 0) { + dp->nblocks = dp->nfree = cpu_to_le32(nblocks); + dp->start = cpu_to_le64(Blkno); + + if (nblocks == BPERDMAP) { + memset(&dp->wmap[0], 0, LPERDMAP * 4); + memset(&dp->pmap[0], 0, LPERDMAP * 4); + goto initTree; + } + } else { + dp->nblocks = + cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); + dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); + } + + /* word number containing start block number */ + w = blkno >> L2DBWORD; + + /* + * free the bits corresponding to the block range (ZEROS): + * note: not all bits of the first and last words may be contained + * within the block range. + */ + for (r = nblocks; r > 0; r -= nb, blkno += nb) { + /* number of bits preceding range to be freed in the word */ + b = blkno & (DBWORD - 1); + /* number of bits to free in the word */ + nb = min(r, DBWORD - b); + + /* is partial word to be freed ? */ + if (nb < DBWORD) { + /* free (set to 0) from the bitmap word */ + dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb) + >> b)); + + /* skip the word freed */ + w++; + } else { + /* free (set to 0) contiguous bitmap words */ + nw = r >> L2DBWORD; + memset(&dp->wmap[w], 0, nw * 4); + memset(&dp->pmap[w], 0, nw * 4); + + /* skip the words freed */ + nb = nw << L2DBWORD; + w += nw; + } + } + + /* + * mark bits following the range to be freed (non-existing + * blocks) as allocated (ONES) + */ +/* +printk("sbh_dmap: in dbInitDmap, preparing to mark unbacked, blkno:%ld nblocks:%ld\n", + blkno, nblocks); +*/ + + if (blkno == BPERDMAP) + goto initTree; + + /* the first word beyond the end of existing blocks */ + w = blkno >> L2DBWORD; + + /* does nblocks fall on a 32-bit boundary ? */ + b = blkno & (DBWORD - 1); +/* +printk("sbh_dmap: in dbInitDmap, b:%ld w:%ld mask: %lx\n", b, w, (ONES>>b)); +*/ + if (b) { + /* mark a partial word allocated */ + dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b); + w++; + } + + /* set the rest of the words in the page to allocated (ONES) */ + for (i = w; i < LPERDMAP; i++) + dp->pmap[i] = dp->wmap[i] = ONES; + + /* + * init tree + */ + initTree: + return (dbInitDmapTree(dp)); +} + + +/* + * NAME: dbInitDmapTree()/ujfs_complete_dmap() + * + * FUNCTION: initialize summary tree of the specified dmap: + * + * at entry, bitmap of the dmap has been initialized; + * + * PARAMETERS: + * dp - dmap to complete + * blkno - starting block number for this dmap + * treemax - will be filled in with max free for this dmap + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitDmapTree(dmap_t * dp) +{ + dmaptree_t *tp; + s8 *cp; + int i; + + /* init fixed info of tree */ + tp = &dp->tree; + tp->nleafs = cpu_to_le32(LPERDMAP); + tp->l2nleafs = cpu_to_le32(L2LPERDMAP); + tp->leafidx = cpu_to_le32(LEAFIND); + tp->height = cpu_to_le32(4); + tp->budmin = BUDMIN; + + /* init each leaf from corresponding wmap word: + * note: leaf is set to NOFREE(-1) if all blocks of corresponding + * bitmap word are allocated. + */ + cp = tp->stree + le32_to_cpu(tp->leafidx); + for (i = 0; i < LPERDMAP; i++) + *cp++ = dbMaxBud((u8 *) & dp->wmap[i]); + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree(tp)); +} + + +/* + * NAME: dbInitTree()/ujfs_adjtree() + * + * FUNCTION: initialize binary buddy summary tree of a dmap or dmapctl. + * + * at entry, the leaves of the tree has been initialized + * from corresponding bitmap word or root of summary tree + * of the child control page; + * configure binary buddy system at the leaf level, then + * bubble up the values of the leaf nodes up the tree. + * + * PARAMETERS: + * cp - Pointer to the root of the tree + * l2leaves- Number of leaf nodes as a power of 2 + * l2min - Number of blocks that can be covered by a leaf + * as a power of 2 + * + * RETURNS: max free string at the root of the tree + */ +static int dbInitTree(dmaptree_t * dtp) +{ + int l2max, l2free, bsize, nextb, i; + int child, parent, nparent; + s8 *tp, *cp, *cp1; + + tp = dtp->stree; + + /* Determine the maximum free string possible for the leaves */ + l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin; + + /* + * configure the leaf levevl into binary buddy system + * + * Try to combine buddies starting with a buddy size of 1 + * (i.e. two leaves). At a buddy size of 1 two buddy leaves + * can be combined if both buddies have a maximum free of l2min; + * the combination will result in the left-most buddy leaf having + * a maximum free of l2min+1. + * After processing all buddies for a given size, process buddies + * at the next higher buddy size (i.e. current size * 2) and + * the next maximum free (current free + 1). + * This continues until the maximum possible buddy combination + * yields maximum free. + */ + for (l2free = dtp->budmin, bsize = 1; l2free < l2max; + l2free++, bsize = nextb) { + /* get next buddy size == current buddy pair size */ + nextb = bsize << 1; + + /* scan each adjacent buddy pair at current buddy size */ + for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx); + i < le32_to_cpu(dtp->nleafs); + i += nextb, cp += nextb) { + /* coalesce if both adjacent buddies are max free */ + if (*cp == l2free && *(cp + bsize) == l2free) { + *cp = l2free + 1; /* left take right */ + *(cp + bsize) = -1; /* right give left */ + } + } + } + + /* + * bubble summary information of leaves up the tree. + * + * Starting at the leaf node level, the four nodes described by + * the higher level parent node are compared for a maximum free and + * this maximum becomes the value of the parent node. + * when all lower level nodes are processed in this fashion then + * move up to the next level (parent becomes a lower level node) and + * continue the process for that level. + */ + for (child = le32_to_cpu(dtp->leafidx), + nparent = le32_to_cpu(dtp->nleafs) >> 2; + nparent > 0; nparent >>= 2, child = parent) { + /* get index of 1st node of parent level */ + parent = (child - 1) >> 2; + + /* set the value of the parent node as the maximum + * of the four nodes of the current level. + */ + for (i = 0, cp = tp + child, cp1 = tp + parent; + i < nparent; i++, cp += 4, cp1++) + *cp1 = TREEMAX(cp); + } + + return (*tp); +} + + +/* + * dbInitDmapCtl() + * + * function: initialize dmapctl page + */ +static int dbInitDmapCtl(dmapctl_t * dcp, int level, int i) +{ /* start leaf index not covered by range */ + s8 *cp; + + dcp->nleafs = cpu_to_le32(LPERCTL); + dcp->l2nleafs = cpu_to_le32(L2LPERCTL); + dcp->leafidx = cpu_to_le32(CTLLEAFIND); + dcp->height = cpu_to_le32(5); + dcp->budmin = L2BPERDMAP + L2LPERCTL * level; + + /* + * initialize the leaves of current level that were not covered + * by the specified input block range (i.e. the leaves have no + * low level dmapctl or dmap). + */ + cp = &dcp->stree[CTLLEAFIND + i]; + for (; i < LPERCTL; i++) + *cp++ = NOFREE; + + /* build the dmap's binary buddy summary tree */ + return (dbInitTree((dmaptree_t *) dcp)); +} + + +/* + * NAME: dbGetL2AGSize()/ujfs_getagl2size() + * + * FUNCTION: Determine log2(allocation group size) from aggregate size + * + * PARAMETERS: + * nblocks - Number of blocks in aggregate + * + * RETURNS: log2(allocation group size) in aggregate blocks + */ +static int dbGetL2AGSize(s64 nblocks) +{ + s64 sz; + s64 m; + int l2sz; + + if (nblocks < BPERDMAP * MAXAG) + return (L2BPERDMAP); + + /* round up aggregate size to power of 2 */ + m = ((u64) 1 << (64 - 1)); + for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) { + if (m & nblocks) + break; + } + + sz = (s64) 1 << l2sz; + if (sz < nblocks) + l2sz += 1; + + /* agsize = roundupSize/max_number_of_ag */ + return (l2sz - L2MAXAG); +} + + +/* + * NAME: dbMapFileSizeToMapSize() + * + * FUNCTION: compute number of blocks the block allocation map file + * can cover from the map file size; + * + * RETURNS: Number of blocks which can be covered by this block map file; + */ + +/* + * maximum number of map pages at each level including control pages + */ +#define MAXL0PAGES (1 + LPERCTL) +#define MAXL1PAGES (1 + LPERCTL * MAXL0PAGES) +#define MAXL2PAGES (1 + LPERCTL * MAXL1PAGES) + +/* + * convert number of map pages to the zero origin top dmapctl level + */ +#define BMAPPGTOLEV(npages) \ + (((npages) <= 3 + MAXL0PAGES) ? 0 \ + : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2) + +s64 dbMapFileSizeToMapSize(struct inode * ipbmap) +{ + struct super_block *sb = ipbmap->i_sb; + s64 nblocks; + s64 npages, ndmaps; + int level, i; + int complete, factor; + + nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize; + npages = nblocks >> JFS_SBI(sb)->l2nbperpage; + level = BMAPPGTOLEV(npages); + + /* At each level, accumulate the number of dmap pages covered by + * the number of full child levels below it; + * repeat for the last incomplete child level. + */ + ndmaps = 0; + npages--; /* skip the first global control page */ + /* skip higher level control pages above top level covered by map */ + npages -= (2 - level); + npages--; /* skip top level's control page */ + for (i = level; i >= 0; i--) { + factor = + (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1); + complete = (u32) npages / factor; + ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL + : ((i == 1) ? LPERCTL : 1)); + + /* pages in last/incomplete child */ + npages = (u32) npages % factor; + /* skip incomplete child's level control page */ + npages--; + } + + /* convert the number of dmaps into the number of blocks + * which can be covered by the dmaps; + */ + nblocks = ndmaps << L2BPERDMAP; + + return (nblocks); +} + + +#ifdef _JFS_DEBUG_DMAP +/* + * DBinitmap() + */ +static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results) +{ + int npages; + u32 *dbmap, *d; + int n; + s64 lblkno, cur_block; + dmap_t *dp; + metapage_t *mp; + + npages = size / 32768; + npages += (size % 32768) ? 1 : 0; + + dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap); + if (dbmap == NULL) + assert(0); + + for (n = 0, d = dbmap; n < npages; n++, d += 1024) + bzero(d, 4096); + + /* Need to initialize from disk map pages + */ + for (d = dbmap, cur_block = 0; cur_block < size; + cur_block += BPERDMAP, d += LPERDMAP) { + lblkno = BLKTODMAP(cur_block, + JFS_SBI(ipbmap->i_sb)->bmap-> + db_l2nbperpage); + mp = read_metapage(ipbmap, lblkno, PSIZE, 0); + if (mp == NULL) { + assert(0); + } + dp = (dmap_t *) mp->data; + + for (n = 0; n < LPERDMAP; n++) + d[n] = le32_to_cpu(dp->wmap[n]); + + release_metapage(mp); + } + + *results = dbmap; +} + + +/* + * DBAlloc() + */ +void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) +{ + int word, nb, bitno; + u32 mask; + + assert(blkno > 0 && blkno < mapsize); + assert(nblocks > 0 && nblocks <= mapsize); + + assert(blkno + nblocks <= mapsize); + + dbmap += (blkno / 32); + while (nblocks > 0) { + bitno = blkno & (32 - 1); + nb = min(nblocks, 32 - bitno); + + mask = (0xffffffff << (32 - nb) >> bitno); + assert((mask & *dbmap) == 0); + *dbmap |= mask; + + dbmap++; + blkno += nb; + nblocks -= nb; + } +} + + +/* + * DBFree() + */ +static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) +{ + int word, nb, bitno; + u32 mask; + + assert(blkno > 0 && blkno < mapsize); + assert(nblocks > 0 && nblocks <= mapsize); + + assert(blkno + nblocks <= mapsize); + + dbmap += (blkno / 32); + while (nblocks > 0) { + bitno = blkno & (32 - 1); + nb = min(nblocks, 32 - bitno); + + mask = (0xffffffff << (32 - nb) >> bitno); + assert((mask & *dbmap) == mask); + *dbmap &= ~mask; + + dbmap++; + blkno += nb; + nblocks -= nb; + } +} + + +/* + * DBAllocCK() + */ +static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) +{ + int word, nb, bitno; + u32 mask; + + assert(blkno > 0 && blkno < mapsize); + assert(nblocks > 0 && nblocks <= mapsize); + + assert(blkno + nblocks <= mapsize); + + dbmap += (blkno / 32); + while (nblocks > 0) { + bitno = blkno & (32 - 1); + nb = min(nblocks, 32 - bitno); + + mask = (0xffffffff << (32 - nb) >> bitno); + assert((mask & *dbmap) == mask); + + dbmap++; + blkno += nb; + nblocks -= nb; + } +} + + +/* + * DBFreeCK() + */ +static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks) +{ + int word, nb, bitno; + u32 mask; + + assert(blkno > 0 && blkno < mapsize); + assert(nblocks > 0 && nblocks <= mapsize); + + assert(blkno + nblocks <= mapsize); + + dbmap += (blkno / 32); + while (nblocks > 0) { + bitno = blkno & (32 - 1); + nb = min(nblocks, 32 - bitno); + + mask = (0xffffffff << (32 - nb) >> bitno); + assert((mask & *dbmap) == 0); + + dbmap++; + blkno += nb; + nblocks -= nb; + } +} + + +/* + * dbPrtMap() + */ +static void dbPrtMap(bmap_t * bmp) +{ + printk(" mapsize: %d%d\n", bmp->db_mapsize); + printk(" nfree: %d%d\n", bmp->db_nfree); + printk(" numag: %d\n", bmp->db_numag); + printk(" agsize: %d%d\n", bmp->db_agsize); + printk(" agl2size: %d\n", bmp->db_agl2size); + printk(" agwidth: %d\n", bmp->db_agwidth); + printk(" agstart: %d\n", bmp->db_agstart); + printk(" agheigth: %d\n", bmp->db_agheigth); + printk(" aglevel: %d\n", bmp->db_aglevel); + printk(" maxlevel: %d\n", bmp->db_maxlevel); + printk(" maxag: %d\n", bmp->db_maxag); + printk(" agpref: %d\n", bmp->db_agpref); + printk(" l2nbppg: %d\n", bmp->db_l2nbperpage); +} + + +/* + * dbPrtCtl() + */ +static void dbPrtCtl(dmapctl_t * dcp) +{ + int i, j, n; + + printk(" height: %08x\n", le32_to_cpu(dcp->height)); + printk(" leafidx: %08x\n", le32_to_cpu(dcp->leafidx)); + printk(" budmin: %08x\n", dcp->budmin); + printk(" nleafs: %08x\n", le32_to_cpu(dcp->nleafs)); + printk(" l2nleafs: %08x\n", le32_to_cpu(dcp->l2nleafs)); + + printk("\n Tree:\n"); + for (i = 0; i < CTLLEAFIND; i += 8) { + n = min(8, CTLLEAFIND - i); + + for (j = 0; j < n; j++) + printf(" [%03x]: %02x", i + j, + (char) dcp->stree[i + j]); + printf("\n"); + } + + printk("\n Tree Leaves:\n"); + for (i = 0; i < LPERCTL; i += 8) { + n = min(8, LPERCTL - i); + + for (j = 0; j < n; j++) + printf(" [%03x]: %02x", + i + j, + (char) dcp->stree[i + j + CTLLEAFIND]); + printf("\n"); + } +} +#endif /* _JFS_DEBUG_DMAP */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_dmap.h linuxppc64_2_4/fs/jfs/jfs_dmap.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_dmap.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_dmap.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,301 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * jfs_dmap.h: block allocation map manager + */ + +#ifndef _H_JFS_DMAP +#define _H_JFS_DMAP + +#include "jfs_txnmgr.h" + +#define BMAPVERSION 1 /* version number */ +#define TREESIZE (256+64+16+4+1) /* size of a dmap tree */ +#define LEAFIND (64+16+4+1) /* index of 1st leaf of a dmap tree */ +#define LPERDMAP 256 /* num leaves per dmap tree */ +#define L2LPERDMAP 8 /* l2 number of leaves per dmap tree */ +#define DBWORD 32 /* # of blks covered by a map word */ +#define L2DBWORD 5 /* l2 # of blks covered by a mword */ +#define BUDMIN L2DBWORD /* max free string in a map word */ +#define BPERDMAP (LPERDMAP * DBWORD) /* num of blks per dmap */ +#define L2BPERDMAP 13 /* l2 num of blks per dmap */ +#define CTLTREESIZE (1024+256+64+16+4+1) /* size of a dmapctl tree */ +#define CTLLEAFIND (256+64+16+4+1) /* idx of 1st leaf of a dmapctl tree */ +#define LPERCTL 1024 /* num of leaves per dmapctl tree */ +#define L2LPERCTL 10 /* l2 num of leaves per dmapctl tree */ +#define ROOT 0 /* index of the root of a tree */ +#define NOFREE ((s8) -1) /* no blocks free */ +#define MAXAG 128 /* max number of allocation groups */ +#define L2MAXAG 7 /* l2 max num of AG */ +#define L2MINAGSZ 25 /* l2 of minimum AG size in bytes */ +#define BMAPBLKNO 0 /* lblkno of bmap within the map */ + +/* + * maximum l2 number of disk blocks at the various dmapctl levels. + */ +#define L2MAXL0SIZE (L2BPERDMAP + 1 * L2LPERCTL) +#define L2MAXL1SIZE (L2BPERDMAP + 2 * L2LPERCTL) +#define L2MAXL2SIZE (L2BPERDMAP + 3 * L2LPERCTL) + +/* + * maximum number of disk blocks at the various dmapctl levels. + */ +#define MAXL0SIZE ((s64)1 << L2MAXL0SIZE) +#define MAXL1SIZE ((s64)1 << L2MAXL1SIZE) +#define MAXL2SIZE ((s64)1 << L2MAXL2SIZE) + +#define MAXMAPSIZE MAXL2SIZE /* maximum aggregate map size */ + +/* + * determine the maximum free string for four (lower level) nodes + * of the tree. + */ +static __inline signed char TREEMAX(signed char *cp) +{ + signed char tmp1, tmp2; + + tmp1 = max(*(cp+2), *(cp+3)); + tmp2 = max(*(cp), *(cp+1)); + + return max(tmp1, tmp2); +} + +/* + * convert disk block number to the logical block number of the dmap + * describing the disk block. s is the log2(number of logical blocks per page) + * + * The calculation figures out how many logical pages are in front of the dmap. + * - the number of dmaps preceding it + * - the number of L0 pages preceding its L0 page + * - the number of L1 pages preceding its L1 page + * - 3 is added to account for the L2, L1, and L0 page for this dmap + * - 1 is added to account for the control page of the map. + */ +#define BLKTODMAP(b,s) \ + ((((b) >> 13) + ((b) >> 23) + ((b) >> 33) + 3 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 0 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L0. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding its L1 page + * - 2 is added to account for the L2, and L1 page for this L0 + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL0(b,s) \ + (((((b) >> 23) << 10) + ((b) >> 23) + ((b) >> 33) + 2 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the LEVEL 1 + * dmapctl describing the disk block. s is the log2(number of logical blocks + * per page) + * + * The calculation figures out how many logical pages are in front of the L1. + * - the number of dmap pages preceding it + * - the number of L0 pages preceding it + * - the number of L1 pages preceding it + * - 1 is added to account for the L2 page + * - 1 is added to account for the control page of the map. + */ +#define BLKTOL1(b,s) \ + (((((b) >> 33) << 20) + (((b) >> 33) << 10) + ((b) >> 33) + 1 + 1) << (s)) + +/* + * convert disk block number to the logical block number of the dmapctl + * at the specified level which describes the disk block. + */ +#define BLKTOCTL(b,s,l) \ + (((l) == 2) ? 1 : ((l) == 1) ? BLKTOL1((b),(s)) : BLKTOL0((b),(s))) + +/* + * convert aggregate map size to the zero origin dmapctl level of the + * top dmapctl. + */ +#define BMAPSZTOLEV(size) \ + (((size) <= MAXL0SIZE) ? 0 : ((size) <= MAXL1SIZE) ? 1 : 2) + +/* convert disk block number to allocation group number. + */ +#define BLKTOAG(b,sbi) ((b) >> ((sbi)->bmap->db_agl2size)) + +/* convert allocation group number to starting disk block + * number. + */ +#define AGTOBLK(a,ip) \ + ((s64)(a) << (JFS_SBI((ip)->i_sb)->bmap->db_agl2size)) + +/* + * dmap summary tree + * + * dmaptree_t must be consistent with dmapctl_t. + */ +typedef struct { + s32 nleafs; /* 4: number of tree leafs */ + s32 l2nleafs; /* 4: l2 number of tree leafs */ + s32 leafidx; /* 4: index of first tree leaf */ + s32 height; /* 4: height of the tree */ + s8 budmin; /* 1: min l2 tree leaf value to combine */ + s8 stree[TREESIZE]; /* TREESIZE: tree */ + u8 pad[2]; /* 2: pad to word boundary */ +} dmaptree_t; /* - 360 - */ + +/* + * dmap page per 8K blocks bitmap + */ +typedef struct { + s32 nblocks; /* 4: num blks covered by this dmap */ + s32 nfree; /* 4: num of free blks in this dmap */ + s64 start; /* 8: starting blkno for this dmap */ + dmaptree_t tree; /* 360: dmap tree */ + u8 pad[1672]; /* 1672: pad to 2048 bytes */ + u32 wmap[LPERDMAP]; /* 1024: bits of the working map */ + u32 pmap[LPERDMAP]; /* 1024: bits of the persistent map */ +} dmap_t; /* - 4096 - */ + +/* + * disk map control page per level. + * + * dmapctl_t must be consistent with dmaptree_t. + */ +typedef struct { + s32 nleafs; /* 4: number of tree leafs */ + s32 l2nleafs; /* 4: l2 number of tree leafs */ + s32 leafidx; /* 4: index of the first tree leaf */ + s32 height; /* 4: height of tree */ + s8 budmin; /* 1: minimum l2 tree leaf value */ + s8 stree[CTLTREESIZE]; /* CTLTREESIZE: dmapctl tree */ + u8 pad[2714]; /* 2714: pad to 4096 */ +} dmapctl_t; /* - 4096 - */ + +/* + * common definition for dmaptree_t within dmap and dmapctl + */ +typedef union { + dmaptree_t t1; + dmapctl_t t2; +} dmtree_t; + +/* macros for accessing fields within dmtree_t */ +#define dmt_nleafs t1.nleafs +#define dmt_l2nleafs t1.l2nleafs +#define dmt_leafidx t1.leafidx +#define dmt_height t1.height +#define dmt_budmin t1.budmin +#define dmt_stree t1.stree + +/* + * on-disk aggregate disk allocation map descriptor. + */ +typedef struct { + s64 dn_mapsize; /* 8: number of blocks in aggregate */ + s64 dn_nfree; /* 8: num free blks in aggregate map */ + s32 dn_l2nbperpage; /* 4: number of blks per page */ + s32 dn_numag; /* 4: total number of ags */ + s32 dn_maxlevel; /* 4: number of active ags */ + s32 dn_maxag; /* 4: max active alloc group number */ + s32 dn_agpref; /* 4: preferred alloc group (hint) */ + s32 dn_aglevel; /* 4: dmapctl level holding the AG */ + s32 dn_agheigth; /* 4: height in dmapctl of the AG */ + s32 dn_agwidth; /* 4: width in dmapctl of the AG */ + s32 dn_agstart; /* 4: start tree index at AG height */ + s32 dn_agl2size; /* 4: l2 num of blks per alloc group */ + s64 dn_agfree[MAXAG]; /* 8*MAXAG: per AG free count */ + s64 dn_agsize; /* 8: num of blks per alloc group */ + s8 dn_maxfreebud; /* 1: max free buddy system */ + u8 pad[3007]; /* 3007: pad to 4096 */ +} dbmap_t; /* - 4096 - */ + +/* + * in-memory aggregate disk allocation map descriptor. + */ +typedef struct bmap { + dbmap_t db_bmap; /* on-disk aggregate map descriptor */ + struct inode *db_ipbmap; /* ptr to aggregate map incore inode */ + struct semaphore db_bmaplock; /* aggregate map lock */ + u32 *db_DBmap; +} bmap_t; + +/* macros for accessing fields within in-memory aggregate map descriptor */ +#define db_mapsize db_bmap.dn_mapsize +#define db_nfree db_bmap.dn_nfree +#define db_agfree db_bmap.dn_agfree +#define db_agsize db_bmap.dn_agsize +#define db_agl2size db_bmap.dn_agl2size +#define db_agwidth db_bmap.dn_agwidth +#define db_agheigth db_bmap.dn_agheigth +#define db_agstart db_bmap.dn_agstart +#define db_numag db_bmap.dn_numag +#define db_maxlevel db_bmap.dn_maxlevel +#define db_aglevel db_bmap.dn_aglevel +#define db_agpref db_bmap.dn_agpref +#define db_maxag db_bmap.dn_maxag +#define db_maxfreebud db_bmap.dn_maxfreebud +#define db_l2nbperpage db_bmap.dn_l2nbperpage + +/* + * macros for various conversions needed by the allocators. + * blkstol2(), cntlz(), and cnttz() are operating system dependent functions. + */ +/* convert number of blocks to log2 number of blocks, rounding up to + * the next log2 value if blocks is not a l2 multiple. + */ +#define BLKSTOL2(d) (blkstol2(d)) + +/* convert number of leafs to log2 leaf value */ +#define NLSTOL2BSZ(n) (31 - cntlz((n)) + BUDMIN) + +/* convert leaf index to log2 leaf value */ +#define LITOL2BSZ(n,m,b) ((((n) == 0) ? (m) : cnttz((n))) + (b)) + +/* convert a block number to a dmap control leaf index */ +#define BLKTOCTLLEAF(b,m) \ + (((b) & (((s64)1 << ((m) + L2LPERCTL)) - 1)) >> (m)) + +/* convert log2 leaf value to buddy size */ +#define BUDSIZE(s,m) (1 << ((s) - (m))) + +/* + * external references. + */ +extern int dbMount(struct inode *ipbmap); + +extern int dbUnmount(struct inode *ipbmap, int mounterror); + +extern int dbFree(struct inode *ipbmap, s64 blkno, s64 nblocks); + +extern int dbUpdatePMap(struct inode *ipbmap, + int free, s64 blkno, s64 nblocks, tblock_t * tblk); + +extern int dbNextAG(struct inode *ipbmap); + +extern int dbAlloc(struct inode *ipbmap, s64 hint, s64 nblocks, s64 * results); + +extern int dbAllocExact(struct inode *ip, s64 blkno, int nblocks); + +extern int dbReAlloc(struct inode *ipbmap, + s64 blkno, s64 nblocks, s64 addnblocks, s64 * results); + +extern int dbSync(struct inode *ipbmap); +extern int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks); +extern int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks); +extern void dbFinalizeBmap(struct inode *ipbmap); +extern s64 dbMapFileSizeToMapSize(struct inode *ipbmap); +#endif /* _H_JFS_DMAP */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_dtree.c linuxppc64_2_4/fs/jfs/jfs_dtree.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_dtree.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_dtree.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,4525 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * +*/ + +/* + * jfs_dtree.c: directory B+-tree manager + * + * B+-tree with variable length key directory: + * + * each directory page is structured as an array of 32-byte + * directory entry slots initialized as a freelist + * to avoid search/compaction of free space at insertion. + * when an entry is inserted, a number of slots are allocated + * from the freelist as required to store variable length data + * of the entry; when the entry is deleted, slots of the entry + * are returned to freelist. + * + * leaf entry stores full name as key and file serial number + * (aka inode number) as data. + * internal/router entry stores sufffix compressed name + * as key and simple extent descriptor as data. + * + * each directory page maintains a sorted entry index table + * which stores the start slot index of sorted entries + * to allow binary search on the table. + * + * directory starts as a root/leaf page in on-disk inode + * inline data area. + * when it becomes full, it starts a leaf of a external extent + * of length of 1 block. each time the first leaf becomes full, + * it is extended rather than split (its size is doubled), + * until its length becoms 4 KBytes, from then the extent is split + * with new 4 Kbyte extent when it becomes full + * to reduce external fragmentation of small directories. + * + * blah, blah, blah, for linear scan of directory in pieces by + * readdir(). + * + * + * case-insensitive directory file system + * + * names are stored in case-sensitive way in leaf entry. + * but stored, searched and compared in case-insensitive (uppercase) order + * (i.e., both search key and entry key are folded for search/compare): + * (note that case-sensitive order is BROKEN in storage, e.g., + * sensitive: Ad, aB, aC, aD -> insensitive: aB, aC, aD, Ad + * + * entries which folds to the same key makes up a equivalent class + * whose members are stored as contiguous cluster (may cross page boundary) + * but whose order is arbitrary and acts as duplicate, e.g., + * abc, Abc, aBc, abC) + * + * once match is found at leaf, requires scan forward/backward + * either for, in case-insensitive search, duplicate + * or for, in case-sensitive search, for exact match + * + * router entry must be created/stored in case-insensitive way + * in internal entry: + * (right most key of left page and left most key of right page + * are folded, and its suffix compression is propagated as router + * key in parent) + * (e.g., if split occurs and , trather than + * should be made the router key for the split) + * + * case-insensitive search: + * + * fold search key; + * + * case-insensitive search of B-tree: + * for internal entry, router key is already folded; + * for leaf entry, fold the entry key before comparison. + * + * if (leaf entry case-insensitive match found) + * if (next entry satisfies case-insensitive match) + * return EDUPLICATE; + * if (prev entry satisfies case-insensitive match) + * return EDUPLICATE; + * return match; + * else + * return no match; + * + * serialization: + * target directory inode lock is being held on entry/exit + * of all main directory service routines. + * + * log based recovery: + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_superblock.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* dtree split parameter */ +typedef struct { + metapage_t *mp; + s16 index; + s16 nslot; + component_t *key; + ddata_t *data; + pxdlist_t *pxdlist; +} dtsplit_t; + +#define DT_PAGE(IP, MP) BT_PAGE(IP, MP, dtpage_t, i_dtroot) + +/* get page buffer for specified block address */ +#define DT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, dtpage_t, SIZE, P, RC, i_dtroot)\ + if (!(RC))\ + {\ + if (((P)->header.nextindex > (((BN)==0)?DTROOTMAXSLOT:(P)->header.maxslot)) ||\ + ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT)))\ + {\ + jERROR(1,("DT_GETPAGE: dtree page corrupt\n"));\ + BT_PUTPAGE(MP);\ + updateSuper((IP)->i_sb, FM_DIRTY);\ + MP = NULL;\ + RC = EIO;\ + }\ + }\ +} + +/* for consistency */ +#define DT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define DT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, dtpage_t, P, INDEX, i_dtroot) + +/* + * forward references + */ +static int dtSplitUp(tid_t tid, struct inode *ip, + dtsplit_t * split, btstack_t * btstack); + +static int dtSplitPage(tid_t tid, struct inode *ip, dtsplit_t * split, + metapage_t ** rmpp, dtpage_t ** rpp, pxd_t * rxdp); + +static int dtExtendPage(tid_t tid, struct inode *ip, + dtsplit_t * split, btstack_t * btstack); + +static int dtSplitRoot(tid_t tid, struct inode *ip, + dtsplit_t * split, metapage_t ** rmpp); + +static int dtDeleteUp(tid_t tid, struct inode *ip, metapage_t * fmp, + dtpage_t * fp, btstack_t * btstack); + +static int dtSearchNode(struct inode *ip, + s64 lmxaddr, pxd_t * kpxd, btstack_t * btstack); + +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p); + +static int dtReadFirst(struct inode *ip, btstack_t * btstack); + +static int dtReadNext(struct inode *ip, + loff_t * offset, btstack_t * btstack); + +static int dtCompare(component_t * key, dtpage_t * p, int si); + +static int ciCompare(component_t * key, dtpage_t * p, int si, int flag); + +static void dtGetKey(dtpage_t * p, int i, component_t * key, int flag); + +static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, component_t * key, int flag); + +static void dtInsertEntry(dtpage_t * p, int index, component_t * key, + ddata_t * data, dtlock_t ** dtlock); + +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + dtlock_t ** sdtlock, dtlock_t ** ddtlock, + int do_index); + +static void dtDeleteEntry(dtpage_t * p, int fi, dtlock_t ** dtlock); + +static void dtTruncateEntry(dtpage_t * p, int ti, dtlock_t ** dtlock); + +static void dtLinelockFreelist(dtpage_t * p, int m, dtlock_t ** dtlock); + +#define ciToUpper(c) UniStrupr((c)->name) + +/* + * find_index() + * + * Returns dtree page containing directory table entry for specified + * index and pointer to its entry. + * + * mp must be released by caller. + */ +static dir_table_slot_t *find_index(struct inode *ip, u32 index, + metapage_t ** mp) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + s64 blkno; + s64 offset; + int page_offset; + dir_table_slot_t *slot; + static int maxWarnings = 10; + + if (index < 2) { + if (maxWarnings) { + jERROR(1, ("find_entry called with index = %d\n", + index)); + maxWarnings--; + } + return 0; + } + + if (index >= jfs_ip->next_index) { + jFYI(1, ("find_entry called with index >= next_index\n")); + return 0; + } + + if (jfs_ip->next_index <= (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + /* + * Inline directory table + */ + *mp = 0; + slot = &jfs_ip->i_dirtable[index - 2]; + } else { + offset = (index - 2) * sizeof(dir_table_slot_t); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << + JFS_SBI(ip->i_sb)->l2nbperpage; + + if (*mp && ((*mp)->index != blkno)) { + release_metapage(*mp); + *mp = 0; + } + if (*mp == 0) + *mp = read_metapage(ip, blkno, PSIZE, 0); + if (*mp == 0) { + jERROR(1, + ("free_index: error reading directory table\n")); + return 0; + } + + slot = + (dir_table_slot_t *) ((char *) (*mp)->data + + page_offset); + } + return slot; +} + +static inline void lock_index(tid_t tid, struct inode *ip, metapage_t * mp, + u32 index) +{ + tlock_t *tlck; + linelock_t *llck; + lv_t *lv; + + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (linelock_t *) tlck->lock; + + if (llck->index >= llck->maxcnt) + llck = txLinelock(llck); + lv = &llck->lv[llck->index]; + + /* + * Linelock slot size is twice the size of directory table + * slot size. 512 entries per page. + */ + lv->offset = ((index - 2) & 511) >> 1; + lv->length = 1; + llck->index++; +} + +/* + * add_index() + * + * Adds an entry to the directory index table. This is used to provide + * each directory entry with a persistent index in which to resume + * directory traversals + */ +static u32 add_index(tid_t tid, struct inode *ip, s64 bn, int slot) +{ + struct super_block *sb = ip->i_sb; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + u64 blkno; + dir_table_slot_t *dirtab_slot; + u32 index; + linelock_t *llck; + lv_t *lv; + metapage_t *mp; + s64 offset; + uint page_offset; + int rc; + tlock_t *tlck; + s64 xaddr; + + ASSERT(DO_INDEX(ip)); + + if (jfs_ip->next_index < 2) { + jERROR(1, ("next_index = %d. Please fix this!\n", + jfs_ip->next_index)); + jfs_ip->next_index = 2; + } + + index = jfs_ip->next_index++; + + if (index <= MAX_INLINE_DIRTABLE_ENTRY) { + /* + * i_size reflects size of index table, or 8 bytes per entry. + */ + ip->i_size = (loff_t) (index - 1) << 3; + + /* + * dir table fits inline within inode + */ + dirtab_slot = &jfs_ip->i_dirtable[index-2]; + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + set_cflag(COMMIT_Dirtable, ip); + + return index; + } + if (index == (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + /* + * It's time to move the inline table to an external + * page and begin to build the xtree + */ + + /* + * Save the table, we're going to overwrite it with the + * xtree root + */ + dir_table_slot_t temp_table[12]; + memcpy(temp_table, &jfs_ip->i_dirtable, sizeof(temp_table)); + + /* + * Initialize empty x-tree + */ + xtInitRoot(tid, ip); + + /* + * Allocate the first block & add it to the xtree + */ + xaddr = 0; + if ((rc = + xtInsert(tid, ip, 0, 0, sbi->nbperpage, + &xaddr, 0))) { + jFYI(1, ("add_index: xtInsert failed!\n")); + return -1; + } + ip->i_size = PSIZE; + ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage); + + if ((mp = get_metapage(ip, 0, ip->i_blksize, 0)) == 0) { + jERROR(1, ("add_index: get_metapage failed!\n")); + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + return -1; + } + tlck = txLock(tid, ip, mp, tlckDATA); + llck = (linelock_t *) & tlck->lock; + ASSERT(llck->index == 0); + lv = &llck->lv[0]; + + lv->offset = 0; + lv->length = 6; /* tlckDATA slot size is 16 bytes */ + llck->index++; + + memcpy(mp->data, temp_table, sizeof(temp_table)); + + mark_metapage_dirty(mp); + release_metapage(mp); + + /* + * Logging is now directed by xtree tlocks + */ + clear_cflag(COMMIT_Dirtable, ip); + } + + offset = (index - 2) * sizeof(dir_table_slot_t); + page_offset = offset & (PSIZE - 1); + blkno = ((offset + 1) >> L2PSIZE) << sbi->l2nbperpage; + if (page_offset == 0) { + /* + * This will be the beginning of a new page + */ + xaddr = 0; + if ((rc = + xtInsert(tid, ip, 0, blkno, sbi->nbperpage, + &xaddr, 0))) { + jFYI(1, ("add_index: xtInsert failed!\n")); + jfs_ip->next_index--; + return -1; + } + ip->i_size += PSIZE; + ip->i_blocks += LBLK2PBLK(sb, sbi->nbperpage); + + if ((mp = get_metapage(ip, blkno, PSIZE, 0))) + memset(mp->data, 0, PSIZE); /* Just looks better */ + else + xtTruncate(tid, ip, offset, COMMIT_PWMAP); + } else + mp = read_metapage(ip, blkno, PSIZE, 0); + + if (mp == 0) { + jERROR(1, ("add_index: get/read_metapage failed!\n")); + return -1; + } + + lock_index(tid, ip, mp, index); + + dirtab_slot = + (dir_table_slot_t *) ((char *) mp->data + page_offset); + dirtab_slot->flag = DIR_INDEX_VALID; + dirtab_slot->slot = slot; + DTSaddress(dirtab_slot, bn); + + mark_metapage_dirty(mp); + release_metapage(mp); + + return index; +} + +/* + * free_index() + * + * Marks an entry to the directory index table as free. + */ +static void free_index(tid_t tid, struct inode *ip, u32 index, u32 next) +{ + dir_table_slot_t *dirtab_slot; + metapage_t *mp = 0; + + dirtab_slot = find_index(ip, index, &mp); + + if (dirtab_slot == 0) + return; + + dirtab_slot->flag = DIR_INDEX_FREE; + dirtab_slot->slot = dirtab_slot->addr1 = 0; + dirtab_slot->addr2 = cpu_to_le32(next); + + if (mp) { + lock_index(tid, ip, mp, index); + mark_metapage_dirty(mp); + release_metapage(mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * modify_index() + * + * Changes an entry in the directory index table + */ +static void modify_index(tid_t tid, struct inode *ip, u32 index, s64 bn, + int slot, metapage_t ** mp) +{ + dir_table_slot_t *dirtab_slot; + + dirtab_slot = find_index(ip, index, mp); + + if (dirtab_slot == 0) + return; + + DTSaddress(dirtab_slot, bn); + dirtab_slot->slot = slot; + + if (*mp) { + lock_index(tid, ip, *mp, index); + mark_metapage_dirty(*mp); + } else + set_cflag(COMMIT_Dirtable, ip); +} + +/* + * get_index() + * + * reads a directory table slot + */ +static int get_index(struct inode *ip, u32 index, + dir_table_slot_t * dirtab_slot) +{ + metapage_t *mp = 0; + dir_table_slot_t *slot; + + slot = find_index(ip, index, &mp); + if (slot == 0) { + return -EIO; + } + + memcpy(dirtab_slot, slot, sizeof(dir_table_slot_t)); + + if (mp) + release_metapage(mp); + + return 0; +} + +/* + * dtSearch() + * + * function: + * Search for the entry with specified key + * + * parameter: + * + * return: 0 - search result on stack, leaf page pinned; + * errno - I/O error + */ +int dtSearch(struct inode *ip, + component_t * key, ino_t * data, btstack_t * btstack, int flag) +{ + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; + metapage_t *mp; + dtpage_t *p; + s8 *stbl; + int base, index, lim; + btframe_t *btsp; + pxd_t *pxd; + int psize = 288; /* initial in-line directory */ + ino_t inumber; + component_t ciKey; + struct super_block *sb = ip->i_sb; + + ciKey.name = + (wchar_t *) kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), + GFP_NOFS); + if (ciKey.name == 0) { + rc = ENOMEM; + goto dtSearch_Exit2; + } + + + /* uppercase search key for c-i directory */ + UniStrcpy(ciKey.name, key->name); + ciKey.namlen = key->namlen; + + /* only uppercase if case-insensitive support is on */ + if ((JFS_SBI(sb)->mntflag & JFS_OS2) == JFS_OS2) { + ciToUpper(&ciKey); + } + BT_CLR(btstack); /* reset stack */ + + /* init level count for max pages to split */ + btstack->nsplit = 1; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + goto dtSearch_Exit1; + + /* get sorted entry table of the page */ + stbl = DT_GETSTBL(p); + + /* + * binary search with search key K on the current page. + */ + for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) { + index = base + (lim >> 1); + + if (p->header.flag & BT_LEAF) { + /* uppercase leaf name to compare */ + cmp = + ciCompare(&ciKey, p, stbl[index], + JFS_SBI(sb)->mntflag); + } else { + /* router key is in uppercase */ + + cmp = dtCompare(&ciKey, p, stbl[index]); + + + } + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + inumber = le32_to_cpu( + ((ldtentry_t *) & p->slot[stbl[index]])->inumber); + + /* + * search for JFS_LOOKUP + */ + if (flag == JFS_LOOKUP) { + *data = inumber; + rc = 0; + goto out; + } + + /* + * search for JFS_CREATE + */ + if (flag == JFS_CREATE) { + *data = inumber; + rc = EEXIST; + goto out; + } + + /* + * search for JFS_REMOVE or JFS_RENAME + */ + if ((flag == JFS_REMOVE || + flag == JFS_RENAME) && + *data != inumber) { + rc = ESTALE; + goto out; + } + + /* + * JFS_REMOVE|JFS_FINDDIR|JFS_RENAME + */ + /* save search result */ + *data = inumber; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto getChild; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or (maxindex + 1) index. + */ + /* + * search miss - leaf page + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + /* + * search for JFS_LOOKUP, JFS_REMOVE, or JFS_RENAME + */ + if (flag == JFS_LOOKUP || flag == JFS_REMOVE || + flag == JFS_RENAME) { + rc = ENOENT; + goto out; + } + + /* + * search for JFS_CREATE|JFS_FINDDIR: + * + * save search result + */ + *data = 0; + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + rc = 0; + goto dtSearch_Exit1; + } + + /* + * search miss - internal page + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + getChild: + /* update max. number of pages to split */ + if (btstack->nsplit >= 8) { + /* Something's corrupted, mark filesytem dirty so + * chkdsk will fix it. + */ + jERROR(1, ("stack overrun in dtSearch!\n")); + updateSuper(sb, FM_DIRTY); + rc = EIO; + goto out; + } + btstack->nsplit++; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + pxd = (pxd_t *) & p->slot[stbl[index]]; + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + out: + DT_PUTPAGE(mp); + + dtSearch_Exit1: + + kfree(ciKey.name); + + dtSearch_Exit2: + + return rc; +} + + +/* + * dtInsert() + * + * function: insert an entry to directory tree + * + * parameter: + * + * return: 0 - success; + * errno - failure; + */ +int dtInsert(tid_t tid, struct inode *ip, + component_t * name, ino_t * fsn, btstack_t * btstack) +{ + int rc = 0; + metapage_t *mp; /* meta-page buffer */ + dtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index; + dtsplit_t split; /* split information */ + ddata_t data; + dtlock_t *dtlck; + int n; + tlock_t *tlck; + lv_t *lv; + + /* + * retrieve search result + * + * dtSearch() returns (leaf page pinned, index at which to insert). + * n.b. dtSearch() may return index of (maxindex + 1) of + * the full page. + */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* + * insert entry for new key + */ + if (DO_INDEX(ip)) { + if (JFS_IP(ip)->next_index == DIREND) { + DT_PUTPAGE(mp); + return EMLINK; + } + n = NDTLEAF(name->namlen); + data.leaf.tid = tid; + data.leaf.ip = ip; + } else { + n = NDTLEAF_LEGACY(name->namlen); + data.leaf.ip = 0; /* signifies legacy directory format */ + } + data.leaf.ino = cpu_to_le32(*fsn); + + /* + * leaf page does not have enough room for new entry: + * + * extend/split the leaf page; + * + * dtSplitUp() will insert the entry and unpin the leaf page. + */ + if (n > p->header.freecnt) { + split.mp = mp; + split.index = index; + split.nslot = n; + split.key = name; + split.data = &data; + rc = dtSplitUp(tid, ip, &split, btstack); + return rc; + } + + /* + * leaf page does have enough room for new entry: + * + * insert the new data entry into the leaf page; + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + dtInsertEntry(p, index, name, &data, &dtlck); + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + n = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + n; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + + +/* + * dtSplitUp() + * + * function: propagate insertion bottom up; + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * leaf page unpinned; + */ +static int dtSplitUp(tid_t tid, + struct inode *ip, dtsplit_t * split, btstack_t * btstack) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int rc = 0; + metapage_t *smp; + dtpage_t *sp; /* split page */ + metapage_t *rmp; + dtpage_t *rp; /* new right page split from sp */ + pxd_t rpxd; /* new right page extent descriptor */ + metapage_t *lmp; + dtpage_t *lp; /* left child page */ + int skip; /* index of entry of insertion */ + btframe_t *parent; /* parent page entry on traverse stack */ + s64 xaddr, nxaddr; + int xlen, xsize; + pxdlist_t pxdlist; + pxd_t *pxd; + component_t key = { 0, 0 }; + ddata_t *data = split->data; + int n; + dtlock_t *dtlck; + tlock_t *tlck; + lv_t *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + key.name = + (wchar_t *) kmalloc((JFS_NAME_MAX + 2) * sizeof(wchar_t), + GFP_NOFS); + if (key.name == 0) { + DT_PUTPAGE(smp); + rc = ENOMEM; + goto dtSplitUp_Exit; + } + + /* + * split leaf page + * + * The split routines insert the new entry, and + * acquire txLock as appropriate. + */ + /* + * split root leaf page: + */ + if (sp->header.flag & BT_ROOT) { + /* + * allocate a single extent child page + */ + xlen = 1; + n = sbi->bsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + n -= DTROOTMAXSLOT - sp->header.freecnt; /* header + entries */ + if (n <= split->nslot) + xlen++; + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr))) + goto freeKeyName; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + split->pxdlist = &pxdlist; + rc = dtSplitRoot(tid, ip, split, &rmp); + + DT_PUTPAGE(rmp); + DT_PUTPAGE(smp); + + goto freeKeyName; + } + + /* + * extend first leaf page + * + * extend the 1st extent if less than buffer page size + * (dtExtendPage() reurns leaf page unpinned) + */ + pxd = &sp->header.self; + xlen = lengthPXD(pxd); + xsize = xlen << sbi->l2bsize; + if (xsize < PSIZE) { + xaddr = addressPXD(pxd); + n = xsize >> L2DTSLOTSIZE; + n -= (n + 31) >> L2DTSLOTSIZE; /* stbl size */ + if ((n + sp->header.freecnt) <= split->nslot) + n = xlen + (xlen << 1); + else + n = xlen; + if ((rc = dbReAlloc(sbi->ipbmap, xaddr, (s64) xlen, + (s64) n, &nxaddr))) + goto extendOut; + + pxdlist.maxnpxd = 1; + pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + PXDaddress(pxd, nxaddr) + PXDlength(pxd, xlen + n); + split->pxdlist = &pxdlist; + if ((rc = dtExtendPage(tid, ip, split, btstack))) { + nxaddr = addressPXD(pxd); + if (xaddr != nxaddr) { + /* free relocated extent */ + xlen = lengthPXD(pxd); + dbFree(ip, nxaddr, (s64) xlen); + } else { + /* free extended delta */ + xlen = lengthPXD(pxd) - n; + xaddr = addressPXD(pxd) + xlen; + dbFree(ip, xaddr, (s64) n); + } + } + + extendOut: + DT_PUTPAGE(smp); + goto freeKeyName; + } + + /* + * split leaf page into and a new right page . + * + * return pinned and its extent descriptor + */ + /* + * allocate new directory page extent and + * new index page(s) to cover page split(s) + * + * allocation hint: ? + */ + n = btstack->nsplit; + pxdlist.maxnpxd = pxdlist.npxd = 0; + xlen = sbi->nbperpage; + for (pxd = pxdlist.pxd; n > 0; n--, pxd++) { + if ((rc = dbAlloc(ip, 0, (s64) xlen, &xaddr)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + pxdlist.maxnpxd++; + continue; + } + + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + split->pxdlist = &pxdlist; + if ((rc = dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd))) { + DT_PUTPAGE(smp); + + /* undo allocation */ + goto splitOut; + } + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 4 pages pinned at any time: + * two children, left parent and right parent (when the parent splits). + * keep the child pages pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages (, ) pinned */ + lmp = smp; + lp = sp; + + /* + * insert router entry in parent for new right child page + */ + /* get the parent page */ + DT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) { + DT_PUTPAGE(lmp); + DT_PUTPAGE(rmp); + goto splitOut; + } + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * compute the key for the router entry + * + * key suffix compression: + * for internal pages that have leaf pages as children, + * retain only what's needed to distinguish between + * the new entry and the entry on the page to its left. + * If the keys compare equal, retain the entire key. + * + * note that compression is performed only at computing + * router key at the lowest internal level. + * further compression of the key between pairs of higher + * level internal pages loses too much information and + * the search may fail. + * (e.g., two adjacent leaf pages of {a, ..., x} {xx, ...,} + * results in two adjacent parent entries (a)(xx). + * if split occurs between these two entries, and + * if compression is applied, the router key of parent entry + * of right page (x) will divert search for x into right + * subtree and miss x in the left subtree.) + * + * the entire key must be retained for the next-to-leftmost + * internal key at any level of the tree, or search may fail + * (e.g., ?) + */ + switch (rp->header.flag & BT_TYPE) { + case BT_LEAF: + /* + * compute the length of prefix for suffix compression + * between last entry of left page and first entry + * of right page + */ + if ((sp->header.flag & BT_ROOT && skip > 1) || + sp->header.prev != 0 || skip > 1) { + /* compute uppercase router prefix key */ + ciGetLeafPrefixKey(lp, + lp->header.nextindex - 1, + rp, 0, &key, sbi->mntflag); + } else { + /* next to leftmost entry of + lowest internal level */ + + /* compute uppercase router key */ + dtGetKey(rp, 0, &key, sbi->mntflag); + key.name[key.namlen] = 0; + + if ((sbi->mntflag & JFS_OS2) == JFS_OS2) + ciToUpper(&key); + } + + n = NDTINTERNAL(key.namlen); + break; + + case BT_INTERNAL: + dtGetKey(rp, 0, &key, sbi->mntflag); + n = NDTINTERNAL(key.namlen); + break; + + default: + jERROR(2, ("dtSplitUp(): UFO!\n")); + break; + } + + /* unpin left child page */ + DT_PUTPAGE(lmp); + + /* + * compute the data for the router entry + */ + data->xd = rpxd; /* child page xd */ + + /* + * parent page is full - split the parent page + */ + if (n > sp->header.freecnt) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->nslot = n; + split->key = &key; + /* split->data = data; */ + + /* unpin right child page */ + DT_PUTPAGE(rmp); + + /* The split routines insert the new entry, + * acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + dtSplitRoot(tid, ip, split, &rmp) : + dtSplitPage(tid, ip, split, &rmp, &rp, &rpxd); + if (rc) { + DT_PUTPAGE(smp); + goto splitOut; + } + + /* smp and rmp are pinned */ + } + /* + * parent page is not full - insert router entry in parent page + */ + else { + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the parent page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root parent page */ + if (!(sp->header.flag & BT_ROOT)) { + lv++; + n = skip >> L2DTSLOTSIZE; + lv->offset = sp->header.stblindex + n; + lv->length = + ((sp->header.nextindex - + 1) >> L2DTSLOTSIZE) - n + 1; + dtlck->index++; + } + + dtInsertEntry(sp, skip, &key, data, &dtlck); + + /* exit propagate up */ + break; + } + } + + /* unpin current split and its right page */ + DT_PUTPAGE(smp); + DT_PUTPAGE(rmp); + + /* + * free remaining extents allocated for split + */ + splitOut: + n = pxdlist.npxd; + pxd = &pxdlist.pxd[n]; + for (; n < pxdlist.maxnpxd; n++, pxd++) + dbFree(ip, addressPXD(pxd), (s64) lengthPXD(pxd)); + + freeKeyName: + kfree(key.name); + + dtSplitUp_Exit: + + return rc; +} + + +/* + * dtSplitPage() + * + * function: Split a non-root page of a btree. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return split and new page pinned; + */ +static int dtSplitPage(tid_t tid, struct inode *ip, dtsplit_t * split, + metapage_t ** rmpp, dtpage_t ** rpp, pxd_t * rpxdp) +{ + struct super_block *sb = ip->i_sb; + int rc = 0; + metapage_t *smp; + dtpage_t *sp; + metapage_t *rmp; + dtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + metapage_t *mp; + dtpage_t *p; + s64 nextbn; + pxdlist_t *pxdlist; + pxd_t *pxd; + int skip, nextindex, half, left, nxt, off, si; + ldtentry_t *ldtentry; + idtentry_t *idtentry; + u8 *stbl; + dtslot_t *f; + int fsi, stblsize; + int n; + dtlock_t *sdtlck, *rdtlck; + tlock_t *tlck; + dtlock_t *dtlck; + lv_t *slv, *rlv, *lv; + + /* get split page */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return EIO; + + jEVENT(0, + ("dtSplitPage: ip:0x%p smp:0x%p rmp:0x%p\n", ip, smp, rmp)); + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + rdtlck = (dtlock_t *) & tlck->lock; + + rp = (dtpage_t *) rmp->data; + *rpp = rp; + rp->header.self = *pxd; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the split page + * + * action: + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckENTRY); + sdtlck = (dtlock_t *) & tlck->lock; + + /* linelock header of split page */ + ASSERT(sdtlck->index == 0); + slv = (lv_t *) & sdtlck->lv[0]; + slv->offset = 0; + slv->length = 1; + sdtlck->index++; + + /* + * initialize/update sibling pointers between sp and rp + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + /* + * initialize new right page + */ + rp->header.flag = sp->header.flag; + + /* compute sorted entry table at start of extent data area */ + rp->header.nextindex = 0; + rp->header.stblindex = 1; + + n = PSIZE >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; /* in unit of slot */ + + /* init freelist */ + fsi = rp->header.stblindex + stblsize; + rp->header.freelist = fsi; + rp->header.freecnt = rp->header.maxslot - fsi; + + /* + * sequential append at tail: append without split + * + * If splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. Adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * If we're wrong it's no big deal, we'll just do the split the right + * way next time. + * (It may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, + * but it's not. Be my guest.) + */ + if (nextbn == 0 && split->index == sp->header.nextindex) { + /* linelock header + stbl (first slot) of new page */ + rlv = (lv_t *) & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 2; + rdtlck->index++; + + /* + * initialize freelist of new right page + */ + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* insert entry at the first entry of the new right page */ + dtInsertEntry(rp, 0, split->key, split->data, &rdtlck); + + goto out; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update prev pointer of previous right sibling page; + */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jEVENT(0, + ("dtSplitPage: tlck = 0x%p, ip = 0x%p, mp=0x%p\n", + tlck, ip, mp)); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock header of previous right sibling page */ + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(rbn); + + DT_PUTPAGE(mp); + } + + /* + * split the data between the split and right pages. + */ + skip = split->index; + half = (PSIZE >> L2DTSLOTSIZE) >> 1; /* swag */ + left = 0; + + /* + * compute fill factor for split pages + * + * traces the next entry to move to rp + * traces the next entry to stay in sp + */ + stbl = (u8 *) & sp->slot[sp->header.stblindex]; + nextindex = sp->header.nextindex; + for (nxt = off = 0; nxt < nextindex; ++off) { + if (off == skip) + /* check for fill factor with new entry size */ + n = split->nslot; + else { + si = stbl[nxt]; + switch (sp->header.flag & BT_TYPE) { + case BT_LEAF: + ldtentry = (ldtentry_t *) & sp->slot[si]; + if (DO_INDEX(ip)) + n = NDTLEAF(ldtentry->namlen); + else + n = NDTLEAF_LEGACY(ldtentry-> + namlen); + break; + + case BT_INTERNAL: + idtentry = (idtentry_t *) & sp->slot[si]; + n = NDTINTERNAL(idtentry->namlen); + break; + + default: + break; + } + + ++nxt; /* advance to next entry to move in sp */ + } + + left += n; + if (left >= half) + break; + } + + /* poins to the 1st entry to move */ + + /* + * move entries to right page + * + * dtMoveEntry() initializes rp and reserves entry for insertion + * + * split page moved out entries are linelocked; + * new/right page moved in entries are linelocked; + */ + /* linelock header + stbl of new right page */ + rlv = (lv_t *) & rdtlck->lv[rdtlck->index]; + rlv->offset = 0; + rlv->length = 5; + rdtlck->index++; + + dtMoveEntry(sp, nxt, rp, &sdtlck, &rdtlck, DO_INDEX(ip)); + + sp->header.nextindex = nxt; + + /* + * finalize freelist of new right page + */ + fsi = rp->header.freelist; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + mp = 0; + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (ldtentry_t *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp); + } + if (mp) + release_metapage(mp); + } + + /* + * the skipped index was on the left page, + */ + if (skip <= off) { + /* insert the new entry in the split page */ + dtInsertEntry(sp, skip, split->key, split->data, &sdtlck); + + /* linelock stbl of split page */ + if (sdtlck->index >= sdtlck->maxcnt) + sdtlck = (dtlock_t *) txLinelock(sdtlck); + slv = (lv_t *) & sdtlck->lv[sdtlck->index]; + n = skip >> L2DTSLOTSIZE; + slv->offset = sp->header.stblindex + n; + slv->length = + ((sp->header.nextindex - 1) >> L2DTSLOTSIZE) - n + 1; + sdtlck->index++; + } + /* + * the skipped index was on the right page, + */ + else { + /* adjust the skip index to reflect the new position */ + skip -= nxt; + + /* insert the new entry in the right page */ + dtInsertEntry(rp, skip, split->key, split->data, &rdtlck); + } + + out: + *rmpp = rmp; + *rpxdp = *pxd; + + ip->i_blocks += LBLK2PBLK(sb, lengthPXD(pxd)); + + jEVENT(0, ("dtSplitPage: ip:0x%p sp:0x%p rp:0x%p\n", ip, sp, rp)); + return 0; +} + + +/* + * dtExtendPage() + * + * function: extend 1st/only directory leaf page + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return extended page pinned; + */ +static int dtExtendPage(tid_t tid, + struct inode *ip, dtsplit_t * split, btstack_t * btstack) +{ + struct super_block *sb = ip->i_sb; + int rc; + metapage_t *smp, *pmp, *mp; + dtpage_t *sp, *pp; + pxdlist_t *pxdlist; + pxd_t *pxd, *tpxd; + int xlen, xsize; + int newstblindex, newstblsize; + int oldstblindex, oldstblsize; + int fsi, last; + dtslot_t *f; + btframe_t *parent; + int n; + dtlock_t *dtlck; + s64 xaddr, txaddr; + tlock_t *tlck; + pxdlock_t *pxdlock; + lv_t *lv; + uint type; + ldtentry_t *ldtentry; + u8 *stbl; + + /* get page to extend */ + smp = split->mp; + sp = DT_PAGE(ip, smp); + + /* get parent/root page */ + parent = BT_POP(btstack); + DT_GETPAGE(ip, parent->bn, pmp, PSIZE, pp, rc); + if (rc) + return (rc); + + /* + * extend the extent + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + + xaddr = addressPXD(pxd); + tpxd = &sp->header.self; + txaddr = addressPXD(tpxd); + /* in-place extension */ + if (xaddr == txaddr) { + type = tlckEXTEND; + } + /* relocation */ + else { + type = tlckNEW; + + /* save moved extent descriptor for later free */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckRELOCATE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = sp->header.self; + pxdlock->index = 1; + + /* + * Update directory index table to reflect new page address + */ + if (DO_INDEX(ip)) { + mp = 0; + stbl = DT_GETSTBL(sp); + for (n = 0; n < sp->header.nextindex; n++) { + ldtentry = + (ldtentry_t *) & sp->slot[stbl[n]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + xaddr, n, &mp); + } + if (mp) + release_metapage(mp); + } + } + + /* + * extend the page + */ + sp->header.self = *pxd; + + jEVENT(0, + ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp)); + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the extended/leaf page + */ + tlck = txLock(tid, ip, smp, tlckDTREE | type); + dtlck = (dtlock_t *) & tlck->lock; + lv = (lv_t *) & dtlck->lv[0]; + + /* update buffer extent descriptor of extended page */ + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; +#ifdef _STILL_TO_PORT + bmSetXD(smp, xaddr, xsize); +#endif /* _STILL_TO_PORT */ + + /* + * copy old stbl to new stbl at start of extended area + */ + oldstblindex = sp->header.stblindex; + oldstblsize = (sp->header.maxslot + 31) >> L2DTSLOTSIZE; + newstblindex = sp->header.maxslot; + n = xsize >> L2DTSLOTSIZE; + newstblsize = (n + 31) >> L2DTSLOTSIZE; + memcpy(&sp->slot[newstblindex], &sp->slot[oldstblindex], + sp->header.nextindex); + + /* + * in-line extension: linelock old area of extended page + */ + if (type == tlckEXTEND) { + /* linelock header */ + lv->offset = 0; + lv->length = 1; + dtlck->index++; + lv++; + + /* linelock new stbl of extended page */ + lv->offset = newstblindex; + lv->length = newstblsize; + } + /* + * relocation: linelock whole relocated area + */ + else { + lv->offset = 0; + lv->length = sp->header.maxslot + newstblsize; + } + + dtlck->index++; + + sp->header.maxslot = n; + sp->header.stblindex = newstblindex; + /* sp->header.nextindex remains the same */ + + /* + * add old stbl region at head of freelist + */ + fsi = oldstblindex; + f = &sp->slot[fsi]; + last = sp->header.freelist; + for (n = 0; n < oldstblsize; n++, fsi++, f++) { + f->next = last; + last = fsi; + } + sp->header.freelist = last; + sp->header.freecnt += oldstblsize; + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = newstblindex + newstblsize; + f = &sp->slot[fsi]; + for (fsi++; fsi < sp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + sp->header.freelist = n; + else { + do { + f = &sp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + sp->header.freecnt += sp->header.maxslot - n; + + /* + * insert the new entry + */ + dtInsertEntry(sp, split->index, split->key, split->data, &dtlck); + + BT_MARK_DIRTY(pmp, ip); + /* + * linelock any freeslots residing in old extent + */ + if (type == tlckEXTEND) { + n = sp->header.maxslot >> 2; + if (sp->header.freelist < n) + dtLinelockFreelist(sp, n, &dtlck); + } + + /* + * update parent entry on the parent/root page + */ + /* + * acquire a transaction lock on the parent/root page + */ + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + lv = (lv_t *) & dtlck->lv[dtlck->index]; + + /* linelock parent entry - 1st slot */ + lv->offset = 1; + lv->length = 1; + dtlck->index++; + + /* update the parent pxd for page extension */ + tpxd = (pxd_t *) & pp->slot[1]; + *tpxd = *pxd; + + /* Since the directory might have an EA and/or ACL associated with it + * we need to make sure we take that into account when setting the + * i_nblocks + */ + ip->i_blocks = LBLK2PBLK(ip->i_sb, xlen + + ((JFS_IP(ip)->ea.flag & DXD_EXTENT) ? + lengthDXD(&JFS_IP(ip)->ea) : 0) + + ((JFS_IP(ip)->acl.flag & DXD_EXTENT) ? + lengthDXD(&JFS_IP(ip)->acl) : 0)); + + jEVENT(0, + ("dtExtendPage: ip:0x%p smp:0x%p sp:0x%p\n", ip, smp, sp)); + + + DT_PUTPAGE(pmp); + return 0; +} + + +/* + * dtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * + * return: 0 - success; + * errno - failure; + * return new page pinned; + */ +static int dtSplitRoot(tid_t tid, + struct inode *ip, dtsplit_t * split, metapage_t ** rmpp) +{ + struct super_block *sb = ip->i_sb; + metapage_t *smp; + dtroot_t *sp; + metapage_t *rmp; + dtpage_t *rp; + s64 rbn; + int xlen; + int xsize; + dtslot_t *f; + s8 *stbl; + int fsi, stblsize, n; + idtentry_t *s; + pxd_t *ppxd; + pxdlist_t *pxdlist; + pxd_t *pxd; + dtlock_t *dtlck; + tlock_t *tlck; + lv_t *lv; + + /* get split root page */ + smp = split->mp; + sp = &JFS_IP(ip)->i_dtroot; + + /* + * allocate/initialize a single (right) child page + * + * N.B. at first split, a one (or two) block to fit new entry + * is allocated; at subsequent split, a full page is allocated; + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + xlen = lengthPXD(pxd); + xsize = xlen << JFS_SBI(sb)->l2bsize; + rmp = get_metapage(ip, rbn, xsize, 1); + rp = rmp->data; + + BT_MARK_DIRTY(rmp, ip); + /* + * acquire a transaction lock on the new right page + */ + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckNEW); + dtlck = (dtlock_t *) & tlck->lock; + + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * move in-line root page into new right page extent + */ + /* linelock header + copied entries + new stbl (1st slot) in new page */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = 10; /* 1 + 8 + 1 */ + dtlck->index++; + + n = xsize >> L2DTSLOTSIZE; + rp->header.maxslot = n; + stblsize = (n + 31) >> L2DTSLOTSIZE; + + /* copy old stbl to new stbl at start of extended area */ + rp->header.stblindex = DTROOTMAXSLOT; + stbl = (s8 *) & rp->slot[DTROOTMAXSLOT]; + memcpy(stbl, sp->header.stbl, sp->header.nextindex); + rp->header.nextindex = sp->header.nextindex; + + /* copy old data area to start of new data area */ + memcpy(&rp->slot[1], &sp->slot[1], IDATASIZE); + + /* + * append free region of newly extended area at tail of freelist + */ + /* init free region of newly extended area */ + fsi = n = DTROOTMAXSLOT + stblsize; + f = &rp->slot[fsi]; + for (fsi++; fsi < rp->header.maxslot; f++, fsi++) + f->next = fsi; + f->next = -1; + + /* append new free region at tail of old freelist */ + fsi = sp->header.freelist; + if (fsi == -1) + rp->header.freelist = n; + else { + rp->header.freelist = fsi; + + do { + f = &rp->slot[fsi]; + fsi = f->next; + } while (fsi != -1); + + f->next = n; + } + + rp->header.freecnt = sp->header.freecnt + rp->header.maxslot - n; + + /* + * Update directory index table for entries now in right page + */ + if ((rp->header.flag & BT_LEAF) && DO_INDEX(ip)) { + metapage_t *mp = 0; + ldtentry_t *ldtentry; + + stbl = DT_GETSTBL(rp); + for (n = 0; n < rp->header.nextindex; n++) { + ldtentry = (ldtentry_t *) & rp->slot[stbl[n]]; + modify_index(tid, ip, le32_to_cpu(ldtentry->index), + rbn, n, &mp); + } + if (mp) + release_metapage(mp); + } + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + dtInsertEntry(rp, split->index, split->key, split->data, &dtlck); + + /* + * reset parent/root page + * + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + * + * The btree comparison code guarantees that the left-most key on any + * level of the tree is never used, so it doesn't need to be filled in. + */ + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the root page (in-memory inode) + */ + tlck = txLock(tid, ip, smp, tlckDTREE | tlckNEW | tlckBTROOT); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + /* update page header of root */ + if (sp->header.flag & BT_LEAF) { + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + } + + /* init the first entry */ + s = (idtentry_t *) & sp->slot[DTENTRYSTART]; + ppxd = (pxd_t *) s; + *ppxd = *pxd; + s->next = -1; + s->namlen = 0; + + stbl = sp->header.stbl; + stbl[0] = DTENTRYSTART; + sp->header.nextindex = 1; + + /* init freelist */ + fsi = DTENTRYSTART + 1; + f = &sp->slot[fsi]; + + /* init free region of remaining area */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + sp->header.freelist = DTENTRYSTART + 1; + sp->header.freecnt = DTROOTMAXSLOT - (DTENTRYSTART + 1); + + *rmpp = rmp; + + ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd)); + return 0; +} + + +/* + * dtDelete() + * + * function: delete the entry(s) referenced by a key. + * + * parameter: + * + * return: + */ +int dtDelete(tid_t tid, + struct inode *ip, component_t * key, ino_t * ino, int flag) +{ + int rc = 0; + s64 bn; + metapage_t *mp, *imp; + dtpage_t *p; + int index; + btstack_t btstack; + dtlock_t *dtlck; + tlock_t *tlck; + lv_t *lv; + int i; + ldtentry_t *ldtentry; + u8 *stbl; + u32 table_index, next_index; + metapage_t *nmp; + dtpage_t *np; + + /* + * search for the entry to delete: + * + * dtSearch() returns (leaf page pinned, index at which to delete). + */ + if ((rc = dtSearch(ip, key, ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* + * We need to find put the index of the next entry into the + * directory index table in order to resume a readdir from this + * entry. + */ + if (DO_INDEX(ip)) { + stbl = DT_GETSTBL(p); + ldtentry = (ldtentry_t *) & p->slot[stbl[index]]; + table_index = le32_to_cpu(ldtentry->index); + if (index == (p->header.nextindex - 1)) { + /* + * Last entry in this leaf page + */ + if ((p->header.flag & BT_ROOT) + || (p->header.next == 0)) + next_index = -1; + else { + /* Read next leaf page */ + DT_GETPAGE(ip, le64_to_cpu(p->header.next), + nmp, PSIZE, np, rc); + if (rc) + next_index = -1; + else { + stbl = DT_GETSTBL(np); + ldtentry = + (ldtentry_t *) & np-> + slot[stbl[0]]; + next_index = + le32_to_cpu(ldtentry->index); + DT_PUTPAGE(nmp); + } + } + } else { + ldtentry = + (ldtentry_t *) & p->slot[stbl[index + 1]]; + next_index = le32_to_cpu(ldtentry->index); + } + free_index(tid, ip, table_index, next_index); + } + /* + * the leaf page becomes empty, delete the page + */ + if (p->header.nextindex == 1) { + /* delete empty page */ + rc = dtDeleteUp(tid, ip, mp, p, &btstack); + } + /* + * the leaf page has other entries remaining: + * + * delete the entry from the leaf page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + + /* + * Do not assume that dtlck->index will be zero. During a + * rename within a directory, this transaction may have + * modified this page already when adding the new entry. + */ + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the leaf entry */ + dtDeleteEntry(p, index, &dtlck); + + /* + * Update directory index table for entries moved in stbl + */ + if (DO_INDEX(ip) && index < p->header.nextindex) { + imp = 0; + stbl = DT_GETSTBL(p); + for (i = index; i < p->header.nextindex; i++) { + ldtentry = + (ldtentry_t *) & p->slot[stbl[i]]; + modify_index(tid, ip, + le32_to_cpu(ldtentry->index), + bn, i, &imp); + } + if (imp) + release_metapage(imp); + } + + DT_PUTPAGE(mp); + } + + return rc; +} + + +/* + * dtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int dtDeleteUp(tid_t tid, struct inode *ip, + metapage_t * fmp, dtpage_t * fp, btstack_t * btstack) +{ + int rc = 0; + metapage_t *mp; + dtpage_t *p; + int index, nextindex; + int xlen; + btframe_t *parent; + dtlock_t *dtlck; + tlock_t *tlck; + lv_t *lv; + pxdlock_t *pxdlock; + int i; + + /* + * keep the root leaf page which has become empty + */ + if (BT_IS_ROOT(fmp)) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(fmp); + + return 0; + } + + /* + * free the non-root leaf page + */ + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor, and + * the buffer page is freed; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = fp->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, fp))) + return rc; + + xlen = lengthPXD(&fp->header.self); + ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the directory tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* pin the parent page */ + DT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * free the extent of the child page deleted + */ + index = parent->index; + + /* + * delete the entry for the child page from parent + */ + nextindex = p->header.nextindex; + + /* + * the parent has the single entry being deleted: + * + * free the parent page which has become empty. + */ + if (nextindex == 1) { + /* + * keep the root internal page which has become empty + */ + if (p->header.flag & BT_ROOT) { + /* + * reset the root + * + * dtInitRoot() acquires txlock on the root + */ + dtInitRoot(tid, ip, PARENT(ip)); + + DT_PUTPAGE(mp); + + return 0; + } + /* + * free the parent page + */ + else { + /* + * acquire a transaction lock on the page + * + * write FREEXTENT|NOREDOPAGE log record + */ + tlck = + txMaplock(tid, ip, + tlckDTREE | tlckFREE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = p->header.self; + pxdlock->index = 1; + + /* update sibling pointers */ + if ((rc = dtRelink(tid, ip, p))) + return rc; + + xlen = lengthPXD(&p->header.self); + ip->i_blocks -= LBLK2PBLK(ip->i_sb, xlen); + + /* free/invalidate its buffer page */ + discard_metapage(mp); + + /* propagate up */ + continue; + } + } + + /* + * the parent has other entries remaining: + * + * delete the router entry from the parent page. + */ + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the page + * + * action: router entry deletion + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + /* linelock stbl of non-root leaf page */ + if (!(p->header.flag & BT_ROOT)) { + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[0]; + } + i = index >> L2DTSLOTSIZE; + lv->offset = p->header.stblindex + i; + lv->length = + ((p->header.nextindex - 1) >> L2DTSLOTSIZE) - + i + 1; + dtlck->index++; + } + + /* free the router entry */ + dtDeleteEntry(p, index, &dtlck); + + /* reset key of new leftmost entry of level (for consistency) */ + if (index == 0 && + ((p->header.flag & BT_ROOT) || p->header.prev == 0)) + dtTruncateEntry(p, 0, &dtlck); + + /* unpin the parent page */ + DT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: dtRelocate() + * + * FUNCTION: relocate dtpage (internal or leaf) of directory; + * This function is mainly used by defragfs utility. + */ +int dtRelocate(tid_t tid, struct inode *ip, s64 lmxaddr, pxd_t * opxd, + s64 nxaddr) +{ + int rc = 0; + metapage_t *mp, *pmp, *lmp, *rmp; + dtpage_t *p, *pp, *rp = 0, *lp= 0; + s64 bn; + int index; + btstack_t btstack; + pxd_t *pxd; + s64 oxaddr, nextbn, prevbn; + int xlen, xsize; + tlock_t *tlck; + dtlock_t *dtlck; + pxdlock_t *pxdlock; + s8 *stbl; + lv_t *lv; + + oxaddr = addressPXD(opxd); + xlen = lengthPXD(opxd); + + jEVENT(0, ("dtRelocate: lmxaddr:%Ld xaddr:%Ld:%Ld xlen:%d\n", + lmxaddr, oxaddr, nxaddr, xlen)); + + /* + * 1. get the internal parent dtpage covering + * router entry for the tartget page to be relocated; + */ + rc = dtSearchNode(ip, lmxaddr, opxd, &btstack); + if (rc) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jEVENT(0, ("dtRelocate: parent router entry validated.\n")); + + /* + * 2. relocate the target dtpage + */ + /* read in the target page from src extent */ + DT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + /* release the pinned parent page */ + DT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + DT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + DT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + DT_PUTPAGE(mp); + DT_PUTPAGE(pmp); + if (rmp) + DT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling dtpages if any; + */ + if (lmp) { + tlck = txLock(tid, ip, lmp, tlckDTREE | tlckRELINK); + dtlck = (dtlock_t *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + lp->header.next = cpu_to_le64(nxaddr); + DT_PUTPAGE(lmp); + } + + if (rmp) { + tlck = txLock(tid, ip, rmp, tlckDTREE | tlckRELINK); + dtlck = (dtlock_t *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + rp->header.prev = cpu_to_le64(nxaddr); + DT_PUTPAGE(rmp); + } + + /* + * update the target dtpage to be relocated + * + * write LOG_REDOPAGE of LOG_NEW type for dst page + * for the whole target page (logredo() will apply + * after image and update bmap for allocation of the + * dst extent), and update bmap for allocation of + * the dst extent; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckNEW); + dtlck = (dtlock_t *) & tlck->lock; + /* linelock header */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + + /* update the self address in the dtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* the dst page is the same as the src page, i.e., + * linelock for afterimage of the whole page; + */ + lv->offset = 0; + lv->length = p->header.maxslot; + dtlck->index++; + + /* update the buffer extent descriptor of the dtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; +#ifdef _STILL_TO_PORT + bmSetXD(mp, nxaddr, xsize); +#endif /* _STILL_TO_PORT */ + /* unpin the relocated page */ + DT_PUTPAGE(mp); + jEVENT(0, ("dtRelocate: target dtpage relocated.\n")); + + /* the moved extent is dtpage, then a LOG_NOREDOPAGE log rec + * needs to be written (in logredo(), the LOG_NOREDOPAGE log rec + * will also force a bmap update ). + */ + + /* + * 3. acquire maplock for the source extent to be freed; + */ + /* for dtpage relocation, write a LOG_NOREDOPAGE record + * for the source dtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * dtpage), and upadte bmap for free of the source dtpage; + */ + tlck = txMaplock(tid, ip, tlckDTREE | tlckFREE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent router entry for relocation; + * + * acquire tlck for the parent entry covering the target dtpage; + * write LOG_REDOPAGE to apply after image only; + */ + jEVENT(0, ("dtRelocate: update parent router entry.\n")); + tlck = txLock(tid, ip, pmp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + lv = (lv_t *) & dtlck->lv[dtlck->index]; + + /* update the PXD with the new address */ + stbl = DT_GETSTBL(pp); + pxd = (pxd_t *) & pp->slot[stbl[index]]; + PXDaddress(pxd, nxaddr); + lv->offset = stbl[index]; + lv->length = 1; + dtlck->index++; + + /* unpin the parent dtpage */ + DT_PUTPAGE(pmp); + + return rc; +} + + +/* + * NAME: dtSearchNode() + * + * FUNCTION: Search for an dtpage containing a specified address + * This function is mainly used by defragfs utility. + * + * NOTE: Search result on stack, the found page is pinned at exit. + * The result page must be an internal dtpage. + * lmxaddr give the address of the left most page of the + * dtree level, in which the required dtpage resides. + */ +static int dtSearchNode(struct inode *ip, s64 lmxaddr, pxd_t * kpxd, + btstack_t * btstack) +{ + int rc = 0; + s64 bn; + metapage_t *mp; + dtpage_t *p; + int psize = 288; /* initial in-line directory */ + s8 *stbl; + int i; + pxd_t *pxd; + btframe_t *btsp; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend tree to the level with specified leftmost page + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* does the xaddr of leftmost page of the levevl + * matches levevl search key ? + */ + if (p->header.flag & BT_ROOT) { + if (lmxaddr == 0) + break; + } else if (addressPXD(&p->header.self) == lmxaddr) + break; + + /* + * descend down to leftmost child page + */ + if (p->header.flag & BT_LEAF) + return ESTALE; + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + pxd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(pxd); + psize = lengthPXD(pxd) << JFS_SBI(ip->i_sb)->l2bsize; + /* unpin the parent page */ + DT_PUTPAGE(mp); + } + + /* + * search each page at the current levevl + */ + loop: + stbl = DT_GETSTBL(p); + for (i = 0; i < p->header.nextindex; i++) { + pxd = (pxd_t *) & p->slot[stbl[i]]; + + /* found the specified router entry */ + if (addressPXD(pxd) == addressPXD(kpxd) && + lengthPXD(pxd) == lengthPXD(kpxd)) { + btsp = btstack->top; + btsp->bn = bn; + btsp->index = i; + btsp->mp = mp; + + return 0; + } + } + + /* get the right sibling page if any */ + if (p->header.next) + bn = le64_to_cpu(p->header.next); + else { + DT_PUTPAGE(mp); + return ESTALE; + } + + /* unpin current page */ + DT_PUTPAGE(mp); + + /* get the right sibling page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + goto loop; +} + + +/* + * dtRelink() + * + * function: + * link around a freed page. + * + * parameter: + * fp: page to be freed + * + * return: + */ +static int dtRelink(tid_t tid, struct inode *ip, dtpage_t * p) +{ + int rc; + metapage_t *mp; + s64 nextbn, prevbn; + tlock_t *tlck; + dtlock_t *dtlck; + lv_t *lv; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + DT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page + * + * action: update prev pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jEVENT(0, + ("dtRelink nextbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n", + tlck, ip, mp)); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.prev = cpu_to_le64(prevbn); + DT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + DT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the prev page + * + * action: update next pointer; + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckRELINK); + jEVENT(0, + ("dtRelink prevbn: tlck = 0x%p, ip = 0x%p, mp=0x%p\n", + tlck, ip, mp)); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock header */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + + p->header.next = cpu_to_le64(nextbn); + DT_PUTPAGE(mp); + } + + return 0; +} + + +/* + * dtInitRoot() + * + * initialize directory root (inline in inode) + */ +void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + dtroot_t *p; + int fsi; + dtslot_t *f; + tlock_t *tlck; + dtlock_t *dtlck; + lv_t *lv; + u16 xflag_save; + + /* + * If this was previously an non-empty directory, we need to remove + * the old directory table. + */ + if (DO_INDEX(ip)) { + if (jfs_ip->next_index > (MAX_INLINE_DIRTABLE_ENTRY + 1)) { + tblock_t *tblk = tid_to_tblock(tid); + /* + * We're playing games with the tid's xflag. If + * we're removing a regular file, the file's xtree + * is committed with COMMIT_PMAP, but we always + * commit the directories xtree with COMMIT_PWMAP. + */ + xflag_save = tblk->xflag; + tblk->xflag = 0; + /* + * xtTruncate isn't guaranteed to fully truncate + * the xtree. The caller needs to check i_size + * after committing the transaction to see if + * additional truncation is needed. The + * COMMIT_Stale flag tells caller that we + * initiated the truncation. + */ + xtTruncate(tid, ip, 0, COMMIT_PWMAP); + set_cflag(COMMIT_Stale, ip); + + tblk->xflag = xflag_save; + /* + * Tells jfs_metapage code that the metadata pages + * for the index table are no longer useful, and + * remove them from page cache. + */ + invalidate_inode_metapages(ip); + } else + ip->i_size = 1; + + jfs_ip->next_index = 2; + } else + ip->i_size = IDATASIZE; + + /* + * acquire a transaction lock on the root + * + * action: directory initialization; + */ + tlck = txLock(tid, ip, (metapage_t *) & jfs_ip->bxflag, + tlckDTREE | tlckENTRY | tlckBTROOT); + dtlck = (dtlock_t *) & tlck->lock; + + /* linelock root */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = DTROOTMAXSLOT; + dtlck->index++; + + p = &jfs_ip->i_dtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + + p->header.nextindex = 0; + + /* init freelist */ + fsi = 1; + f = &p->slot[fsi]; + + /* init data area of root */ + for (fsi++; fsi < DTROOTMAXSLOT; f++, fsi++) + f->next = fsi; + f->next = -1; + + p->header.freelist = 1; + p->header.freecnt = 8; + + /* init '..' entry */ + p->header.idotdot = cpu_to_le32(idotdot); + +#if 0 + ip->i_blocks = LBLK2PBLK(ip->i_sb, + ((jfs_ip->ea.flag & DXD_EXTENT) ? + lengthDXD(&jfs_ip->ea) : 0) + + ((jfs_ip->acl.flag & DXD_EXTENT) ? + lengthDXD(&jfs_ip->acl) : 0)); +#endif + + return; +} + +/* + * jfs_readdir() + * + * function: read directory entries sequentially + * from the specified entry offset + * + * parameter: + * + * return: offset = (pn, index) of start entry + * of next jfs_readdir()/dtRead() + */ +int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct inode *ip = filp->f_dentry->d_inode; + struct nls_table *codepage = JFS_SBI(ip->i_sb)->nls_tab; + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) &filp->f_pos; + s64 bn; + metapage_t *mp; + dtpage_t *p; + int index; + s8 *stbl; + btstack_t btstack; + int i, next; + ldtentry_t *d; + dtslot_t *t; + int d_namleft, d_namlen, len, outlen; + char *d_name, *name_ptr; + int dtlhdrdatalen; + u32 dir_index; + int do_index = 0; + uint loop_count = 0; + + if (filp->f_pos == DIREND) + return 0; + + if (DO_INDEX(ip)) { + /* + * persistent index is stored in directory entries. + * Special cases: 0 = . + * 1 = .. + * -1 = End of directory + */ + do_index = 1; + dtlhdrdatalen = DTLHDRDATALEN; + + dir_index = (u32) filp->f_pos; + + if (dir_index > 1) { + dir_table_slot_t dirtab_slot; + + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + repeat: + rc = get_index(ip, dir_index, &dirtab_slot); + if (rc) { + filp->f_pos = DIREND; + return rc; + } + if (dirtab_slot.flag == DIR_INDEX_FREE) { + if (loop_count++ > JFS_IP(ip)->next_index) { + jERROR(1, ("jfs_readdir detected " + "infinite loop!\n")); + filp->f_pos = DIREND; + return 0; + } + dir_index = le32_to_cpu(dirtab_slot.addr2); + if (dir_index == -1) { + filp->f_pos = DIREND; + return 0; + } + goto repeat; + } + bn = addressDTS(&dirtab_slot); + index = dirtab_slot.slot; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + filp->f_pos = DIREND; + return 0; + } + if (p->header.flag & BT_INTERNAL) { + jERROR(1,("jfs_readdir: bad index table\n")); + DT_PUTPAGE(mp); + filp->f_pos = -1; + return 0; + } + } else { + if (dir_index == 0) { + /* + * self "." + */ + filp->f_pos = 0; + if (filldir(dirent, ".", 1, 0, ip->i_ino, + DT_DIR)) + return 0; + } + /* + * parent ".." + */ + filp->f_pos = 1; + if (filldir + (dirent, "..", 2, 1, PARENT(ip), DT_DIR)) + return 0; + + /* + * Find first entry of left-most leaf + */ + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadFirst(ip, &btstack))) + return -rc; + + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } + } else { + /* + * Legacy filesystem - OS/2 & Linux JFS < 0.3.6 + * + * pn = index = 0: First entry "." + * pn = 0; index = 1: Second entry ".." + * pn > 0: Real entries, pn=1 -> leftmost page + * pn = index = -1: No more entries + */ + dtlhdrdatalen = DTLHDRDATALEN_LEGACY; + + if (filp->f_pos == 0) { + /* build "." entry */ + + if (filldir(dirent, ".", 1, filp->f_pos, ip->i_ino, + DT_DIR)) + return 0; + dtoffset->index = 1; + } + + if (dtoffset->pn == 0) { + if (dtoffset->index == 1) { + /* build ".." entry */ + + if (filldir(dirent, "..", 2, filp->f_pos, + PARENT(ip), DT_DIR)) + return 0; + } else { + jERROR(1, + ("jfs_readdir called with invalid offset!\n")); + } + dtoffset->pn = 1; + dtoffset->index = 0; + } + + if (dtEmpty(ip)) { + filp->f_pos = DIREND; + return 0; + } + + if ((rc = dtReadNext(ip, &filp->f_pos, &btstack))) { + jERROR(1, + ("jfs_readdir: unexpected rc = %d from dtReadNext\n", + rc)); + filp->f_pos = DIREND; + return 0; + } + /* get start leaf page and index */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* offset beyond directory eof ? */ + if (bn < 0) { + filp->f_pos = DIREND; + return 0; + } + } + + d_name = kmalloc((JFS_NAME_MAX + 1) * sizeof(wchar_t), GFP_NOFS); + if (d_name == NULL) { + DT_PUTPAGE(mp); + jERROR(1, ("jfs_readdir: kmalloc failed!\n")); + filp->f_pos = DIREND; + return 0; + } + while (1) { + stbl = DT_GETSTBL(p); + + for (i = index; i < p->header.nextindex; i++) { + d = (ldtentry_t *) & p->slot[stbl[i]]; + + d_namleft = d->namlen; + name_ptr = d_name; + + if (do_index) { + filp->f_pos = le32_to_cpu(d->index); + len = min(d_namleft, DTLHDRDATALEN); + } else + len = min(d_namleft, DTLHDRDATALEN_LEGACY); + + /* copy the name of head/only segment */ + outlen = jfs_strfromUCS_le(name_ptr, d->name, len, + codepage); + d_namlen = outlen; + + /* copy name in the additional segment(s) */ + next = d->next; + while (next >= 0) { + t = (dtslot_t *) & p->slot[next]; + name_ptr += outlen; + d_namleft -= len; + /* Sanity Check */ + if (d_namleft == 0) { + jERROR(1,("JFS:Dtree error: " + "ino = %ld, bn=%Ld, index = %d\n", + ip->i_ino, bn, i)); + updateSuper(ip->i_sb, FM_DIRTY); + goto skip_one; + } + len = min(d_namleft, DTSLOTDATALEN); + outlen = jfs_strfromUCS_le(name_ptr, t->name, + len, codepage); + d_namlen+= outlen; + + next = t->next; + } + + if (filldir(dirent, d_name, d_namlen, filp->f_pos, + le32_to_cpu(d->inumber), DT_UNKNOWN)) + goto out; +skip_one: + if (!do_index) + dtoffset->index++; + } + + /* + * get next leaf page + */ + + if (p->header.flag & BT_ROOT) { + filp->f_pos = DIREND; + break; + } + + bn = le64_to_cpu(p->header.next); + if (bn == 0) { + filp->f_pos = DIREND; + break; + } + + /* unpin previous leaf page */ + DT_PUTPAGE(mp); + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) { + kfree(d_name); + return -rc; + } + + /* update offset (pn:index) for new page */ + index = 0; + if (!do_index) { + dtoffset->pn++; + dtoffset->index = 0; + } + + } + + out: + kfree(d_name); + DT_PUTPAGE(mp); + + return rc; +} + + +/* + * dtReadFirst() + * + * function: get the leftmost page of the directory + */ +static int dtReadFirst(struct inode *ip, btstack_t * btstack) +{ + int rc = 0; + s64 bn; + int psize = 288; /* initial in-line directory */ + metapage_t *mp; + dtpage_t *p; + s8 *stbl; + btframe_t *btsp; + pxd_t *xd; + + BT_CLR(btstack); /* reset stack */ + + /* + * descend leftmost path of the tree + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* + * leftmost leaf page + */ + if (p->header.flag & BT_LEAF) { + /* return leftmost entry */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = 0; + btsp->mp = mp; + + return 0; + } + + /* + * descend down to leftmost child page + */ + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, 0); + + /* get the leftmost entry */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[0]]; + + /* get the child page block address */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << JFS_SBI(ip->i_sb)->l2bsize; + + /* unpin the parent page */ + DT_PUTPAGE(mp); + } +} + + +/* + * dtReadNext() + * + * function: get the page of the specified offset (pn:index) + * + * return: if (offset > eof), bn = -1; + * + * note: if index > nextindex of the target leaf page, + * start with 1st entry of next leaf page; + */ +static int dtReadNext(struct inode *ip, loff_t * offset, btstack_t * btstack) +{ + int rc = 0; + struct dtoffset { + s16 pn; + s16 index; + s32 unused; + } *dtoffset = (struct dtoffset *) offset; + s64 bn; + metapage_t *mp; + dtpage_t *p; + int index; + int pn; + s8 *stbl; + btframe_t *btsp, *parent; + pxd_t *xd; + + /* + * get leftmost leaf page pinned + */ + if ((rc = dtReadFirst(ip, btstack))) + return rc; + + /* get leaf page */ + DT_GETSEARCH(ip, btstack->top, bn, mp, p, index); + + /* get the start offset (pn:index) */ + pn = dtoffset->pn - 1; /* Now pn = 0 represents leftmost leaf */ + index = dtoffset->index; + + /* start at leftmost page ? */ + if (pn == 0) { + /* offset beyond eof ? */ + if (index < p->header.nextindex) + goto out; + + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = index = 0; + goto a; + } + + /* start at non-leftmost page: scan parent pages for large pn */ + if (p->header.flag & BT_ROOT) { + bn = -1; + goto out; + } + + /* start after next leaf page ? */ + if (pn > 1) + goto b; + + /* get leaf page pn = 1 */ + a: + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + goto c; + + /* + * scan last internal page level to get target leaf page + */ + b: + /* unpin leftmost leaf page */ + DT_PUTPAGE(mp); + + /* get left most parent page */ + btsp = btstack->top; + parent = btsp - 1; + bn = parent->bn; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* scan parent pages at last internal page level */ + while (pn >= p->header.nextindex) { + pn -= p->header.nextindex; + + /* get next parent page address */ + bn = le64_to_cpu(p->header.next); + + /* unpin current parent page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next parent page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* update parent page stack frame */ + parent->bn = bn; + } + + /* get leaf page address */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[pn]]; + bn = addressPXD(xd); + + /* unpin parent page */ + DT_PUTPAGE(mp); + + /* + * get target leaf page + */ + c: + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * leaf page has been completed: + * start with 1st entry of next leaf page + */ + if (index >= p->header.nextindex) { + bn = le64_to_cpu(p->header.next); + + /* unpin leaf page */ + DT_PUTPAGE(mp); + + /* offset beyond eof ? */ + if (bn == 0) { + bn = -1; + goto out; + } + + /* get next leaf page */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* start with 1st entry of next leaf page */ + dtoffset->pn++; + dtoffset->index = 0; + } + + out: + /* return target leaf page pinned */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = dtoffset->index; + btsp->mp = mp; + + return 0; +} + + +/* + * dtCompare() + * + * function: compare search key with an internal entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int dtCompare(component_t * key, /* search key */ + dtpage_t * p, /* directory page */ + int si) +{ /* entry slot index */ + register int rc; + register wchar_t *kname, *name; + register int klen, namlen, len; + idtentry_t *ih; + dtslot_t *t; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + ih = (idtentry_t *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + + /* compare with head/only segment */ + len = min(klen, len); + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + kname += len; + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (dtslot_t *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + if ((rc = UniStrncmp_le(kname, name, len))) + return rc; + + klen -= len; + namlen -= len; + kname += len; + si = t->next; + } + + return (klen - namlen); +} + + + + +/* + * ciCompare() + * + * function: compare search key with an (leaf/internal) entry + * + * return: + * < 0 if k is < record + * = 0 if k is = record + * > 0 if k is > record + */ +static int ciCompare(component_t * key, /* search key */ + dtpage_t * p, /* directory page */ + int si, /* entry slot index */ + int flag) +{ + register int rc; + register wchar_t *kname, *name, x; + register int klen, namlen, len; + ldtentry_t *lh; + idtentry_t *ih; + dtslot_t *t; + int i; + + /* + * force the left-most key on internal pages, at any level of + * the tree, to be less than any search key. + * this obviates having to update the leftmost key on an internal + * page when the user inserts a new key in the tree smaller than + * anything that has been stored. + * + * (? if/when dtSearch() narrows down to 1st entry (index = 0), + * at any internal page at any level of the tree, + * it descends to child of the entry anyway - + * ? make the entry as min size dummy entry) + * + * if (e->index == 0 && h->prevpg == P_INVALID && !(h->flags & BT_LEAF)) + * return (1); + */ + + kname = key->name; + klen = key->namlen; + + /* + * leaf page entry + */ + if (p->header.flag & BT_LEAF) { + lh = (ldtentry_t *) & p->slot[si]; + si = lh->next; + name = lh->name; + namlen = lh->namlen; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } + /* + * internal page entry + */ + else { + ih = (idtentry_t *) & p->slot[si]; + si = ih->next; + name = ih->name; + namlen = ih->namlen; + len = min(namlen, DTIHDRDATALEN); + } + + /* compare with head/only segment */ + len = min(klen, len); + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + + /* compare with additional segment(s) */ + while (klen > 0 && namlen > 0) { + /* compare with next name segment */ + t = (dtslot_t *) & p->slot[si]; + len = min(namlen, DTSLOTDATALEN); + len = min(klen, len); + name = t->name; + for (i = 0; i < len; i++, kname++, name++) { + /* only uppercase if case-insensitive support is on */ + if ((flag & JFS_OS2) == JFS_OS2) + x = UniToupper(le16_to_cpu(*name)); + else + x = le16_to_cpu(*name); + + if ((rc = *kname - x)) + return rc; + } + + klen -= len; + namlen -= len; + si = t->next; + } + + return (klen - namlen); +} + + +/* + * ciGetLeafPrefixKey() + * + * function: compute prefix of suffix compression + * from two adjacent leaf entries + * across page boundary + * + * return: + * Number of prefix bytes needed to distinguish b from a. + */ +static void ciGetLeafPrefixKey(dtpage_t * lp, int li, dtpage_t * rp, + int ri, component_t * key, int flag) +{ + register int klen, namlen; + register wchar_t *pl, *pr, *kname; + wchar_t lname[JFS_NAME_MAX + 1]; + component_t lkey = { 0, lname }; + wchar_t rname[JFS_NAME_MAX + 1]; + component_t rkey = { 0, rname }; + + /* get left and right key */ + dtGetKey(lp, li, &lkey, flag); + lkey.name[lkey.namlen] = 0; + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&lkey); + + dtGetKey(rp, ri, &rkey, flag); + rkey.name[rkey.namlen] = 0; + + + if ((flag & JFS_OS2) == JFS_OS2) + ciToUpper(&rkey); + + /* compute prefix */ + klen = 0; + kname = key->name; + namlen = min(lkey.namlen, rkey.namlen); + for (pl = lkey.name, pr = rkey.name; + namlen; pl++, pr++, namlen--, klen++, kname++) { + *kname = *pr; + if (*pl != *pr) { + key->namlen = klen + 1; + return; + } + } + + /* l->namlen <= r->namlen since l <= r */ + if (lkey.namlen < rkey.namlen) { + *kname = *pr; + key->namlen = klen + 1; + } else /* l->namelen == r->namelen */ + key->namlen = klen; + + return; +} + + + +/* + * dtGetKey() + * + * function: get key of the entry + */ +static void dtGetKey(dtpage_t * p, int i, /* entry index */ + component_t * key, int flag) +{ + int si; + s8 *stbl; + ldtentry_t *lh; + idtentry_t *ih; + dtslot_t *t; + int namlen, len; + wchar_t *name, *kname; + + /* get entry */ + stbl = DT_GETSTBL(p); + si = stbl[i]; + if (p->header.flag & BT_LEAF) { + lh = (ldtentry_t *) & p->slot[si]; + si = lh->next; + namlen = lh->namlen; + name = lh->name; + if (flag & JFS_DIR_INDEX) + len = min(namlen, DTLHDRDATALEN); + else + len = min(namlen, DTLHDRDATALEN_LEGACY); + } else { + ih = (idtentry_t *) & p->slot[si]; + si = ih->next; + namlen = ih->namlen; + name = ih->name; + len = min(namlen, DTIHDRDATALEN); + } + + key->namlen = namlen; + kname = key->name; + + /* + * move head/only segment + */ + UniStrncpy_le(kname, name, len); + + /* + * move additional segment(s) + */ + while (si >= 0) { + /* get next segment */ + t = &p->slot[si]; + kname += len; + namlen -= len; + len = min(namlen, DTSLOTDATALEN); + UniStrncpy_le(kname, t->name, len); + + si = t->next; + } +} + + +/* + * dtInsertEntry() + * + * function: allocate free slot(s) and + * write a leaf/internal entry + * + * return: entry slot index + */ +static void dtInsertEntry(dtpage_t * p, int index, component_t * key, + ddata_t * data, dtlock_t ** dtlock) +{ + dtslot_t *h, *t; + ldtentry_t *lh = 0; + idtentry_t *ih = 0; + int hsi, fsi, klen, len, nextindex; + wchar_t *kname, *name; + s8 *stbl; + pxd_t *xd; + dtlock_t *dtlck = *dtlock; + lv_t *lv; + int xsi, n; + s64 bn = 0; + metapage_t *mp = 0; + + klen = key->namlen; + kname = key->name; + + /* allocate a free slot */ + hsi = fsi = p->header.freelist; + h = &p->slot[fsi]; + p->header.freelist = h->next; + --p->header.freecnt; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + + lv = (lv_t *) & dtlck->lv[dtlck->index]; + lv->offset = hsi; + + /* write head/only segment */ + if (p->header.flag & BT_LEAF) { + lh = (ldtentry_t *) h; + lh->next = h->next; + lh->inumber = data->leaf.ino; /* little-endian */ + lh->namlen = klen; + name = lh->name; + if (data->leaf.ip) { + len = min(klen, DTLHDRDATALEN); + if (!(p->header.flag & BT_ROOT)) + bn = addressPXD(&p->header.self); + lh->index = cpu_to_le32(add_index(data->leaf.tid, + data->leaf.ip, + bn, index)); + } else + len = min(klen, DTLHDRDATALEN_LEGACY); + } else { + ih = (idtentry_t *) h; + ih->next = h->next; + xd = (pxd_t *) ih; + *xd = data->xd; + ih->namlen = klen; + name = ih->name; + len = min(klen, DTIHDRDATALEN); + } + + UniStrncpy_le(name, kname, len); + + n = 1; + xsi = hsi; + + /* write additional segment(s) */ + t = h; + klen -= len; + while (klen) { + /* get free slot */ + fsi = p->header.freelist; + t = &p->slot[fsi]; + p->header.freelist = t->next; + --p->header.freecnt; + + /* is next slot contiguous ? */ + if (fsi != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[0]; + } + + lv->offset = fsi; + n = 0; + } + + kname += len; + len = min(klen, DTSLOTDATALEN); + UniStrncpy_le(t->name, kname, len); + + n++; + xsi = fsi; + klen -= len; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* terminate last/only segment */ + if (h == t) { + /* single segment entry */ + if (p->header.flag & BT_LEAF) + lh->next = -1; + else + ih->next = -1; + } else + /* multi-segment entry */ + t->next = -1; + + /* if insert into middle, shift right succeeding entries in stbl */ + stbl = DT_GETSTBL(p); + nextindex = p->header.nextindex; + if (index < nextindex) { + memmove(stbl + index + 1, stbl + index, nextindex - index); + + if ((p->header.flag & BT_LEAF) && data->leaf.ip) { + /* + * Need to update slot number for entries that moved + * in the stbl + */ + mp = 0; + for (n = index + 1; n <= nextindex; n++) { + lh = (ldtentry_t *) & (p->slot[stbl[n]]); + modify_index(data->leaf.tid, data->leaf.ip, + le32_to_cpu(lh->index), bn, n, + &mp); + } + if (mp) + release_metapage(mp); + } + } + + stbl[index] = hsi; + + /* advance next available entry index of stbl */ + ++p->header.nextindex; +} + + +/* + * dtMoveEntry() + * + * function: move entries from split/left page to new/right page + * + * nextindex of dst page and freelist/freecnt of both pages + * are updated. + */ +static void dtMoveEntry(dtpage_t * sp, int si, dtpage_t * dp, + dtlock_t ** sdtlock, dtlock_t ** ddtlock, + int do_index) +{ + int ssi, next; /* src slot index */ + int di; /* dst entry index */ + int dsi; /* dst slot index */ + s8 *sstbl, *dstbl; /* sorted entry table */ + int snamlen, len; + ldtentry_t *slh, *dlh = 0; + idtentry_t *sih, *dih = 0; + dtslot_t *h, *s, *d; + dtlock_t *sdtlck = *sdtlock, *ddtlck = *ddtlock; + lv_t *slv, *dlv; + int xssi, ns, nd; + int sfsi; + + sstbl = (s8 *) & sp->slot[sp->header.stblindex]; + dstbl = (s8 *) & dp->slot[dp->header.stblindex]; + + dsi = dp->header.freelist; /* first (whole page) free slot */ + sfsi = sp->header.freelist; + + /* linelock destination entry slot */ + dlv = (lv_t *) & ddtlck->lv[ddtlck->index]; + dlv->offset = dsi; + + /* linelock source entry slot */ + slv = (lv_t *) & sdtlck->lv[sdtlck->index]; + slv->offset = sstbl[si]; + xssi = slv->offset - 1; + + /* + * move entries + */ + ns = nd = 0; + for (di = 0; si < sp->header.nextindex; si++, di++) { + ssi = sstbl[si]; + dstbl[di] = dsi; + + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = (dtlock_t *) txLinelock(sdtlck); + slv = (lv_t *) & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* + * move head/only segment of an entry + */ + /* get dst slot */ + h = d = &dp->slot[dsi]; + + /* get src slot and move */ + s = &sp->slot[ssi]; + if (sp->header.flag & BT_LEAF) { + /* get source entry */ + slh = (ldtentry_t *) s; + dlh = (ldtentry_t *) h; + snamlen = slh->namlen; + + if (do_index) { + len = min(snamlen, DTLHDRDATALEN); + dlh->index = slh->index; /* little-endian */ + } else + len = min(snamlen, DTLHDRDATALEN_LEGACY); + + memcpy(dlh, slh, 6 + len * 2); + + next = slh->next; + + /* update dst head/only segment next field */ + dsi++; + dlh->next = dsi; + } else { + sih = (idtentry_t *) s; + snamlen = sih->namlen; + + len = min(snamlen, DTIHDRDATALEN); + dih = (idtentry_t *) h; + memcpy(dih, sih, 10 + len * 2); + next = sih->next; + + dsi++; + dih->next = dsi; + } + + /* free src head/only segment */ + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + ns++; + nd++; + xssi = ssi; + + /* + * move additional segment(s) of the entry + */ + snamlen -= len; + while ((ssi = next) >= 0) { + /* is next slot contiguous ? */ + if (ssi != xssi + 1) { + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + + /* open new linelock */ + if (sdtlck->index < sdtlck->maxcnt) + slv++; + else { + sdtlck = + (dtlock_t *) + txLinelock(sdtlck); + slv = (lv_t *) & sdtlck->lv[0]; + } + + slv->offset = ssi; + ns = 0; + } + + /* get next source segment */ + s = &sp->slot[ssi]; + + /* get next destination free slot */ + d++; + + len = min(snamlen, DTSLOTDATALEN); + UniStrncpy(d->name, s->name, len); + + ns++; + nd++; + xssi = ssi; + + dsi++; + d->next = dsi; + + /* free source segment */ + next = s->next; + s->next = sfsi; + s->cnt = 1; + sfsi = ssi; + + snamlen -= len; + } /* end while */ + + /* terminate dst last/only segment */ + if (h == d) { + /* single segment entry */ + if (dp->header.flag & BT_LEAF) + dlh->next = -1; + else + dih->next = -1; + } else + /* multi-segment entry */ + d->next = -1; + } /* end for */ + + /* close current linelock */ + slv->length = ns; + sdtlck->index++; + *sdtlock = sdtlck; + + dlv->length = nd; + ddtlck->index++; + *ddtlock = ddtlck; + + /* update source header */ + sp->header.freelist = sfsi; + sp->header.freecnt += nd; + + /* update destination header */ + dp->header.nextindex = di; + + dp->header.freelist = dsi; + dp->header.freecnt -= nd; +} + + +/* + * dtDeleteEntry() + * + * function: free a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtDeleteEntry(dtpage_t * p, int fi, dtlock_t ** dtlock) +{ + int fsi; /* free entry slot index */ + s8 *stbl; + dtslot_t *t; + int si, freecnt; + dtlock_t *dtlck = *dtlock; + lv_t *lv; + int xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + fsi = stbl[fi]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + /* get the head/only segment */ + t = &p->slot[fsi]; + if (p->header.flag & BT_LEAF) + si = ((ldtentry_t *) t)->next; + else + si = ((idtentry_t *) t)->next; + t->next = si; + t->cnt = 1; + + n = freecnt = 1; + xsi = fsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; + + /* if delete from middle, + * shift left the succedding entries in the stbl + */ + si = p->header.nextindex; + if (fi < si - 1) + memmove(&stbl[fi], &stbl[fi + 1], si - fi - 1); + + p->header.nextindex--; +} + + +/* + * dtTruncateEntry() + * + * function: truncate a (leaf/internal) entry + * + * log freelist header, stbl, and each segment slot of entry + * (even though last/only segment next field is modified, + * physical image logging requires all segment slots of + * the entry logged to avoid applying previous updates + * to the same slots) + */ +static void dtTruncateEntry(dtpage_t * p, int ti, dtlock_t ** dtlock) +{ + int tsi; /* truncate entry slot index */ + s8 *stbl; + dtslot_t *t; + int si, freecnt; + dtlock_t *dtlck = *dtlock; + lv_t *lv; + int fsi, xsi, n; + + /* get free entry slot index */ + stbl = DT_GETSTBL(p); + tsi = stbl[ti]; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + + lv->offset = tsi; + + /* get the head/only segment */ + t = &p->slot[tsi]; + ASSERT(p->header.flag & BT_INTERNAL); + ((idtentry_t *) t)->namlen = 0; + si = ((idtentry_t *) t)->next; + ((idtentry_t *) t)->next = -1; + + n = 1; + freecnt = 0; + fsi = si; + xsi = tsi; + + /* find the last/only segment */ + while (si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + freecnt++; + + t = &p->slot[si]; + t->cnt = 1; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; + + /* update freelist */ + if (freecnt == 0) + return; + t->next = p->header.freelist; + p->header.freelist = fsi; + p->header.freecnt += freecnt; +} + + +/* + * dtLinelockFreelist() + */ +static void dtLinelockFreelist(dtpage_t * p, /* directory page */ + int m, /* max slot index */ + dtlock_t ** dtlock) +{ + int fsi; /* free entry slot index */ + dtslot_t *t; + int si; + dtlock_t *dtlck = *dtlock; + lv_t *lv; + int xsi, n; + + /* get free entry slot index */ + fsi = p->header.freelist; + + /* open new linelock */ + if (dtlck->index >= dtlck->maxcnt) + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[dtlck->index]; + + lv->offset = fsi; + + n = 1; + xsi = fsi; + + t = &p->slot[fsi]; + si = t->next; + + /* find the last/only segment */ + while (si < m && si >= 0) { + /* is next slot contiguous ? */ + if (si != xsi + 1) { + /* close current linelock */ + lv->length = n; + dtlck->index++; + + /* open new linelock */ + if (dtlck->index < dtlck->maxcnt) + lv++; + else { + dtlck = (dtlock_t *) txLinelock(dtlck); + lv = (lv_t *) & dtlck->lv[0]; + } + + lv->offset = si; + n = 0; + } + + n++; + xsi = si; + + t = &p->slot[si]; + si = t->next; + } + + /* close current linelock */ + lv->length = n; + dtlck->index++; + + *dtlock = dtlck; +} + + +/* + * NAME: dtModify + * + * FUNCTION: Modify the inode number part of a directory entry + * + * PARAMETERS: + * tid - Transaction id + * ip - Inode of parent directory + * key - Name of entry to be modified + * orig_ino - Original inode number expected in entry + * new_ino - New inode number to put into entry + * flag - JFS_RENAME + * + * RETURNS: + * ESTALE - If entry found does not match orig_ino passed in + * ENOENT - If no entry can be found to match key + * 0 - If successfully modified entry + */ +int dtModify(tid_t tid, struct inode *ip, + component_t * key, ino_t * orig_ino, ino_t new_ino, int flag) +{ + int rc; + s64 bn; + metapage_t *mp; + dtpage_t *p; + int index; + btstack_t btstack; + tlock_t *tlck; + dtlock_t *dtlck; + lv_t *lv; + s8 *stbl; + int entry_si; /* entry slot index */ + ldtentry_t *entry; + + /* + * search for the entry to modify: + * + * dtSearch() returns (leaf page pinned, index at which to modify). + */ + if ((rc = dtSearch(ip, key, orig_ino, &btstack, flag))) + return rc; + + /* retrieve search result */ + DT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page of named entry + */ + tlck = txLock(tid, ip, mp, tlckDTREE | tlckENTRY); + dtlck = (dtlock_t *) & tlck->lock; + + /* get slot index of the entry */ + stbl = DT_GETSTBL(p); + entry_si = stbl[index]; + + /* linelock entry */ + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = entry_si; + lv->length = 1; + dtlck->index++; + + /* get the head/only segment */ + entry = (ldtentry_t *) & p->slot[entry_si]; + + /* substitute the inode number of the entry */ + entry->inumber = cpu_to_le32(new_ino); + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + + return 0; +} + +#ifdef _JFS_DEBUG_DTREE +/* + * dtDisplayTree() + * + * function: traverse forward + */ +int dtDisplayTree(struct inode *ip) +{ + int rc; + metapage_t *mp; + dtpage_t *p; + s64 bn, pbn; + int index, lastindex, v, h; + pxd_t *xd; + btstack_t btstack; + btframe_t *btsp; + btframe_t *parent; + u8 *stbl; + int psize = 256; + + printk("display B+-tree.\n"); + + /* clear stack */ + btsp = btstack.stack; + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + v = h = 0; + + /* + * first access of each page: + */ + newPage: + DT_GETPAGE(ip, bn, mp, psize, p, rc); + if (rc) + return rc; + + /* process entries forward from first index */ + index = 0; + lastindex = p->header.nextindex - 1; + + if (p->header.flag & BT_INTERNAL) { + /* + * first access of each internal page + */ + printf("internal page "); + dtDisplayPage(ip, bn, p); + + goto getChild; + } else { /* (p->header.flag & BT_LEAF) */ + + /* + * first access of each leaf page + */ + printf("leaf page "); + dtDisplayPage(ip, bn, p); + + /* + * process leaf page entries + * + for ( ; index <= lastindex; index++) + { + } + */ + + /* unpin the leaf page */ + DT_PUTPAGE(mp); + } + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) + /* current page must have been root */ + return; + + /* + * parent page scan completed + */ + if ((index = parent->index) == (lastindex = parent->lastindex)) { + /* go back up to the parent page */ + goto getParent; + } + + /* + * parent page has entries remaining + */ + /* get back the parent page */ + bn = parent->bn; + /* v = parent->level; */ + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* get next parent entry */ + index++; + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* push/save current parent entry for the child page */ + btsp->bn = pbn = bn; + btsp->index = index; + btsp->lastindex = lastindex; + /* btsp->level = v; */ + /* btsp->node = h; */ + ++btsp; + + /* get current entry for the child page */ + stbl = DT_GETSTBL(p); + xd = (pxd_t *) & p->slot[stbl[index]]; + + /* + * first access of each internal entry: + */ + + /* get child page */ + bn = addressPXD(xd); + psize = lengthPXD(xd) << ip->i_ipmnt->i_l2bsize; + + printk("traverse down 0x%Lx[%d]->0x%Lx\n", pbn, index, bn); + v++; + h = index; + + /* release parent page */ + DT_PUTPAGE(mp); + + /* process the child page */ + goto newPage; +} + + +/* + * dtDisplayPage() + * + * function: display page + */ +int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p) +{ + int rc; + metapage_t *mp; + ldtentry_t *lh; + idtentry_t *ih; + pxd_t *xd; + int i, j; + u8 *stbl; + wchar_t name[JFS_NAME_MAX + 1]; + component_t key = { 0, name }; + int freepage = 0; + + if (p == NULL) { + freepage = 1; + DT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + } + + /* display page control */ + printk("bn:0x%Lx flag:0x%08x nextindex:%d\n", + bn, p->header.flag, p->header.nextindex); + + /* display entries */ + stbl = DT_GETSTBL(p); + for (i = 0, j = 1; i < p->header.nextindex; i++, j++) { + dtGetKey(p, i, &key, JFS_SBI(ip->i_sb)->mntflag); + key.name[key.namlen] = '\0'; + if (p->header.flag & BT_LEAF) { + lh = (ldtentry_t *) & p->slot[stbl[i]]; + printf("\t[%d] %s:%d", i, key.name, + le32_to_cpu(lh->inumber)); + } else { + ih = (idtentry_t *) & p->slot[stbl[i]]; + xd = (pxd_t *) ih; + bn = addressPXD(xd); + printf("\t[%d] %s:0x%Lx", i, key.name, bn); + } + + if (j == 4) { + printf("\n"); + j = 0; + } + } + + printf("\n"); + + if (freepage) + DT_PUTPAGE(mp); + + return 0; +} +#endif /* _JFS_DEBUG_DTREE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_dtree.h linuxppc64_2_4/fs/jfs/jfs_dtree.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_dtree.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_dtree.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,288 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Change History : + * + */ + +#ifndef _H_JFS_DTREE +#define _H_JFS_DTREE + +/* + * jfs_dtree.h: directory B+-tree manager + */ + +#include "jfs_btree.h" + +typedef union { + struct { + tid_t tid; + struct inode *ip; + u32 ino; + } leaf; + pxd_t xd; +} ddata_t; + + +/* + * entry segment/slot + * + * an entry consists of type dependent head/only segment/slot and + * additional segments/slots linked vi next field; + * N.B. last/only segment of entry is terminated by next = -1; + */ +/* + * directory page slot + */ +typedef struct { + s8 next; /* 1: */ + s8 cnt; /* 1: */ + wchar_t name[15]; /* 30: */ +} dtslot_t; /* (32) */ + + +#define DATASLOTSIZE 16 +#define L2DATASLOTSIZE 4 +#define DTSLOTSIZE 32 +#define L2DTSLOTSIZE 5 +#define DTSLOTHDRSIZE 2 +#define DTSLOTDATASIZE 30 +#define DTSLOTDATALEN 15 + +/* + * internal node entry head/only segment + */ +typedef struct { + pxd_t xd; /* 8: child extent descriptor */ + + s8 next; /* 1: */ + u8 namlen; /* 1: */ + wchar_t name[11]; /* 22: 2-byte aligned */ +} idtentry_t; /* (32) */ + +#define DTIHDRSIZE 10 +#define DTIHDRDATALEN 11 + +/* compute number of slots for entry */ +#define NDTINTERNAL(klen) ( ((4 + (klen)) + (15 - 1)) / 15 ) + + +/* + * leaf node entry head/only segment + * + * For legacy filesystems, name contains 13 wchars -- no index field + */ +typedef struct { + u32 inumber; /* 4: 4-byte aligned */ + s8 next; /* 1: */ + u8 namlen; /* 1: */ + wchar_t name[11]; /* 22: 2-byte aligned */ + u32 index; /* 4: index into dir_table */ +} ldtentry_t; /* (32) */ + +#define DTLHDRSIZE 6 +#define DTLHDRDATALEN_LEGACY 13 /* Old (OS/2) format */ +#define DTLHDRDATALEN 11 + +/* + * dir_table used for directory traversal during readdir + */ + +/* + * Keep persistent index for directory entries + */ +#define DO_INDEX(INODE) (JFS_SBI((INODE)->i_sb)->mntflag & JFS_DIR_INDEX) + +/* + * Maximum entry in inline directory table + */ +#define MAX_INLINE_DIRTABLE_ENTRY 13 + +typedef struct dir_table_slot { + u8 rsrvd; /* 1: */ + u8 flag; /* 1: 0 if free */ + u8 slot; /* 1: slot within leaf page of entry */ + u8 addr1; /* 1: upper 8 bits of leaf page address */ + u32 addr2; /* 4: lower 32 bits of leaf page address -OR- + index of next entry when this entry was deleted */ +} dir_table_slot_t; /* (8) */ + +/* + * flag values + */ +#define DIR_INDEX_VALID 1 +#define DIR_INDEX_FREE 0 + +#define DTSaddress(dir_table_slot, address64)\ +{\ + (dir_table_slot)->addr1 = ((u64)address64) >> 32;\ + (dir_table_slot)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +#define addressDTS(dts)\ + ( ((s64)((dts)->addr1)) << 32 | __le32_to_cpu((dts)->addr2) ) + +/* compute number of slots for entry */ +#define NDTLEAF_LEGACY(klen) ( ((2 + (klen)) + (15 - 1)) / 15 ) +#define NDTLEAF NDTINTERNAL + + +/* + * directory root page (in-line in on-disk inode): + * + * cf. dtpage_t below. + */ +typedef union { + struct { + dasd_t DASD; /* 16: DASD limit/usage info F226941 */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next free entry in stbl */ + s8 freecnt; /* 1: free count */ + s8 freelist; /* 1: freelist header */ + + u32 idotdot; /* 4: parent inode number */ + + s8 stbl[8]; /* 8: sorted entry index table */ + } header; /* (32) */ + + dtslot_t slot[9]; +} dtroot_t; + +#define PARENT(IP) \ + (le32_to_cpu(JFS_IP(IP)->i_dtroot.header.idotdot)) + +#define DTROOTMAXSLOT 9 + +#define dtEmpty(IP) (JFS_IP(IP)->i_dtroot.header.nextindex == 0) + + +/* + * directory regular page: + * + * entry slot array of 32 byte slot + * + * sorted entry slot index table (stbl): + * contiguous slots at slot specified by stblindex, + * 1-byte per entry + * 512 byte block: 16 entry tbl (1 slot) + * 1024 byte block: 32 entry tbl (1 slot) + * 2048 byte block: 64 entry tbl (2 slot) + * 4096 byte block: 128 entry tbl (4 slot) + * + * data area: + * 512 byte block: 16 - 2 = 14 slot + * 1024 byte block: 32 - 2 = 30 slot + * 2048 byte block: 64 - 3 = 61 slot + * 4096 byte block: 128 - 5 = 123 slot + * + * N.B. index is 0-based; index fields refer to slot index + * except nextindex which refers to entry index in stbl; + * end of entry stot list or freelist is marked with -1. + */ +typedef union { + struct { + s64 next; /* 8: next sibling */ + s64 prev; /* 8: previous sibling */ + + u8 flag; /* 1: */ + u8 nextindex; /* 1: next entry index in stbl */ + s8 freecnt; /* 1: */ + s8 freelist; /* 1: slot index of head of freelist */ + + u8 maxslot; /* 1: number of slots in page slot[] */ + u8 stblindex; /* 1: slot index of start of stbl */ + u8 rsrvd[2]; /* 2: */ + + pxd_t self; /* 8: self pxd */ + } header; /* (32) */ + + dtslot_t slot[128]; +} dtpage_t; + +#define DTPAGEMAXSLOT 128 + +#define DT8THPGNODEBYTES 512 +#define DT8THPGNODETSLOTS 1 +#define DT8THPGNODESLOTS 16 + +#define DTQTRPGNODEBYTES 1024 +#define DTQTRPGNODETSLOTS 1 +#define DTQTRPGNODESLOTS 32 + +#define DTHALFPGNODEBYTES 2048 +#define DTHALFPGNODETSLOTS 2 +#define DTHALFPGNODESLOTS 64 + +#define DTFULLPGNODEBYTES 4096 +#define DTFULLPGNODETSLOTS 4 +#define DTFULLPGNODESLOTS 128 + +#define DTENTRYSTART 1 + +/* get sorted entry table of the page */ +#define DT_GETSTBL(p) ( ((p)->header.flag & BT_ROOT) ?\ + ((dtroot_t *)(p))->header.stbl : \ + (s8 *)&(p)->slot[(p)->header.stblindex] ) + +/* + * Flags for dtSearch + */ +#define JFS_CREATE 1 +#define JFS_LOOKUP 2 +#define JFS_REMOVE 3 +#define JFS_RENAME 4 + +#define DIRENTSIZ(namlen) \ + ( (sizeof(struct dirent) - 2*(JFS_NAME_MAX+1) + 2*((namlen)+1) + 3) &~ 3 ) + +/* + * Maximum file offset for directories. + */ +#define DIREND INT_MAX + +/* + * external declarations + */ +extern void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot); + +extern int dtSearch(struct inode *ip, component_t * key, + ino_t * data, btstack_t * btstack, int flag); + +extern int dtInsert(tid_t tid, struct inode *ip, + component_t * key, ino_t * ino, btstack_t * btstack); + +extern int dtDelete(tid_t tid, + struct inode *ip, component_t * key, ino_t * data, int flag); + +extern int dtRelocate(tid_t tid, + struct inode *ip, s64 lmxaddr, pxd_t * opxd, s64 nxaddr); + +extern int dtModify(tid_t tid, struct inode *ip, + component_t * key, ino_t * orig_ino, ino_t new_ino, int flag); + +extern int jfs_readdir(struct file *filp, void *dirent, filldir_t filldir); + +#ifdef _JFS_DEBUG_DTREE +extern int dtDisplayTree(struct inode *ip); + +extern int dtDisplayPage(struct inode *ip, s64 bn, dtpage_t * p); +#endif /* _JFS_DEBUG_DTREE */ + +#endif /* !_H_JFS_DTREE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_extendfs.h linuxppc64_2_4/fs/jfs/jfs_extendfs.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_extendfs.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_extendfs.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,39 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENDFS +#define _H_JFS_EXTENDFS + +/* + * jfs_extendfs.h + */ +/* + * extendfs parameter list + */ +typedef struct { + u32 flag; /* 4: */ + u8 dev; /* 1: */ + u8 pad[3]; /* 3: */ + s64 LVSize; /* 8: LV size in LV block */ + s64 FSSize; /* 8: FS size in LV block */ + s32 LogSize; /* 4: inlinelog size in LV block */ +} extendfs_t; /* (28) */ + +/* plist flag */ +#define EXTENDFS_QUERY 0x00000001 + +#endif /* _H_JFS_EXTENDFS */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_extent.c linuxppc64_2_4/fs/jfs/jfs_extent.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_extent.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_extent.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,637 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * + * Module: jfs_extent.c: + */ + +#include +#include "jfs_incore.h" +#include "jfs_dmap.h" +#include "jfs_extent.h" +#include "jfs_debug.h" + +/* + * forward references + */ +static int extBalloc(struct inode *, s64, s64 *, s64 *); +static int extBrealloc(struct inode *, s64, s64, s64 *, s64 *); +int extRecord(struct inode *, xad_t *); +static s64 extRoundDown(s64 nb); + +/* + * external references + */ +extern int dbExtend(struct inode *, s64, s64, s64); +extern int jfs_commit_inode(struct inode *, int); + + +#define DPD(a) (printk("(a): %d\n",(a))) +#define DPC(a) (printk("(a): %c\n",(a))) +#define DPL1(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x ",(a)); \ + else \ + printk("(a): %x ",(a) << 32); \ +} +#define DPL(a) \ +{ \ + if ((a) >> 32) \ + printk("(a): %x%08x\n",(a)); \ + else \ + printk("(a): %x\n",(a) << 32); \ +} + +#define DPD1(a) (printk("(a): %d ",(a))) +#define DPX(a) (printk("(a): %08x\n",(a))) +#define DPX1(a) (printk("(a): %08x ",(a))) +#define DPS(a) (printk("%s\n",(a))) +#define DPE(a) (printk("\nENTERING: %s\n",(a))) +#define DPE1(a) (printk("\nENTERING: %s",(a))) +#define DPS1(a) (printk(" %s ",(a))) + + +/* + * NAME: extAlloc() + * + * FUNCTION: allocate an extent for a specified page range within a + * file. + * + * PARAMETERS: + * ip - the inode of the file. + * xlen - requested extent length. + * pno - the starting page number with the file. + * xp - pointer to an xad. on entry, xad describes an + * extent that is used as an allocation hint if the + * xaddr of the xad is non-zero. on successful exit, + * the xad describes the newly allocated extent. + * abnr - boolean_t indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +int +extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + s64 nxlen, nxaddr, xoff, hint, xaddr = 0; + int rc, nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* validate extent length */ + if (xlen > MAXXLEN) + xlen = MAXXLEN; + + /* get the number of blocks per page */ + nbperpage = sbi->nbperpage; + + /* get the page's starting extent offset */ + xoff = pno << sbi->l2nbperpage; + + /* check if an allocation hint was provided */ + if ((hint = addressXAD(xp))) { + /* get the size of the extent described by the hint */ + nxlen = lengthXAD(xp); + + /* check if the hint is for the portion of the file + * immediately previous to the current allocation + * request and if hint extent has the same abnr + * value as the current request. if so, we can + * extend the hint extent to include the current + * extent if we can allocate the blocks immediately + * following the hint extent. + */ + if (offsetXAD(xp) + nxlen == xoff && + abnr == ((xp->flag & XAD_NOTRECORDED) ? TRUE : FALSE)) + xaddr = hint + nxlen; + + /* adjust the hint to the last block of the extent */ + hint += (nxlen - 1); + } + + /* allocate the disk blocks for the extent. initially, extBalloc() + * will try to allocate disk blocks for the requested size (xlen). + * if this fails (xlen contigious free blocks not avaliable), it'll + * try to allocate a smaller number of blocks (producing a smaller + * extent), with this smaller number of blocks consisting of the + * requested number of blocks rounded down to the next smaller + * power of 2 number (i.e. 16 -> 8). it'll continue to round down + * and retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + */ + nxlen = xlen; + if ((rc = + extBalloc(ip, hint ? hint : INOHINT(ip), &nxlen, &nxaddr))) { + return (rc); + } + + /* determine the value of the extent flag */ + xflag = (abnr == TRUE) ? XAD_NOTRECORDED : 0; + + /* if we can extend the hint extent to cover the current request, + * extend it. otherwise, insert a new extent to + * cover the current request. + */ + if (xaddr && xaddr == nxaddr) + rc = xtExtend(0, ip, xoff, (int) nxlen, 0); + else + rc = xtInsert(0, ip, xflag, xoff, (int) nxlen, &nxaddr, 0); + + /* if the extend or insert failed, + * free the newly allocated blocks and return the error. + */ + if (rc) { + dbFree(ip, nxaddr, nxlen); + return (rc); + } + + /* update the number of blocks allocated to the file */ + ip->i_blocks += LBLK2PBLK(ip->i_sb, nxlen); + + /* set the results of the extent allocation */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + /* + * COMMIT_SyncList flags an anonymous tlock on page that is on + * sync list. + * We need to commit the inode to get the page written disk. + */ + if (test_and_clear_cflag(COMMIT_Synclist,ip)) + jfs_commit_inode(ip, 0); + + return (0); +} + + +/* + * NAME: extRealloc() + * + * FUNCTION: extend the allocation of a file extent containing a + * partial back last page. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf for the partial backed last page. + * xlen - request size of the resulting extent. + * xp - pointer to an xad. on successful exit, the xad + * describes the newly allocated extent. + * abnr - boolean_t indicating whether the newly allocated extent + * should be marked as allocated but not recorded. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +int extRealloc(struct inode *ip, s64 nxlen, xad_t * xp, boolean_t abnr) +{ + struct super_block *sb = ip->i_sb; + s64 xaddr, xlen, nxaddr, delta, xoff; + s64 ntail, nextend, ninsert; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + int xflag; + + /* This blocks if we are low on resources */ + txBeginAnon(ip->i_sb); + + /* validate extent length */ + if (nxlen > MAXXLEN) + nxlen = MAXXLEN; + + /* get the extend (partial) page's disk block address and + * number of blocks. + */ + xaddr = addressXAD(xp); + xlen = lengthXAD(xp); + xoff = offsetXAD(xp); + + /* if the extend page is abnr and if the request is for + * the extent to be allocated and recorded, + * make the page allocated and recorded. + */ + if ((xp->flag & XAD_NOTRECORDED) && !abnr) { + xp->flag = 0; + if ((rc = xtUpdate(0, ip, xp))) + return (rc); + } + + /* try to allocated the request number of blocks for the + * extent. dbRealloc() first tries to satisfy the request + * by extending the allocation in place. otherwise, it will + * try to allocate a new set of blocks large enough for the + * request. in satisfying a request, dbReAlloc() may allocate + * less than what was request but will always allocate enough + * space as to satisfy the extend page. + */ + if ((rc = extBrealloc(ip, xaddr, xlen, &nxlen, &nxaddr))) + return (rc); + + delta = nxlen - xlen; + + /* check if the extend page is not abnr but the request is abnr + * and the allocated disk space is for more than one page. if this + * is the case, there is a miss match of abnr between the extend page + * and the one or more pages following the extend page. as a result, + * two extents will have to be manipulated. the first will be that + * of the extent of the extend page and will be manipulated thru + * an xtExtend() or an xtTailgate(), depending upon whether the + * disk allocation occurred as an inplace extension. the second + * extent will be manipulated (created) through an xtInsert() and + * will be for the pages following the extend page. + */ + if (abnr && (!(xp->flag & XAD_NOTRECORDED)) && (nxlen > nbperpage)) { + ntail = nbperpage; + nextend = ntail - xlen; + ninsert = nxlen - nbperpage; + + xflag = XAD_NOTRECORDED; + } else { + ntail = nxlen; + nextend = delta; + ninsert = 0; + + xflag = xp->flag; + } + + /* if we were able to extend the disk allocation in place, + * extend the extent. otherwise, move the extent to a + * new disk location. + */ + if (xaddr == nxaddr) { + /* extend the extent */ + if ((rc = xtExtend(0, ip, xoff + xlen, (int) nextend, 0))) { + dbFree(ip, xaddr + xlen, delta); + return (rc); + } + } else { + /* + * move the extent to a new location: + * + * xtTailgate() accounts for relocated tail extent; + */ + if ((rc = xtTailgate(0, ip, xoff, (int) ntail, nxaddr, 0))) { + dbFree(ip, nxaddr, nxlen); + return (rc); + } + } + + + /* check if we need to also insert a new extent */ + if (ninsert) { + /* perform the insert. if it fails, free the blocks + * to be inserted and make it appear that we only did + * the xtExtend() or xtTailgate() above. + */ + xaddr = nxaddr + ntail; + if (xtInsert (0, ip, xflag, xoff + ntail, (int) ninsert, + &xaddr, 0)) { + dbFree(ip, xaddr, (s64) ninsert); + delta = nextend; + nxlen = ntail; + xflag = 0; + } + } + + /* update the inode with the number of blocks allocated */ + ip->i_blocks += LBLK2PBLK(sb, delta); + + /* set the return results */ + XADaddress(xp, nxaddr); + XADlength(xp, nxlen); + XADoffset(xp, xoff); + xp->flag = xflag; + + mark_inode_dirty(ip); + + return (0); +} + + +/* + * NAME: extHint() + * + * FUNCTION: produce an extent allocation hint for a file offset. + * + * PARAMETERS: + * ip - the inode of the file. + * offset - file offset for which the hint is needed. + * xp - pointer to the xad that is to be filled in with + * the hint. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + */ +int extHint(struct inode *ip, s64 offset, xad_t * xp) +{ + struct super_block *sb = ip->i_sb; + xadlist_t xadl; + lxdlist_t lxdl; + lxd_t lxd; + s64 prev; + int rc, nbperpage = JFS_SBI(sb)->nbperpage; + + /* init the hint as "no hint provided" */ + XADaddress(xp, 0); + + /* determine the starting extent offset of the page previous + * to the page containing the offset. + */ + prev = ((offset & ~POFFSET) >> JFS_SBI(sb)->l2bsize) - nbperpage; + + /* if the offsets in the first page of the file, + * no hint provided. + */ + if (prev < 0) + return (0); + + /* prepare to lookup the previous page's extent info */ + lxdl.maxnlxd = 1; + lxdl.nlxd = 1; + lxdl.lxd = &lxd; + LXDoffset(&lxd, prev) + LXDlength(&lxd, nbperpage); + + xadl.maxnxad = 1; + xadl.nxad = 0; + xadl.xad = xp; + + /* perform the lookup */ + if ((rc = xtLookupList(ip, &lxdl, &xadl, 0))) + return (rc); + + /* check if not extent exists for the previous page. + * this is possible for sparse files. + */ + if (xadl.nxad == 0) { +// assert(ISSPARSE(ip)); + return (0); + } + + /* only preserve the abnr flag within the xad flags + * of the returned hint. + */ + xp->flag &= XAD_NOTRECORDED; + + assert(xadl.nxad == 1); + assert(lengthXAD(xp) == nbperpage); + + return (0); +} + + +/* + * NAME: extRecord() + * + * FUNCTION: change a page with a file from not recorded to recorded. + * + * PARAMETERS: + * ip - inode of the file. + * cp - cbuf of the file page. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +int extRecord(struct inode *ip, xad_t * xp) +{ + int rc; + + txBeginAnon(ip->i_sb); + + /* update the extent */ + if ((rc = xtUpdate(0, ip, xp))) + return (rc); + +#ifdef _STILL_TO_PORT + /* no longer abnr */ + cp->cm_abnr = FALSE; + + /* mark the cbuf as modified */ + cp->cm_modified = TRUE; +#endif /* _STILL_TO_PORT */ + + return (0); +} + + +/* + * NAME: extFill() + * + * FUNCTION: allocate disk space for a file page that represents + * a file hole. + * + * PARAMETERS: + * ip - the inode of the file. + * cp - cbuf of the file page represent the hole. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +int extFill(struct inode *ip, xad_t * xp) +{ + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + s64 blkno = offsetXAD(xp) >> ip->i_blksize; + +// assert(ISSPARSE(ip)); + + /* initialize the extent allocation hint */ + XADaddress(xp, 0); + + /* allocate an extent to fill the hole */ + if ((rc = extAlloc(ip, nbperpage, blkno, xp, FALSE))) + return (rc); + + assert(lengthPXD(xp) == nbperpage); + + return (0); +} + + +/* + * NAME: extBalloc() + * + * FUNCTION: allocate disk blocks to form an extent. + * + * initially, we will try to allocate disk blocks for the + * requested size (nblocks). if this fails (nblocks + * contigious free blocks not avaliable), we'll try to allocate + * a smaller number of blocks (producing a smaller extent), with + * this smaller number of blocks consisting of the requested + * number of blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * hint - disk block number to be used as an allocation hint. + * *nblocks - pointer to an s64 value. on entry, this value specifies + * the desired number of block to be allocated. on successful + * exit, this value is set to the number of blocks actually + * allocated. + * blkno - pointer to a block address that is filled in on successful + * return with the starting block number of the newly + * allocated block range. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +static int +extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) +{ + s64 nb, nblks, daddr, max; + int rc, nbperpage = JFS_SBI(ip->i_sb)->nbperpage; + bmap_t *mp = JFS_SBI(ip->i_sb)->bmap; + + /* get the number of blocks to initially attempt to allocate. + * we'll first try the number of blocks requested unless this + * number is greater than the maximum number of contigious free + * blocks in the map. in that case, we'll start off with the + * maximum free. + */ + max = (s64) 1 << mp->db_maxfreebud; + if (*nblocks >= max && *nblocks > nbperpage) + nb = nblks = (max > nbperpage) ? max : nbperpage; + else + nb = nblks = *nblocks; + + /* try to allocate blocks */ + while ((rc = dbAlloc(ip, hint, nb, &daddr))) { + /* if something other than an out of space error, + * stop and return this error. + */ + if (rc != ENOSPC) + return (rc); + + /* decrease the allocation request size */ + nb = min(nblks, extRoundDown(nb)); + + /* give up if we cannot cover a page */ + if (nb < nbperpage) + return (rc); + } + + *nblocks = nb; + *blkno = daddr; + + return (0); +} + + +/* + * NAME: extBrealloc() + * + * FUNCTION: attempt to extend an extent's allocation. + * + * initially, we will try to extend the extent's allocation + * in place. if this fails, we'll try to move the extent + * to a new set of blocks. if moving the extent, we initially + * will try to allocate disk blocks for the requested size + * (nnew). if this fails (nnew contigious free blocks not + * avaliable), we'll try to allocate a smaller number of + * blocks (producing a smaller extent), with this smaller + * number of blocks consisting of the requested number of + * blocks rounded down to the next smaller power of 2 + * number (i.e. 16 -> 8). we'll continue to round down and + * retry the allocation until the number of blocks to allocate + * is smaller than the number of blocks per page. + * + * PARAMETERS: + * ip - the inode of the file. + * blkno - starting block number of the extents current allocation. + * nblks - number of blocks within the extents current allocation. + * newnblks - pointer to a s64 value. on entry, this value is the + * the new desired extent size (number of blocks). on + * successful exit, this value is set to the extent's actual + * new size (new number of blocks). + * newblkno - the starting block number of the extents new allocation. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOSPC - insufficient disk resources. + */ +static int +extBrealloc(struct inode *ip, + s64 blkno, s64 nblks, s64 * newnblks, s64 * newblkno) +{ + int rc; + + /* try to extend in place */ + if ((rc = dbExtend(ip, blkno, nblks, *newnblks - nblks)) == 0) { + *newblkno = blkno; + return (0); + } else { + if (rc != ENOSPC) + return (rc); + } + + /* in place extension not possible. + * try to move the extent to a new set of blocks. + */ + return (extBalloc(ip, blkno, newnblks, newblkno)); +} + + +/* + * NAME: extRoundDown() + * + * FUNCTION: round down a specified number of blocks to the next + * smallest power of 2 number. + * + * PARAMETERS: + * nb - the inode of the file. + * + * RETURN VALUES: + * next smallest power of 2 number. + */ +static s64 extRoundDown(s64 nb) +{ + int i; + u64 m, k; + + for (i = 0, m = (u64) 1 << 63; i < 64; i++, m >>= 1) { + if (m & nb) + break; + } + + i = 63 - i; + k = (u64) 1 << i; + k = ((k - 1) & nb) ? k : k >> 1; + + return (k); +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_extent.h linuxppc64_2_4/fs/jfs/jfs_extent.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_extent.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_extent.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,31 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_EXTENT +#define _H_JFS_EXTENT + +/* get block allocation allocation hint as location of disk inode */ +#define INOHINT(ip) \ + (addressPXD(&(JFS_IP(ip)->ixpxd)) + lengthPXD(&(JFS_IP(ip)->ixpxd)) - 1) + +extern int extAlloc(struct inode *, s64, s64, xad_t *, boolean_t); +extern int extFill(struct inode *, xad_t *); +extern int extHint(struct inode *, s64, xad_t *); +extern int extRealloc(struct inode *, s64, xad_t *, boolean_t); +extern int extRecord(struct inode *, xad_t *); + +#endif /* _H_JFS_EXTENT */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_filsys.h linuxppc64_2_4/fs/jfs/jfs_filsys.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_filsys.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_filsys.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,274 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ + +#ifndef _H_JFS_FILSYS +#define _H_JFS_FILSYS + +/* + * jfs_filsys.h + * + * file system (implementation-dependent) constants + * + * refer to for system wide implementation-dependent constants + */ + +/* + * file system option (superblock flag) + */ +/* platform option (conditional compilation) */ +#define JFS_AIX 0x80000000 /* AIX support */ +/* POSIX name/directory support */ + +#define JFS_OS2 0x40000000 /* OS/2 support */ +/* case-insensitive name/directory support */ + +#define JFS_DFS 0x20000000 /* DCE DFS LFS support */ + +#define JFS_LINUX 0x10000000 /* Linux support */ +/* case-sensitive name/directory support */ + +/* directory option */ +#define JFS_UNICODE 0x00000001 /* unicode name */ + +/* commit option */ +#define JFS_COMMIT 0x00000f00 /* commit option mask */ +#define JFS_GROUPCOMMIT 0x00000100 /* group (of 1) commit */ +#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */ +#define JFS_TMPFS 0x00000400 /* temporary file system - + * do not log/commit: + */ + +/* log logical volume option */ +#define JFS_INLINELOG 0x00000800 /* inline log within file system */ +#define JFS_INLINEMOVE 0x00001000 /* inline log being moved */ + +/* Secondary aggregate inode table */ +#define JFS_BAD_SAIT 0x00010000 /* current secondary ait is bad */ + +/* sparse regular file support */ +#define JFS_SPARSE 0x00020000 /* sparse regular file */ + +/* DASD Limits F226941 */ +#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */ +#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */ + +/* big endian flag */ +#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */ + +/* Directory index */ +#define JFS_DIR_INDEX 0x00200000 /* Persistant index for */ + /* directory entries */ + + +/* + * buffer cache configuration + */ +/* page size */ +#ifdef PSIZE +#undef PSIZE +#endif +#define PSIZE 4096 /* page size (in byte) */ +#define L2PSIZE 12 /* log2(PSIZE) */ +#define POFFSET 4095 /* offset within page */ + +/* buffer page size */ +#define BPSIZE PSIZE + +/* + * fs fundamental size + * + * PSIZE >= file system block size >= PBSIZE >= DISIZE + */ +#define PBSIZE 512 /* physical block size (in byte) */ +#define L2PBSIZE 9 /* log2(PBSIZE) */ + +#define DISIZE 512 /* on-disk inode size (in byte) */ +#define L2DISIZE 9 /* log2(DISIZE) */ + +#define IDATASIZE 256 /* inode inline data size */ +#define IXATTRSIZE 128 /* inode inline extended attribute size */ + +#define XTPAGE_SIZE 4096 +#define log2_PAGESIZE 12 + +#define IAG_SIZE 4096 +#define IAG_EXTENT_SIZE 4096 +#define INOSPERIAG 4096 /* number of disk inodes per iag */ +#define L2INOSPERIAG 12 /* l2 number of disk inodes per iag */ +#define INOSPEREXT 32 /* number of disk inode per extent */ +#define L2INOSPEREXT 5 /* l2 number of disk inode per extent */ +#define IXSIZE (DISIZE * INOSPEREXT) /* inode extent size */ +#define INOSPERPAGE 8 /* number of disk inodes per 4K page */ +#define L2INOSPERPAGE 3 /* log2(INOSPERPAGE) */ + +#define IAGFREELIST_LWM 64 + +#define INODE_EXTENT_SIZE IXSIZE /* inode extent size */ +#define NUM_INODE_PER_EXTENT INOSPEREXT +#define NUM_INODE_PER_IAG INOSPERIAG + +#define MINBLOCKSIZE 512 +#define MAXBLOCKSIZE 4096 +#define MAXFILESIZE ((s64)1 << 52) + +#define JFS_LINK_MAX 65535 /* nlink_t is unsigned short */ + +/* Minimum number of bytes supported for a JFS partition */ +#define MINJFS (0x1000000) +#define MINJFSTEXT "16" + +/* + * file system block size -> physical block size + */ +#define LBOFFSET(x) ((x) & (PBSIZE - 1)) +#define LBNUMBER(x) ((x) >> L2PBSIZE) +#define LBLK2PBLK(sb,b) ((b) << (sb->s_blocksize_bits - L2PBSIZE)) +#define PBLK2LBLK(sb,b) ((b) >> (sb->s_blocksize_bits - L2PBSIZE)) +/* size in byte -> last page number */ +#define SIZE2PN(size) ( ((s64)((size) - 1)) >> (L2PSIZE) ) +/* size in byte -> last file system block number */ +#define SIZE2BN(size, l2bsize) ( ((s64)((size) - 1)) >> (l2bsize) ) + +/* + * fixed physical block address (physical block size = 512 byte) + * + * NOTE: since we can't guarantee a physical block size of 512 bytes the use of + * these macros should be removed and the byte offset macros used instead. + */ +#define SUPER1_B 64 /* primary superblock */ +#define AIMAP_B (SUPER1_B + 8) /* 1st extent of aggregate inode map */ +#define AITBL_B (AIMAP_B + 16) /* + * 1st extent of aggregate inode table + */ +#define SUPER2_B (AITBL_B + 32) /* 2ndary superblock pbn */ +#define BMAP_B (SUPER2_B + 8) /* block allocation map */ + +/* + * SIZE_OF_SUPER defines the total amount of space reserved on disk for the + * superblock. This is not the same as the superblock structure, since all of + * this space is not currently being used. + */ +#define SIZE_OF_SUPER PSIZE + +/* + * SIZE_OF_AG_TABLE defines the amount of space reserved to hold the AG table + */ +#define SIZE_OF_AG_TABLE PSIZE + +/* + * SIZE_OF_MAP_PAGE defines the amount of disk space reserved for each page of + * the inode allocation map (to hold iag) + */ +#define SIZE_OF_MAP_PAGE PSIZE + +/* + * fixed byte offset address + */ +#define SUPER1_OFF 0x8000 /* primary superblock */ +#define AIMAP_OFF (SUPER1_OFF + SIZE_OF_SUPER) + /* + * Control page of aggregate inode map + * followed by 1st extent of map + */ +#define AITBL_OFF (AIMAP_OFF + (SIZE_OF_MAP_PAGE << 1)) + /* + * 1st extent of aggregate inode table + */ +#define SUPER2_OFF (AITBL_OFF + INODE_EXTENT_SIZE) + /* + * secondary superblock + */ +#define BMAP_OFF (SUPER2_OFF + SIZE_OF_SUPER) + /* + * block allocation map + */ + +/* + * The following macro is used to indicate the number of reserved disk blocks at + * the front of an aggregate, in terms of physical blocks. This value is + * currently defined to be 32K. This turns out to be the same as the primary + * superblock's address, since it directly follows the reserved blocks. + */ +#define AGGR_RSVD_BLOCKS SUPER1_B + +/* + * The following macro is used to indicate the number of reserved bytes at the + * front of an aggregate. This value is currently defined to be 32K. This + * turns out to be the same as the primary superblock's byte offset, since it + * directly follows the reserved blocks. + */ +#define AGGR_RSVD_BYTES SUPER1_OFF + +/* + * The following macro defines the byte offset for the first inode extent in + * the aggregate inode table. This allows us to find the self inode to find the + * rest of the table. Currently this value is 44K. + */ +#define AGGR_INODE_TABLE_START AITBL_OFF + +/* + * fixed reserved inode number + */ +/* aggregate inode */ +#define AGGR_RESERVED_I 0 /* aggregate inode (reserved) */ +#define AGGREGATE_I 1 /* aggregate inode map inode */ +#define BMAP_I 2 /* aggregate block allocation map inode */ +#define LOG_I 3 /* aggregate inline log inode */ +#define BADBLOCK_I 4 /* aggregate bad block inode */ +#define FILESYSTEM_I 16 /* 1st/only fileset inode in ait: + * fileset inode map inode + */ + +/* per fileset inode */ +#define FILESET_RSVD_I 0 /* fileset inode (reserved) */ +#define FILESET_EXT_I 1 /* fileset inode extension */ +#define ROOT_I 2 /* fileset root inode */ +#define ACL_I 3 /* fileset ACL inode */ + +#define FILESET_OBJECT_I 4 /* the first fileset inode available for a file + * or directory or link... + */ +#define FIRST_FILESET_INO 16 /* the first aggregate inode which describes + * an inode. (To fsck this is also the first + * inode in part 2 of the agg inode table.) + */ + +/* + * directory configuration + */ +#define JFS_NAME_MAX 255 +#define JFS_PATH_MAX BPSIZE + + +/* + * file system state (superblock state) + */ +#define FM_CLEAN 0x00000000 /* file system is unmounted and clean */ +#define FM_MOUNT 0x00000001 /* file system is mounted cleanly */ +#define FM_DIRTY 0x00000002 /* file system was not unmounted and clean + * when mounted or + * commit failure occurred while being mounted: + * fsck() must be run to repair + */ +#define FM_LOGREDO 0x00000004 /* log based recovery (logredo()) failed: + * fsck() must be run to repair + */ +#define FM_EXTENDFS 0x00000008 /* file system extendfs() in progress */ + +#endif /* _H_JFS_FILSYS */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_imap.c linuxppc64_2_4/fs/jfs/jfs_imap.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_imap.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_imap.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,3236 @@ +/* + + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +/* + * Change History : + * + */ + +/* + * jfs_imap.c: inode allocation map manager + * + * Serialization: + * Each AG has a simple lock which is used to control the serialization of + * the AG level lists. This lock should be taken first whenever an AG + * level list will be modified or accessed. + * + * Each IAG is locked by obtaining the buffer for the IAG page. + * + * There is also a inode lock for the inode map inode. A read lock needs to + * be taken whenever an IAG is read from the map or the global level + * information is read. A write lock needs to be taken whenever the global + * level information is modified or an atomic operation needs to be used. + * + * If more than one IAG is read at one time, the read lock may not + * be given up until all of the IAG's are read. Otherwise, a deadlock + * may occur when trying to obtain the read lock while another thread + * holding the read lock is waiting on the IAG already being held. + * + * The control page of the inode map is read into memory by diMount(). + * Thereafter it should only be modified in memory and then it will be + * written out when the filesystem is unmounted by diUnmount(). + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * imap locks + */ +/* iag free list lock */ +#define IAGFREE_LOCK_INIT(imap) init_MUTEX(&imap->im_freelock) +#define IAGFREE_LOCK(imap) down(&imap->im_freelock) +#define IAGFREE_UNLOCK(imap) up(&imap->im_freelock) + +/* per ag iag list locks */ +#define AG_LOCK_INIT(imap,index) init_MUTEX(&(imap->im_aglock[index])) +#define AG_LOCK(imap,agno) down(&imap->im_aglock[agno]) +#define AG_UNLOCK(imap,agno) up(&imap->im_aglock[agno]) + +/* + * external references + */ +extern struct address_space_operations jfs_aops; + +/* + * forward references + */ +static int diAllocAG(imap_t *, int, boolean_t, struct inode *); +static int diAllocAny(imap_t *, int, boolean_t, struct inode *); +static int diAllocBit(imap_t *, iag_t *, int); +static int diAllocExt(imap_t *, int, struct inode *); +static int diAllocIno(imap_t *, int, struct inode *); +static int diFindFree(u32, int); +static int diNewExt(imap_t *, iag_t *, int); +static int diNewIAG(imap_t *, int *, int, metapage_t **); +static void duplicateIXtree(struct super_block *, s64, int, s64 *); + +static int diIAGRead(imap_t * imap, int, metapage_t **); +static int copy_from_dinode(dinode_t *, struct inode *); +static void copy_to_dinode(dinode_t *, struct inode *); + +/* + * debug code for double-checking inode map + */ +/* #define _JFS_DEBUG_IMAP 1 */ + +#ifdef _JFS_DEBUG_IMAP +#define DBG_DIINIT(imap) DBGdiInit(imap) +#define DBG_DIALLOC(imap, ino) DBGdiAlloc(imap, ino) +#define DBG_DIFREE(imap, ino) DBGdiFree(imap, ino) + +static void *DBGdiInit(imap_t * imap); +static void DBGdiAlloc(imap_t * imap, ino_t ino); +static void DBGdiFree(imap_t * imap, ino_t ino); +#else +#define DBG_DIINIT(imap) +#define DBG_DIALLOC(imap, ino) +#define DBG_DIFREE(imap, ino) +#endif /* _JFS_DEBUG_IMAP */ + +/* + * NAME: diMount() + * + * FUNCTION: initialize the incore inode map control structures for + * a fileset or aggregate init time. + * + * the inode map's control structure (dinomap_t) is + * brought in from disk and placed in virtual memory. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * ENOMEM - insufficient free virtual memory. + * EIO - i/o error. + */ +int diMount(struct inode *ipimap) +{ + imap_t *imap; + metapage_t *mp; + int index; + dinomap_t *dinom_le; + + /* + * allocate/initialize the in-memory inode map control structure + */ + /* allocate the in-memory inode map control structure. */ + imap = (imap_t *) kmalloc(sizeof(imap_t), GFP_KERNEL); + if (imap == NULL) { + jERROR(1, ("diMount: kmalloc returned NULL!\n")); + return (ENOMEM); + } + + /* read the on-disk inode map control structure. */ + + mp = read_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + kfree(imap); + return (EIO); + } + + /* copy the on-disk version to the in-memory version. */ + dinom_le = (dinomap_t *) mp->data; + imap->im_freeiag = le32_to_cpu(dinom_le->in_freeiag); + imap->im_nextiag = le32_to_cpu(dinom_le->in_nextiag); + atomic_set(&imap->im_numinos, le32_to_cpu(dinom_le->in_numinos)); + atomic_set(&imap->im_numfree, le32_to_cpu(dinom_le->in_numfree)); + imap->im_nbperiext = le32_to_cpu(dinom_le->in_nbperiext); + imap->im_l2nbperiext = le32_to_cpu(dinom_le->in_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + imap->im_agctl[index].inofree = + le32_to_cpu(dinom_le->in_agctl[index].inofree); + imap->im_agctl[index].extfree = + le32_to_cpu(dinom_le->in_agctl[index].extfree); + imap->im_agctl[index].numinos = + le32_to_cpu(dinom_le->in_agctl[index].numinos); + imap->im_agctl[index].numfree = + le32_to_cpu(dinom_le->in_agctl[index].numfree); + } + + /* release the buffer. */ + release_metapage(mp); + + /* + * allocate/initialize inode allocation map locks + */ + /* allocate and init iag free list lock */ + IAGFREE_LOCK_INIT(imap); + + /* allocate and init ag list locks */ + for (index = 0; index < MAXAG; index++) { + AG_LOCK_INIT(imap, index); + } + + /* bind the inode map inode and inode map control structure + * to each other. + */ + imap->im_ipimap = ipimap; + JFS_IP(ipimap)->i_imap = imap; + +// DBG_DIINIT(imap); + + return (0); +} + + +/* + * NAME: diUnmount() + * + * FUNCTION: write to disk the incore inode map control structures for + * a fileset or aggregate at unmount time. + * + * PARAMETERS: + * ipimap - pointer to inode map inode for the aggregate or fileset. + * + * RETURN VALUES: + * 0 - success + * ENOMEM - insufficient free virtual memory. + * EIO - i/o error. + */ +int diUnmount(struct inode *ipimap, int mounterror) +{ + imap_t *imap = JFS_IP(ipimap)->i_imap; + + /* + * update the on-disk inode map control structure + */ + + if (!(mounterror || isReadOnly(ipimap))) + diSync(ipimap); + + /* + * Invalidate the page cache buffers + */ + truncate_inode_pages(ipimap->i_mapping, 0); + + /* + * free in-memory control structure + */ + kfree(imap); + + return (0); +} + + +/* + * diSync() + */ +int diSync(struct inode *ipimap) +{ + dinomap_t *dinom_le; + imap_t *imp = JFS_IP(ipimap)->i_imap; + metapage_t *mp; + int index; + + /* + * write imap global conrol page + */ + /* read the on-disk inode map control structure */ + mp = get_metapage(ipimap, + IMAPBLKNO << JFS_SBI(ipimap->i_sb)->l2nbperpage, + PSIZE, 0); + if (mp == NULL) { + jERROR(1,("diSync: get_metapage failed!\n")); + return EIO; + } + + /* copy the in-memory version to the on-disk version */ + //memcpy(mp->data, &imp->im_imap,sizeof(dinomap_t)); + dinom_le = (dinomap_t *) mp->data; + dinom_le->in_freeiag = cpu_to_le32(imp->im_freeiag); + dinom_le->in_nextiag = cpu_to_le32(imp->im_nextiag); + dinom_le->in_numinos = cpu_to_le32(atomic_read(&imp->im_numinos)); + dinom_le->in_numfree = cpu_to_le32(atomic_read(&imp->im_numfree)); + dinom_le->in_nbperiext = cpu_to_le32(imp->im_nbperiext); + dinom_le->in_l2nbperiext = cpu_to_le32(imp->im_l2nbperiext); + for (index = 0; index < MAXAG; index++) { + dinom_le->in_agctl[index].inofree = + cpu_to_le32(imp->im_agctl[index].inofree); + dinom_le->in_agctl[index].extfree = + cpu_to_le32(imp->im_agctl[index].extfree); + dinom_le->in_agctl[index].numinos = + cpu_to_le32(imp->im_agctl[index].numinos); + dinom_le->in_agctl[index].numfree = + cpu_to_le32(imp->im_agctl[index].numfree); + } + + /* write out the control structure */ + write_metapage(mp); + + /* + * write out dirty pages of imap + */ + fsync_inode_data_buffers(ipimap); + + diWriteSpecial(ipimap); + + return (0); +} + + +/* + * NAME: diRead() + * + * FUNCTION: initialize an incore inode from disk. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * this routine handles incore inode initialization for + * both "special" and "regular" inodes. special inodes + * are those required early in the mount process and + * require special handling since much of the file system + * is not yet initialized. these "special" inodes are + * identified by a NULL inode map inode pointer and are + * actually initialized by a call to diReadSpecial(). + * + * for regular inodes, the iag describing the disk inode + * is read from disk to determine the inode extent address + * for the disk inode. with the inode extent address in + * hand, the page of the extent that contains the disk + * inode is read and the disk inode is copied to the + * incore inode. + * + * PARAMETERS: + * ip - pointer to incore inode to be initialized from disk. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + * ENOMEM - insufficient memory + * + */ +int diRead(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int iagno, ino, extno, rc; + struct inode *ipimap; + dinode_t *dp; + iag_t *iagp; + metapage_t *mp; + s64 blkno, agstart; + imap_t *imap; + int block_offset; + int inodes_left; + uint pageno; + int rel_inode; + + jFYI(1, ("diRead: ino = %ld\n", ip->i_ino)); + + ipimap = sbi->ipimap; + JFS_IP(ip)->ipimap = ipimap; + + /* determine the iag number for this inode (number) */ + iagno = INOTOIAG(ip->i_ino); + + /* read the iag */ + imap = JFS_IP(ipimap)->i_imap; + IREAD_LOCK(ipimap); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) { + jERROR(1, ("diRead: diIAGRead returned %d\n", rc)); + return (rc); + } + + iagp = (iag_t *) mp->data; + + /* determine inode extent that holds the disk inode */ + ino = ip->i_ino & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + + if ((lengthPXD(&iagp->inoext[extno]) != imap->im_nbperiext) || + (addressPXD(&iagp->inoext[extno]) == 0)) { + jERROR(1, ("diRead: Bad inoext: 0x%lx, 0x%lx\n", + (ulong) addressPXD(&iagp->inoext[extno]), + (ulong) lengthPXD(&iagp->inoext[extno]))); + release_metapage(mp); + updateSuper(ip->i_sb, FM_DIRTY); + return ESTALE; + } + + /* get disk block number of the page within the inode extent + * that holds the disk inode. + */ + blkno = INOPBLK(&iagp->inoext[extno], ino, sbi->l2nbperpage); + + /* get the ag for the iag */ + agstart = le64_to_cpu(iagp->agstart); + + release_metapage(mp); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + + /* read the page of disk inode */ + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (mp == 0) { + jERROR(1, ("diRead: read_metapage failed\n")); + return EIO; + } + + /* locate the the disk inode requested */ + dp = (dinode_t *) mp->data; + dp += rel_inode; + + if (ip->i_ino != le32_to_cpu(dp->di_number)) { + jERROR(1, ("diRead: i_ino != di_number\n")); + updateSuper(ip->i_sb, FM_DIRTY); + rc = EIO; + } else if (le32_to_cpu(dp->di_nlink) == 0) { + jERROR(1, + ("diRead: di_nlink is zero. ino=%ld\n", ip->i_ino)); + updateSuper(ip->i_sb, FM_DIRTY); + rc = ESTALE; + } else + /* copy the disk inode to the in-memory inode */ + rc = copy_from_dinode(dp, ip); + + release_metapage(mp); + + /* set the ag for the inode */ + JFS_IP(ip)->agno = BLKTOAG(agstart, sbi); + + return (rc); +} + + +/* + * NAME: diReadSpecial() + * + * FUNCTION: initialize a 'special' inode from disk. + * + * this routines handles aggregate level inodes. The + * inode cache cannot differentiate between the + * aggregate inodes and the filesystem inodes, so we + * handle these here. We don't actually use the aggregate + * inode map, since these inodes are at a fixed location + * and in some cases the aggregate inode map isn't initialized + * yet. + * + * PARAMETERS: + * sb - filesystem superblock + * inum - aggregate inode number + * + * RETURN VALUES: + * new inode - success + * NULL - i/o error. + */ +struct inode *diReadSpecial(struct super_block *sb, ino_t inum) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + uint address; + dinode_t *dp; + struct inode *ip; + metapage_t *mp; + int rc; + + ip = new_inode(sb); + if (ip == NULL) { + jERROR(1, + ("diReadSpecial: new_inode returned NULL!\n")); + return ip; + } + + rc = alloc_jfs_inode(ip); + if (rc) { + make_bad_inode(ip); + iput(ip); + return NULL; + } + + /* + * If ip->i_number >= 32 (INOSPEREXT), then read from secondary + * aggregate inode table. + */ + + if (inum >= INOSPEREXT) { + address = + addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + inum -= INOSPEREXT; + ASSERT(inum < INOSPEREXT); + JFS_IP(ip)->ipimap = sbi->ipaimap2; + } else { + address = AITBL_OFF >> L2PSIZE; + JFS_IP(ip)->ipimap = sbi->ipaimap; + } + ip->i_ino = inum; + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + jEVENT(0, + ("Reading aggregate inode %d from block %d\n", (uint) inum, + address)); + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + ip->i_sb = NULL; + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + return (NULL); + } + + /* get the pointer to the disk inode of interest */ + dp = (dinode_t *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + if ((copy_from_dinode(dp, ip)) != 0) { + /* handle bad return by returning NULL for ip */ + ip->i_sb = NULL; + ip->i_nlink = 1; /* Don't want iput() deleting it */ + iput(ip); + /* release the page */ + release_metapage(mp); + return (NULL); + + } + + ip->i_mapping->a_ops = &jfs_aops; + ip->i_mapping->gfp_mask = GFP_NOFS; + + if ((inum == FILESYSTEM_I) && (JFS_IP(ip)->ipimap == sbi->ipaimap)) { + sbi->gengen = le32_to_cpu(dp->di_gengen); + sbi->inostamp = le32_to_cpu(dp->di_inostamp); + } + + /* release the page */ + release_metapage(mp); + + return (ip); +} + +/* + * NAME: diWriteSpecial() + * + * FUNCTION: Write the special inode to disk + * + * PARAMETERS: + * ip - special inode + * + * RETURN VALUES: none + */ + +void diWriteSpecial(struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + uint address; + dinode_t *dp; + ino_t inum = ip->i_ino; + metapage_t *mp; + + /* + * If ip->i_number >= 32 (INOSPEREXT), then write to secondary + * aggregate inode table. + */ + + if (!(ip->i_state & I_DIRTY)) + return; + + ip->i_state &= ~I_DIRTY; + + if (inum >= INOSPEREXT) { + address = + addressPXD(&sbi->ait2) >> sbi->l2nbperpage; + inum -= INOSPEREXT; + ASSERT(inum < INOSPEREXT); + } else { + address = AITBL_OFF >> L2PSIZE; + } + + address += inum >> 3; /* 8 inodes per 4K page */ + + /* read the page of fixed disk inode (AIT) in raw mode */ + jEVENT(0, + ("Reading aggregate inode %d from block %d\n", (uint) inum, + address)); + mp = read_metapage(ip, address << sbi->l2nbperpage, PSIZE, 1); + if (mp == NULL) { + jERROR(1, + ("diWriteSpecial: failed to read aggregate inode extent!\n")); + return; + } + + /* get the pointer to the disk inode of interest */ + dp = (dinode_t *) (mp->data); + dp += inum % 8; /* 8 inodes per 4K page */ + + /* copy on-disk inode to in-memory inode */ + copy_to_dinode(dp, ip); + memcpy(&dp->di_xtroot, &JFS_IP(ip)->i_xtroot, 288); + + if (inum == FILESYSTEM_I) + dp->di_gengen = cpu_to_le32(sbi->gengen); + + /* write the page */ + write_metapage(mp); +} + +/* + * NAME: diFreeSpecial() + * + * FUNCTION: Free allocated space for special inode + */ +void diFreeSpecial(struct inode *ip) +{ + if (ip == NULL) { + jERROR(1, ("diFreeSpecial called with NULL ip!\n")); + return; + } + fsync_inode_data_buffers(ip); + truncate_inode_pages(ip->i_mapping, 0); + iput(ip); +} + + + +/* + * NAME: diWrite() + * + * FUNCTION: write the on-disk inode portion of the in-memory inode + * to its corresponding on-disk inode. + * + * on entry, the specifed incore inode should itself + * specify the disk inode number corresponding to the + * incore inode (i.e. i_number should be initialized). + * + * the inode contains the inode extent address for the disk + * inode. with the inode extent address in hand, the + * page of the extent that contains the disk inode is + * read and the disk inode portion of the incore inode + * is copied to the disk inode. + * + * PARAMETERS: + * tid - transacation id + * ip - pointer to incore inode to be written to the inode extent. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + */ +int diWrite(tid_t tid, struct inode *ip) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + s32 ino; + dinode_t *dp; + s64 blkno; + int block_offset; + int inodes_left; + metapage_t *mp; + uint pageno; + int rel_inode; + int dioffset; + struct inode *ipimap; + uint type; + lid_t lid; + tlock_t *ditlck, *tlck; + linelock_t *dilinelock, *ilinelock; + lv_t *lv; + int n; + + ipimap = jfs_ip->ipimap; + + ino = ip->i_ino & (INOSPERIAG - 1); + + assert(lengthPXD(&(jfs_ip->ixpxd)) == + JFS_IP(ipimap)->i_imap->im_nbperiext); + assert(addressPXD(&(jfs_ip->ixpxd))); + + /* + * read the page of disk inode containing the specified inode: + */ + /* compute the block address of the page */ + blkno = INOPBLK(&(jfs_ip->ixpxd), ino, sbi->l2nbperpage); + + rel_inode = (ino & (INOSPERPAGE - 1)); + pageno = blkno >> sbi->l2nbperpage; + + if ((block_offset = ((u32) blkno & (sbi->nbperpage - 1)))) { + /* + * OS/2 didn't always align inode extents on page boundaries + */ + inodes_left = + (sbi->nbperpage - block_offset) << sbi->l2niperblk; + + if (rel_inode < inodes_left) + rel_inode += block_offset << sbi->l2niperblk; + else { + pageno += 1; + rel_inode -= inodes_left; + } + } + /* read the page of disk inode */ + retry: + mp = read_metapage(ipimap, pageno << sbi->l2nbperpage, PSIZE, 1); + if (mp == 0) + return (EIO); + + /* get the pointer to the disk inode */ + dp = (dinode_t *) mp->data; + dp += rel_inode; + + dioffset = (ino & (INOSPERPAGE - 1)) << L2DISIZE; + + /* + * acquire transaction lock on the on-disk inode; + * N.B. tlock is acquired on ipimap not ip; + */ + if ((ditlck = + txLock(tid, ipimap, mp, tlckINODE | tlckENTRY)) == NULL) + goto retry; + dilinelock = (linelock_t *) & ditlck->lock; + + /* + * copy btree root from in-memory inode to on-disk inode + * + * (tlock is taken from inline B+-tree root in in-memory + * inode when the B+-tree root is updated, which is pointed + * by jfs_ip->blid as well as being on tx tlock list) + * + * further processing of btree root is based on the copy + * in in-memory inode, where txLog() will log from, and, + * for xtree root, txUpdateMap() will update map and reset + * XAD_NEW bit; + */ + + if (S_ISDIR(ip->i_mode) && (lid = jfs_ip->xtlid)) { + /* + * This is the special xtree inside the directory for storing + * the directory table + */ + xtpage_t *p, *xp; + xad_t *xad; + + jfs_ip->xtlid = 0; + tlck = lid_to_tlock(lid); + assert(tlck->type & tlckXTREE); + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (linelock_t *) & tlck->lock; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = (xtpage_t *) &dp->di_dirtable; + lv = (lv_t *) & ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + + if ((lid = jfs_ip->blid) == 0) + goto inlineData; + jfs_ip->blid = 0; + + tlck = lid_to_tlock(lid); + type = tlck->type; + tlck->type |= tlckBTROOT; + tlck->mp = mp; + ilinelock = (linelock_t *) & tlck->lock; + + /* + * regular file: 16 byte (XAD slot) granularity + */ + if (type & tlckXTREE) { + xtpage_t *p, *xp; + xad_t *xad; + + /* + * copy xtree root from inode to dinode: + */ + p = &jfs_ip->i_xtroot; + xp = &dp->di_xtroot; + lv = (lv_t *) & ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->xad[lv->offset], &p->xad[lv->offset], + lv->length << L2XTSLOTSIZE); + } + + /* reset on-disk (metadata page) xtree XAD_NEW bit */ + xad = &xp->xad[XTENTRYSTART]; + for (n = XTENTRYSTART; + n < le16_to_cpu(xp->header.nextindex); n++, xad++) + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + } + /* + * directory: 32 byte (directory entry slot) granularity + */ + else if (type & tlckDTREE) { + dtpage_t *p, *xp; + + /* + * copy dtree root from inode to dinode: + */ + p = (dtpage_t *) &jfs_ip->i_dtroot; + xp = (dtpage_t *) & dp->di_dtroot; + lv = (lv_t *) & ilinelock->lv; + for (n = 0; n < ilinelock->index; n++, lv++) { + memcpy(&xp->slot[lv->offset], &p->slot[lv->offset], + lv->length << L2DTSLOTSIZE); + } + } else { + jERROR(1, ("diWrite: UFO tlock\n")); + } + + inlineData: + /* + * copy inline symlink from in-memory inode to on-disk inode + */ + if (S_ISLNK(ip->i_mode) && ip->i_size < IDATASIZE) { + lv = (lv_t *) & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 2 * 128) >> L2INODESLOTSIZE; + lv->length = 2; + memcpy(&dp->di_fastsymlink, jfs_ip->i_inline, IDATASIZE); + dilinelock->index++; + } +#ifdef _STILL_TO_PORT + /* + * copy inline data from in-memory inode to on-disk inode: + * 128 byte slot granularity + */ + if (test_cflag(COMMIT_Inlineea, ip)) + lv = (lv_t *) & dilinelock->lv[dilinelock->index]; + lv->offset = (dioffset + 3 * 128) >> L2INODESLOTSIZE; + lv->length = 1; + memcpy(&dp->di_inlineea, &ip->i_inlineea, INODESLOTSIZE); + dilinelock->index++; + + clear_cflag(COMMIT_Inlineea, ip); + } +#endif /* _STILL_TO_PORT */ + + /* + * lock/copy inode base: 128 byte slot granularity + */ +// baseDinode: + lv = (lv_t *) & dilinelock->lv[dilinelock->index]; + lv->offset = dioffset >> L2INODESLOTSIZE; + copy_to_dinode(dp, ip); + if (test_and_clear_cflag(COMMIT_Dirtable, ip)) { + lv->length = 2; + memcpy(&dp->di_dirtable, &jfs_ip->i_dirtable, 96); + } else + lv->length = 1; + dilinelock->index++; + +#ifdef _JFS_FASTDASD + /* + * We aren't logging changes to the DASD used in directory inodes, + * but we need to write them to disk. If we don't unmount cleanly, + * mount will recalculate the DASD used. + */ + if (S_ISDIR(ip->i_mode) + && (ip->i_ipmnt->i_mntflag & JFS_DASD_ENABLED)) + bcopy(&ip->i_DASD, &dp->di_DASD, sizeof(dasd_t)); +#endif /* _JFS_FASTDASD */ + + /* release the buffer holding the updated on-disk inode. + * the buffer will be later written by commit processing. + */ + write_metapage(mp); + + return (rc); +} + + +/* + * NAME: diFree(ip) + * + * FUNCTION: free a specified inode from the inode working map + * for a fileset or aggregate. + * + * if the inode to be freed represents the first (only) + * free inode within the iag, the iag will be placed on + * the ag free inode list. + * + * freeing the inode will cause the inode extent to be + * freed if the inode is the only allocated inode within + * the extent. in this case all the disk resource backing + * up the inode extent will be freed. in addition, the iag + * will be placed on the ag extent free list if the extent + * is the first free extent in the iag. if freeing the + * extent also means that no free inodes will exist for + * the iag, the iag will also be removed from the ag free + * inode list. + * + * the iag describing the inode will be freed if the extent + * is to be freed and it is the only backed extent within + * the iag. in this case, the iag will be removed from the + * ag free extent list and ag free inode list and placed on + * the inode map's free iag list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held until all updates are complete. + * + * PARAMETERS: + * ip - inode to be freed. + * + * RETURN VALUES: + * 0 - success + * EIO - i/o error. + */ +int diFree(struct inode *ip) +{ + int rc; + ino_t inum = ip->i_ino; + iag_t *iagp, *aiagp, *biagp, *ciagp, *diagp; + metapage_t *mp, *amp, *bmp, *cmp, *dmp; + int iagno, ino, extno, bitno, sword, agno; + int back, fwd; + u32 bitmap, mask; + struct inode *ipimap = JFS_SBI(ip->i_sb)->ipimap; + imap_t *imap = JFS_IP(ipimap)->i_imap; + s64 xaddr; + s64 xlen; + pxd_t freepxd; + tid_t tid; + struct inode *iplist[3]; + tlock_t *tlck; + pxdlock_t *pxdlock; + + /* + * This is just to suppress compiler warnings. The same logic that + * references these variables is used to initialize them. + */ + aiagp = biagp = ciagp = diagp = NULL; + + /* get the iag number containing the inode. + */ + iagno = INOTOIAG(inum); + + /* make sure that the iag is contained within + * the map. + */ + //assert(iagno < imap->im_nextiag); + if (iagno >= imap->im_nextiag) { + jERROR(1, ("diFree: inum = %d, iagno = %d, nextiag = %d\n", + (uint) inum, iagno, imap->im_nextiag)); + dump_mem("imap", imap, 32); + updateSuper(ip->i_sb, FM_DIRTY); + return EIO; + } + + /* get the allocation group for this ino. + */ + agno = JFS_IP(ip)->agno; + + /* Lock the AG specific inode map information + */ + AG_LOCK(imap, agno); + + /* Obtain read lock in imap inode. Don't release it until we have + * read all of the IAG's that we are going to. + */ + IREAD_LOCK(ipimap); + + /* read the iag. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + iagp = (iag_t *) mp->data; + + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + + assert(le32_to_cpu(iagp->wmap[extno]) & mask); +#ifdef _STILL_TO_PORT + assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); +#endif /* _STILL_TO_PORT */ + assert(addressPXD(&iagp->inoext[extno])); + + /* compute the bitmap for the extent reflecting the freed inode. + */ + bitmap = le32_to_cpu(iagp->wmap[extno]) & ~mask; + + if (imap->im_agctl[agno].numfree > imap->im_agctl[agno].numinos) { + jERROR(1,("diFree: numfree > numinos\n")); + release_metapage(mp); + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + updateSuper(ip->i_sb, FM_DIRTY); + return EIO; + } + /* + * inode extent still has some inodes or below low water mark: + * keep the inode extent; + */ + if (bitmap || + imap->im_agctl[agno].numfree < 96 || + (imap->im_agctl[agno].numfree < 288 && + (((imap->im_agctl[agno].numfree * 100) / + imap->im_agctl[agno].numinos) <= 25))) { + /* if the iag currently has no free inodes (i.e., + * the inode being freed is the first free inode of iag), + * insert the iag at head of the inode free list for the ag. + */ + if (iagp->nfreeinos == 0) { + /* check if there are any iags on the ag inode + * free list. if so, read the first one so that + * we can link the current iag onto the list at + * the head. + */ + if ((fwd = imap->im_agctl[agno].inofree) >= 0) { + /* read the iag that currently is the head + * of the list. + */ + if ((rc = diIAGRead(imap, fwd, &))) { + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + release_metapage(mp); + return (rc); + } + aiagp = (iag_t *) amp->data; + + /* make current head point back to the iag. + */ + aiagp->inofreeback = cpu_to_le32(iagno); + + write_metapage(amp); + } + + /* iag points forward to current head and iag + * becomes the new head of the list. + */ + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = -1; + imap->im_agctl[agno].inofree = iagno; + } + IREAD_UNLOCK(ipimap); + + /* update the free inode summary map for the extent if + * freeing the inode means the extent will now have free + * inodes (i.e., the inode being freed is the first free + * inode of extent), + */ + if (iagp->wmap[extno] == ONES) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] &= + cpu_to_le32(~(HIGHORDER >> bitno)); + } + + /* update the bitmap. + */ + iagp->wmap[extno] = cpu_to_le32(bitmap); + DBG_DIFREE(imap, inum); + + /* update the free inode counts at the iag, ag and + * map level. + */ + iagp->nfreeinos = + cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1); + imap->im_agctl[agno].numfree += 1; + atomic_inc(&imap->im_numfree); + + /* release the AG inode map lock + */ + AG_UNLOCK(imap, agno); + + /* write the iag */ + write_metapage(mp); + + return (0); + } + + + /* + * inode extent has become free and above low water mark: + * free the inode extent; + */ + + /* + * prepare to update iag list(s) (careful update step 1) + */ + amp = bmp = cmp = dmp = NULL; + fwd = back = -1; + + /* check if the iag currently has no free extents. if so, + * it will be placed on the head of the ag extent free list. + */ + if (iagp->nfreeexts == 0) { + /* check if the ag extent free list has any iags. + * if so, read the iag at the head of the list now. + * this (head) iag will be updated later to reflect + * the addition of the current iag at the head of + * the list. + */ + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (iag_t *) amp->data; + } + } else { + /* iag has free extents. check if the addition of a free + * extent will cause all extents to be free within this + * iag. if so, the iag will be removed from the ag extent + * free list and placed on the inode map's free iag list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + /* in preparation for removing the iag from the + * ag extent free list, read the iags preceeding + * and following the iag on the ag extent free + * list. + */ + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (iag_t *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (iag_t *) bmp->data; + } + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent cause the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + int inofreeback = le32_to_cpu(iagp->inofreeback); + int inofreefwd = le32_to_cpu(iagp->inofreefwd); + + /* in preparation for removing the iag from the + * ag inode free list, read the iags preceeding + * and following the iag on the ag inode free + * list. before reading these iags, we must make + * sure that we already don't have them in hand + * from up above, since re-reading an iag (buffer) + * we are currently holding would cause a deadlock. + */ + if (inofreefwd >= 0) { + + if (inofreefwd == fwd) + ciagp = (iag_t *) amp->data; + else if (inofreefwd == back) + ciagp = (iag_t *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreefwd, &cmp))) + goto error_out; + assert(cmp != NULL); + ciagp = (iag_t *) cmp->data; + } + assert(ciagp != NULL); + } + + if (inofreeback >= 0) { + if (inofreeback == fwd) + diagp = (iag_t *) amp->data; + else if (inofreeback == back) + diagp = (iag_t *) bmp->data; + else { + if ((rc = + diIAGRead(imap, inofreeback, &dmp))) + goto error_out; + assert(dmp != NULL); + diagp = (iag_t *) dmp->data; + } + assert(diagp != NULL); + } + } + + IREAD_UNLOCK(ipimap); + + /* + * invalidate any page of the inode extent freed from buffer cache; + */ + freepxd = iagp->inoext[extno]; + xaddr = addressPXD(&iagp->inoext[extno]); + xlen = lengthPXD(&iagp->inoext[extno]); + invalidate_metapages(JFS_SBI(ip->i_sb)->direct_inode, xaddr, xlen); + + /* + * update iag list(s) (careful update step 2) + */ + /* add the iag to the ag extent free list if this is the + * first free extent for the iag. + */ + if (iagp->nfreeexts == 0) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = + cpu_to_le32(imap->im_agctl[agno].extfree); + iagp->extfreeback = -1; + imap->im_agctl[agno].extfree = iagno; + } else { + /* remove the iag from the ag extent list if all extents + * are now free and place it on the inode map iag free list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG - 1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = -1; + + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + } + + /* remove the iag from the ag inode free list if freeing + * this extent causes the iag to have no free inodes. + */ + if (iagp->nfreeinos == cpu_to_le32(INOSPEREXT - 1)) { + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) + ciagp->inofreeback = iagp->inofreeback; + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) + diagp->inofreefwd = iagp->inofreefwd; + else + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + + iagp->inofreefwd = iagp->inofreeback = -1; + } + + /* update the inode extent address and working map + * to reflect the free extent. + * the permanent map should have been updated already + * for the inode being freed. + */ + assert(iagp->pmap[extno] == 0); + iagp->wmap[extno] = 0; + DBG_DIFREE(imap, inum); + PXDlength(&iagp->inoext[extno], 0); + PXDaddress(&iagp->inoext[extno], 0); + + /* update the free extent and free inode summary maps + * to reflect the freed extent. + * the inode summary map is marked to indicate no inodes + * available for the freed extent. + */ + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + mask = HIGHORDER >> bitno; + iagp->inosmap[sword] |= cpu_to_le32(mask); + iagp->extsmap[sword] &= cpu_to_le32(~mask); + + /* update the number of free inodes and number of free extents + * for the iag. + */ + iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - + (INOSPEREXT - 1)); + iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1); + + /* update the number of free inodes and backed inodes + * at the ag and inode map level. + */ + imap->im_agctl[agno].numfree -= (INOSPEREXT - 1); + imap->im_agctl[agno].numinos -= INOSPEREXT; + atomic_sub(INOSPEREXT - 1, &imap->im_numfree); + atomic_sub(INOSPEREXT, &imap->im_numinos); + + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + if (dmp) + write_metapage(dmp); + + /* + * start transaction to update block allocation map + * for the inode extent freed; + * + * N.B. AG_LOCK is released and iag will be released below, and + * other thread may allocate inode from/reusing the ixad freed + * BUT with new/different backing inode extent from the extent + * to be freed by the transaction; + */ + tid = txBegin(ipimap->i_sb, COMMIT_FORCE); + + /* acquire tlock of the iag page of the freed ixad + * to force the page NOHOMEOK (even though no data is + * logged from the iag page) until NOREDOPAGE|FREEXTENT log + * for the free of the extent is committed; + * write FREEXTENT|NOREDOPAGE log record + * N.B. linelock is overlaid as freed extent descriptor; + */ + tlck = txLock(tid, ipimap, mp, tlckINODE | tlckFREE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + pxdlock->pxd = freepxd; + pxdlock->index = 1; + + write_metapage(mp); + + iplist[0] = ipimap; + + /* + * logredo needs the IAG number and IAG extent index in order + * to ensure that the IMap is consistent. The least disruptive + * way to pass these values through to the transaction manager + * is in the iplist array. + * + * It's not pretty, but it works. + */ + iplist[1] = (struct inode *) (size_t)iagno; + iplist[2] = (struct inode *) (size_t)extno; + + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); // D233382 + + txEnd(tid); + + /* unlock the AG inode map information */ + AG_UNLOCK(imap, agno); + + return (0); + + error_out: + IREAD_UNLOCK(ipimap); + + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + if (dmp) + release_metapage(dmp); + + AG_UNLOCK(imap, agno); + + release_metapage(mp); + + return (rc); +} + +/* + * There are several places in the diAlloc* routines where we initialize + * the inode. + */ +static inline void +diInitInode(struct inode *ip, int iagno, int ino, int extno, iag_t * iagp) +{ + struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + ip->i_ino = (iagno << L2INOSPERIAG) + ino; + DBG_DIALLOC(JFS_IP(ipimap)->i_imap, ip->i_ino); + jfs_ip->ixpxd = iagp->inoext[extno]; + jfs_ip->agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); +} + + +/* + * NAME: diAlloc(pip,dir,ip) + * + * FUNCTION: allocate a disk inode from the inode working map + * for a fileset or aggregate. + * + * PARAMETERS: + * pip - pointer to incore inode for the parent inode. + * dir - TRUE if the new disk inode is for a directory. + * ip - pointer to a new inode + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +int diAlloc(struct inode *pip, boolean_t dir, struct inode *ip) +{ + int rc, ino, iagno, addext, extno, bitno, sword; + int nwords, rem, i, agno; + u32 mask, inosmap, extsmap; + struct inode *ipimap; + metapage_t *mp; + ino_t inum; + iag_t *iagp; + imap_t *imap; + + /* get the pointers to the inode map inode and the + * corresponding imap control structure. + */ + ipimap = JFS_SBI(pip->i_sb)->ipimap; + imap = JFS_IP(ipimap)->i_imap; + JFS_IP(ip)->ipimap = ipimap; + JFS_IP(ip)->fileset = FILESYSTEM_I; + + /* for a directory, the allocation policy is to start + * at the ag level using the preferred ag. + */ + if (dir == TRUE) { + agno = dbNextAG(JFS_SBI(pip->i_sb)->ipbmap); + AG_LOCK(imap, agno); + goto tryag; + } + + /* for files, the policy starts off by trying to allocate from + * the same iag containing the parent disk inode: + * try to allocate the new disk inode close to the parent disk + * inode, using parent disk inode number + 1 as the allocation + * hint. (we use a left-to-right policy to attempt to avoid + * moving backward on the disk.) compute the hint within the + * file system and the iag. + */ + inum = pip->i_ino + 1; + ino = inum & (INOSPERIAG - 1); + + /* back off the the hint if it is outside of the iag */ + if (ino == 0) + inum = pip->i_ino; + + /* get the ag number of this iag */ + agno = JFS_IP(pip)->agno; + + /* lock the AG inode map information */ + AG_LOCK(imap, agno); + + /* Get read lock on imap inode */ + IREAD_LOCK(ipimap); + + /* get the iag number and read the iag */ + iagno = INOTOIAG(inum); + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + return (rc); + } + iagp = (iag_t *) mp->data; + + /* determine if new inode extent is allowed to be added to the iag. + * new inode extent can be added to the iag if the ag + * has less than 32 free disk inodes and the iag has free extents. + */ + addext = (imap->im_agctl[agno].numfree < 32 && iagp->nfreeexts); + + /* + * try to allocate from the IAG + */ + /* check if the inode may be allocated from the iag + * (i.e. the inode has free inodes or new extent can be added). + */ + if (iagp->nfreeinos || addext) { + /* determine the extent number of the hint. + */ + extno = ino >> L2INOSPEREXT; + + /* check if the extent containing the hint has backed + * inodes. if so, try to allocate within this extent. + */ + if (addressPXD(&iagp->inoext[extno])) { + bitno = ino & (INOSPEREXT - 1); + if ((bitno = + diFindFree(le32_to_cpu(iagp->wmap[extno]), + bitno)) + < INOSPEREXT) { + ino = (extno << L2INOSPEREXT) + bitno; + + /* a free inode (bit) was found within this + * extent, so allocate it. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + } + + if (!addext) + extno = + (extno == + EXTSPERIAG - 1) ? 0 : extno + 1; + } + + /* + * no free inodes within the extent containing the hint. + * + * try to allocate from the backed extents following + * hint or, if appropriate (i.e. addext is true), allocate + * an extent of free inodes at or following the extent + * containing the hint. + * + * the free inode and free extent summary maps are used + * here, so determine the starting summary map position + * and the number of words we'll have to examine. again, + * the approach is to allocate following the hint, so we + * might have to initially ignore prior bits of the summary + * map that represent extents prior to the extent containing + * the hint and later revisit these bits. + */ + bitno = extno & (EXTSPERSUM - 1); + nwords = (bitno == 0) ? SMAPSZ : SMAPSZ + 1; + sword = extno >> L2EXTSPERSUM; + + /* mask any prior bits for the starting words of the + * summary map. + */ + mask = ONES << (EXTSPERSUM - bitno); + inosmap = le32_to_cpu(iagp->inosmap[sword]) | mask; + extsmap = le32_to_cpu(iagp->extsmap[sword]) | mask; + + /* scan the free inode and free extent summary maps for + * free resources. + */ + for (i = 0; i < nwords; i++) { + /* check if this word of the free inode summary + * map describes an extent with free inodes. + */ + if (~inosmap) { + /* an extent with free inodes has been + * found. determine the extent number + * and the inode number within the extent. + */ + rem = diFindFree(inosmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + rem = + diFindFree(le32_to_cpu + (iagp->wmap[extno]), 0); + assert(rem < INOSPEREXT); + + /* determine the inode number within the + * iag and allocate the inode from the + * map. + */ + ino = (extno << L2INOSPEREXT) + rem; + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(ipimap); + if (rc) { + assert(rc == EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, ino, extno, + iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + + /* free the AG lock and return. + */ + AG_UNLOCK(imap, agno); + return (rc); + + } + + /* check if we may allocate an extent of free + * inodes and whether this word of the free + * extents summary map describes a free extent. + */ + if (addext && ~extsmap) { + /* a free extent has been found. determine + * the extent number. + */ + rem = diFindFree(extsmap, 0); + extno = (sword << L2EXTSPERSUM) + rem; + + /* allocate an extent of free inodes. + */ + if ((rc = diNewExt(imap, iagp, extno))) { + /* if there is no disk space for a + * new extent, try to allocate the + * disk inode from somewhere else. + */ + if (rc == ENOSPC) + break; + + assert(rc == EIO); + } else { + /* set the results of the allocation + * and write the iag. + */ + diInitInode(ip, iagno, + extno << L2INOSPEREXT, + extno, iagp); + mark_metapage_dirty(mp); + } + release_metapage(mp); + /* free the imap inode & the AG lock & return. + */ + IREAD_UNLOCK(ipimap); + AG_UNLOCK(imap, agno); + return (rc); + } + + /* move on to the next set of summary map words. + */ + sword = (sword == SMAPSZ - 1) ? 0 : sword + 1; + inosmap = le32_to_cpu(iagp->inosmap[sword]); + extsmap = le32_to_cpu(iagp->extsmap[sword]); + } + } + /* unlock imap inode */ + IREAD_UNLOCK(ipimap); + + /* nothing doing in this iag, so release it. */ + release_metapage(mp); + + tryag: + /* + * try to allocate anywhere within the same AG as the parent inode. + */ + rc = diAllocAG(imap, agno, dir, ip); + + AG_UNLOCK(imap, agno); + + if (rc != ENOSPC) + return (rc); + + /* + * try to allocate in any AG. + */ + return (diAllocAny(imap, agno, dir, ip)); +} + + +/* + * NAME: diAllocAG(imap,agno,dir,ip) + * + * FUNCTION: allocate a disk inode from the allocation group. + * + * this routine first determines if a new extent of free + * inodes should be added for the allocation group, with + * the current request satisfied from this extent. if this + * is the case, an attempt will be made to do just that. if + * this attempt fails or it has been determined that a new + * extent should not be added, an attempt is made to satisfy + * the request by allocating an existing (backed) free inode + * from the allocation group. + * + * PRE CONDITION: Already have the AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group to allocate from. + * dir - TRUE if the new disk inode is for a directory. + * ip - pointer to the new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int +diAllocAG(imap_t * imap, int agno, boolean_t dir, struct inode *ip) +{ + int rc, addext, numfree, numinos; + + /* get the number of free and the number of backed disk + * inodes currently within the ag. + */ + numfree = imap->im_agctl[agno].numfree; + numinos = imap->im_agctl[agno].numinos; + + if (numfree > numinos) { + jERROR(1,("diAllocAG: numfree > numinos\n")); + updateSuper(ip->i_sb, FM_DIRTY); + return EIO; + } + + /* determine if we should allocate a new extent of free inodes + * within the ag: for directory inodes, add a new extent + * if there are a small number of free inodes or number of free + * inodes is a small percentage of the number of backed inodes. + */ + if (dir == TRUE) + addext = (numfree < 64 || + (numfree < 256 + && ((numfree * 100) / numinos) <= 20)); + else + addext = (numfree == 0); + + /* + * try to allocate a new extent of free inodes. + */ + if (addext) { + /* if free space is not avaliable for this new extent, try + * below to allocate a free and existing (already backed) + * inode from the ag. + */ + if ((rc = diAllocExt(imap, agno, ip)) != ENOSPC) + return (rc); + } + + /* + * try to allocate an existing free inode from the ag. + */ + return (diAllocIno(imap, agno, ip)); +} + + +/* + * NAME: diAllocAny(imap,agno,dir,iap) + * + * FUNCTION: allocate a disk inode from any other allocation group. + * + * this routine is called when an allocation attempt within + * the primary allocation group has failed. if attempts to + * allocate an inode from any allocation group other than the + * specified primary group. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - primary allocation group (to avoid). + * dir - TRUE if the new disk inode is for a directory. + * ip - pointer to a new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int +diAllocAny(imap_t * imap, int agno, boolean_t dir, struct inode *ip) +{ + int ag, rc; + int maxag = JFS_SBI(imap->im_ipimap->i_sb)->bmap->db_maxag; + + + /* try to allocate from the ags following agno up to + * the maximum ag number. + */ + for (ag = agno + 1; ag <= maxag; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != ENOSPC) + return (rc); + } + + /* try to allocate from the ags in front of agno. + */ + for (ag = 0; ag < agno; ag++) { + AG_LOCK(imap, ag); + + rc = diAllocAG(imap, ag, dir, ip); + + AG_UNLOCK(imap, ag); + + if (rc != ENOSPC) + return (rc); + } + + /* no free disk inodes. + */ + return (ENOSPC); +} + + +/* + * NAME: diAllocIno(imap,agno,ip) + * + * FUNCTION: allocate a disk inode from the allocation group's free + * inode list, returning an error if this free list is + * empty (i.e. no iags on the list). + * + * allocation occurs from the first iag on the list using + * the iag's free inode summary map to find the leftmost + * free inode in the iag. + * + * PRE CONDITION: Already have AG lock for this AG. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int diAllocIno(imap_t * imap, int agno, struct inode *ip) +{ + int iagno, ino, rc, rem, extno, sword; + metapage_t *mp; + iag_t *iagp; + + /* check if there are iags on the ag's free inode list. + */ + if ((iagno = imap->im_agctl[agno].inofree) < 0) + return (ENOSPC); + + /* obtain read lock on imap inode */ + IREAD_LOCK(imap->im_ipimap); + + /* read the iag at the head of the list. + */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(imap->im_ipimap); + return (rc); + } + iagp = (iag_t *) mp->data; + + /* better be free inodes in this iag if it is on the + * list. + */ + //assert(iagp->nfreeinos); + if (!iagp->nfreeinos) { + jERROR(1, + ("diAllocIno: nfreeinos = 0, but iag on freelist\n")); + jERROR(1, (" agno = %d, iagno = %d\n", agno, iagno)); + dump_mem("iag", iagp, 64); + updateSuper(ip->i_sb, FM_DIRTY); + return EIO; + } + + /* scan the free inode summary map to find an extent + * with free inodes. + */ + for (sword = 0;; sword++) { + assert(sword < SMAPSZ); + + if (~iagp->inosmap[sword]) + break; + } + + /* found a extent with free inodes. determine + * the extent number. + */ + rem = diFindFree(le32_to_cpu(iagp->inosmap[sword]), 0); + assert(rem < EXTSPERSUM); + extno = (sword << L2EXTSPERSUM) + rem; + + /* find the first free inode in the extent. + */ + rem = diFindFree(le32_to_cpu(iagp->wmap[extno]), 0); + assert(rem < INOSPEREXT); + + /* compute the inode number within the iag. + */ + ino = (extno << L2INOSPEREXT) + rem; + + /* allocate the inode. + */ + rc = diAllocBit(imap, iagp, ino); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + release_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, ino, extno, iagp); + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocExt(imap,agno,ip) + * + * FUNCTION: add a new extent of free inodes to an iag, allocating + * an inode from this extent to satisfy the current allocation + * request. + * + * this routine first tries to find an existing iag with free + * extents through the ag free extent list. if list is not + * empty, the head of the list will be selected as the home + * of the new extent of free inodes. otherwise (the list is + * empty), a new iag will be allocated for the ag to contain + * the extent. + * + * once an iag has been selected, the free extent summary map + * is used to locate a free extent within the iag and diNewExt() + * is called to initialize the extent, with initialization + * including the allocation of the first inode of the extent + * for the purpose of satisfying this request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * agno - allocation group number. + * ip - pointer to new inode to be filled in on successful return + * with the disk inode number allocated, its extent address + * and the start of the ag. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int diAllocExt(imap_t * imap, int agno, struct inode *ip) +{ + int rem, iagno, sword, extno, rc; + metapage_t *mp; + iag_t *iagp; + + /* check if the ag has any iags with free extents. if not, + * allocate a new iag for the ag. + */ + if ((iagno = imap->im_agctl[agno].extfree) < 0) { + /* If successful, diNewIAG will obtain the read lock on the + * imap inode. + */ + if ((rc = diNewIAG(imap, &iagno, agno, &mp))) { + return (rc); + } + iagp = (iag_t *) mp->data; + + /* set the ag number if this a brand new iag + */ + iagp->agstart = + cpu_to_le64(AGTOBLK(agno, imap->im_ipimap)); + } else { + /* read the iag. + */ + IREAD_LOCK(imap->im_ipimap); + if ((rc = diIAGRead(imap, iagno, &mp))) { + assert(0); + } + iagp = (iag_t *) mp->data; + } + + /* using the free extent summary map, find a free extent. + */ + for (sword = 0;; sword++) { + assert(sword < SMAPSZ); + if (~iagp->extsmap[sword]) + break; + } + + /* determine the extent number of the free extent. + */ + rem = diFindFree(le32_to_cpu(iagp->extsmap[sword]), 0); + assert(rem < EXTSPERSUM); + extno = (sword << L2EXTSPERSUM) + rem; + + /* initialize the new extent. + */ + rc = diNewExt(imap, iagp, extno); + IREAD_UNLOCK(imap->im_ipimap); + if (rc) { + /* something bad happened. if a new iag was allocated, + * place it back on the inode map's iag free list, and + * clear the ag number information. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + IAGFREE_LOCK(imap); + iagp->iagfree = cpu_to_le32(imap->im_freeiag); + imap->im_freeiag = iagno; + IAGFREE_UNLOCK(imap); + } + write_metapage(mp); + return (rc); + } + + /* set the results of the allocation and write the iag. + */ + diInitInode(ip, iagno, extno << L2INOSPEREXT, extno, iagp); + + write_metapage(mp); + + return (0); +} + + +/* + * NAME: diAllocBit(imap,iagp,ino) + * + * FUNCTION: allocate a backed inode from an iag. + * + * this routine performs the mechanics of allocating a + * specified inode from a backed extent. + * + * if the inode to be allocated represents the last free + * inode within the iag, the iag will be removed from the + * ag free inode list. + * + * a careful update approach is used to provide consistency + * in the face of updates to multiple buffers. under this + * approach, all required buffers are obtained before making + * any updates and are held all are updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * ino - inode number to be allocated within the iag. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int diAllocBit(imap_t * imap, iag_t * iagp, int ino) +{ + int extno, bitno, agno, sword, rc; + metapage_t *amp, *bmp; + iag_t *aiagp = 0, *biagp = 0; + u32 mask; + + /* check if this is the last free inode within the iag. + * if so, it will have to be removed from the ag free + * inode list, so get the iags preceeding and following + * it on the list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + amp = bmp = NULL; + + if ((int) le32_to_cpu(iagp->inofreefwd) >= 0) { + if ((rc = + diIAGRead(imap, le32_to_cpu(iagp->inofreefwd), + &))) + return (rc); + aiagp = (iag_t *) amp->data; + } + + if ((int) le32_to_cpu(iagp->inofreeback) >= 0) { + if ((rc = + diIAGRead(imap, + le32_to_cpu(iagp->inofreeback), + &bmp))) { + if (amp) + release_metapage(amp); + return (rc); + } + biagp = (iag_t *) bmp->data; + } + } + + /* get the ag number, extent number, inode number within + * the extent. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), JFS_SBI(imap->im_ipimap->i_sb)); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + + /* compute the mask for setting the map. + */ + mask = HIGHORDER >> bitno; + + /* the inode should be free and backed. + */ + assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); + assert((le32_to_cpu(iagp->wmap[extno]) & mask) == 0); + assert(addressPXD(&iagp->inoext[extno]) != 0); + + /* mark the inode as allocated in the working map. + */ + iagp->wmap[extno] |= cpu_to_le32(mask); + + /* check if all inodes within the extent are now + * allocated. if so, update the free inode summary + * map to reflect this. + */ + if (iagp->wmap[extno] == ONES) { + sword = extno >> L2EXTSPERSUM; + bitno = extno & (EXTSPERSUM - 1); + iagp->inosmap[sword] |= cpu_to_le32(HIGHORDER >> bitno); + } + + /* if this was the last free inode in the iag, remove the + * iag from the ag free inode list. + */ + if (iagp->nfreeinos == cpu_to_le32(1)) { + if (amp) { + aiagp->inofreeback = iagp->inofreeback; + write_metapage(amp); + } + + if (bmp) { + biagp->inofreefwd = iagp->inofreefwd; + write_metapage(bmp); + } else { + imap->im_agctl[agno].inofree = + le32_to_cpu(iagp->inofreefwd); + } + iagp->inofreefwd = iagp->inofreeback = -1; + } + + /* update the free inode count at the iag, ag, inode + * map levels. + */ + iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); + imap->im_agctl[agno].numfree -= 1; + atomic_dec(&imap->im_numfree); + + return (0); +} + + +/* + * NAME: diNewExt(imap,iagp,extno) + * + * FUNCTION: initialize a new extent of inodes for an iag, allocating + * the first inode of the extent for use for the current + * allocation request. + * + * disk resources are allocated for the new extent of inodes + * and the inodes themselves are initialized to reflect their + * existence within the extent (i.e. their inode numbers and + * inode extent addresses are set) and their initial state + * (mode and link count are set to zero). + * + * if the iag is new, it is not yet on an ag extent free list + * but will now be placed on this list. + * + * if the allocation of the new extent causes the iag to + * have no free extent, the iag will be removed from the + * ag extent free list. + * + * if the iag has no free backed inodes, it will be placed + * on the ag free inode list, since the addition of the new + * extent will now cause it to have free inodes. + * + * a careful update approach is used to provide consistency + * (i.e. list consistency) in the face of updates to multiple + * buffers. under this approach, all required buffers are + * obtained before making any updates and are held until all + * updates are complete. + * + * PRE CONDITION: Already have buffer lock on iagp. Already have AG lock on + * this AG. Must have read lock on imap inode. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagp - pointer to iag. + * extno - extent number. + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + */ +static int diNewExt(imap_t * imap, iag_t * iagp, int extno) +{ + int agno, iagno, fwd, back, freei = 0, sword, rc; + iag_t *aiagp = 0, *biagp = 0, *ciagp = 0; + metapage_t *amp, *bmp, *cmp, *dmp; + struct inode *ipimap; + s64 blkno, hint; + int i, j; + u32 mask; + ino_t ino; + dinode_t *dp; + struct jfs_sb_info *sbi; + + /* better have free extents. + */ + assert(iagp->nfreeexts); + + /* get the inode map inode. + */ + ipimap = imap->im_ipimap; + sbi = JFS_SBI(ipimap->i_sb); + + amp = bmp = cmp = NULL; + + /* get the ag and iag numbers for this iag. + */ + agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi); + iagno = le32_to_cpu(iagp->iagnum); + + /* check if this is the last free extent within the + * iag. if so, the iag must be removed from the ag + * free extent list, so get the iags preceeding and + * following the iag on this list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if ((fwd = le32_to_cpu(iagp->extfreefwd)) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + return (rc); + aiagp = (iag_t *) amp->data; + } + + if ((back = le32_to_cpu(iagp->extfreeback)) >= 0) { + if ((rc = diIAGRead(imap, back, &bmp))) + goto error_out; + biagp = (iag_t *) bmp->data; + } + } else { + /* the iag has free extents. if all extents are free + * (as is the case for a newly allocated iag), the iag + * must be added to the ag free extent list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. + */ + fwd = back = -1; + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if ((fwd = imap->im_agctl[agno].extfree) >= 0) { + if ((rc = diIAGRead(imap, fwd, &))) + goto error_out; + aiagp = (iag_t *) amp->data; + } + } + } + + /* check if the iag has no free inodes. if so, the iag + * will have to be added to the ag free inode list, so get + * the iag at the head of the list in preparation for + * adding this iag to this list. in doing this, we must + * check if we already have the iag at the head of + * the list in hand. + */ + if (iagp->nfreeinos == 0) { + freei = imap->im_agctl[agno].inofree; + + if (freei >= 0) { + if (freei == fwd) { + ciagp = aiagp; + } else if (freei == back) { + ciagp = biagp; + } else { + if ((rc = diIAGRead(imap, freei, &cmp))) + goto error_out; + ciagp = (iag_t *) cmp->data; + } + assert(ciagp != NULL); + } + } + + /* allocate disk space for the inode extent. + */ + if ((extno == 0) || (addressPXD(&iagp->inoext[extno - 1]) == 0)) + hint = ((s64) agno << sbi->bmap->db_agl2size) - 1; + else + hint = addressPXD(&iagp->inoext[extno - 1]) + + lengthPXD(&iagp->inoext[extno - 1]) - 1; + + if ((rc = dbAlloc(ipimap, hint, (s64) imap->im_nbperiext, &blkno))) + goto error_out; + + /* compute the inode number of the first inode within the + * extent. + */ + ino = (iagno << L2INOSPERIAG) + (extno << L2INOSPEREXT); + + /* initialize the inodes within the newly allocated extent a + * page at a time. + */ + for (i = 0; i < imap->im_nbperiext; i += sbi->nbperpage) { + /* get a buffer for this page of disk inodes. + */ + dmp = get_metapage(ipimap, blkno + i, PSIZE, 1); + if (dmp == NULL) { + rc = EIO; + goto error_out; + } + dp = (dinode_t *) dmp->data; + + /* initialize the inode number, mode, link count and + * inode extent address. + */ + for (j = 0; j < INOSPERPAGE; j++, dp++, ino++) { + dp->di_inostamp = cpu_to_le32(sbi->inostamp); + dp->di_number = cpu_to_le32(ino); + dp->di_fileset = cpu_to_le32(FILESYSTEM_I); + dp->di_mode = 0; + dp->di_nlink = 0; + PXDaddress(&(dp->di_ixpxd), blkno); + PXDlength(&(dp->di_ixpxd), imap->im_nbperiext); + } + write_metapage(dmp); + } + + /* if this is the last free extent within the iag, remove the + * iag from the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(1)) { + if (fwd >= 0) + aiagp->extfreeback = iagp->extfreeback; + + if (back >= 0) + biagp->extfreefwd = iagp->extfreefwd; + else + imap->im_agctl[agno].extfree = + le32_to_cpu(iagp->extfreefwd); + + iagp->extfreefwd = iagp->extfreeback = -1; + } else { + /* if the iag has all free extents (newly allocated iag), + * add the iag to the ag free extent list. + */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + if (fwd >= 0) + aiagp->extfreeback = cpu_to_le32(iagno); + + iagp->extfreefwd = cpu_to_le32(fwd); + iagp->extfreeback = -1; + imap->im_agctl[agno].extfree = iagno; + } + } + + /* if the iag has no free inodes, add the iag to the + * ag free inode list. + */ + if (iagp->nfreeinos == 0) { + if (freei >= 0) + ciagp->inofreeback = cpu_to_le32(iagno); + + iagp->inofreefwd = + cpu_to_le32(imap->im_agctl[agno].inofree); + iagp->inofreeback = -1; + imap->im_agctl[agno].inofree = iagno; + } + + /* initialize the extent descriptor of the extent. */ + PXDlength(&iagp->inoext[extno], imap->im_nbperiext); + PXDaddress(&iagp->inoext[extno], blkno); + + /* initialize the working and persistent map of the extent. + * the working map will be initialized such that + * it indicates the first inode of the extent is allocated. + */ + iagp->wmap[extno] = cpu_to_le32(HIGHORDER); + iagp->pmap[extno] = 0; + + /* update the free inode and free extent summary maps + * for the extent to indicate the extent has free inodes + * and no longer represents a free extent. + */ + sword = extno >> L2EXTSPERSUM; + mask = HIGHORDER >> (extno & (EXTSPERSUM - 1)); + iagp->extsmap[sword] |= cpu_to_le32(mask); + iagp->inosmap[sword] &= cpu_to_le32(~mask); + + /* update the free inode and free extent counts for the + * iag. + */ + iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + + (INOSPEREXT - 1)); + iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1); + + /* update the free and backed inode counts for the ag. + */ + imap->im_agctl[agno].numfree += (INOSPEREXT - 1); + imap->im_agctl[agno].numinos += INOSPEREXT; + + /* update the free and backed inode counts for the inode map. + */ + atomic_add(INOSPEREXT - 1, &imap->im_numfree); + atomic_add(INOSPEREXT, &imap->im_numinos); + + /* write the iags. + */ + if (amp) + write_metapage(amp); + if (bmp) + write_metapage(bmp); + if (cmp) + write_metapage(cmp); + + return (0); + + error_out: + + /* release the iags. + */ + if (amp) + release_metapage(amp); + if (bmp) + release_metapage(bmp); + if (cmp) + release_metapage(cmp); + + return (rc); +} + + +/* + * NAME: diNewIAG(imap,iagnop,agno) + * + * FUNCTION: allocate a new iag for an allocation group. + * + * first tries to allocate the iag from the inode map + * iagfree list: + * if the list has free iags, the head of the list is removed + * and returned to satisfy the request. + * if the inode map's iag free list is empty, the inode map + * is extended to hold a new iag. this new iag is initialized + * and returned to satisfy the request. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagnop - pointer to an iag number set with the number of the + * newly allocated iag upon successful return. + * agno - allocation group number. + * bpp - Buffer pointer to be filled in with new IAG's buffer + * + * RETURN VALUES: + * 0 - success. + * ENOSPC - insufficient disk resources. + * EIO - i/o error. + * + * serialization: + * AG lock held on entry/exit; + * write lock on the map is held inside; + * read lock on the map is held on successful completion; + * + * note: new iag transaction: + * . synchronously write iag; + * . write log of xtree and inode of imap; + * . commit; + * . synchronous write of xtree (right to left, bottom to top); + * . at start of logredo(): init in-memory imap with one additional iag page; + * . at end of logredo(): re-read imap inode to determine + * new imap size; + */ +static int +diNewIAG(imap_t * imap, int *iagnop, int agno, metapage_t ** mpp) +{ + int rc; + int iagno, i, xlen; + struct inode *ipimap; + struct super_block *sb; + struct jfs_sb_info *sbi; + metapage_t *mp; + iag_t *iagp; + s64 xaddr = 0; + s64 blkno; + tid_t tid; +#ifdef _STILL_TO_PORT + xad_t xad; +#endif /* _STILL_TO_PORT */ + struct inode *iplist[1]; + + /* pick up pointers to the inode map and mount inodes */ + ipimap = imap->im_ipimap; + sb = ipimap->i_sb; + sbi = JFS_SBI(sb); + + /* acquire the free iag lock */ + IAGFREE_LOCK(imap); + + /* if there are any iags on the inode map free iag list, + * allocate the iag from the head of the list. + */ + if (imap->im_freeiag >= 0) { + /* pick up the iag number at the head of the list */ + iagno = imap->im_freeiag; + + /* determine the logical block number of the iag */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + } else { + /* no free iags. the inode map will have to be extented + * to include a new iag. + */ + + /* acquire inode map lock */ + IWRITE_LOCK(ipimap); + + assert(ipimap->i_size >> L2PSIZE == imap->im_nextiag + 1); + + /* get the next avaliable iag number */ + iagno = imap->im_nextiag; + + /* make sure that we have not exceeded the maximum inode + * number limit. + */ + if (iagno > (MAXIAGS - 1)) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = ENOSPC; + goto out; + } + + /* + * synchronously append new iag page. + */ + /* determine the logical address of iag page to append */ + blkno = IAGTOLBLK(iagno, sbi->l2nbperpage); + + /* Allocate extent for new iag page */ + xlen = sbi->nbperpage; + if ((rc = dbAlloc(ipimap, 0, (s64) xlen, &xaddr))) { + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* assign a buffer for the page */ + mp = get_metapage(ipimap, xaddr, PSIZE, 1); + //bp = bmAssign(ipimap, blkno, xaddr, PSIZE, bmREAD_PAGE); + if (!mp) { + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = EIO; + goto out; + } + iagp = (iag_t *) mp->data; + + /* init the iag */ + memset(iagp, 0, sizeof(iag_t)); + iagp->iagnum = cpu_to_le32(iagno); + iagp->inofreefwd = iagp->inofreeback = -1; + iagp->extfreefwd = iagp->extfreeback = -1; + iagp->iagfree = -1; + iagp->nfreeinos = 0; + iagp->nfreeexts = cpu_to_le32(EXTSPERIAG); + + /* initialize the free inode summary map (free extent + * summary map initialization handled by bzero). + */ + for (i = 0; i < SMAPSZ; i++) + iagp->inosmap[i] = ONES; + + flush_metapage(mp); +#ifdef _STILL_TO_PORT + /* synchronously write the iag page */ + if (bmWrite(bp)) { + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + rc = EIO; + goto out; + } + + /* Now the iag is on disk */ + + /* + * start tyransaction of update of the inode map + * addressing structure pointing to the new iag page; + */ +#endif /* _STILL_TO_PORT */ + tid = txBegin(sb, COMMIT_FORCE); + + /* update the inode map addressing structure to point to it */ + if ((rc = + xtInsert(tid, ipimap, 0, blkno, xlen, &xaddr, 0))) { + /* Free the blocks allocated for the iag since it was + * not successfully added to the inode map + */ + dbFree(ipimap, xaddr, (s64) xlen); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + + goto out; + } + + /* update the inode map's inode to reflect the extension */ + ipimap->i_size += PSIZE; + ipimap->i_blocks += LBLK2PBLK(sb, xlen); + + /* + * txCommit(COMMIT_FORCE) will synchronously write address + * index pages and inode after commit in careful update order + * of address index pages (right to left, bottom up); + */ + iplist[0] = ipimap; + rc = txCommit(tid, 1, &iplist[0], COMMIT_FORCE); + + txEnd(tid); + + duplicateIXtree(sb, blkno, xlen, &xaddr); + + /* update the next avaliable iag number */ + imap->im_nextiag += 1; + + /* Add the iag to the iag free list so we don't lose the iag + * if a failure happens now. + */ + imap->im_freeiag = iagno; + + /* Until we have logredo working, we want the imap inode & + * control page to be up to date. + */ + diSync(ipimap); + + /* release the inode map lock */ + IWRITE_UNLOCK(ipimap); + } + + /* obtain read lock on map */ + IREAD_LOCK(ipimap); + + /* read the iag */ + if ((rc = diIAGRead(imap, iagno, &mp))) { + IREAD_UNLOCK(ipimap); + rc = EIO; + goto out; + } + iagp = (iag_t *) mp->data; + + /* remove the iag from the iag free list */ + imap->im_freeiag = le32_to_cpu(iagp->iagfree); + iagp->iagfree = -1; + + /* set the return iag number and buffer pointer */ + *iagnop = iagno; + *mpp = mp; + + out: + /* release the iag free lock */ + IAGFREE_UNLOCK(imap); + + return (rc); +} + +/* + * NAME: diIAGRead() + * + * FUNCTION: get the buffer for the specified iag within a fileset + * or aggregate inode map. + * + * PARAMETERS: + * imap - pointer to inode map control structure. + * iagno - iag number. + * bpp - point to buffer pointer to be filled in on successful + * exit. + * + * SERIALIZATION: + * must have read lock on imap inode + * (When called by diExtendFS, the filesystem is quiesced, therefore + * the read lock is unnecessary.) + * + * RETURN VALUES: + * 0 - success. + * EIO - i/o error. + */ +static int diIAGRead(imap_t * imap, int iagno, metapage_t ** mpp) +{ + struct inode *ipimap = imap->im_ipimap; + s64 blkno; + + /* compute the logical block number of the iag. */ + blkno = IAGTOLBLK(iagno, JFS_SBI(ipimap->i_sb)->l2nbperpage); + + /* read the iag. */ + *mpp = read_metapage(ipimap, blkno, PSIZE, 0); + if (*mpp == NULL) { + return (EIO); + } + + return (0); +} + +/* + * NAME: diFindFree() + * + * FUNCTION: find the first free bit in a word starting at + * the specified bit position. + * + * PARAMETERS: + * word - word to be examined. + * start - starting bit position. + * + * RETURN VALUES: + * bit position of first free bit in the word or 32 if + * no free bits were found. + */ +static int diFindFree(u32 word, int start) +{ + int bitno; + assert(start < 32); + /* scan the word for the first free bit. */ + for (word <<= start, bitno = start; bitno < 32; + bitno++, word <<= 1) { + if ((word & HIGHORDER) == 0) + break; + } + return (bitno); +} + +/* + * NAME: diUpdatePMap() + * + * FUNCTION: Update the persistent map in an IAG for the allocation or + * freeing of the specified inode. + * + * PRE CONDITIONS: Working map has already been updated for allocate. + * + * PARAMETERS: + * ipimap - Incore inode map inode + * inum - Number of inode to mark in permanent map + * is_free - If TRUE indicates inode should be marked freed, otherwise + * indicates inode should be marked allocated. + * + * RETURNS: 0 for success + */ +int +diUpdatePMap(struct inode *ipimap, + unsigned long inum, boolean_t is_free, tblock_t * tblk) +{ + int rc; + iag_t *iagp; + metapage_t *mp; + int iagno, ino, extno, bitno; + imap_t *imap; + u32 mask; + log_t *log; + int lsn, difft, diffp; + + imap = JFS_IP(ipimap)->i_imap; + /* get the iag number containing the inode */ + iagno = INOTOIAG(inum); + /* make sure that the iag is contained within the map */ + assert(iagno < imap->im_nextiag); + /* read the iag */ + IREAD_LOCK(ipimap); + rc = diIAGRead(imap, iagno, &mp); + IREAD_UNLOCK(ipimap); + if (rc) + return (rc); + iagp = (iag_t *) mp->data; + /* get the inode number and extent number of the inode within + * the iag and the inode number within the extent. + */ + ino = inum & (INOSPERIAG - 1); + extno = ino >> L2INOSPEREXT; + bitno = ino & (INOSPEREXT - 1); + mask = HIGHORDER >> bitno; + /* + * mark the inode free in persistent map: + */ + if (is_free == TRUE) { + /* The inode should have been allocated both in working + * map and in persistent map; + * the inode will be freed from working map at the release + * of last reference release; + */ +// assert(le32_to_cpu(iagp->wmap[extno]) & mask); + if (!(le32_to_cpu(iagp->wmap[extno]) & mask)) { + jERROR(1, + ("diUpdatePMap: inode %ld not marked as allocated in wmap!\n", + inum)); + updateSuper(ipimap->i_sb, FM_DIRTY); + } +// assert(le32_to_cpu(iagp->pmap[extno]) & mask); + if (!(le32_to_cpu(iagp->pmap[extno]) & mask)) { + jERROR(1, + ("diUpdatePMap: inode %ld not marked as allocated in pmap!\n", + inum)); + updateSuper(ipimap->i_sb, FM_DIRTY); + } + /* update the bitmap for the extent of the freed inode */ + iagp->pmap[extno] &= cpu_to_le32(~mask); + } + /* + * mark the inode allocated in persistent map: + */ + else { + /* The inode should be already allocated in the working map + * and should be free in persistent map; + */ + assert(le32_to_cpu(iagp->wmap[extno]) & mask); + assert((le32_to_cpu(iagp->pmap[extno]) & mask) == 0); + /* update the bitmap for the extent of the allocated inode */ + iagp->pmap[extno] |= cpu_to_le32(mask); + } + /* + * update iag lsn + */ + lsn = tblk->lsn; + log = JFS_SBI(tblk->sb)->log; + if (mp->lsn != 0) { + /* inherit older/smaller lsn */ + logdiff(difft, lsn, log); + logdiff(diffp, mp->lsn, log); + if (difft < diffp) { + mp->lsn = lsn; + /* move mp after tblock in logsync list */ + LOGSYNC_LOCK(log); + list_del(&mp->synclist); + list_add(&mp->synclist, &tblk->synclist); + LOGSYNC_UNLOCK(log); + } + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log); + assert(mp->clsn); + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } else { + mp->log = log; + mp->lsn = lsn; + /* insert mp after tblock in logsync list */ + LOGSYNC_LOCK(log); + log->count++; + list_add(&mp->synclist, &tblk->synclist); + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + } +// bmLazyWrite(mp, log->flag & JFS_COMMIT); + write_metapage(mp); + return (0); +} + +/* + * diExtendFS() + * + * function: update imap for extendfs(); + * + * note: AG size has been increased s.t. each k old contiguous AGs are + * coalesced into a new AG; + */ +int diExtendFS(struct inode *ipimap, struct inode *ipbmap) +{ + int rc, rcx = 0; + imap_t *imap = JFS_IP(ipimap)->i_imap; + iag_t *iagp = 0, *hiagp = 0; + bmap_t *mp = JFS_SBI(ipbmap->i_sb)->bmap; + metapage_t *bp, *hbp; + int i, n, head; + int numinos, xnuminos = 0, xnumfree = 0; + s64 agstart; + + jEVENT(0, ("diExtendFS: nextiag:%d numinos:%d numfree:%d\n", + imap->im_nextiag, atomic_read(&imap->im_numinos), + atomic_read(&imap->im_numfree))); + + /* + * reconstruct imap + * + * coalesce contiguous k (newAGSize/oldAGSize) AGs; + * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn; + * note: new AG size = old AG size * (2**x). + */ + + /* init per AG control information im_agctl[] */ + for (i = 0; i < MAXAG; i++) { + imap->im_agctl[i].inofree = -1; /* free inode list */ + imap->im_agctl[i].extfree = -1; /* free extent list */ + imap->im_agctl[i].numinos = 0; /* number of backed inodes */ + imap->im_agctl[i].numfree = 0; /* number of free backed inodes */ + } + + /* + * process each iag_t page of the map. + * + * rebuild AG Free Inode List, AG Free Inode Extent List; + */ + for (i = 0; i < imap->im_nextiag; i++) { + if ((rc = diIAGRead(imap, i, &bp))) { + rcx = rc; + continue; + } + iagp = (iag_t *) bp->data; + assert(le32_to_cpu(iagp->iagnum) == i); + + /* leave free iag in the free iag list */ + if (iagp->nfreeexts == cpu_to_le32(EXTSPERIAG)) { + release_metapage(bp); + continue; + } + + /* agstart that computes to the same ag is treated as same; */ + agstart = le64_to_cpu(iagp->agstart); + /* iagp->agstart = agstart & ~(mp->db_agsize - 1); */ + n = agstart >> mp->db_agl2size; +/* +printf("diExtendFS: iag:%d agstart:%Ld agno:%d\n", i, agstart, n); +*/ + + /* compute backed inodes */ + numinos = (EXTSPERIAG - le32_to_cpu(iagp->nfreeexts)) + << L2INOSPEREXT; + if (numinos > 0) { + /* merge AG backed inodes */ + imap->im_agctl[n].numinos += numinos; + xnuminos += numinos; + } + + /* if any backed free inodes, insert at AG free inode list */ + if ((int) le32_to_cpu(iagp->nfreeinos) > 0) { + if ((head = imap->im_agctl[n].inofree) == -1) + iagp->inofreefwd = iagp->inofreeback = -1; + else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (iag_t *) hbp->data; + hiagp->inofreeback = + le32_to_cpu(iagp->iagnum); + iagp->inofreefwd = cpu_to_le32(head); + iagp->inofreeback = -1; + write_metapage(hbp); + } + + imap->im_agctl[n].inofree = + le32_to_cpu(iagp->iagnum); + + /* merge AG backed free inodes */ + imap->im_agctl[n].numfree += + le32_to_cpu(iagp->nfreeinos); + xnumfree += le32_to_cpu(iagp->nfreeinos); + } + + /* if any free extents, insert at AG free extent list */ + if (le32_to_cpu(iagp->nfreeexts) > 0) { + if ((head = imap->im_agctl[n].extfree) == -1) + iagp->extfreefwd = iagp->extfreeback = -1; + else { + if ((rc = diIAGRead(imap, head, &hbp))) { + rcx = rc; + goto nextiag; + } + hiagp = (iag_t *) hbp->data; + hiagp->extfreeback = iagp->iagnum; + iagp->extfreefwd = cpu_to_le32(head); + iagp->extfreeback = -1; + write_metapage(hbp); + } + + imap->im_agctl[n].extfree = + le32_to_cpu(iagp->iagnum); + } + + nextiag: + write_metapage(bp); + } + + ASSERT(xnuminos == atomic_read(&imap->im_numinos) && + xnumfree == atomic_read(&imap->im_numfree)); + + return rcx; +} + + +/* + * duplicateIXtree() + * + * serialization: IWRITE_LOCK held on entry/exit + * + * note: shadow page with regular inode (rel.2); + */ +static void +duplicateIXtree(struct super_block *sb, s64 blkno, int xlen, s64 * xaddr) +{ + int rc; + tid_t tid; + struct inode *ip; + metapage_t *mpsuper; + struct jfs_superblock *j_sb; + + /* if AIT2 ipmap2 is bad, do not try to update it */ + if (JFS_SBI(sb)->mntflag & JFS_BAD_SAIT) /* s_flag */ + return; + ip = diReadSpecial(sb, FILESYSTEM_I + INOSPEREXT); + if (ip == 0) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + if ((rc = readSuper(sb, &mpsuper))) + return; + j_sb = (struct jfs_superblock *) (mpsuper->data); + j_sb->s_flag |= JFS_BAD_SAIT; + write_metapage(mpsuper); + return; + } + + /* start transaction */ + tid = txBegin(sb, COMMIT_FORCE); + /* update the inode map addressing structure to point to it */ + if ((rc = xtInsert(tid, ip, 0, blkno, xlen, xaddr, 0))) { + JFS_SBI(sb)->mntflag |= JFS_BAD_SAIT; + txAbort(tid, 1); + goto cleanup; + + } + /* update the inode map's inode to reflect the extension */ + ip->i_size += PSIZE; + ip->i_blocks += LBLK2PBLK(sb, xlen); + rc = txCommit(tid, 1, &ip, COMMIT_FORCE); + cleanup: + txEnd(tid); + diFreeSpecial(ip); +} + +/* + * NAME: copy_from_dinode() + * + * FUNCTION: Copies inode info from disk inode to in-memory inode + * + * RETURN VALUES: + * 0 - success + * ENOMEM - insufficient memory + */ +static int copy_from_dinode(dinode_t * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->mode2 = le32_to_cpu(dip->di_mode); + + ip->i_mode = le32_to_cpu(dip->di_mode) & 0xffff; + ip->i_nlink = le32_to_cpu(dip->di_nlink); + ip->i_uid = le32_to_cpu(dip->di_uid); + ip->i_gid = le32_to_cpu(dip->di_gid); + ip->i_size = le64_to_cpu(dip->di_size); + ip->i_atime = le32_to_cpu(dip->di_atime.tv_sec); + ip->i_mtime = le32_to_cpu(dip->di_mtime.tv_sec); + ip->i_ctime = le32_to_cpu(dip->di_ctime.tv_sec); + ip->i_blksize = ip->i_sb->s_blocksize; + ip->i_blocks = LBLK2PBLK(ip->i_sb, le64_to_cpu(dip->di_nblocks)); + ip->i_version = ++event; + ip->i_generation = le32_to_cpu(dip->di_gen); + + jfs_ip->ixpxd = dip->di_ixpxd; /* in-memory pxd's are little-endian */ + jfs_ip->acl = dip->di_acl; /* as are dxd's */ + jfs_ip->ea = dip->di_ea; + jfs_ip->next_index = le32_to_cpu(dip->di_next_index); + jfs_ip->otime = le32_to_cpu(dip->di_otime.tv_sec); + jfs_ip->acltype = le32_to_cpu(dip->di_acltype); + /* + * We may only need to do this for "special" inodes (dmap, imap) + */ + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + ip->i_rdev = to_kdev_t(le32_to_cpu(dip->di_rdev)); + else if (S_ISDIR(ip->i_mode)) { + memcpy(&jfs_ip->i_dirtable, &dip->di_dirtable, 384); + } else if (!S_ISFIFO(ip->i_mode)) { + memcpy(&jfs_ip->i_xtroot, &dip->di_xtroot, 288); + } + /* Zero the in-memory-only stuff */ + jfs_ip->cflag = 0; + jfs_ip->btindex = 0; + jfs_ip->btorder = 0; + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + jfs_ip->atlhead = 0; + jfs_ip->atltail = 0; + jfs_ip->xtlid = 0; + return (0); +} + +/* + * NAME: copy_to_dinode() + * + * FUNCTION: Copies inode info from in-memory inode to disk inode + */ +static void copy_to_dinode(dinode_t * dip, struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + + dip->di_fileset = cpu_to_le32(jfs_ip->fileset); + dip->di_inostamp = cpu_to_le32(JFS_SBI(ip->i_sb)->inostamp); + dip->di_number = cpu_to_le32(ip->i_ino); + dip->di_gen = cpu_to_le32(ip->i_generation); + dip->di_size = cpu_to_le64(ip->i_size); + dip->di_nblocks = cpu_to_le64(PBLK2LBLK(ip->i_sb, ip->i_blocks)); + dip->di_nlink = cpu_to_le32(ip->i_nlink); + dip->di_uid = cpu_to_le32(ip->i_uid); + dip->di_gid = cpu_to_le32(ip->i_gid); + /* + * mode2 is only needed for storing the higher order bits. + * Trust i_mode for the lower order ones + */ + dip->di_mode = cpu_to_le32((jfs_ip->mode2 & 0xffff0000) | ip->i_mode); + dip->di_atime.tv_sec = cpu_to_le32(ip->i_atime); + dip->di_atime.tv_nsec = 0; + dip->di_ctime.tv_sec = cpu_to_le32(ip->i_ctime); + dip->di_ctime.tv_nsec = 0; + dip->di_mtime.tv_sec = cpu_to_le32(ip->i_mtime); + dip->di_mtime.tv_nsec = 0; + dip->di_ixpxd = jfs_ip->ixpxd; /* in-memory pxd's are little-endian */ + dip->di_acl = jfs_ip->acl; /* as are dxd's */ + dip->di_ea = jfs_ip->ea; + dip->di_next_index = cpu_to_le32(jfs_ip->next_index); + dip->di_otime.tv_sec = cpu_to_le32(jfs_ip->otime); + dip->di_otime.tv_nsec = 0; + dip->di_acltype = cpu_to_le32(jfs_ip->acltype); + + if (S_ISCHR(ip->i_mode) || S_ISBLK(ip->i_mode)) + dip->di_rdev = cpu_to_le32(kdev_t_to_nr(ip->i_rdev)); +} + +void diClearExtension(struct inode *ip) +{ + jFYI(1, ("diClearExtension called ip = 0x%p\n", ip)); + + assert(list_empty(&JFS_IP(ip)->mp_list)); + assert(list_empty(&JFS_IP(ip)->anon_inode_list)); + + if (JFS_IP(ip)->atlhead) { + jERROR(1, + ("diClearExtension: inode 0x%p has anonymous tlocks\n", + ip)); + } + + free_jfs_inode(ip); + ip->u.generic_ip = 0; +} + +#ifdef _JFS_DEBUG_IMAP +/* + * DBGdiInit() + */ +static void *DBGdiInit(imap_t * imap) +{ + u32 *dimap; + int size; + size = 64 * 1024; + if ((dimap = (u32 *) xmalloc(size, L2PSIZE, kernel_heap)) == NULL) + assert(0); + bzero((void *) dimap, size); + imap->im_DBGdimap = dimap; +} + +/* + * DBGdiAlloc() + */ +static void DBGdiAlloc(imap_t * imap, ino_t ino) +{ + u32 *dimap = imap->im_DBGdimap; + int w, b; + u32 m; + w = ino >> 5; + b = ino & 31; + m = 0x80000000 >> b; + assert(w < 64 * 256); + if (dimap[w] & m) { + printk("DEBUG diAlloc: duplicate alloc ino:0x%x\n", ino); + } + dimap[w] |= m; +} + +/* + * DBGdiFree() + */ +static void DBGdiFree(imap_t * imap, ino_t ino) +{ + u32 *dimap = imap->im_DBGdimap; + int w, b; + u32 m; + w = ino >> 5; + b = ino & 31; + m = 0x80000000 >> b; + assert(w < 64 * 256); + if ((dimap[w] & m) == 0) { + printk("DEBUG diFree: duplicate free ino:0x%x\n", ino); + } + dimap[w] &= ~m; +} + +static void dump_cp(imap_t * ipimap, char *function, int line) +{ + printk("\n* ********* *\nControl Page %s %d\n", function, line); + printk("FreeIAG %d\tNextIAG %d\n", ipimap->im_freeiag, + ipimap->im_nextiag); + printk("NumInos %d\tNumFree %d\n", + atomic_read(&ipimap->im_numinos), + atomic_read(&ipimap->im_numfree)); + printk("AG InoFree %d\tAG ExtFree %d\n", + ipimap->im_agctl[0].inofree, ipimap->im_agctl[0].extfree); + printk("AG NumInos %d\tAG NumFree %d\n", + ipimap->im_agctl[0].numinos, ipimap->im_agctl[0].numfree); +} + +static void dump_iag(iag_t * iag, char *function, int line) +{ + printk("\n* ********* *\nIAG %s %d\n", function, line); + printk("IagNum %d\tIAG Free %d\n", le32_to_cpu(iag->iagnum), + le32_to_cpu(iag->iagfree)); + printk("InoFreeFwd %d\tInoFreeBack %d\n", + le32_to_cpu(iag->inofreefwd), + le32_to_cpu(iag->inofreeback)); + printk("ExtFreeFwd %d\tExtFreeBack %d\n", + le32_to_cpu(iag->extfreefwd), + le32_to_cpu(iag->extfreeback)); + printk("NFreeInos %d\tNFreeExts %d\n", le32_to_cpu(iag->nfreeinos), + le32_to_cpu(iag->nfreeexts)); +} +#endif /* _JFS_DEBUG_IMAP */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_imap.h linuxppc64_2_4/fs/jfs/jfs_imap.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_imap.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_imap.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,161 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_IMAP +#define _H_JFS_IMAP + +#include "jfs_txnmgr.h" + +/* + * jfs_imap.h: disk inode manager + */ + +#define EXTSPERIAG 128 /* number of disk inode extent per iag */ +#define IMAPBLKNO 0 /* lblkno of dinomap within inode map */ +#define SMAPSZ 4 /* number of words per summary map */ +#define EXTSPERSUM 32 /* number of extents per summary map entry */ +#define L2EXTSPERSUM 5 /* l2 number of extents per summary map */ +#define PGSPERIEXT 4 /* number of 4K pages per dinode extent */ +#define MAXIAGS ((1<<20)-1) /* maximum number of iags */ +#define MAXAG 128 /* maximum number of allocation groups */ + +#define AMAPSIZE 512 /* bytes in the IAG allocation maps */ +#define SMAPSIZE 16 /* bytes in the IAG summary maps */ + +/* convert inode number to iag number */ +#define INOTOIAG(ino) ((ino) >> L2INOSPERIAG) + +/* convert iag number to logical block number of the iag page */ +#define IAGTOLBLK(iagno,l2nbperpg) (((iagno) + 1) << (l2nbperpg)) + +/* get the starting block number of the 4K page of an inode extent + * that contains ino. + */ +#define INOPBLK(pxd,ino,l2nbperpg) (addressPXD((pxd)) + \ + ((((ino) & (INOSPEREXT-1)) >> L2INOSPERPAGE) << (l2nbperpg))) + +/* + * inode allocation map: + * + * inode allocation map consists of + * . the inode map control page and + * . inode allocation group pages (per 4096 inodes) + * which are addressed by standard JFS xtree. + */ +/* + * inode allocation group page (per 4096 inodes of an AG) + */ +typedef struct { + s64 agstart; /* 8: starting block of ag */ + s32 iagnum; /* 4: inode allocation group number */ + s32 inofreefwd; /* 4: ag inode free list forward */ + s32 inofreeback; /* 4: ag inode free list back */ + s32 extfreefwd; /* 4: ag inode extent free list forward */ + s32 extfreeback; /* 4: ag inode extent free list back */ + s32 iagfree; /* 4: iag free list */ + + /* summary map: 1 bit per inode extent */ + s32 inosmap[SMAPSZ]; /* 16: sum map of mapwords w/ free inodes; + * note: this indicates free and backed + * inodes, if the extent is not backed the + * value will be 1. if the extent is + * backed but all inodes are being used the + * value will be 1. if the extent is + * backed but at least one of the inodes is + * free the value will be 0. + */ + s32 extsmap[SMAPSZ]; /* 16: sum map of mapwords w/ free extents */ + s32 nfreeinos; /* 4: number of free inodes */ + s32 nfreeexts; /* 4: number of free extents */ + /* (72) */ + u8 pad[1976]; /* 1976: pad to 2048 bytes */ + /* allocation bit map: 1 bit per inode (0 - free, 1 - allocated) */ + u32 wmap[EXTSPERIAG]; /* 512: working allocation map */ + u32 pmap[EXTSPERIAG]; /* 512: persistent allocation map */ + pxd_t inoext[EXTSPERIAG]; /* 1024: inode extent addresses */ +} iag_t; /* (4096) */ + +/* + * per AG control information (in inode map control page) + */ +typedef struct { + s32 inofree; /* 4: free inode list anchor */ + s32 extfree; /* 4: free extent list anchor */ + s32 numinos; /* 4: number of backed inodes */ + s32 numfree; /* 4: number of free inodes */ +} iagctl_t; /* (16) */ + +/* + * per fileset/aggregate inode map control page + */ +typedef struct { + s32 in_freeiag; /* 4: free iag list anchor */ + s32 in_nextiag; /* 4: next free iag number */ + s32 in_numinos; /* 4: num of backed inodes */ + s32 in_numfree; /* 4: num of free backed inodes */ + s32 in_nbperiext; /* 4: num of blocks per inode extent */ + s32 in_l2nbperiext; /* 4: l2 of in_nbperiext */ + s32 in_diskblock; /* 4: for standalone test driver */ + s32 in_maxag; /* 4: for standalone test driver */ + u8 pad[2016]; /* 2016: pad to 2048 */ + iagctl_t in_agctl[MAXAG]; /* 2048: AG control information */ +} dinomap_t; /* (4096) */ + + +/* + * In-core inode map control page + */ +typedef struct inomap { + dinomap_t im_imap; /* 4096: inode allocation control */ + struct inode *im_ipimap; /* 4: ptr to inode for imap */ + struct semaphore im_freelock; /* 4: iag free list lock */ + struct semaphore im_aglock[MAXAG]; /* 512: per AG locks */ + u32 *im_DBGdimap; + atomic_t im_numinos; /* num of backed inodes */ + atomic_t im_numfree; /* num of free backed inodes */ +} imap_t; + +#define im_freeiag im_imap.in_freeiag +#define im_nextiag im_imap.in_nextiag +#define im_agctl im_imap.in_agctl +#define im_nbperiext im_imap.in_nbperiext +#define im_l2nbperiext im_imap.in_l2nbperiext + +/* for standalone testdriver + */ +#define im_diskblock im_imap.in_diskblock +#define im_maxag im_imap.in_maxag + +extern int diFree(struct inode *); +extern int diAlloc(struct inode *, boolean_t, struct inode *); +extern int diSync(struct inode *); +/* external references */ +extern int diUpdatePMap(struct inode *ipimap, unsigned long inum, + boolean_t is_free, tblock_t * tblk); +#ifdef _STILL_TO_PORT +extern int diExtendFS(inode_t * ipimap, inode_t * ipbmap); +#endif /* _STILL_TO_PORT */ + +extern int diMount(struct inode *); +extern int diUnmount(struct inode *, int); +extern int diRead(struct inode *); +extern void diClearExtension(struct inode *); +extern struct inode *diReadSpecial(struct super_block *, ino_t); +extern void diWriteSpecial(struct inode *); +extern void diFreeSpecial(struct inode *); +extern int diWrite(tid_t tid, struct inode *); +#endif /* _H_JFS_IMAP */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_incore.h linuxppc64_2_4/fs/jfs/jfs_incore.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_incore.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_incore.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,155 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#ifndef _H_JFS_INCORE +#define _H_JFS_INCORE + +#include +#include +#include "jfs_types.h" +#include "jfs_xtree.h" +#include "jfs_dtree.h" + +/* + * JFS magic number + */ +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +/* + * Due to header ordering problems this can't be in jfs_lock.h + */ +typedef struct jfs_rwlock { + struct rw_semaphore rw_sem; + atomic_t in_use; /* for hacked implementation of trylock */ +} jfs_rwlock_t; + +/* + * JFS-private inode information + */ +struct jfs_inode_info { + struct inode *inode; /* pointer back to fs-independent inode */ + int fileset; /* fileset number (always 16)*/ + uint mode2; /* jfs-specific mode */ + pxd_t ixpxd; /* inode extent descriptor */ + dxd_t acl; /* dxd describing acl */ + dxd_t ea; /* dxd describing ea */ + time_t otime; /* time created */ + uint next_index; /* next available directory entry index */ + int acltype; /* Type of ACL */ + short btorder; /* access order */ + short btindex; /* btpage entry index*/ + struct inode *ipimap; /* inode map */ + long cflag; /* commit flags */ + u16 bxflag; /* xflag of pseudo buffer? */ + unchar agno; /* ag number */ + unchar pad; /* pad */ + lid_t blid; /* lid of pseudo buffer? */ + lid_t atlhead; /* anonymous tlock list head */ + lid_t atltail; /* anonymous tlock list tail */ + struct list_head anon_inode_list; /* inodes having anonymous txns */ + struct list_head mp_list; /* metapages in inode's address space */ + jfs_rwlock_t rdwrlock; /* read/write lock */ + lid_t xtlid; /* lid of xtree lock on directory */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + dir_table_slot_t _table[12]; /* 96: directory index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + unchar _inline[128]; /* 128: inline symlink */ + } link; + } u; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline + +/* + * cflag + */ +enum cflags { + COMMIT_New, /* never committed inode */ + COMMIT_Nolink, /* inode committed with zero link count */ + COMMIT_Inlineea, /* commit inode inline EA */ + COMMIT_Freewmap, /* free WMAP at iClose() */ + COMMIT_Dirty, /* Inode is really dirty */ + COMMIT_Holdlock, /* Hold the IWRITE_LOCK until commit is done */ + COMMIT_Dirtable, /* commit changes to di_dirtable */ + COMMIT_Stale, /* data extent is no longer valid */ + COMMIT_Synclist, /* metadata pages on group commit synclist */ +}; + +#define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) +#define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) +#define test_and_clear_cflag(flag, ip) \ + test_and_clear_bit(flag, &(JFS_IP(ip)->cflag)) +/* + * JFS-private superblock information. + */ +struct jfs_sb_info { + unsigned long mntflag; /* 4: aggregate attributes */ + struct inode *ipbmap; /* 4: block map inode */ + struct inode *ipaimap; /* 4: aggregate inode map inode */ + struct inode *ipaimap2; /* 4: secondary aimap inode */ + struct inode *ipimap; /* 4: aggregate inode map inode */ + struct jfs_log *log; /* 4: log */ + short bsize; /* 2: logical block size */ + short l2bsize; /* 2: log2 logical block size */ + short nbperpage; /* 2: blocks per page */ + short l2nbperpage; /* 2: log2 blocks per page */ + short l2niperblk; /* 2: log2 inodes per page */ + kdev_t logdev; /* 2: external log device */ + pxd_t logpxd; /* 8: pxd describing log */ + pxd_t ait2; /* 8: pxd describing AIT copy */ + /* Formerly in ipimap */ + uint gengen; /* 4: inode generation generator*/ + uint inostamp; /* 4: shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* 4: incore bmap descriptor */ + struct nls_table *nls_tab; /* 4: current codepage */ + struct inode *direct_inode; /* 4: inode for physical I/O */ + struct address_space *direct_mapping; /* 4: mapping for physical I/O */ + uint state; /* 4: mount/recovery state */ +}; + +#define JFS_IP(ip) ((struct jfs_inode_info *)(ip)->u.generic_ip) +#define JFS_SBI(sb) ((struct jfs_sb_info *)(sb)->u.generic_sbp) + +#define isReadOnly(ip) ((JFS_SBI((ip)->i_sb)->log) ? 0 : 1) + +/* + * Allocating and freeing the structure + */ +extern kmem_cache_t *jfs_inode_cachep; +extern int alloc_jfs_inode(struct inode *); + +#define free_jfs_inode(inode) \ + kmem_cache_free(jfs_inode_cachep, (inode)->u.generic_ip) + +#endif /* _H_JFS_INCORE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_inode.c linuxppc64_2_4/fs/jfs/jfs_inode.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_inode.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_inode.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,160 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_imap.h" +#include "jfs_dinode.h" +#include "jfs_debug.h" + +kmem_cache_t *jfs_inode_cachep; + +/* + * NAME: ialloc() + * + * FUNCTION: Allocate a new inode + * + */ +struct inode *ialloc(struct inode *parent, umode_t mode) +{ + struct super_block *sb = parent->i_sb; + struct inode *inode; + struct jfs_inode_info *jfs_inode; + int rc; + + inode = new_inode(sb); + if (!inode) { + jERROR(1, ("ialloc: new_inode returned NULL!\n")); + return inode; + } + + rc = alloc_jfs_inode(inode); + if (rc) { + make_bad_inode(inode); + iput(inode); + return NULL; + } + jfs_inode = JFS_IP(inode); + + rc = diAlloc(parent, S_ISDIR(mode), inode); + if (rc) { + jERROR(1, ("ialloc: diAlloc returned %d!\n", rc)); + free_jfs_inode(inode); + make_bad_inode(inode); + iput(inode); + return NULL; + } + + inode->i_uid = current->fsuid; + if (parent->i_mode & S_ISGID) { + inode->i_gid = parent->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current->fsgid; + + inode->i_mode = mode; + if (S_ISDIR(mode)) + jfs_inode->mode2 = IDIRECTORY | mode; + else + jfs_inode->mode2 = INLINEEA | ISPARSE | mode; + inode->i_blksize = sb->s_blocksize; + inode->i_blocks = 0; + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + jfs_inode->otime = inode->i_ctime; + inode->i_version = ++event; + inode->i_generation = JFS_SBI(sb)->gengen++; + + jfs_inode->cflag = 0; + set_cflag(COMMIT_New, inode); + + /* Zero remaining fields */ + memset(&jfs_inode->acl, 0, sizeof(dxd_t)); + memset(&jfs_inode->ea, 0, sizeof(dxd_t)); + jfs_inode->next_index = 0; + jfs_inode->acltype = 0; + jfs_inode->btorder = 0; + jfs_inode->btindex = 0; + jfs_inode->bxflag = 0; + jfs_inode->blid = 0; + jfs_inode->atlhead = 0; + jfs_inode->atltail = 0; + jfs_inode->xtlid = 0; + + jFYI(1, ("ialloc returns inode = 0x%p\n", inode)); + + return inode; +} + +/* + * NAME: iwritelocklist() + * + * FUNCTION: Lock multiple inodes in sorted order to avoid deadlock + * + */ +void iwritelocklist(int n, ...) +{ + va_list ilist; + struct inode *sort[4]; + struct inode *ip; + int k, m; + + va_start(ilist, n); + for (k = 0; k < n; k++) + sort[k] = va_arg(ilist, struct inode *); + va_end(ilist); + + /* Bubble sort in descending order */ + do { + m = 0; + for (k = 0; k < n; k++) + if ((k + 1) < n + && sort[k + 1]->i_ino > sort[k]->i_ino) { + ip = sort[k]; + sort[k] = sort[k + 1]; + sort[k + 1] = ip; + m++; + } + } while (m); + + /* Lock them */ + for (k = 0; k < n; k++) { + IWRITE_LOCK(sort[k]); + } +} + +/* + * NAME: alloc_jfs_inode() + * + * FUNCTION: Allocate jfs portion of in-memory inode + * + */ +int alloc_jfs_inode(struct inode *inode) +{ + struct jfs_inode_info *jfs_inode; + + jfs_inode = kmem_cache_alloc(jfs_inode_cachep, GFP_NOFS); + JFS_IP(inode) = jfs_inode; + if (!jfs_inode) + return -ENOSPC; + jfs_inode->inode = inode; + + return 0; +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_inode.h linuxppc64_2_4/fs/jfs/jfs_inode.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_inode.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_inode.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,23 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_INODE +#define _H_JFS_INODE + +extern struct inode *ialloc(struct inode *, umode_t); + +#endif /* _H_JFS_INODE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_lock.h linuxppc64_2_4/fs/jfs/jfs_lock.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_lock.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_lock.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,106 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_LOCK +#define _H_JFS_LOCK + +#include +#include + +/* + * jfs_lock.h + * + * JFS lock definition for globally referenced locks + */ + +/* readers/writer lock: thread-thread */ + +/* + * RW semaphores do not currently have a trylock function. Since the + * implementation varies by platform, I have implemented a platform-independent + * wrapper around the rw_semaphore routines. If this turns out to be the best + * way of avoiding our locking problems, I will push to get a trylock + * implemented in the kernel, but I'd rather find a way to avoid having to + * use it. + */ +#define RDWRLOCK_T jfs_rwlock_t +static inline void RDWRLOCK_INIT(jfs_rwlock_t * Lock) +{ + init_rwsem(&Lock->rw_sem); + atomic_set(&Lock->in_use, 0); +} +static inline void READ_LOCK(jfs_rwlock_t * Lock) +{ + atomic_inc(&Lock->in_use); + down_read(&Lock->rw_sem); +} +static inline void READ_UNLOCK(jfs_rwlock_t * Lock) +{ + up_read(&Lock->rw_sem); + atomic_dec(&Lock->in_use); +} +static inline void WRITE_LOCK(jfs_rwlock_t * Lock) +{ + atomic_inc(&Lock->in_use); + down_write(&Lock->rw_sem); +} + +static inline int WRITE_TRYLOCK(jfs_rwlock_t * Lock) +{ + if (atomic_read(&Lock->in_use)) + return 0; + WRITE_LOCK(Lock); + return 1; +} +static inline void WRITE_UNLOCK(jfs_rwlock_t * Lock) +{ + up_write(&Lock->rw_sem); + atomic_dec(&Lock->in_use); +} + +#define IREAD_LOCK(ip) READ_LOCK(&JFS_IP(ip)->rdwrlock) +#define IREAD_UNLOCK(ip) READ_UNLOCK(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK(ip) WRITE_LOCK(&JFS_IP(ip)->rdwrlock) +#define IWRITE_TRYLOCK(ip) WRITE_TRYLOCK(&JFS_IP(ip)->rdwrlock) +#define IWRITE_UNLOCK(ip) WRITE_UNLOCK(&JFS_IP(ip)->rdwrlock) +#define IWRITE_LOCK_LIST iwritelocklist + +extern void iwritelocklist(int, ...); + +/* + * Conditional sleep where condition is protected by spinlock + * + * lock_cmd and unlock_cmd take and release the spinlock + */ +#define __SLEEP_COND(wq, cond, lock_cmd, unlock_cmd) \ +do { \ + DECLARE_WAITQUEUE(__wait, current); \ + \ + add_wait_queue(&wq, &__wait); \ + for (;;) { \ + set_current_state(TASK_UNINTERRUPTIBLE);\ + if (cond) \ + break; \ + unlock_cmd; \ + schedule(); \ + lock_cmd; \ + } \ + current->state = TASK_RUNNING; \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + +#endif /* _H_JFS_LOCK */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_logmgr.c linuxppc64_2_4/fs/jfs/jfs_logmgr.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_logmgr.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_logmgr.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,2327 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ + +/* + * jfs_logmgr.c: log manager + * + * for related information, see transaction manager (jfs_txnmgr.c), and + * recovery manager (jfs_logredo.c). + * + * note: for detail, RTFS. + * + * log buffer manager: + * special purpose buffer manager supporting log i/o requirements. + * per log serial pageout of logpage + * queuing i/o requests and redrive i/o at iodone + * maintain current logpage buffer + * no caching since append only + * appropriate jfs buffer cache buffers as needed + * + * group commit: + * transactions which wrote COMMIT records in the same in-memory + * log page during the pageout of previous/current log page(s) are + * committed together by the pageout of the page. + * + * TBD lazy commit: + * transactions are committed asynchronously when the log page + * containing it COMMIT is paged out when it becomes full; + * + * serialization: + * . a per log lock serialize log write. + * . a per log lock serialize group commit. + * . a per log lock serialize log open/close; + * + * TBD log integrity: + * careful-write (ping-pong) of last logpage to recover from crash + * in overwrite. + * detection of split (out-of-order) write of physical sectors + * of last logpage via timestamp at end of each sector + * with its mirror data array at trailer). + * + * alternatives: + * lsn - 64-bit monotonically increasing integer vs + * 32-bit lspn and page eor. + */ + +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + + +/* + * lbuf's ready to be redriven. Protected by log_redrive_lock (jfsIOtask) + */ +static lbuf_t *log_redrive_list; +static spinlock_t log_redrive_lock = SPIN_LOCK_UNLOCKED; + + +/* + * log read/write serialization (per log) + */ +#define LOG_LOCK_INIT(log) init_MUTEX(&(log)->loglock) +#define LOG_LOCK(log) down(&((log)->loglock)) +#define LOG_UNLOCK(log) up(&((log)->loglock)) + + +/* + * log group commit serialization (per log) + */ + +#define LOGGC_LOCK_INIT(log) spin_lock_init(&(log)->gclock) +#define LOGGC_LOCK(log) spin_lock_irq(&(log)->gclock) +#define LOGGC_UNLOCK(log) spin_unlock_irq(&(log)->gclock) +#define LOGGC_WAKEUP(tblk) wake_up(&(tblk)->gcwait) + +/* + * log sync serialization (per log) + */ +#define LOGSYNC_DELTA(logsize) min((logsize)/8, 128*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/4) +/* +#define LOGSYNC_DELTA(logsize) min((logsize)/4, 256*LOGPSIZE) +#define LOGSYNC_BARRIER(logsize) ((logsize)/2) +*/ + + +/* + * log buffer cache synchronization + */ +static spinlock_t jfsLCacheLock = SPIN_LOCK_UNLOCKED; + +#define LCACHE_LOCK(flags) spin_lock_irqsave(&jfsLCacheLock, flags) +#define LCACHE_UNLOCK(flags) spin_unlock_irqrestore(&jfsLCacheLock, flags) + +/* + * See __SLEEP_COND in jfs_locks.h + */ +#define LCACHE_SLEEP_COND(wq, cond, flags) \ +do { \ + if (cond) \ + break; \ + __SLEEP_COND(wq, cond, LCACHE_LOCK(flags), LCACHE_UNLOCK(flags)); \ +} while (0) + +#define LCACHE_WAKEUP(event) wake_up(event) + + +/* + * lbuf buffer cache (lCache) control + */ +/* log buffer manager pageout control (cumulative, inclusive) */ +#define lbmREAD 0x0001 +#define lbmWRITE 0x0002 /* enqueue at tail of write queue; + * init pageout if at head of queue; + */ +#define lbmRELEASE 0x0004 /* remove from write queue + * at completion of pageout; + * do not free/recycle it yet: + * caller will free it; + */ +#define lbmSYNC 0x0008 /* do not return to freelist + * when removed from write queue; + */ +#define lbmFREE 0x0010 /* return to freelist + * at completion of pageout; + * the buffer may be recycled; + */ +#define lbmDONE 0x0020 +#define lbmERROR 0x0040 +#define lbmGC 0x0080 /* lbmIODone to perform post-GC processing + * of log page + */ +#define lbmDIRECT 0x0100 + +/* + * external references + */ +extern void txLazyUnlock(tblock_t * tblk); +extern int jfs_thread_stopped(void); +extern struct task_struct *jfsIOtask; +extern struct completion jfsIOwait; + +/* + * forward references + */ +static int lmWriteRecord(log_t * log, tblock_t * tblk, lrd_t * lrd, + tlock_t * tlck); + +static int lmNextPage(log_t * log); +static int lmLogFileSystem(log_t * log, kdev_t fsdev, int activate); +static int lmLogInit(log_t * log); +static int lmLogShutdown(log_t * log); + +static int lbmLogInit(log_t * log); +static void lbmLogShutdown(log_t * log); +static lbuf_t *lbmAllocate(log_t * log, int); +static void lbmFree(lbuf_t * bp); +static void lbmfree(lbuf_t * bp); +static int lbmRead(log_t * log, int pn, lbuf_t ** bpp); +static void lbmWrite(log_t * log, lbuf_t * bp, int flag, int cant_block); +static void lbmDirectWrite(log_t * log, lbuf_t * bp, int flag); +static int lbmIOWait(lbuf_t * bp, int flag); +static void lbmIODone(struct buffer_head *bh, int); + +void lbmStartIO(lbuf_t * bp); +void lmGCwrite(log_t * log, int cant_block); + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +struct lmStat { + uint commit; /* # of commit */ + uint pagedone; /* # of page written */ + uint submitted; /* # of pages submitted */ +} lmStat; +#endif + + +/* + * NAME: lmLog() + * + * FUNCTION: write a log record; + * + * PARAMETER: + * + * RETURN: lsn - offset to the next log record to write (end-of-log); + * -1 - error; + * + * note: todo: log error handler + */ +int lmLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + int lsn; + int diffp, difft; + metapage_t *mp = NULL; + + jFYI(1, ("lmLog: log:0x%p tblk:0x%p, lrd:0x%p tlck:0x%p\n", + log, tblk, lrd, tlck)); + + LOG_LOCK(log); + + /* log by (out-of-transaction) JFS ? */ + if (tblk == NULL) + goto writeRecord; + + /* log from page ? */ + if (tlck == NULL || + tlck->type & tlckBTROOT || (mp = tlck->mp) == NULL) + goto writeRecord; + + /* + * initialize/update page/transaction recovery lsn + */ + lsn = log->lsn; + + LOGSYNC_LOCK(log); + + /* + * initialize page lsn if first log write of the page + */ + if (mp->lsn == 0) { + mp->log = log; + mp->lsn = lsn; + log->count++; + + /* insert page at tail of logsynclist */ + list_add_tail(&mp->synclist, &log->synclist); + } + + /* + * initialize/update lsn of tblock of the page + * + * transaction inherits oldest lsn of pages associated + * with allocation/deallocation of resources (their + * log records are used to reconstruct allocation map + * at recovery time: inode for inode allocation map, + * B+-tree index of extent descriptors for block + * allocation map); + * allocation map pages inherit transaction lsn at + * commit time to allow forwarding log syncpt past log + * records associated with allocation/deallocation of + * resources only after persistent map of these map pages + * have been updated and propagated to home. + */ + /* + * initialize transaction lsn: + */ + if (tblk->lsn == 0) { + /* inherit lsn of its first page logged */ + tblk->lsn = mp->lsn; + log->count++; + + /* insert tblock after the page on logsynclist */ + list_add(&tblk->synclist, &mp->synclist); + } + /* + * update transaction lsn: + */ + else { + /* inherit oldest/smallest lsn of page */ + logdiff(diffp, mp->lsn, log); + logdiff(difft, tblk->lsn, log); + if (diffp < difft) { + /* update tblock lsn with page lsn */ + tblk->lsn = mp->lsn; + + /* move tblock after page on logsynclist */ + list_del(&tblk->synclist); + list_add(&tblk->synclist, &mp->synclist); + } + } + + LOGSYNC_UNLOCK(log); + + /* + * write the log record + */ + writeRecord: + lsn = lmWriteRecord(log, tblk, lrd, tlck); + + /* + * forward log syncpt if log reached next syncpt trigger + */ + logdiff(diffp, lsn, log); + if (diffp >= log->nextsync) + lsn = lmLogSync(log, 0); + + /* update end-of-log lsn */ + log->lsn = lsn; + + LOG_UNLOCK(log); + + /* return end-of-log address */ + return lsn; +} + + +/* + * NAME: lmWriteRecord() + * + * FUNCTION: move the log record to current log page + * + * PARAMETER: cd - commit descriptor + * + * RETURN: end-of-log address + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int +lmWriteRecord(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + int lsn = 0; /* end-of-log address */ + lbuf_t *bp; /* dst log page buffer */ + logpage_t *lp; /* dst log page */ + caddr_t dst; /* destination address in log page */ + int dstoffset; /* end-of-log offset in log page */ + int freespace; /* free space in log page */ + caddr_t p; /* src meta-data page */ + caddr_t src; + int srclen; + int nbytes; /* number of bytes to move */ + int i; + int len; + linelock_t *linelock; + lv_t *lv; + lvd_t *lvd; + int l2linesize; + + len = 0; + + /* retrieve destination log page to write */ + bp = (lbuf_t *) log->bp; + lp = (logpage_t *) bp->l_ldata; + dstoffset = log->eor; + + /* any log data to write ? */ + if (tlck == NULL) + goto moveLrd; + + /* + * move log record data + */ + /* retrieve source meta-data page to log */ + if (tlck->flag & tlckPAGELOCK) { + p = (caddr_t) (tlck->mp->data); + linelock = (linelock_t *) & tlck->lock; + } + /* retrieve source in-memory inode to log */ + else if (tlck->flag & tlckINODELOCK) { + if (tlck->type & tlckDTREE) + p = (caddr_t) &JFS_IP(tlck->ip)->i_dtroot; + else + p = (caddr_t) &JFS_IP(tlck->ip)->i_xtroot; + linelock = (linelock_t *) & tlck->lock; + } +#ifdef _JFS_WIP + else if (tlck->flag & tlckINLINELOCK) { + + inlinelock = (inlinelock_t *) & tlck; + p = (caddr_t) & inlinelock->pxd; + linelock = (linelock_t *) & tlck; + } +#endif /* _JFS_WIP */ + else { + jERROR(2, ("lmWriteRecord: UFO tlck:0x%p\n", tlck)); + return 0; /* Probably should trap */ + } + l2linesize = linelock->l2linesize; + + moveData: + ASSERT(linelock->index <= linelock->maxcnt); + + lv = (lv_t *) & linelock->lv; + for (i = 0; i < linelock->index; i++, lv++) { + if (lv->length == 0) + continue; + + /* is page full ? */ + if (dstoffset >= LOGPSIZE - LOGPTLRSIZE) { + /* page become full: move on to next page */ + lmNextPage(log); + + bp = log->bp; + lp = (logpage_t *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + } + + /* + * move log vector data + */ + src = (u8 *) p + (lv->offset << l2linesize); + srclen = lv->length << l2linesize; + len += srclen; + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + dstoffset += nbytes; + + /* is page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + break; + + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (lbuf_t *) log->bp; + lp = (logpage_t *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + + srclen -= nbytes; + src += nbytes; + } + + /* + * move log vector descriptor + */ + len += 4; + lvd = (lvd_t *) ((caddr_t) lp + dstoffset); + lvd->offset = cpu_to_le16(lv->offset); + lvd->length = cpu_to_le16(lv->length); + dstoffset += 4; + jFYI(1, + ("lmWriteRecord: lv offset:%d length:%d\n", + lv->offset, lv->length)); + } + + if ((i = linelock->next)) { + linelock = (linelock_t *) lid_to_tlock(i); + goto moveData; + } + + /* + * move log record descriptor + */ + moveLrd: + lrd->length = cpu_to_le16(len); + + src = (caddr_t) lrd; + srclen = LOGRDSIZE; + + while (srclen > 0) { + freespace = (LOGPSIZE - LOGPTLRSIZE) - dstoffset; + nbytes = min(freespace, srclen); + dst = (caddr_t) lp + dstoffset; + memcpy(dst, src, nbytes); + + dstoffset += nbytes; + srclen -= nbytes; + + /* are there more to move than freespace of page ? */ + if (srclen) + goto pageFull; + + /* + * end of log record descriptor + */ + + /* update last log record eor */ + log->eor = dstoffset; + bp->l_eor = dstoffset; + lsn = (log->page << L2LOGPSIZE) + dstoffset; + + if (lrd->type & cpu_to_le16(LOG_COMMIT)) { + tblk->clsn = lsn; + jFYI(1, + ("wr: tclsn:0x%x, beor:0x%x\n", tblk->clsn, + bp->l_eor)); + + INCREMENT(lmStat.commit); /* # of commit */ + + /* + * enqueue tblock for group commit: + * + * enqueue tblock of non-trivial/synchronous COMMIT + * at tail of group commit queue + * (trivial/asynchronous COMMITs are ignored by + * group commit.) + */ + LOGGC_LOCK(log); + + /* init tblock gc state */ + tblk->flag = tblkGC_QUEUE; + tblk->bp = log->bp; + tblk->pn = log->page; + tblk->eor = log->eor; + init_waitqueue_head(&tblk->gcwait); + + /* enqueue transaction to commit queue */ + tblk->cqnext = NULL; + if (log->cqueue.head) { + log->cqueue.tail->cqnext = tblk; + log->cqueue.tail = tblk; + } else + log->cqueue.head = log->cqueue.tail = tblk; + + LOGGC_UNLOCK(log); + } + + jFYI(1, + ("lmWriteRecord: lrd:0x%04x bp:0x%p pn:%d eor:0x%x\n", + le16_to_cpu(lrd->type), log->bp, log->page, + dstoffset)); + + /* page not full ? */ + if (dstoffset < LOGPSIZE - LOGPTLRSIZE) + return lsn; + + pageFull: + /* page become full: move on to next page */ + lmNextPage(log); + + bp = (lbuf_t *) log->bp; + lp = (logpage_t *) bp->l_ldata; + dstoffset = LOGPHDRSIZE; + src += nbytes; + } + + return lsn; +} + + +/* + * NAME: lmNextPage() + * + * FUNCTION: write current page and allocate next page. + * + * PARAMETER: log + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +static int lmNextPage(log_t * log) +{ + logpage_t *lp; + int lspn; /* log sequence page number */ + int pn; /* current page number */ + lbuf_t *bp; + lbuf_t *nextbp; + tblock_t *tblk; + + jFYI(1, ("lmNextPage\n")); + + /* get current log page number and log sequence page number */ + pn = log->page; + bp = log->bp; + lp = (logpage_t *) bp->l_ldata; + lspn = le32_to_cpu(lp->h.page); + + LOGGC_LOCK(log); + + /* + * write or queue the full page at the tail of write queue + */ + /* get the tail tblk on commit queue */ + tblk = log->cqueue.tail; + + /* every tblk who has COMMIT record on the current page, + * and has not been committed, must be on commit queue + * since tblk is queued at commit queueu at the time + * of writing its COMMIT record on the page before + * page becomes full (even though the tblk thread + * who wrote COMMIT record may have been suspended + * currently); + */ + + /* is page bound with outstanding tail tblk ? */ + if (tblk && tblk->pn == pn) { + /* mark tblk for end-of-page */ + tblk->flag |= tblkGC_EOP; + + /* if page is not already on write queue, + * just enqueue (no lbmWRITE to prevent redrive) + * buffer to wqueue to ensure correct serial order + * of the pages since log pages will be added + * continuously (tblk bound with the page hasn't + * got around to init write of the page, either + * preempted or the page got filled by its COMMIT + * record); + * pages with COMMIT are paged out explicitly by + * tblk in lmGroupCommit(); + */ + if (bp->l_wqnext == NULL) { + /* bp->l_ceor = bp->l_eor; */ + /* lp->h.eor = lp->t.eor = bp->l_ceor; */ + lbmWrite(log, bp, 0, 0); + } + } + /* page is not bound with outstanding tblk: + * init write or mark it to be redriven (lbmWRITE) + */ + else { + /* finalize the page */ + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, 0); + } + LOGGC_UNLOCK(log); + + /* + * allocate/initialize next page + */ + /* if log wraps, the first data page of log is 2 + * (0 never used, 1 is superblock). + */ + log->page = (pn == log->size - 1) ? 2 : pn + 1; + log->eor = LOGPHDRSIZE; /* ? valid page empty/full at logRedo() */ + + /* allocate/initialize next log page buffer */ + nextbp = lbmAllocate(log, log->page); + nextbp->l_eor = log->eor; + log->bp = nextbp; + + /* initialize next log page */ + lp = (logpage_t *) nextbp->l_ldata; + lp->h.page = lp->t.page = cpu_to_le32(lspn + 1); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + jFYI(1, ("lmNextPage done\n")); + return 0; +} + + +/* + * NAME: lmGroupCommit() + * + * FUNCTION: group commit + * initiate pageout of the pages with COMMIT in the order of + * page number - redrive pageout of the page at the head of + * pageout queue until full page has been written. + * + * RETURN: + * + * NOTE: + * LOGGC_LOCK serializes log group commit queue, and + * transaction blocks on the commit queue. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +int lmGroupCommit(log_t * log, tblock_t * tblk) +{ + int rc = 0; + + LOGGC_LOCK(log); + + /* group committed already ? */ + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = EIO; + + LOGGC_UNLOCK(log); + return rc; + } + jFYI(1, + ("lmGroup Commit: tblk = 0x%p, gcrtc = %d\n", tblk, + log->gcrtc)); + + /* + * group commit pageout in progress + */ + if ((!(log->cflag & logGC_PAGEOUT)) && log->cqueue.head) { + /* + * only transaction in the commit queue: + * + * start one-transaction group commit as + * its group leader. + */ + log->cflag |= logGC_PAGEOUT; + + lmGCwrite(log, 0); + } + /* lmGCwrite gives up LOGGC_LOCK, check again */ + + if (tblk->flag & tblkGC_COMMITTED) { + if (tblk->flag & tblkGC_ERROR) + rc = EIO; + + LOGGC_UNLOCK(log); + return rc; + } + + /* upcount transaction waiting for completion + */ + log->gcrtc++; + + if (tblk->xflag & COMMIT_LAZY) { + tblk->flag |= tblkGC_LAZY; + LOGGC_UNLOCK(log); + return 0; + } + tblk->flag |= tblkGC_READY; + + __SLEEP_COND(tblk->gcwait, (tblk->flag & tblkGC_COMMITTED), + LOGGC_LOCK(log), LOGGC_UNLOCK(log)); + + /* removed from commit queue */ + if (tblk->flag & tblkGC_ERROR) + rc = EIO; + + LOGGC_UNLOCK(log); + return rc; +} + +/* + * NAME: lmGCwrite() + * + * FUNCTION: group commit write + * initiate write of log page, building a group of all transactions + * with commit records on that page. + * + * RETURN: None + * + * NOTE: + * LOGGC_LOCK must be held by caller. + * N.B. LOG_LOCK is NOT held during lmGroupCommit(). + */ +void lmGCwrite(log_t * log, int cant_write) +{ + lbuf_t *bp; + logpage_t *lp; + int gcpn; /* group commit page number */ + tblock_t *tblk; + tblock_t *xtblk; + + /* + * build the commit group of a log page + * + * scan commit queue and make a commit group of all + * transactions with COMMIT records on the same log page. + */ + /* get the head tblk on the commit queue */ + tblk = xtblk = log->cqueue.head; + gcpn = tblk->pn; + + while (tblk && tblk->pn == gcpn) { + xtblk = tblk; + + /* state transition: (QUEUE, READY) -> COMMIT */ + tblk->flag |= tblkGC_COMMIT; + tblk = tblk->cqnext; + } + tblk = xtblk; /* last tblk of the page */ + + /* + * pageout to commit transactions on the log page. + */ + bp = (lbuf_t *) tblk->bp; + lp = (logpage_t *) bp->l_ldata; + /* is page already full ? */ + if (tblk->flag & tblkGC_EOP) { + /* mark page to free at end of group commit of the page */ + tblk->flag &= ~tblkGC_EOP; + tblk->flag |= tblkGC_FREE; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + jEVENT(0, + ("gc: tclsn:0x%x, bceor:0x%x\n", tblk->clsn, + bp->l_ceor)); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmGC, + cant_write); + } + /* page is not yet full */ + else { + bp->l_ceor = tblk->eor; /* ? bp->l_ceor = bp->l_eor; */ + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_ceor); + jEVENT(0, + ("gc: tclsn:0x%x, bceor:0x%x\n", tblk->clsn, + bp->l_ceor)); + lbmWrite(log, bp, lbmWRITE | lbmGC, cant_write); + } +} + +/* + * NAME: lmPostGC() + * + * FUNCTION: group commit post-processing + * Processes transactions after their commit records have been written + * to disk, redriving log I/O if necessary. + * + * RETURN: None + * + * NOTE: + * This routine is called a interrupt time by lbmIODone + */ +void lmPostGC(lbuf_t * bp) +{ + unsigned long flags; + log_t *log = bp->l_log; + logpage_t *lp; + tblock_t *tblk; + + //LOGGC_LOCK(log); + spin_lock_irqsave(&log->gclock, flags); + /* + * current pageout of group commit completed. + * + * remove/wakeup transactions from commit queue who were + * group committed with the current log page + */ + while ((tblk = log->cqueue.head) && (tblk->flag & tblkGC_COMMIT)) { + /* if transaction was marked GC_COMMIT then + * it has been shipped in the current pageout + * and made it to disk - it is committed. + */ + + if (bp->l_flag & lbmERROR) + tblk->flag |= tblkGC_ERROR; + + /* remove it from the commit queue */ + log->cqueue.head = tblk->cqnext; + if (log->cqueue.head == NULL) + log->cqueue.tail = NULL; + tblk->flag &= ~tblkGC_QUEUE; + tblk->cqnext = 0; + + jEVENT(0, + ("lmPostGC: tblk = 0x%p, flag = 0x%x\n", tblk, + tblk->flag)); + + if (!(tblk->xflag & COMMIT_FORCE)) + /* + * Hand tblk over to lazy commit thread + */ + txLazyUnlock(tblk); + else { + /* state transition: COMMIT -> COMMITTED */ + tblk->flag |= tblkGC_COMMITTED; + + if (tblk->flag & tblkGC_READY) { + log->gcrtc--; + LOGGC_WAKEUP(tblk); + } + } + + /* was page full before pageout ? + * (and this is the last tblk bound with the page) + */ + if (tblk->flag & tblkGC_FREE) + lbmFree(bp); + /* did page become full after pageout ? + * (and this is the last tblk bound with the page) + */ + else if (tblk->flag & tblkGC_EOP) { + /* finalize the page */ + lp = (logpage_t *) bp->l_ldata; + bp->l_ceor = bp->l_eor; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + jEVENT(0, ("lmPostGC: calling lbmWrite\n")); + lbmWrite(log, bp, lbmWRITE | lbmRELEASE | lbmFREE, + 1); + } + + } + + /* are there any transactions who have entered lnGroupCommit() + * (whose COMMITs are after that of the last log page written. + * They are waiting for new group commit (above at (SLEEP 1)): + * select the latest ready transaction as new group leader and + * wake her up to lead her group. + */ + if ((log->gcrtc > 0) && log->cqueue.head) + /* + * Call lmGCwrite with new group leader + */ + lmGCwrite(log, 1); + + /* no transaction are ready yet (transactions are only just + * queued (GC_QUEUE) and not entered for group commit yet). + * let the first transaction entering group commit + * will elect hetself as new group leader. + */ + else + log->cflag &= ~logGC_PAGEOUT; + + //LOGGC_UNLOCK(log); + spin_unlock_irqrestore(&log->gclock, flags); + return; +} + +/* + * NAME: lmLogSync() + * + * FUNCTION: write log SYNCPT record for specified log + * if new sync address is available + * (normally the case if sync() is executed by back-ground + * process). + * if not, explicitly run jfs_blogsync() to initiate + * getting of new sync address. + * calculate new value of i_nextsync which determines when + * this code is called again. + * + * this is called only from lmLog(). + * + * PARAMETER: ip - pointer to logs inode. + * + * RETURN: 0 + * + * serialization: LOG_LOCK() held on entry/exit + */ +int lmLogSync(log_t * log, int nosyncwait) +{ + int logsize; + int written; /* written since last syncpt */ + int free; /* free space left available */ + int delta; /* additional delta to write normally */ + int more; /* additional write granted */ + lrd_t lrd; + int lsn; + struct logsyncblk *lp; + + /* + * forward syncpt + */ + /* if last sync is same as last syncpt, + * invoke sync point forward processing to update sync. + */ + + if (log->sync == log->syncpt) { + LOGSYNC_LOCK(log); + /* ToDo: push dirty metapages out to disk */ +// bmLogSync(log); + + if (list_empty(&log->synclist)) + log->sync = log->lsn; + else { + lp = list_entry(log->synclist.next, + struct logsyncblk, synclist); + log->sync = lp->lsn; + } + LOGSYNC_UNLOCK(log); + + } + + /* if sync is different from last syncpt, + * write a SYNCPT record with syncpt = sync. + * reset syncpt = sync + */ + if (log->sync != log->syncpt) { + struct jfs_sb_info *sbi = JFS_SBI(log->sb); + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ + fsync_inode_data_buffers(sbi->ipbmap); + fsync_inode_data_buffers(sbi->ipimap); + fsync_inode_data_buffers(sbi->direct_inode); + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = cpu_to_le32(log->sync); + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + + log->syncpt = log->sync; + } else + lsn = log->lsn; + + /* + * setup next syncpt trigger (SWAG) + */ + logsize = log->logsize; + + logdiff(written, lsn, log); + free = logsize - written; + delta = LOGSYNC_DELTA(logsize); + more = min(free / 2, delta); + if (more < 2 * LOGPSIZE) { + jEVENT(1, + ("\n ... Log Wrap ... Log Wrap ... Log Wrap ...\n\n")); + /* + * log wrapping + * + * option 1 - panic ? No.! + * option 2 - shutdown file systems + * associated with log ? + * option 3 - extend log ? + */ + /* + * option 4 - second chance + * + * mark log wrapped, and continue. + * when all active transactions are completed, + * mark log vaild for recovery. + * if crashed during invalid state, log state + * implies invald log, forcing fsck(). + */ + /* mark log state log wrap in log superblock */ + /* log->state = LOGWRAP; */ + + /* reset sync point computation */ + log->syncpt = log->sync = lsn; + log->nextsync = delta; + } else + /* next syncpt trigger = written + more */ + log->nextsync = written + more; + + /* return if lmLogSync() from outside of transaction, e.g., sync() */ + if (nosyncwait) + return lsn; + + /* if number of bytes written from last sync point is more + * than 1/4 of the log size, stop new transactions from + * starting until all current transactions are completed + * by setting syncbarrier flag. + */ + if (written > LOGSYNC_BARRIER(logsize) && logsize > 32 * LOGPSIZE) { + log->syncbarrier = 1; + jFYI(1, ("log barrier on: lsn=0x%x syncpt=0x%x\n", lsn, + log->syncpt)); + } + + return lsn; +} + + +/* + * NAME: lmLogOpen() + * + * FUNCTION: open the log on first open; + * insert filesystem in the active list of the log. + * + * PARAMETER: ipmnt - file system mount inode + * iplog - log inode (out) + * + * RETURN: + * + * serialization: + */ +int lmLogOpen(struct super_block *sb, log_t ** logptr) +{ + int rc; + struct block_device *bdev; + log_t *log; + + if (!(log = kmalloc(sizeof(log_t), GFP_KERNEL))) + return ENOMEM; + memset(log, 0, sizeof(log_t)); + + if (!(JFS_SBI(sb)->mntflag & JFS_INLINELOG)) + goto externalLog; + + /* + * in-line log in host file system + * + * file system to log have 1-to-1 relationship; + */ + + log->sb = sb; /* This should be a list */ + log->flag = JFS_INLINELOG; + log->dev = sb->s_dev; + log->base = addressPXD(&JFS_SBI(sb)->logpxd); + log->size = lengthPXD(&JFS_SBI(sb)->logpxd) >> + (L2LOGPSIZE - sb->s_blocksize_bits); + log->l2bsize = sb->s_blocksize_bits; + ASSERT(L2LOGPSIZE >= sb->s_blocksize_bits); + + /* + * initialize log. + */ + if ((rc = lmLogInit(log))) + goto errout10; + goto out; + + /* + * external log as separate logical volume + * + * file systems to log may have n-to-1 relationship; + */ + externalLog: + if (!(bdev = bdget(kdev_t_to_nr(JFS_SBI(sb)->logdev)))) { + rc = ENODEV; + goto errout10; + } + + if ((rc = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_FS))) { + rc = -rc; + goto errout10; + } + + log->sb = sb; /* This should be a list */ + log->dev = JFS_SBI(sb)->logdev; + log->bdev = bdev; + + /* + * initialize log: + */ + if ((rc = lmLogInit(log))) + goto errout20; + + /* + * add file system to log active file system list + */ + if ((rc = lmLogFileSystem(log, sb->s_dev, 1))) + goto errout30; + + out: + jFYI(1, ("lmLogOpen: exit(0)\n")); + *logptr = log; + return 0; + + /* + * unwind on error + */ + errout30: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + errout20: /* close external log device */ + blkdev_put(bdev, BDEV_FS); + + errout10: /* free log descriptor */ + kfree(log); + + jFYI(1, ("lmLogOpen: exit(%d)\n", rc)); + return rc; +} + + +/* + * NAME: lmLogInit() + * + * FUNCTION: log initialization at first log open. + * + * logredo() (or logformat()) should have been run previously. + * initialize the log inode from log superblock. + * set the log state in the superblock to LOGMOUNT and + * write SYNCPT log record. + * + * PARAMETER: log - log structure + * + * RETURN: 0 - if ok + * EINVAL - bad log magic number or superblock dirty + * error returned from logwait() + * + * serialization: single first open thread + */ +static int lmLogInit(log_t * log) +{ + int rc = 0; + lrd_t lrd; + logsuper_t *logsuper; + lbuf_t *bpsuper; + lbuf_t *bp; + logpage_t *lp; + int lsn; + + jFYI(1, ("lmLogInit: log:0x%p\n", log)); + + /* + * log inode is overlaid on generic inode where + * dinode have been zeroed out by iRead(); + */ + + /* + * initialize log i/o + */ + if ((rc = lbmLogInit(log))) + return rc; + + /* + * validate log superblock + */ + + + if (!(log->flag & JFS_INLINELOG)) + log->l2bsize = 12; /* XXX kludge alert XXX */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto errout10; + + logsuper = (logsuper_t *) bpsuper->l_ldata; + + if (logsuper->magic != cpu_to_le32(LOGMAGIC)) { + jERROR(1, ("*** Log Format Error ! ***\n")); + rc = EINVAL; + goto errout20; + } + + /* logredo() should have been run successfully. */ + if (logsuper->state != cpu_to_le32(LOGREDONE)) { + jERROR(1, ("*** Log Is Dirty ! ***\n")); + rc = EINVAL; + goto errout20; + } + + /* initialize log inode from log superblock */ + if (log->flag & JFS_INLINELOG) { + if (log->size != le32_to_cpu(logsuper->size)) { + rc = EINVAL; + goto errout20; + } + jFYI(0, + ("lmLogInit: inline log:0x%p base:0x%Lx size:0x%x\n", + log, (unsigned long long) log->base, log->size)); + } else { + log->size = le32_to_cpu(logsuper->size); + log->l2bsize = le32_to_cpu(logsuper->l2bsize); + jFYI(0, + ("lmLogInit: external log:0x%p base:0x%Lx size:0x%x\n", + log, (unsigned long long) log->base, log->size)); + } + + log->flag |= JFS_GROUPCOMMIT; +/* + log->flag |= JFS_LAZYCOMMIT; +*/ + log->page = le32_to_cpu(logsuper->end) / LOGPSIZE; + log->eor = le32_to_cpu(logsuper->end) - (LOGPSIZE * log->page); + + /* + * initialize for log append write mode + */ + /* establish current/end-of-log page/buffer */ + if ((rc = lbmRead(log, log->page, &bp))) + goto errout20; + + lp = (logpage_t *) bp->l_ldata; + + jFYI(1, ("lmLogInit: lsn:0x%x page:%d eor:%d:%d\n", + le32_to_cpu(logsuper->end), log->page, log->eor, + le16_to_cpu(lp->h.eor))); + +// ASSERT(log->eor == lp->h.eor); + + log->bp = bp; + bp->l_pn = log->page; + bp->l_eor = log->eor; + + /* initialize the group commit serialization lock */ + LOGGC_LOCK_INIT(log); + + /* if current page is full, move on to next page */ + if (log->eor >= LOGPSIZE - LOGPTLRSIZE) + lmNextPage(log); + + /* allocate/initialize the log write serialization lock */ + LOG_LOCK_INIT(log); + + /* + * initialize log syncpoint + */ + /* + * write the first SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !); + * remove current page from lbm write queue at end of pageout + * (to write log superblock update), but do not release to freelist; + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + bp->l_ceor = bp->l_eor; + lp = (logpage_t *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, bp, lbmWRITE | lbmSYNC, 0); + if ((rc = lbmIOWait(bp, 0))) + goto errout30; + + /* initialize logsync parameters */ + log->logsize = (log->size - 2) << L2LOGPSIZE; + log->lsn = lsn; + log->syncpt = lsn; + log->sync = log->syncpt; + log->nextsync = LOGSYNC_DELTA(log->logsize); + init_waitqueue_head(&log->syncwait); + + jFYI(1, ("lmLogInit: lsn:0x%x syncpt:0x%x sync:0x%x\n", + log->lsn, log->syncpt, log->sync)); + + LOGSYNC_LOCK_INIT(log); + + INIT_LIST_HEAD(&log->synclist); + + log->cqueue.head = log->cqueue.tail = 0; + + log->count = 0; + + /* + * initialize for lazy/group commit + */ + log->clsn = lsn; + + /* + * update/write superblock + */ + logsuper->state = cpu_to_le32(LOGMOUNT); + log->serial = le32_to_cpu(logsuper->serial) + 1; + logsuper->serial = cpu_to_le32(log->serial); + logsuper->device = cpu_to_le32(kdev_t_to_nr(log->dev)); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + if ((rc = lbmIOWait(bpsuper, lbmFREE))) + goto errout30; + + jFYI(1, ("lmLogInit: exit(%d)\n", rc)); + return 0; + + /* + * unwind on error + */ + errout30: /* release log page */ + lbmFree(bp); + + errout20: /* release log superblock */ + lbmFree(bpsuper); + + errout10: /* unwind lbmLogInit() */ + lbmLogShutdown(log); + + jFYI(1, ("lmLogInit: exit(%d)\n", rc)); + return rc; +} + + +/* + * NAME: lmLogClose() + * + * FUNCTION: remove file system from active list of log + * and close it on last close. + * + * PARAMETER: sb - superblock + * log - log inode + * + * RETURN: errors from subroutines + * + * serialization: + */ +int lmLogClose(struct super_block *sb, log_t * log) +{ + int rc; + + jFYI(1, ("lmLogClose: log:0x%p\n", log)); + + if (!(log->flag & JFS_INLINELOG)) + goto externalLog; + + /* + * in-line log in host file system + */ + rc = lmLogShutdown(log); + goto out; + + /* + * external log as separate logical volume + */ + externalLog: + lmLogFileSystem(log, sb->s_dev, 0); + rc = lmLogShutdown(log); + blkdev_put(log->bdev, BDEV_FS); + + out: + jFYI(0, ("lmLogClose: exit(%d)\n", rc)); + return rc; +} + + +/* + * NAME: lmLogShutdown() + * + * FUNCTION: log shutdown at last LogClose(). + * + * write log syncpt record. + * update super block to set redone flag to 0. + * + * PARAMETER: log - log inode + * + * RETURN: 0 - success + * + * serialization: single last close thread + */ +static int lmLogShutdown(log_t * log) +{ + int rc; + lrd_t lrd; + int lsn; + logsuper_t *logsuper; + lbuf_t *bpsuper; + lbuf_t *bp; + logpage_t *lp; + + jFYI(1, ("lmLogShutdown: log:0x%p\n", log)); + + if (log->cqueue.head || !list_empty(&log->synclist)) { + /* + * If there was very recent activity, we may need to wait + * for the lazycommit thread to catch up + */ + int i; + + for (i = 0; i < 800; i++) { /* Too much? */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ / 4); + if ((log->cqueue.head == NULL) && + list_empty(&log->synclist)) + break; + } + } + assert(log->cqueue.head == NULL); + assert(list_empty(&log->synclist)); + + /* + * We need to make sure all of the "written" metapages + * actually make it to disk + */ +#if ( (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,8)) || \ + ( (LINUX_VERSION_CODE == KERNEL_VERSION(2,4,8)) && defined(MODULE) ) ) + /* + * fsync_no_super not added until 2.4.8, not exported until 2.4.9 + */ + { + struct jfs_sb_info *sbi = JFS_SBI(log->sb); + + fsync_inode_data_buffers(sbi->ipbmap); + fsync_inode_data_buffers(sbi->ipimap); + fsync_inode_data_buffers(sbi->direct_inode); + } +#else + fsync_no_super(log->sb->s_dev); +#endif + + /* + * write the last SYNCPT record with syncpoint = 0 + * (i.e., log redo up to HERE !) + */ + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_SYNCPT); + lrd.length = 0; + lrd.log.syncpt.sync = 0; + lsn = lmWriteRecord(log, NULL, &lrd, NULL); + bp = log->bp; + lp = (logpage_t *) bp->l_ldata; + lp->h.eor = lp->t.eor = cpu_to_le16(bp->l_eor); + lbmWrite(log, log->bp, lbmWRITE | lbmRELEASE | lbmSYNC, 0); + lbmIOWait(log->bp, lbmFREE); + + /* + * synchronous update log superblock + * mark log state as shutdown cleanly + * (i.e., Log does not need to be replayed). + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + goto out; + + logsuper = (logsuper_t *) bpsuper->l_ldata; + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->end = cpu_to_le32(lsn); + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + jFYI(1, ("lmLogShutdown: lsn:0x%x page:%d eor:%d\n", + lsn, log->page, log->eor)); + + out: + /* + * shutdown per log i/o + */ + lbmLogShutdown(log); + + if (rc) { + jFYI(1, ("lmLogShutdown: exit(%d)\n", rc)); + } + return rc; +} + + +/* + * NAME: lmLogFileSystem() + * + * FUNCTION: insert ( = true)/remove ( = false) + * file system into/from log active file system list. + * + * PARAMETE: log - pointer to logs inode. + * fsdev - kdev_t of filesystem. + * serial - pointer to returned log serial number + * activate - insert/remove device from active list. + * + * RETURN: 0 - success + * errors returned by vms_iowait(). + * + * serialization: IWRITE_LOCK(log inode) held on entry/exit + */ +static int lmLogFileSystem(log_t * log, kdev_t fsdev, int activate) +{ + int rc = 0; + int i; + u32 dev_le = cpu_to_le32(kdev_t_to_nr(fsdev)); + logsuper_t *logsuper; + lbuf_t *bpsuper; + + /* + * insert/remove file system device to log active file system list. + */ + if ((rc = lbmRead(log, 1, &bpsuper))) + return rc; + + logsuper = (logsuper_t *) bpsuper->l_ldata; + if (activate) { + for (i = 0; i < MAX_ACTIVE; i++) + if (logsuper->active[i] == 0) { + logsuper->active[i] = dev_le; + break; + } + if (i == MAX_ACTIVE) { + jERROR(1,("Too many file systems sharing journal!\n")); + lbmFree(bpsuper); + return EMFILE; /* Is there a better rc? */ + } + } else { + for (i = 0; i < MAX_ACTIVE; i++) + if (logsuper->active[i] == dev_le) { + logsuper->active[i] = 0; + break; + } + assert(i < MAX_ACTIVE); + } + + /* + * synchronous write log superblock: + * + * write sidestream bypassing write queue: + * at file system mount, log super block is updated for + * activation of the file system before any log record + * (MOUNT record) of the file system, and at file system + * unmount, all meta data for the file system has been + * flushed before log super block is updated for deactivation + * of the file system. + */ + lbmDirectWrite(log, bpsuper, lbmWRITE | lbmRELEASE | lbmSYNC); + rc = lbmIOWait(bpsuper, lbmFREE); + + return rc; +} + + +/* + * lmLogQuiesce() + */ +int lmLogQuiesce(log_t * log) +{ + int rc; + + rc = lmLogShutdown(log); + + return rc; +} + + +/* + * lmLogResume() + */ +int lmLogResume(log_t * log, struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + log->base = addressPXD(&sbi->logpxd); + log->size = + (lengthPXD(&sbi->logpxd) << sb->s_blocksize_bits) >> L2LOGPSIZE; + rc = lmLogInit(log); + + return rc; +} + + +/* + * log buffer manager (lbm) + * ------------------------ + * + * special purpose buffer manager supporting log i/o requirements. + * + * per log write queue: + * log pageout occurs in serial order by fifo write queue and + * restricting to a single i/o in pregress at any one time. + * a circular singly-linked list + * (log->wrqueue points to the tail, and buffers are linked via + * bp->wrqueue field), and + * maintains log page in pageout ot waiting for pageout in serial pageout. + */ + +/* + * lbmLogInit() + * + * initialize per log I/O setup at lmLogInit() + */ +static int lbmLogInit(log_t * log) +{ /* log inode */ + int i; + lbuf_t *lbuf; + + jFYI(1, ("lbmLogInit: log:0x%p\n", log)); + + /* initialize current buffer cursor */ + log->bp = NULL; + + /* initialize log device write queue */ + log->wqueue = NULL; + + /* + * Each log has its own buffer pages allocated to it. These are + * not managed by the page cache. This ensures that a transaction + * writing to the log does not block trying to allocate a page from + * the page cache (for the log). This would be bad, since page + * allocation waits on the kswapd thread that may be committing inodes + * which would cause log activity. Was that clear? I'm trying to + * avoid deadlock here. + */ + init_waitqueue_head(&log->free_wait); + + log->lbuf_free = NULL; + + for (i = 0; i < LOGPAGES; i++) { + lbuf = kmalloc(sizeof(lbuf_t), GFP_KERNEL); + if (lbuf == 0) + goto error; + lbuf->l_bh.b_data = lbuf->l_ldata = + (char *) __get_free_page(GFP_KERNEL); + if (lbuf->l_ldata == 0) { + kfree(lbuf); + goto error; + } + lbuf->l_log = log; + init_waitqueue_head(&lbuf->l_ioevent); + + lbuf->l_bh.b_size = LOGPSIZE; + lbuf->l_bh.b_dev = log->dev; + lbuf->l_bh.b_end_io = lbmIODone; + lbuf->l_bh.b_private = lbuf; + lbuf->l_bh.b_page = virt_to_page(lbuf->l_ldata); + lbuf->l_bh.b_state = 0; + init_waitqueue_head(&lbuf->l_bh.b_wait); + + lbuf->l_freelist = log->lbuf_free; + log->lbuf_free = lbuf; + } + + return (0); + + error: + lbmLogShutdown(log); + return (ENOMEM); +} + + +/* + * lbmLogShutdown() + * + * finalize per log I/O setup at lmLogShutdown() + */ +static void lbmLogShutdown(log_t * log) +{ + lbuf_t *lbuf; + + jFYI(1, ("lbmLogShutdown: log:0x%p\n", log)); + + lbuf = log->lbuf_free; + while (lbuf) { + lbuf_t *next = lbuf->l_freelist; + free_page((unsigned long) lbuf->l_ldata); + kfree(lbuf); + lbuf = next; + } + + log->bp = NULL; +} + + +/* + * lbmAllocate() + * + * allocate an empty log buffer + */ +static lbuf_t *lbmAllocate(log_t * log, int pn) +{ + lbuf_t *bp; + unsigned long flags; + + /* + * recycle from log buffer freelist if any + */ + LCACHE_LOCK(flags); + LCACHE_SLEEP_COND(log->free_wait, (bp = log->lbuf_free), flags); + log->lbuf_free = bp->l_freelist; + LCACHE_UNLOCK(flags); + + bp->l_flag = 0; + + bp->l_wqnext = NULL; + bp->l_freelist = NULL; + + bp->l_pn = pn; + bp->l_blkno = log->base + (pn << (L2LOGPSIZE - log->l2bsize)); + bp->l_bh.b_blocknr = bp->l_blkno; + bp->l_ceor = 0; + + return bp; +} + + +/* + * lbmFree() + * + * release a log buffer to freelist + */ +static void lbmFree(lbuf_t * bp) +{ + unsigned long flags; + + LCACHE_LOCK(flags); + + lbmfree(bp); + + LCACHE_UNLOCK(flags); +} + +static void lbmfree(lbuf_t * bp) +{ + log_t *log = bp->l_log; + + assert(bp->l_wqnext == NULL); + + /* + * return the buffer to head of freelist + */ + bp->l_freelist = log->lbuf_free; + log->lbuf_free = bp; + + wake_up(&log->free_wait); + return; +} + + +/* + * NAME: lbmRedrive + * + * FUNCTION: add a log buffer to the the log redrive list + * + * PARAMETER: + * bp - log buffer + * + * NOTES: + * Takes log_redrive_lock. + */ +static inline void lbmRedrive(lbuf_t *bp) +{ + unsigned long flags; + + spin_lock_irqsave(&log_redrive_lock, flags); + bp->l_redrive_next = log_redrive_list; + log_redrive_list = bp; + spin_unlock_irqrestore(&log_redrive_lock, flags); + + wake_up_process(jfsIOtask); +} + + +/* + * lbmRead() + */ +static int lbmRead(log_t * log, int pn, lbuf_t ** bpp) +{ + lbuf_t *bp; + + /* + * allocate a log buffer + */ + *bpp = bp = lbmAllocate(log, pn); + jFYI(1, ("lbmRead: bp:0x%p pn:0x%x\n", bp, pn)); + + bp->l_flag |= lbmREAD; + bp->l_bh.b_reqnext = NULL; + clear_bit(BH_Uptodate, &bp->l_bh.b_state); + lock_buffer(&bp->l_bh); + set_bit(BH_Mapped, &bp->l_bh.b_state); + set_bit(BH_Req, &bp->l_bh.b_state); + bp->l_bh.b_rdev = bp->l_bh.b_dev; + bp->l_bh.b_rsector = bp->l_blkno << (log->l2bsize - 9); + generic_make_request(READ, &bp->l_bh); + run_task_queue(&tq_disk); + + wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); + + return 0; +} + + +/* + * lbmWrite() + * + * buffer at head of pageout queue stays after completion of + * partial-page pageout and redriven by explicit initiation of + * pageout by caller until full-page pageout is completed and + * released. + * + * device driver i/o done redrives pageout of new buffer at + * head of pageout queue when current buffer at head of pageout + * queue is released at the completion of its full-page pageout. + * + * LOGGC_LOCK() serializes lbmWrite() by lmNextPage() and lmGroupCommit(). + * LCACHE_LOCK() serializes xflag between lbmWrite() and lbmIODone() + */ +static void lbmWrite(log_t * log, lbuf_t * bp, int flag, int cant_block) +{ + lbuf_t *tail; + unsigned long flags; + + jFYI(1, ("lbmWrite: bp:0x%p flag:0x%x pn:0x%x\n", + bp, flag, bp->l_pn)); + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + LCACHE_LOCK(flags); /* disable+lock */ + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag; + + /* + * insert bp at tail of write queue associated with log + * + * (request is either for bp already/currently at head of queue + * or new bp to be inserted at tail) + */ + tail = log->wqueue; + + /* is buffer not already on write queue ? */ + if (bp->l_wqnext == NULL) { + /* insert at tail of wqueue */ + if (tail == NULL) { + log->wqueue = bp; + bp->l_wqnext = bp; + } else { + log->wqueue = bp; + bp->l_wqnext = tail->l_wqnext; + tail->l_wqnext = bp; + } + + tail = bp; + } + + /* is buffer at head of wqueue and for write ? */ + if ((bp != tail->l_wqnext) || !(flag & lbmWRITE)) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + return; + } + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + if (cant_block) + lbmRedrive(bp); + else if (flag & lbmSYNC) + lbmStartIO(bp); + else { + LOGGC_UNLOCK(log); + lbmStartIO(bp); + LOGGC_LOCK(log); + } +} + + +/* + * lbmDirectWrite() + * + * initiate pageout bypassing write queue for sidestream + * (e.g., log superblock) write; + */ +static void lbmDirectWrite(log_t * log, lbuf_t * bp, int flag) +{ + jEVENT(0, ("lbmDirectWrite: bp:0x%p flag:0x%x pn:0x%x\n", + bp, flag, bp->l_pn)); + + /* + * initialize buffer for device driver + */ + bp->l_flag = flag | lbmDIRECT; + + /* map the logical block address to physical block address */ + bp->l_blkno = + log->base + (bp->l_pn << (L2LOGPSIZE - log->l2bsize)); + + /* + * initiate pageout of the page + */ + lbmStartIO(bp); +} + + +/* + * NAME: lbmStartIO() + * + * FUNCTION: Interface to DD strategy routine + * + * RETURN: none + * + * serialization: LCACHE_LOCK() is NOT held during log i/o; + */ +void lbmStartIO(lbuf_t * bp) +{ + jFYI(1, ("lbmStartIO\n")); + + bp->l_bh.b_reqnext = NULL; + set_bit(BH_Dirty, &bp->l_bh.b_state); +// lock_buffer(&bp->l_bh); + assert(!test_bit(BH_Lock, &bp->l_bh.b_state)); + set_bit(BH_Lock, &bp->l_bh.b_state); + + set_bit(BH_Mapped, &bp->l_bh.b_state); + set_bit(BH_Req, &bp->l_bh.b_state); + bp->l_bh.b_rdev = bp->l_bh.b_dev; + bp->l_bh.b_rsector = bp->l_blkno << (bp->l_log->l2bsize - 9); + generic_make_request(WRITE, &bp->l_bh); + + INCREMENT(lmStat.submitted); + run_task_queue(&tq_disk); + + jFYI(1, ("lbmStartIO done\n")); +} + + +/* + * lbmIOWait() + */ +static int lbmIOWait(lbuf_t * bp, int flag) +{ + unsigned long flags; + int rc = 0; + + jFYI(1, + ("lbmIOWait1: bp:0x%p flag:0x%x:0x%x\n", bp, bp->l_flag, + flag)); + + LCACHE_LOCK(flags); /* disable+lock */ + + LCACHE_SLEEP_COND(bp->l_ioevent, (bp->l_flag & lbmDONE), flags); + + rc = (bp->l_flag & lbmERROR) ? EIO : 0; + + if (flag & lbmFREE) + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + jFYI(1, + ("lbmIOWait2: bp:0x%p flag:0x%x:0x%x\n", bp, bp->l_flag, + flag)); + return rc; +} + +/* + * lbmIODone() + * + * executed at INTIODONE level + */ +static void lbmIODone(struct buffer_head *bh, int uptodate) +{ + lbuf_t *bp = bh->b_private; + lbuf_t *nextbp, *tail; + log_t *log; + unsigned long flags; + + /* + * get back jfs buffer bound to the i/o buffer + */ + jEVENT(0, ("lbmIODone: bp:0x%p flag:0x%x\n", bp, bp->l_flag)); + + LCACHE_LOCK(flags); /* disable+lock */ + + unlock_buffer(&bp->l_bh); + bp->l_flag |= lbmDONE; + + if (!uptodate) { + bp->l_flag |= lbmERROR; + + jERROR(1, ("lbmIODone: I/O error in JFS log\n")); + } + + /* + * pagein completion + */ + if (bp->l_flag & lbmREAD) { + bp->l_flag &= ~lbmREAD; + + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + + return; + } + + /* + * pageout completion + * + * the bp at the head of write queue has completed pageout. + * + * if single-commit/full-page pageout, remove the current buffer + * from head of pageout queue, and redrive pageout with + * the new buffer at head of pageout queue; + * otherwise, the partial-page pageout buffer stays at + * the head of pageout queue to be redriven for pageout + * by lmGroupCommit() until full-page pageout is completed. + */ + bp->l_flag &= ~lbmWRITE; + INCREMENT(lmStat.pagedone); + + /* update committed lsn */ + log = bp->l_log; + log->clsn = (bp->l_pn << L2LOGPSIZE) + bp->l_ceor; + + if (bp->l_flag & lbmDIRECT) { + LCACHE_WAKEUP(&bp->l_ioevent); + LCACHE_UNLOCK(flags); + return; + } + + tail = log->wqueue; + + /* single element queue */ + if (bp == tail) { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + log->wqueue = NULL; + bp->l_wqnext = NULL; + } + } + /* multi element queue */ + else { + /* remove head buffer of full-page pageout + * from log device write queue + */ + if (bp->l_flag & lbmRELEASE) { + nextbp = tail->l_wqnext = bp->l_wqnext; + bp->l_wqnext = NULL; + + /* + * redrive pageout of next page at head of write queue: + * redrive next page without any bound tblk + * (i.e., page w/o any COMMIT records), or + * first page of new group commit which has been + * queued after current page (subsequent pageout + * is performed synchronously, except page without + * any COMMITs) by lmGroupCommit() as indicated + * by lbmWRITE flag; + */ + if (nextbp->l_flag & lbmWRITE) { + /* + * We can't do the I/O at interrupt time. + * The jfsIO thread can do it + */ + lbmRedrive(nextbp); + } + } + } + + /* + * synchronous pageout: + * + * buffer has not necessarily been removed from write queue + * (e.g., synchronous write of partial-page with COMMIT): + * leave buffer for i/o initiator to dispose + */ + if (bp->l_flag & lbmSYNC) { + LCACHE_UNLOCK(flags); /* unlock+enable */ + + /* wakeup I/O initiator */ + LCACHE_WAKEUP(&bp->l_ioevent); + } + + /* + * Group Commit pageout: + */ + else if (bp->l_flag & lbmGC) { + LCACHE_UNLOCK(flags); + lmPostGC(bp); + } + + /* + * asynchronous pageout: + * + * buffer must have been removed from write queue: + * insert buffer at head of freelist where it can be recycled + */ + else { + assert(bp->l_flag & lbmRELEASE); + assert(bp->l_flag & lbmFREE); + lbmfree(bp); + + LCACHE_UNLOCK(flags); /* unlock+enable */ + } +} + +int jfsIOWait(void *arg) +{ + lbuf_t *bp; + + jFYI(1, ("jfsIOWait is here!\n")); + + lock_kernel(); + + daemonize(); + current->tty = NULL; + strcpy(current->comm, "jfsIO"); + + unlock_kernel(); + + jfsIOtask = current; + + spin_lock_irq(¤t->sigmask_lock); + siginitsetinv(¤t->blocked, + sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) + | sigmask(SIGCONT)); + spin_unlock_irq(¤t->sigmask_lock); + + complete(&jfsIOwait); + + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { + log_redrive_list = bp->l_redrive_next; + bp->l_redrive_next = NULL; + spin_unlock_irq(&log_redrive_lock); + lbmStartIO(bp); + spin_lock_irq(&log_redrive_lock); + } + spin_unlock_irq(&log_redrive_lock); + + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } while (!jfs_thread_stopped()); + + jFYI(1,("jfsIOWait being killed!\n")); + complete(&jfsIOwait); + return 0; +} + + +#ifdef _STILL_TO_PORT +/* + * NAME: lmLogFormat()/jfs_logform() + * + * FUNCTION: format file system log (ref. jfs_logform()). + * + * PARAMETERS: + * log - log inode (with common mount inode base); + * logAddress - start address of log space in FS block; + * logSize - length of log space in FS block; + * + * RETURN: 0 - success + * -1 - i/o error + */ +int lmLogFormat(inode_t * ipmnt, s64 logAddress, int logSize) +{ + int rc = 0; + cbuf_t *bp; + logsuper_t *logsuper; + logpage_t *lp; + int lspn; /* log sequence page number */ + struct lrd *lrd_ptr; + int npbperpage, npages; + + jFYI(0, ("lmLogFormat: logAddress:%Ld logSize:%d\n", + logAddress, logSize)); + + /* allocate a JFS buffer */ + bp = rawAllocate(); + + /* map the logical block address to physical block address */ + bp->cm_blkno = logAddress << ipmnt->i_l2bfactor; + + npbperpage = LOGPSIZE >> ipmnt->i_l2pbsize; + npages = logSize / (LOGPSIZE >> ipmnt->i_l2bsize); + + /* + * log space: + * + * page 0 - reserved; + * page 1 - log superblock; + * page 2 - log data page: A SYNC log record is written + * into this page at logform time; + * pages 3-N - log data page: set to empty log data pages; + */ + /* + * init log superblock: log page 1 + */ + logsuper = (logsuper_t *) bp->cm_cdata; + + logsuper->magic = cpu_to_le32(LOGMAGIC); + logsuper->version = cpu_to_le32(LOGVERSION); + logsuper->state = cpu_to_le32(LOGREDONE); + logsuper->flag = cpu_to_le32(ipmnt->i_mntflag); /* ? */ + logsuper->size = cpu_to_le32(npages); + logsuper->bsize = cpu_to_le32(ipmnt->i_bsize); + logsuper->l2bsize = cpu_to_le32(ipmnt->i_l2bsize); + logsuper->end = + cpu_to_le32(2 * LOGPSIZE + LOGPHDRSIZE + LOGRDSIZE); + + bp->cm_blkno += npbperpage; + rawWrite(ipmnt, bp, 0); + + /* + * init pages 2 to npages-1 as log data pages: + * + * log page sequence number (lpsn) initialization: + * + * pn: 0 1 2 3 n-1 + * +-----+-----+=====+=====+===.....===+=====+ + * lspn: N-1 0 1 N-2 + * <--- N page circular file ----> + * + * the N (= npages-2) data pages of the log is maintained as + * a circular file for the log records; + * lpsn grows by 1 monotonically as each log page is written + * to the circular file of the log; + * Since the AIX DUMMY log record is dropped for this XJFS, + * and setLogpage() will not reset the page number even if + * the eor is equal to LOGPHDRSIZE. In order for binary search + * still work in find log end process, we have to simulate the + * log wrap situation at the log format time. + * The 1st log page written will have the highest lpsn. Then + * the succeeding log pages will have ascending order of + * the lspn starting from 0, ... (N-2) + */ + lp = (logpage_t *) bp->cm_cdata; + + /* + * initialize 1st log page to be written: lpsn = N - 1, + * write a SYNCPT log record is written to this page + */ + lp->h.page = lp->t.page = cpu_to_le32(npages - 3); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE + LOGRDSIZE); + + lrd_ptr = (struct lrd *) &lp->data; + lrd_ptr->logtid = 0; + lrd_ptr->backchain = 0; + lrd_ptr->type = cpu_to_le16(LOG_SYNCPT); + lrd_ptr->length = 0; + lrd_ptr->log.syncpt.sync = 0; + + bp->cm_blkno += npbperpage; + rawWrite(ipmnt, bp, 0); + + /* + * initialize succeeding log pages: lpsn = 0, 1, ..., (N-2) + */ + for (lspn = 0; lspn < npages - 3; lspn++) { + lp->h.page = lp->t.page = cpu_to_le32(lspn); + lp->h.eor = lp->t.eor = cpu_to_le16(LOGPHDRSIZE); + + bp->cm_blkno += npbperpage; + rawWrite(ipmnt, bp, 0); + } + + /* + * finalize log + */ + /* release the buffer */ + rawRelease(bp); + + return rc; +} +#endif /* _STILL_TO_PORT */ + + +#ifdef CONFIG_JFS_STATISTICS +int jfs_lmstats_read(char *buffer, char **start, off_t offset, int length, + int *eof, void *data) +{ + int len = 0; + off_t begin; + + len += sprintf(buffer, + "JFS Logmgr stats\n" + "================\n" + "commits = %d\n" + "writes submitted = %d\n" + "writes completed = %d\n", + lmStat.commit, + lmStat.submitted, + lmStat.pagedone); + + begin = offset; + *start = buffer + begin; + len -= begin; + + if (len > length) + len = length; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +#endif /* CONFIG_JFS_STATISTICS */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_logmgr.h linuxppc64_2_4/fs/jfs/jfs_logmgr.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_logmgr.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_logmgr.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,502 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _H_JFS_LOGMGR +#define _H_JFS_LOGMGR + + +#include "jfs_filsys.h" +#include "jfs_lock.h" + +/* + * log manager configuration parameters + */ + +/* log page size */ +#define LOGPSIZE 4096 +#define L2LOGPSIZE 12 + +#define LOGPAGES 16 /* Log pages per mounted file system */ + +/* + * log logical volume + * + * a log is used to make the commit operation on journalled + * files within the same logical volume group atomic. + * a log is implemented with a logical volume. + * there is one log per logical volume group. + * + * block 0 of the log logical volume is not used (ipl etc). + * block 1 contains a log "superblock" and is used by logFormat(), + * lmLogInit(), lmLogShutdown(), and logRedo() to record status + * of the log but is not otherwise used during normal processing. + * blocks 2 - (N-1) are used to contain log records. + * + * when a volume group is varied-on-line, logRedo() must have + * been executed before the file systems (logical volumes) in + * the volume group can be mounted. + */ +/* + * log superblock (block 1 of logical volume) + */ +#define LOGSUPER_B 1 +#define LOGSTART_B 2 + +#define LOGMAGIC 0x87654321 +#define LOGVERSION 1 + +#define MAX_ACTIVE 512 /* Max active file systems sharing log */ + +typedef struct { + u32 magic; /* 4: log lv identifier */ + s32 version; /* 4: version number */ + s32 serial; /* 4: log open/mount counter */ + s32 size; /* 4: size in number of LOGPSIZE blocks */ + s32 bsize; /* 4: logical block size in byte */ + s32 l2bsize; /* 4: log2 of bsize */ + + u32 flag; /* 4: option */ + u32 state; /* 4: state - see below */ + + s32 end; /* 4: addr of last log record set by logredo */ + u32 device; /* 4: save device in case location changes */ + u32 active[MAX_ACTIVE]; /* 2048: active file systems list */ +} logsuper_t; + +/* log flag: commit option (see jfs_filsys.h) */ + +/* log state */ +#define LOGMOUNT 0 /* log mounted by lmLogInit() */ +#define LOGREDONE 1 /* log shutdown by lmLogShutdown(). + * log redo completed by logredo(). + */ +#define LOGWRAP 2 /* log wrapped */ +#define LOGREADERR 3 /* log read error detected in logredo() */ + + +/* + * log logical page + * + * (this comment should be rewritten !) + * the header and trailer structures (h,t) will normally have + * the same page and eor value. + * An exception to this occurs when a complete page write is not + * accomplished on a power failure. Since the hardware may "split write" + * sectors in the page, any out of order sequence may occur during powerfail + * and needs to be recognized during log replay. The xor value is + * an "exclusive or" of all log words in the page up to eor. This + * 32 bit eor is stored with the top 16 bits in the header and the + * bottom 16 bits in the trailer. logredo can easily recognize pages + * that were not completed by reconstructing this eor and checking + * the log page. + * + * Previous versions of the operating system did not allow split + * writes and detected partially written records in logredo by + * ordering the updates to the header, trailer, and the move of data + * into the logdata area. The order: (1) data is moved (2) header + * is updated (3) trailer is updated. In logredo, when the header + * differed from the trailer, the header and trailer were reconciled + * as follows: if h.page != t.page they were set to the smaller of + * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) + * h.eor != t.eor they were set to the smaller of their two values. + */ +typedef struct { + struct { /* header */ + s32 page; /* 4: log sequence page number */ + s16 rsrvd; /* 2: */ + s16 eor; /* 2: end-of-log offset of lasrt record write */ + } h; + + s32 data[LOGPSIZE / 4 - 4]; /* log record area */ + + struct { /* trailer */ + s32 page; /* 4: normally the same as h.page */ + s16 rsrvd; /* 2: */ + s16 eor; /* 2: normally the same as h.eor */ + } t; +} logpage_t; + +#define LOGPHDRSIZE 8 /* log page header size */ +#define LOGPTLRSIZE 8 /* log page trailer size */ + + +/* + * log record + * + * (this comment should be rewritten !) + * jfs uses only "after" log records (only a single writer is allowed + * in a page, pages are written to temporary paging space if + * if they must be written to disk before commit, and i/o is + * scheduled for modified pages to their home location after + * the log records containing the after values and the commit + * record is written to the log on disk, undo discards the copy + * in main-memory.) + * + * a log record consists of a data area of variable length followed by + * a descriptor of fixed size LOGRDSIZE bytes. + * the data area is rounded up to an integral number of 4-bytes and + * must be no longer than LOGPSIZE. + * the descriptor is of size of multiple of 4-bytes and aligned on a + * 4-byte boundary. + * records are packed one after the other in the data area of log pages. + * (sometimes a DUMMY record is inserted so that at least one record ends + * on every page or the longest record is placed on at most two pages). + * the field eor in page header/trailer points to the byte following + * the last record on a page. + */ + +/* log record types */ +#define LOG_COMMIT 0x8000 +#define LOG_SYNCPT 0x4000 +#define LOG_MOUNT 0x2000 +#define LOG_REDOPAGE 0x0800 +#define LOG_NOREDOPAGE 0x0080 +#define LOG_NOREDOINOEXT 0x0040 +#define LOG_UPDATEMAP 0x0008 +#define LOG_NOREDOFILE 0x0001 + +/* REDOPAGE/NOREDOPAGE log record data type */ +#define LOG_INODE 0x0001 +#define LOG_XTREE 0x0002 +#define LOG_DTREE 0x0004 +#define LOG_BTROOT 0x0010 +#define LOG_EA 0x0020 +#define LOG_ACL 0x0040 +#define LOG_DATA 0x0080 +#define LOG_NEW 0x0100 +#define LOG_EXTEND 0x0200 +#define LOG_RELOCATE 0x0400 +#define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ + +/* UPDATEMAP log record descriptor type */ +#define LOG_ALLOCXADLIST 0x0080 +#define LOG_ALLOCPXDLIST 0x0040 +#define LOG_ALLOCXAD 0x0020 +#define LOG_ALLOCPXD 0x0010 +#define LOG_FREEXADLIST 0x0008 +#define LOG_FREEPXDLIST 0x0004 +#define LOG_FREEXAD 0x0002 +#define LOG_FREEPXD 0x0001 + + +typedef struct lrd { + /* + * type independent area + */ + s32 logtid; /* 4: log transaction identifier */ + s32 backchain; /* 4: ptr to prev record of same transaction */ + u16 type; /* 2: record type */ + s16 length; /* 2: length of data in record (in byte) */ + u32 aggregate; /* 4: file system lv/aggregate */ + /* (16) */ + + /* + * type dependent area (20) + */ + union { + + /* + * COMMIT: commit + * + * transaction commit: no type-dependent information; + */ + + /* + * REDOPAGE: after-image + * + * apply after-image; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + u32 fileset; /* 4: fileset number */ + u32 inode; /* 4: inode number */ + u16 type; /* 2: REDOPAGE record type */ + s16 l2linesize; /* 2: log2 of line size */ + pxd_t pxd; /* 8: on-disk page pxd */ + } redopage; /* (20) */ + + /* + * NOREDOPAGE: the page is freed + * + * do not apply after-image records which precede this record + * in the log with the same page block number to this page. + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + s32 fileset; /* 4: fileset number */ + u32 inode; /* 4: inode number */ + u16 type; /* 2: NOREDOPAGE record type */ + s16 rsrvd; /* 2: reserved */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredopage; /* (20) */ + + /* + * UPDATEMAP: update block allocation map + * + * either in-line PXD, + * or out-of-line XADLIST; + * + * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; + */ + struct { + u32 fileset; /* 4: fileset number */ + u32 inode; /* 4: inode number */ + u16 type; /* 2: UPDATEMAP record type */ + s16 nxd; /* 2: number of extents */ + pxd_t pxd; /* 8: pxd */ + } updatemap; /* (20) */ + + /* + * NOREDOINOEXT: the inode extent is freed + * + * do not apply after-image records which precede this + * record in the log with the any of the 4 page block + * numbers in this inode extent. + * + * NOTE: The fileset and pxd fields MUST remain in + * the same fields in the REDOPAGE record format. + * + */ + struct { + s32 fileset; /* 4: fileset number */ + s32 iagnum; /* 4: IAG number */ + s32 inoext_idx; /* 4: inode extent index */ + pxd_t pxd; /* 8: on-disk page pxd */ + } noredoinoext; /* (20) */ + + /* + * SYNCPT: log sync point + * + * replay log upto syncpt address specified; + */ + struct { + s32 sync; /* 4: syncpt address (0 = here) */ + } syncpt; + + /* + * MOUNT: file system mount + * + * file system mount: no type-dependent information; + */ + + /* + * ? FREEXTENT: free specified extent(s) + * + * free specified extent(s) from block allocation map + * N.B.: nextents should be length of data/sizeof(xad_t) + */ + struct { + s32 type; /* 4: FREEXTENT record type */ + s32 nextent; /* 4: number of extents */ + + /* data: PXD or XAD list */ + } freextent; + + /* + * ? NOREDOFILE: this file is freed + * + * do not apply records which precede this record in the log + * with the same inode number. + * + * NOREDILE must be the first to be written at commit + * (last to be read in logredo()) - it prevents + * replay of preceding updates of all preceding generations + * of the inumber esp. the on-disk inode itself, + * but does NOT prevent + * replay of the + */ + struct { + s32 fileset; /* 4: fileset number */ + u32 inode; /* 4: inode number */ + } noredofile; + + /* + * ? NEWPAGE: + * + * metadata type dependent + */ + struct { + s32 fileset; /* 4: fileset number */ + u32 inode; /* 4: inode number */ + s32 type; /* 4: NEWPAGE record type */ + pxd_t pxd; /* 8: on-disk page pxd */ + } newpage; + + /* + * ? DUMMY: filler + * + * no type-dependent information + */ + } log; +} lrd_t; /* (36) */ + +#define LOGRDSIZE (sizeof(struct lrd)) + +/* + * line vector descriptor + */ +typedef struct { + s16 offset; + s16 length; +} lvd_t; + + +/* + * log logical volume + */ +typedef struct jfs_log { + + struct super_block *sb; /* 4: This is used to sync metadata + * before writing syncpt. Will + * need to be a list if we share + * the log between fs's + */ + kdev_t dev; /* 4: log lv number */ + struct block_device *bdev; /* 4: log lv pointer */ + s32 serial; /* 4: log mount serial number */ + + s64 base; /* @8: log extent address (inline log ) */ + int size; /* 4: log size in log page (in page) */ + int l2bsize; /* 4: log2 of bsize */ + + uint flag; /* 4: flag */ + uint state; /* 4: state */ + + struct lbuf *lbuf_free; /* 4: free lbufs */ + wait_queue_head_t free_wait; /* 4: */ + + /* log write */ + int logtid; /* 4: log tid */ + int page; /* 4: page number of eol page */ + int eor; /* 4: eor of last record in eol page */ + struct lbuf *bp; /* 4: current log page buffer */ + + struct semaphore loglock; /* 4: log write serialization lock */ + + /* syncpt */ + int nextsync; /* 4: bytes to write before next syncpt */ + int active; /* 4: */ + int syncbarrier; /* 4: */ + wait_queue_head_t syncwait; /* 4: */ + + /* commit */ + uint cflag; /* 4: */ + struct { /* 8: FIFO commit queue header */ + struct tblock *head; + struct tblock *tail; + } cqueue; + int gcrtc; /* 4: GC_READY transaction count */ + struct tblock *gclrt; /* 4: latest GC_READY transaction */ + spinlock_t gclock; /* 4: group commit lock */ + int logsize; /* 4: log data area size in byte */ + int lsn; /* 4: end-of-log */ + int clsn; /* 4: clsn */ + int syncpt; /* 4: addr of last syncpt record */ + int sync; /* 4: addr from last logsync() */ + struct list_head synclist; /* 8: logsynclist anchor */ + spinlock_t synclock; /* 4: synclist lock */ + struct lbuf *wqueue; /* 4: log pageout queue */ + int count; /* 4: count */ +} log_t; + +/* + * group commit flag + */ +/* log_t */ +#define logGC_PAGEOUT 0x00000001 + +/* tblock_t/lbuf_t */ +#define tblkGC_QUEUE 0x0001 +#define tblkGC_READY 0x0002 +#define tblkGC_COMMIT 0x0004 +#define tblkGC_COMMITTED 0x0008 +#define tblkGC_EOP 0x0010 +#define tblkGC_FREE 0x0020 +#define tblkGC_LEADER 0x0040 +#define tblkGC_ERROR 0x0080 +#define tblkGC_LAZY 0x0100 // D230860 +#define tblkGC_UNLOCKED 0x0200 // D230860 + +/* + * log cache buffer header + */ +typedef struct lbuf { + struct buffer_head l_bh; /* for doing I/O */ + log_t *l_log; /* 4: log associated with buffer */ + + /* + * data buffer base area + */ + uint l_flag; /* 4: pageout control flags */ + + struct lbuf *l_wqnext; /* 4: write queue link */ + struct lbuf *l_freelist; /* 4: freelistlink */ + + int l_pn; /* 4: log page number */ + int l_eor; /* 4: log record eor */ + int l_ceor; /* 4: committed log record eor */ + + s64 l_blkno; /* 8: log page block number */ + caddr_t l_ldata; /* 4: data page */ + + wait_queue_head_t l_ioevent; /* 4: i/o done event */ + struct page *l_page; /* The page itself */ +} lbuf_t; + +/* Reuse l_freelist for redrive list */ +#define l_redrive_next l_freelist + +/* + * logsynclist block + * + * common logsyncblk prefix for jbuf_t and tblock_t + */ +typedef struct logsyncblk { + u16 xflag; /* flags */ + u16 flag; /* only meaninful in tblock_t */ + lid_t lid; /* lock id */ + s32 lsn; /* log sequence number */ + struct list_head synclist; /* log sync list link */ +} logsyncblk_t; + +/* + * logsynclist serialization (per log) + */ + +#define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) +#define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock) +#define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock) + +/* compute the difference in bytes of lsn from sync point */ +#define logdiff(diff, lsn, log)\ +{\ + diff = (lsn) - (log)->syncpt;\ + if (diff < 0)\ + diff += (log)->logsize;\ +} + +extern int lmLogOpen(struct super_block *sb, log_t ** log); +extern int lmLogClose(struct super_block *sb, log_t * log); +extern int lmLogSync(log_t * log, int nosyncwait); +extern int lmLogQuiesce(log_t * log); +extern int lmLogResume(log_t * log, struct super_block *sb); +extern int lmLogFormat(struct super_block *sb, s64 logAddress, int logSize); + +#endif /* _H_JFS_LOGMGR */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_metapage.c linuxppc64_2_4/fs/jfs/jfs_metapage.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_metapage.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_metapage.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,688 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Module: jfs/jfs_metapage.c + * + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_txnmgr.h" +#include "jfs_debug.h" + +extern struct task_struct *jfsCommitTask; +static unsigned int metapages = 1024; /* ??? Need a better number */ +static unsigned int free_metapages; +static metapage_t *metapage_buf; +static unsigned long meta_order; +static metapage_t *meta_free_list = NULL; +static spinlock_t meta_lock = SPIN_LOCK_UNLOCKED; +static wait_queue_head_t meta_wait; + +#ifdef CONFIG_JFS_STATISTICS +struct { + uint pagealloc; /* # of page allocations */ + uint pagefree; /* # of page frees */ + uint lockwait; /* # of sleeping lock_metapage() calls */ + uint allocwait; /* # of sleeping alloc_metapage() calls */ +} mpStat; +#endif + + +#define HASH_BITS 10 /* This makes hash_table 1 4K page */ +#define HASH_SIZE (1 << HASH_BITS) +static metapage_t **hash_table = NULL; +static unsigned long hash_order; + + +static inline int metapage_locked(struct metapage *mp) +{ + return test_bit(META_locked, &mp->flag); +} + +static inline int trylock_metapage(struct metapage *mp) +{ + return test_and_set_bit(META_locked, &mp->flag); +} + +static inline void unlock_metapage(struct metapage *mp) +{ + clear_bit(META_locked, &mp->flag); + wake_up(&mp->wait); +} + +static void __lock_metapage(struct metapage *mp) +{ + DECLARE_WAITQUEUE(wait, current); + + INCREMENT(mpStat.lockwait); + + add_wait_queue_exclusive(&mp->wait, &wait); + do { + set_current_state(TASK_UNINTERRUPTIBLE); + if (metapage_locked(mp)) { + spin_unlock(&meta_lock); + schedule(); + spin_lock(&meta_lock); + } + } while (trylock_metapage(mp)); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&mp->wait, &wait); +} + +/* needs meta_lock */ +static inline void lock_metapage(struct metapage *mp) +{ + if (trylock_metapage(mp)) + __lock_metapage(mp); +} + +/* We're currently re-evaluating the method we use to write metadata + * pages. Currently, we have to make sure there no dirty buffer_heads + * hanging around after we free the metadata page, since the same + * physical disk blocks may be used in a different address space and we + * can't write old data over the good data. + * + * The best way to do this now is with block_invalidate_page. However, + * this is only available in the newer kernels and is not exported + * to modules. block_flushpage is the next best, but it too is not exported + * to modules. + * + * In a module, about the best we have is generic_buffer_fdatasync. This + * synchronously writes any dirty buffers. This is not optimal, but it will + * keep old dirty buffers from overwriting newer data. + */ +static inline void invalidate_page(metapage_t *mp) +{ +#ifdef MODULE + generic_buffer_fdatasync(mp->mapping->host, mp->index, mp->index + 1); +#else + lock_page(mp->page); + block_flushpage(mp->page, 0); + UnlockPage(mp->page); +#endif +} + +int __init metapage_init(void) +{ + int i; + metapage_t *last = NULL; + metapage_t *mp; + + /* + * Initialize wait queue + */ + init_waitqueue_head(&meta_wait); + + /* + * Allocate the metapage structures + */ + for (meta_order = 0; + ((PAGE_SIZE << meta_order) / sizeof(metapage_t)) < metapages; + meta_order++); + metapages = (PAGE_SIZE << meta_order) / sizeof(metapage_t); + + jFYI(1, ("metapage_init: metapage size = %Zd, metapages = %d\n", + sizeof(metapage_t), metapages)); + + metapage_buf = + (metapage_t *) __get_free_pages(GFP_KERNEL, meta_order); + assert(metapage_buf); + memset(metapage_buf, 0, PAGE_SIZE << meta_order); + + mp = metapage_buf; + for (i = 0; i < metapages; i++, mp++) { + mp->flag = 0; + set_bit(META_free, &mp->flag); + init_waitqueue_head(&mp->wait); + mp->hash_next = last; + last = mp; + } + meta_free_list = last; + free_metapages = metapages; + + /* + * Now the hash list + */ + for (hash_order = 0; + ((PAGE_SIZE << hash_order) / sizeof(void *)) < HASH_SIZE; + hash_order++); + hash_table = + (metapage_t **) __get_free_pages(GFP_KERNEL, hash_order); + assert(hash_table); + memset(hash_table, 0, PAGE_SIZE << hash_order); + + return 0; +} + +void metapage_exit(void) +{ + free_pages((unsigned long) metapage_buf, meta_order); + free_pages((unsigned long) hash_table, hash_order); + metapage_buf = 0; /* This is a signal to the jfsIOwait thread */ +} + +/* + * Get metapage structure from freelist + * + * Caller holds meta_lock + */ +static metapage_t *alloc_metapage(int *dropped_lock) +{ + metapage_t *new; + + *dropped_lock = FALSE; + + /* + * Reserve two metapages for the lazy commit thread. Otherwise + * we may deadlock with holders of metapages waiting for tlocks + * that lazy thread should be freeing. + */ + if ((free_metapages < 3) && (current != jfsCommitTask)) { + INCREMENT(mpStat.allocwait); + *dropped_lock = TRUE; + __SLEEP_COND(meta_wait, (free_metapages > 2), + spin_lock(&meta_lock), spin_unlock(&meta_lock)); + } + + assert(meta_free_list); + + new = meta_free_list; + meta_free_list = new->hash_next; + free_metapages--; + + return new; +} + +/* + * Put metapage on freelist (holding meta_lock) + */ +static inline void __free_metapage(metapage_t * mp) +{ + mp->flag = 0; + set_bit(META_free, &mp->flag); + mp->hash_next = meta_free_list; + meta_free_list = mp; + free_metapages++; + wake_up(&meta_wait); +} + +/* + * Put metapage on freelist (not holding meta_lock) + */ +static inline void free_metapage(metapage_t * mp) +{ + spin_lock(&meta_lock); + __free_metapage(mp); + spin_unlock(&meta_lock); +} + +/* + * Basically same hash as in pagemap.h, but using our hash table + */ +static metapage_t **meta_hash(struct address_space *mapping, + unsigned long index) +{ +#define i (((unsigned long)mapping)/ \ + (sizeof(struct inode) & ~(sizeof(struct inode) -1 ))) +#define s(x) ((x) + ((x) >> HASH_BITS)) + return hash_table + (s(i + index) & (HASH_SIZE - 1)); +#undef i +#undef s +} + +static metapage_t *search_hash(metapage_t ** hash_ptr, + struct address_space *mapping, + unsigned long index) +{ + metapage_t *ptr; + + for (ptr = *hash_ptr; ptr; ptr = ptr->hash_next) { + if ((ptr->mapping == mapping) && (ptr->index == index)) + return ptr; + } + + return NULL; +} + +static void add_to_hash(metapage_t * mp, metapage_t ** hash_ptr) +{ + if (*hash_ptr) + (*hash_ptr)->hash_prev = mp; + + mp->hash_prev = NULL; + mp->hash_next = *hash_ptr; + *hash_ptr = mp; + list_add(&mp->inode_list, &JFS_IP(mp->mapping->host)->mp_list); +} + +static void remove_from_hash(metapage_t * mp, metapage_t ** hash_ptr) +{ + list_del(&mp->inode_list); + + if (mp->hash_prev) + mp->hash_prev->hash_next = mp->hash_next; + else { + assert(*hash_ptr == mp); + *hash_ptr = mp->hash_next; + } + + if (mp->hash_next) + mp->hash_next->hash_prev = mp->hash_prev; +} + +/* + * Direct address space operations + */ + +static int direct_get_block(struct inode *ip, long lblock, + struct buffer_head *bh_result, int create) +{ + bh_result->b_dev = ip->i_dev; + bh_result->b_blocknr = lblock; + if (create) + bh_result->b_state |= (1UL << BH_Mapped) | (1UL << BH_New); + else + bh_result->b_state |= (1UL << BH_Mapped); + + return 0; +} + +static int direct_writepage(struct page *page) +{ + return block_write_full_page(page, direct_get_block); +} + +static int direct_readpage(struct file *fp, struct page *page) +{ + return block_read_full_page(page, direct_get_block); +} + +static int direct_prepare_write(struct file *file, struct page *page, + unsigned from, unsigned to) +{ + return block_prepare_write(page, from, to, direct_get_block); +} + +static int direct_bmap(struct address_space *mapping, long block) +{ + return generic_block_bmap(mapping, block, direct_get_block); +} + +struct address_space_operations direct_aops = { + readpage: direct_readpage, + writepage: direct_writepage, + sync_page: block_sync_page, + prepare_write: direct_prepare_write, + commit_write: generic_commit_write, + bmap: direct_bmap, +}; + +metapage_t *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new) +{ + int dropped_lock; + metapage_t **hash_ptr; + int l2BlocksPerPage; + int l2bsize; + struct address_space *mapping; + metapage_t *mp; + unsigned long page_index; + unsigned long page_offset; + + jFYI(1, ("__get_metapage: inode = 0x%p, lblock = 0x%lx\n", + inode, lblock)); + + if (absolute) + mapping = JFS_SBI(inode->i_sb)->direct_mapping; + else + mapping = inode->i_mapping; + + spin_lock(&meta_lock); + + hash_ptr = meta_hash(mapping, lblock); + + mp = search_hash(hash_ptr, mapping, lblock); + if (mp) { + page_found: + if (test_bit(META_discard, &mp->flag)) { + assert(new); /* It's okay to reuse a discarded + * if we expect it to be empty + */ + clear_bit(META_discard, &mp->flag); + } + mp->count++; + jFYI(1, ("__get_metapage: found 0x%p, in hash\n", mp)); + assert(mp->logical_size == size); + lock_metapage(mp); + spin_unlock(&meta_lock); + } else { + l2bsize = inode->i_sb->s_blocksize_bits; + l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + page_index = lblock >> l2BlocksPerPage; + page_offset = (lblock - (page_index << l2BlocksPerPage)) << + l2bsize; + if ((page_offset + size) > PAGE_SIZE) { + spin_unlock(&meta_lock); + jERROR(1, ("MetaData crosses page boundary!!\n")); + return NULL; + } + + mp = alloc_metapage(&dropped_lock); + if (dropped_lock) { + /* alloc_metapage blocked, we need to search the hash + * again. (The goto is ugly, maybe we'll clean this + * up in the future.) + */ + metapage_t *mp2; + mp2 = search_hash(hash_ptr, mapping, lblock); + if (mp2) { + __free_metapage(mp); + mp = mp2; + goto page_found; + } + } + mp->flag = 0; + lock_metapage(mp); + if (absolute) + set_bit(META_absolute, &mp->flag); + mp->xflag = COMMIT_PAGE; + mp->count = 1; + atomic_set(&mp->nohomeok,0); + mp->mapping = mapping; + mp->index = lblock; + mp->page = 0; + mp->logical_size = size; + add_to_hash(mp, hash_ptr); + spin_unlock(&meta_lock); + + if (new) { + jFYI(1, + ("__get_metapage: Calling grab_cache_page\n")); + mp->page = grab_cache_page(mapping, page_index); + if (!mp->page) { + jERROR(1, ("grab_cache_page failed!\n")); + spin_lock(&meta_lock); + remove_from_hash(mp, hash_ptr); + __free_metapage(mp); + spin_unlock(&meta_lock); + return NULL; + } else + INCREMENT(mpStat.pagealloc); + } else { + jFYI(1, + ("__get_metapage: Calling read_cache_page\n")); + mp->page = + read_cache_page(mapping, lblock, + (filler_t *) mapping->a_ops-> + readpage, NULL); + if (IS_ERR(mp->page)) { + jERROR(1, ("read_cache_page failed!\n")); + spin_lock(&meta_lock); + remove_from_hash(mp, hash_ptr); + __free_metapage(mp); + spin_unlock(&meta_lock); + return NULL; + } else + INCREMENT(mpStat.pagealloc); + lock_page(mp->page); + } + mp->data = (void *) (kmap(mp->page) + page_offset); + } + jFYI(1, ("__get_metapage: returning = 0x%p\n", mp)); + return mp; +} + +void hold_metapage(metapage_t * mp, int force) +{ + spin_lock(&meta_lock); + + mp->count++; + + if (force) { + ASSERT (!(test_bit(META_forced, &mp->flag))); + if (trylock_metapage(mp)) + set_bit(META_forced, &mp->flag); + } else + lock_metapage(mp); + + spin_unlock(&meta_lock); +} + +static void __write_metapage(metapage_t * mp) +{ + struct inode *ip = (struct inode *) mp->mapping->host; + unsigned long page_index; + unsigned long page_offset; + int rc; + int l2bsize = ip->i_sb->s_blocksize_bits; + int l2BlocksPerPage = PAGE_CACHE_SHIFT - l2bsize; + + jFYI(1, ("__write_metapage: mp = 0x%p\n", mp)); + + if (test_bit(META_discard, &mp->flag)) { + /* + * This metadata is no longer valid + */ + clear_bit(META_dirty, &mp->flag); + return; + } + + page_index = mp->page->index; + page_offset = + (mp->index - (page_index << l2BlocksPerPage)) << l2bsize; + + rc = mp->mapping->a_ops->prepare_write(NULL, mp->page, page_offset, + page_offset + + mp->logical_size); + if (rc) { + jERROR(1, ("prepare_write return %d!\n", rc)); + ClearPageUptodate(mp->page); + kunmap(mp->page); + clear_bit(META_dirty, &mp->flag); + return; + } + rc = mp->mapping->a_ops->commit_write(NULL, mp->page, page_offset, + page_offset + + mp->logical_size); + if (rc) { + jERROR(1, ("commit_write returned %d\n", rc)); + } + + clear_bit(META_dirty, &mp->flag); + + jFYI(1, ("__write_metapage done\n")); +} + +void release_metapage(metapage_t * mp) +{ + log_t *log; + struct inode *ip; + + jFYI(1, + ("release_metapage: mp = 0x%p, flag = 0x%lx\n", mp, + mp->flag)); + + spin_lock(&meta_lock); + if (test_bit(META_forced, &mp->flag)) { + clear_bit(META_forced, &mp->flag); + mp->count--; + spin_unlock(&meta_lock); + return; + } + + ip = (struct inode *) mp->mapping->host; + + assert(mp->count); + if (--mp->count || atomic_read(&mp->nohomeok)) { + unlock_metapage(mp); + spin_unlock(&meta_lock); + } else { + remove_from_hash(mp, meta_hash(mp->mapping, mp->index)); + spin_unlock(&meta_lock); + + if (mp->page) { + kunmap(mp->page); + mp->data = 0; + if (test_bit(META_dirty, &mp->flag)) + __write_metapage(mp); + UnlockPage(mp->page); + if (test_bit(META_sync, &mp->flag)) { + sync_metapage(mp); + clear_bit(META_sync, &mp->flag); + } + + if (test_bit(META_discard, &mp->flag)) + invalidate_page(mp); + + page_cache_release(mp->page); + INCREMENT(mpStat.pagefree); + } + + if (mp->lsn) { + /* + * Remove metapage from logsynclist. + */ + log = mp->log; + LOGSYNC_LOCK(log); + mp->log = 0; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del(&mp->synclist); + LOGSYNC_UNLOCK(log); + } + + free_metapage(mp); + } + jFYI(1, ("release_metapage: done\n")); +} + +void invalidate_metapages(struct inode *ip, unsigned long addr, + unsigned long len) +{ + metapage_t **hash_ptr; + unsigned long lblock; + int l2BlocksPerPage = PAGE_CACHE_SHIFT - ip->i_sb->s_blocksize_bits; + struct address_space *mapping = ip->i_mapping; + metapage_t *mp; +#ifndef MODULE + struct page *page; +#endif + + /* + * First, mark metapages to discard. They will eventually be + * released, but should not be written. + */ + for (lblock = addr; lblock < addr + len; + lblock += 1 << l2BlocksPerPage) { + hash_ptr = meta_hash(mapping, lblock); + spin_lock(&meta_lock); + mp = search_hash(hash_ptr, mapping, lblock); + if (mp) { + set_bit(META_discard, &mp->flag); + spin_unlock(&meta_lock); + /* + * If in the metapage cache, we've got the page locked + */ +#ifdef MODULE + UnlockPage(mp->page); + generic_buffer_fdatasync(mp->mapping->host, mp->index, + mp->index+1); + lock_page(mp->page); +#else + block_flushpage(mp->page, 0); +#endif + } else { + spin_unlock(&meta_lock); +#ifdef MODULE + generic_buffer_fdatasync(ip, lblock << l2BlocksPerPage, + (lblock + 1) << l2BlocksPerPage); +#else + page = find_lock_page(mapping, + lblock >> l2BlocksPerPage); + if (page) { + block_flushpage(page, 0); + UnlockPage(page); + } +#endif + } + } +} + +void invalidate_inode_metapages(struct inode *inode) +{ + struct list_head *ptr; + metapage_t *mp; + + spin_lock(&meta_lock); + list_for_each(ptr, &JFS_IP(inode)->mp_list) { + mp = list_entry(ptr, metapage_t, inode_list); + clear_bit(META_dirty, &mp->flag); + set_bit(META_discard, &mp->flag); + kunmap(mp->page); + UnlockPage(mp->page); + page_cache_release(mp->page); + INCREMENT(mpStat.pagefree); + mp->data = 0; + mp->page = 0; + } + spin_unlock(&meta_lock); + truncate_inode_pages(inode->i_mapping, 0); +} + +#ifdef CONFIG_JFS_STATISTICS +int jfs_mpstat_read(char *buffer, char **start, off_t offset, int length, + int *eof, void *data) +{ + int len = 0; + off_t begin; + + len += sprintf(buffer, + "JFS Metapage statistics\n" + "=======================\n" + "metapages in use = %d\n" + "page allocations = %d\n" + "page frees = %d\n" + "lock waits = %d\n" + "allocation waits = %d\n", + metapages - free_metapages, + mpStat.pagealloc, + mpStat.pagefree, + mpStat.lockwait, + mpStat.allocwait); + + begin = offset; + *start = buffer + begin; + len -= begin; + + if (len > length) + len = length; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_metapage.h linuxppc64_2_4/fs/jfs/jfs_metapage.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_metapage.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_metapage.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,123 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#ifndef _H_JFS_METAPAGE +#define _H_JFS_METAPAGE + +#include + +typedef struct metapage { + /* Common logsyncblk prefix (see jfs_logmgr.h) */ + u16 xflag; + u16 unused; + lid_t lid; + int lsn; + struct list_head synclist; + /* End of logsyncblk prefix */ + + unsigned long flag; /* See Below */ + unsigned long count; /* Reference count */ + void *data; /* Data pointer */ + + /* list management stuff */ + struct metapage *hash_prev; + struct metapage *hash_next; /* Also used for free list */ + + struct list_head inode_list; /* per-inode metapage list */ + /* + * mapping & index become redundant, but we need these here to + * add the metapage to the hash before we have the real page + */ + struct address_space *mapping; + unsigned long index; + wait_queue_head_t wait; + + /* implementation */ + struct page *page; + unsigned long logical_size; + + /* Journal management */ + int clsn; + atomic_t nohomeok; + struct jfs_log *log; +} metapage_t; + +/* + * Direct-access address space operations + */ +extern struct address_space_operations direct_aops; + +/* metapage flag */ +#define META_locked 0 +#define META_absolute 1 +#define META_free 2 +#define META_dirty 3 +#define META_sync 4 +#define META_discard 5 +#define META_forced 6 + +#define mark_metapage_dirty(mp) set_bit(META_dirty, &(mp)->flag) + +/* function prototypes */ +extern metapage_t *__get_metapage(struct inode *inode, + unsigned long lblock, unsigned int size, + int absolute, unsigned long new); + +#define read_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, FALSE) + +#define get_metapage(inode, lblock, size, absolute)\ + __get_metapage(inode, lblock, size, absolute, TRUE) + +extern void release_metapage(metapage_t *); + +#define flush_metapage(mp) \ +{\ + set_bit(META_dirty, &(mp)->flag);\ + set_bit(META_sync, &(mp)->flag);\ + release_metapage(mp);\ +} + +#define sync_metapage(mp) \ + generic_buffer_fdatasync((struct inode *)mp->mapping->host,\ + mp->page->index, mp->page->index + 1) + +#define write_metapage(mp) \ +{\ + set_bit(META_dirty, &(mp)->flag);\ + release_metapage(mp);\ +} + +#define discard_metapage(mp) \ +{\ + clear_bit(META_dirty, &(mp)->flag);\ + set_bit(META_discard, &(mp)->flag);\ + release_metapage(mp);\ +} + +extern void hold_metapage(metapage_t *, int); + +/* + * This routine uses hash to explicitly find small number of pages + */ +extern void invalidate_metapages(struct inode *, unsigned long, unsigned long); + +/* + * This one uses mp_list to invalidate all pages for an inode + */ +extern void invalidate_inode_metapages(struct inode *inode); +#endif /* _H_JFS_METAPAGE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_mount.c linuxppc64_2_4/fs/jfs/jfs_mount.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_mount.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_mount.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,525 @@ +/* + * MODULE_NAME: jfs_mount.c + * + * COMPONENT_NAME: sysjfs + * + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Change History : + * + */ + +/* + * Module: jfs_mount.c + * + * note: file system in transition to aggregate/fileset: + * + * file system mount is interpreted as the mount of aggregate, + * if not already mounted, and mount of the single/only fileset in + * the aggregate; + * + * a file system/aggregate is represented by an internal inode + * (aka mount inode) initialized with aggregate superblock; + * each vfs represents a fileset, and points to its "fileset inode + * allocation map inode" (aka fileset inode): + * (an aggregate itself is structured recursively as a filset: + * an internal vfs is constructed and points to its "fileset inode + * allocation map inode" (aka aggregate inode) where each inode + * represents a fileset inode) so that inode number is mapped to + * on-disk inode in uniform way at both aggregate and fileset level; + * + * each vnode/inode of a fileset is linked to its vfs (to facilitate + * per fileset inode operations, e.g., unmount of a fileset, etc.); + * each inode points to the mount inode (to facilitate access to + * per aggregate information, e.g., block size, etc.) as well as + * its file set inode. + * + * aggregate + * ipmnt + * mntvfs -> fileset ipimap+ -> aggregate ipbmap -> aggregate ipaimap; + * fileset vfs -> vp(1) <-> ... <-> vp(n) <->vproot; + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + + +/* + * forward references + */ +static int chkSuper(struct super_block *); +static int logMOUNT(struct super_block *sb); + +/* + * NAME: jfs_mount(sb) + * + * FUNCTION: vfs_mount() + * + * PARAMETER: sb - super block + * + * RETURN: EBUSY - device already mounted or open for write + * EBUSY - cvrdvp already mounted; + * EBUSY - mount table full + * ENOTDIR - cvrdvp not directory on a device mount + * ENXIO - device open failure + */ +int jfs_mount(struct super_block *sb) +{ + int rc = 0; /* Return code */ + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipaimap = NULL; + struct inode *ipaimap2 = NULL; + struct inode *ipimap = NULL; + struct inode *ipbmap = NULL; + + jFYI(1, ("\nMount JFS\n")); + + /* + * read/validate superblock + * (initialize mount inode from the superblock) + */ + if ((rc = chkSuper(sb))) { + goto errout20; + } + + ipaimap = diReadSpecial(sb, AGGREGATE_I); + if (ipaimap == NULL) { + jERROR(1, ("jfs_mount: Faild to read AGGREGATE_I\n")); + rc = EIO; + goto errout20; + } + sbi->ipaimap = ipaimap; + + jFYI(1, ("jfs_mount: ipaimap:0x%p\n", ipaimap)); + + /* + * initialize aggregate inode allocation map + */ + if ((rc = diMount(ipaimap))) { + jERROR(1, + ("jfs_mount: diMount(ipaimap) failed w/rc = %d\n", + rc)); + goto errout21; + } + + /* + * open aggregate block allocation map + */ + ipbmap = diReadSpecial(sb, BMAP_I); + if (ipbmap == NULL) { + rc = EIO; + goto errout22; + } + + jFYI(1, ("jfs_mount: ipbmap:0x%p\n", ipbmap)); + + sbi->ipbmap = ipbmap; + + /* + * initialize aggregate block allocation map + */ + if ((rc = dbMount(ipbmap))) { + jERROR(1, ("jfs_mount: dbMount failed w/rc = %d\n", rc)); + goto errout22; + } + + /* + * open the secondary aggregate inode allocation map + * + * This is a duplicate of the aggregate inode allocation map. + * + * hand craft a vfs in the same fashion as we did to read ipaimap. + * By adding INOSPEREXT (32) to the inode number, we are telling + * diReadSpecial that we are reading from the secondary aggregate + * inode table. This also creates a unique entry in the inode hash + * table. + */ + if ((sbi->mntflag & JFS_BAD_SAIT) == 0) { + ipaimap2 = diReadSpecial(sb, AGGREGATE_I + INOSPEREXT); + if (ipaimap2 == 0) { + jERROR(1, + ("jfs_mount: Faild to read AGGREGATE_I\n")); + rc = EIO; + goto errout35; + } + sbi->ipaimap2 = ipaimap2; + + jFYI(1, ("jfs_mount: ipaimap2:0x%p\n", ipaimap2)); + + /* + * initialize secondary aggregate inode allocation map + */ + if ((rc = diMount(ipaimap2))) { + jERROR(1, + ("jfs_mount: diMount(ipaimap2) failed, rc = %d\n", + rc)); + goto errout35; + } + } else + /* Secondary aggregate inode table is not valid */ + sbi->ipaimap2 = 0; + + /* + * mount (the only/single) fileset + */ + /* + * open fileset inode allocation map (aka fileset inode) + */ + ipimap = diReadSpecial(sb, FILESYSTEM_I); + if (ipimap == NULL) { + jERROR(1, ("jfs_mount: Failed to read FILESYSTEM_I\n")); + /* open fileset secondary inode allocation map */ + rc = EIO; + goto errout40; + } + jFYI(1, ("jfs_mount: ipimap:0x%p\n", ipimap)); + + /* map further access of per fileset inodes by the fileset inode */ + sbi->ipimap = ipimap; + + /* initialize fileset inode allocation map */ + if ((rc = diMount(ipimap))) { + jERROR(1, ("jfs_mount: diMount failed w/rc = %d\n", rc)); + goto errout41; + } + + jFYI(1, ("Mount JFS Complete.\n")); + goto out; + + /* + * unwind on error + */ +//errout42: /* close fileset inode allocation map */ + diUnmount(ipimap, 1); + + errout41: /* close fileset inode allocation map inode */ + diFreeSpecial(ipimap); + + errout40: /* fileset closed */ + + /* close secondary aggregate inode allocation map */ + if (ipaimap2) { + diUnmount(ipaimap2, 1); + diFreeSpecial(ipaimap2); + } + + errout35: + + /* close aggregate block allocation map */ + dbUnmount(ipbmap, 1); + diFreeSpecial(ipbmap); + + errout22: /* close aggregate inode allocation map */ + + diUnmount(ipaimap, 1); + + errout21: /* close aggregate inodes */ + diFreeSpecial(ipaimap); + errout20: /* aggregate closed */ + + out: + + if (rc) { + jERROR(1, ("Mount JFS Failure: %d\n", rc)); + } + return rc; +} + +/* + * NAME: jfs_mount_rw(sb, remount) + * + * FUNCTION: Completes read-write mount, or remounts read-only volume + * as read-write + */ +int jfs_mount_rw(struct super_block *sb, int remount) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + log_t *log; + int rc; + + /* + * If we are re-mounting a previously read-only volume, we want to + * re-read the inode and block maps, since fsck.jfs may have updated + * them. + */ + if (remount) { + if (chkSuper(sb) || (sbi->state != FM_CLEAN)) + return -EINVAL; + + truncate_inode_pages(sbi->ipimap->i_mapping, 0); + truncate_inode_pages(sbi->ipbmap->i_mapping, 0); + diUnmount(sbi->ipimap, 1); + if ((rc = diMount(sbi->ipimap))) { + jERROR(1,("jfs_mount_rw: diMount failed!\n")); + return rc; + } + + dbUnmount(sbi->ipbmap, 1); + if ((rc = dbMount(sbi->ipbmap))) { + jERROR(1,("jfs_mount_rw: dbMount failed!\n")); + return rc; + } + } + + /* + * open/initialize log + */ + if ((rc = lmLogOpen(sb, &log))) + return rc; + + JFS_SBI(sb)->log = log; + + /* + * update file system superblock; + */ + if ((rc = updateSuper(sb, FM_MOUNT))) { + jERROR(1, + ("jfs_mount: updateSuper failed w/rc = %d\n", rc)); + lmLogClose(sb, log); + JFS_SBI(sb)->log = 0; + return rc; + } + + /* + * write MOUNT log record of the file system + */ + logMOUNT(sb); + + return rc; +} + +/* + * chkSuper() + * + * validate the superblock of the file system to be mounted and + * get the file system parameters. + * + * returns + * 0 with fragsize set if check successful + * error code if not successful + */ +static int chkSuper(struct super_block *sb) +{ + int rc = 0; + metapage_t *mp; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct jfs_superblock *j_sb; + int AIM_bytesize, AIT_bytesize; + int expected_AIM_bytesize, expected_AIT_bytesize; + s64 AIM_byte_addr, AIT_byte_addr, fsckwsp_addr; + s64 byte_addr_diff0, byte_addr_diff1; + s32 bsize; + + if ((rc = readSuper(sb, &mp))) + return rc; + j_sb = (struct jfs_superblock *) (mp->data); + + /* + * validate superblock + */ + /* validate fs signature */ + if (strncmp(j_sb->s_magic, JFS_MAGIC, 4) || + j_sb->s_version > cpu_to_le32(JFS_VERSION)) { + //rc = EFORMAT; + rc = EINVAL; + goto out; + } + + bsize = le32_to_cpu(j_sb->s_bsize); +#ifdef _JFS_4K + if (bsize != PSIZE) { + jERROR(1, ("Currently only 4K block size supported!\n")); + rc = EINVAL; + goto out; + } +#endif /* _JFS_4K */ + + jFYI(1, ("superblock: flag:0x%08x state:0x%08x size:0x%Lx\n", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size))); + + /* validate the descriptors for Secondary AIM and AIT */ + if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != + cpu_to_le32(JFS_BAD_SAIT)) { + expected_AIM_bytesize = 2 * PSIZE; + AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + expected_AIT_bytesize = 4 * PSIZE; + AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; + AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; + AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; + fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; + if ((AIM_bytesize != expected_AIM_bytesize) || + (AIT_bytesize != expected_AIT_bytesize) || + (byte_addr_diff0 != AIM_bytesize) || + (byte_addr_diff1 <= AIT_bytesize)) + j_sb->s_flag |= cpu_to_le32(JFS_BAD_SAIT); + } + + if ((j_sb->s_flag & cpu_to_le32(JFS_GROUPCOMMIT)) != + cpu_to_le32(JFS_GROUPCOMMIT)) + j_sb->s_flag |= cpu_to_le32(JFS_GROUPCOMMIT); + jFYI(0, ("superblock: flag:0x%08x state:0x%08x size:0x%Lx\n", + le32_to_cpu(j_sb->s_flag), le32_to_cpu(j_sb->s_state), + (unsigned long long) le64_to_cpu(j_sb->s_size))); + + /* validate fs state */ + if (j_sb->s_state != cpu_to_le32(FM_CLEAN) && + !(sb->s_flags & MS_RDONLY)) { + jERROR(1, + ("jfs_mount: Mount Failure: File System Dirty.\n")); + rc = EINVAL; + goto out; + } + + sbi->state = le32_to_cpu(j_sb->s_state); + sbi->mntflag = le32_to_cpu(j_sb->s_flag); + + /* + * JFS always does I/O by 4K pages. Don't tell the buffer cache + * that we use anything else (leave s_blocksize alone). + */ + sbi->bsize = bsize; + sbi->l2bsize = le16_to_cpu(j_sb->s_l2bsize); + + /* + * For now, ignore s_pbsize, l2bfactor. All I/O going through buffer + * cache. + */ + sbi->nbperpage = PSIZE >> sbi->l2bsize; + sbi->l2nbperpage = L2PSIZE - sbi->l2bsize; + sbi->l2niperblk = sbi->l2bsize - L2DISIZE; + if (sbi->mntflag & JFS_INLINELOG) + sbi->logpxd = j_sb->s_logpxd; + else + sbi->logdev = to_kdev_t(le32_to_cpu(j_sb->s_logdev)); + sbi->ait2 = j_sb->s_ait2; + + out: + release_metapage(mp); + + return rc; +} + + +/* + * updateSuper() + * + * update synchronously superblock if it is mounted read-write. + */ +int updateSuper(struct super_block *sb, uint state) +{ + int rc; + metapage_t *mp; + struct jfs_superblock *j_sb; + + /* + * Only fsck can fix dirty state + */ + if (JFS_SBI(sb)->state == FM_DIRTY) + return 0; + + if ((rc = readSuper(sb, &mp))) + return rc; + + j_sb = (struct jfs_superblock *) (mp->data); + + j_sb->s_state = cpu_to_le32(state); + JFS_SBI(sb)->state = state; + + if (state == FM_MOUNT) { + /* record log's dev_t and mount serial number */ + j_sb->s_logdev = + cpu_to_le32(kdev_t_to_nr(JFS_SBI(sb)->log->dev)); + j_sb->s_logserial = cpu_to_le32(JFS_SBI(sb)->log->serial); + /* record our own device number in case the location + * changes after a reboot + */ + j_sb->s_device = cpu_to_le32(kdev_t_to_nr(sb->s_dev)); + } else if (state == FM_CLEAN) { + /* + * If this volume is shared with OS/2, OS/2 will need to + * recalculate DASD usage, since we don't deal with it. + */ + if (j_sb->s_flag & cpu_to_le32(JFS_DASD_ENABLED)) + j_sb->s_flag |= cpu_to_le32(JFS_DASD_PRIME); + } + + flush_metapage(mp); + + return 0; +} + + +/* + * readSuper() + * + * read superblock by raw sector address + */ +int readSuper(struct super_block *sb, metapage_t ** mpp) +{ + /* read in primary superblock */ + *mpp = read_metapage(JFS_SBI(sb)->direct_inode, + SUPER1_OFF >> sb->s_blocksize_bits, PSIZE, 1); + if (*mpp == NULL) { + /* read in secondary/replicated superblock */ + *mpp = read_metapage(JFS_SBI(sb)->direct_inode, + SUPER2_OFF >> sb->s_blocksize_bits, + PSIZE, 1); + } + return *mpp ? 0 : 1; +} + + +/* + * logMOUNT() + * + * function: write a MOUNT log record for file system. + * + * MOUNT record keeps logredo() from processing log records + * for this file system past this point in log. + * it is harmless if mount fails. + * + * note: MOUNT record is at aggregate level, not at fileset level, + * since log records of previous mounts of a fileset + * (e.g., AFTER record of extent allocation) have to be processed + * to update block allocation map at aggregate level. + */ +static int logMOUNT(struct super_block *sb) +{ + log_t *log = JFS_SBI(sb)->log; + lrd_t lrd; + + lrd.logtid = 0; + lrd.backchain = 0; + lrd.type = cpu_to_le16(LOG_MOUNT); + lrd.length = 0; + lrd.aggregate = cpu_to_le32(kdev_t_to_nr(sb->s_dev)); + lmLog(log, NULL, &lrd, NULL); + + return 0; +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_superblock.h linuxppc64_2_4/fs/jfs/jfs_superblock.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_superblock.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_superblock.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,115 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ +#ifndef _H_JFS_SUPERBLOCK +#define _H_JFS_SUPERBLOCK +/* + * jfs_superblock.h + */ + +/* + * make the magic number something a human could read + */ +#define JFS_MAGIC "JFS1" /* Magic word */ + +#define JFS_VERSION 2 /* Version number: Version 2 */ + +#define LV_NAME_SIZE 11 /* MUST BE 11 for OS/2 boot sector */ + +/* + * aggregate superblock + * + * The name superblock is too close to super_block, so the name has been + * changed to jfs_superblock. The utilities are still using the old name. + */ +struct jfs_superblock { + char s_magic[4]; /* 4: magic number */ + u32 s_version; /* 4: version number */ + + s64 s_size; /* 8: aggregate size in hardware/LVM blocks; + * VFS: number of blocks + */ + s32 s_bsize; /* 4: aggregate block size in bytes; + * VFS: fragment size + */ + s16 s_l2bsize; /* 2: log2 of s_bsize */ + s16 s_l2bfactor; /* 2: log2(s_bsize/hardware block size) */ + s32 s_pbsize; /* 4: hardware/LVM block size in bytes */ + s16 s_l2pbsize; /* 2: log2 of s_pbsize */ + s16 pad; /* 2: padding necessary for alignment */ + + u32 s_agsize; /* 4: allocation group size in aggr. blocks */ + + u32 s_flag; /* 4: aggregate attributes: + * see jfs_filsys.h + */ + u32 s_state; /* 4: mount/unmount/recovery state: + * see jfs_filsys.h + */ + s32 s_compress; /* 4: > 0 if data compression */ + + pxd_t s_ait2; /* 8: first extent of secondary + * aggregate inode table + */ + + pxd_t s_aim2; /* 8: first extent of secondary + * aggregate inode map + */ + u32 s_logdev; /* 4: device address of log */ + s32 s_logserial; /* 4: log serial number at aggregate mount */ + pxd_t s_logpxd; /* 8: inline log extent */ + + pxd_t s_fsckpxd; /* 8: inline fsck work space extent */ + + struct timestruc_t s_time; /* 8: time last updated */ + + s32 s_fsckloglen; /* 4: Number of filesystem blocks reserved for + * the fsck service log. + * N.B. These blocks are divided among the + * versions kept. This is not a per + * version size. + * N.B. These blocks are included in the + * length field of s_fsckpxd. + */ + s8 s_fscklog; /* 1: which fsck service log is most recent + * 0 => no service log data yet + * 1 => the first one + * 2 => the 2nd one + */ + char s_fpack[11]; /* 11: file system volume name + * N.B. This must be 11 bytes to + * conform with the OS/2 BootSector + * requirements + */ + + /* extendfs() parameter under s_state & FM_EXTENDFS */ + s64 s_xsize; /* 8: extendfs s_size */ + pxd_t s_xfsckpxd; /* 8: extendfs fsckpxd */ + pxd_t s_xlogpxd; /* 8: extendfs logpxd */ + /* - 128 byte boundary - */ + + u32 s_device; /* Store device in case location changes + * between reboots + */ + +}; + +extern int readSuper(struct super_block *, struct metapage **); +extern int updateSuper(struct super_block *, uint); + +#endif /*_H_JFS_SUPERBLOCK */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_txnmgr.c linuxppc64_2_4/fs/jfs/jfs_txnmgr.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_txnmgr.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_txnmgr.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,3019 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * jfs_txnmgr.c: transaction manager + * + * notes: + * transaction starts with txBegin() and ends with txCommit() + * or txAbort(). + * + * tlock is acquired at the time of update; + * (obviate scan at commit time for xtree and dtree) + * tlock and mp points to each other; + * (no hashlist for mp -> tlock). + * + * special cases: + * tlock on in-memory inode: + * in-place tlock in the in-memory inode itself; + * converted to page lock by iWrite() at commit time. + * + * tlock during write()/mmap() under anonymous transaction (tid = 0): + * transferred (?) to transaction at commit time. + * + * use the page itself to update allocation maps + * (obviate intermediate replication of allocation/deallocation data) + * hold on to mp+lock thru update of maps + */ + + +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dinode.h" +#include "jfs_imap.h" +#include "jfs_dmap.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * transaction management structures + */ +static struct { + /* tblock */ + int freetid; /* 4: index of a free tid structure */ + wait_queue_head_t freewait; /* 4: eventlist of free tblock */ + + /* tlock */ + int freelock; /* 4: index first free lock word */ + wait_queue_head_t freelockwait; /* 4: eventlist of free tlock */ + wait_queue_head_t lowlockwait; /* 4: eventlist of ample tlocks */ + int tlocksInUse; /* 4: Number of tlocks in use */ + spinlock_t LazyLock; /* 4: synchronize sync_queue & unlock_queue */ +/* tblock_t *sync_queue; * 4: Transactions waiting for data sync */ + tblock_t *unlock_queue; /* 4: Transactions waiting to be released */ + tblock_t *unlock_tail; /* 4: Tail of unlock_queue */ + struct list_head anon_list; /* inodes having anonymous txns */ + struct list_head anon_list2; /* inodes having anonymous txns + that couldn't be sync'ed */ +} TxAnchor; + +static int nTxBlock = 512; /* number of transaction blocks */ +struct tblock *TxBlock; /* transaction block table */ + +static int nTxLock = 2048; /* number of transaction locks */ +static int TxLockLWM = 2048*.4; /* Low water mark for number of txLocks used */ +static int TxLockHWM = 2048*.8; /* High water mark for number of txLocks used */ +struct tlock *TxLock; /* transaction lock table */ +static int TlocksLow = 0; /* Indicates low number of available tlocks */ + + +/* + * transaction management lock + */ +static spinlock_t jfsTxnLock = SPIN_LOCK_UNLOCKED; + +#define TXN_LOCK() spin_lock(&jfsTxnLock) +#define TXN_UNLOCK() spin_unlock(&jfsTxnLock) + +#define LAZY_LOCK_INIT() spin_lock_init(&TxAnchor.LazyLock); +#define LAZY_LOCK(flags) spin_lock_irqsave(&TxAnchor.LazyLock, flags) +#define LAZY_UNLOCK(flags) spin_unlock_irqrestore(&TxAnchor.LazyLock, flags) + +/* + * Retry logic exist outside these macros to protect from spurrious wakeups. + */ +static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(event, &wait); + set_current_state(TASK_UNINTERRUPTIBLE); + TXN_UNLOCK(); + schedule(); + current->state = TASK_RUNNING; + remove_wait_queue(event, &wait); +} + +#define TXN_SLEEP(event)\ +{\ + TXN_SLEEP_DROP_LOCK(event);\ + TXN_LOCK();\ +} + +#define TXN_WAKEUP(event) wake_up_all(event) + + +/* + * statistics + */ +struct { + tid_t maxtid; /* 4: biggest tid ever used */ + lid_t maxlid; /* 4: biggest lid ever used */ + int ntid; /* 4: # of transactions performed */ + int nlid; /* 4: # of tlocks acquired */ + int waitlock; /* 4: # of tlock wait */ +} stattx; + + +/* + * external references + */ +extern int lmGroupCommit(log_t * log, tblock_t * tblk); +extern void lmSync(log_t *); +extern int readSuper(struct super_block *sb, metapage_t ** bpp); +extern int jfs_commit_inode(struct inode *, int); +extern int jfs_thread_stopped(void); + +extern struct task_struct *jfsCommitTask; +extern struct completion jfsIOwait; +extern struct task_struct *jfsSyncTask; + +/* + * forward references + */ +int diLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck, + commit_t * cd); +int dataLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); +void dtLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); +void inlineLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); +void mapLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); +void txAbortCommit(commit_t * cd, int exval); +static void txAllocPMap(struct inode *ip, maplock_t * maplock, + tblock_t * tblk); +void txForce(tblock_t * tblk); +static int txLog(log_t * log, tblock_t * tblk, commit_t * cd); +int txMoreLock(void); +static void txUpdateMap(tblock_t * tblk); +static void txRelease(tblock_t * tblk); +void xtLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); +static void LogSyncRelease(metapage_t * mp); + +/* + * transaction block/lock management + * --------------------------------- + */ + +/* + * Get a transaction lock from the free list. If the number in use is + * greater than the high water mark, wake up the sync daemon. This should + * free some anonymous transaction locks. (TXN_LOCK must be held.) + */ +static lid_t txLockAlloc(void) +{ + lid_t lid; + + while (!(lid = TxAnchor.freelock)) + TXN_SLEEP(&TxAnchor.freelockwait); + TxAnchor.freelock = TxLock[lid].next; + HIGHWATERMARK(stattx.maxlid, lid); + if ((++TxAnchor.tlocksInUse > TxLockHWM) && (TlocksLow == 0)) { + jEVENT(0,("txLockAlloc TlocksLow\n")); + TlocksLow = 1; + wake_up_process(jfsSyncTask); + } + + return lid; +} + +static void txLockFree(lid_t lid) +{ + TxLock[lid].next = TxAnchor.freelock; + TxAnchor.freelock = lid; + TxAnchor.tlocksInUse--; + if (TlocksLow && (TxAnchor.tlocksInUse < TxLockLWM)) { + jEVENT(0,("txLockFree TlocksLow no more\n")); + TlocksLow = 0; + TXN_WAKEUP(&TxAnchor.lowlockwait); + } + TXN_WAKEUP(&TxAnchor.freelockwait); +} + +/* + * NAME: txInit() + * + * FUNCTION: initialize transaction management structures + * + * RETURN: + * + * serialization: single thread at jfs_init() + */ +int txInit(void) +{ + int k, size; + + /* + * initialize transaction block (tblock) table + * + * transaction id (tid) = tblock index + * tid = 0 is reserved. + */ + size = sizeof(tblock_t) * nTxBlock; + TxBlock = (tblock_t *) vmalloc(size); + if (TxBlock == NULL) + return ENOMEM; + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + } + TxBlock[k].next = 0; + init_waitqueue_head(&TxBlock[k].gcwait); + init_waitqueue_head(&TxBlock[k].waitor); + + TxAnchor.freetid = 1; + init_waitqueue_head(&TxAnchor.freewait); + + stattx.maxtid = 1; /* statistics */ + + /* + * initialize transaction lock (tlock) table + * + * transaction lock id = tlock index + * tlock id = 0 is reserved. + */ + size = sizeof(tlock_t) * nTxLock; + TxLock = (tlock_t *) vmalloc(size); + if (TxLock == NULL) { + vfree(TxBlock); + return ENOMEM; + } + + /* initialize tlock table */ + for (k = 1; k < nTxLock - 1; k++) + TxLock[k].next = k + 1; + TxLock[k].next = 0; + init_waitqueue_head(&TxAnchor.freelockwait); + init_waitqueue_head(&TxAnchor.lowlockwait); + + TxAnchor.freelock = 1; + TxAnchor.tlocksInUse = 0; + INIT_LIST_HEAD(&TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + + stattx.maxlid = 1; /* statistics */ + + return 0; +} + +/* + * NAME: txExit() + * + * FUNCTION: clean up when module is unloaded + */ +void txExit(void) +{ + vfree(TxLock); + TxLock = 0; + vfree(TxBlock); + TxBlock = 0; +} + + +/* + * NAME: txBegin() + * + * FUNCTION: start a transaction. + * + * PARAMETER: sb - superblock + * flag - force for nested tx; + * + * RETURN: tid - transaction id + * + * note: flag force allows to start tx for nested tx + * to prevent deadlock on logsync barrier; + */ +tid_t txBegin(struct super_block *sb, int flag) +{ + tid_t t; + tblock_t *tblk; + log_t *log; + + jFYI(1, ("txBegin: flag = 0x%x\n", flag)); + log = (log_t *) JFS_SBI(sb)->log; + + TXN_LOCK(); + + retry: + if (flag != COMMIT_FORCE) { + /* + * synchronize with logsync barrier + */ + if (log->syncbarrier) { + TXN_SLEEP(&log->syncwait); + goto retry; + } + } + if (flag == 0) { + /* + * Don't begin transaction if we're getting starved for tlocks + * unless COMMIT_FORCE (imap changes) or COMMIT_INODE (which + * may ultimately free tlocks) + */ + if (TlocksLow) { + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + } + + /* + * allocate transaction id/block + */ + if ((t = TxAnchor.freetid) == 0) { + jFYI(1, ("txBegin: waiting for free tid\n")); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + tblk = tid_to_tblock(t); + + if ((tblk->next == 0) && (current != jfsCommitTask)) { + /* Save one tblk for jfsCommit thread */ + jFYI(1, ("txBegin: waiting for free tid\n")); + TXN_SLEEP(&TxAnchor.freewait); + goto retry; + } + + TxAnchor.freetid = tblk->next; + + /* + * initialize transaction + */ + + /* + * We can't zero the whole thing or we screw up another thread being + * awakened after sleeping on tblk->waitor + * + * memset(tblk, 0, sizeof(tblock_t)); + */ + tblk->next = tblk->last = tblk->xflag = tblk->flag = tblk->lsn = 0; + + tblk->sb = sb; + ++log->logtid; + tblk->logtid = log->logtid; + + ++log->active; + + HIGHWATERMARK(stattx.maxtid, t); /* statistics */ + INCREMENT(stattx.ntid); /* statistics */ + + TXN_UNLOCK(); + + jFYI(1, ("txBegin: returning tid = %d\n", t)); + + return t; +} + + +/* + * NAME: txBeginAnon() + * + * FUNCTION: start an anonymous transaction. + * Blocks if logsync or available tlocks are low to prevent + * anonymous tlocks from depleting supply. + * + * PARAMETER: sb - superblock + * + * RETURN: none + */ +void txBeginAnon(struct super_block *sb) +{ + log_t *log; + + log = (log_t *) JFS_SBI(sb)->log; + + TXN_LOCK(); + + retry: + /* + * synchronize with logsync barrier + */ + if (log->syncbarrier) { + TXN_SLEEP(&log->syncwait); + goto retry; + } + + /* + * Don't begin transaction if we're getting starved for tlocks + */ + if (TlocksLow) { + TXN_SLEEP(&TxAnchor.lowlockwait); + goto retry; + } + TXN_UNLOCK(); +} + + +/* + * txEnd() + * + * function: free specified transaction block. + * + * logsync barrier processing: + * + * serialization: + */ +void txEnd(tid_t tid) +{ + tblock_t *tblk = tid_to_tblock(tid); + log_t *log; + + jFYI(1, ("txEnd: tid = %d\n", tid)); + TXN_LOCK(); + + /* + * wakeup transactions waiting on the page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + log = (log_t *) JFS_SBI(tblk->sb)->log; + + /* + * Lazy commit thread can't free this guy until we mark it UNLOCKED, + * otherwise, we would be left with a transaction that may have been + * reused. + * + * Lazy commit thread will turn off tblkGC_LAZY before calling this + * routine. + */ + if (tblk->flag & tblkGC_LAZY) { + jFYI(1, + ("txEnd called w/lazy tid: %d, tblk = 0x%p\n", + tid, tblk)); + TXN_UNLOCK(); + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + tblk->flag |= tblkGC_UNLOCKED; + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + return; + } + + jFYI(1, ("txEnd: tid: %d, tblk = 0x%p\n", tid, tblk)); + + assert(tblk->next == 0); + + /* + * insert tblock back on freelist + */ + tblk->next = TxAnchor.freetid; + TxAnchor.freetid = tid; + + /* + * mark the tblock not active + */ + --log->active; + + /* + * synchronize with logsync barrier + */ + if (log->syncbarrier && log->active == 0) { + /* forward log syncpt */ + /* lmSync(log); */ + + jFYI(1, (" log barrier off: 0x%x\n", log->lsn)); + + /* enable new transactions start */ + log->syncbarrier = 0; + + /* wakeup all waitors for logsync barrier */ + TXN_WAKEUP(&log->syncwait); + } + + /* + * wakeup all waitors for a free tblock + */ + TXN_WAKEUP(&TxAnchor.freewait); + + TXN_UNLOCK(); + jFYI(1, ("txEnd: exitting\n")); +} + + +/* + * txLock() + * + * function: acquire a transaction lock on the specified + * + * parameter: + * + * return: transaction lock id + * + * serialization: + */ +tlock_t *txLock(tid_t tid, struct inode *ip, metapage_t * mp, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int dir_xtree = 0; + lid_t lid; + tid_t xtid; + tlock_t *tlck; + xtlock_t *xtlck; + linelock_t *linelock; + xtpage_t *p; + tblock_t *tblk; + + TXN_LOCK(); + + if (S_ISDIR(ip->i_mode) && (type & tlckXTREE) && + !(mp->xflag & COMMIT_PAGE)) { + /* + * Directory inode is special. It can have both an xtree tlock + * and a dtree tlock associated with it. + */ + dir_xtree = 1; + lid = jfs_ip->xtlid; + } else + lid = mp->lid; + + /* is page not locked by a transaction ? */ + if (lid == 0) + goto allocateLock; + + jFYI(1, ("txLock: tid:%d ip:0x%p mp:0x%p lid:%d\n", + tid, ip, mp, lid)); + + /* is page locked by the requester transaction ? */ + tlck = lid_to_tlock(lid); + if ((xtid = tlck->tid) == tid) + goto grantLock; + + /* + * is page locked by anonymous transaction/lock ? + * + * (page update without transaction (i.e., file write) is + * locked under anonymous transaction tid = 0: + * anonymous tlocks maintained on anonymous tlock list of + * the inode of the page and available to all anonymous + * transactions until txCommit() time at which point + * they are transferred to the transaction tlock list of + * the commiting transaction of the inode) + */ + if (xtid == 0) { + tlck->tid = tid; + tblk = tid_to_tblock(tid); + /* + * The order of the tlocks in the transaction is important + * (during truncate, child xtree pages must be freed before + * parent's tlocks change the working map). + * Take tlock off anonymous list and add to tail of + * transaction list + * + * Note: We really need to get rid of the tid & lid and + * use list_head's. This code is getting UGLY! + */ + if (jfs_ip->atlhead == lid) { + if (jfs_ip->atltail == lid) { + /* only anonymous txn. + * Remove from anon_list + */ + list_del_init(&jfs_ip->anon_inode_list); + } + jfs_ip->atlhead = tlck->next; + } else { + lid_t last; + for (last = jfs_ip->atlhead; + lid_to_tlock(last)->next != lid; + last = lid_to_tlock(last)->next) { + assert(last); + } + lid_to_tlock(last)->next = tlck->next; + if (jfs_ip->atltail == lid) + jfs_ip->atltail = last; + } + + /* insert the tlock at tail of transaction tlock list */ + + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + + goto grantLock; + } + + goto waitLock; + + /* + * allocate a tlock + */ + allocateLock: + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* mark tlock for meta-data page */ + if (mp->xflag & COMMIT_PAGE) { + + tlck->flag = tlckPAGELOCK; + + /* mark the page dirty and nohomeok */ + mark_metapage_dirty(mp); + atomic_inc(&mp->nohomeok); + + jFYI(1, + ("locking mp = 0x%p, nohomeok = %d tid = %d tlck = 0x%p\n", + mp, atomic_read(&mp->nohomeok), tid, tlck)); + + /* if anonymous transaction, and buffer is on the group + * commit synclist, mark inode to show this. This will + * prevent the buffer from being marked nohomeok for too + * long a time. + */ + if ((tid == 0) && mp->lsn) + set_cflag(COMMIT_Synclist, ip); + } + /* mark tlock for in-memory inode */ + else + tlck->flag = tlckINODELOCK; + + tlck->type = 0; + + /* bind the tlock and the page */ + tlck->ip = ip; + tlck->mp = mp; + if (dir_xtree) + jfs_ip->xtlid = lid; + else + mp->lid = lid; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + /* initialize type dependent area for linelock */ + linelock = (linelock_t *) & tlck->lock; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKSHORT; + linelock->index = 0; + + switch (type & tlckTYPE) { + case tlckDTREE: + linelock->l2linesize = L2DTSLOTSIZE; + break; + + case tlckXTREE: + linelock->l2linesize = L2XTSLOTSIZE; + + xtlck = (xtlock_t *) linelock; + xtlck->header.offset = 0; + xtlck->header.length = 2; + + if (type & tlckNEW) { + xtlck->lwm.offset = XTENTRYSTART; + } else { + if (mp->xflag & COMMIT_PAGE) + p = (xtpage_t *) mp->data; + else + p = &jfs_ip->i_xtroot; + xtlck->lwm.offset = + le16_to_cpu(p->header.nextindex); + } + xtlck->lwm.length = 0; /* ! */ + + xtlck->index = 2; + break; + + case tlckINODE: + linelock->l2linesize = L2INODESLOTSIZE; + break; + + case tlckDATA: + linelock->l2linesize = L2DATASLOTSIZE; + break; + + default: + jERROR(1, ("UFO tlock:0x%p\n", tlck)); + } + + /* + * update tlock vector + */ + grantLock: + tlck->type |= type; + + TXN_UNLOCK(); + + return tlck; + + /* + * page is being locked by another transaction: + */ + waitLock: + /* Only locks on ipimap or ipaimap should reach here */ + /* assert(jfs_ip->fileset == AGGREGATE_I); */ + if (jfs_ip->fileset != AGGREGATE_I) { + jERROR(1, ("txLock: trying to lock locked page!\n")); + dump_mem("ip", ip, sizeof(struct inode)); + dump_mem("mp", mp, sizeof(metapage_t)); + dump_mem("Locker's tblk", tid_to_tblock(tid), + sizeof(tblock_t)); + dump_mem("Tlock", tlck, sizeof(tlock_t)); + BUG(); + } + INCREMENT(stattx.waitlock); /* statistics */ + release_metapage(mp); + + jEVENT(0, ("txLock: in waitLock, tid = %d, xtid = %d, lid = %d\n", + tid, xtid, lid)); + TXN_SLEEP_DROP_LOCK(&tid_to_tblock(xtid)->waitor); + jEVENT(0, ("txLock: awakened tid = %d, lid = %d\n", tid, lid)); + + return NULL; +} + + +/* + * NAME: txRelease() + * + * FUNCTION: Release buffers associated with transaction locks, but don't + * mark homeok yet. The allows other transactions to modify + * buffers, but won't let them go to disk until commit record + * actually gets written. + * + * PARAMETER: + * tblk - + * + * RETURN: Errors from subroutines. + */ +static void txRelease(tblock_t * tblk) +{ + metapage_t *mp; + lid_t lid; + tlock_t *tlck; + + TXN_LOCK(); + + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + mp->lid = 0; + } + } + + /* + * wakeup transactions waiting on a page locked + * by the current transaction + */ + TXN_WAKEUP(&tblk->waitor); + + TXN_UNLOCK(); +} + + +/* + * NAME: txUnlock() + * + * FUNCTION: Initiates pageout of pages modified by tid in journalled + * objects and frees their lockwords. + * + * PARAMETER: + * flag - + * + * RETURN: Errors from subroutines. + */ +static void txUnlock(tblock_t * tblk, int flag) +{ + tlock_t *tlck; + linelock_t *linelock; + lid_t lid, next, llid, k; + metapage_t *mp; + log_t *log; + int force; + int difft, diffp; + + jFYI(1, ("txUnlock: tblk = 0x%p\n", tblk)); + log = (log_t *) JFS_SBI(tblk->sb)->log; + force = flag & COMMIT_FLUSH; + if (log->syncbarrier) + force |= COMMIT_FORCE; + + /* + * mark page under tlock homeok (its log has been written): + * if caller has specified FORCE (e.g., iRecycle()), or + * if syncwait for the log is set (i.e., the log sync point + * has fallen behind), or + * if syncpt is set for the page, or + * if the page is new, initiate pageout; + * otherwise, leave the page in memory. + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + jFYI(1, ("unlocking lid = %d, tlck = 0x%p\n", lid, tlck)); + + /* unbind page from tlock */ + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + /* hold buffer + * + * It's possible that someone else has the metapage. + * The only things were changing are nohomeok, which + * is handled atomically, and clsn which is protected + * by the LOGSYNC_LOCK. + */ + hold_metapage(mp, 1); + + assert(atomic_read(&mp->nohomeok) > 0); + atomic_dec(&mp->nohomeok); + + /* inherit younger/larger clsn */ + LOGSYNC_LOCK(log); + if (mp->clsn) { + logdiff(difft, tblk->clsn, log); + logdiff(diffp, mp->clsn, log); + if (difft > diffp) + mp->clsn = tblk->clsn; + } else + mp->clsn = tblk->clsn; + LOGSYNC_UNLOCK(log); + + assert(!(tlck->flag & tlckFREEPAGE)); + + if (tlck->flag & tlckWRITEPAGE) { + write_metapage(mp); + } else { + /* release page which has been forced */ + release_metapage(mp); + } + } + + /* insert tlock, and linelock(s) of the tlock if any, + * at head of freelist + */ + TXN_LOCK(); + + llid = ((linelock_t *) & tlck->lock)->next; + while (llid) { + linelock = (linelock_t *) lid_to_tlock(llid); + k = linelock->next; + txLockFree(llid); + llid = k; + } + txLockFree(lid); + + TXN_UNLOCK(); + } + tblk->next = tblk->last = 0; + + /* + * remove tblock from logsynclist + * (allocation map pages inherited lsn of tblk and + * has been inserted in logsync list at txUpdateMap()) + */ + if (tblk->lsn) { + LOGSYNC_LOCK(log); + log->count--; + list_del(&tblk->synclist); + LOGSYNC_UNLOCK(log); + } +} + + +/* + * txMaplock() + * + * function: allocate a transaction lock for freed page/entry; + * for freed page, maplock is used as xtlock/dtlock type; + */ +tlock_t *txMaplock(tid_t tid, struct inode *ip, int type) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + lid_t lid; + tblock_t *tblk; + tlock_t *tlck; + maplock_t *maplock; + + TXN_LOCK(); + + /* + * allocate a tlock + */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + /* + * initialize tlock + */ + tlck->tid = tid; + + /* bind the tlock and the object */ + tlck->flag = tlckINODELOCK; + tlck->ip = ip; + tlck->mp = NULL; + + tlck->type = type; + + /* + * enqueue transaction lock to transaction/inode + */ + /* insert the tlock at tail of transaction tlock list */ + if (tid) { + tblk = tid_to_tblock(tid); + if (tblk->next) + lid_to_tlock(tblk->last)->next = lid; + else + tblk->next = lid; + tlck->next = 0; + tblk->last = lid; + } + /* anonymous transaction: + * insert the tlock at head of inode anonymous tlock list + */ + else { + tlck->next = jfs_ip->atlhead; + jfs_ip->atlhead = lid; + if (tlck->next == 0) { + /* This inode's first anonymous transaction */ + jfs_ip->atltail = lid; + list_add_tail(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list); + } + } + + TXN_UNLOCK(); + + /* initialize type dependent area for maplock */ + maplock = (maplock_t *) & tlck->lock; + maplock->next = 0; + maplock->maxcnt = 0; + maplock->index = 0; + + return tlck; +} + + +/* + * txLinelock() + * + * function: allocate a transaction lock for log vector list + */ +linelock_t *txLinelock(linelock_t * tlock) +{ + lid_t lid; + tlock_t *tlck; + linelock_t *linelock; + + TXN_LOCK(); + + /* allocate a TxLock structure */ + lid = txLockAlloc(); + tlck = lid_to_tlock(lid); + + TXN_UNLOCK(); + + /* initialize linelock */ + linelock = (linelock_t *) tlck; + linelock->next = 0; + linelock->flag = tlckLINELOCK; + linelock->maxcnt = TLOCKLONG; + linelock->index = 0; + + /* append linelock after tlock */ + linelock->next = tlock->next; + tlock->next = lid; + + return linelock; +} + + + +/* + * transaction commit management + * ----------------------------- + */ + +/* + * NAME: txCommit() + * + * FUNCTION: commit the changes to the objects specified in + * clist. For journalled segments only the + * changes of the caller are committed, ie by tid. + * for non-journalled segments the data are flushed to + * disk and then the change to the disk inode and indirect + * blocks committed (so blocks newly allocated to the + * segment will be made a part of the segment atomically). + * + * all of the segments specified in clist must be in + * one file system. no more than 6 segments are needed + * to handle all unix svcs. + * + * if the i_nlink field (i.e. disk inode link count) + * is zero, and the type of inode is a regular file or + * directory, or symbolic link , the inode is truncated + * to zero length. the truncation is committed but the + * VM resources are unaffected until it is closed (see + * iput and iclose). + * + * PARAMETER: + * + * RETURN: + * + * serialization: + * on entry the inode lock on each segment is assumed + * to be held. + * + * i/o error: + */ +int txCommit(tid_t tid, /* transaction identifier */ + int nip, /* number of inodes to commit */ + struct inode **iplist, /* list of inode to commit */ + int flag) +{ + int rc = 0, rc1 = 0; + commit_t cd; + log_t *log; + tblock_t *tblk; + lrd_t *lrd; + int lsn; + struct inode *ip; + struct jfs_inode_info *jfs_ip; + int k, n; + ino_t top; + struct super_block *sb; + + jFYI(1, ("txCommit, tid = %d, flag = %d\n", tid, flag)); + /* is read-only file system ? */ + if (isReadOnly(iplist[0])) { + rc = EROFS; + goto TheEnd; + } + + sb = cd.sb = iplist[0]->i_sb; + cd.tid = tid; + + if (tid == 0) + tid = txBegin(sb, 0); + tblk = tid_to_tblock(tid); + + /* + * initialize commit structure + */ + log = (log_t *) JFS_SBI(sb)->log; + cd.log = log; + + /* initialize log record descriptor in commit */ + lrd = &cd.lrd; + lrd->logtid = cpu_to_le32(tblk->logtid); + lrd->backchain = 0; + + tblk->xflag |= flag; + + if ((flag & (COMMIT_FORCE | COMMIT_SYNC)) == 0) + tblk->xflag |= COMMIT_LAZY; + /* + * prepare non-journaled objects for commit + * + * flush data pages of non-journaled file + * to prevent the file getting non-initialized disk blocks + * in case of crash. + * (new blocks - ) + */ + cd.iplist = iplist; + cd.nip = nip; + + /* + * acquire transaction lock on (on-disk) inodes + * + * update on-disk inode from in-memory inode + * acquiring transaction locks for AFTER records + * on the on-disk inode of file object + * + * sort the inodes array by inode number in descending order + * to prevent deadlock when acquiring transaction lock + * of on-disk inodes on multiple on-disk inode pages by + * multiple concurrent transactions + */ + for (k = 0; k < cd.nip; k++) { + top = (cd.iplist[k])->i_ino; + for (n = k + 1; n < cd.nip; n++) { + ip = cd.iplist[n]; + if (ip->i_ino > top) { + top = ip->i_ino; + cd.iplist[n] = cd.iplist[k]; + cd.iplist[k] = ip; + } + } + + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * BUGBUG - Should we call filemap_fdatasync here instead + * of fsync_inode_data? + * If we do, we have a deadlock condition since we may end + * up recursively calling jfs_get_block with the IWRITELOCK + * held. We may be able to do away with IWRITELOCK while + * committing transactions and use i_sem instead. + */ + if ((!S_ISDIR(ip->i_mode)) + && (tblk->flag & COMMIT_DELETE) == 0) + fsync_inode_data_buffers(ip); + + /* + * Mark inode as not dirty. It will still be on the dirty + * inode list, but we'll know not to commit it again unless + * it gets marked dirty again + */ + clear_cflag(COMMIT_Dirty, ip); + + /* inherit anonymous tlock(s) of inode */ + if (jfs_ip->atlhead) { + lid_to_tlock(jfs_ip->atltail)->next = tblk->next; + tblk->next = jfs_ip->atlhead; + if (!tblk->last) + tblk->last = jfs_ip->atltail; + jfs_ip->atlhead = jfs_ip->atltail = 0; + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } + + /* + * acquire transaction lock on on-disk inode page + * (become first tlock of the tblk's tlock list) + */ + if (((rc = diWrite(tid, ip)))) + goto out; + } + + /* + * write log records from transaction locks + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if ((rc = txLog(log, tblk, &cd))) + goto TheEnd; + + /* + * Ensure that inode isn't reused before + * lazy commit thread finishes processing + */ + if (tblk->xflag & (COMMIT_CREATE | COMMIT_DELETE)) + atomic_inc(&tblk->ip->i_count); + if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->ip; + assert((ip->i_nlink == 0) && !test_cflag(COMMIT_Nolink, ip)); + set_cflag(COMMIT_Nolink, ip); + } + + /* + * write COMMIT log record + */ + lrd->type = cpu_to_le16(LOG_COMMIT); + lrd->length = 0; + lsn = lmLog(log, tblk, lrd, NULL); + + lmGroupCommit(log, tblk); + + /* + * - transaction is now committed - + */ + + /* + * force pages in careful update + * (imap addressing structure update) + */ + if (flag & COMMIT_FORCE) + txForce(tblk); + + /* + * update allocation map. + * + * update inode allocation map and inode: + * free pager lock on memory object of inode if any. + * update block allocation map. + * + * txUpdateMap() resets XAD_NEW in XAD. + */ + if (tblk->xflag & COMMIT_FORCE) + txUpdateMap(tblk); + + /* + * free transaction locks and pageout/free pages + */ + txRelease(tblk); + + if ((tblk->flag & tblkGC_LAZY) == 0) + txUnlock(tblk, flag); + + + /* + * reset in-memory object state + */ + for (k = 0; k < cd.nip; k++) { + ip = cd.iplist[k]; + jfs_ip = JFS_IP(ip); + + /* + * reset in-memory inode state + */ + jfs_ip->bxflag = 0; + jfs_ip->blid = 0; + } + + out: + if (rc != 0) + txAbortCommit(&cd, rc); + else + rc = rc1; + + TheEnd: + jFYI(1, ("txCommit: tid = %d, returning %d\n", tid, rc)); + return rc; +} + + +/* + * NAME: txLog() + * + * FUNCTION: Writes AFTER log records for all lines modified + * by tid for segments specified by inodes in comdata. + * Code assumes only WRITELOCKS are recorded in lockwords. + * + * PARAMETERS: + * + * RETURN : + */ +static int txLog(log_t * log, tblock_t * tblk, commit_t * cd) +{ + int rc = 0; + struct inode *ip; + lid_t lid; + tlock_t *tlck; + lrd_t *lrd = &cd->lrd; + + /* + * write log record(s) for each tlock of transaction, + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + tlck->flag |= tlckLOG; + + /* initialize lrd common */ + ip = tlck->ip; + lrd->aggregate = cpu_to_le32(kdev_t_to_nr(ip->i_dev)); + lrd->log.redopage.fileset = cpu_to_le32(JFS_IP(ip)->fileset); + lrd->log.redopage.inode = cpu_to_le32(ip->i_ino); + + if (tlck->mp) + hold_metapage(tlck->mp, 0); + + /* write log record of page from the tlock */ + switch (tlck->type & tlckTYPE) { + case tlckXTREE: + xtLog(log, tblk, lrd, tlck); + break; + + case tlckDTREE: + dtLog(log, tblk, lrd, tlck); + break; + + case tlckINODE: + diLog(log, tblk, lrd, tlck, cd); + break; + + case tlckMAP: + mapLog(log, tblk, lrd, tlck); + break; + + case tlckDATA: + dataLog(log, tblk, lrd, tlck); + break; + + default: + jERROR(1, ("UFO tlock:0x%p\n", tlck)); + } + if (tlck->mp) + release_metapage(tlck->mp); + } + + return rc; +} + + +/* + * diLog() + * + * function: log inode tlock and format maplock to update bmap; + */ +int diLog(log_t * log, + tblock_t * tblk, lrd_t * lrd, tlock_t * tlck, commit_t * cd) +{ + int rc = 0; + metapage_t *mp; + pxd_t *pxd; + pxdlock_t *pxdlock; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_INODE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2INODESLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* + * inode after image + */ + if (tlck->type & tlckENTRY) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); +// *pxd = mp->cm_pxd; + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else if (tlck->type & tlckFREE) { + /* + * free inode extent + * + * (pages of the freed inode extent have been invalidated and + * a maplock for free of the extent has been formatted at + * txLock() time); + * + * the tlock had been acquired on the inode allocation map page + * (iag) that specifies the freed extent, even though the map + * page is not itself logged, to prevent pageout of the map + * page before the log; + */ + assert(tlck->type & tlckFREE); + + /* log LOG_NOREDOINOEXT of the freed inode extent for + * logredo() to start NoRedoPage filters, and to update + * imap and bmap for free of the extent; + */ + lrd->type = cpu_to_le16(LOG_NOREDOINOEXT); + /* + * For the LOG_NOREDOINOEXT record, we need + * to pass the IAG number and inode extent + * index (within that IAG) from which the + * the extent being released. These have been + * passed to us in the iplist[1] and iplist[2]. + */ + lrd->log.noredoinoext.iagnum = + cpu_to_le32((u32) (size_t) cd->iplist[1]); + lrd->log.noredoinoext.inoext_idx = + cpu_to_le32((u32) (size_t) cd->iplist[2]); + + pxdlock = (pxdlock_t *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } else { + jERROR(2, ("diLog: UFO type tlck:0x%p\n", tlck)); + } +#ifdef _JFS_WIP + /* + * alloc/free external EA extent + * + * a maplock for txUpdateMap() to update bPWMAP for alloc/free + * of the extent has been formatted at txLock() time; + */ + else { + assert(tlck->type & tlckEA); + + /* log LOG_UPDATEMAP for logredo() to update bmap for + * alloc of new (and free of old) external EA extent; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (pxdlock_t *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +#endif /* _JFS_WIP */ + + return rc; +} + + +/* + * dataLog() + * + * function: log data tlock + */ +int dataLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + metapage_t *mp; + pxd_t *pxd; + int rc; + s64 xaddr; + int xflag; + s32 xlen; + + mp = tlck->mp; + + /* initialize as REDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DATA); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DATASLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + + if (JFS_IP(tlck->ip)->next_index < MAX_INLINE_DIRTABLE_ENTRY) { + /* + * The table has been truncated, we've must have deleted + * the last entry, so don't bother logging this + */ + mp->lid = 0; + atomic_dec(&mp->nohomeok); + discard_metapage(mp); + tlck->mp = 0; + return 0; + } + + rc = xtLookup(tlck->ip, mp->index, 1, &xflag, &xaddr, &xlen, 1); + if (rc || (xlen == 0)) { + jERROR(1, ("dataLog: can't find physical address\n")); + return 0; + } + + PXDaddress(pxd, xaddr); + PXDlength(pxd, mp->logical_size >> tblk->sb->s_blocksize_bits); + + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return 0; +} + + +/* + * dtLog() + * + * function: log dtree tlock and format maplock to update bmap; + */ +void dtLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + struct inode *ip; + metapage_t *mp; + pxdlock_t *pxdlock; + pxd_t *pxd; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_DTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2DTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + + /* + * page extension via relocation: entry insertion; + * page extension in-place: entry insertion; + * new right page from page split, reinitialized in-line + * root from root page split: entry insertion; + */ + if (tlck->type & (tlckNEW | tlckEXTEND)) { + /* log after-image of the new page for logredo(): + * mark log (LOG_NEW) for logredo() to initialize + * freelist and update bmap for alloc of the new page; + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + if (tlck->type & tlckEXTEND) + lrd->log.redopage.type |= cpu_to_le16(LOG_EXTEND); + else + lrd->log.redopage.type |= cpu_to_le16(LOG_NEW); +// *pxd = mp->cm_pxd; + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP for + * alloc of the new page; + */ + if (tlck->type & tlckBTROOT) + return; + tlck->flag |= tlckUPDATEMAP; + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckALLOCPXD; + pxdlock->pxd = *pxd; + + pxdlock->index = 1; + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * entry insertion/deletion, + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckENTRY | tlckRELINK)) { + /* log after-image for logredo(): */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + return; + } + + /* + * page deletion: page has been invalidated + * page relocation: source extent + * + * a maplock for free of the page has been formatted + * at txLock() time); + */ + if (tlck->type & (tlckFREE | tlckRELOCATE)) { + /* log LOG_NOREDOPAGE of the deleted page for logredo() + * to start NoRedoPage filter and to update bmap for free + * of the deletd page + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (pxdlock_t *) & tlck->lock; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + } + return; +} + + +/* + * xtLog() + * + * function: log xtree tlock and format maplock to update bmap; + */ +void xtLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + struct inode *ip; + metapage_t *mp; + xtpage_t *p; + xtlock_t *xtlck; + maplock_t *maplock; + xdlistlock_t *xadlock; + pxdlock_t *pxdlock; + pxd_t *pxd; + int next, lwm, hwm; + + ip = tlck->ip; + mp = tlck->mp; + + /* initialize as REDOPAGE/NOREDOPAGE record format */ + lrd->log.redopage.type = cpu_to_le16(LOG_XTREE); + lrd->log.redopage.l2linesize = cpu_to_le16(L2XTSLOTSIZE); + + pxd = &lrd->log.redopage.pxd; + + if (tlck->type & tlckBTROOT) { + lrd->log.redopage.type |= cpu_to_le16(LOG_BTROOT); + p = &JFS_IP(ip)->i_xtroot; + if (S_ISDIR(ip->i_mode)) + lrd->log.redopage.type |= + cpu_to_le16(LOG_DIR_XTREE); + } else + p = (xtpage_t *) mp->data; + next = le16_to_cpu(p->header.nextindex); + + xtlck = (xtlock_t *) & tlck->lock; + + maplock = (maplock_t *) & tlck->lock; + xadlock = (xdlistlock_t *) maplock; + + /* + * entry insertion/extension; + * sibling page link update (old right page before split); + */ + if (tlck->type & (tlckNEW | tlckGROW | tlckRELINK)) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); +// *pxd = mp->cm_pxd; + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb->s_blocksize_bits); + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + + if (lwm == next) + goto out; + assert(lwm < next); + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) { + int i; + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + */ + xadlock->xdlist = &xtlck->pxdlock; + memcpy(xadlock->xdlist, &p->xad[lwm], + sizeof(xad_t) * xadlock->count); + + for (i = 0; i < xadlock->count; i++) + p->xad[lwm + i].flag &= + ~(XAD_NEW | XAD_EXTENDED); + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily. + */ + xadlock->xdlist = &p->xad[lwm]; + tblk->xflag &= ~COMMIT_LAZY; + } + jFYI(1, + ("xtLog: alloc ip:0x%p mp:0x%p tlck:0x%p lwm:%d count:%d\n", + tlck->ip, mp, tlck, lwm, xadlock->count)); + + maplock->index = 1; + + out: + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + + return; + } + + /* + * page deletion: file deletion/truncation (ref. xtTruncate()) + * + * (page will be invalidated after log is written and bmap + * is updated from the page); + */ + if (tlck->type & tlckFREE) { + /* LOG_NOREDOPAGE log for NoRedoPage filter: + * if page free from file delete, NoRedoFile filter from + * inode image of zero link count will subsume NoRedoPage + * filters for each page; + * if page free from file truncattion, write NoRedoPage + * filter; + * + * upadte of block allocation map for the page itself: + * if page free from deletion and truncation, LOG_UPDATEMAP + * log for the page itself is generated from processing + * its parent page xad entries; + */ + /* if page free from file truncation, log LOG_NOREDOPAGE + * of the deleted page for logredo() to start NoRedoPage + * filter for the page; + */ + if (tblk->xflag & COMMIT_TRUNCATE) { + /* write NOREDOPAGE for the page */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + if (tlck->type & tlckBTROOT) { + /* Empty xtree must be logged */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + } + + /* init LOG_UPDATEMAP of the freed extents + * XAD[XTENTRYSTART:hwm) from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEXADLIST); + xtlck = (xtlock_t *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - XTENTRYSTART + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = XTENTRYSTART; + xtlck->header.length = hwm - XTENTRYSTART + 1; + xtlck->index = 1; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[XTENTRYSTART:hwm) from the + * deleted page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - XTENTRYSTART + 1; + if ((xadlock->count <= 2) && (tblk->xflag & COMMIT_LAZY)) { + /* + * Lazy commit may allow xtree to be modified before + * txUpdateMap runs. Copy xad into linelock to + * preserve correct data. + */ + xadlock->xdlist = &xtlck->pxdlock; + memcpy(xadlock->xdlist, &p->xad[XTENTRYSTART], + sizeof(xad_t) * xadlock->count); + } else { + /* + * xdlist will point to into inode's xtree, ensure + * that transaction is not committed lazily unless + * we're deleting the inode (unlink). In that case + * we have special logic for the inode to be + * unlocked by the lazy commit thread. + */ + xadlock->xdlist = &p->xad[XTENTRYSTART]; + if ((tblk->xflag & COMMIT_LAZY) && + (tblk->xflag & COMMIT_DELETE) && + (tblk->ip == ip)) + set_cflag(COMMIT_Holdlock, ip); + else + tblk->xflag &= ~COMMIT_LAZY; + } + jFYI(1, + ("xtLog: free ip:0x%p mp:0x%p count:%d lwm:2\n", + tlck->ip, mp, xadlock->count)); + + maplock->index = 1; + + /* mark page as invalid */ + if (((tblk->xflag & COMMIT_PWMAP) || S_ISDIR(ip->i_mode)) + && !(tlck->type & tlckBTROOT)) + tlck->flag |= tlckFREEPAGE; + /* + else (tblk->xflag & COMMIT_PMAP) + ? release the page; + */ + return; + } + + /* + * page/entry truncation: file truncation (ref. xtTruncate()) + * + * |----------+------+------+---------------| + * | | | + * | | hwm - hwm before truncation + * | next - truncation point + * lwm - lwm before truncation + * header ? + */ + if (tlck->type & tlckTRUNCATE) { + pxd_t tpxd; /* truncated extent of xad */ + + /* + * For truncation the entire linelock may be used, so it would + * be difficult to store xad list in linelock itself. + * Therefore, we'll just force transaction to be committed + * synchronously, so that xtree pages won't be changed before + * txUpdateMap runs. + */ + tblk->xflag &= ~COMMIT_LAZY; + lwm = xtlck->lwm.offset; + if (lwm == 0) + lwm = XTPAGEMAXSLOT; + hwm = xtlck->hwm.offset; + + /* + * write log records + */ + /* + * allocate entries XAD[lwm:next]: + */ + if (lwm < next) { + /* log after-image for logredo(): + * logredo() will update bmap for alloc of new/extended + * extents (XAD_NEW|XAD_EXTEND) of XAD[lwm:next) from + * after-image of XADlist; + * logredo() resets (XAD_NEW|XAD_EXTEND) flag when + * applying the after-image to the meta-data page. + */ + lrd->type = cpu_to_le16(LOG_REDOPAGE); + PXDaddress(pxd, mp->index); + PXDlength(pxd, + mp->logical_size >> tblk->sb-> + s_blocksize_bits); + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * truncate entry XAD[hwm == next - 1]: + */ + if (hwm == next - 1) { + /* init LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated delta extent of the truncated + * entry XAD[next - 1]: + * (xtlck->pxdlock = truncated delta extent); + */ + pxdlock = (pxdlock_t *) & xtlck->pxdlock; + /* assert(pxdlock->type & tlckTRUNCATE); */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + tpxd = pxdlock->pxd; /* save to format maplock */ + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* init LOG_UPDATEMAP of the freed extents + * XAD[next:hwm] from the deleted page itself + * for logredo() to update bmap; + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEXADLIST); + xtlck = (xtlock_t *) & tlck->lock; + hwm = xtlck->hwm.offset; + lrd->log.updatemap.nxd = + cpu_to_le16(hwm - next + 1); + /* reformat linelock for lmLog() */ + xtlck->header.offset = next; + xtlck->header.length = hwm - next + 1; + xtlck->index = 1; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, tlck)); + } + + /* + * format maplock(s) for txUpdateMap() to update bmap + */ + maplock->index = 0; + + /* + * allocate entries XAD[lwm:next): + */ + if (lwm < next) { + /* format a maplock for txUpdateMap() to update bPMAP + * for alloc of new/extended extents of XAD[lwm:next) + * from the page itself; + * txUpdateMap() resets (XAD_NEW|XAD_EXTEND) flag. + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckALLOCXADLIST; + xadlock->count = next - lwm; + xadlock->xdlist = &p->xad[lwm]; + + jFYI(1, + ("xtLog: alloc ip:0x%p mp:0x%p count:%d lwm:%d next:%d\n", + tlck->ip, mp, xadlock->count, lwm, next)); + maplock->index++; + xadlock++; + } + + /* + * truncate entry XAD[hwm == next - 1]: + */ + if (hwm == next - 1) { + pxdlock_t *pxdlock; + + /* format a maplock for txUpdateMap() to update bmap + * to free truncated delta extent of the truncated + * entry XAD[next - 1]; + * (xtlck->pxdlock = truncated delta extent); + */ + tlck->flag |= tlckUPDATEMAP; + pxdlock = (pxdlock_t *) xadlock; + pxdlock->flag = mlckFREEPXD; + pxdlock->count = 1; + pxdlock->pxd = tpxd; + + jFYI(1, + ("xtLog: truncate ip:0x%p mp:0x%p count:%d hwm:%d\n", + ip, mp, pxdlock->count, hwm)); + maplock->index++; + xadlock++; + } + + /* + * free entries XAD[next:hwm]: + */ + if (hwm >= next) { + /* format a maplock for txUpdateMap() to update bmap + * to free extents of XAD[next:hwm] from thedeleted + * page itself; + */ + tlck->flag |= tlckUPDATEMAP; + xadlock->flag = mlckFREEXADLIST; + xadlock->count = hwm - next + 1; + xadlock->xdlist = &p->xad[next]; + + jFYI(1, + ("xtLog: free ip:0x%p mp:0x%p count:%d next:%d hwm:%d\n", + tlck->ip, mp, xadlock->count, next, hwm)); + maplock->index++; + } + + /* mark page as homeward bound */ + tlck->flag |= tlckWRITEPAGE; + } + return; +} + + +/* + * mapLog() + * + * function: log from maplock of freed data extents; + */ +void mapLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck) +{ + pxdlock_t *pxdlock; + int i, nlock; + pxd_t *pxd; + + /* + * page relocation: free the source page extent + * + * a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time saving the src + * relocated page address; + */ + if (tlck->type & tlckRELOCATE) { + /* log LOG_NOREDOPAGE of the old relocated page + * for logredo() to start NoRedoPage filter; + */ + lrd->type = cpu_to_le16(LOG_NOREDOPAGE); + pxdlock = (pxdlock_t *) & tlck->lock; + pxd = &lrd->log.redopage.pxd; + *pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* (N.B. currently, logredo() does NOT update bmap + * for free of the page itself for (LOG_XTREE|LOG_NOREDOPAGE); + * if page free from relocation, LOG_UPDATEMAP log is + * specifically generated now for logredo() + * to update bmap for free of src relocated page; + * (new flag LOG_RELOCATE may be introduced which will + * inform logredo() to start NORedoPage filter and also + * update block allocation map at the same time, thus + * avoiding an extra log write); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + lrd->log.updatemap.type = cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + + /* a maplock for txUpdateMap() for free of the page + * has been formatted at txLock() time; + */ + tlck->flag |= tlckUPDATEMAP; + return; + } + /* + + * Otherwise it's not a relocate request + * + */ + else { + /* log LOG_UPDATEMAP for logredo() to update bmap for + * free of truncated/relocated delta extent of the data; + * e.g.: external EA extent, relocated/truncated extent + * from xtTailgate(); + */ + lrd->type = cpu_to_le16(LOG_UPDATEMAP); + pxdlock = (pxdlock_t *) & tlck->lock; + nlock = pxdlock->index; + for (i = 0; i < nlock; i++, pxdlock++) { + if (pxdlock->flag & mlckALLOCPXD) + lrd->log.updatemap.type = + cpu_to_le16(LOG_ALLOCPXD); + else + lrd->log.updatemap.type = + cpu_to_le16(LOG_FREEPXD); + lrd->log.updatemap.nxd = cpu_to_le16(1); + lrd->log.updatemap.pxd = pxdlock->pxd; + lrd->backchain = + cpu_to_le32(lmLog(log, tblk, lrd, NULL)); + jFYI(1, ("mapLog: xaddr:0x%lx xlen:0x%x\n", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd))); + } + + /* update bmap */ + tlck->flag |= tlckUPDATEMAP; + } +} + + +/* + * txEA() + * + * function: acquire maplock for EA/ACL extents or + * set COMMIT_INLINE flag; + */ +void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea) +{ + tlock_t *tlck = NULL; + pxdlock_t *maplock = NULL, *pxdlock = NULL; + + /* + * format maplock for alloc of new EA extent + */ + if (newea) { + /* Since the newea could be a completely zeroed entry we need to + * check for the two flags which indicate we should actually + * commit new EA data + */ + if (newea->flag & DXD_EXTENT) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (pxdlock_t *) & tlck->lock; + pxdlock = (pxdlock_t *) maplock; + pxdlock->flag = mlckALLOCPXD; + PXDaddress(&pxdlock->pxd, addressDXD(newea)); + PXDlength(&pxdlock->pxd, lengthDXD(newea)); + pxdlock++; + maplock->index = 1; + } else if (newea->flag & DXD_INLINE) { + tlck = NULL; + + set_cflag(COMMIT_Inlineea, ip); + } + } + + /* + * format maplock for free of old EA extent + */ + if (!test_cflag(COMMIT_Nolink, ip) && oldea->flag & DXD_EXTENT) { + if (tlck == NULL) { + tlck = txMaplock(tid, ip, tlckMAP); + maplock = (pxdlock_t *) & tlck->lock; + pxdlock = (pxdlock_t *) maplock; + maplock->index = 0; + } + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressDXD(oldea)); + PXDlength(&pxdlock->pxd, lengthDXD(oldea)); + maplock->index++; + } +} + + +/* + * txForce() + * + * function: synchronously write pages locked by transaction + * after txLog() but before txUpdateMap(); + */ +void txForce(tblock_t * tblk) +{ + tlock_t *tlck; + lid_t lid, next; + metapage_t *mp; + + /* + * reverse the order of transaction tlocks in + * careful update order of address index pages + * (right to left, bottom up) + */ + tlck = lid_to_tlock(tblk->next); + lid = tlck->next; + tlck->next = 0; + while (lid) { + tlck = lid_to_tlock(lid); + next = tlck->next; + tlck->next = tblk->next; + tblk->next = lid; + lid = next; + } + + /* + * synchronously write the page, and + * hold the page for txUpdateMap(); + */ + for (lid = tblk->next; lid; lid = next) { + tlck = lid_to_tlock(lid); + next = tlck->next; + + if ((mp = tlck->mp) != NULL && + (tlck->type & tlckBTROOT) == 0) { + assert(mp->xflag & COMMIT_PAGE); + + if (tlck->flag & tlckWRITEPAGE) { + tlck->flag &= ~tlckWRITEPAGE; + + /* do not release page to freelist */ + assert(atomic_read(&mp->nohomeok)); + hold_metapage(mp, 0); + write_metapage(mp); + } + } + } +} + + +/* + * txUpdateMap() + * + * function: update persistent allocation map (and working map + * if appropriate); + * + * parameter: + */ +static void txUpdateMap(tblock_t * tblk) +{ + struct inode *ip; + struct inode *ipimap; + lid_t lid; + tlock_t *tlck; + maplock_t *maplock; + pxdlock_t pxdlock; + int maptype; + int k, nlock; + metapage_t *mp = 0; + + ipimap = JFS_SBI(tblk->sb)->ipimap; + + maptype = (tblk->xflag & COMMIT_PMAP) ? COMMIT_PMAP : COMMIT_PWMAP; + + + /* + * update block allocation map + * + * update allocation state in pmap (and wmap) and + * update lsn of the pmap page; + */ + /* + * scan each tlock/page of transaction for block allocation/free: + * + * for each tlock/page of transaction, update map. + * ? are there tlock for pmap and pwmap at the same time ? + */ + for (lid = tblk->next; lid; lid = tlck->next) { + tlck = lid_to_tlock(lid); + + if ((tlck->flag & tlckUPDATEMAP) == 0) + continue; + + if (tlck->flag & tlckFREEPAGE) { + /* + * Another thread may attempt to reuse freed space + * immediately, so we want to get rid of the metapage + * before anyone else has a chance to get it. + * Lock metapage, update maps, then invalidate + * the metapage. + */ + mp = tlck->mp; + ASSERT(mp->xflag & COMMIT_PAGE); + hold_metapage(mp, 0); + } + + /* + * extent list: + * . in-line PXD list: + * . out-of-line XAD list: + */ + maplock = (maplock_t *) & tlck->lock; + nlock = maplock->index; + + for (k = 0; k < nlock; k++, maplock++) { + /* + * allocate blocks in persistent map: + * + * blocks have been allocated from wmap at alloc time; + */ + if (maplock->flag & mlckALLOC) { + txAllocPMap(ipimap, maplock, tblk); + } + /* + * free blocks in persistent and working map: + * blocks will be freed in pmap and then in wmap; + * + * ? tblock specifies the PMAP/PWMAP based upon + * transaction + * + * free blocks in persistent map: + * blocks will be freed from wmap at last reference + * release of the object for regular files; + * + * Alway free blocks from both persistent & working + * maps for directories + */ + else { /* (maplock->flag & mlckFREE) */ + + if (S_ISDIR(tlck->ip->i_mode)) + txFreeMap(ipimap, maplock, + tblk, COMMIT_PWMAP); + else + txFreeMap(ipimap, maplock, + tblk, maptype); + } + } + if (tlck->flag & tlckFREEPAGE) { + if (!(tblk->flag & tblkGC_LAZY)) { + /* This is equivalent to txRelease */ + ASSERT(mp->lid == lid); + tlck->mp->lid = 0; + } + assert(atomic_read(&mp->nohomeok) == 1); + atomic_dec(&mp->nohomeok); + discard_metapage(mp); + tlck->mp = 0; + } + } + /* + * update inode allocation map + * + * update allocation state in pmap and + * update lsn of the pmap page; + * update in-memory inode flag/state + * + * unlock mapper/write lock + */ + if (tblk->xflag & COMMIT_CREATE) { + ip = tblk->ip; + + ASSERT(test_cflag(COMMIT_New, ip)); + clear_cflag(COMMIT_New, ip); + + diUpdatePMap(ipimap, ip->i_ino, FALSE, tblk); + ipimap->i_state |= I_DIRTY; + /* update persistent block allocation map + * for the allocation of inode extent; + */ + pxdlock.flag = mlckALLOCPXD; + pxdlock.pxd = JFS_IP(ip)->ixpxd; + pxdlock.index = 1; + txAllocPMap(ip, (maplock_t *) & pxdlock, tblk); + iput(ip); + } else if (tblk->xflag & COMMIT_DELETE) { + ip = tblk->ip; + diUpdatePMap(ipimap, ip->i_ino, TRUE, tblk); + ipimap->i_state |= I_DIRTY; + if (test_and_clear_cflag(COMMIT_Holdlock, ip)) { + if (tblk->flag & tblkGC_LAZY) + IWRITE_UNLOCK(ip); + } + iput(ip); + } +} + + +/* + * txAllocPMap() + * + * function: allocate from persistent map; + * + * parameter: + * ipbmap - + * malock - + * xad list: + * pxd: + * + * maptype - + * allocate from persistent map; + * free from persistent map; + * (e.g., tmp file - free from working map at releae + * of last reference); + * free from persistent and working map; + * + * lsn - log sequence number; + */ +static void txAllocPMap(struct inode *ip, maplock_t * maplock, + tblock_t * tblk) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + xdlistlock_t *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + pxdlock_t *pxdlock; + xdlistlock_t *pxdlistlock; + pxd_t *pxd; + int n; + + /* + * allocate from persistent map; + */ + if (maplock->flag & mlckALLOCXADLIST) { + xadlistlock = (xdlistlock_t *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (xad->flag & (XAD_NEW | XAD_EXTENDED)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, FALSE, xaddr, + (s64) xlen, tblk); + xad->flag &= ~(XAD_NEW | XAD_EXTENDED); + jFYI(1, + ("allocPMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } + } else if (maplock->flag & mlckALLOCPXD) { + pxdlock = (pxdlock_t *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, tblk); + jFYI(1, + ("allocPMap: xaddr:0x%lx xlen:%d\n", (ulong) xaddr, + xlen)); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (xdlistlock_t *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, FALSE, xaddr, (s64) xlen, + tblk); + jFYI(1, + ("allocPMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } +} + + +/* + * txFreeMap() + * + * function: free from persistent and/or working map; + * + * todo: optimization + */ +void txFreeMap(struct inode *ip, + maplock_t * maplock, tblock_t * tblk, int maptype) +{ + struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap; + xdlistlock_t *xadlistlock; + xad_t *xad; + s64 xaddr; + int xlen; + pxdlock_t *pxdlock; + xdlistlock_t *pxdlistlock; + pxd_t *pxd; + int n; + + jFYI(1, + ("txFreeMap: tblk:0x%p maplock:0x%p maptype:0x%x\n", + tblk, maplock, maptype)); + + /* + * free from persistent map; + */ + if (maptype == COMMIT_PMAP || maptype == COMMIT_PWMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (xdlistlock_t *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + if (!(xad->flag & XAD_NEW)) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbUpdatePMap(ipbmap, TRUE, xaddr, + (s64) xlen, tblk); + jFYI(1, + ("freePMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (pxdlock_t *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbUpdatePMap(ipbmap, TRUE, xaddr, (s64) xlen, + tblk); + jFYI(1, + ("freePMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } else { /* (maplock->flag & mlckALLOCPXDLIST) */ + + pxdlistlock = (xdlistlock_t *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbUpdatePMap(ipbmap, TRUE, xaddr, + (s64) xlen, tblk); + jFYI(1, + ("freePMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } + } + + /* + * free from working map; + */ + if (maptype == COMMIT_PWMAP || maptype == COMMIT_WMAP) { + if (maplock->flag & mlckFREEXADLIST) { + xadlistlock = (xdlistlock_t *) maplock; + xad = xadlistlock->xdlist; + for (n = 0; n < xadlistlock->count; n++, xad++) { + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + dbFree(ip, xaddr, (s64) xlen); + xad->flag = 0; + jFYI(1, + ("freeWMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } else if (maplock->flag & mlckFREEPXD) { + pxdlock = (pxdlock_t *) maplock; + xaddr = addressPXD(&pxdlock->pxd); + xlen = lengthPXD(&pxdlock->pxd); + dbFree(ip, xaddr, (s64) xlen); + jFYI(1, + ("freeWMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } else { /* (maplock->flag & mlckFREEPXDLIST) */ + + pxdlistlock = (xdlistlock_t *) maplock; + pxd = pxdlistlock->xdlist; + for (n = 0; n < pxdlistlock->count; n++, pxd++) { + xaddr = addressPXD(pxd); + xlen = lengthPXD(pxd); + dbFree(ip, xaddr, (s64) xlen); + jFYI(1, + ("freeWMap: xaddr:0x%lx xlen:%d\n", + (ulong) xaddr, xlen)); + } + } + } +} + + +/* + * txFreelock() + * + * function: remove tlock from inode anonymous locklist + */ +void txFreelock(struct inode *ip) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + tlock_t *xtlck, *tlck; + lid_t xlid = 0, lid; + + if (!jfs_ip->atlhead) + return; + + xtlck = (tlock_t *) &jfs_ip->atlhead; + + while ((lid = xtlck->next)) { + tlck = lid_to_tlock(lid); + if (tlck->flag & tlckFREELOCK) { + xtlck->next = tlck->next; + txLockFree(lid); + } else { + xtlck = tlck; + xlid = lid; + } + } + + if (jfs_ip->atlhead) + jfs_ip->atltail = xlid; + else { + jfs_ip->atltail = 0; + /* + * If inode was on anon_list, remove it + */ + TXN_LOCK(); + list_del_init(&jfs_ip->anon_inode_list); + TXN_UNLOCK(); + } +} + + +/* + * txAbort() + * + * function: abort tx before commit; + * + * frees line-locks and segment locks for all + * segments in comdata structure. + * Optionally sets state of file-system to FM_DIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). + */ +void txAbort(tid_t tid, int dirty) +{ + lid_t lid, next; + metapage_t *mp; + tblock_t *tblk = tid_to_tblock(tid); + + jEVENT(1, ("txAbort: tid:%d dirty:0x%x\n", tid, dirty)); + + /* + * free tlocks of the transaction + */ + for (lid = tblk->next; lid; lid = next) { + next = lid_to_tlock(lid)->next; + + mp = lid_to_tlock(lid)->mp; + + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap: + * + * (page may have been previously committed by another + * transaction(s) but has not been paged, i.e., + * it may be on logsync list even though it has not + * been logged for the current tx.) + */ + if (mp->xflag & COMMIT_PAGE && mp->lsn) + LogSyncRelease(mp); + } + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + /* caller will free the transaction block */ + + tblk->next = tblk->last = 0; + + /* + * mark filesystem dirty + */ + if (dirty) + updateSuper(tblk->sb, FM_DIRTY); + + return; +} + + +/* + * txAbortCommit() + * + * function: abort commit. + * + * frees tlocks of transaction; line-locks and segment locks for all + * segments in comdata structure. frees malloc storage + * sets state of file-system to FM_MDIRTY in super-block. + * log age of page-frames in memory for which caller has + * are reset to 0 (to avoid logwarap). + */ +void txAbortCommit(commit_t * cd, int exval) +{ + tblock_t *tblk; + tid_t tid; + lid_t lid, next; + metapage_t *mp; + + assert(exval == EIO || exval == ENOMEM); + jEVENT(1, ("txAbortCommit: cd:0x%p\n", cd)); + + /* + * free tlocks of the transaction + */ + tid = cd->tid; + tblk = tid_to_tblock(tid); + for (lid = tblk->next; lid; lid = next) { + next = lid_to_tlock(lid)->next; + + mp = lid_to_tlock(lid)->mp; + if (mp) { + mp->lid = 0; + + /* + * reset lsn of page to avoid logwarap; + */ + if (mp->xflag & COMMIT_PAGE) + LogSyncRelease(mp); + } + + /* insert tlock at head of freelist */ + TXN_LOCK(); + txLockFree(lid); + TXN_UNLOCK(); + } + + tblk->next = tblk->last = 0; + + /* free the transaction block */ + txEnd(tid); + + /* + * mark filesystem dirty + */ + updateSuper(cd->sb, FM_DIRTY); +} + + +/* + * txLazyCommit(void) + * + * All transactions except those changing ipimap (COMMIT_FORCE) are + * processed by this routine. This insures that the inode and block + * allocation maps are updated in order. For synchronous transactions, + * let the user thread finish processing after txUpdateMap() is called. + */ +void txLazyCommit(tblock_t * tblk) +{ + log_t *log; + + while (((tblk->flag & tblkGC_READY) == 0) && + ((tblk->flag & tblkGC_UNLOCKED) == 0)) { + /* We must have gotten ahead of the user thread + */ + jFYI(1, + ("jfs_lazycommit: tblk 0x%p not unlocked\n", tblk)); + schedule(); + } + + jFYI(1, ("txLazyCommit: processing tblk 0x%p\n", tblk)); + + txUpdateMap(tblk); + + log = (log_t *) JFS_SBI(tblk->sb)->log; + + spin_lock_irq(&log->gclock); // LOGGC_LOCK + + tblk->flag |= tblkGC_COMMITTED; + + if ((tblk->flag & tblkGC_READY) || (tblk->flag & tblkGC_LAZY)) + log->gcrtc--; + + if (tblk->flag & tblkGC_READY) + wake_up(&tblk->gcwait); // LOGGC_WAKEUP + + spin_unlock_irq(&log->gclock); // LOGGC_UNLOCK + + if (tblk->flag & tblkGC_LAZY) { + txUnlock(tblk, 0); + tblk->flag &= ~tblkGC_LAZY; + txEnd(tblk - TxBlock); /* Convert back to tid */ + } + + jFYI(1, ("txLazyCommit: done: tblk = 0x%p\n", tblk)); +} + +/* + * jfs_lazycommit(void) + * + * To be run as a kernel daemon. If lbmIODone is called in an interrupt + * context, or where blocking is not wanted, this routine will process + * committed transactions from the unlock queue. + */ +int jfs_lazycommit(void) +{ + int WorkDone; + tblock_t *tblk; + unsigned long flags; + + lock_kernel(); + + daemonize(); + current->tty = NULL; + strcpy(current->comm, "jfsCommit"); + + unlock_kernel(); + + jfsCommitTask = current; + + spin_lock_irq(¤t->sigmask_lock); + siginitsetinv(¤t->blocked, + sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) + | sigmask(SIGCONT)); + spin_unlock_irq(¤t->sigmask_lock); + + LAZY_LOCK_INIT(); + TxAnchor.unlock_queue = TxAnchor.unlock_tail = 0; + + complete(&jfsIOwait); + + do { + LAZY_LOCK(flags); +restart: + WorkDone = 0; + while ((tblk = TxAnchor.unlock_queue)) { + /* + * We can't get ahead of user thread. Spinning is + * simpler than blocking/waking. We shouldn't spin + * very long, since user thread shouldn't be blocking + * between lmGroupCommit & txEnd. + */ + WorkDone = 1; + + /* + * Remove first transaction from queue + */ + TxAnchor.unlock_queue = tblk->cqnext; + tblk->cqnext = 0; + if (TxAnchor.unlock_tail == tblk) + TxAnchor.unlock_tail = 0; + + LAZY_UNLOCK(flags); + txLazyCommit(tblk); + + /* + * We can be running indefinately if other processors + * are adding transactions to this list + */ + if (current->need_resched) + schedule(); + LAZY_LOCK(flags); + } + + if (WorkDone) + goto restart; + + LAZY_UNLOCK(flags); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } while (!jfs_thread_stopped()); + + if (TxAnchor.unlock_queue) + jERROR(1, ("jfs_lazycommit being killed with pending transactions!\n")); + else + jFYI(1, ("jfs_lazycommit being killed\n")); + complete(&jfsIOwait); + return 0; +} + +void txLazyUnlock(tblock_t * tblk) +{ + unsigned long flags; + + LAZY_LOCK(flags); + + if (TxAnchor.unlock_tail) + TxAnchor.unlock_tail->cqnext = tblk; + else + TxAnchor.unlock_queue = tblk; + TxAnchor.unlock_tail = tblk; + tblk->cqnext = 0; + LAZY_UNLOCK(flags); + wake_up_process(jfsCommitTask); +} + +static void LogSyncRelease(metapage_t * mp) +{ + log_t *log = mp->log; + + assert(atomic_read(&mp->nohomeok)); + assert(log); + atomic_dec(&mp->nohomeok); + + if (atomic_read(&mp->nohomeok)) + return; + + hold_metapage(mp, 0); + + LOGSYNC_LOCK(log); + mp->log = NULL; + mp->lsn = 0; + mp->clsn = 0; + log->count--; + list_del_init(&mp->synclist); + LOGSYNC_UNLOCK(log); + + release_metapage(mp); +} + +/* + * jfs_sync(void) + * + * To be run as a kernel daemon. This is awakened when tlocks run low. + * We write any inodes that have anonymous tlocks so they will become + * available. + */ +int jfs_sync(void) +{ + struct inode *ip; + struct jfs_inode_info *jfs_ip; + + lock_kernel(); + + daemonize(); + current->tty = NULL; + strcpy(current->comm, "jfsSync"); + + unlock_kernel(); + + jfsSyncTask = current; + + spin_lock_irq(¤t->sigmask_lock); + siginitsetinv(¤t->blocked, + sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) + | sigmask(SIGCONT)); + spin_unlock_irq(¤t->sigmask_lock); + + complete(&jfsIOwait); + + do { + /* + * write each inode on the anonymous inode list + */ + TXN_LOCK(); + while (TlocksLow && !list_empty(&TxAnchor.anon_list)) { + jfs_ip = list_entry(TxAnchor.anon_list.next, + struct jfs_inode_info, + anon_inode_list); + ip = jfs_ip->inode; + + /* + * We must release the TXN_LOCK since our + * IWRITE_TRYLOCK implementation may still block + */ + TXN_UNLOCK(); + if (IWRITE_TRYLOCK(ip)) { + /* + * inode will be removed from anonymous list + * when it is committed + */ + jfs_commit_inode(ip, 0); + IWRITE_UNLOCK(ip); + /* + * Just to be safe. I don't know how + * long we can run without blocking + */ + if (current->need_resched) + schedule(); + TXN_LOCK(); + } else { + /* We can't get the write lock. It may + * be held by a thread waiting for tlock's + * so let's not block here. Save it to + * put back on the anon_list. + */ + + /* + * We released TXN_LOCK, let's make sure + * this inode is still there + */ + TXN_LOCK(); + if (TxAnchor.anon_list.next != + &jfs_ip->anon_inode_list) + continue; + + /* Take off anon_list */ + list_del(&jfs_ip->anon_inode_list); + + /* Put on anon_list2 */ + list_add(&jfs_ip->anon_inode_list, + &TxAnchor.anon_list2); + } + } + /* Add anon_list2 back to anon_list */ + if (!list_empty(&TxAnchor.anon_list2)) { + list_splice(&TxAnchor.anon_list2, &TxAnchor.anon_list); + INIT_LIST_HEAD(&TxAnchor.anon_list2); + } + TXN_UNLOCK(); + + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } while (!jfs_thread_stopped()); + + jFYI(1, ("jfs_sync being killed\n")); + complete(&jfsIOwait); + return 0; +} + +#if CONFIG_PROC_FS +int jfs_txanchor_read(char *buffer, char **start, off_t offset, int length, + int *eof, void *data) +{ + int len = 0; + off_t begin; + char *freewait; + char *freelockwait; + char *lowlockwait; + + freewait = + waitqueue_active(&TxAnchor.freewait) ? "active" : "empty"; + freelockwait = + waitqueue_active(&TxAnchor.freelockwait) ? "active" : "empty"; + lowlockwait = + waitqueue_active(&TxAnchor.lowlockwait) ? "active" : "empty"; + + len += sprintf(buffer, + "JFS TxAnchor\n" + "============\n" + "freetid = %d\n" + "freewait = %s\n" + "freelock = %d\n" + "freelockwait = %s\n" + "lowlockwait = %s\n" + "tlocksInUse = %d\n" + "unlock_queue = 0x%p\n" + "unlock_tail = 0x%p\n", + TxAnchor.freetid, + freewait, + TxAnchor.freelock, + freelockwait, + lowlockwait, + TxAnchor.tlocksInUse, + TxAnchor.unlock_queue, + TxAnchor.unlock_tail); + + begin = offset; + *start = buffer + begin; + len -= begin; + + if (len > length) + len = length; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_txnmgr.h linuxppc64_2_4/fs/jfs/jfs_txnmgr.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_txnmgr.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_txnmgr.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,315 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Change History : + * + */ + +#ifndef _H_JFS_TXNMGR +#define _H_JFS_TXNMGR +/* + * jfs_txnmgr.h: transaction manager + */ + +#include "jfs_logmgr.h" + +/* + * Hide implementation of TxBlock and TxLock + */ +#define tid_to_tblock(tid) (&TxBlock[tid]) + +#define lid_to_tlock(lid) (&TxLock[lid]) + +/* + * transaction block + */ +typedef struct tblock { + /* + * tblock_t and jbuf_t common area: struct logsyncblk + * + * the following 5 fields are the same as struct logsyncblk + * which is common to tblock and jbuf to form logsynclist + */ + u16 xflag; /* tx commit type */ + u16 flag; /* tx commit state */ + lid_t dummy; /* Must keep structures common */ + s32 lsn; /* recovery lsn */ + struct list_head synclist; /* logsynclist link */ + + /* lock management */ + struct super_block *sb; /* 4: super block */ + lid_t next; /* 2: index of first tlock of tid */ + lid_t last; /* 2: index of last tlock of tid */ + wait_queue_head_t waitor; /* 4: tids waiting on this tid */ + + /* log management */ + u32 logtid; /* 4: log transaction id */ + /* (32) */ + + /* commit management */ + struct tblock *cqnext; /* 4: commit queue link */ + s32 clsn; /* 4: commit lsn */ + struct lbuf *bp; /* 4: */ + s32 pn; /* 4: commit record log page number */ + s32 eor; /* 4: commit record eor */ + wait_queue_head_t gcwait; /* 4: group commit event list: + * ready transactions wait on this + * event for group commit completion. + */ + struct inode *ip; /* 4: inode being created or deleted */ + s32 rsrvd; /* 4: */ +} tblock_t; /* (64) */ + +extern struct tblock *TxBlock; /* transaction block table */ + +/* commit flags: tblk->xflag */ +#define COMMIT_SYNC 0x0001 /* synchronous commit */ +#define COMMIT_FORCE 0x0002 /* force pageout at end of commit */ +#define COMMIT_FLUSH 0x0004 /* init flush at end of commit */ +#define COMMIT_MAP 0x00f0 +#define COMMIT_PMAP 0x0010 /* update pmap */ +#define COMMIT_WMAP 0x0020 /* update wmap */ +#define COMMIT_PWMAP 0x0040 /* update pwmap */ +#define COMMIT_FREE 0x0f00 +#define COMMIT_DELETE 0x0100 /* inode delete */ +#define COMMIT_TRUNCATE 0x0200 /* file truncation */ +#define COMMIT_CREATE 0x0400 /* inode create */ +#define COMMIT_LAZY 0x0800 /* lazy commit */ +#define COMMIT_PAGE 0x1000 /* Identifies element as metapage */ +#define COMMIT_INODE 0x2000 /* Identifies element as inode */ + +/* group commit flags tblk->flag: see jfs_logmgr.h */ + +/* + * transaction lock + */ +typedef struct tlock { + lid_t next; /* index next lockword on tid locklist + * next lockword on freelist + */ + tid_t tid; /* transaction id holding lock */ + + u16 flag; /* 2: lock control */ + u16 type; /* 2: log type */ + + struct metapage *mp; /* 4: object page buffer locked */ + struct inode *ip; /* 4: object */ + /* (16) */ + + s16 lock[24]; /* 48: overlay area */ +} tlock_t; /* (64) */ + +extern struct tlock *TxLock; /* transaction lock table */ + +/* + * tlock flag + */ +/* txLock state */ +#define tlckPAGELOCK 0x8000 +#define tlckINODELOCK 0x4000 +#define tlckLINELOCK 0x2000 +#define tlckINLINELOCK 0x1000 +/* lmLog state */ +#define tlckLOG 0x0800 +/* updateMap state */ +#define tlckUPDATEMAP 0x0080 +/* freeLock state */ +#define tlckFREELOCK 0x0008 +#define tlckWRITEPAGE 0x0004 +#define tlckFREEPAGE 0x0002 + +/* + * tlock type + */ +#define tlckTYPE 0xfe00 +#define tlckINODE 0x8000 +#define tlckXTREE 0x4000 +#define tlckDTREE 0x2000 +#define tlckMAP 0x1000 +#define tlckEA 0x0800 +#define tlckACL 0x0400 +#define tlckDATA 0x0200 +#define tlckBTROOT 0x0100 + +#define tlckOPERATION 0x00ff +#define tlckGROW 0x0001 /* file grow */ +#define tlckREMOVE 0x0002 /* file delete */ +#define tlckTRUNCATE 0x0004 /* file truncate */ +#define tlckRELOCATE 0x0008 /* file/directory relocate */ +#define tlckENTRY 0x0001 /* directory insert/delete */ +#define tlckEXTEND 0x0002 /* directory extend in-line */ +#define tlckSPLIT 0x0010 /* splited page */ +#define tlckNEW 0x0020 /* new page from split */ +#define tlckFREE 0x0040 /* free page */ +#define tlckRELINK 0x0080 /* update sibling pointer */ + +/* + * linelock for lmLog() + * + * note: linelock_t and its variations are overlaid + * at tlock.lock: watch for alignment; + */ +typedef struct { + u8 offset; /* 1: */ + u8 length; /* 1: */ +} lv_t; /* (2) */ + +#define TLOCKSHORT 20 +#define TLOCKLONG 28 + +typedef struct { + u16 next; /* 2: next linelock */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + lv_t lv[20]; /* 40: */ +} linelock_t; /* (48) */ + +#define dtlock_t linelock_t +#define itlock_t linelock_t + +typedef struct { + u16 next; /* 2: */ + + s8 maxcnt; /* 1: */ + s8 index; /* 1: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 l2linesize; /* 1: log2 of linesize */ + /* (8) */ + + lv_t header; /* 2: */ + lv_t lwm; /* 2: low water mark */ + lv_t hwm; /* 2: high water mark */ + lv_t twm; /* 2: */ + /* (16) */ + + s32 pxdlock[8]; /* 32: */ +} xtlock_t; /* (48) */ + + +/* + * maplock for txUpdateMap() + * + * note: maplock_t and its variations are overlaid + * at tlock.lock/linelock: watch for alignment; + * N.B. next field may be set by linelock, and should not + * be modified by maplock; + * N.B. index of the first pxdlock specifies index of next + * free maplock (i.e., number of maplock) in the tlock; + */ +typedef struct { + u16 next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: next free maplock index */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + pxd_t pxd; /* 8: */ +} maplock_t; /* (16): */ + +/* maplock flag */ +#define mlckALLOC 0x00f0 +#define mlckALLOCXADLIST 0x0080 +#define mlckALLOCPXDLIST 0x0040 +#define mlckALLOCXAD 0x0020 +#define mlckALLOCPXD 0x0010 +#define mlckFREE 0x000f +#define mlckFREEXADLIST 0x0008 +#define mlckFREEPXDLIST 0x0004 +#define mlckFREEXAD 0x0002 +#define mlckFREEPXD 0x0001 + +#define pxdlock_t maplock_t + +typedef struct { + u16 next; /* 2: */ + + u8 maxcnt; /* 2: */ + u8 index; /* 2: */ + + u16 flag; /* 2: */ + u8 type; /* 1: */ + u8 count; /* 1: number of pxd/xad */ + /* (8) */ + + void *xdlist; /* 4: pxd/xad list */ + s32 rsrvd; /* 4: */ +} xdlistlock_t; /* (16): */ + + +/* + * commit + * + * parameter to the commit manager routines + */ +typedef struct commit { + tid_t tid; /* 4: tid = index of tblock */ + int flag; /* 4: flags */ + log_t *log; /* 4: log */ + struct super_block *sb; /* 4: superblock */ + + int nip; /* 4: number of entries in iplist */ + struct inode **iplist; /* 4: list of pointers to inodes */ + /* (32) */ + + /* log record descriptor on 64-bit boundary */ + lrd_t lrd; /* : log record descriptor */ +} commit_t; + +/* + * external declarations + */ +extern tlock_t *txLock(tid_t tid, struct inode *ip, struct metapage *mp, int flag); + +extern tlock_t *txMaplock(tid_t tid, struct inode *ip, int flag); + +extern int txCommit(tid_t tid, int nip, struct inode **iplist, int flag); + +extern tid_t txBegin(struct super_block *sb, int flag); + +extern void txBeginAnon(struct super_block *sb); + +extern void txEnd(tid_t tid); + +extern void txAbort(tid_t tid, int dirty); + +extern linelock_t *txLinelock(linelock_t * tlock); + +extern void txFreeMap(struct inode *ip, + maplock_t * maplock, tblock_t * tblk, int maptype); + +extern void txEA(tid_t tid, struct inode *ip, dxd_t * oldea, dxd_t * newea); + +extern void txFreelock(struct inode *ip); + +extern int lmLog(log_t * log, tblock_t * tblk, lrd_t * lrd, tlock_t * tlck); + +#endif /* _H_JFS_TXNMGR */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_types.h linuxppc64_2_4/fs/jfs/jfs_types.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_types.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_types.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,188 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef _H_JFS_TYPES +#define _H_JFS_TYPES + +/* + * jfs_types.h: + * + * basic type/utility definitions + * + * note: this header file must be the 1st include file + * of JFS include list in all JFS .c file. + */ + +#include +#include + +#include "endian24.h" +#include "jfs_compat.h" + +/* + * transaction and lock id's + */ +typedef uint tid_t; +typedef uint lid_t; + +/* + * Almost identical to Linux's timespec, but not quite + */ +struct timestruc_t { + u32 tv_sec; + u32 tv_nsec; +}; + +/* + * handy + */ + +#define LEFTMOSTONE 0x80000000 +#define HIGHORDER 0x80000000u /* high order bit on */ +#define ONES 0xffffffffu /* all bit on */ + +typedef int boolean_t; +#define TRUE 1 +#define FALSE 0 + +/* + * logical xd (lxd) + */ +typedef struct { + unsigned len:24; + unsigned off1:8; + u32 off2; +} lxd_t; + +/* lxd_t field construction */ +#define LXDlength(lxd, length32) ( (lxd)->len = length32 ) +#define LXDoffset(lxd, offset64)\ +{\ + (lxd)->off1 = ((s64)offset64) >> 32;\ + (lxd)->off2 = (offset64) & 0xffffffff;\ +} + +/* lxd_t field extraction */ +#define lengthLXD(lxd) ( (lxd)->len ) +#define offsetLXD(lxd)\ + ( ((s64)((lxd)->off1)) << 32 | (lxd)->off2 ) + +/* lxd list */ +typedef struct { + s16 maxnlxd; + s16 nlxd; + lxd_t *lxd; +} lxdlist_t; + +/* + * physical xd (pxd) + */ +typedef struct { + unsigned len:24; + unsigned addr1:8; + u32 addr2; +} pxd_t; + +/* xd_t field construction */ + +#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) +#define PXDaddress(pxd, address64)\ +{\ + (pxd)->addr1 = ((s64)address64) >> 32;\ + (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} + +/* xd_t field extraction */ +#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) +#define addressPXD(pxd)\ + ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) + +/* pxd list */ +typedef struct { + s16 maxnpxd; + s16 npxd; + pxd_t pxd[8]; +} pxdlist_t; + + +/* + * data extent descriptor (dxd) + */ +typedef struct { + unsigned flag:8; /* 1: flags */ + unsigned rsrvd:24; /* 3: */ + u32 size; /* 4: size in byte */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + u32 addr2; /* 4: address in unit of fsblksize */ +} dxd_t; /* - 16 - */ + +/* dxd_t flags */ +#define DXD_INDEX 0x80 /* B+-tree index */ +#define DXD_INLINE 0x40 /* in-line data extent */ +#define DXD_EXTENT 0x20 /* out-of-line single extent */ +#define DXD_FILE 0x10 /* out-of-line file (inode) */ +#define DXD_CORRUPT 0x08 /* Inconsistency detected */ + +/* dxd_t field construction + * Conveniently, the PXD macros work for DXD + */ +#define DXDlength PXDlength +#define DXDaddress PXDaddress +#define lengthDXD lengthPXD +#define addressDXD addressPXD + +/* + * directory entry argument + */ +typedef struct component_name { + int namlen; + wchar_t *name; +} component_t; + + +/* + * DASD limit information - stored in directory inode + */ +typedef struct dasd { + u8 thresh; /* Alert Threshold (in percent) */ + u8 delta; /* Alert Threshold delta (in percent) */ + u8 rsrvd1; + u8 limit_hi; /* DASD limit (in logical blocks) */ + u32 limit_lo; /* DASD limit (in logical blocks) */ + u8 rsrvd2[3]; + u8 used_hi; /* DASD usage (in logical blocks) */ + u32 used_lo; /* DASD usage (in logical blocks) */ +} dasd_t; + +#define DASDLIMIT(dasdp) \ + (((u64)((dasdp)->limit_hi) << 32) + __le32_to_cpu((dasdp)->limit_lo)) +#define setDASDLIMIT(dasdp, limit)\ +{\ + (dasdp)->limit_hi = ((u64)limit) >> 32;\ + (dasdp)->limit_lo = __cpu_to_le32(limit);\ +} +#define DASDUSED(dasdp) \ + (((u64)((dasdp)->used_hi) << 32) + __le32_to_cpu((dasdp)->used_lo)) +#define setDASDUSED(dasdp, used)\ +{\ + (dasdp)->used_hi = ((u64)used) >> 32;\ + (dasdp)->used_lo = __cpu_to_le32(used);\ +} + +#endif /* !_H_JFS_TYPES */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_umount.c linuxppc64_2_4/fs/jfs/jfs_umount.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_umount.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_umount.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,158 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* + * Change History : + */ + +/* + * jfs_umount.c + * + * note: file system in transition to aggregate/fileset: + * (ref. jfs_mount.c) + * + * file system unmount is interpreted as mount of the single/only + * fileset in the aggregate and, if unmount of the last fileset, + * as unmount of the aggerate; + */ + +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_umount(vfsp, flags, crp) + * + * FUNCTION: vfs_umount() + * + * PARAMETERS: vfsp - virtual file system pointer + * flags - unmount for shutdown + * crp - credential + * + * RETURN : EBUSY - device has open files + */ +int jfs_umount(struct super_block *sb) +{ + int rc = 0; + log_t *log; + struct jfs_sb_info *sbi = JFS_SBI(sb); + struct inode *ipbmap = sbi->ipbmap; + struct inode *ipimap = sbi->ipimap; + struct inode *ipaimap = sbi->ipaimap; + struct inode *ipaimap2 = sbi->ipaimap2; + + jFYI(1, ("\n UnMount JFS: sb:0x%p\n", sb)); + + /* + * update superblock and close log + * + * if mounted read-write and log based recovery was enabled + */ + if ((log = sbi->log)) { + /* + * close log: + * + * remove file system from log active file system list. + */ + log = sbi->log; + rc = lmLogClose(sb, log); + } + + /* + * close fileset inode allocation map (aka fileset inode) + */ + jEVENT(0, ("jfs_umount: close ipimap:0x%p\n", ipimap)); + diUnmount(ipimap, 0); + + diFreeSpecial(ipimap); + sbi->ipimap = NULL; + + /* + * close secondary aggregate inode allocation map + */ + ipaimap2 = sbi->ipaimap2; + if (ipaimap2) { + jEVENT(0, ("jfs_umount: close ipaimap2:0x%p\n", ipaimap2)); + diUnmount(ipaimap2, 0); + diFreeSpecial(ipaimap2); + sbi->ipaimap2 = NULL; + } + + /* + * close aggregate inode allocation map + */ + ipaimap = sbi->ipaimap; + jEVENT(0, ("jfs_umount: close ipaimap:0x%p\n", ipaimap)); + diUnmount(ipaimap, 0); + diFreeSpecial(ipaimap); + sbi->ipaimap = NULL; + + /* + * close aggregate block allocation map + */ + jEVENT(0, ("jfs_umount: close ipbmap:%p\n", ipbmap)); + dbUnmount(ipbmap, 0); + + diFreeSpecial(ipbmap); + sbi->ipimap = NULL; + + /* + * ensure all file system file pages are propagated to their + * home blocks on disk (and their in-memory buffer pages are + * invalidated) BEFORE updating file system superblock state + * (to signify file system is unmounted cleanly, and thus in + * consistent state) and log superblock active file system + * list (to signify skip logredo()). + */ + if (log) /* log = NULL if read-only mount */ + rc = updateSuper(sb, FM_CLEAN); + + + jFYI(0, (" UnMount JFS Complete: %d\n", rc)); + return rc; +} + + +int jfs_umount_rw(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (!sbi->log) + return 0; + + /* + * close log: + * + * remove file system from log active file system list. + */ + lmLogClose(sb, sbi->log); + + dbSync(sbi->ipbmap); + diSync(sbi->ipimap); + + sbi->log = 0; + + return updateSuper(sb, FM_CLEAN); + +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_unicode.c linuxppc64_2_4/fs/jfs/jfs_unicode.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_unicode.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_unicode.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,110 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include "jfs_types.h" +#include "jfs_filsys.h" +#include "jfs_unicode.h" +#include "jfs_debug.h" + +/* + * NAME: jfs_strfromUCS() + * + * FUNCTION: Convert little-endian unicode string to character string + * + */ +int jfs_strfromUCS_le(char *to, const wchar_t * from, /* LITTLE ENDIAN */ + int len, struct nls_table *codepage) +{ + int i; + int outlen = 0; + + for (i = 0; (i < len) && from[i]; i++) { + int charlen; + charlen = + codepage->uni2char(le16_to_cpu(from[i]), &to[outlen], + NLS_MAX_CHARSET_SIZE); + if (charlen > 0) { + outlen += charlen; + } else { + to[outlen++] = '?'; + } + } + to[outlen] = 0; + jEVENT(0, ("jfs_strfromUCS returning %d - '%s'\n", outlen, to)); + return outlen; +} + +/* + * NAME: jfs_strtoUCS() + * + * FUNCTION: Convert character string to unicode string + * + */ +int jfs_strtoUCS(wchar_t * to, + const char *from, int len, struct nls_table *codepage) +{ + int charlen; + int i; + + jEVENT(0, ("jfs_strtoUCS - '%s'\n", from)); + + for (i = 0; len && *from; i++, from += charlen, len -= charlen) { + charlen = codepage->char2uni(from, len, &to[i]); + if (charlen < 1) { + jERROR(1, ("jfs_strtoUCS: char2uni returned %d.\n", + charlen)); + jERROR(1, ("charset = %s, char = 0x%x\n", + codepage->charset, (unsigned char) *from)); + to[i] = 0x003f; /* a question mark */ + charlen = 1; + } + } + + jEVENT(0, (" returning %d\n", i)); + + to[i] = 0; + return i; +} + +/* + * NAME: get_UCSname() + * + * FUNCTION: Allocate and translate to unicode string + * + */ +int get_UCSname(component_t * uniName, struct dentry *dentry, + struct nls_table *nls_tab) +{ + int length = dentry->d_name.len; + + if (length > JFS_NAME_MAX) + return ENAMETOOLONG; + + uniName->name = + kmalloc((length + 1) * sizeof(wchar_t), GFP_NOFS); + + if (uniName->name == NULL) + return ENOSPC; + + uniName->namlen = jfs_strtoUCS(uniName->name, dentry->d_name.name, + length, nls_tab); + + return 0; +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_unicode.h linuxppc64_2_4/fs/jfs/jfs_unicode.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_unicode.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_unicode.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,143 @@ +/* + * unistrk: Unicode kernel case support + * + * Function: + * Convert a unicode character to upper or lower case using + * compressed tables. + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + */ + +#include +#include "jfs_types.h" + +typedef struct { + wchar_t start; + wchar_t end; + signed char *table; +} UNICASERANGE; + +extern signed char UniUpperTable[512]; +extern UNICASERANGE UniUpperRange[]; +extern int get_UCSname(component_t *, struct dentry *, struct nls_table *); +extern int jfs_strfromUCS_le(char *, const wchar_t *, int, struct nls_table *); + +#define free_UCSname(COMP) kfree((COMP)->name) + +/* + * UniStrcpy: Copy a string + */ +static inline wchar_t *UniStrcpy(wchar_t * ucs1, const wchar_t * ucs2) +{ + wchar_t *anchor = ucs1; /* save the start of result string */ + + while ((*ucs1++ = *ucs2++)); + return anchor; +} + + + +/* + * UniStrncpy: Copy length limited string with pad + */ +static inline wchar_t *UniStrncpy(wchar_t * ucs1, const wchar_t * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = *ucs2++; + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + +/* + * UniStrncmp_le: Compare length limited string - native to little-endian + */ +static inline int UniStrncmp_le(const wchar_t * ucs1, const wchar_t * ucs2, + size_t n) +{ + if (!n) + return 0; /* Null strings are equal */ + while ((*ucs1 == __le16_to_cpu(*ucs2)) && *ucs1 && --n) { + ucs1++; + ucs2++; + } + return (int) *ucs1 - (int) __le16_to_cpu(*ucs2); +} + +/* + * UniStrncpy_le: Copy length limited string with pad to little-endian + */ +static inline wchar_t *UniStrncpy_le(wchar_t * ucs1, const wchar_t * ucs2, + size_t n) +{ + wchar_t *anchor = ucs1; + + while (n-- && *ucs2) /* Copy the strings */ + *ucs1++ = __le16_to_cpu(*ucs2++); + + n++; + while (n--) /* Pad with nulls */ + *ucs1++ = 0; + return anchor; +} + + +/* + * UniToupper: Convert a unicode character to upper case + */ +static inline wchar_t UniToupper(register wchar_t uc) +{ + register UNICASERANGE *rp; + + if (uc < sizeof(UniUpperTable)) { /* Latin characters */ + return uc + UniUpperTable[uc]; /* Use base tables */ + } else { + rp = UniUpperRange; /* Use range tables */ + while (rp->start) { + if (uc < rp->start) /* Before start of range */ + return uc; /* Uppercase = input */ + if (uc <= rp->end) /* In range */ + return uc + rp->table[uc - rp->start]; + rp++; /* Try next range */ + } + } + return uc; /* Past last range */ +} + + +/* + * UniStrupr: Upper case a unicode string + */ +static inline wchar_t *UniStrupr(register wchar_t * upin) +{ + register wchar_t *up; + + up = upin; + while (*up) { /* For all characters */ + *up = UniToupper(*up); + up++; + } + return upin; /* Return input pointer */ +} + diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_uniupr.c linuxppc64_2_4/fs/jfs/jfs_uniupr.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_uniupr.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_uniupr.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,137 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * jfs_uniupr.c - Unicode compressed case ranges + * +*/ + +#include +#include "jfs_unicode.h" + +/* + * Latin upper case + */ +signed char UniUpperTable[512] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 030-03f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 040-04f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 050-05f */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 060-06f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, 0, 0, 0, 0, 0, /* 070-07f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 080-08f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 090-09f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0a0-0af */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0b0-0bf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0c0-0cf */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0d0-0df */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 0e0-0ef */ + -32,-32,-32,-32,-32,-32,-32, 0,-32,-32,-32,-32,-32,-32,-32,121, /* 0f0-0ff */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 100-10f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 110-11f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 120-12f */ + 0, 0, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 130-13f */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, /* 140-14f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 150-15f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 160-16f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, -1, 0, -1, 0, /* 170-17f */ + 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, /* 180-18f */ + 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, /* 190-19f */ + 0, -1, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, 0, -1, 0, 0, /* 1a0-1af */ + -1, 0, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, /* 1b0-1bf */ + 0, 0, 0, 0, 0, -1, -2, 0, -1, -2, 0, -1, -2, 0, -1, 0, /* 1c0-1cf */ + -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1,-79, 0, -1, /* 1d0-1df */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e0-1ef */ + 0, 0, -1, -2, 0, -1, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, /* 1f0-1ff */ +}; + +/* Upper case range - Greek */ +static signed char UniCaseRangeU03a0[47] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-38,-37,-37,-37, /* 3a0-3af */ + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 3b0-3bf */ + -32,-32,-31,-32,-32,-32,-32,-32,-32,-32,-32,-32,-64,-63,-63, +}; + +/* Upper case range - Cyrillic */ +static signed char UniCaseRangeU0430[48] = { + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 430-43f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* 440-44f */ + 0,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80,-80, 0,-80,-80, /* 450-45f */ +}; + +/* Upper case range - Extended cyrillic */ +static signed char UniCaseRangeU0490[61] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 490-49f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4a0-4af */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 4b0-4bf */ + 0, 0, -1, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, +}; + +/* Upper case range - Extended latin and greek */ +static signed char UniCaseRangeU1e00[509] = { + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e00-1e0f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e10-1e1f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e20-1e2f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e30-1e3f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e40-1e4f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e50-1e5f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e60-1e6f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e70-1e7f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1e80-1e8f */ + 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0,-59, 0, -1, 0, -1, /* 1e90-1e9f */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ea0-1eaf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1eb0-1ebf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ec0-1ecf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ed0-1edf */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, /* 1ee0-1eef */ + 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, /* 1ef0-1eff */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f00-1f0f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f10-1f1f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f20-1f2f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f30-1f3f */ + 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f40-1f4f */ + 0, 8, 0, 8, 0, 8, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f50-1f5f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f60-1f6f */ + 74, 74, 86, 86, 86, 86,100,100, 0, 0,112,112,126,126, 0, 0, /* 1f70-1f7f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f80-1f8f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1f90-1f9f */ + 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fa0-1faf */ + 8, 8, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fb0-1fbf */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fc0-1fcf */ + 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fd0-1fdf */ + 8, 8, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 1fe0-1fef */ + 0, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/* Upper case range - Wide latin */ +static signed char UniCaseRangeUff40[27] = { + 0,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, /* ff40-ff4f */ + -32,-32,-32,-32,-32,-32,-32,-32,-32,-32,-32, +}; + +/* + * Upper Case Range + */ +UNICASERANGE UniUpperRange[] = { + { 0x03a0, 0x03ce, UniCaseRangeU03a0 }, + { 0x0430, 0x045f, UniCaseRangeU0430 }, + { 0x0490, 0x04cc, UniCaseRangeU0490 }, + { 0x1e00, 0x1ffc, UniCaseRangeU1e00 }, + { 0xff40, 0xff5a, UniCaseRangeUff40 }, + { 0, 0, 0 } +}; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_xtree.c linuxppc64_2_4/fs/jfs/jfs_xtree.c --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_xtree.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_xtree.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,4444 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * jfs_xtree.c: extent allocation descriptor B+-tree manager + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_dmap.h" +#include "jfs_dinode.h" +#include "jfs_superblock.h" +#include "jfs_debug.h" + +/* + * xtree local flag + */ +#define XT_INSERT 0x00000001 + +/* + * xtree key/entry comparison: extent offset + * + * return: + * -1: k < start of extent + * 0: start_of_extent <= k <= end_of_extent + * 1: k > end_of_extent + */ +#define XT_CMP(CMP, K, X, OFFSET64)\ +{\ + OFFSET64 = offsetXAD(X);\ + (CMP) = ((K) >= OFFSET64 + lengthXAD(X)) ? 1 :\ + ((K) < OFFSET64) ? -1 : 0;\ +} + +/* write a xad entry */ +#define XT_PUTENTRY(XAD, FLAG, OFF, LEN, ADDR)\ +{\ + (XAD)->flag = (FLAG);\ + XADoffset((XAD), (OFF));\ + XADlength((XAD), (LEN));\ + XADaddress((XAD), (ADDR));\ +} + +#define XT_PAGE(IP, MP) BT_PAGE(IP, MP, xtpage_t, i_xtroot) + +/* get page buffer for specified block address */ +#define XT_GETPAGE(IP, BN, MP, SIZE, P, RC)\ +{\ + BT_GETPAGE(IP, BN, MP, xtpage_t, SIZE, P, RC, i_xtroot)\ + if (!(RC))\ + {\ + if ((le16_to_cpu((P)->header.nextindex) < XTENTRYSTART) ||\ + (le16_to_cpu((P)->header.nextindex) > le16_to_cpu((P)->header.maxentry)) ||\ + (le16_to_cpu((P)->header.maxentry) > (((BN)==0)?XTROOTMAXSLOT:PSIZE>>L2XTSLOTSIZE)))\ + {\ + jERROR(1,("XT_GETPAGE: xtree page corrupt\n"));\ + BT_PUTPAGE(MP);\ + updateSuper((IP)->i_sb, FM_DIRTY);\ + MP = NULL;\ + RC = EIO;\ + }\ + }\ +} + +/* for consistency */ +#define XT_PUTPAGE(MP) BT_PUTPAGE(MP) + +#define XT_GETSEARCH(IP, LEAF, BN, MP, P, INDEX) \ + BT_GETSEARCH(IP, LEAF, BN, MP, xtpage_t, P, INDEX, i_xtroot) +/* xtree entry parameter descriptor */ +typedef struct { + metapage_t *mp; + s16 index; + u8 flag; + s64 off; + s64 addr; + int len; + pxdlist_t *pxdlist; +} xtsplit_t; + + +/* + * statistics + */ +#ifdef CONFIG_JFS_STATISTICS +static struct { + uint search; + uint fastSearch; + uint split; +} xtStat; +#endif + + +/* + * forward references + */ +static int xtSearch(struct inode *ip, + s64 xoff, int *cmpp, btstack_t * btstack, int flag); + +static int xtSplitUp(tid_t tid, + struct inode *ip, + xtsplit_t * split, btstack_t * btstack); + +static int xtSplitPage(tid_t tid, + struct inode *ip, + xtsplit_t * split, metapage_t ** rmpp, s64 * rbnp); + +static int xtSplitRoot(tid_t tid, + struct inode *ip, + xtsplit_t * split, metapage_t ** rmpp); + +#ifdef _STILL_TO_PORT +static int xtDeleteUp(tid_t tid, + struct inode *ip, + metapage_t * fmp, + xtpage_t * fp, btstack_t * btstack); + +static int xtSearchNode(struct inode *ip, + xad_t * xad, + int *cmpp, btstack_t * btstack, int flag); + +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * fp); +#endif /* _STILL_TO_PORT */ + +/* External references */ + +/* + * debug control + */ +/* #define _JFS_DEBUG_XTREE 1 */ + + +/* + * xtLookup() + * + * function: map a single page into a physical extent; + */ +int xtLookup(struct inode *ip, s64 lstart, + s64 llen, int *pflag, s64 * paddr, s32 * plen, int no_check) +{ + int rc = 0; + btstack_t btstack; + int cmp; + s64 bn; + metapage_t *mp; + xtpage_t *p; + int index; + xad_t *xad; + s64 size, xoff, xend; + int xlen; + s64 xaddr; + + *plen = 0; + + if (!no_check) { + /* is lookup offset beyond eof ? */ + size = ((u64) ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + if (lstart >= size) { + jERROR(1, + ("xtLookup: lstart (0x%lx) >= size (0x%lx)\n", + (ulong) lstart, (ulong) size)); + return 0; + } + } + + /* + * search for the xad entry covering the logical extent + */ +//search: + if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) { + jERROR(1, ("xtLookup: xtSearch returned %d\n", rc)); + return rc; + } + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad found covering start of logical extent ? + * lstart is a page start address, + * i.e., lstart cannot start in a hole; + */ + if (cmp) { + jFYI(1, ("xtLookup: cmp = %d\n", cmp)); + goto out; + } + + /* + * lxd covered by xad + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xoff + xlen; + xaddr = addressXAD(xad); + + jEVENT(0, + ("index = %d, xoff = 0x%lx, xlen = 0x%x, xaddr = 0x%lx\n", + index, (ulong) xoff, xlen, (ulong) xaddr)); + + /* initialize new pxd */ + *pflag = xad->flag; + *paddr = xaddr + (lstart - xoff); + /* a page must be fully covered by an xad */ + *plen = min(xend - lstart, llen); + + out: + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtLookupList() + * + * function: map a single logical extent into a list of physical extent; + * + * parameter: + * struct inode *ip, + * lxdlist_t *lxdlist, lxd list (in) + * xadlist_t *xadlist, xad list (in/out) + * int flag) + * + * coverage of lxd by xad under assumption of + * . lxd's are ordered and disjoint. + * . xad's are ordered and disjoint. + * + * return: + * 0: success + * + * note: a page being written (even a single byte) is backed fully, + * except the last page which is only backed with blocks + * required to cover the last byte; + * the extent backing a page is fully contained within an xad; + */ +int xtLookupList(struct inode *ip, lxdlist_t * lxdlist, /* lxd list (in) */ + xadlist_t * xadlist, /* xad list (in/out) */ + int flag) +{ + int rc = 0; + btstack_t btstack; + int cmp; + s64 bn; + metapage_t *mp; + xtpage_t *p; + int index; + lxd_t *lxd; + xad_t *xad, *pxd; + s64 size, lstart, lend, xstart, xend, pstart; + s64 llen, xlen, plen; + s64 xaddr, paddr; + int nlxd, npxd, maxnpxd; + + npxd = xadlist->nxad = 0; + maxnpxd = xadlist->maxnxad; + pxd = xadlist->xad; + + nlxd = lxdlist->nlxd; + lxd = lxdlist->lxd; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + + size = (ip->i_size + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* + * search for the xad entry covering the logical extent + */ + search: + if (lstart >= size) + return 0; + + if ((rc = xtSearch(ip, lstart, &cmp, &btstack, 0))) + return rc; + + /* + * compute the physical extent covering logical extent + * + * N.B. search may have failed (e.g., hole in sparse file), + * and returned the index of the next entry. + */ +//map: + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* is xad on the next sibling page ? */ + if (index == le16_to_cpu(p->header.nextindex)) { + if (p->header.flag & BT_ROOT) + goto mapend; + + if ((bn = le64_to_cpu(p->header.next)) == 0) + goto mapend; + + XT_PUTPAGE(mp); + + /* get next sibling page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = XTENTRYSTART; + } + + xad = &p->xad[index]; + + /* + * is lxd covered by xad ? + */ + compare: + xstart = offsetXAD(xad); + xlen = lengthXAD(xad); + xend = xstart + xlen; + xaddr = addressXAD(xad); + + compare1: + if (xstart < lstart) + goto compare2; + + /* (lstart <= xstart) */ + + /* lxd is NOT covered by xad */ + if (lend <= xstart) { + /* + * get next lxd + */ + if (--nlxd == 0) + goto mapend; + lxd++; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + if (lstart >= size) + goto mapend; + + /* compare with the current xad */ + goto compare1; + } + /* lxd is covered by xad */ + else { /* (xstart < lend) */ + + /* initialize new pxd */ + pstart = xstart; + plen = min(lend - xstart, xlen); + paddr = xaddr; + + goto cover; + } + + /* (xstart < lstart) */ + compare2: + /* lxd is covered by xad */ + if (lstart < xend) { + /* initialize new pxd */ + pstart = lstart; + plen = min(xend - lstart, llen); + paddr = xaddr + (lstart - xstart); + + goto cover; + } + /* lxd is NOT covered by xad */ + else { /* (xend <= lstart) */ + + /* + * get next xad + * + * linear search next xad covering lxd on + * the current xad page, and then tree search + */ + if (index == le16_to_cpu(p->header.nextindex) - 1) { + if (p->header.flag & BT_ROOT) + goto mapend; + + XT_PUTPAGE(mp); + goto search; + } else { + index++; + xad++; + + /* compare with new xad */ + goto compare; + } + } + + /* + * lxd is covered by xad and a new pxd has been initialized + * (lstart <= xstart < lend) or (xstart < lstart < xend) + */ + cover: + /* finalize pxd corresponding to current xad */ + XT_PUTENTRY(pxd, xad->flag, pstart, plen, paddr); + + if (++npxd >= maxnpxd) + goto mapend; + pxd++; + + /* + * lxd is fully covered by xad + */ + if (lend <= xend) { + /* + * get next lxd + */ + if (--nlxd == 0) + goto mapend; + lxd++; + + lstart = offsetLXD(lxd); + llen = lengthLXD(lxd); + lend = lstart + llen; + if (lstart >= size) + goto mapend; + + /* + * test for old xad covering new lxd + * (old xstart < new lstart) + */ + goto compare2; + } + /* + * lxd is partially covered by xad + */ + else { /* (xend < lend) */ + + /* + * get next xad + * + * linear search next xad covering lxd on + * the current xad page, and then next xad page search + */ + if (index == le16_to_cpu(p->header.nextindex) - 1) { + if (p->header.flag & BT_ROOT) + goto mapend; + + if ((bn = le64_to_cpu(p->header.next)) == 0) + goto mapend; + + XT_PUTPAGE(mp); + + /* get next sibling page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = XTENTRYSTART; + xad = &p->xad[index]; + } else { + index++; + xad++; + } + + /* + * test for new xad covering old lxd + * (old lstart < new xstart) + */ + goto compare; + } + + mapend: + xadlist->nxad = npxd; + +//out: + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSearch() + * + * function: search for the xad entry covering specified offset. + * + * parameters: + * ip - file object; + * xoff - extent offset; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag (XT_INSERT); + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearch(struct inode *ip, s64 xoff, /* offset of extent */ + int *cmpp, btstack_t * btstack, int flag) +{ + struct jfs_inode_info *jfs_ip = JFS_IP(ip); + int rc = 0; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + metapage_t *mp; /* page buffer */ + xtpage_t *p; /* page */ + xad_t *xad; + int base, index, lim, btindex; + btframe_t *btsp; + int nsplit = 0; /* number of pages to split */ + s64 t64; + + INCREMENT(xtStat.search); + + BT_CLR(btstack); + + btstack->nsplit = 0; + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* try sequential access heuristics with the previous + * access entry in target leaf page: + * once search narrowed down into the target leaf, + * key must either match an entry in the leaf or + * key entry does not exist in the tree; + */ +//fastSearch: + if ((jfs_ip->btorder & BT_SEQUENTIAL) && + (p->header.flag & BT_LEAF) && + (index = jfs_ip->btindex) < + le16_to_cpu(p->header.nextindex)) { + xad = &p->xad[index]; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* stop sequential access heuristics */ + goto binarySearch; + } else { /* (t64 + lengthXAD(xad)) <= xoff */ + + /* try next sequential entry */ + index++; + if (index < + le16_to_cpu(p->header.nextindex)) { + xad++; + t64 = offsetXAD(xad); + if (xoff < t64 + lengthXAD(xad)) { + if (xoff >= t64) { + *cmpp = 0; + goto out; + } + + /* miss: key falls between + * previous and this entry + */ + *cmpp = 1; + goto out; + } + + /* (xoff >= t64 + lengthXAD(xad)); + * matching entry may be further out: + * stop heuristic search + */ + /* stop sequential access heuristics */ + goto binarySearch; + } + + /* (index == p->header.nextindex); + * miss: key entry does not exist in + * the target leaf/tree + */ + *cmpp = 1; + goto out; + } + + /* + * if hit, return index of the entry found, and + * if miss, where new entry with search key is + * to be inserted; + */ + out: + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == /* little-endian */ + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* update sequential access heuristics */ + jfs_ip->btindex = index; + + INCREMENT(xtStat.fastSearch); + return 0; + } + + /* well, ... full search now */ + binarySearch: + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + */ + /* search hit - leaf page: + * return the entry found + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (index == btindex || + index == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = index; + + return 0; + } + + /* search hit - internal page: + * descend/search its child page + */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + */ + /* + * search miss - leaf page: + * + * return location of entry (base) where new entry with + * search key K is to be inserted. + */ + if (p->header.flag & BT_LEAF) { + *cmpp = cmp; + + /* compute number of pages to split */ + if (flag & XT_INSERT) { + if (p->header.nextindex == + p->header.maxentry) + nsplit++; + else + nsplit = 0; + btstack->nsplit = nsplit; + } + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = base; + btsp->mp = mp; + + /* init sequential access heuristics */ + btindex = jfs_ip->btindex; + if (base == btindex || base == btindex + 1) + jfs_ip->btorder = BT_SEQUENTIAL; + else + jfs_ip->btorder = BT_RANDOM; + jfs_ip->btindex = base; + + return 0; + } + + /* + * search miss - non-leaf page: + * + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* update number of pages to split */ + if (p->header.nextindex == p->header.maxentry) + nsplit++; + else + nsplit = 0; + + /* push (bn, index) of the parent page/entry */ + BT_PUSH(btstack, bn, index); + + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + +/* + * xtInsert() + * + * function: + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag (XAD_NOTRECORDED): + * xoff - extent offset; + * xlen - extent length; + * xaddrp - extent address pointer (in/out): + * if (*xaddrp) + * caller allocated data extent at *xaddrp; + * else + * allocate data extent and return its xaddr; + * flag - + * + * return: + */ +int xtInsert(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 xlen, s64 * xaddrp, + int flag) +{ + int rc = 0; + s64 xaddr, hint; + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex; + btstack_t btstack; /* traverse stack */ + xtsplit_t split; /* split information */ + xad_t *xad; + int cmp; + tlock_t *tlck; + xtlock_t *xtlck; + + jFYI(1, + ("xtInsert: nxoff:0x%lx nxlen:0x%x\n", (ulong) xoff, xlen)); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* This test must follow XT_GETSEARCH since mp must be valid if + * we branch to out: */ + if (cmp == 0) { + rc = EEXIST; + goto out; + } + + /* + * allocate data extent requested + * + * allocation hint: last xad + */ + if ((xaddr = *xaddrp) == 0) { + if (index > XTENTRYSTART) { + xad = &p->xad[index - 1]; + hint = addressXAD(xad) + lengthXAD(xad) - 1; + } else + hint = 0; + if ((rc = dbAlloc(ip, hint, (s64) xlen, &xaddr))) + goto out; + } + + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex == le16_to_cpu(p->header.maxentry)) { + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + if (*xaddrp == 0) + dbFree(ip, xaddr, (s64) xlen); + return rc; + } + + *xaddrp = xaddr; + return 0; + } + + /* + * insert the new entry into the leaf page + */ + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + + /* if insert into middle, shift right remaining entries. */ + if (index < nextindex) + memmove(&p->xad[index + 1], &p->xad[index], + (nextindex - index) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + *xaddrp = xaddr; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtSplitUp() + * + * function: + * split full pages as propagating insertion up the tree + * + * parameter: + * tid - transaction id; + * ip - file object; + * split - entry parameter descriptor; + * btstack - traverse stack from xtSearch() + * + * return: + */ +static int +xtSplitUp(tid_t tid, + struct inode *ip, xtsplit_t * split, btstack_t * btstack) +{ + int rc = 0; + metapage_t *smp; + xtpage_t *sp; /* split page */ + metapage_t *rmp; + s64 rbn; /* new right page block number */ + metapage_t *rcmp; + xtpage_t *rcp; /* right child page */ + s64 rcbn; /* right child page block number */ + int skip; /* index of entry of insertion */ + int nextindex; /* next available entry index of p */ + btframe_t *parent; /* parent page entry on traverse stack */ + xad_t *xad; + s64 xaddr; + int xlen; + int nsplit; /* number of pages split */ + pxdlist_t pxdlist; + pxd_t *pxd; + tlock_t *tlck; + xtlock_t *xtlck; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + /* is inode xtree root extension/inline EA area free ? */ + if ((sp->header.flag & BT_ROOT) && (!S_ISDIR(ip->i_mode)) && + (sp->header.maxentry < cpu_to_le16(XTROOTMAXSLOT)) && + (JFS_IP(ip)->mode2 & INLINEEA)) { + sp->header.maxentry = cpu_to_le16(XTROOTMAXSLOT); + JFS_IP(ip)->mode2 &= ~INLINEEA; + + BT_MARK_DIRTY(smp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + + /* if insert into middle, shift right remaining entries. */ + skip = split->index; + nextindex = le16_to_cpu(sp->header.nextindex); + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + /* insert the new entry: mark the entry NEW */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* advance next available entry index */ + sp->header.nextindex = + cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + return 0; + } + + /* + * allocate new index blocks to cover index page split(s) + * + * allocation hint: ? + */ + if (split->pxdlist == NULL) { + nsplit = btstack->nsplit; + split->pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + xlen = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++) { + if ((rc = dbAlloc(ip, (s64) 0, (s64) xlen, &xaddr)) + == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, xlen); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + XT_PUTPAGE(smp); + return rc; + } + } + + /* + * Split leaf page into and a new right page . + * + * The split routines insert the new entry into the leaf page, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) + return EIO; + + XT_PUTPAGE(smp); + + /* + * propagate up the router entry for the leaf page just split + * + * insert a router entry for the new page into the parent page, + * propagate the insert/split up the tree by walking back the stack + * of (bn of parent page, index of child page entry in parent page) + * that were traversed during the search for the page that split. + * + * the propagation of insert/split up the tree stops if the root + * splits or the page inserted into doesn't have to split to hold + * the new entry. + * + * the parent entry for the split page remains the same, and + * a new entry is inserted at its right with the first key and + * block number of the new right page. + * + * There are a maximum of 3 pages pinned at any time: + * right child, left parent and right parent (when the parent splits) + * to keep the child page pinned while working on the parent. + * make sure that all pins are released at exit. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* parent page specified by stack frame */ + + /* keep current child pages pinned */ + rcmp = rmp; + rcbn = rbn; + rcp = XT_PAGE(ip, rcmp); + + /* + * insert router entry in parent for new right child page + */ + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, smp, PSIZE, sp, rc); + if (rc) + goto errout2; + + /* + * The new key entry goes ONE AFTER the index of parent entry, + * because the split was to the right. + */ + skip = parent->index + 1; + + /* + * split or shift right remaining entries of the parent page + */ + nextindex = le16_to_cpu(sp->header.nextindex); + /* + * parent page is full - split the parent page + */ + if (nextindex == le16_to_cpu(sp->header.maxentry)) { + /* init for parent page split */ + split->mp = smp; + split->index = skip; /* index at insert */ + split->flag = XAD_NEW; + split->off = offsetXAD(&rcp->xad[XTENTRYSTART]); + split->len = JFS_SBI(ip->i_sb)->nbperpage; + split->addr = rcbn; + + /* unpin previous right child page */ + XT_PUTPAGE(rcmp); + + /* The split routines insert the new entry, + * and acquire txLock as appropriate. + * return pinned and its block number . + */ + rc = (sp->header.flag & BT_ROOT) ? + xtSplitRoot(tid, ip, split, &rmp) : + xtSplitPage(tid, ip, split, &rmp, &rbn); + if (rc) + goto errout1; + + XT_PUTPAGE(smp); + /* keep new child page pinned */ + } + /* + * parent page is not full - insert in parent page + */ + else { + /* + * insert router entry in parent for the right child + * page from the first entry of the right child page: + */ + /* + * acquire a transaction lock on the parent page; + * + * action: router xad insertion; + */ + BT_MARK_DIRTY(smp, ip); + + /* + * if insert into middle, shift right remaining entries + */ + if (skip < nextindex) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (nextindex - + skip) << L2XTSLOTSIZE); + + /* insert the router entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, XAD_NEW, + offsetXAD(&rcp->xad[XTENTRYSTART]), + JFS_SBI(ip->i_sb)->nbperpage, rcbn); + + /* advance next available entry index. */ + sp->header.nextindex = + cpu_to_le16(le16_to_cpu(sp->header.nextindex) + + 1); + + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, smp, + tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(skip, (int)xtlck->lwm.offset) : skip; + xtlck->lwm.length = + le16_to_cpu(sp->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin parent page */ + XT_PUTPAGE(smp); + + /* exit propagate up */ + break; + } + } + + /* unpin current right page */ + XT_PUTPAGE(rmp); + + return 0; + + /* + * If something fails in the above loop we were already walking back + * up the tree and the tree is now inconsistent. + * release all pages we're holding. + */ + errout1: + XT_PUTPAGE(smp); + + errout2: + XT_PUTPAGE(rcmp); + + return rc; +} + + +/* + * xtSplitPage() + * + * function: + * split a full non-root page into + * original/split/left page and new right page + * i.e., the original/split page remains as left page. + * + * parameter: + * int tid, + * struct inode *ip, + * xtsplit_t *split, + * metapage_t **rmpp, + * u64 *rbnp, + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitPage(tid_t tid, struct inode *ip, + xtsplit_t * split, metapage_t ** rmpp, s64 * rbnp) +{ + int rc = 0; + metapage_t *smp; + xtpage_t *sp; + metapage_t *rmp; + xtpage_t *rp; /* new right page allocated */ + s64 rbn; /* new right page block number */ + metapage_t *mp; + xtpage_t *p; + s64 nextbn; + int skip, maxentry, middle, righthalf, n; + xad_t *xad; + pxdlist_t *pxdlist; + pxd_t *pxd; + tlock_t *tlck; + xtlock_t *sxtlck = 0, *rxtlck = 0; + + smp = split->mp; + sp = XT_PAGE(ip, smp); + + INCREMENT(xtStat.split); + + /* + * allocate the new right page for the split + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return EIO; + + jEVENT(0, + ("xtSplitPage: ip:0x%p smp:0x%p rmp:0x%p\n", ip, smp, rmp)); + + BT_MARK_DIRTY(rmp, ip); + /* + * action: new page; + */ + + rp = (xtpage_t *) rmp->data; + rp->header.self = *pxd; + rp->header.flag = sp->header.flag & BT_TYPE; + rp->header.maxentry = sp->header.maxentry; /* little-endian */ + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + BT_MARK_DIRTY(smp, ip); + /* Don't log it if there are no links to the file */ + if (!test_cflag(COMMIT_Nolink, ip)) { + /* + * acquire a transaction lock on the new right page; + */ + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + rxtlck = (xtlock_t *) & tlck->lock; + rxtlck->lwm.offset = XTENTRYSTART; + /* + * acquire a transaction lock on the split page + */ + tlck = txLock(tid, ip, smp, tlckXTREE | tlckGROW); + sxtlck = (xtlock_t *) & tlck->lock; + } + + /* + * initialize/update sibling pointers of and + */ + nextbn = le64_to_cpu(sp->header.next); + rp->header.next = cpu_to_le64(nextbn); + rp->header.prev = cpu_to_le64(addressPXD(&sp->header.self)); + sp->header.next = cpu_to_le64(rbn); + + skip = split->index; + + /* + * sequential append at tail (after last entry of last page) + * + * if splitting the last page on a level because of appending + * a entry to it (skip is maxentry), it's likely that the access is + * sequential. adding an empty page on the side of the level is less + * work and can push the fill factor much higher than normal. + * if we're wrong it's no big deal - we will do the split the right + * way next time. + * (it may look like it's equally easy to do a similar hack for + * reverse sorted data, that is, split the tree left, but it's not. + * Be my guest.) + */ + if (nextbn == 0 && skip == le16_to_cpu(sp->header.maxentry)) { + /* + * acquire a transaction lock on the new/right page; + * + * action: xad insertion; + */ + /* insert entry at the first entry of the new right page */ + xad = &rp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = 1; + } + + *rmpp = rmp; + *rbnp = rbn; + + ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd)); + + jEVENT(0, ("xtSplitPage: sp:0x%p rp:0x%p\n", sp, rp)); + return 0; + } + + /* + * non-sequential insert (at possibly middle page) + */ + + /* + * update previous pointer of old next/right page of + */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(rmp); + return rc; + } + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the next page; + * + * action:sibling pointer update; + */ + if (!test_cflag(COMMIT_Nolink, ip)) + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + p->header.prev = cpu_to_le64(rbn); + + /* sibling page may have been updated previously, or + * it may be updated later; + */ + + XT_PUTPAGE(mp); + } + + /* + * split the data between the split and new/right pages + */ + maxentry = le16_to_cpu(sp->header.maxentry); + middle = maxentry >> 1; + righthalf = maxentry - middle; + + /* + * skip index in old split/left page - insert into left page: + */ + if (skip <= middle) { + /* move right half of split page to the new right page */ + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + righthalf << L2XTSLOTSIZE); + + /* shift right tail of left half to make room for new entry */ + if (skip < middle) + memmove(&sp->xad[skip + 1], &sp->xad[skip], + (middle - skip) << L2XTSLOTSIZE); + + /* insert new entry */ + xad = &sp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle + 1); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(skip, (int)sxtlck->lwm.offset) : skip; + } + + rp->header.nextindex = + cpu_to_le16(XTENTRYSTART + righthalf); + } + /* + * skip index in new right page - insert into right page: + */ + else { + /* move left head of right half to right page */ + n = skip - middle; + memmove(&rp->xad[XTENTRYSTART], &sp->xad[middle], + n << L2XTSLOTSIZE); + + /* insert new entry */ + n += XTENTRYSTART; + xad = &rp->xad[n]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, + split->addr); + + /* move right tail of right half to right page */ + if (skip < maxentry) + memmove(&rp->xad[n + 1], &sp->xad[skip], + (maxentry - skip) << L2XTSLOTSIZE); + + /* update page header */ + sp->header.nextindex = cpu_to_le16(middle); + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.offset = (sxtlck->lwm.offset) ? + min(middle, (int)sxtlck->lwm.offset) : middle; + } + + rp->header.nextindex = cpu_to_le16(XTENTRYSTART + + righthalf + 1); + } + + if (!test_cflag(COMMIT_Nolink, ip)) { + sxtlck->lwm.length = le16_to_cpu(sp->header.nextindex) - + sxtlck->lwm.offset; + + /* rxtlck->lwm.offset = XTENTRYSTART; */ + rxtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + *rmpp = rmp; + *rbnp = rbn; + + ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd)); + + jEVENT(0, ("xtSplitPage: sp:0x%p rp:0x%p\n", sp, rp)); + return rc; +} + + +/* + * xtSplitRoot() + * + * function: + * split the full root page into + * original/root/split page and new right page + * i.e., root remains fixed in tree anchor (inode) and + * the root is copied to a single new right child page + * since root page << non-root page, and + * the split root page contains a single entry for the + * new right child page. + * + * parameter: + * int tid, + * struct inode *ip, + * xtsplit_t *split, + * metapage_t **rmpp) + * + * return: + * Pointer to page in which to insert or NULL on error. + */ +static int +xtSplitRoot(tid_t tid, + struct inode *ip, xtsplit_t * split, metapage_t ** rmpp) +{ + xtpage_t *sp; + metapage_t *rmp; + xtpage_t *rp; + s64 rbn; + int skip, nextindex; + xad_t *xad; + pxd_t *pxd; + pxdlist_t *pxdlist; + tlock_t *tlck; + xtlock_t *xtlck; + + sp = &JFS_IP(ip)->i_xtroot; + + INCREMENT(xtStat.split); + + /* + * allocate a single (right) child page + */ + pxdlist = split->pxdlist; + pxd = &pxdlist->pxd[pxdlist->npxd]; + pxdlist->npxd++; + rbn = addressPXD(pxd); + rmp = get_metapage(ip, rbn, PSIZE, 1); + if (rmp == NULL) + return EIO; + + jEVENT(0, ("xtSplitRoot: ip:0x%p rmp:0x%p\n", ip, rmp)); + + /* + * acquire a transaction lock on the new right page; + * + * action: new page; + */ + BT_MARK_DIRTY(rmp, ip); + + rp = (xtpage_t *) rmp->data; + rp->header.flag = + (sp->header.flag & BT_LEAF) ? BT_LEAF : BT_INTERNAL; + rp->header.self = *pxd; + rp->header.nextindex = cpu_to_le16(XTENTRYSTART); + rp->header.maxentry = cpu_to_le16(PSIZE >> L2XTSLOTSIZE); + + /* initialize sibling pointers */ + rp->header.next = 0; + rp->header.prev = 0; + + /* + * copy the in-line root page into new right page extent + */ + nextindex = le16_to_cpu(sp->header.maxentry); + memmove(&rp->xad[XTENTRYSTART], &sp->xad[XTENTRYSTART], + (nextindex - XTENTRYSTART) << L2XTSLOTSIZE); + + /* + * insert the new entry into the new right/child page + * (skip index in the new right page will not change) + */ + skip = split->index; + /* if insert into middle, shift right remaining entries */ + if (skip != nextindex) + memmove(&rp->xad[skip + 1], &rp->xad[skip], + (nextindex - skip) * sizeof(xad_t)); + + xad = &rp->xad[skip]; + XT_PUTENTRY(xad, split->flag, split->off, split->len, split->addr); + + /* update page header */ + rp->header.nextindex = cpu_to_le16(nextindex + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, rmp, tlckXTREE | tlckNEW); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = le16_to_cpu(rp->header.nextindex) - + XTENTRYSTART; + } + + /* + * reset the root + * + * init root with the single entry for the new right page + * set the 1st entry offset to 0, which force the left-most key + * at any level of the tree to be less than any search key. + */ + /* + * acquire a transaction lock on the root page (in-memory inode); + * + * action: root split; + */ + BT_MARK_DIRTY(split->mp, ip); + + xad = &sp->xad[XTENTRYSTART]; + XT_PUTENTRY(xad, XAD_NEW, 0, JFS_SBI(ip->i_sb)->nbperpage, rbn); + + /* update page header of root */ + sp->header.flag &= ~BT_LEAF; + sp->header.flag |= BT_INTERNAL; + + sp->header.nextindex = cpu_to_le16(XTENTRYSTART + 1); + + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, split->mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = XTENTRYSTART; + xtlck->lwm.length = 1; + } + + *rmpp = rmp; + + ip->i_blocks += LBLK2PBLK(ip->i_sb, lengthPXD(pxd)); + + jEVENT(0, ("xtSplitRoot: sp:0x%p rp:0x%p\n", sp, rp)); + return 0; +} + + +/* + * xtExtend() + * + * function: extend in-place; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: alloc whole extended extent; + */ +int xtExtend(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* delta extent offset */ + s32 xlen, /* delta extent length */ + int flag) +{ + int rc = 0; + int cmp; + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, len; + btstack_t btstack; /* traverse stack */ + xtsplit_t split; /* split information */ + xad_t *xad; + s64 xaddr; + tlock_t *tlck; + xtlock_t *xtlck = 0; + int rootsplit = 0; + + jFYI(1, + ("xtExtend: nxoff:0x%lx nxlen:0x%x\n", (ulong) xoff, xlen)); + + /* there must exist extent to be extended */ + if ((rc = xtSearch(ip, xoff - 1, &cmp, &btstack, XT_INSERT))) + return rc; + assert(cmp == 0); + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* extension must be contiguous */ + xad = &p->xad[index]; + jFYI(0, ("xtExtend: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong) offsetXAD(xad), lengthXAD(xad), + (ulong) addressXAD(xad))); + assert((offsetXAD(xad) + lengthXAD(xad)) == xoff); + + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + + /* extend will overflow extent ? */ + xlen = lengthXAD(xad) + xlen; + if ((len = xlen - MAXXLEN) <= 0) + goto extendOld; + + /* + * extent overflow: insert entry for new extent + */ +//insertNew: + xoff = offsetXAD(xad) + MAXXLEN; + xaddr = addressXAD(xad) + MAXXLEN; + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + rootsplit = p->header.flag & BT_ROOT; + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = len; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (rootsplit) { + if (p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)) { + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, + tlckXTREE | + tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + } + } else + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* get back old entry */ + xad = &p->xad[index]; + xlen = MAXXLEN; + + /* + * extend old extent + */ + extendOld: + XADlength(xad, xlen); + if (!(xad->flag & XAD_NEW)) + xad->flag |= XAD_EXTENDED; + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtTailgate() + * + * function: split existing 'tail' extent + * (split offset >= start offset of tail extent), and + * relocate and extend the split tail half; + * + * note: existing extent may or may not have been committed. + * caller is responsible for pager buffer cache update, and + * working block allocation map update; + * update pmap: free old split tail extent, alloc new extent; + */ +int xtTailgate(tid_t tid, /* transaction id */ + struct inode *ip, s64 xoff, /* split/new extent offset */ + s32 xlen, /* new extent length */ + s64 xaddr, /* new extent address */ + int flag) +{ + int rc = 0; + int cmp; + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index, nextindex, llen, rlen; + btstack_t btstack; /* traverse stack */ + xtsplit_t split; /* split information */ + xad_t *xad; + tlock_t *tlck; + xtlock_t *xtlck = 0; + tlock_t *mtlck; + maplock_t *pxdlock; + int rootsplit = 0; + +/* +printf("xtTailgate: nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* there must exist extent to be tailgated */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + assert(cmp == 0); + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + /* entry found must be last entry */ + nextindex = le16_to_cpu(p->header.nextindex); + assert(index == nextindex - 1); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + + /* completely replace extent ? */ + xad = &p->xad[index]; +/* +printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + (ulong)offsetXAD(xad), lengthXAD(xad), (ulong)addressXAD(xad)); +*/ + if ((llen = xoff - offsetXAD(xad)) == 0) + goto updateOld; + + /* + * partially replace extent: insert entry for new extent + */ +//insertNew: + /* + * if the leaf page is full, insert the new entry and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { + rootsplit = p->header.flag & BT_ROOT; + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = index + 1; + split.flag = XAD_NEW; + split.off = xoff; /* split offset */ + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (rootsplit) { + if (p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)) { + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, + tlckXTREE | + tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + } + } else + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + } + /* + * insert the new entry into the leaf page + */ + else { + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index + 1]; + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* get back old XAD */ + xad = &p->xad[index]; + + /* + * truncate/relocate old extent at split offset + */ + updateOld: + /* update dmap for old/committed/truncated extent */ + rlen = lengthXAD(xad) - llen; + if (!(xad->flag & XAD_NEW)) { + /* free from PWMAP at commit */ + if (!test_cflag(COMMIT_Nolink, ip)) { + mtlck = txMaplock(tid, ip, tlckMAP); + pxdlock = (maplock_t *) & mtlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, addressXAD(xad) + llen); + PXDlength(&pxdlock->pxd, rlen); + pxdlock->index = 1; + } + jEVENT(0, + ("xtTailgate: free extent xaddr:0x%lx xlen:0x%x\n", + (ulong) addressPXD(&pxdlock->pxd), + lengthPXD(&pxdlock->pxd))); + } else + /* free from WMAP */ + dbFree(ip, addressXAD(xad) + llen, (s64) rlen); + + if (llen) + /* truncate */ + XADlength(xad, llen); + else + /* replace */ + XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); + + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* + * xtUpdate() + * + * function: update XAD; + * + * update extent for allocated_but_not_recorded or + * compressed extent; + * + * parameter: + * nxad - new XAD; + * logical extent of the specified XAD must be completely + * contained by an existing XAD; + */ +int xtUpdate(tid_t tid, struct inode *ip, xad_t * nxad) +{ /* new XAD */ + int rc = 0; + int cmp; + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn; + int index0, index, newindex, nextindex; + btstack_t btstack; /* traverse stack */ + xtsplit_t split; /* split information */ + xad_t *xad, *lxad, *rxad; + int xflag; + s64 nxoff, xoff; + int nxlen, xlen, lxlen, rxlen; + s64 nxaddr, xaddr; + tlock_t *tlck; + xtlock_t *xtlck = 0; + int rootsplit = 0, newpage = 0; + + /* there must exist extent to be tailgated */ + nxoff = offsetXAD(nxad); + nxlen = lengthXAD(nxad); + nxaddr = addressXAD(nxad); +/* +printf("xtUpdate: nxflag:0x%x nxoff:0x%lx nxlen:0x%x nxaddr:0x%lx\n", + nxad->flag, (ulong)nxoff, nxlen, (ulong)nxaddr); +*/ + if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) + return rc; + assert(cmp == 0); + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire tlock of the leaf page containing original entry + */ + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + + xad = &p->xad[index0]; + xflag = xad->flag; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); +/* +printf("xtUpdate: xflag:0x%x xoff:0x%lx xlen:0x%x xaddr:0x%lx\n", + xflag, (ulong)xoff, xlen, (ulong)xaddr); +*/ + + /* nXAD must be completely contained within XAD */ + assert(xoff <= nxoff); + assert(nxoff + nxlen <= xoff + xlen); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + +#ifdef _JFS_WIP_NOCOALESCE + if (xoff < nxoff) + goto updateRight; + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto out; + } else /* (nxlen < xlen) */ + goto updateLeft; +#endif /* _JFS_WIP_NOCOALESCE */ + +/* #ifdef _JFS_WIP_COALESCE */ + if (xoff < nxoff) + goto coalesceRight; + + /* + * coalesce with left XAD + */ +//coalesceLeft: /* (xoff == nxoff) */ + /* is XAD first entry of page ? */ + if (index == XTENTRYSTART) + goto replace; + + /* is nXAD logically and physically contiguous with lXAD ? */ + lxad = &p->xad[index - 1]; + lxlen = lengthXAD(lxad); + if (!(lxad->flag & XAD_NOTRECORDED) && + (nxoff == offsetXAD(lxad) + lxlen) && + (nxaddr == addressXAD(lxad) + lxlen) && + (lxlen + nxlen < MAXXLEN)) { + /* extend right lXAD */ + index0 = index - 1; + XADlength(lxad, lxlen + nxlen); + + /* If we just merged two extents together, need to make sure the + * right extent gets logged. If the left one is marked XAD_NEW, + * then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(lxad->flag & XAD_NEW)) + lxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) { + /* truncate XAD */ + XADoffset(xad, xoff + nxlen); + XADlength(xad, xlen - nxlen); + XADaddress(xad, xaddr + nxlen); + goto out; + } else { /* (xlen == nxlen) */ + + /* remove XAD */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + + index = index0; + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xoff = nxoff = offsetXAD(lxad); + xlen = nxlen = lxlen + nxlen; + xaddr = nxaddr = addressXAD(lxad); + goto coalesceRight; + } + } + + /* + * replace XAD with nXAD + */ + replace: /* (nxoff == xoff) */ + if (nxlen == xlen) { + /* replace XAD with nXAD:recorded */ + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + goto coalesceRight; + } else /* (nxlen < xlen) */ + goto updateLeft; + + /* + * coalesce with right XAD + */ + coalesceRight: /* (xoff <= nxoff) */ + /* is XAD last entry of page ? */ + if (newindex == nextindex) { + if (xoff == nxoff) + goto out; + goto updateRight; + } + + /* is nXAD logically and physically contiguous with rXAD ? */ + rxad = &p->xad[index + 1]; + rxlen = lengthXAD(rxad); + if (!(rxad->flag & XAD_NOTRECORDED) && + (nxoff + nxlen == offsetXAD(rxad)) && + (nxaddr + nxlen == addressXAD(rxad)) && + (rxlen + nxlen < MAXXLEN)) { + /* extend left rXAD */ + XADoffset(rxad, nxoff); + XADlength(rxad, rxlen + nxlen); + XADaddress(rxad, nxaddr); + + /* If we just merged two extents together, need to make sure + * the left extent gets logged. If the right one is marked + * XAD_NEW, then we know it will be logged. Otherwise, mark as + * XAD_EXTENDED + */ + if (!(rxad->flag & XAD_NEW)) + rxad->flag |= XAD_EXTENDED; + + if (xlen > nxlen) + /* truncate XAD */ + XADlength(xad, xlen - nxlen); + else { /* (xlen == nxlen) */ + + /* remove XAD */ + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + } + + goto out; + } else if (xoff == nxoff) + goto out; + + assert(xoff < nxoff); +/* #endif _JFS_WIP_COALESCE */ + + /* + * split XAD into (lXAD, nXAD): + * + * |---nXAD---> + * --|----------XAD----------|-- + * |-lXAD-| + */ + updateRight: /* (xoff < nxoff) */ + /* truncate old XAD as lXAD:not_recorded */ + xad = &p->xad[index]; + XADlength(xad, nxoff - xoff); + + /* insert nXAD:recorded */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate.updateRight.split p:0x%p\n", p); +*/ + rootsplit = p->header.flag & BT_ROOT; + + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag & ~XAD_NOTRECORDED; + split.off = nxoff; + split.len = nxlen; + split.addr = nxaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (rootsplit) { + if (p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)) { + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, + tlckXTREE | + tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + } + } else { + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + /* is nXAD on new page ? */ + if (newindex > + (le16_to_cpu(p->header.maxentry) >> 1)) { + newindex = + newindex - + le16_to_cpu(p->header.nextindex) + + XTENTRYSTART; + newpage = 1; + } + } + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + /* + * does nXAD force 3-way split ? + * + * |---nXAD--->| + * --|----------XAD-------------|-- + * |-lXAD-| |-rXAD -| + */ + if (nxoff + nxlen == xoff + xlen) + goto out; + + /* reorient nXAD as XAD for further split XAD into (nXAD, rXAD) */ + if (newpage) { + /* close out old page */ + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + bn = le64_to_cpu(p->header.next); + XT_PUTPAGE(mp); + + /* get new right page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + + index0 = index = newindex; + } else + index++; + + newindex = index + 1; + nextindex = le16_to_cpu(p->header.nextindex); + xlen = xlen - (nxoff - xoff); + xoff = nxoff; + xaddr = nxaddr; + + /* recompute split pages */ + if (nextindex == le16_to_cpu(p->header.maxentry)) { +/* +printf("xtUpdate: updateRight+Left recompute split pages: p:0x%p\n", p); +*/ + XT_PUTPAGE(mp); + + if ((rc = xtSearch(ip, nxoff, &cmp, &btstack, XT_INSERT))) + return rc; + assert(cmp == 0); + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index0); + assert(index0 == index); + } + + /* + * split XAD into (nXAD, rXAD) + * + * ---nXAD---| + * --|----------XAD----------|-- + * |-rXAD-| + */ + updateLeft: /* (nxoff == xoff) && (nxlen < xlen) */ + /* update old XAD with nXAD:recorded */ + xad = &p->xad[index]; + *xad = *nxad; + xad->flag = xflag & ~XAD_NOTRECORDED; + + /* insert rXAD:not_recorded */ + xoff = xoff + nxlen; + xlen = xlen - nxlen; + xaddr = xaddr + nxlen; + if (nextindex == le16_to_cpu(p->header.maxentry)) { + rootsplit = p->header.flag & BT_ROOT; + +/* +printf("xtUpdate.updateLeft.split p:0x%p\n", p); +*/ + /* xtSpliUp() unpins leaf pages */ + split.mp = mp; + split.index = newindex; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + split.pxdlist = NULL; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) + return rc; + + /* + * if leaf root has been split, original root has been + * copied to new child page, i.e., original entry now + * resides on the new child page; + */ + if (rootsplit) { + if (p->header.nextindex == + cpu_to_le16(XTENTRYSTART + 1)) { + xad = &p->xad[XTENTRYSTART]; + bn = addressXAD(xad); + + /* get new child page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + + BT_MARK_DIRTY(mp, ip); + if (!test_cflag(COMMIT_Nolink, ip)) { + tlck = txLock(tid, ip, mp, + tlckXTREE | + tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + } + } + } else + /* get back old page */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + } else { + /* if insert into middle, shift right remaining entries */ + if (newindex < nextindex) + memmove(&p->xad[newindex + 1], &p->xad[newindex], + (nextindex - newindex) << L2XTSLOTSIZE); + + /* insert the entry */ + xad = &p->xad[newindex]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index. */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + } + + out: + if (!test_cflag(COMMIT_Nolink, ip)) { + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index0, (int)xtlck->lwm.offset) : index0; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +#ifdef _STILL_TO_PORT +/* + * xtAppend() + * + * function: grow in append mode from contiguous region specified ; + * + * parameter: + * tid - transaction id; + * ip - file object; + * xflag - extent flag: + * xoff - extent offset; + * maxblocks - max extent length; + * xlen - extent length (in/out); + * xaddrp - extent address pointer (in/out): + * flag - + * + * return: + */ +int xtAppend(tid_t tid, /* transaction id */ + struct inode *ip, int xflag, s64 xoff, s32 maxblocks, /* @GD1 */ + s32 * xlenp, /* (in/out) */ + s64 * xaddrp, /* (in/out) */ + int flag) +{ + int rc = 0; + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* base B+-tree index page */ + s64 bn, xaddr; + int index, nextindex; + btstack_t btstack; /* traverse stack */ + xtsplit_t split; /* split information */ + xad_t *xad; + int cmp; + tlock_t *tlck; + xtlock_t *xtlck; + int nsplit, nblocks, xlen; + pxdlist_t pxdlist; + pxd_t *pxd; + + xaddr = *xaddrp; + xlen = *xlenp; + jEVENT(0, + ("xtAppend: xoff:0x%lx maxblocks:%d xlen:%d xaddr:0x%lx\n", + (ulong) xoff, maxblocks, xlen, (ulong) xaddr)); + + /* + * search for the entry location at which to insert: + * + * xtFastSearch() and xtSearch() both returns (leaf page + * pinned, index at which to insert). + * n.b. xtSearch() may return index of maxentry of + * the full page. + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, XT_INSERT))) + return rc; + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + + if (cmp == 0) { + rc = EEXIST; + goto out; + } +//insert: + /* + * insert entry for new extent + */ + xflag |= XAD_NEW; + + /* + * if the leaf page is full, split the page and + * propagate up the router entry for the new page from split + * + * The xtSplitUp() will insert the entry and unpin the leaf page. + */ + nextindex = le16_to_cpu(p->header.nextindex); + if (nextindex < le16_to_cpu(p->header.maxentry)) + goto insertLeaf; + + /* + * allocate new index blocks to cover index page split(s) + */ + nsplit = btstack.nsplit; + split.pxdlist = &pxdlist; + pxdlist.maxnpxd = pxdlist.npxd = 0; + pxd = &pxdlist.pxd[0]; + nblocks = JFS_SBI(ip->i_sb)->nbperpage; + for (; nsplit > 0; nsplit--, pxd++, xaddr += nblocks, maxblocks -= nblocks) { /* @GD1 */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) nblocks)) == 0) { + PXDaddress(pxd, xaddr); + PXDlength(pxd, nblocks); + + pxdlist.maxnpxd++; + + continue; + } + + /* undo allocation */ + + goto out; + } + + xlen = min(xlen, maxblocks); /* @GD1 */ + + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + split.mp = mp; + split.index = index; + split.flag = xflag; + split.off = xoff; + split.len = xlen; + split.addr = xaddr; + if ((rc = xtSplitUp(tid, ip, &split, &btstack))) { + /* undo data extent allocation */ + dbFree(ip, *xaddrp, (s64) * xlenp); + + return rc; + } + + *xaddrp = xaddr; + *xlenp = xlen; + return 0; + + /* + * insert the new entry into the leaf page + */ + insertLeaf: + /* + * allocate data extent requested + */ + if ((rc = dbAllocBottomUp(ip, xaddr, (s64) xlen))) + goto out; + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action: xad insertion/extension; + */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + + /* insert the new entry: mark the entry NEW */ + xad = &p->xad[index]; + XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); + + /* advance next available entry index */ + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1); + + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; + xtlck->lwm.length = le16_to_cpu(p->header.nextindex) - + xtlck->lwm.offset; + + *xaddrp = xaddr; + *xlenp = xlen; + + out: + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + return rc; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDelete() + * + * function: + * delete the entry with the specified key. + * + * N.B.: whole extent of the entry is assumed to be deleted. + * + * parameter: + * + * return: + * ENOENT: if the entry is not found. + * + * exception: + */ +int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag) +{ + int rc = 0; + btstack_t btstack; + int cmp; + s64 bn; + metapage_t *mp; + xtpage_t *p; + int index, nextindex; + tlock_t *tlck; + xtlock_t *xtlck; + + /* + * find the matching entry; xtSearch() pins the page + */ + if ((rc = xtSearch(ip, xoff, &cmp, &btstack, 0))) + return rc; + + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + if (cmp) { + /* unpin the leaf page */ + XT_PUTPAGE(mp); + return ENOENT; + } + + /* + * delete the entry from the leaf page + */ + nextindex = le16_to_cpu(p->header.nextindex); + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1); + + /* + * if the leaf page bocome empty, free the page + */ + if (p->header.nextindex == cpu_to_le16(XTENTRYSTART)) + return (xtDeleteUp(tid, ip, mp, p, &btstack)); + + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, xtlck->lwm.offset) : index; + + /* if delete from middle, shift left/compact the remaining entries */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - 1) * sizeof(xad_t)); + + XT_PUTPAGE(mp); + + return 0; +} + + +/* - TBD for defragmentaion/reorganization - + * + * xtDeleteUp() + * + * function: + * free empty pages as propagating deletion up the tree + * + * parameter: + * + * return: + */ +static int +xtDeleteUp(tid_t tid, + struct inode *ip, + metapage_t * fmp, xtpage_t * fp, btstack_t * btstack) +{ + int rc = 0; + metapage_t *mp; + xtpage_t *p; + int index, nextindex; + s64 xaddr; + int xlen; + btframe_t *parent; + tlock_t *tlck; + xtlock_t *xtlck; + + /* + * keep root leaf page which has become empty + */ + if (fp->header.flag & BT_ROOT) { + /* keep the root page */ + fp->header.flag &= ~BT_INTERNAL; + fp->header.flag |= BT_LEAF; + fp->header.nextindex = cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + return 0; + } + + /* + * free non-root leaf page + */ + if ((rc = xtRelink(tid, ip, fp))) + return rc; + + xaddr = addressPXD(&fp->header.self); + xlen = lengthPXD(&fp->header.self); + /* free the page extent */ + dbFree(ip, xaddr, (s64) xlen); + + /* free the buffer page */ + discard_metapage(fmp); + + /* + * propagate page deletion up the index tree + * + * If the delete from the parent page makes it empty, + * continue all the way up the tree. + * stop if the root page is reached (which is never deleted) or + * if the entry deletion does not empty the page. + */ + while ((parent = BT_POP(btstack)) != NULL) { + /* get/pin the parent page */ + XT_GETPAGE(ip, parent->bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + index = parent->index; + + /* delete the entry for the freed child page from parent. + */ + nextindex = le16_to_cpu(p->header.nextindex); + + /* + * the parent has the single entry being deleted: + * free the parent page which has become empty. + */ + if (nextindex == 1) { + if (p->header.flag & BT_ROOT) { + /* keep the root page */ + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = + cpu_to_le16(XTENTRYSTART); + + /* XT_PUTPAGE(fmp); */ + + break; + } else { + /* free the parent page */ + if ((rc = xtRelink(tid, ip, p))) + return rc; + + xaddr = addressPXD(&p->header.self); + /* free the page extent */ + dbFree(ip, xaddr, + (s64) JFS_SBI(ip->i_sb)->nbperpage); + + /* unpin/free the buffer page */ + discard_metapage(fmp); + + /* propagate up */ + continue; + } + } + /* + * the parent has other entries remaining: + * delete the router entry from the parent page. + */ + else { + BT_MARK_DIRTY(mp, ip); + /* + * acquire a transaction lock on the leaf page; + * + * action:xad deletion; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->lwm.offset = + (xtlck->lwm.offset) ? min(index, + xtlck->lwm. + offset) : index; + + /* if delete from middle, + * shift left/compact the remaining entries in the page + */ + if (index < nextindex - 1) + memmove(&p->xad[index], &p->xad[index + 1], + (nextindex - index - + 1) << L2XTSLOTSIZE); + + p->header.nextindex = + cpu_to_le16(le16_to_cpu(p->header.nextindex) - + 1); + jEVENT(0, + ("xtDeleteUp(entry): 0x%lx[%d]\n", + (ulong) parent->bn, index)); + } + + /* unpin the parent page */ + XT_PUTPAGE(mp); + + /* exit propagation up */ + break; + } + + return 0; +} + + +/* + * NAME: xtRelocate() + * + * FUNCTION: relocate xtpage or data extent of regular file; + * This function is mainly used by defragfs utility. + * + * NOTE: This routine does not have the logic to handle + * uncommitted allocated extent. The caller should call + * txCommit() to commit all the allocation before call + * this routine. + */ +xtRelocate(tid_t tid, struct inode * ip, xad_t * oxad, /* old XAD */ + s64 nxaddr, /* new xaddr */ + int xtype) +{ /* extent type: XTPAGE or DATAEXT */ + int rc = 0; + tblock_t *tblk; + tlock_t *tlck; + xtlock_t *xtlck; + metapage_t *mp, *pmp, *lmp, *rmp; /* meta-page buffer */ + xtpage_t *p, *pp, *rp, *lp; /* base B+-tree index page */ + xad_t *xad; + pxd_t *pxd; + s64 xoff, xsize; + int xlen; + s64 oxaddr, sxaddr, dxaddr, nextbn, prevbn; + cbuf_t *cp; + s64 offset, nbytes, nbrd, pno; + int nb, npages, nblks; + s64 bn; + int cmp; + int index; + pxdlock_t *pxdlock; + btstack_t btstack; /* traverse stack */ + + xtype = xtype & EXTENT_TYPE; + + xoff = offsetXAD(oxad); + oxaddr = addressXAD(oxad); + xlen = lengthXAD(oxad); + + /* validate extent offset */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + if (offset >= ip->i_size) + return ESTALE; /* stale extent */ + + jEVENT(0, + ("xtRelocate: xtype:%d xoff:0x%lx xlen:0x%x xaddr:0x%lx:0x%lx\n", + xtype, (ulong) xoff, xlen, (ulong) oxaddr, + (ulong) nxaddr)); + + /* + * 1. get and validate the parent xtpage/xad entry + * covering the source extent to be relocated; + */ + if (xtype == DATAEXT) { + /* search in leaf entry */ + rc = xtSearch(ip, xoff, &cmp, &btstack, 0); + if (rc) + return rc; + if (cmp) { + XT_PUTPAGE(pmp); + return ESTALE; + } + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + /* validate for exact match with a single entry */ + xad = &pp->xad[index]; + if (addressXAD(xad) != oxaddr || lengthXAD(xad) != xlen) { + XT_PUTPAGE(pmp); + return ESTALE; + } + } else { /* (xtype == XTPAGE) */ + + /* search in internal entry */ + rc = xtSearchNode(ip, oxad, &cmp, &btstack, 0); + if (rc) + return rc; + if (cmp) { + XT_PUTPAGE(pmp); + return ESTALE; + } + + /* retrieve search result */ + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + + /* xtSearchNode() validated for exact match with a single entry + */ + xad = &pp->xad[index]; + } + jEVENT(0, ("xtRelocate: parent xad entry validated.\n")); + + /* + * 2. relocate the extent + */ + if (xtype == DATAEXT) { + /* if the extent is allocated-but-not-recorded + * there is no real data to be moved in this extent, + */ + if (xad->flag & XAD_NOTRECORDED) + goto out; + else + /* release xtpage for cmRead()/xtLookup() */ + XT_PUTPAGE(pmp); + + /* + * cmRelocate() + * + * copy target data pages to be relocated; + * + * data extent must start at page boundary and + * multiple of page size (except the last data extent); + * read in each page of the source data extent into cbuf, + * update the cbuf extent descriptor of the page to be + * homeward bound to new dst data extent + * copy the data from the old extent to new extent. + * copy is essential for compressed files to avoid problems + * that can arise if there was a change in compression + * algorithms. + * it is a good strategy because it may disrupt cache + * policy to keep the pages in memory afterwards. + */ + offset = xoff << JFS_SBI(ip->i_sb)->l2bsize; + assert((offset & CM_OFFSET) == 0); + nbytes = xlen << JFS_SBI(ip->i_sb)->l2bsize; + pno = offset >> CM_L2BSIZE; + npages = (nbytes + (CM_BSIZE - 1)) >> CM_L2BSIZE; +/* + npages = ((offset + nbytes - 1) >> CM_L2BSIZE) - + (offset >> CM_L2BSIZE) + 1; +*/ + sxaddr = oxaddr; + dxaddr = nxaddr; + + /* process the request one cache buffer at a time */ + for (nbrd = 0; nbrd < nbytes; nbrd += nb, + offset += nb, pno++, npages--) { + /* compute page size */ + nb = min(nbytes - nbrd, CM_BSIZE); + + /* get the cache buffer of the page */ + if (rc = cmRead(ip, offset, npages, &cp)) + break; + + assert(addressPXD(&cp->cm_pxd) == sxaddr); + assert(!cp->cm_modified); + + /* bind buffer with the new extent address */ + nblks = nb >> JFS_IP(ip->i_sb)->l2bsize; + cmSetXD(ip, cp, pno, dxaddr, nblks); + + /* release the cbuf, mark it as modified */ + cmPut(cp, TRUE); + + dxaddr += nblks; + sxaddr += nblks; + } + + /* get back parent page */ + rc = xtSearch(ip, xoff, &cmp, &btstack, 0); + XT_GETSEARCH(ip, btstack.top, bn, pmp, pp, index); + jEVENT(0, ("xtRelocate: target data extent relocated.\n")); + } else { /* (xtype == XTPAGE) */ + + /* + * read in the target xtpage from the source extent; + */ + XT_GETPAGE(ip, oxaddr, mp, PSIZE, p, rc); + if (rc) { + XT_PUTPAGE(pmp); + return rc; + } + + /* + * read in sibling pages if any to update sibling pointers; + */ + rmp = NULL; + if (p->header.next) { + nextbn = le64_to_cpu(p->header.next); + XT_GETPAGE(ip, nextbn, rmp, PSIZE, rp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + return (rc); + } + } + + lmp = NULL; + if (p->header.prev) { + prevbn = le64_to_cpu(p->header.prev); + XT_GETPAGE(ip, prevbn, lmp, PSIZE, lp, rc); + if (rc) { + XT_PUTPAGE(pmp); + XT_PUTPAGE(mp); + if (rmp) + XT_PUTPAGE(rmp); + return (rc); + } + } + + /* at this point, all xtpages to be updated are in memory */ + + /* + * update sibling pointers of sibling xtpages if any; + */ + if (lmp) { + BT_MARK_DIRTY(lmp, ip); + tlck = + txLock(tid, ip, lmp, tlckXTREE | tlckRELINK); + lp->header.next = cpu_to_le64(nxaddr); + XT_PUTPAGE(lmp); + } + + if (rmp) { + BT_MARK_DIRTY(rmp, ip); + tlck = + txLock(tid, ip, rmp, tlckXTREE | tlckRELINK); + rp->header.prev = cpu_to_le64(nxaddr); + XT_PUTPAGE(rmp); + } + + /* + * update the target xtpage to be relocated + * + * update the self address of the target page + * and write to destination extent; + * redo image covers the whole xtpage since it is new page + * to the destination extent; + * update of bmap for the free of source extent + * of the target xtpage itself: + * update of bmap for the allocation of destination extent + * of the target xtpage itself: + * update of bmap for the extents covered by xad entries in + * the target xtpage is not necessary since they are not + * updated; + * if not committed before this relocation, + * target page may contain XAD_NEW entries which must + * be scanned for bmap update (logredo() always + * scan xtpage REDOPAGE image for bmap update); + * if committed before this relocation (tlckRELOCATE), + * scan may be skipped by commit() and logredo(); + */ + BT_MARK_DIRTY(mp, ip); + /* tlckNEW init xtlck->lwm.offset = XTENTRYSTART; */ + tlck = txLock(tid, ip, mp, tlckXTREE | tlckNEW); + xtlck = (xtlock_t *) & tlck->lock; + + /* update the self address in the xtpage header */ + pxd = &p->header.self; + PXDaddress(pxd, nxaddr); + + /* linelock for the after image of the whole page */ + xtlck->lwm.length = + le16_to_cpu(p->header.nextindex) - xtlck->lwm.offset; + + /* update the buffer extent descriptor of target xtpage */ + xsize = xlen << JFS_SBI(ip->i_sb)->l2bsize; + bmSetXD(mp, nxaddr, xsize); + + /* unpin the target page to new homeward bound */ + XT_PUTPAGE(mp); + jEVENT(0, ("xtRelocate: target xtpage relocated.\n")); + } + + /* + * 3. acquire maplock for the source extent to be freed; + * + * acquire a maplock saving the src relocated extent address; + * to free of the extent at commit time; + */ + out: + /* if DATAEXT relocation, write a LOG_UPDATEMAP record for + * free PXD of the source data extent (logredo() will update + * bmap for free of source data extent), and update bmap for + * free of the source data extent; + */ + if (xtype == DATAEXT) + tlck = txMaplock(tid, ip, tlckMAP); + /* if XTPAGE relocation, write a LOG_NOREDOPAGE record + * for the source xtpage (logredo() will init NoRedoPage + * filter and will also update bmap for free of the source + * xtpage), and update bmap for free of the source xtpage; + * N.B. We use tlckMAP instead of tlkcXTREE because there + * is no buffer associated with this lock since the buffer + * has been redirected to the target location. + */ + else /* (xtype == XTPAGE) */ + tlck = txMaplock(tid, ip, tlckMAP | tlckRELOCATE); + + pxdlock = (pxdlock_t *) & tlck->lock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, oxaddr); + PXDlength(&pxdlock->pxd, xlen); + pxdlock->index = 1; + + /* + * 4. update the parent xad entry for relocation; + * + * acquire tlck for the parent entry with XAD_NEW as entry + * update which will write LOG_REDOPAGE and update bmap for + * allocation of XAD_NEW destination extent; + */ + jEVENT(0, ("xtRelocate: update parent xad entry.\n")); + BT_MARK_DIRTY(pmp, ip); + tlck = txLock(tid, ip, pmp, tlckXTREE | tlckGROW); + xtlck = (xtlock_t *) & tlck->lock; + + /* update the XAD with the new destination extent; */ + xad = &pp->xad[index]; + xad->flag |= XAD_NEW; + XADaddress(xad, nxaddr); + + xtlck->lwm.offset = min(index, xtlck->lwm.offset); + xtlck->lwm.length = le16_to_cpu(pp->header.nextindex) - + xtlck->lwm.offset; + + /* unpin the parent xtpage */ + XT_PUTPAGE(pmp); + + return rc; +} + + +/* + * xtSearchNode() + * + * function: search for the internal xad entry covering specified extent. + * This function is mainly used by defragfs utility. + * + * parameters: + * ip - file object; + * xad - extent to find; + * cmpp - comparison result: + * btstack - traverse stack; + * flag - search process flag; + * + * returns: + * btstack contains (bn, index) of search path traversed to the entry. + * *cmpp is set to result of comparison with the entry returned. + * the page containing the entry is pinned at exit. + */ +static int xtSearchNode(struct inode *ip, xad_t * xad, /* required XAD entry */ + int *cmpp, btstack_t * btstack, int flag) +{ + int rc = 0; + s64 xoff, xaddr; + int xlen; + int cmp = 1; /* init for empty page */ + s64 bn; /* block number */ + metapage_t *mp; /* meta-page buffer */ + xtpage_t *p; /* page */ + int base, index, lim; + btframe_t *btsp; + s64 t64; + + BT_CLR(btstack); + + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * search down tree from root: + * + * between two consecutive entries of and of + * internal page, child page Pi contains entry with k, Ki <= K < Kj. + * + * if entry with search key K is not found + * internal page search find the entry with largest key Ki + * less than K which point to the child page to search; + * leaf page search find the entry with smallest key Kj + * greater than K so that the returned index is the position of + * the entry to be shifted right for insertion of new entry. + * for empty tree, search key is greater than any key of the tree. + * + * by convention, root bn = 0. + */ + for (bn = 0;;) { + /* get/pin the page to search */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + if (p->header.flag & BT_LEAF) + return ESTALE; + + lim = le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + + /* + * binary search with search key K on the current page + */ + for (base = XTENTRYSTART; lim; lim >>= 1) { + index = base + (lim >> 1); + + XT_CMP(cmp, xoff, &p->xad[index], t64); + if (cmp == 0) { + /* + * search hit + * + * verify for exact match; + */ + if (xaddr == addressXAD(&p->xad[index]) && + xoff == offsetXAD(&p->xad[index])) { + *cmpp = cmp; + + /* save search result */ + btsp = btstack->top; + btsp->bn = bn; + btsp->index = index; + btsp->mp = mp; + + return 0; + } + + /* descend/search its child page */ + goto next; + } + + if (cmp > 0) { + base = index + 1; + --lim; + } + } + + /* + * search miss - non-leaf page: + * + * base is the smallest index with key (Kj) greater than + * search key (K) and may be zero or maxentry index. + * if base is non-zero, decrement base by one to get the parent + * entry of the child page to search. + */ + index = base ? base - 1 : base; + + /* + * go down to child page + */ + next: + /* get the child page block number */ + bn = addressXAD(&p->xad[index]); + + /* unpin the parent page */ + XT_PUTPAGE(mp); + } +} + + +/* + * xtRelink() + * + * function: + * link around a freed page. + * + * Parameter: + * int tid, + * struct inode *ip, + * xtpage_t *p) + * + * returns: + */ +static int xtRelink(tid_t tid, struct inode *ip, xtpage_t * p) +{ + int rc = 0; + metapage_t *mp; + s64 nextbn, prevbn; + tlock_t *tlck; + + nextbn = le64_to_cpu(p->header.next); + prevbn = le64_to_cpu(p->header.prev); + + /* update prev pointer of the next page */ + if (nextbn != 0) { + XT_GETPAGE(ip, nextbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update prev pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.prev = cpu_to_le64(prevbn); + + XT_PUTPAGE(mp); + } + + /* update next pointer of the previous page */ + if (prevbn != 0) { + XT_GETPAGE(ip, prevbn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * acquire a transaction lock on the page; + * + * action: update next pointer; + */ + BT_MARK_DIRTY(mp, ip); + tlck = txLock(tid, ip, mp, tlckXTREE | tlckRELINK); + + /* the page may already have been tlock'd */ + + p->header.next = le64_to_cpu(nextbn); + + XT_PUTPAGE(mp); + } + + return 0; +} +#endif /* _STILL_TO_PORT */ + + +/* + * xtInitRoot() + * + * initialize file root (inline in inode) + */ +void xtInitRoot(tid_t tid, struct inode *ip) +{ + xtpage_t *p; + tlock_t *tlck; + + /* + * acquire a transaction lock on the root + * + * action: + */ + tlck = txLock(tid, ip, (metapage_t *) &JFS_IP(ip)->bxflag, + tlckXTREE | tlckNEW); + p = &JFS_IP(ip)->i_xtroot; + + p->header.flag = DXD_INDEX | BT_ROOT | BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + if (S_ISDIR(ip->i_mode)) + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT_DIR); + else { + p->header.maxentry = cpu_to_le16(XTROOTINITSLOT); + ip->i_size = 0; + } + + + return; +} + + +/* + * We can run into a deadlock truncating a file with a large number of + * xtree pages (large fragmented file). A robust fix would entail a + * reservation system where we would reserve a number of metadata pages + * and tlocks which we would be guaranteed without a deadlock. Without + * this, a partial fix is to limit number of metadata pages we will lock + * in a single transaction. Currently we will truncate the file so that + * no more than 50 leaf pages will be locked. The caller of xtTruncate + * will be responsible for ensuring that the current transaction gets + * committed, and that subsequent transactions are created to truncate + * the file further if needed. + */ +#define MAX_TRUNCATE_LEAVES 50 + +/* + * xtTruncate() + * + * function: + * traverse for truncation logging backward bottom up; + * terminate at the last extent entry at the current subtree + * root page covering new down size. + * truncation may occur within the last extent entry. + * + * parameter: + * int tid, + * struct inode *ip, + * s64 newsize, + * int type) {PWMAP, PMAP, WMAP; DELETE, TRUNCATE} + * + * return: + * + * note: + * PWMAP: + * 1. truncate (non-COMMIT_NOLINK file) + * by jfs_truncate() or jfs_open(O_TRUNC): + * xtree is updated; + * 2. truncate index table of directory when last entry removed + * map update via tlock at commit time; + * PMAP: + * Call xtTruncate_pmap instead + * WMAP: + * 1. remove (free zero link count) on last reference release + * (pmap has been freed at commit zero link count); + * 2. truncate (COMMIT_NOLINK file, i.e., tmp file): + * xtree is updated; + * map update directly at truncation time; + * + * if (DELETE) + * no LOG_NOREDOPAGE is required (NOREDOFILE is sufficient); + * else if (TRUNCATE) + * must write LOG_NOREDOPAGE for deleted index page; + * + * pages may already have been tlocked by anonymous transactions + * during file growth (i.e., write) before truncation; + * + * except last truncated entry, deleted entries remains as is + * in the page (nextindex is updated) for other use + * (e.g., log/update allocation map): this avoid copying the page + * info but delay free of pages; + * + */ +s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag) +{ + int rc = 0; + s64 teof; + metapage_t *mp; + xtpage_t *p; + s64 bn; + int index, nextindex; + xad_t *xad; + s64 xoff, xaddr; + int xlen, len, freexlen; + btstack_t btstack; + btframe_t *parent; + tblock_t *tblk = 0; + tlock_t *tlck = 0; + xtlock_t *xtlck = 0; + xdlistlock_t xadlock; /* maplock for COMMIT_WMAP */ + pxdlock_t *pxdlock; /* maplock for COMMIT_WMAP */ + s64 nfreed; + int freed, log; + int locked_leaves = 0; + + /* save object truncation type */ + if (tid) { + tblk = tid_to_tblock(tid); + tblk->xflag |= flag; + } + + nfreed = 0; + + flag &= COMMIT_MAP; + assert(flag != COMMIT_PMAP); + + if (flag == COMMIT_PWMAP) + log = 1; + else { + log = 0; + xadlock.flag = mlckFREEXADLIST; + xadlock.index = 1; + } + + /* + * if the newsize is not an integral number of pages, + * the file between newsize and next page boundary will + * be cleared. + * if truncating into a file hole, it will cause + * a full block to be allocated for the logical block. + */ + + /* + * release page blocks of truncated region + * + * free the data blocks from the leaf index blocks. + * delete the parent index entries corresponding to + * the freed child data/index blocks. + * free the index blocks themselves which aren't needed + * in new sized file. + * + * index blocks are updated only if the blocks are to be + * retained in the new sized file. + * if type is PMAP, the data and index pages are NOT + * freed, and the data and index blocks are NOT freed + * from working map. + * (this will allow continued access of data/index of + * temporary file (zerolink count file truncated to zero-length)). + */ + teof = (newsize + (JFS_SBI(ip->i_sb)->bsize - 1)) >> + JFS_SBI(ip->i_sb)->l2bsize; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return -rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + + /* + * leaf page + */ + + /* Since this is the rightmost leaf, and we may have already freed + * a page that was formerly to the right, let's make sure that the + * next pointer is zero. + */ + p->header.next = 0; + + freed = 0; + + /* does region covered by leaf page precede Teof ? */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + if (teof >= xoff + xlen) { + XT_PUTPAGE(mp); + goto getParent; + } + + /* (re)acquire tlock of the leaf page */ + if (log) { + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + XT_PUTPAGE(mp); + newsize = (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + goto getParent; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (xtlock_t *) & tlck->lock; + xtlck->hwm.offset = le16_to_cpu(p->header.nextindex) - 1; + } + BT_MARK_DIRTY(mp, ip); + + /* + * scan backward leaf page entries + */ + for (; index >= XTENTRYSTART; index--) { + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + xaddr = addressXAD(xad); + + /* + * entry beyond eof: continue scan of current page + * xad + * ---|---=======-------> + * eof + */ + if (teof < xoff) { + nfreed += xlen; + continue; + } + + /* + * (xoff <= teof): last entry to be deleted from page; + * If other entries remain in page: keep and update the page. + */ + + /* + * eof == entry_start: delete the entry + * xad + * -------|=======-------> + * eof + * + */ + if (teof == xoff) { + nfreed += xlen; + + if (index == XTENTRYSTART) + break; + + nextindex = index; + } + /* + * eof within the entry: truncate the entry. + * xad + * -------===|===-------> + * eof + */ + else if (teof < xoff + xlen) { + /* update truncated entry */ + len = teof - xoff; + freexlen = xlen - len; + XADlength(xad, len); + + /* save pxd of truncated extent in tlck */ + xaddr += len; + if (log) { /* COMMIT_PWMAP */ + xtlck->lwm.offset = (xtlck->lwm.offset) ? + min(index, (int)xtlck->lwm.offset) : index; + xtlck->lwm.length = index + 1 - + xtlck->lwm.offset; + pxdlock = (pxdlock_t *) & xtlck->pxdlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + } + /* free truncated extent */ + else { /* COMMIT_WMAP */ + + pxdlock = (pxdlock_t *) & xadlock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, freexlen); + txFreeMap(ip, pxdlock, 0, COMMIT_WMAP); + + /* reset map lock */ + xadlock.flag = mlckFREEXADLIST; + } + + /* current entry is new last entry; */ + nextindex = index + 1; + + nfreed += freexlen; + } + /* + * eof beyond the entry: + * xad + * -------=======---|---> + * eof + */ + else { /* (xoff + xlen < teof) */ + + nextindex = index + 1; + } + + if (nextindex < le16_to_cpu(p->header.nextindex)) { + if (!log) { /* COMMIT_WAMP */ + xadlock.xdlist = &p->xad[nextindex]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + nextindex; + txFreeMap(ip, (maplock_t *) & xadlock, 0, + COMMIT_WMAP); + } + p->header.nextindex = cpu_to_le16(nextindex); + } + + XT_PUTPAGE(mp); + + /* assert(freed == 0); */ + goto getParent; + } /* end scan of leaf page entries */ + + freed = 1; + + /* + * leaf page become empty: free the page if type != PMAP + */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free data extents covered by leaf [XTENTRYSTART:hwm); + * invalidate leaf if COMMIT_PWMAP; + * if (TRUNCATE), will write LOG_NOREDOPAGE; + */ + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WAMP */ + + /* free data extents covered by leaf */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - XTENTRYSTART; + txFreeMap(ip, (maplock_t *) & xadlock, 0, COMMIT_WMAP); + } + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= tlckFREELOCK; + + /* invalidate empty leaf page */ + discard_metapage(mp); + } + } + + /* + * the leaf page become empty: delete the parent entry + * for the leaf page if the parent page is to be kept + * in the new sized file. + */ + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return -rc; + + index = parent->index; + + /* + * child page was not empty: + */ + if (freed == 0) { + /* has any entry deleted from parent ? */ + if (index < le16_to_cpu(p->header.nextindex) - 1) { + /* (re)acquire tlock on the parent page */ + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckTRUNCATE: + * free child extents covered by parent [); + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->twm.offset = index; + if (!(tlck->type & tlckTRUNCATE)) { + xtlck->hwm.offset = + le16_to_cpu(p->header. + nextindex) - 1; + tlck->type = + tlckXTREE | tlckTRUNCATE; + } + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[index + 1]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + index - 1; + txFreeMap(ip, (maplock_t *) & xadlock, 0, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + p->header.nextindex = cpu_to_le16(index + 1); + } + XT_PUTPAGE(mp); + goto getParent; + } + + /* + * child page was empty: + */ + nfreed += lengthXAD(&p->xad[index]); + + /* + * During working map update, child page's tlock must be handled + * before parent's. This is because the parent's tlock will cause + * the child's disk space to be marked available in the wmap, so + * it's important that the child page be released by that time. + * + * ToDo: tlocks should be on doubly-linked list, so we can + * quickly remove it and add it to the end. + */ + + /* + * Move parent page's tlock to the end of the tid's tlock list + */ + if (log && mp->lid && (tblk->last != mp->lid) && + lid_to_tlock(mp->lid)->tid) { + lid_t lid = mp->lid; + tlock_t *prev; + + tlck = lid_to_tlock(lid); + + if (tblk->next == lid) + tblk->next = tlck->next; + else { + for (prev = lid_to_tlock(tblk->next); + prev->next != lid; + prev = lid_to_tlock(prev->next)) { + assert(prev->next); + } + prev->next = tlck->next; + } + lid_to_tlock(tblk->last)->next = lid; + tlck->next = 0; + tblk->last = lid; + } + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + if (log) { /* COMMIT_PWMAP */ + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->twm.offset = index; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + } else { /* COMMIT_WMAP */ + + /* free child extents covered by parent */ + xadlock.xdlist = &p->xad[XTENTRYSTART]; + xadlock.count = + le16_to_cpu(p->header.nextindex) - + XTENTRYSTART; + txFreeMap(ip, (maplock_t *) & xadlock, 0, + COMMIT_WMAP); + } + BT_MARK_DIRTY(mp, ip); + + if (p->header.flag & BT_ROOT) { + p->header.flag &= ~BT_INTERNAL; + p->header.flag |= BT_LEAF; + p->header.nextindex = cpu_to_le16(XTENTRYSTART); + if (le16_to_cpu(p->header.maxentry) == XTROOTMAXSLOT) { + /* + * Shrink root down to allow inline + * EA (otherwise fsck complains) + */ + p->header.maxentry = + cpu_to_le16(XTROOTINITSLOT); + JFS_IP(ip)->mode2 |= INLINEEA; + } + + XT_PUTPAGE(mp); /* debug */ + goto out; + } else { + if (log) { /* COMMIT_PWMAP */ + /* page will be invalidated at tx completion + */ + XT_PUTPAGE(mp); + } else { /* COMMIT_WMAP */ + + if (mp->lid) + lid_to_tlock(mp->lid)->flag |= + tlckFREELOCK; + + /* invalidate parent page */ + discard_metapage(mp); + } + + /* parent has become empty and freed: + * go back up to its parent page + */ + /* freed = 1; */ + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else { + /* try truncate region covered by preceding entry + * (process backward) + */ + index--; + + /* go back down to the child page corresponding + * to the entry + */ + goto getChild; + } + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + /* + * update file resource stat + */ + /* set size + */ + if (S_ISDIR(ip->i_mode) && !newsize) + ip->i_size = 1; /* fsck hates zero-length directories */ + else + ip->i_size = newsize; + + /* update nblocks to reflect freed blocks */ + ip->i_blocks -= LBLK2PBLK(ip->i_sb, nfreed); + + /* + * free tlock of invalidated pages + */ + if (flag == COMMIT_WMAP) + txFreelock(ip); + + return newsize; +} + + +/* + * xtTruncate_pmap() + * + * function: + * Perform truncate to zero lenghth for deleted file, leaving the + * the xtree and working map untouched. This allows the file to + * be accessed via open file handles, while the delete of the file + * is committed to disk. + * + * parameter: + * tid_t tid, + * struct inode *ip, + * s64 committed_size) + * + * return: new committed size + * + * note: + * + * To avoid deadlock by holding too many transaction locks, the + * truncation may be broken up into multiple transactions. + * The committed_size keeps track of part of the file has been + * freed from the pmaps. + */ +s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size) +{ + s64 bn; + btstack_t btstack; + int cmp; + int index; + int locked_leaves = 0; + metapage_t *mp; + xtpage_t *p; + btframe_t *parent; + int rc; + tblock_t *tblk; + tlock_t *tlck = 0; + xad_t *xad; + int xlen; + s64 xoff; + xtlock_t *xtlck = 0; + + /* save object truncation type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* clear stack */ + BT_CLR(&btstack); + + if (committed_size) { + xoff = (committed_size >> JFS_SBI(ip->i_sb)->l2bsize) - 1; + rc = xtSearch(ip, xoff, &cmp, &btstack, 0); + if (rc) + return -rc; + assert(cmp == 0); + XT_GETSEARCH(ip, btstack.top, bn, mp, p, index); + } else { + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return -rc; + + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) + goto getChild; + } + + /* + * leaf page + */ + + if (++locked_leaves > MAX_TRUNCATE_LEAVES) { + /* + * We need to limit the size of the transaction + * to avoid exhausting pagecache & tlocks + */ + xad = &p->xad[index]; + xoff = offsetXAD(xad); + xlen = lengthXAD(xad); + XT_PUTPAGE(mp); + return (xoff + xlen) << JFS_SBI(ip->i_sb)->l2bsize; + } + tlck = txLock(tid, ip, mp, tlckXTREE); + tlck->type = tlckXTREE | tlckTRUNCATE; + xtlck = (xtlock_t *) & tlck->lock; + xtlck->hwm.offset = index; + + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + goto out; + + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return -rc; + + index = parent->index; + + /* + * parent page become empty: free the page + */ + if (index == XTENTRYSTART) { + /* txCommit() with tlckFREE: + * free child extents covered by parent; + * invalidate parent if COMMIT_PWMAP; + */ + tlck = txLock(tid, ip, mp, tlckXTREE); + xtlck = (xtlock_t *) & tlck->lock; + xtlck->twm.offset = index; + xtlck->hwm.offset = + le16_to_cpu(p->header.nextindex) - 1; + tlck->type = tlckXTREE | tlckFREE; + + XT_PUTPAGE(mp); + + if (p->header.flag & BT_ROOT) { + + goto out; + } else { + goto getParent; + } + } + /* + * parent page still has entries for front region; + */ + else + index--; + /* + * internal page: go down to child page of current entry + */ + getChild: + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + /* process the child page */ + goto getPage; + + out: + + return 0; +} + + +#ifdef _JFS_DEBUG_XTREE +/* + * xtDisplayTree() + * + * function: traverse forward + */ +int xtDisplayTree(struct inode *ip) +{ + int rc = 0; + metapage_t *mp; + xtpage_t *p; + s64 bn, pbn; + int index, lastindex, v, h; + xad_t *xad; + btstack_t btstack; + btframe_t *btsp; + btframe_t *parent; + + printk("display B+-tree.\n"); + + /* clear stack */ + btsp = btstack.stack; + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + v = h = 0; + + /* + * first access of each page: + */ + getPage: + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* process entries forward from first index */ + index = XTENTRYSTART; + lastindex = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_INTERNAL) { + /* + * first access of each internal page + */ + goto getChild; + } else { /* (p->header.flag & BT_LEAF) */ + + /* + * first access of each leaf page + */ + printf("leaf page "); + xtDisplayPage(ip, bn, p); + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + } + + /* + * go back up to the parent page + */ + getParent: + /* pop/restore parent entry for the current child page */ + if ((parent = (btsp == btstack.stack ? NULL : --btsp)) == NULL) + /* current page must have been root */ + return; + + /* + * parent page scan completed + */ + if ((index = parent->index) == (lastindex = parent->lastindex)) { + /* go back up to the parent page */ + goto getParent; + } + + /* + * parent page has entries remaining + */ + /* get back the parent page */ + bn = parent->bn; + /* v = parent->level; */ + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* get next parent entry */ + index++; + + /* + * internal page: go down to child page of current entry + */ + getChild: + /* push/save current parent entry for the child page */ + btsp->bn = pbn = bn; + btsp->index = index; + btsp->lastindex = lastindex; + /* btsp->level = v; */ + /* btsp->node = h; */ + ++btsp; + + /* get child page */ + xad = &p->xad[index]; + bn = addressXAD(xad); + + /* + * first access of each internal entry: + */ + /* release parent page */ + XT_PUTPAGE(mp); + + printk("traverse down 0x%lx[%d]->0x%lx\n", (ulong) pbn, index, + (ulong) bn); + v++; + h = index; + + /* process the child page */ + goto getPage; +} + + +/* + * xtDisplayPage() + * + * function: display page + */ +int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p) +{ + int rc = 0; + metapage_t *mp; + xad_t *xad; + s64 xaddr, xoff; + int xlen, i, j; + + if (p == NULL) { + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + } + + /* display page control */ + printf("bn:0x%lx flag:0x%x nextindex:%d\n", + (ulong) bn, p->header.flag, + le16_to_cpu(p->header.nextindex)); + + /* display entries */ + xad = &p->xad[XTENTRYSTART]; + for (i = XTENTRYSTART, j = 1; i < le16_to_cpu(p->header.nextindex); + i++, xad++, j++) { + xoff = offsetXAD(xad); + xaddr = addressXAD(xad); + xlen = lengthXAD(xad); + printf("\t[%d] 0x%lx:0x%lx(0x%x)", i, (ulong) xoff, + (ulong) xaddr, xlen); + + if (j == 4) { + printf("\n"); + j = 0; + } + } + + printf("\n"); +} +#endif /* _JFS_DEBUG_XTREE */ + + +#ifdef _JFS_WIP +/* + * xtGather() + * + * function: + * traverse for allocation acquiring tlock at commit time + * (vs at the time of update) logging backward top down + * + * note: + * problem - establishing that all new allocation have been + * processed both for append and random write in sparse file + * at the current entry at the current subtree root page + * + */ +int xtGather(t) +btree_t *t; +{ + int rc = 0; + xtpage_t *p; + u64 bn; + int index; + btentry_t *e; + btstack_t btstack; + struct btsf *parent; + + /* clear stack */ + BT_CLR(&btstack); + + /* + * start with root + * + * root resides in the inode + */ + bn = 0; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* new root is NOT pointed by a new entry + if (p->header.flag & NEW) + allocate new page lock; + write a NEWPAGE log; + */ + + dopage: + /* + * first access of each page: + */ + /* process entries backward from last index */ + index = le16_to_cpu(p->header.nextindex) - 1; + + if (p->header.flag & BT_LEAF) { + /* + * first access of each leaf page + */ + /* process leaf page entries backward */ + for (; index >= XTENTRYSTART; index--) { + e = &p->xad[index]; + /* + * if newpage, log NEWPAGE. + * + if (e->flag & XAD_NEW) { + nfound =+ entry->length; + update current page lock for the entry; + newpage(entry); + * + * if moved, log move. + * + } else if (e->flag & XAD_MOVED) { + reset flag; + update current page lock for the entry; + } + */ + } + + /* unpin the leaf page */ + XT_PUTPAGE(mp); + + /* + * go back up to the parent page + */ + getParent: + /* restore parent entry for the current child page */ + if ((parent = BT_POP(&btstack)) == NULL) + /* current page must have been root */ + return 0; + + if ((index = parent->index) == XTENTRYSTART) { + /* + * parent page scan completed + */ + /* go back up to the parent page */ + goto getParent; + } else { + /* + * parent page has entries remaining + */ + /* get back the parent page */ + bn = parent->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return EIO; + + /* first subroot page which + * covers all new allocated blocks + * itself not new/modified. + * (if modified from split of descendent, + * go down path of split page) + + if (nfound == nnew && + !(p->header.flag & (NEW | MOD))) + exit scan; + */ + + /* process parent page entries backward */ + index--; + } + } else { + /* + * first access of each internal page + */ + } + + /* + * internal page: go down to child page of current entry + */ + + /* save current parent entry for the child page */ + BT_PUSH(&btstack, bn, index); + + /* get current entry for the child page */ + e = &p->xad[index]; + + /* + * first access of each internal entry: + */ + /* + * if new entry, log btree_tnewentry. + * + if (e->flag & XAD_NEW) + update parent page lock for the entry; + */ + + /* release parent page */ + XT_PUTPAGE(mp); + + /* get child page */ + bn = e->bn; + XT_GETPAGE(ip, bn, mp, PSIZE, p, rc); + if (rc) + return rc; + + /* + * first access of each non-root page: + */ + /* + * if new, log btree_newpage. + * + if (p->header.flag & NEW) + allocate new page lock; + write a NEWPAGE log (next, prev); + */ + + /* process the child page */ + goto dopage; + + out: + return 0; +} +#endif /* _JFS_WIP */ + + +#ifdef CONFIG_JFS_STATISTICS +int jfs_xtstat_read(char *buffer, char **start, off_t offset, int length, + int *eof, void *data) +{ + int len = 0; + off_t begin; + + len += sprintf(buffer, + "JFS Xtree statistics\n" + "====================\n" + "searches = %d\n" + "fast searches = %d\n" + "splits = %d\n", + xtStat.search, + xtStat.fastSearch, + xtStat.split); + + begin = offset; + *start = buffer + begin; + len -= begin; + + if (len > length) + len = length; + else + *eof = 1; + + if (len < 0) + len = 0; + + return len; +} +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/jfs_xtree.h linuxppc64_2_4/fs/jfs/jfs_xtree.h --- ../kernel.org/linux-2.4.19/fs/jfs/jfs_xtree.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/jfs_xtree.h Tue Apr 23 11:14:25 2002 @@ -0,0 +1,143 @@ +/* + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +/* + * Change History : + * +*/ + +#ifndef _H_JFS_XTREE +#define _H_JFS_XTREE + +/* + * jfs_xtree.h: extent allocation descriptor B+-tree manager + */ + +#include "jfs_btree.h" + + +/* + * extent allocation descriptor (xad) + */ +typedef struct xad { + unsigned flag:8; /* 1: flag */ + unsigned rsvrd:16; /* 2: reserved */ + unsigned off1:8; /* 1: offset in unit of fsblksize */ + u32 off2; /* 4: offset in unit of fsblksize */ + unsigned len:24; /* 3: length in unit of fsblksize */ + unsigned addr1:8; /* 1: address in unit of fsblksize */ + u32 addr2; /* 4: address in unit of fsblksize */ +} xad_t; /* (16) */ + +#define MAXXLEN ((1 << 24) - 1) + +#define XTSLOTSIZE 16 +#define L2XTSLOTSIZE 4 + +/* xad_t field construction */ +#define XADoffset(xad, offset64)\ +{\ + (xad)->off1 = ((u64)offset64) >> 32;\ + (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ +} +#define XADaddress(xad, address64)\ +{\ + (xad)->addr1 = ((u64)address64) >> 32;\ + (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ +} +#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32) + +/* xad_t field extraction */ +#define offsetXAD(xad)\ + ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) +#define addressXAD(xad)\ + ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) +#define lengthXAD(xad) __le24_to_cpu((xad)->len) + +/* xad list */ +typedef struct { + s16 maxnxad; + s16 nxad; + xad_t *xad; +} xadlist_t; + +/* xad_t flags */ +#define XAD_NEW 0x01 /* new */ +#define XAD_EXTENDED 0x02 /* extended */ +#define XAD_COMPRESSED 0x04 /* compressed with recorded length */ +#define XAD_NOTRECORDED 0x08 /* allocated but not recorded */ +#define XAD_COW 0x10 /* copy-on-write */ + + +/* possible values for maxentry */ +#define XTROOTINITSLOT_DIR 6 +#define XTROOTINITSLOT 10 +#define XTROOTMAXSLOT 18 +#define XTPAGEMAXSLOT 256 +#define XTENTRYSTART 2 + +/* + * xtree page: + */ +typedef union { + struct xtheader { + s64 next; /* 8: */ + s64 prev; /* 8: */ + + u8 flag; /* 1: */ + u8 rsrvd1; /* 1: */ + s16 nextindex; /* 2: next index = number of entries */ + s16 maxentry; /* 2: max number of entries */ + s16 rsrvd2; /* 2: */ + + pxd_t self; /* 8: self */ + } header; /* (32) */ + + xad_t xad[XTROOTMAXSLOT]; /* 16 * maxentry: xad array */ +} xtpage_t; + +/* + * external declaration + */ +extern int xtLookup(struct inode *ip, s64 lstart, s64 llen, + int *pflag, s64 * paddr, int *plen, int flag); +extern int xtLookupList(struct inode *ip, lxdlist_t * lxdlist, + xadlist_t * xadlist, int flag); +extern void xtInitRoot(tid_t tid, struct inode *ip); +extern int xtInsert(tid_t tid, struct inode *ip, + int xflag, s64 xoff, int xlen, s64 * xaddrp, int flag); +extern int xtExtend(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern int xtTailgate(tid_t tid, struct inode *ip, + s64 xoff, int xlen, s64 xaddr, int flag); +extern int xtUpdate(tid_t tid, struct inode *ip, struct xad *nxad); +extern int xtDelete(tid_t tid, struct inode *ip, s64 xoff, int xlen, + int flag); +extern s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int type); +extern s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size); +extern int xtRelocate(tid_t tid, struct inode *ip, + xad_t * oxad, s64 nxaddr, int xtype); +extern int xtAppend(tid_t tid, + struct inode *ip, int xflag, s64 xoff, int maxblocks, + int *xlenp, s64 * xaddrp, int flag); + +#ifdef _JFS_DEBUG_XTREE +extern int xtDisplayTree(struct inode *ip); +extern int xtDisplayPage(struct inode *ip, s64 bn, xtpage_t * p); +#endif /* _JFS_DEBUG_XTREE */ + +#endif /* !_H_JFS_XTREE */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/namei.c linuxppc64_2_4/fs/jfs/namei.c --- ../kernel.org/linux-2.4.19/fs/jfs/namei.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/namei.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,1461 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Module: jfs/namei.c + * + */ + +/* + * Change History : + * + */ + +#include +#include +#include "jfs_incore.h" +#include "jfs_inode.h" +#include "jfs_dinode.h" +#include "jfs_dmap.h" +#include "jfs_unicode.h" +#include "jfs_metapage.h" +#include "jfs_debug.h" + +extern struct inode_operations jfs_file_inode_operations; +extern struct inode_operations jfs_symlink_inode_operations; +extern struct file_operations jfs_file_operations; +extern struct address_space_operations jfs_aops; + +extern int jfs_fsync(struct file *, struct dentry *, int); +extern void jfs_truncate_nolock(struct inode *, loff_t); + +/* + * forward references + */ +struct inode_operations jfs_dir_inode_operations; +struct file_operations jfs_dir_operations; + +s64 commitZeroLink(tid_t, struct inode *); + +/* + * NAME: jfs_create(dip, dentry, mode) + * + * FUNCTION: create a regular file in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of new file + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + */ +int jfs_create(struct inode *dip, struct dentry *dentry, int mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + component_t dname; /* child directory name */ + btstack_t btstack; + struct inode *iplist[2]; + tblock_t *tblk; + + jFYI(1, ("jfs_create: dip:0x%p name:%s\n", dip, dentry->d_name.name)); + + IWRITE_LOCK(dip); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, mode); + if (ip == NULL) { + rc = ENOSPC; + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jERROR(1, ("jfs_create: dtSearch returned %d\n", rc)); + ip->i_nlink = 0; + iput(ip); + txEnd(tid); + goto out2; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ip = ip; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child XAD tree root in-line in inode + */ + xtInitRoot(tid, ip); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + jERROR(1, ("jfs_create: dtInsert returned %d\n", rc)); + /* discard new inode */ + ip->i_nlink = 0; + iput(ip); + + if (rc == EIO) + txAbort(tid, 1); /* Marks Filesystem dirty */ + else + txAbort(tid, 0); /* Filesystem full */ + txEnd(tid); + goto out2; + } + + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + + insert_inode_hash(ip); + mark_inode_dirty(ip); + d_instantiate(dentry, ip); + + dip->i_version = ++event; + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + txEnd(tid); + + out2: + free_UCSname(&dname); + + out1: + + IWRITE_UNLOCK(dip); + jFYI(1, ("jfs_create: rc:%d\n", -rc)); + return -rc; +} + + +/* + * NAME: jfs_mkdir(dip, dentry, mode) + * + * FUNCTION: create a child directory in the parent directory + * with name = and mode = + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of child directory + * mode - create mode (rwxrwxrwx). + * + * RETURN: Errors from subroutines + * + * note: + * EACCESS: user needs search+write permission on the parent directory + */ +int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) +{ + int rc = 0; + tid_t tid; /* transaction id */ + struct inode *ip = NULL; /* child directory inode */ + ino_t ino; + component_t dname; /* child directory name */ + btstack_t btstack; + struct inode *iplist[2]; + tblock_t *tblk; + + jFYI(1, ("jfs_mkdir: dip:0x%p name:%s\n", dip, dentry->d_name.name)); + + IWRITE_LOCK(dip); + + /* link count overflow on parent directory ? */ + if (dip->i_nlink == JFS_LINK_MAX) { + rc = EMLINK; + goto out1; + } + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + goto out1; + + /* + * Either iAlloc() or txBegin() may block. Deadlock can occur if we + * block there while holding dtree page, so we allocate the inode & + * begin the transaction before we search the directory. + */ + ip = ialloc(dip, S_IFDIR | mode); + if (ip == NULL) { + rc = ENOSPC; + goto out2; + } + + tid = txBegin(dip->i_sb, 0); + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) { + jERROR(1, ("jfs_mkdir: dtSearch returned %d\n", rc)); + ip->i_nlink = 0; + iput(ip); + txEnd(tid); + goto out2; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ip = ip; + + iplist[0] = dip; + iplist[1] = ip; + + /* + * initialize the child directory in-line in inode + */ + dtInitRoot(tid, ip, dip->i_ino); + + /* + * create entry in parent directory for child directory + * (dtInsert() releases parent directory page) + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + jERROR(1, ("jfs_mkdir: dtInsert returned %d\n", rc)); + /* discard new directory inode */ + ip->i_nlink = 0; + iput(ip); + + if (rc == EIO) + txAbort(tid, 1); /* Marks Filesystem dirty */ + else + txAbort(tid, 0); /* Filesystem full */ + txEnd(tid); + goto out2; + } + + ip->i_nlink = 2; /* for '.' */ + ip->i_op = &jfs_dir_inode_operations; + ip->i_fop = &jfs_dir_operations; + ip->i_mapping->a_ops = &jfs_aops; + ip->i_mapping->gfp_mask = GFP_NOFS; + + insert_inode_hash(ip); + mark_inode_dirty(ip); + d_instantiate(dentry, ip); + + /* update parent directory inode */ + dip->i_nlink++; /* for '..' from child directory */ + dip->i_version = ++event; + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + mark_inode_dirty(dip); + + rc = txCommit(tid, 2, &iplist[0], 0); + txEnd(tid); + + out2: + free_UCSname(&dname); + + out1: + + IWRITE_UNLOCK(dip); + + jFYI(1, ("jfs_mkdir: rc:%d\n", -rc)); + return -rc; +} + +/* + * NAME: jfs_rmdir(dip, dentry) + * + * FUNCTION: remove a link to child directory + * + * PARAMETER: dip - parent inode + * dentry - child directory dentry + * + * RETURN: EINVAL - if name is . or .. + * EINVAL - if . or .. exist but are invalid. + * errors from subroutines + * + * note: + * if other threads have the directory open when the last link + * is removed, the "." and ".." entries, if present, are removed before + * rmdir() returns and no new entries may be created in the directory, + * but the directory is not removed until the last reference to + * the directory is released (cf.unlink() of regular file). + */ +int jfs_rmdir(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + component_t dname; + struct inode *iplist[2]; + tblock_t *tblk; + + jFYI(1, ("jfs_rmdir: dip:0x%p name:%s\n", dip, dentry->d_name.name)); + + IWRITE_LOCK_LIST(2, dip, ip); + + /* directory must be empty to be removed */ + if (!dtEmpty(ip)) { + IWRITE_UNLOCK(ip); + IWRITE_UNLOCK(dip); + rc = ENOTEMPTY; + goto out; + } + + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dip->i_sb)->nls_tab))) { + IWRITE_UNLOCK(ip); + IWRITE_UNLOCK(dip); + goto out; + } + + tid = txBegin(dip->i_sb, 0); + + iplist[0] = dip; + iplist[1] = ip; + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->ip = ip; + + /* + * delete the entry of target directory from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jERROR(1, ("jfs_rmdir: dtDelete returned %d\n", rc)); + if (rc == EIO) + txAbort(tid, 1); + txEnd(tid); + + IWRITE_UNLOCK(ip); + IWRITE_UNLOCK(dip); + + goto out2; + } + + /* update parent directory's link count corresponding + * to ".." entry of the target directory deleted + */ + dip->i_nlink--; + dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_version = ++event; + mark_inode_dirty(dip); + + /* + * OS/2 could have created EA and/or ACL + */ + /* free EA from both persistent and working map */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + /* free EA pages */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + } + JFS_IP(ip)->ea.flag = 0; + + /* free ACL from both persistent and working map */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + /* free ACL pages */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + } + JFS_IP(ip)->acl.flag = 0; + + /* mark the target directory as deleted */ + ip->i_nlink = 0; + mark_inode_dirty(ip); + + rc = txCommit(tid, 2, &iplist[0], 0); + + txEnd(tid); + + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + IWRITE_UNLOCK(dip); + + d_delete(dentry); + + out2: + free_UCSname(&dname); + + out: + jFYI(1, ("jfs_rmdir: rc:%d\n", rc)); + return -rc; +} + +/* + * NAME: jfs_unlink(dip, dentry) + * + * FUNCTION: remove a link to object named by + * from parent directory + * + * PARAMETER: dip - inode of parent directory + * dentry - dentry of object to be removed + * + * RETURN: errors from subroutines + * + * note: + * temporary file: if one or more processes have the file open + * when the last link is removed, the link will be removed before + * unlink() returns, but the removal of the file contents will be + * postponed until all references to the files are closed. + * + * JFS does NOT support unlink() on directories. + * + */ +int jfs_unlink(struct inode *dip, struct dentry *dentry) +{ + int rc; + tid_t tid; /* transaction id */ + struct inode *ip = dentry->d_inode; + ino_t ino; + component_t dname; /* object name */ + struct inode *iplist[2]; + tblock_t *tblk; + s64 new_size = 0; + int commit_flag; + + jFYI(1, ("jfs_unlink: dip:0x%p name:%s\n", dip, dentry->d_name.name)); + + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + goto out; + + IWRITE_LOCK_LIST(2, ip, dip); + + tid = txBegin(dip->i_sb, 0); + + iplist[0] = dip; + iplist[1] = ip; + + /* + * delete the entry of target file from parent directory + */ + ino = ip->i_ino; + if ((rc = dtDelete(tid, dip, &dname, &ino, JFS_REMOVE))) { + jERROR(1, ("jfs_unlink: dtDelete returned %d\n", rc)); + if (rc == EIO) + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + IWRITE_UNLOCK(ip); + IWRITE_UNLOCK(dip); + goto out1; + } + + ASSERT(ip->i_nlink); + + ip->i_ctime = dip->i_ctime = dip->i_mtime = CURRENT_TIME; + dip->i_version = ++event; + mark_inode_dirty(dip); + + /* update target's inode */ + ip->i_nlink--; + mark_inode_dirty(ip); + + /* + * commit zero link count object + */ + if (ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + txEnd(tid); + IWRITE_UNLOCK(ip); + IWRITE_UNLOCK(dip); + rc = -new_size; /* We return -rc */ + goto out1; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->ip = ip; + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + /* + * If xtTruncate was incomplete, commit synchronously to avoid + * timing complications + */ + rc = txCommit(tid, 2, &iplist[0], commit_flag); + + txEnd(tid); + + while (new_size && (rc == 0)) { + tid = txBegin(dip->i_sb, 0); + new_size = xtTruncate_pmap(tid, ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = -new_size; /* We return -rc */ + } else + rc = txCommit(tid, 2, &iplist[0], COMMIT_SYNC); + txEnd(tid); + } + + if (!test_cflag(COMMIT_Holdlock, ip)) + IWRITE_UNLOCK(ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, dip)) { + if (dip->i_size > 1) + jfs_truncate_nolock(dip, 0); + + clear_cflag(COMMIT_Stale, dip); + } + + IWRITE_UNLOCK(dip); + + d_delete(dentry); + + out1: + free_UCSname(&dname); + out: + jFYI(1, ("jfs_unlink: rc:%d\n", -rc)); + return -rc; +} + +/* + * NAME: commitZeroLink() + * + * FUNCTION: for non-directory, called by jfs_remove(), + * truncate a regular file, directory or symbolic + * link to zero length. return 0 if type is not + * one of these. + * + * if the file is currently associated with a VM segment + * only permanent disk and inode map resources are freed, + * and neither the inode nor indirect blocks are modified + * so that the resources can be later freed in the work + * map by ctrunc1. + * if there is no VM segment on entry, the resources are + * freed in both work and permanent map. + * (? for temporary file - memory object is cached even + * after no reference: + * reference count > 0 - ) + * + * PARAMETERS: cd - pointer to commit data structure. + * current inode is the one to truncate. + * + * RETURN : Errors from subroutines + */ +s64 commitZeroLink(tid_t tid, struct inode *ip) +{ + int filetype; + tblock_t *tblk; + + jFYI(1, ("commitZeroLink: tid = %d, ip = 0x%p\n", tid, ip)); + + filetype = ip->i_mode & S_IFMT; + switch (filetype) { + case S_IFREG: + break; + case S_IFLNK: + /* fast symbolic link */ + if (ip->i_size <= 256) { + ip->i_size = 0; + return 0; + } + break; + default: + assert(filetype != S_IFDIR); + return 0; + } + + set_cflag(COMMIT_Freewmap, ip); + + /* mark transaction of block map update type */ + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_PMAP; + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->ea, NULL); + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) + /* acquire maplock on EA to be freed from block map */ + txEA(tid, ip, &JFS_IP(ip)->acl, NULL); + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache if COMMIT_PWMAP, + * free xtree/data blocks from persistent block map, and + * free xtree/data blocks from working block map if COMMIT_PWMAP; + */ + if (ip->i_size) + return xtTruncate_pmap(tid, ip, 0); + + return 0; +} + + +/* + * NAME: freeZeroLink() + * + * FUNCTION: for non-directory, called by iClose(), + * free resources of a file from cache and WORKING map + * for a file previously committed with zero link count + * while associated with a pager object, + * + * PARAMETER: ip - pointer to inode of file. + * + * RETURN: 0 -ok + */ +int freeZeroLink(struct inode *ip) +{ + int rc = 0; + int type; + + jFYI(1, ("freeZeroLink: ip = 0x%p\n", ip)); + + /* return if not reg or symbolic link or if size is + * already ok. + */ + type = ip->i_mode & S_IFMT; + + switch (type) { + case S_IFREG: + break; + case S_IFLNK: + /* if its contained in inode nothing to do */ + if (ip->i_size <= 256) + return 0; + break; + default: + return 0; + } + + /* + * free EA + */ + if (JFS_IP(ip)->ea.flag & DXD_EXTENT) { + s64 xaddr; + int xlen; + maplock_t maplock; /* maplock for COMMIT_WMAP */ + pxdlock_t *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free EA pages from cache */ + xaddr = addressDXD(&JFS_IP(ip)->ea); + xlen = lengthDXD(&JFS_IP(ip)->ea); +#ifdef _STILL_TO_PORT + bmExtentInvalidate(ip, xaddr, xlen); +#endif + + /* free EA extent from working block map */ + maplock.index = 1; + pxdlock = (pxdlock_t *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, 0, COMMIT_WMAP); + } + + /* + * free ACL + */ + if (JFS_IP(ip)->acl.flag & DXD_EXTENT) { + s64 xaddr; + int xlen; + maplock_t maplock; /* maplock for COMMIT_WMAP */ + pxdlock_t *pxdlock; /* maplock for COMMIT_WMAP */ + + /* free ACL pages from cache */ + xaddr = addressDXD(&JFS_IP(ip)->acl); + xlen = lengthDXD(&JFS_IP(ip)->acl); +#ifdef _STILL_TO_PORT + bmExtentInvalidate(ip, xaddr, xlen); +#endif + + /* free ACL extent from working block map */ + maplock.index = 1; + pxdlock = (pxdlock_t *) & maplock; + pxdlock->flag = mlckFREEPXD; + PXDaddress(&pxdlock->pxd, xaddr); + PXDlength(&pxdlock->pxd, xlen); + txFreeMap(ip, pxdlock, 0, COMMIT_WMAP); + } + + /* + * free xtree/data (truncate to zero length): + * free xtree/data pages from cache, and + * free xtree/data blocks from working block map; + */ + if (ip->i_size) + rc = xtTruncate(0, ip, 0, COMMIT_WMAP); + + return rc; +} + +/* + * NAME: jfs_link(vp, dvp, name, crp) + * + * FUNCTION: create a link to by the name = + * in the parent directory + * + * PARAMETER: vp - target object + * dvp - parent directory of new link + * name - name of new link to target object + * crp - credential + * + * RETURN: Errors from subroutines + * + * note: + * JFS does NOT support link() on directories (to prevent circular + * path in the directory hierarchy); + * EPERM: the target object is a directory, and either the caller + * does not have appropriate privileges or the implementation prohibits + * using link() on directories [XPG4.2]. + * + * JFS does NOT support links between file systems: + * EXDEV: target object and new link are on different file systems and + * implementation does not support links between file systems [XPG4.2]. + */ +int jfs_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) +{ + int rc; + tid_t tid; + struct inode *ip = old_dentry->d_inode; + ino_t ino; + component_t dname; + btstack_t btstack; + struct inode *iplist[2]; + + jFYI(1, + ("jfs_link: %s %s\n", old_dentry->d_name.name, + dentry->d_name.name)); + + /* JFS does NOT support link() on directories */ + if (S_ISDIR(ip->i_mode)) + return -EPERM; + + IWRITE_LOCK_LIST(2, dir, ip); + + tid = txBegin(ip->i_sb, 0); + + if (ip->i_nlink == JFS_LINK_MAX) { + rc = EMLINK; + goto out; + } + + /* + * scan parent directory for entry/freespace + */ + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(ip->i_sb)->nls_tab))) + goto out; + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) + goto out; + + /* + * create entry for new link in parent directory + */ + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) + goto out; + + dir->i_version = ++event; + + /* update object inode */ + ip->i_nlink++; /* for new link */ + ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(dir); + atomic_inc(&ip->i_count); + d_instantiate(dentry, ip); + + iplist[0] = ip; + iplist[1] = dir; + rc = txCommit(tid, 2, &iplist[0], 0); + + out: + IWRITE_UNLOCK(dir); + IWRITE_UNLOCK(ip); + + txEnd(tid); + + jFYI(1, ("jfs_link: rc:%d\n", rc)); + return -rc; +} + +/* + * NAME: jfs_symlink(dip, dentry, name) + * + * FUNCTION: creates a symbolic link to by name + * in directory + * + * PARAMETER: dip - parent directory vnode + * dentry - dentry of symbolic link + * name - the path name of the existing object + * that will be the source of the link + * + * RETURN: errors from subroutines + * + * note: + * ENAMETOOLONG: pathname resolution of a symbolic link produced + * an intermediate result whose length exceeds PATH_MAX [XPG4.2] +*/ + +int jfs_symlink(struct inode *dip, struct dentry *dentry, const char *name) +{ + int rc; + tid_t tid; + ino_t ino = 0; + component_t dname; + int ssize; /* source pathname size */ + btstack_t btstack; + struct inode *ip = dentry->d_inode; + unchar *i_fastsymlink; + s64 xlen = 0; + int bmask = 0, xsize; + s64 xaddr; + metapage_t *mp; + struct super_block *sb; + tblock_t *tblk; + + struct inode *iplist[2]; + + jFYI(1, ("jfs_symlink: dip:0x%p name:%s\n", dip, name)); + + IWRITE_LOCK(dip); + + ssize = strlen(name) + 1; + + tid = txBegin(dip->i_sb, 0); + + /* + * search parent directory for entry/freespace + * (dtSearch() returns parent directory page pinned) + */ + + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + goto out1; + + if ((rc = dtSearch(dip, &dname, &ino, &btstack, JFS_CREATE))) + goto out2; + + + + /* + * allocate on-disk/in-memory inode for symbolic link: + * (iAlloc() returns new, locked inode) + */ + + ip = ialloc(dip, S_IFLNK | 0777); + if (ip == NULL) { + BT_PUTSEARCH(&btstack); + rc = ENOSPC; + goto out2; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ip = ip; + + /* + * create entry for symbolic link in parent directory + */ + + ino = ip->i_ino; + + + + if ((rc = dtInsert(tid, dip, &dname, &ino, &btstack))) { + jERROR(1, ("jfs_symlink: dtInsert returned %d\n", rc)); + /* discard ne inode */ + ip->i_nlink = 0; + iput(ip); + goto out2; + + } + + /* fix symlink access permission + * (dir_create() ANDs in the u.u_cmask, + * but symlinks really need to be 777 access) + */ + ip->i_mode |= 0777; + + /* + * write symbolic link target path name + */ + xtInitRoot(tid, ip); + + /* + * write source path name inline in on-disk inode (fast symbolic link) + */ + + if (ssize <= IDATASIZE) { + ip->i_op = &jfs_symlink_inode_operations; + + i_fastsymlink = JFS_IP(ip)->i_inline; + memcpy(i_fastsymlink, name, ssize); + ip->i_size = ssize - 1; + jFYI(1, + ("jfs_symlink: fast symlink added ssize:%d name:%s \n", + ssize, name)); + } + /* + * write source path name in a single extent + */ + else { + jFYI(1, ("jfs_symlink: allocate extent ip:0x%p\n", ip)); + + ip->i_op = &page_symlink_inode_operations; + ip->i_mapping->a_ops = &jfs_aops; + + /* + * even though the data of symlink object (source + * path name) is treated as non-journaled user data, + * it is read/written thru buffer cache for performance. + */ + sb = ip->i_sb; + bmask = JFS_SBI(sb)->bsize - 1; + xsize = (ssize + bmask) & ~bmask; + xaddr = 0; + xlen = xsize >> JFS_SBI(sb)->l2bsize; + if ((rc = xtInsert(tid, ip, 0, 0, xlen, &xaddr, 0)) == 0) { + ip->i_size = ssize - 1; + while (ssize) { + int copy_size = min(ssize, PSIZE); + + mp = get_metapage(ip, xaddr, PSIZE, 1); + + if (mp == NULL) { + dtDelete(tid, dip, &dname, &ino, + JFS_REMOVE); + ip->i_nlink = 0; + iput(ip); + rc = EIO; + goto out2; + } + memcpy(mp->data, name, copy_size); + flush_metapage(mp); +#if 0 + mark_buffer_uptodate(bp, 1); + mark_buffer_dirty(bp, 1); + if (IS_SYNC(dip)) { + ll_rw_block(WRITE, 1, &bp); + wait_on_buffer(bp); + } + brelse(bp); +#endif /* 0 */ + ssize -= copy_size; + xaddr += JFS_SBI(sb)->nbperpage; + } + ip->i_blocks = LBLK2PBLK(sb, xlen); + } else { + dtDelete(tid, dip, &dname, &ino, JFS_REMOVE); + ip->i_nlink = 0; + iput(ip); + rc = ENOSPC; + goto out2; + } + } + dip->i_version = ++event; + + insert_inode_hash(ip); + mark_inode_dirty(ip); + d_instantiate(dentry, ip); + + /* + * commit update of parent directory and link object + * + * if extent allocation failed (ENOSPC), + * the parent inode is committed regardless to avoid + * backing out parent directory update (by dtInsert()) + * and subsequent dtDelete() which is harmless wrt + * integrity concern. + * the symlink inode will be freed by iput() at exit + * as it has a zero link count (by dtDelete()) and + * no permanant resources. + */ + + iplist[0] = dip; + if (rc == 0) { + iplist[1] = ip; + rc = txCommit(tid, 2, &iplist[0], 0); + } else + rc = txCommit(tid, 1, &iplist[0], 0); + + out2: + + free_UCSname(&dname); + out1: + IWRITE_UNLOCK(dip); + + txEnd(tid); + + jFYI(1, ("jfs_symlink: rc:%d\n", -rc)); + return -rc; +} + + +/* + * NAME: jfs_rename + * + * FUNCTION: rename a file or directory + */ +int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + btstack_t btstack; + ino_t ino; + component_t new_dname; + struct inode *new_ip; + component_t old_dname; + struct inode *old_ip; + int rc; + tid_t tid; + tlock_t *tlck; + dtlock_t *dtlck; + lv_t *lv; + int ipcount; + struct inode *iplist[4]; + tblock_t *tblk; + s64 new_size = 0; + int commit_flag; + + + jFYI(1, + ("jfs_rename: %s %s\n", old_dentry->d_name.name, + new_dentry->d_name.name)); + + old_ip = old_dentry->d_inode; + new_ip = new_dentry->d_inode; + + if (old_dir == new_dir) { + if (new_ip) + IWRITE_LOCK_LIST(3, old_dir, old_ip, new_ip); + else + IWRITE_LOCK_LIST(2, old_dir, old_ip); + } else { + if (new_ip) + IWRITE_LOCK_LIST(4, old_dir, new_dir, old_ip, + new_ip); + else + IWRITE_LOCK_LIST(3, old_dir, new_dir, old_ip); + } + + if ((rc = get_UCSname(&old_dname, old_dentry, + JFS_SBI(old_dir->i_sb)->nls_tab))) + goto out1; + + if ((rc = get_UCSname(&new_dname, new_dentry, + JFS_SBI(old_dir->i_sb)->nls_tab))) + goto out2; + + /* + * Make sure source inode number is what we think it is + */ + rc = dtSearch(old_dir, &old_dname, &ino, &btstack, JFS_LOOKUP); + if (rc || (ino != old_ip->i_ino)) { + rc = ENOENT; + goto out3; + } + + /* + * Make sure dest inode number (if any) is what we think it is + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, JFS_LOOKUP); + if (rc == 0) { + if ((new_ip == 0) || (ino != new_ip->i_ino)) { + rc = ESTALE; + goto out3; + } + } else if (rc != ENOENT) + goto out3; + else if (new_ip) { + /* no entry exists, but one was expected */ + rc = ESTALE; + goto out3; + } + + if (S_ISDIR(old_ip->i_mode)) { + if (new_ip) { + if (!dtEmpty(new_ip)) { + rc = ENOTEMPTY; + goto out3; + } + } else if ((new_dir != old_dir) && + (new_dir->i_nlink == JFS_LINK_MAX)) { + rc = EMLINK; + goto out3; + } + } + + /* + * The real work starts here + */ + tid = txBegin(new_dir->i_sb, 0); + + if (new_ip) { + /* + * Change existing directory entry to new inode number + */ + ino = new_ip->i_ino; + rc = dtModify(tid, new_dir, &new_dname, &ino, + old_ip->i_ino, JFS_RENAME); + if (rc) + goto out4; + new_ip->i_nlink--; + if (S_ISDIR(new_ip->i_mode)) { + new_ip->i_nlink--; + assert(new_ip->i_nlink == 0); + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->ip = new_ip; + } else if (new_ip->i_nlink == 0) { + assert(!test_cflag(COMMIT_Nolink, new_ip)); + /* free block resources */ + if ((new_size = commitZeroLink(tid, new_ip)) < 0) { + txAbort(tid, 1); /* Marks FS Dirty */ + rc = -new_size; /* We return -rc */ + goto out4; + } + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_DELETE; + tblk->ip = new_ip; + } else { + new_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(new_ip); + } + } else { + /* + * Add new directory entry + */ + rc = dtSearch(new_dir, &new_dname, &ino, &btstack, + JFS_CREATE); + if (rc) { + jERROR(1, + ("jfs_rename didn't expect dtSearch to fail w/rc = %d\n", + rc)); + goto out4; + } + + ino = old_ip->i_ino; + rc = dtInsert(tid, new_dir, &new_dname, &ino, &btstack); + if (rc) { + jERROR(1, + ("jfs_rename: dtInsert failed w/rc = %d\n", + rc)); + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) + new_dir->i_nlink++; + } + /* + * Remove old directory entry + */ + + ino = old_ip->i_ino; + rc = dtDelete(tid, old_dir, &old_dname, &ino, JFS_REMOVE); + if (rc) { + jERROR(1, + ("jfs_rename did not expect dtDelete to return rc = %d\n", + rc)); + txAbort(tid, 1); /* Marks Filesystem dirty */ + goto out4; + } + if (S_ISDIR(old_ip->i_mode)) { + old_dir->i_nlink--; + if (old_dir != new_dir) { + /* + * Change inode number of parent for moved directory + */ + + JFS_IP(old_ip)->i_dtroot.header.idotdot = + cpu_to_le32(new_dir->i_ino); + + /* Linelock header of dtree */ + tlck = txLock(tid, old_ip, + (metapage_t *) & JFS_IP(old_ip)->bxflag, + tlckDTREE | tlckBTROOT); + dtlck = (dtlock_t *) & tlck->lock; + ASSERT(dtlck->index == 0); + lv = (lv_t *) & dtlck->lv[0]; + lv->offset = 0; + lv->length = 1; + dtlck->index++; + } + } + + /* + * Update ctime on changed/moved inodes & mark dirty + */ + old_ip->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_ip); + + new_dir->i_version = ++event; + new_dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(new_dir); + + /* Build list of inodes modified by this transaction */ + ipcount = 0; + iplist[ipcount++] = old_ip; + if (new_ip) + iplist[ipcount++] = new_ip; + iplist[ipcount++] = old_dir; + + if (old_dir != new_dir) { + iplist[ipcount++] = new_dir; + old_dir->i_version = ++event; + old_dir->i_ctime = CURRENT_TIME; + mark_inode_dirty(old_dir); + } + + /* + * Incomplete truncate of file data can + * result in timing problems unless we synchronously commit the + * transaction. + */ + if (new_size) + commit_flag = COMMIT_SYNC; + else + commit_flag = 0; + + rc = txCommit(tid, ipcount, iplist, commit_flag); + + /* + * Don't unlock new_ip if COMMIT_HOLDLOCK is set + */ + if (new_ip && test_cflag(COMMIT_Holdlock, new_ip)) + new_ip = 0; + + out4: + txEnd(tid); + + while (new_size && (rc == 0)) { + tid = txBegin(new_ip->i_sb, 0); + new_size = xtTruncate_pmap(tid, new_ip, new_size); + if (new_size < 0) { + txAbort(tid, 1); + rc = -new_size; /* We return -rc */ + } else + rc = txCommit(tid, 1, &new_ip, COMMIT_SYNC); + txEnd(tid); + } + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: + IWRITE_UNLOCK(old_ip); + if (old_dir != new_dir) + IWRITE_UNLOCK(new_dir); + if (new_ip) + IWRITE_UNLOCK(new_ip); + + /* + * Truncating the directory index table is not guaranteed. It + * may need to be done iteratively + */ + if (test_cflag(COMMIT_Stale, old_dir)) { + if (old_dir->i_size > 1) + jfs_truncate_nolock(old_dir, 0); + + clear_cflag(COMMIT_Stale, old_dir); + } + + IWRITE_UNLOCK(old_dir); + + jFYI(1, ("jfs_rename: returning %d\n", rc)); + return -rc; +} + + +/* + * NAME: jfs_mknod + * + * FUNCTION: Create a special file (device) + */ +int jfs_mknod(struct inode *dir, struct dentry *dentry, int mode, int rdev) +{ + btstack_t btstack; + component_t dname; + ino_t ino; + struct inode *ip; + struct inode *iplist[2]; + int rc; + tid_t tid; + tblock_t *tblk; + + jFYI(1, ("jfs_mknod: %s\n", dentry->d_name.name)); + + if ((rc = get_UCSname(&dname, dentry, JFS_SBI(dir->i_sb)->nls_tab))) + goto out; + + IWRITE_LOCK(dir); + + ip = ialloc(dir, mode); + if (ip == NULL) { + rc = ENOSPC; + goto out1; + } + + tid = txBegin(dir->i_sb, 0); + + if ((rc = dtSearch(dir, &dname, &ino, &btstack, JFS_CREATE))) { + ip->i_nlink = 0; + iput(ip); + txEnd(tid); + goto out1; + } + + tblk = tid_to_tblock(tid); + tblk->xflag |= COMMIT_CREATE; + tblk->ip = ip; + + ino = ip->i_ino; + if ((rc = dtInsert(tid, dir, &dname, &ino, &btstack))) { + ip->i_nlink = 0; + iput(ip); + txEnd(tid); + goto out1; + } + + if (S_ISREG(ip->i_mode)) { + ip->i_op = &jfs_file_inode_operations; + ip->i_fop = &jfs_file_operations; + ip->i_mapping->a_ops = &jfs_aops; + } else + init_special_inode(ip, ip->i_mode, rdev); + + insert_inode_hash(ip); + mark_inode_dirty(ip); + d_instantiate(dentry, ip); + + dir->i_version = ++event; + dir->i_ctime = dir->i_mtime = CURRENT_TIME; + + mark_inode_dirty(dir); + + iplist[0] = dir; + iplist[1] = ip; + rc = txCommit(tid, 2, iplist, 0); + txEnd(tid); + + out1: + IWRITE_UNLOCK(dir); + free_UCSname(&dname); + + out: + jFYI(1, ("jfs_mknod: returning %d\n", rc)); + return -rc; +} + +static struct dentry *jfs_lookup(struct inode *dip, struct dentry *dentry) +{ + btstack_t btstack; + ino_t inum; + struct inode *ip; + component_t key; + const char *name = dentry->d_name.name; + int len = dentry->d_name.len; + int rc; + + jFYI(1, ("jfs_lookup: name = %s\n", name)); + + + if ((name[0] == '.') && (len == 1)) + inum = dip->i_ino; + else if (strcmp(name, "..") == 0) + inum = PARENT(dip); + else { + if ((rc = + get_UCSname(&key, dentry, JFS_SBI(dip->i_sb)->nls_tab))) + return ERR_PTR(-rc); + IREAD_LOCK(dip); + rc = dtSearch(dip, &key, &inum, &btstack, JFS_LOOKUP); + IREAD_UNLOCK(dip); + free_UCSname(&key); + if (rc == ENOENT) { + d_add(dentry, NULL); + return ERR_PTR(0); + } else if (rc) { + jERROR(1, + ("jfs_lookup: dtSearch returned %d\n", rc)); + return ERR_PTR(-rc); + } + } + + ip = iget(dip->i_sb, inum); + if (ip == NULL) { + jERROR(1, + ("jfs_lookup: iget failed on inum %d\n", + (uint) inum)); + return ERR_PTR(-EACCES); + } + + d_add(dentry, ip); + + return ERR_PTR(0); +} + +struct inode_operations jfs_dir_inode_operations = { + create: jfs_create, + lookup: jfs_lookup, + link: jfs_link, + unlink: jfs_unlink, + symlink: jfs_symlink, + mkdir: jfs_mkdir, + rmdir: jfs_rmdir, + mknod: jfs_mknod, + rename: jfs_rename, +}; + +struct file_operations jfs_dir_operations = { + read: generic_read_dir, + readdir: jfs_readdir, + fsync: jfs_fsync, +}; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/super.c linuxppc64_2_4/fs/jfs/super.c --- ../kernel.org/linux-2.4.19/fs/jfs/super.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/super.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,481 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include "jfs_incore.h" +#include "jfs_filsys.h" +#include "jfs_metapage.h" +#include "jfs_superblock.h" +#include "jfs_dmap.h" +#include "jfs_imap.h" +#include "jfs_debug.h" + +MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); +MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); +MODULE_LICENSE("GPL"); + +static int in_shutdown; +static pid_t jfsIOthread; +static pid_t jfsCommitThread; +static pid_t jfsSyncThread; +struct task_struct *jfsIOtask; +struct task_struct *jfsCommitTask; +struct task_struct *jfsSyncTask; +DECLARE_COMPLETION(jfsIOwait); + +#ifdef CONFIG_JFS_DEBUG +int jfsloglevel = 1; +MODULE_PARM(jfsloglevel, "i"); +MODULE_PARM_DESC(jfsloglevel, "Specify JFS loglevel (0, 1 or 2)"); +#endif + +/* + * External declarations + */ +extern int jfs_mount(struct super_block *); +extern int jfs_mount_rw(struct super_block *, int); +extern int jfs_umount(struct super_block *); +extern int jfs_umount_rw(struct super_block *); + +extern int jfsIOWait(void *); +extern int jfs_lazycommit(void *); +extern int jfs_sync(void *); +extern void jfs_put_inode(struct inode *inode); +extern void jfs_read_inode(struct inode *inode); +extern void jfs_dirty_inode(struct inode *inode); +extern void jfs_delete_inode(struct inode *inode); +extern void jfs_write_inode(struct inode *inode, int wait); + +#if defined(CONFIG_JFS_DEBUG) && defined(CONFIG_PROC_FS) +extern void jfs_proc_init(void); +extern void jfs_proc_clean(void); +#endif + +int jfs_thread_stopped(void) +{ + unsigned long signr; + siginfo_t info; + + spin_lock_irq(¤t->sigmask_lock); + signr = dequeue_signal(¤t->blocked, &info); + spin_unlock_irq(¤t->sigmask_lock); + + if (signr == SIGKILL && in_shutdown) + return 1; + return 0; +} + +static int jfs_statfs(struct super_block *sb, struct statfs *buf) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + s64 maxinodes; + imap_t *imap = JFS_IP(sbi->ipimap)->i_imap; + + jFYI(1, ("In jfs_statfs\n")); + buf->f_type = JFS_SUPER_MAGIC; + buf->f_bsize = sbi->bsize; + buf->f_blocks = sbi->bmap->db_mapsize; + buf->f_bfree = sbi->bmap->db_nfree; + buf->f_bavail = sbi->bmap->db_nfree; + /* + * If we really return the number of allocated & free inodes, some + * applications will fail because they won't see enough free inodes. + * We'll try to calculate some guess as to how may inodes we can + * really allocate + * + * buf->f_files = atomic_read(&imap->im_numinos); + * buf->f_ffree = atomic_read(&imap->im_numfree); + */ + maxinodes = min((s64) atomic_read(&imap->im_numinos) + + ((sbi->bmap->db_nfree >> imap->im_l2nbperiext) + << L2INOSPEREXT), (s64)0xffffffffLL); + buf->f_files = maxinodes; + buf->f_ffree = maxinodes - (atomic_read(&imap->im_numinos) - + atomic_read(&imap->im_numfree)); + + buf->f_namelen = JFS_NAME_MAX; + return 0; +} + +static void jfs_put_super(struct super_block *sb) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + int rc; + + jFYI(1, ("In jfs_put_super\n")); + rc = jfs_umount(sb); + if (rc) { + jERROR(1, ("jfs_umount failed with return code %d\n", rc)); + } + unload_nls(sbi->nls_tab); + sbi->nls_tab = NULL; + + /* + * We need to clean out the direct_inode pages since this inode + * is not in the inode hash. + */ + fsync_inode_data_buffers(sbi->direct_inode); + truncate_inode_pages(sbi->direct_mapping, 0); + iput(sbi->direct_inode); + sbi->direct_inode = NULL; + sbi->direct_mapping = NULL; + + JFS_SBI(sb) = 0; + kfree(sbi); +} + +static int parse_options (char * options, struct jfs_sb_info *sbi) +{ + void *nls_map = NULL; + char * this_char; + char * value; + + if (!options) + return 1; + while ((this_char = strsep (&options, ",")) != NULL) { + if (!*this_char) + continue; + if ((value = strchr (this_char, '=')) != NULL) + *value++ = 0; + if (!strcmp (this_char, "iocharset")) { + if (!value || !*value) + goto needs_arg; + if (nls_map) /* specified iocharset twice! */ + unload_nls(nls_map); + nls_map = load_nls(value); + if (!nls_map) { + printk(KERN_ERR "JFS: charset not found\n"); + goto cleanup; + } + /* Silently ignore the quota options */ + } else if (!strcmp (this_char, "grpquota") + || !strcmp (this_char, "noquota") + || !strcmp (this_char, "quota") + || !strcmp (this_char, "usrquota")) + /* Don't do anything ;-) */ ; + else { + printk ("jfs: Unrecognized mount option %s\n", this_char); + goto cleanup; + } + } + if (nls_map) { + /* Discard old (if remount) */ + if (sbi->nls_tab) + unload_nls(sbi->nls_tab); + sbi->nls_tab = nls_map; + } + return 1; +needs_arg: + printk(KERN_ERR "JFS: %s needs an argument\n", this_char); +cleanup: + if (nls_map) + unload_nls(nls_map); + return 0; +} + +int jfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct jfs_sb_info *sbi = JFS_SBI(sb); + + if (!parse_options(data, sbi)) { + return -EINVAL; + } + + if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) { + /* + * Invalidate any previously read metadata. fsck may + * have changed the on-disk data since we mounted r/o + */ + truncate_inode_pages(sbi->direct_mapping, 0); + + return jfs_mount_rw(sb, 1); + } else if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) + return jfs_umount_rw(sb); + + return 0; +} + +static struct super_operations jfs_sops = { + read_inode: jfs_read_inode, + dirty_inode: jfs_dirty_inode, + write_inode: jfs_write_inode, + put_inode: jfs_put_inode, + delete_inode: jfs_delete_inode, + put_super: jfs_put_super, + statfs: jfs_statfs, + remount_fs: jfs_remount, + clear_inode: diClearExtension, +}; + +static struct super_block *jfs_read_super(struct super_block *sb, + void *data, int silent) +{ + struct jfs_sb_info *sbi; + struct inode *inode; + int rc; + + jFYI(1, + ("In jfs_read_super s_dev=0x%x s_flags=0x%lx\n", sb->s_dev, + sb->s_flags)); + + sbi = kmalloc(sizeof(struct jfs_sb_info), GFP_KERNEL); + JFS_SBI(sb) = sbi; + if (!sbi) + return NULL; + memset(sbi, 0, sizeof(struct jfs_sb_info)); + + if (!parse_options((char *)data, sbi)) { + kfree(sbi); + return NULL; + } + + /* + * Initialize blocksize to 4K. + */ + sb->s_blocksize = PSIZE; + sb->s_blocksize_bits = L2PSIZE; + set_blocksize(sb->s_dev, PSIZE); + + /* + * Initialize direct-mapping inode/address-space + */ + inode = new_inode(sb); + if (inode == NULL) + goto out_kfree; + inode->i_ino = 0; + inode->i_nlink = 1; + inode->i_size = 0x0000010000000000LL; + inode->i_mapping->a_ops = &direct_aops; + inode->i_mapping->gfp_mask = GFP_NOFS; + + sbi->direct_inode = inode; + sbi->direct_mapping = inode->i_mapping; + + rc = alloc_jfs_inode(inode); + if (rc) + goto out_free_inode; + + sb->s_op = &jfs_sops; + rc = jfs_mount(sb); + if (rc) { + if (!silent) { + jERROR(1, + ("jfs_mount failed w/return code = %d\n", + rc)); + } + goto out_mount_failed; + } + if (sb->s_flags & MS_RDONLY) + sbi->log = 0; + else { + rc = jfs_mount_rw(sb, 0); + if (rc) { + if (!silent) { + jERROR(1, + ("jfs_mount_rw failed w/return code = %d\n", + rc)); + } + goto out_no_rw; + } + } + + sb->s_magic = JFS_SUPER_MAGIC; + + inode = iget(sb, ROOT_I); + if (!inode || is_bad_inode(inode)) + goto out_no_root; + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) + goto out_no_root; + + if (!sbi->nls_tab) + sbi->nls_tab = load_nls_default(); + + sb->s_maxbytes = ((u64) sb->s_blocksize) << 40; +#if BITS_PER_LONG == 32 + sb->s_maxbytes = min((u64)PAGE_CACHE_SIZE << 32, sb->s_maxbytes); +#endif + + return sb; + +out_no_root: + jEVENT(1, ("jfs_read_super: get root inode failed\n")); + if (inode) + iput(inode); + +out_no_rw: + rc = jfs_umount(sb); + if (rc) { + jERROR(1, ("jfs_umount failed with return code %d\n", rc)); + } +out_mount_failed: + fsync_inode_data_buffers(sbi->direct_inode); + truncate_inode_pages(sbi->direct_mapping, 0); + sb->s_op = NULL; + + free_jfs_inode(inode); + +out_free_inode: + iput(sbi->direct_inode); + sbi->direct_inode = NULL; + sbi->direct_mapping = NULL; +out_kfree: + if (sbi->nls_tab) + unload_nls(sbi->nls_tab); + kfree(sbi); + return NULL; +} + +static DECLARE_FSTYPE_DEV(jfs_fs_type, "jfs", jfs_read_super); + +extern int metapage_init(void); +extern int txInit(void); +extern void txExit(void); +extern void metapage_exit(void); + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct jfs_inode_info *jfs_ip = (struct jfs_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + INIT_LIST_HEAD(&jfs_ip->anon_inode_list); + INIT_LIST_HEAD(&jfs_ip->mp_list); + RDWRLOCK_INIT(&jfs_ip->rdwrlock); + } +} + +static int __init init_jfs_fs(void) +{ + int rc; + + printk("JFS development version: $Name: $\n"); + + jfs_inode_cachep = + kmem_cache_create("jfs_ip", + sizeof(struct jfs_inode_info), + 0, 0, init_once, NULL); + if (jfs_inode_cachep == NULL) + return -ENOMEM; + + /* + * Metapage initialization + */ + rc = metapage_init(); + if (rc) { + jERROR(1, ("metapage_init failed w/rc = %d\n", rc)); + goto free_slab; + } + + /* + * Transaction Manager initialization + */ + rc = txInit(); + if (rc) { + jERROR(1, ("txInit failed w/rc = %d\n", rc)); + goto free_metapage; + } + + /* + * I/O completion thread (endio) + */ + jfsIOthread = kernel_thread(jfsIOWait, 0, + CLONE_FS | CLONE_FILES | + CLONE_SIGHAND); + if (jfsIOthread < 0) { + jERROR(1, + ("init_jfs_fs: fork failed w/rc = %d\n", + jfsIOthread)); + goto end_txmngr; + } + wait_for_completion(&jfsIOwait); /* Wait until IO thread starts */ + + jfsCommitThread = kernel_thread(jfs_lazycommit, 0, + CLONE_FS | CLONE_FILES | + CLONE_SIGHAND); + if (jfsCommitThread < 0) { + jERROR(1, + ("init_jfs_fs: fork failed w/rc = %d\n", + jfsCommitThread)); + goto kill_iotask; + } + wait_for_completion(&jfsIOwait); /* Wait until IO thread starts */ + + jfsSyncThread = kernel_thread(jfs_sync, 0, + CLONE_FS | CLONE_FILES | + CLONE_SIGHAND); + if (jfsSyncThread < 0) { + jERROR(1, + ("init_jfs_fs: fork failed w/rc = %d\n", + jfsSyncThread)); + goto kill_committask; + } + wait_for_completion(&jfsIOwait); /* Wait until IO thread starts */ + +#if defined(CONFIG_JFS_DEBUG) && defined(CONFIG_PROC_FS) + jfs_proc_init(); +#endif + + return register_filesystem(&jfs_fs_type); + + +kill_committask: + send_sig(SIGKILL, jfsCommitTask, 1); + wait_for_completion(&jfsIOwait); /* Wait until Commit thread exits */ +kill_iotask: + send_sig(SIGKILL, jfsIOtask, 1); + wait_for_completion(&jfsIOwait); /* Wait until IO thread exits */ +end_txmngr: + txExit(); +free_metapage: + metapage_exit(); +free_slab: + kmem_cache_destroy(jfs_inode_cachep); + return -rc; +} + +static void __exit exit_jfs_fs(void) +{ + jFYI(1, ("exit_jfs_fs called\n")); + + in_shutdown = 1; + txExit(); + metapage_exit(); + send_sig(SIGKILL, jfsIOtask, 1); + wait_for_completion(&jfsIOwait); /* Wait until IO thread exits */ + send_sig(SIGKILL, jfsCommitTask, 1); + wait_for_completion(&jfsIOwait); /* Wait until Commit thread exits */ + send_sig(SIGKILL, jfsSyncTask, 1); + wait_for_completion(&jfsIOwait); /* Wait until Sync thread exits */ +#if defined(CONFIG_JFS_DEBUG) && defined(CONFIG_PROC_FS) + jfs_proc_clean(); +#endif + unregister_filesystem(&jfs_fs_type); + kmem_cache_destroy(jfs_inode_cachep); +} + + +EXPORT_NO_SYMBOLS; + +module_init(init_jfs_fs) +module_exit(exit_jfs_fs) diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/jfs/symlink.c linuxppc64_2_4/fs/jfs/symlink.c --- ../kernel.org/linux-2.4.19/fs/jfs/symlink.c Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/fs/jfs/symlink.c Tue Apr 23 11:14:25 2002 @@ -0,0 +1,47 @@ + +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * JFS fast symlink handling code + */ + +#include +#include "jfs_incore.h" + +static int jfs_readlink(struct dentry *, char *buffer, int buflen); +static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd); + +/* + * symlinks can't do much... + */ +struct inode_operations jfs_symlink_inode_operations = { + readlink: jfs_readlink, + follow_link: jfs_follow_link, +}; + +static int jfs_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *s = JFS_IP(dentry->d_inode)->i_inline; + return vfs_follow_link(nd, s); +} + +static int jfs_readlink(struct dentry *dentry, char *buffer, int buflen) +{ + char *s = JFS_IP(dentry->d_inode)->i_inline; + return vfs_readlink(dentry, buffer, buflen, s); +} diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/nfsd/nfsctl.c linuxppc64_2_4/fs/nfsd/nfsctl.c --- ../kernel.org/linux-2.4.19/fs/nfsd/nfsctl.c Mon Apr 22 11:34:26 2002 +++ linuxppc64_2_4/fs/nfsd/nfsctl.c Tue Apr 23 09:37:30 2002 @@ -9,6 +9,7 @@ #include #include #include +#include #include #include diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/nls/Config.in linuxppc64_2_4/fs/nls/Config.in --- ../kernel.org/linux-2.4.19/fs/nls/Config.in Fri Apr 19 10:30:00 2002 +++ linuxppc64_2_4/fs/nls/Config.in Thu Feb 21 20:57:41 2002 @@ -12,7 +12,7 @@ # msdos and Joliet want NLS if [ "$CONFIG_JOLIET" = "y" -o "$CONFIG_FAT_FS" != "n" \ -o "$CONFIG_NTFS_FS" != "n" -o "$CONFIG_NCPFS_NLS" = "y" \ - -o "$CONFIG_SMB_NLS" = "y" ]; then + -o "$CONFIG_SMB_NLS" = "y" -o "$CONFIG_JFS_FS" != "n" ]; then define_bool CONFIG_NLS y else define_bool CONFIG_NLS n diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/proc/proc_misc.c linuxppc64_2_4/fs/proc/proc_misc.c --- ../kernel.org/linux-2.4.19/fs/proc/proc_misc.c Fri Apr 19 11:00:46 2002 +++ linuxppc64_2_4/fs/proc/proc_misc.c Mon Apr 22 10:35:10 2002 @@ -348,13 +348,46 @@ return len; } -#if !defined(CONFIG_ARCH_S390) +#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_X86) && !defined(CONFIG_PPC64) static int interrupts_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) { int len = get_irq_list(page); return proc_calc_metrics(page, start, off, count, eof, len); } +#else +extern int show_interrupts(struct seq_file *p, void *v); +static struct seq_operations proc_interrupts_op = { + show: show_interrupts, +}; +static int interrupts_open(struct inode *inode, struct file *file) +{ + unsigned size = PAGE_SIZE; + /* + * probably should depend on NR_CPUS, but that's only rough estimate; + * if we'll need more it will be given, + */ + char *buf = kmalloc(size, GFP_KERNEL); + struct seq_file *m; + int res; + + if (!buf) + return -ENOMEM; + res = seq_open(file, &proc_interrupts_op); + if (!res) { + m = file->private_data; + m->buf = buf; + m->size = size; + } else + kfree(buf); + return res; +} +static struct file_operations proc_interrupts_operations = { + open: interrupts_open, + read: seq_read_single, + llseek: seq_lseek, + release: seq_release, +}; #endif static int filesystems_read_proc(char *page, char **start, off_t off, @@ -526,7 +559,7 @@ {"stat", kstat_read_proc}, {"devices", devices_read_proc}, {"partitions", partitions_read_proc}, -#if !defined(CONFIG_ARCH_S390) +#if !defined(CONFIG_ARCH_S390) && !defined(CONFIG_X86) && !defined(CONFIG_PPC64) {"interrupts", interrupts_read_proc}, #endif {"filesystems", filesystems_read_proc}, @@ -552,6 +585,11 @@ if (entry) entry->proc_fops = &proc_kmsg_operations; create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); +#if defined(CONFIG_ARCH_S390) || defined(CONFIG_X86) || defined(CONFIG_PPC64) + entry = create_proc_entry("interrupts", 0, NULL); + if (entry) + entry->proc_fops = &proc_interrupts_operations; +#endif #ifdef CONFIG_MODULES create_seq_entry("ksyms", 0, &proc_ksyms_operations); #endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/proc/root.c linuxppc64_2_4/fs/proc/root.c --- ../kernel.org/linux-2.4.19/fs/proc/root.c Fri Apr 19 11:00:38 2002 +++ linuxppc64_2_4/fs/proc/root.c Tue Aug 14 14:55:53 2001 @@ -16,6 +16,7 @@ #include #include #include +#include struct proc_dir_entry *proc_net, *proc_bus, *proc_root_fs, *proc_root_driver; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/fs/seq_file.c linuxppc64_2_4/fs/seq_file.c --- ../kernel.org/linux-2.4.19/fs/seq_file.c Fri Apr 19 10:30:02 2002 +++ linuxppc64_2_4/fs/seq_file.c Thu Feb 21 21:43:40 2002 @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -187,6 +188,62 @@ kfree(m->buf); m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); return !m->buf ? -ENOMEM : -EAGAIN; +} + +ssize_t seq_read_single(struct file *file, char *buf, size_t size, loff_t *ppos) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + size_t copied = 0; + size_t n; + int err = 0; + + if (ppos != &file->f_pos) + return -EPIPE; + + down(&m->sem); + if (*ppos != 0) + return 0; + if (!m->count) { + if (!m->buf) { + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); + if (!m->buf) + goto Enomem; + } + m->from = 0; + while (1) { + err = m->op->show(m, NULL); + if (err) + goto Done; + if (m->count < m->size) + break; + kfree(m->buf); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); + if (!m->buf) + goto Enomem; + } + } + n = min(m->count, size); + err = copy_to_user(buf, m->buf + m->from, n); + if (err) + goto Efault; + size -= n; + buf += n; + m->count -= n; + m->from += n; + copied += n; + if (!m->count) + *ppos = 1; +Done: + if (!copied) + copied = err; + up(&m->sem); + return copied; +Enomem: + err = -ENOMEM; + goto Done; +Efault: + err = -EFAULT; + goto Done; } /** diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/fcntl.h linuxppc64_2_4/include/asm-ppc64/fcntl.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/fcntl.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/fcntl.h Fri May 10 23:42:16 2002 @@ -26,7 +26,7 @@ #define O_DIRECTORY 040000 /* must be a directory */ #define O_NOFOLLOW 0100000 /* don't follow links */ #define O_LARGEFILE 0200000 -#define O_DIRECT 0400000 /* direct disk access hint - currently ignored */ +#define O_DIRECT 0400000 /* direct disk access hint */ #define F_DUPFD 0 /* dup */ #define F_GETFD 1 /* get close_on_exec */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/io.h linuxppc64_2_4/include/asm-ppc64/io.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/io.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/io.h Sat May 11 00:46:04 2002 @@ -134,7 +134,7 @@ * Change virtual addresses to physical addresses and vv, for * addresses in the area where the kernel has the RAM mapped. */ -extern inline unsigned long virt_to_phys(volatile void * address) +static inline unsigned long virt_to_phys(volatile void * address) { #ifdef __IO_DEBUG printk("virt_to_phys: 0x%08lx -> 0x%08lx\n", @@ -144,7 +144,7 @@ return __pa((unsigned long)address); } -extern inline void * phys_to_virt(unsigned long address) +static inline void * phys_to_virt(unsigned long address) { #ifdef __IO_DEBUG printk("phys_to_virt: 0x%08lx -> 0x%08lx\n", address, __va(address)); @@ -154,7 +154,7 @@ #endif /* __KERNEL__ */ -extern inline void iosync(void) +static inline void iosync(void) { __asm__ __volatile__ ("sync" : : : "memory"); } @@ -172,7 +172,7 @@ * excess of syncs before the MMIO operations will make things work. On * sstar, sync time is << than mmio time, so this should not be a big impact. */ -extern inline int in_8(volatile unsigned char *addr) +static inline int in_8(volatile unsigned char *addr) { int ret; @@ -180,12 +180,12 @@ return ret; } -extern inline void out_8(volatile unsigned char *addr, int val) +static inline void out_8(volatile unsigned char *addr, int val) { __asm__ __volatile__("sync; stb%U0%X0 %1,%0; sync" : "=m" (*addr) : "r" (val)); } -extern inline int in_le16(volatile unsigned short *addr) +static inline int in_le16(volatile unsigned short *addr) { int ret; @@ -194,7 +194,7 @@ return ret; } -extern inline int in_be16(volatile unsigned short *addr) +static inline int in_be16(volatile unsigned short *addr) { int ret; @@ -202,18 +202,18 @@ return ret; } -extern inline void out_le16(volatile unsigned short *addr, int val) +static inline void out_le16(volatile unsigned short *addr, int val) { __asm__ __volatile__("sync; sthbrx %1,0,%2; sync" : "=m" (*addr) : "r" (val), "r" (addr)); } -extern inline void out_be16(volatile unsigned short *addr, int val) +static inline void out_be16(volatile unsigned short *addr, int val) { __asm__ __volatile__("sync; sth%U0%X0 %1,%0; sync" : "=m" (*addr) : "r" (val)); } -extern inline unsigned in_le32(volatile unsigned *addr) +static inline unsigned in_le32(volatile unsigned *addr) { unsigned ret; @@ -222,7 +222,7 @@ return ret; } -extern inline unsigned in_be32(volatile unsigned *addr) +static inline unsigned in_be32(volatile unsigned *addr) { unsigned ret; @@ -230,13 +230,13 @@ return ret; } -extern inline void out_le32(volatile unsigned *addr, int val) +static inline void out_le32(volatile unsigned *addr, int val) { __asm__ __volatile__("sync; stwbrx %1,0,%2; sync" : "=m" (*addr) : "r" (val), "r" (addr)); } -extern inline void out_be32(volatile unsigned *addr, int val) +static inline void out_be32(volatile unsigned *addr, int val) { __asm__ __volatile__("sync; stw%U0%X0 %1,%0; sync" : "=m" (*addr) : "r" (val)); } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/machdep.h linuxppc64_2_4/include/asm-ppc64/machdep.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/machdep.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/machdep.h Thu Apr 25 20:45:03 2002 @@ -69,7 +69,6 @@ void (*init_IRQ)(void); void (*init_ras_IRQ)(void); int (*get_irq)(struct pt_regs *); - void (*post_irq)( struct pt_regs *, int ); /* A general init function, called by ppc_init in init/main.c. May be NULL. */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/mmu.h linuxppc64_2_4/include/asm-ppc64/mmu.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/mmu.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/mmu.h Sat May 11 00:46:04 2002 @@ -222,7 +222,7 @@ #define PG_SHIFT (12) /* Page Entry */ -extern __inline__ void _tlbie( unsigned long va ) +static inline void _tlbie( unsigned long va ) { __asm__ __volatile__ ( " \n\ clrldi %0,%0,16 \n\ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/page.h linuxppc64_2_4/include/asm-ppc64/page.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/page.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/page.h Sat May 11 00:46:04 2002 @@ -132,15 +132,8 @@ #define PAGE_BUG(page) do { BUG(); } while (0) -/* - * XXX A bug in the current ppc64 compiler prevents an optimisation - * where a divide is replaced by a multiply by shifted inverse. For - * the moment use page->virtaul - */ -#define WANT_PAGE_VIRTUAL 1 - /* Pure 2^n version of get_order */ -extern __inline__ int get_order(unsigned long size) +static inline int get_order(unsigned long size) { int order; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/pci.h linuxppc64_2_4/include/asm-ppc64/pci.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/pci.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/pci.h Sat May 11 00:46:04 2002 @@ -25,12 +25,12 @@ #define PCIBIOS_MIN_IO 0x1000 #define PCIBIOS_MIN_MEM 0x10000000 -extern inline void pcibios_set_master(struct pci_dev *dev) +static inline void pcibios_set_master(struct pci_dev *dev) { /* No special bus mastering setup handling */ } -extern inline void pcibios_penalize_isa_irq(int irq) +static inline void pcibios_penalize_isa_irq(int irq) { /* We don't do dynamic PCI IRQ allocation */ } @@ -78,7 +78,7 @@ extern void pSeries_pcibios_init_early(void); -extern inline void pci_dma_sync_single(struct pci_dev *hwdev, +static inline void pci_dma_sync_single(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, int direction) { @@ -87,7 +87,7 @@ /* nothing to do */ } -extern inline void pci_dma_sync_sg(struct pci_dev *hwdev, +static inline void pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nelems, int direction) { @@ -101,7 +101,7 @@ * only drive the low 24-bits during PCI bus mastering, then * you would pass 0x00ffffff as the mask to this function. */ -extern inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) +static inline int pci_dma_supported(struct pci_dev *hwdev, u64 mask) { return 1; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/pgtable.h linuxppc64_2_4/include/asm-ppc64/pgtable.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/pgtable.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/pgtable.h Sat May 11 00:46:04 2002 @@ -232,47 +232,41 @@ /* to find an entry in the ioremap page-table-directory */ #define pgd_offset_i(address) (ioremap_pgd + pgd_index(address)) -/* - * Given a pointer to an mem_map[] entry, return the kernel virtual - * address corresponding to that page. - */ -#define page_address(page) ((page)->virtual) - #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) /* * The following only work if pte_present() is true. * Undefined behaviour if not.. */ -extern inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_USER;} -extern inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW;} -extern inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC;} -extern inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;} -extern inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;} +static inline int pte_read(pte_t pte) { return pte_val(pte) & _PAGE_USER;} +static inline int pte_write(pte_t pte) { return pte_val(pte) & _PAGE_RW;} +static inline int pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC;} +static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;} +static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;} -extern inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; } -extern inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; } +static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; } +static inline void pte_cache(pte_t pte) { pte_val(pte) &= ~_PAGE_NO_CACHE; } -extern inline pte_t pte_rdprotect(pte_t pte) { +static inline pte_t pte_rdprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_USER; return pte; } -extern inline pte_t pte_exprotect(pte_t pte) { +static inline pte_t pte_exprotect(pte_t pte) { pte_val(pte) &= ~_PAGE_EXEC; return pte; } -extern inline pte_t pte_wrprotect(pte_t pte) { +static inline pte_t pte_wrprotect(pte_t pte) { pte_val(pte) &= ~(_PAGE_RW); return pte; } -extern inline pte_t pte_mkclean(pte_t pte) { +static inline pte_t pte_mkclean(pte_t pte) { pte_val(pte) &= ~(_PAGE_DIRTY); return pte; } -extern inline pte_t pte_mkold(pte_t pte) { +static inline pte_t pte_mkold(pte_t pte) { pte_val(pte) &= ~_PAGE_ACCESSED; return pte; } -extern inline pte_t pte_mkread(pte_t pte) { +static inline pte_t pte_mkread(pte_t pte) { pte_val(pte) |= _PAGE_USER; return pte; } -extern inline pte_t pte_mkexec(pte_t pte) { +static inline pte_t pte_mkexec(pte_t pte) { pte_val(pte) |= _PAGE_USER | _PAGE_EXEC; return pte; } -extern inline pte_t pte_mkwrite(pte_t pte) { +static inline pte_t pte_mkwrite(pte_t pte) { pte_val(pte) |= _PAGE_RW; return pte; } -extern inline pte_t pte_mkdirty(pte_t pte) { +static inline pte_t pte_mkdirty(pte_t pte) { pte_val(pte) |= _PAGE_DIRTY; return pte; } -extern inline pte_t pte_mkyoung(pte_t pte) { +static inline pte_t pte_mkyoung(pte_t pte) { pte_val(pte) |= _PAGE_ACCESSED; return pte; } /* Atomic PTE updates */ @@ -349,8 +343,8 @@ #define flush_tlb_page local_flush_tlb_page #define flush_tlb_range local_flush_tlb_range -extern inline void flush_tlb_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) +static inline void flush_tlb_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) { /* PPC has hw page tables. */ } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/processor.h linuxppc64_2_4/include/asm-ppc64/processor.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/processor.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/processor.h Sat May 11 00:46:04 2002 @@ -726,12 +726,12 @@ #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH -extern inline void prefetch(const void *x) +static inline void prefetch(const void *x) { __asm__ __volatile__ ("dcbt 0,%0" : : "r" (x)); } -extern inline void prefetchw(const void *x) +static inline void prefetchw(const void *x) { __asm__ __volatile__ ("dcbtst 0,%0" : : "r" (x)); } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/rtas.h linuxppc64_2_4/include/asm-ppc64/rtas.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/rtas.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/rtas.h Mon May 6 14:29:20 2002 @@ -2,6 +2,7 @@ #define _PPC64_RTAS_H #include +#include /* * Definitions for talking to the RTAS on CHRP machines. @@ -128,6 +129,29 @@ unsigned char buffer[1]; /* allocated by klimit bump */ }; +struct flash_block { + char *data; + unsigned long length; +}; + +/* This struct is very similar but not identical to + * that needed by the rtas flash update. + * All we need to do for rtas is rewrite num_blocks + * into a version/length and translate the pointers + * to absolute. + */ +#define FLASH_BLOCKS_PER_NODE ((PAGE_SIZE - 16) / sizeof(struct flash_block)) +struct flash_block_list { + unsigned long num_blocks; + struct flash_block_list *next; + struct flash_block blocks[FLASH_BLOCKS_PER_NODE]; +}; +struct flash_block_list_header { /* just the header of flash_block_list */ + unsigned long num_blocks; + struct flash_block_list *next; +}; +extern struct flash_block_list_header rtas_firmware_flash_list; + extern struct rtas_t rtas; extern void enter_rtas(struct rtas_args *); @@ -139,5 +163,8 @@ extern void rtas_restart(char *cmd); extern void rtas_power_off(void); extern void rtas_halt(void); + +extern struct proc_dir_entry *rtas_proc_dir; + #endif /* _PPC64_RTAS_H */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/semaphore.h linuxppc64_2_4/include/asm-ppc64/semaphore.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/semaphore.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/asm-ppc64/semaphore.h Sat May 11 00:46:04 2002 @@ -2,12 +2,6 @@ #define _PPC64_SEMAPHORE_H /* - * Swiped from asm-sparc/semaphore.h and modified - * -- Cort (cort@cs.nmt.edu) - * - * Stole some rw spinlock-based semaphore stuff from asm-alpha/semaphore.h - * -- Ani Joshi (ajoshi@unixbox.com) - * * Remove spinlock-based RW semaphores; RW semaphore definitions are * now in rwsem.h and we use the the generic lib/rwsem.c implementation. * Rework semaphores to use atomic_dec_if_positive. @@ -78,7 +72,7 @@ extern int __down_interruptible(struct semaphore * sem); extern void __up(struct semaphore * sem); -extern inline void down(struct semaphore * sem) +static inline void down(struct semaphore * sem) { #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); @@ -92,7 +86,7 @@ smp_wmb(); } -extern inline int down_interruptible(struct semaphore * sem) +static inline int down_interruptible(struct semaphore * sem) { int ret = 0; @@ -106,7 +100,7 @@ return ret; } -extern inline int down_trylock(struct semaphore * sem) +static inline int down_trylock(struct semaphore * sem) { int ret; @@ -119,7 +113,7 @@ return ret; } -extern inline void up(struct semaphore * sem) +static inline void up(struct semaphore * sem) { #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/siginfo.h linuxppc64_2_4/include/asm-ppc64/siginfo.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/siginfo.h Wed May 15 08:29:43 2002 +++ linuxppc64_2_4/include/asm-ppc64/siginfo.h Sat May 11 00:46:04 2002 @@ -224,7 +224,7 @@ #ifdef __KERNEL__ #include -extern inline void copy_siginfo(siginfo_t *to, siginfo_t *from) +static inline void copy_siginfo(siginfo_t *to, siginfo_t *from) { if (from->si_code < 0) memcpy(to, from, sizeof(siginfo_t)); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/smplock.h linuxppc64_2_4/include/asm-ppc64/smplock.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/smplock.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/asm-ppc64/smplock.h Sat May 11 00:46:04 2002 @@ -43,13 +43,13 @@ * so we only need to worry about other * CPU's. */ -extern __inline__ void lock_kernel(void) +static inline void lock_kernel(void) { if (!++current->lock_depth) spin_lock(&kernel_flag); } -extern __inline__ void unlock_kernel(void) +static inline void unlock_kernel(void) { if (--current->lock_depth < 0) spin_unlock(&kernel_flag); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/time.h linuxppc64_2_4/include/asm-ppc64/time.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/time.h Wed May 15 08:29:44 2002 +++ linuxppc64_2_4/include/asm-ppc64/time.h Sat May 11 00:46:04 2002 @@ -73,20 +73,22 @@ static __inline__ void set_dec(int val) { +#ifdef CONFIG_PPC_ISERIES struct paca_struct *lpaca = get_paca(); int cur_dec; - if ( lpaca->xLpPaca.xSharedProc ) { + if (lpaca->xLpPaca.xSharedProc) { lpaca->xLpPaca.xVirtualDecr = val; cur_dec = get_dec(); - if ( cur_dec > val ) + if (cur_dec > val) HvCall_setVirtualDecr(); - } else { + } else +#endif mtspr(SPRN_DEC, val); - } } -extern __inline__ unsigned long tb_ticks_since(unsigned long tstamp) { +static inline unsigned long tb_ticks_since(unsigned long tstamp) +{ return get_tb() - tstamp; } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/uaccess.h linuxppc64_2_4/include/asm-ppc64/uaccess.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/uaccess.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/asm-ppc64/uaccess.h Sat May 11 00:46:04 2002 @@ -38,7 +38,7 @@ #define __access_ok(addr,size) (__kernel_ok || __user_ok((addr),(size))) #define access_ok(type,addr,size) __access_ok((unsigned long)(addr),(size)) -extern inline int verify_area(int type, const void * addr, unsigned long size) +static inline int verify_area(int type, const void * addr, unsigned long size) { return access_ok(type,addr,size) ? 0 : -EFAULT; } @@ -200,7 +200,7 @@ extern unsigned long __copy_tofrom_user(void *to, const void *from, unsigned long size); -extern inline unsigned long +static inline unsigned long copy_from_user(void *to, const void *from, unsigned long n) { unsigned long over; @@ -214,7 +214,7 @@ return n; } -extern inline unsigned long +static inline unsigned long copy_to_user(void *to, const void *from, unsigned long n) { unsigned long over; @@ -235,7 +235,7 @@ extern unsigned long __clear_user(void *addr, unsigned long size); -extern inline unsigned long +static inline unsigned long clear_user(void *addr, unsigned long size) { if (access_ok(VERIFY_WRITE, addr, size)) @@ -245,7 +245,7 @@ extern int __strncpy_from_user(char *dst, const char *src, long count); -extern inline long +static inline long strncpy_from_user(char *dst, const char *src, long count) { if (access_ok(VERIFY_READ, src, 1)) @@ -269,7 +269,7 @@ * The `top' parameter to __strnlen_user is to make sure that * we can never overflow from the user area into kernel space. */ -extern __inline__ int strnlen_user(const char *str, long len) +static inline int strnlen_user(const char *str, long len) { unsigned long top = __kernel_ok? ~0UL: TASK_SIZE - 1; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/asm-ppc64/vga.h linuxppc64_2_4/include/asm-ppc64/vga.h --- ../kernel.org/linux-2.4.19/include/asm-ppc64/vga.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/asm-ppc64/vga.h Sat May 11 00:46:04 2002 @@ -26,12 +26,12 @@ * has already done the right job for us. */ -extern inline void scr_writew(u16 val, volatile u16 *addr) +static inline void scr_writew(u16 val, volatile u16 *addr) { st_le16(addr, val); } -extern inline u16 scr_readw(volatile const u16 *addr) +static inline u16 scr_readw(volatile const u16 *addr) { return ld_le16(addr); } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/blk.h linuxppc64_2_4/include/linux/blk.h --- ../kernel.org/linux-2.4.19/include/linux/blk.h Fri Apr 19 11:00:46 2002 +++ linuxppc64_2_4/include/linux/blk.h Mon Apr 22 10:35:35 2002 @@ -313,6 +313,21 @@ #define DEVICE_REQUEST i2ob_request #define DEVICE_NR(device) (MINOR(device)>>4) +#elif (MAJOR_NR == VIODASD_MAJOR) + +#define DEVICE_NAME "viod" +#define TIMEOUT_VALUE (25*HZ) +#define DEVICE_REQUEST do_viodasd_request +#define DEVICE_NR(device) (MINOR(device) >> 3) + +#elif (MAJOR_NR == VIOCD_MAJOR) + +#define DEVICE_NAME "viocd" +#define TIMEOUT_VALUE (25*HZ) +#define DEVICE_REQUEST do_viocd_request +#define DEVICE_NR(device) (MINOR(device)) +#define DEVICE_ON(device) +#define DEVICE_OFF(device) #elif (MAJOR_NR == COMPAQ_SMART2_MAJOR) #define DEVICE_NAME "ida" diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/jfs_fs.h linuxppc64_2_4/include/linux/jfs_fs.h --- ../kernel.org/linux-2.4.19/include/linux/jfs_fs.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/include/linux/jfs_fs.h Wed Nov 14 10:19:36 2001 @@ -0,0 +1,34 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#ifndef _LINUX_JFS_FS_H +#define _LINUX_JFS_FS_H + +#include + +#include +#include +#include + + +/* JFS magic number */ + +#define JFS_SUPER_MAGIC 0x3153464a /* "JFS1" */ + +#endif /* _LINUX_JFS_FS_H */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/jfs_fs_i.h linuxppc64_2_4/include/linux/jfs_fs_i.h --- ../kernel.org/linux-2.4.19/include/linux/jfs_fs_i.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/include/linux/jfs_fs_i.h Wed Nov 14 10:19:36 2001 @@ -0,0 +1,80 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#ifndef _JFS_FS_I +#define _JFS_FS_I + +#include +#include + +typedef struct jfs_rwlock { + struct rw_semaphore rw_sem; + atomic_t in_use; /* for hacked implementation of trylock */ +} jfs_rwlock_t; + +#define JFS_IP(ip) ((struct jfs_inode_info *)(ip)->u.generic_ip) + +struct jfs_inode_info { + int fileset; /* 4: fileset number (always 16)*/ + uint mode2; /* 4: jfs-specific mode */ + pxd_t ixpxd; /* 8: inode extent descriptor */ + dxd_t acl; /* 16: dxd describing acl */ + dxd_t ea; /* 16: dxd describing ea */ + time_t otime; /* 4: time created */ + uint next_index; /* 4: next available directory entry index */ + int acltype; /* 4: Type of ACL */ + short btorder; /* 2: access order */ + short btindex; /* 2: btpage entry index*/ + struct inode *ipimap; /* 4: inode map */ + ushort flag; /* 2: JFS in-memory flag*/ + unchar cflag; /* 1: commit flags */ + unchar agno; /* 1: ag number */ + ushort bxflag; /* 2: xflag of pseudo buffer? */ + short blid; /* 2: lid of pseudo buffer? */ + ushort atlhead; /* 2: anonymous tlock list head */ + ushort atltail; /* 2: anonymous tlock list tail */ + struct inode *atlnext; /* 4: next inode w/anonymous txn's */ + struct inode *atlprev; /* 4: previous inode w/anonymous txn's */ + struct page *extent_page; /* 4: page containing extent */ + jfs_rwlock_t rdwrlock; /* 12/20: read/write lock */ + ushort xtlid; /* 2: lid of xtree lock on directory */ + short pad; /* 2: pad */ + union { + struct { + xtpage_t _xtroot; /* 288: xtree root */ + struct inomap *_imap; /* 4: inode map header */ + } file; + struct { + dir_table_slot_t _table[12]; /* 96: directory index */ + dtroot_t _dtroot; /* 288: dtree root */ + } dir; + struct { + unchar _unused[16]; /* 16: */ + dxd_t _dxd; /* 16: */ + unchar _inline[128]; /* 128: inline symlink */ + } link; + } u; +}; +#define i_xtroot u.file._xtroot +#define i_imap u.file._imap +#define i_dirtable u.dir._table +#define i_dtroot u.dir._dtroot +#define i_inline u.link._inline + +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/jfs_fs_sb.h linuxppc64_2_4/include/linux/jfs_fs_sb.h --- ../kernel.org/linux-2.4.19/include/linux/jfs_fs_sb.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/include/linux/jfs_fs_sb.h Wed Nov 14 10:19:36 2001 @@ -0,0 +1,53 @@ +/* + * + * Copyright (c) International Business Machines Corp., 2000 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * +*/ +#ifndef _JFS_FS_SB +#define _JFS_FS_SB + +#define JFS_SBI(sb) ((struct jfs_sb_info *)(sb)->u.generic_sbp) + +struct jfs_sb_info { + unsigned long mntflag; /* 4: aggregate attributes */ + struct inode *ipbmap; /* 4: block map inode */ + struct inode *ipaimap; /* 4: aggregate inode map inode */ + struct inode *ipaimap2; /* 4: secondary aimap inode */ + struct inode *ipimap; /* 4: aggregate inode map inode */ + struct jfs_log *log; /* 4: log */ + short bsize; /* 2: logical block size */ + short l2bsize; /* 2: log2 logical block size */ + short nbperpage; /* 2: blocks per page */ + short l2nbperpage; /* 2: log2 blocks per page */ + short l2niperblk; /* 2: log2 inodes per page */ + short reserved; /* 2: log2 inodes per page */ + pxd_t logpxd; /* 8: pxd describing log */ + pxd_t ait2; /* 8: pxd describing AIT copy */ + /* Formerly in ipimap */ + uint gengen; /* 4: inode generation generator*/ + uint inostamp; /* 4: shows inode belongs to fileset*/ + + /* Formerly in ipbmap */ + struct bmap *bmap; /* 4: incore bmap descriptor */ + struct nls_table *nls_tab; /* 4: current codepage */ + struct inode *direct_inode; /* 4: inode for physical I/O */ + struct address_space *direct_mapping; /* 4: mapping for physical I/O */ +}; /* (72) */ + +#define isReadOnly(ip) ((JFS_SBI((ip)->i_sb)->log) ? 0 : 1) + +#endif diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/major.h linuxppc64_2_4/include/linux/major.h --- ../kernel.org/linux-2.4.19/include/linux/major.h Wed May 15 08:29:44 2002 +++ linuxppc64_2_4/include/linux/major.h Mon May 13 16:41:10 2002 @@ -117,6 +117,9 @@ #define COMPAQ_CISS_MAJOR6 110 #define COMPAQ_CISS_MAJOR7 111 +#define VIODASD_MAJOR 112 +#define VIOCD_MAJOR 113 + #define ATARAID_MAJOR 114 #define DASD_MAJOR 94 /* Official assignations from Peter */ diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/pci.h linuxppc64_2_4/include/linux/pci.h --- ../kernel.org/linux-2.4.19/include/linux/pci.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/linux/pci.h Mon Apr 22 10:35:36 2002 @@ -416,10 +416,10 @@ void *sysdata; /* hook for sys-specific extension */ struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */ - unsigned char number; /* bus number */ - unsigned char primary; /* number of primary bridge */ - unsigned char secondary; /* number of secondary bridge */ - unsigned char subordinate; /* max number of subordinate buses */ + unsigned int number; /* bus number */ + unsigned int primary; /* number of primary bridge */ + unsigned int secondary; /* number of secondary bridge */ + unsigned int subordinate; /* max number of subordinate buses */ char name[48]; unsigned short vendor; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/proc_fs.h linuxppc64_2_4/include/linux/proc_fs.h --- ../kernel.org/linux-2.4.19/include/linux/proc_fs.h Fri Apr 19 11:00:39 2002 +++ linuxppc64_2_4/include/linux/proc_fs.h Mon Apr 22 10:35:36 2002 @@ -25,7 +25,11 @@ /* Finally, the dynamically allocatable proc entries are reserved: */ #define PROC_DYNAMIC_FIRST 4096 +#ifdef CONFIG_PPC64 +#define PROC_NDYNAMIC 16384 +#else #define PROC_NDYNAMIC 4096 +#endif #define PROC_SUPER_MAGIC 0x9fa0 diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/seq_file.h linuxppc64_2_4/include/linux/seq_file.h --- ../kernel.org/linux-2.4.19/include/linux/seq_file.h Fri Apr 19 11:00:24 2002 +++ linuxppc64_2_4/include/linux/seq_file.h Mon Apr 22 10:35:36 2002 @@ -2,6 +2,10 @@ #define _LINUX_SEQ_FILE_H #ifdef __KERNEL__ +#include +#include +#include + struct seq_operations; struct seq_file { @@ -24,6 +28,7 @@ int seq_open(struct file *, struct seq_operations *); ssize_t seq_read(struct file *, char *, size_t, loff_t *); +ssize_t seq_read_single(struct file *, char *, size_t, loff_t *); loff_t seq_lseek(struct file *, loff_t, int); int seq_release(struct inode *, struct file *); int seq_escape(struct seq_file *, const char *, const char *); diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/include/linux/vethdevice.h linuxppc64_2_4/include/linux/vethdevice.h --- ../kernel.org/linux-2.4.19/include/linux/vethdevice.h Wed Dec 31 18:00:00 1969 +++ linuxppc64_2_4/include/linux/vethdevice.h Tue Jun 19 11:06:08 2001 @@ -0,0 +1,16 @@ +/* File vethdevice.h created by Kyle A. Lucke on Wed Aug 9 2000. */ + +/* Change Activity: */ +/* End Change Activity */ + +#ifndef _LINUX_VETHDEVICE_H +#define _LINUX_VETHDEVICE_H + +#include + +#ifdef __KERNEL__ +extern struct net_device * init_vethdev(struct net_device *, int, int); +#endif + +#endif /* _LINUX_VETHDEVICE_H */ + diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/init/do_mounts.c linuxppc64_2_4/init/do_mounts.c --- ../kernel.org/linux-2.4.19/init/do_mounts.c Mon Apr 22 11:34:27 2002 +++ linuxppc64_2_4/init/do_mounts.c Tue Apr 23 09:37:32 2002 @@ -198,6 +198,7 @@ { "cciss/c0d13p",0x68D0 }, { "cciss/c0d14p",0x68E0 }, { "cciss/c0d15p",0x68F0 }, + #endif { "ataraid/d0p",0x7200 }, { "ataraid/d1p",0x7210 }, diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/init/main.c linuxppc64_2_4/init/main.c --- ../kernel.org/linux-2.4.19/init/main.c Mon Apr 22 11:34:27 2002 +++ linuxppc64_2_4/init/main.c Tue Apr 23 09:37:32 2002 @@ -157,7 +157,7 @@ better than 1% */ #define LPS_PREC 8 -void __init calibrate_delay(void) +void __init do_calibrate_delay(void) { unsigned long ticks, loopbit; int lps_precision = LPS_PREC; @@ -197,6 +197,8 @@ loops_per_jiffy/(500000/HZ), (loops_per_jiffy/(5000/HZ)) % 100); } + +void (*calibrate_delay)(void) = do_calibrate_delay; static int __init debug_kernel(char *str) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/kernel/ptrace.c linuxppc64_2_4/kernel/ptrace.c --- ../kernel.org/linux-2.4.19/kernel/ptrace.c Fri Apr 19 11:00:15 2002 +++ linuxppc64_2_4/kernel/ptrace.c Thu Feb 21 21:00:33 2002 @@ -121,10 +121,114 @@ } /* - * Access another process' address space. - * Source/target buffer must be kernel space, - * Do not walk the page table directly, use get_user_pages + * Access another process' address space, one page at a time. */ +static int access_one_page(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + pgd_t * pgdir; + pmd_t * pgmiddle; + pte_t * pgtable; + char *maddr; + struct page *page; + +repeat: + spin_lock(&mm->page_table_lock); + pgdir = pgd_offset(vma->vm_mm, addr); + if (pgd_none(*pgdir)) + goto fault_in_page; + if (pgd_bad(*pgdir)) + goto bad_pgd; + pgmiddle = pmd_offset(pgdir, addr); + if (pmd_none(*pgmiddle)) + goto fault_in_page; + if (pmd_bad(*pgmiddle)) + goto bad_pmd; + pgtable = pte_offset(pgmiddle, addr); + if (!pte_present(*pgtable)) + goto fault_in_page; + if (write && (!pte_write(*pgtable) || !pte_dirty(*pgtable))) + goto fault_in_page; + page = pte_page(*pgtable); + + /* ZERO_PAGE is special: reads from it are ok even though it's marked reserved */ + if (page != ZERO_PAGE(addr) || write) { + if ((!VALID_PAGE(page)) || PageReserved(page)) { + spin_unlock(&mm->page_table_lock); + return 0; + } + } + get_page(page); + spin_unlock(&mm->page_table_lock); + flush_cache_page(vma, addr); + + if (write) { + maddr = kmap(page) + (addr & ~PAGE_MASK); + memcpy(maddr, buf, len); + flush_page_to_ram(page); + flush_icache_range((unsigned long) maddr, + (unsigned long) maddr + len); + kunmap(page); + } else { + maddr = kmap(page); + memcpy(buf, maddr + (addr & ~PAGE_MASK), len); + flush_page_to_ram(page); + kunmap(page); + } + put_page(page); + return len; + +fault_in_page: + spin_unlock(&mm->page_table_lock); + /* -1: out of memory. 0 - unmapped page */ + if (handle_mm_fault(mm, vma, addr, write) > 0) + goto repeat; + return 0; + +bad_pgd: + spin_unlock(&mm->page_table_lock); + pgd_ERROR(*pgdir); + return 0; + +bad_pmd: + spin_unlock(&mm->page_table_lock); + pmd_ERROR(*pgmiddle); + return 0; +} + +static int access_mm(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long addr, void *buf, int len, int write) +{ + int copied = 0; + + for (;;) { + unsigned long offset = addr & ~PAGE_MASK; + int this_len = PAGE_SIZE - offset; + int retval; + + if (this_len > len) + this_len = len; + retval = access_one_page(mm, vma, addr, buf, this_len, write); + copied += retval; + if (retval != this_len) + break; + + len -= retval; + if (!len) + break; + + addr += retval; + buf += retval; + + if (addr < vma->vm_end) + continue; + if (!vma->vm_next) + break; + if (vma->vm_next->vm_start != vma->vm_end) + break; + + vma = vma->vm_next; + } + return copied; +} int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write) { diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/kernel/sched.c linuxppc64_2_4/kernel/sched.c --- ../kernel.org/linux-2.4.19/kernel/sched.c Fri Apr 19 11:00:47 2002 +++ linuxppc64_2_4/kernel/sched.c Mon Apr 22 10:35:37 2002 @@ -1171,10 +1171,13 @@ else printk(" (NOTLB)\n"); +#if defined(CONFIG_X86) || defined(CONFIG_SPARC64) || defined(CONFIG_ARM) || defined(CONFIG_ALPHA) || defined(CONFIG_PPC64) +/* This is very useful, but doesn't work on all archs yet */ { extern void show_trace_task(struct task_struct *tsk); show_trace_task(p); } +#endif } char * render_sigset_t(sigset_t *set, char *buffer) diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/kernel/sys.c linuxppc64_2_4/kernel/sys.c --- ../kernel.org/linux-2.4.19/kernel/sys.c Fri Apr 19 10:30:02 2002 +++ linuxppc64_2_4/kernel/sys.c Thu Feb 21 21:00:33 2002 @@ -1272,6 +1272,16 @@ } current->keep_capabilities = arg2; break; + +#ifdef SET_FP_EXC_MODE + case PR_SET_FP_EXC: + error = SET_FP_EXC_MODE(current, arg2); + break; + case PR_GET_FP_EXC: + error = GET_FP_EXC_MODE(current); + break; +#endif + default: error = -EINVAL; break; diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/lib/brlock.c linuxppc64_2_4/lib/brlock.c --- ../kernel.org/linux-2.4.19/lib/brlock.c Fri Apr 19 10:30:02 2002 +++ linuxppc64_2_4/lib/brlock.c Thu Nov 15 00:10:51 2001 @@ -14,6 +14,7 @@ #include #include +#include #ifdef __BRLOCK_USE_ATOMICS @@ -54,7 +55,8 @@ if (__brlock_array[cpu_logical_map(i)][idx] != 0) { spin_unlock(&__br_write_locks[idx].lock); barrier(); - cpu_relax(); + /* We must allow recursive readers to make progress */ + udelay(1); goto again; } } diff -uNr --exclude=CVS ../kernel.org/linux-2.4.19/mm/filemap.c linuxppc64_2_4/mm/filemap.c --- ../kernel.org/linux-2.4.19/mm/filemap.c Wed May 15 08:29:45 2002 +++ linuxppc64_2_4/mm/filemap.c Mon May 13 16:41:10 2002 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -3168,7 +3169,7 @@ page_hash_bits++; page_hash_table = (struct page **) - __get_free_pages(GFP_ATOMIC, order); + vmalloc(PAGE_SIZE << order); } while(page_hash_table == NULL && --order > 0); printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",