diff -u --recursive --new-file v2.4.10/linux/Documentation/filesystems/ntfs.txt linux/Documentation/filesystems/ntfs.txt --- v2.4.10/linux/Documentation/filesystems/ntfs.txt Sun Sep 23 11:40:54 2001 +++ linux/Documentation/filesystems/ntfs.txt Sun Sep 30 11:42:44 2001 @@ -98,6 +98,14 @@ ChangeLog ========= +NTFS 1.1.20: + - Fixed two bugs in ntfs_readwrite_attr(). Thanks to Jan Kara for + spotting the out of bounds one. + - Check return value of set_blocksize() in ntfs_read_super() and make + use of get_hardsect_size() to determine the minimum block size. + - Fix return values of ntfs_vcn_to_lcn(). This should stop + people's start of partition being overwritten at random. + NTFS 1.1.19: - Fixed ntfs_getdir_unsorted(), ntfs_readdir() and ntfs_printcb() to cope with arbitrary cluster sizes. Very important for Win2k+. Also, diff -u --recursive --new-file v2.4.10/linux/Makefile linux/Makefile --- v2.4.10/linux/Makefile Sun Sep 23 11:40:54 2001 +++ linux/Makefile Sun Sep 23 11:41:36 2001 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 -SUBLEVEL = 10 -EXTRAVERSION = +SUBLEVEL = 11 +EXTRAVERSION =-pre1 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -u --recursive --new-file v2.4.10/linux/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S --- v2.4.10/linux/arch/i386/kernel/entry.S Sun Sep 23 11:40:55 2001 +++ linux/arch/i386/kernel/entry.S Sat Sep 29 12:59:47 2001 @@ -620,6 +620,7 @@ .long SYMBOL_NAME(sys_getdents64) /* 220 */ .long SYMBOL_NAME(sys_fcntl64) .long SYMBOL_NAME(sys_ni_syscall) /* reserved for TUX */ + .long SYMBOL_NAME(sys_ni_syscall) /* Reserved for Security */ .rept NR_syscalls-(.-sys_call_table)/4 .long SYMBOL_NAME(sys_ni_syscall) diff -u --recursive --new-file v2.4.10/linux/arch/i386/kernel/pci-pc.c linux/arch/i386/kernel/pci-pc.c --- v2.4.10/linux/arch/i386/kernel/pci-pc.c Sun Sep 23 11:40:55 2001 +++ linux/arch/i386/kernel/pci-pc.c Wed Sep 26 22:43:44 2001 @@ -261,18 +261,14 @@ u32 data; result = pci_conf2_read(0, dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn), where, 2, &data); - *value = (u8)data; + *value = (u16)data; return result; } static int pci_conf2_read_config_dword(struct pci_dev *dev, int where, u32 *value) { - int result; - u32 data; - result = pci_conf2_read(0, dev->bus->number, PCI_SLOT(dev->devfn), - PCI_FUNC(dev->devfn), where, 4, &data); - *value = (u8)data; - return result; + return pci_conf2_read(0, dev->bus->number, PCI_SLOT(dev->devfn), + PCI_FUNC(dev->devfn), where, 4, value); } static int pci_conf2_write_config_byte(struct pci_dev *dev, int where, u8 value) diff -u --recursive --new-file v2.4.10/linux/drivers/block/loop.c linux/drivers/block/loop.c --- v2.4.10/linux/drivers/block/loop.c Sun Sep 23 11:40:57 2001 +++ linux/drivers/block/loop.c Fri Sep 28 11:21:40 2001 @@ -719,7 +719,7 @@ return err; } -static int loop_clr_fd(struct loop_device *lo, kdev_t dev) +static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) { struct file *filp = lo->lo_backing_file; int gfp = lo->old_gfp_mask; @@ -752,7 +752,7 @@ memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_name, 0, LO_NAME_SIZE); loop_sizes[lo->lo_number] = 0; - invalidate_buffers(dev); + invalidate_bdev(bdev, 0); filp->f_dentry->d_inode->i_mapping->gfp_mask = gfp; lo->lo_state = Lo_unbound; fput(filp); @@ -852,7 +852,7 @@ err = loop_set_fd(lo, file, inode->i_rdev, arg); break; case LOOP_CLR_FD: - err = loop_clr_fd(lo, inode->i_bdev); break; case LOOP_SET_STATUS: err = loop_set_status(lo, (struct loop_info
*) arg); diff -u --recursive --new-file v2.4.10/linux/drivers/ide/ide-disk.c linux/drivers/ide/ide-disk.c --- v2.4.10/linux/drivers/ide/ide-disk.c Mon Aug 27 12:41:41 2001 +++ linux/drivers/ide/ide-disk.c Fri Sep 28 11:21:40 2001 @@ -481,7 +481,7 @@ static void idedisk_release (struct inode *inode, struct file *filp, ide_drive_t *drive) { if (drive->removable && !drive->usage) { - invalidate_buffers(inode->i_rdev); + invalidate_bdev(inode->i_bdev, 0); if (drive->doorlocking && ide_wait_cmd(drive, WIN_DOORUNLOCK, 0, 0, 0, NULL)) drive->doorlocking = 0; } diff -u --recursive --new-file v2.4.10/linux/drivers/ide/ide-floppy.c linux/drivers/ide/ide-floppy.c --- v2.4.10/linux/drivers/ide/ide-floppy.c Sun Sep 23 11:40:57 2001 +++ linux/drivers/ide/ide-floppy.c Fri Sep 28 11:21:40 2001 @@ -1750,7 +1750,7 @@ if (!drive->usage) { idefloppy_floppy_t *floppy = drive->driver_data; - invalidate_buffers (inode->i_rdev); + invalidate_bdev (inode->i_bdev, 0); /* IOMEGA Clik! drives do not support lock/unlock commands */ if (!test_bit(IDEFLOPPY_CLIK_DRIVE, &floppy->flags)) { diff -u --recursive --new-file v2.4.10/linux/drivers/net/ppp_generic.c linux/drivers/net/ppp_generic.c --- v2.4.10/linux/drivers/net/ppp_generic.c Sun Sep 23 11:40:58 2001 +++ linux/drivers/net/ppp_generic.c Thu Sep 27 08:42:29 2001 @@ -2105,13 +2105,12 @@ { struct compressor_entry *ce; int ret; - spin_lock(&compressor_list_lock); ret = -EEXIST; if (find_comp_entry(cp->compress_proto) != 0) goto out; ret = -ENOMEM; - ce = kmalloc(sizeof(struct compressor_entry), GFP_KERNEL); + ce = kmalloc(sizeof(struct compressor_entry), GFP_ATOMIC); if (ce == 0) goto out; ret = 0; @@ -2216,11 +2215,11 @@ /* Create a new ppp structure and link it before `list'. */ ret = -ENOMEM; - ppp = kmalloc(sizeof(struct ppp), GFP_KERNEL); + ppp = kmalloc(sizeof(struct ppp), GFP_ATOMIC); if (ppp == 0) goto out; memset(ppp, 0, sizeof(struct ppp)); - dev = kmalloc(sizeof(struct net_device), GFP_KERNEL); + dev = kmalloc(sizeof(struct net_device), GFP_ATOMIC); if (dev == 0) { kfree(ppp); goto out; @@ -2285,6 +2284,7 @@ static void ppp_destroy_interface(struct ppp *ppp) { struct net_device *dev; + int n_channels ; spin_lock(&all_ppp_lock); list_del(&ppp->file.list); @@ -2314,6 +2314,7 @@ #endif /* CONFIG_PPP_FILTER */ dev = ppp->dev; ppp->dev = 0; + n_channels = ppp->n_channels ; ppp_unlock(ppp); if (dev) { @@ -2329,7 +2330,7 @@ * ppp structure. Otherwise we leave it around until the * last channel disconnects from it. 
*/ - if (ppp->n_channels == 0) + if (n_channels == 0) kfree(ppp); spin_unlock(&all_ppp_lock); diff -u --recursive --new-file v2.4.10/linux/drivers/net/pppoe.c linux/drivers/net/pppoe.c --- v2.4.10/linux/drivers/net/pppoe.c Wed Jul 25 17:10:21 2001 +++ linux/drivers/net/pppoe.c Thu Sep 27 08:42:29 2001 @@ -541,11 +541,15 @@ sk->state = PPPOX_DEAD; po = sk->protinfo.pppox; - if (po->pppoe_pa.sid) + if (po->pppoe_pa.sid) { delete_item(po->pppoe_pa.sid, po->pppoe_pa.remote); + po->pppoe_pa.sid = 0 ; + } if (po->pppoe_dev) dev_put(po->pppoe_dev); + + po->pppoe_dev = NULL ; sock_orphan(sk); sock->sk = NULL; diff -u --recursive --new-file v2.4.10/linux/drivers/usb/usb-uhci.c linux/drivers/usb/usb-uhci.c --- v2.4.10/linux/drivers/usb/usb-uhci.c Sun Sep 23 11:41:00 2001 +++ linux/drivers/usb/usb-uhci.c Thu Sep 27 08:38:48 2001 @@ -2528,7 +2528,7 @@ int i; int ret = 0; urb_priv_t *urb_priv = urb->hcpriv; - struct list_head *p = urb_priv->desc_list.next; + struct list_head *p = urb_priv->desc_list.next, *p_tmp; uhci_desc_t *desc = list_entry (urb_priv->desc_list.prev, uhci_desc_t, desc_list); dbg("urb contains iso request"); @@ -2578,8 +2578,9 @@ dbg("process_iso: %i: len:%d %08x status:%x", i, urb->iso_frame_desc[i].actual_length, le32_to_cpu(desc->hw.td.status),urb->iso_frame_desc[i].status); - list_del (p); + p_tmp = p; p = p->next; + list_del (p_tmp); delete_desc (s, desc); } diff -u --recursive --new-file v2.4.10/linux/fs/affs/inode.c linux/fs/affs/inode.c --- v2.4.10/linux/fs/affs/inode.c Sun Sep 23 11:41:00 2001 +++ linux/fs/affs/inode.c Fri Sep 28 18:03:48 2001 @@ -300,7 +300,7 @@ u32 block; struct buffer_head *bh; - if (!(inode = get_empty_inode())) + if (!(inode = new_inode(sb))) goto err_inode; if (!(block = affs_alloc_block(dir, dir->i_ino))) @@ -312,8 +312,6 @@ mark_buffer_dirty_inode(bh, inode); affs_brelse(bh); - inode->i_sb = sb; - inode->i_dev = sb->s_dev; inode->i_uid = current->fsuid; inode->i_gid = current->fsgid; inode->i_ino = block; diff -u --recursive --new-file v2.4.10/linux/fs/block_dev.c linux/fs/block_dev.c --- v2.4.10/linux/fs/block_dev.c Sun Sep 23 11:41:00 2001 +++ linux/fs/block_dev.c Sun Sep 30 12:11:44 2001 @@ -22,291 +22,138 @@ #include -static inline int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh_result) -{ - int err; - - err = -EIO; - if (iblock >= buffered_blk_size(inode->i_rdev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS)) - goto out; - - bh_result->b_blocknr = iblock; - bh_result->b_state |= 1UL << BH_Mapped; - err = 0; +#define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) - out: - return err; +static inline unsigned int blksize_bits(unsigned int size) +{ + unsigned int bits = 8; + do { + bits++; + size >>= 1; + } while (size > 256); + return bits; } -static int blkdev_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) +static inline unsigned int block_size(kdev_t dev) { - int i, nr_blocks, retval, dev = inode->i_rdev; - unsigned long * blocks = iobuf->blocks; + int retval = BLOCK_SIZE; + int major = MAJOR(dev); - if (blocksize != BUFFERED_BLOCKSIZE) - BUG(); - - nr_blocks = iobuf->length >> BUFFERED_BLOCKSIZE_BITS; - /* build the blocklist */ - for (i = 0; i < nr_blocks; i++, blocknr++) { - struct buffer_head bh; - - retval = blkdev_get_block(inode, blocknr, &bh); - if (retval) - goto out; - - blocks[i] = bh.b_blocknr; + if (blksize_size[major]) { + int minor = MINOR(dev); + if (blksize_size[major][minor]) + retval = blksize_size[major][minor]; } - - retval = brw_kiovec(rw, 1, 
&iobuf, dev, iobuf->blocks, blocksize); - - out: return retval; } -static int blkdev_writepage(struct page * page) +static unsigned long max_block(kdev_t dev) { - int err, i; - unsigned long block; - struct buffer_head *bh, *head; - struct inode *inode = page->mapping->host; - - if (!PageLocked(page)) - BUG(); - - if (!page->buffers) - create_empty_buffers(page, inode->i_rdev, BUFFERED_BLOCKSIZE); - head = page->buffers; - - block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS); - - bh = head; - i = 0; + unsigned int retval = ~0U; + int major = MAJOR(dev); - /* Stage 1: make sure we have all the buffers mapped! */ - do { - /* - * If the buffer isn't up-to-date, we can't be sure - * that the buffer has been initialized with the proper - * block number information etc.. - * - * Leave it to the low-level FS to make all those - * decisions (block #0 may actually be a valid block) - */ - if (!buffer_mapped(bh)) { - err = blkdev_get_block(inode, block, bh); - if (err) - goto out; + if (blk_size[major]) { + int minor = MINOR(dev); + unsigned int blocks = blk_size[major][minor]; + if (blocks) { + unsigned int size = block_size(dev); + unsigned int sizebits = blksize_bits(size); + blocks += (size-1) >> BLOCK_SIZE_BITS; + retval = blocks << (BLOCK_SIZE_BITS - sizebits); + if (sizebits > BLOCK_SIZE_BITS) + retval = blocks >> (sizebits - BLOCK_SIZE_BITS); } - bh = bh->b_this_page; - block++; - } while (bh != head); - - /* Stage 2: lock the buffers, mark them clean */ - do { - lock_buffer(bh); - set_buffer_async_io(bh); - set_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Dirty, &bh->b_state); - bh = bh->b_this_page; - } while (bh != head); - - /* Stage 3: submit the IO */ - do { - submit_bh(WRITE, bh); - bh = bh->b_this_page; - } while (bh != head); + } + return retval; +} - /* Done - end_buffer_io_async will unlock */ - SetPageUptodate(page); - return 0; +static loff_t blkdev_size(kdev_t dev) +{ + unsigned int blocks = ~0U; + int major = MAJOR(dev); -out: - ClearPageUptodate(page); - UnlockPage(page); - return err; + if (blk_size[major]) { + int minor = MINOR(dev); + blocks = blk_size[major][minor]; + } + return (loff_t) blocks << BLOCK_SIZE_BITS; } -static int blkdev_readpage(struct file * file, struct page * page) +/* Kill _all_ buffers, dirty or not.. 
*/ +static void kill_bdev(struct block_device *bdev) { - struct inode *inode = page->mapping->host; - kdev_t dev = inode->i_rdev; - unsigned long iblock, lblock; - struct buffer_head *bh, *head, *arr[1 << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS)]; - unsigned int blocks; - int nr, i; - - if (!PageLocked(page)) - PAGE_BUG(page); - if (!page->buffers) - create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE); - head = page->buffers; - - blocks = PAGE_CACHE_SIZE >> BUFFERED_BLOCKSIZE_BITS; - iblock = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS); - lblock = buffered_blk_size(dev) >> (BUFFERED_BLOCKSIZE_BITS - BLOCK_SIZE_BITS); - bh = head; - nr = 0; - i = 0; + invalidate_bdev(bdev, 1); + truncate_inode_pages(bdev->bd_inode->i_mapping, 0); +} - do { - if (buffer_uptodate(bh)) - continue; +int set_blocksize(kdev_t dev, int size) +{ + int oldsize; + struct block_device *bdev; - if (!buffer_mapped(bh)) { - if (iblock <= lblock) { - if (blkdev_get_block(inode, iblock, bh)) - continue; - } - if (!buffer_mapped(bh)) { - memset(kmap(page) + i * BUFFERED_BLOCKSIZE, 0, BUFFERED_BLOCKSIZE); - flush_dcache_page(page); - kunmap(page); - set_bit(BH_Uptodate, &bh->b_state); - continue; - } - /* get_block() might have updated the buffer synchronously */ - if (buffer_uptodate(bh)) - continue; - } + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || (size & (size-1))) + return -EINVAL; - arr[nr] = bh; - nr++; - } while (i++, iblock++, (bh = bh->b_this_page) != head); + /* Size cannot be smaller than the size supported by the device */ + if (size < get_hardsect_size(dev)) + return -EINVAL; - if (!nr) { - /* - * all buffers are uptodate - we can set the page - * uptodate as well. - */ - SetPageUptodate(page); - UnlockPage(page); - return 0; + /* No blocksize array? Implies hardcoded BLOCK_SIZE */ + if (!blksize_size[MAJOR(dev)]) { + if (size == BLOCK_SIZE) + return 0; + return -EINVAL; } - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - struct buffer_head * bh = arr[i]; - lock_buffer(bh); - set_buffer_async_io(bh); - } + oldsize = blksize_size[MAJOR(dev)][MINOR(dev)]; + if (oldsize == size) + return 0; - /* Stage 3: start the IO */ - for (i = 0; i < nr; i++) - submit_bh(READ, arr[i]); + if (!oldsize && size == BLOCK_SIZE) { + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + return 0; + } + /* Ok, we're actually changing the blocksize.. 
*/ + bdev = bdget(dev); + sync_buffers(dev, 2); + blksize_size[MAJOR(dev)][MINOR(dev)] = size; + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + bdput(bdev); return 0; } -static int __blkdev_prepare_write(struct inode *inode, struct page *page, - unsigned from, unsigned to) +static int blkdev_get_block(struct inode * inode, long iblock, struct buffer_head * bh, int create) { - kdev_t dev = inode->i_rdev; - unsigned block_start, block_end; - unsigned long block; - int err = 0; - struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; - kmap(page); - - if (!page->buffers) - create_empty_buffers(page, dev, BUFFERED_BLOCKSIZE); - head = page->buffers; - - block = page->index << (PAGE_CACHE_SHIFT - BUFFERED_BLOCKSIZE_BITS); - - for(bh = head, block_start = 0; bh != head || !block_start; - block++, block_start=block_end, bh = bh->b_this_page) { - if (!bh) - BUG(); - block_end = block_start + BUFFERED_BLOCKSIZE; - if (block_end <= from) - continue; - if (block_start >= to) - break; - if (!buffer_mapped(bh)) { - err = blkdev_get_block(inode, block, bh); - if (err) - goto out; - } - if (Page_Uptodate(page)) { - set_bit(BH_Uptodate, &bh->b_state); - continue; - } - if (!buffer_uptodate(bh) && - (block_start < from || block_end > to)) { - ll_rw_block(READ, 1, &bh); - *wait_bh++=bh; - } - } - /* - * If we issued read requests - let them complete. - */ - while(wait_bh > wait) { - wait_on_buffer(*--wait_bh); - err = -EIO; - if (!buffer_uptodate(*wait_bh)) - goto out; - } + if (iblock >= max_block(inode->i_rdev)) + return -EIO; + + bh->b_dev = inode->i_rdev; + bh->b_blocknr = iblock; + bh->b_state |= 1UL << BH_Mapped; return 0; -out: - return err; } -static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) +static int blkdev_writepage(struct page * page) { - struct inode *inode = page->mapping->host; - int err = __blkdev_prepare_write(inode, page, from, to); - if (err) { - ClearPageUptodate(page); - kunmap(page); - } - return err; -} - -static int __blkdev_commit_write(struct inode *inode, struct page *page, - unsigned from, unsigned to) -{ - unsigned block_start, block_end; - int partial = 0, need_balance_dirty = 0; - struct buffer_head *bh, *head; - - for(bh = head = page->buffers, block_start = 0; - bh != head || !block_start; - block_start=block_end, bh = bh->b_this_page) { - block_end = block_start + BUFFERED_BLOCKSIZE; - if (block_end <= from || block_start >= to) { - if (!buffer_uptodate(bh)) - partial = 1; - } else { - set_bit(BH_Uptodate, &bh->b_state); - if (!atomic_set_buffer_dirty(bh)) { - __mark_dirty(bh); - buffer_insert_inode_data_queue(bh, inode); - need_balance_dirty = 1; - } - } - } + return block_write_full_page(page, blkdev_get_block); +} - if (need_balance_dirty) - balance_dirty(); - /* - * is this a partial write that happened to make all buffers - * uptodate then we can optimize away a bogus readpage() for - * the next read(). Here we 'discover' wether the page went - * uptodate as a result of this (potentially partial) write. 
- */ - if (!partial) - SetPageUptodate(page); - return 0; +static int blkdev_readpage(struct file * file, struct page * page) +{ + return block_read_full_page(page, blkdev_get_block); } -static int blkdev_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int blkdev_prepare_write(struct file *file, struct page *page, unsigned from, unsigned to) { - struct inode *inode = page->mapping->host; - __blkdev_commit_write(inode,page,from,to); - kunmap(page); - return 0; + return block_prepare_write(page, from, to, blkdev_get_block); +} + +static int blkdev_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) +{ + return block_commit_write(page, from, to); } /* @@ -316,21 +163,19 @@ */ static loff_t block_llseek(struct file *file, loff_t offset, int origin) { - long long retval; - kdev_t dev; + /* ewww */ + loff_t size = file->f_dentry->d_inode->i_bdev->bd_inode->i_size; + loff_t retval; switch (origin) { case 2: - dev = file->f_dentry->d_inode->i_rdev; - if (blk_size[MAJOR(dev)]) - offset += (loff_t) blk_size[MAJOR(dev)][MINOR(dev)] << BLOCK_SIZE_BITS; - /* else? return -EINVAL? */ + offset += size; break; case 1: offset += file->f_pos; } retval = -EINVAL; - if (offset >= 0) { + if (offset >= 0 && offset <= size) { if (offset != file->f_pos) { file->f_pos = offset; file->f_reada = 0; @@ -378,6 +223,7 @@ root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; root->i_uid = root->i_gid = 0; root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; + sb->s_maxbytes = ~0ULL; sb->s_blocksize = 1024; sb->s_blocksize_bits = 10; sb->s_magic = 0x62646576; @@ -491,13 +337,17 @@ if (new_bdev) { struct inode *inode = new_inode(bd_mnt->mnt_sb); if (inode) { + kdev_t kdev = to_kdev_t(dev); atomic_set(&new_bdev->bd_count,1); new_bdev->bd_dev = dev; new_bdev->bd_op = NULL; new_bdev->bd_inode = inode; - inode->i_rdev = to_kdev_t(dev); + inode->i_rdev = kdev; + inode->i_dev = kdev; inode->i_bdev = new_bdev; + inode->i_blkbits = blksize_bits(block_size(kdev)); inode->i_data.a_ops = &def_blk_aops; + inode->i_data.gfp_mask = GFP_USER; spin_lock(&bdev_lock); bdev = bdfind(dev, head); if (!bdev) { @@ -728,6 +578,8 @@ ret = bdev->bd_op->open(bdev->bd_inode, &fake_file); if (!ret) { bdev->bd_openers++; + bdev->bd_inode->i_size = blkdev_size(rdev); + bdev->bd_inode->i_blkbits = blksize_bits(block_size(rdev)); } else if (!bdev->bd_openers) bdev->bd_op = NULL; } @@ -759,15 +611,18 @@ lock_kernel(); if (!bdev->bd_op) bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev)); + if (bdev->bd_op) { ret = 0; if (bdev->bd_op->open) ret = bdev->bd_op->open(inode,filp); - if (!ret) + if (!ret) { bdev->bd_openers++; - else if (!bdev->bd_openers) + bdev->bd_inode->i_size = blkdev_size(inode->i_rdev); + } else if (!bdev->bd_openers) bdev->bd_op = NULL; } + unlock_kernel(); up(&bdev->bd_sem); if (ret) @@ -783,31 +638,12 @@ down(&bdev->bd_sem); lock_kernel(); - if (kind == BDEV_FILE) { - struct super_block * sb; - + if (kind == BDEV_FILE) __block_fsync(bd_inode); - - /* Janitorianism: this shit must go away */ - sb = get_super(bd_inode->i_rdev); - if (sb) { - if (sb->s_flags & MS_RDONLY) { - shrink_dcache_sb(sb); - invalidate_inodes(sb); - invalidate_buffers(bd_inode->i_rdev); - } - lock_super(sb); - if (sb->s_flags & MS_RDONLY) - update_buffers(bd_inode->i_rdev); - unlock_super(sb); - drop_super(sb); - } - } else if (kind == BDEV_FS) + else if (kind == BDEV_FS) fsync_no_super(rdev); - if (!--bdev->bd_openers) { - truncate_inode_pages(bd_inode->i_mapping, 0); - invalidate_buffers(rdev); - } + 
if (!--bdev->bd_openers) + kill_bdev(bdev); if (bdev->bd_op->release) ret = bdev->bd_op->release(bd_inode, NULL); if (!bdev->bd_openers) @@ -837,7 +673,6 @@ sync_page: block_sync_page, prepare_write: blkdev_prepare_write, commit_write: blkdev_commit_write, - direct_IO: blkdev_direct_IO, }; struct file_operations def_blk_fops = { diff -u --recursive --new-file v2.4.10/linux/fs/buffer.c linux/fs/buffer.c --- v2.4.10/linux/fs/buffer.c Wed Sep 26 11:53:42 2001 +++ linux/fs/buffer.c Sun Sep 30 12:05:19 2001 @@ -52,22 +52,13 @@ #include #include -#define NR_SIZES 7 -static char buffersize_index[65] = -{-1, 0, 1, -1, 2, -1, -1, -1, 3, -1, -1, -1, -1, -1, -1, -1, - 4, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, - 5, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1,-1, -1, -1, -1, -1, -1, -1, - 6}; - -#define BUFSIZE_INDEX(X) ((int) buffersize_index[(X)>>9]) #define MAX_BUF_PER_PAGE (PAGE_CACHE_SIZE / 512) #define NR_RESERVED (10*MAX_BUF_PER_PAGE) #define MAX_UNUSED_BUFFERS NR_RESERVED+20 /* don't ever have more than this number of unused buffer heads */ /* Anti-deadlock ordering: - * lru_list_lock > hash_table_lock > free_list_lock > unused_list_lock + * lru_list_lock > hash_table_lock > unused_list_lock */ #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_inode_buffers) @@ -90,13 +81,7 @@ static spinlock_t unused_list_lock = SPIN_LOCK_UNLOCKED; static DECLARE_WAIT_QUEUE_HEAD(buffer_wait); -struct bh_free_head { - struct buffer_head *list; - spinlock_t lock; -}; -static struct bh_free_head free_list[NR_SIZES]; - -static int grow_buffers(int size); +static int grow_buffers(kdev_t dev, unsigned long block, int size); static void __refile_buffer(struct buffer_head *); /* This is used by some architectures to estimate available memory. */ @@ -481,12 +466,16 @@ ((block) << (bh_hash_shift - 12)))) #define hash(dev,block) hash_table[(_hashfn(HASHDEV(dev),block) & bh_hash_mask)] -static __inline__ void __hash_link(struct buffer_head *bh, struct buffer_head **head) +static inline void __insert_into_hash_list(struct buffer_head *bh) { - if ((bh->b_next = *head) != NULL) - bh->b_next->b_pprev = &bh->b_next; + struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); + struct buffer_head *next = *head; + *head = bh; bh->b_pprev = head; + bh->b_next = next; + if (next != NULL) + next->b_pprev = &bh->b_next; } static __inline__ void __hash_unlink(struct buffer_head *bh) @@ -503,6 +492,8 @@ { struct buffer_head **bhp = &lru_list[blist]; + if (bh->b_prev_free || bh->b_next_free) BUG(); + if(!*bhp) { *bhp = bh; bh->b_prev_free = bh; @@ -530,19 +521,6 @@ } } -static void __remove_from_free_list(struct buffer_head * bh, int index) -{ - if(bh->b_next_free == bh) - free_list[index].list = NULL; - else { - bh->b_prev_free->b_next_free = bh->b_next_free; - bh->b_next_free->b_prev_free = bh->b_prev_free; - if (free_list[index].list == bh) - free_list[index].list = bh->b_next_free; - } - bh->b_next_free = bh->b_prev_free = NULL; -} - /* must be called with both the hash_table_lock and the lru_list_lock held */ static void __remove_from_queues(struct buffer_head *bh) @@ -551,67 +529,28 @@ __remove_from_lru_list(bh, bh->b_list); } -static void __insert_into_queues(struct buffer_head *bh) -{ - struct buffer_head **head = &hash(bh->b_dev, bh->b_blocknr); - - __hash_link(bh, head); - __insert_into_lru_list(bh, bh->b_list); -} - -/* This function must only run if there are no other - * references _anywhere_ to this buffer head. 
- */ -static void put_last_free(struct buffer_head * bh) +struct buffer_head * get_hash_table(kdev_t dev, int block, int size) { - struct bh_free_head *head = &free_list[BUFSIZE_INDEX(bh->b_size)]; - struct buffer_head **bhp = &head->list; - - bh->b_state = 0; + struct buffer_head *bh, **p = &hash(dev, block); - spin_lock(&head->lock); - bh->b_dev = B_FREE; - if(!*bhp) { - *bhp = bh; - bh->b_prev_free = bh; - } - bh->b_next_free = *bhp; - bh->b_prev_free = (*bhp)->b_prev_free; - (*bhp)->b_prev_free->b_next_free = bh; - (*bhp)->b_prev_free = bh; - spin_unlock(&head->lock); -} - -/* - * Why like this, I hear you say... The reason is race-conditions. - * As we don't lock buffers (unless we are reading them, that is), - * something might happen to it while we sleep (ie a read-error - * will force it bad). This shouldn't really happen currently, but - * the code is ready. - */ -static inline struct buffer_head * __get_hash_table(kdev_t dev, int block, int size) -{ - struct buffer_head *bh = hash(dev, block); + read_lock(&hash_table_lock); - for (; bh; bh = bh->b_next) - if (bh->b_blocknr == block && - bh->b_size == size && - bh->b_dev == dev) + for (;;) { + bh = *p; + if (!bh) break; - if (bh) + p = &bh->b_next; + if (bh->b_blocknr != block) + continue; + if (bh->b_size != size) + continue; + if (bh->b_dev != dev) + continue; get_bh(bh); + break; + } - return bh; -} - -struct buffer_head * get_hash_table(kdev_t dev, int block, int size) -{ - struct buffer_head *bh; - - read_lock(&hash_table_lock); - bh = __get_hash_table(dev, block, size); read_unlock(&hash_table_lock); - return bh; } @@ -688,10 +627,11 @@ we think the disk contains more recent information than the buffercache. The update == 1 pass marks the buffers we need to update, the update == 2 pass does the actual I/O. 
*/ -void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers, int update) +void invalidate_bdev(struct block_device *bdev, int destroy_dirty_buffers) { int i, nlist, slept; struct buffer_head * bh, * bh_next; + kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */ retry: slept = 0; @@ -722,33 +662,14 @@ /* All buffers in the lru lists are mapped */ if (!buffer_mapped(bh)) BUG(); + if (buffer_dirty(bh)) + printk("invalidate: dirty buffer\n"); if (!atomic_read(&bh->b_count)) { if (destroy_dirty_buffers || !buffer_dirty(bh)) { remove_inode_queue(bh); - __remove_from_queues(bh); - put_last_free(bh); } - } else if (update) { - if ((update == 2) ^ buffer_uptodate(bh) && - (update == 2) ^ buffer_req(bh)) { - write_unlock(&hash_table_lock); - atomic_inc(&bh->b_count); - spin_unlock(&lru_list_lock); - - if (update == 2) { - ll_rw_block(READ, 1, &bh); - wait_on_buffer(bh); - } else { - lock_buffer(bh); - clear_bit(BH_Uptodate, &bh->b_state); - clear_bit(BH_Req, &bh->b_state); - unlock_buffer(bh); - } - - atomic_dec(&bh->b_count); - goto retry; - } - } + } else + printk("invalidate: busy buffer\n"); write_unlock(&hash_table_lock); if (slept) @@ -759,81 +680,18 @@ spin_unlock(&lru_list_lock); if (slept) goto retry; + + /* Get rid of the page cache */ + invalidate_inode_pages(bdev->bd_inode); } -void set_blocksize(kdev_t dev, int size) +void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers) { - extern int *blksize_size[]; - int i, nlist, slept; - struct buffer_head * bh, * bh_next; - - if (!blksize_size[MAJOR(dev)]) - return; - - /* Size must be a power of two, and between 512 and PAGE_SIZE */ - if (size > PAGE_SIZE || size < 512 || (size & (size-1))) - panic("Invalid blocksize passed to set_blocksize"); - - if (blksize_size[MAJOR(dev)][MINOR(dev)] == 0 && size == BLOCK_SIZE) { - blksize_size[MAJOR(dev)][MINOR(dev)] = size; - return; - } - if (blksize_size[MAJOR(dev)][MINOR(dev)] == size) - return; - sync_buffers(dev, 2); - blksize_size[MAJOR(dev)][MINOR(dev)] = size; - - retry: - slept = 0; - spin_lock(&lru_list_lock); - for(nlist = 0; nlist < NR_LIST; nlist++) { - bh = lru_list[nlist]; - if (!bh) - continue; - for (i = nr_buffers_type[nlist]; i > 0 ; bh = bh_next, i--) { - bh_next = bh->b_next_free; - if (bh->b_dev != dev || bh->b_size == size) - continue; - /* Unhashed? 
*/ - if (!bh->b_pprev) - continue; - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&lru_list_lock); - wait_on_buffer(bh); - slept = 1; - spin_lock(&lru_list_lock); - put_bh(bh); - } - - write_lock(&hash_table_lock); - if (!atomic_read(&bh->b_count)) { - if (buffer_dirty(bh)) - printk(KERN_WARNING - "set_blocksize: dev %s buffer_dirty %lu size %hu\n", - kdevname(dev), bh->b_blocknr, bh->b_size); - remove_inode_queue(bh); - __remove_from_queues(bh); - put_last_free(bh); - } else { - if (atomic_set_buffer_clean(bh)) - __refile_buffer(bh); - clear_bit(BH_Uptodate, &bh->b_state); - printk(KERN_WARNING - "set_blocksize: " - "b_count %d, dev %s, block %lu, from %p\n", - atomic_read(&bh->b_count), bdevname(bh->b_dev), - bh->b_blocknr, __builtin_return_address(0)); - } - write_unlock(&hash_table_lock); - if (slept) - goto out; - } + struct block_device *bdev = bdget(dev); + if (bdev) { + invalidate_bdev(bdev, destroy_dirty_buffers); + bdput(bdev); } - out: - spin_unlock(&lru_list_lock); - if (slept) - goto retry; } static void free_more_memory(void) @@ -1137,57 +995,16 @@ */ struct buffer_head * getblk(kdev_t dev, int block, int size) { - struct buffer_head * bh; - int isize; + for (;;) { + struct buffer_head * bh; -repeat: - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); - bh = __get_hash_table(dev, block, size); - if (bh) - goto out; - - isize = BUFSIZE_INDEX(size); - spin_lock(&free_list[isize].lock); - bh = free_list[isize].list; - if (bh) { - __remove_from_free_list(bh, isize); - atomic_set(&bh->b_count, 1); - } - spin_unlock(&free_list[isize].lock); - - /* - * OK, FINALLY we know that this buffer is the only one of - * its kind, we hold a reference (b_count>0), it is unlocked, - * and it is clean. - */ - if (bh) { - init_buffer(bh, NULL, NULL); - bh->b_dev = dev; - bh->b_blocknr = block; - bh->b_state = 1 << BH_Mapped; + bh = get_hash_table(dev, block, size); + if (bh) + return bh; - /* Insert the buffer into the regular lists */ - __insert_into_queues(bh); - out: - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); - touch_buffer(bh); - return bh; + if (!grow_buffers(dev, block, size)) + free_more_memory(); } - - /* - * If we block while refilling the free list, somebody may - * create the buffer first ... search the hashes again. - */ - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); - - if (!grow_buffers(size)) - free_more_memory(); - - /* FIXME: getblk should fail if there's no enough memory */ - goto repeat; } /* -1 -> no need to flush @@ -1313,22 +1130,7 @@ */ void __bforget(struct buffer_head * buf) { - /* grab the lru lock here to block bdflush. */ - spin_lock(&lru_list_lock); - write_lock(&hash_table_lock); - if (!atomic_dec_and_test(&buf->b_count) || buffer_locked(buf)) - goto in_use; - __hash_unlink(buf); - write_unlock(&hash_table_lock); - remove_inode_queue(buf); - __remove_from_lru_list(buf, buf->b_list); - spin_unlock(&lru_list_lock); - put_last_free(buf); - return; - - in_use: - write_unlock(&hash_table_lock); - spin_unlock(&lru_list_lock); + __brelse(buf); } /** @@ -1364,6 +1166,7 @@ if (nr_unused_buffer_heads >= MAX_UNUSED_BUFFERS) { kmem_cache_free(bh_cachep, bh); } else { + bh->b_dev = B_FREE; bh->b_blocknr = -1; bh->b_this_page = NULL; @@ -1416,20 +1219,6 @@ } spin_unlock(&unused_list_lock); } -#if 0 - /* - * (Pending further analysis ...) - * Ordinary (non-async) requests can use a different memory priority - * to free up pages. Any swapping thus generated will use async - * buffer heads. 
- */ - if(!async && - (bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL)) != NULL) { - memset(bh, 0, sizeof(*bh)); - init_waitqueue_head(&bh->b_wait); - return bh; - } -#endif return NULL; } @@ -1470,7 +1259,7 @@ if (!bh) goto no_grow; - bh->b_dev = B_FREE; /* Flag as unused */ + bh->b_dev = NODEV; bh->b_this_page = head; head = bh; @@ -1524,7 +1313,10 @@ goto try_again; } -static void unmap_buffer(struct buffer_head * bh) +/* + * Called when truncating a buffer on a page completely. + */ +static void discard_buffer(struct buffer_head * bh) { if (buffer_mapped(bh)) { mark_buffer_clean(bh); @@ -1564,7 +1356,7 @@ * is this block fully flushed? */ if (offset <= curr_off) - unmap_buffer(bh); + discard_buffer(bh); curr_off = next_off; bh = next; } while (bh != head); @@ -1580,11 +1372,8 @@ * instead. */ if (!offset) { - if (!try_to_free_buffers(page, 0)) { - if (drop_pagecache) - atomic_inc(&buffermem_pages); + if (!try_to_free_buffers(page, 0)) return 0; - } } return 1; @@ -1667,10 +1456,10 @@ BUG(); if (!page->buffers) - create_empty_buffers(page, inode->i_dev, inode->i_sb->s_blocksize); + create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits); head = page->buffers; - block = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + block = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); bh = head; i = 0; @@ -1732,12 +1521,12 @@ struct buffer_head *bh, *head, *wait[2], **wait_bh=wait; char *kaddr = kmap(page); - blocksize = inode->i_sb->s_blocksize; + blocksize = 1 << inode->i_blkbits; if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); head = page->buffers; - bbits = inode->i_sb->s_blocksize_bits; + bbits = inode->i_blkbits; block = page->index << (PAGE_CACHE_SHIFT - bbits); for(bh = head, block_start = 0; bh != head || !block_start; @@ -1800,7 +1589,7 @@ unsigned blocksize; struct buffer_head *bh, *head; - blocksize = inode->i_sb->s_blocksize; + blocksize = 1 << inode->i_blkbits; for(bh = head = page->buffers, block_start = 0; bh != head || !block_start; @@ -1849,14 +1638,14 @@ if (!PageLocked(page)) PAGE_BUG(page); - blocksize = inode->i_sb->s_blocksize; + blocksize = 1 << inode->i_blkbits; if (!page->buffers) create_empty_buffers(page, inode->i_dev, blocksize); head = page->buffers; - blocks = PAGE_CACHE_SIZE >> inode->i_sb->s_blocksize_bits; - iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - lblock = (inode->i_size+blocksize-1) >> inode->i_sb->s_blocksize_bits; + blocks = PAGE_CACHE_SIZE >> inode->i_blkbits; + iblock = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblock = (inode->i_size+blocksize-1) >> inode->i_blkbits; bh = head; nr = 0; i = 0; @@ -1923,7 +1712,7 @@ unsigned long pgpos; long status; unsigned zerofrom; - unsigned blocksize = inode->i_sb->s_blocksize; + unsigned blocksize = 1 << inode->i_blkbits; char *kaddr; while(page->index > (pgpos = *bytes>>PAGE_CACHE_SHIFT)) { @@ -2008,6 +1797,14 @@ return err; } +int block_commit_write(struct page *page, unsigned from, unsigned to) +{ + struct inode *inode = page->mapping->host; + __block_commit_write(inode,page,from,to); + kunmap(page); + return 0; +} + int generic_commit_write(struct file *file, struct page *page, unsigned from, unsigned to) { @@ -2032,7 +1829,7 @@ struct buffer_head *bh; int err; - blocksize = inode->i_sb->s_blocksize; + blocksize = 1 << inode->i_blkbits; length = offset & (blocksize - 1); /* Block boundary? 
Nothing to do */ @@ -2040,7 +1837,7 @@ return 0; length = blocksize - length; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); page = grab_cache_page(mapping, index); err = -ENOMEM; @@ -2141,47 +1938,6 @@ return tmp.b_blocknr; } -int generic_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize, get_block_t * get_block) -{ - int i, nr_blocks, retval; - unsigned long * blocks = iobuf->blocks; - - nr_blocks = iobuf->length / blocksize; - /* build the blocklist */ - for (i = 0; i < nr_blocks; i++, blocknr++) { - struct buffer_head bh; - - bh.b_state = 0; - bh.b_dev = inode->i_dev; - bh.b_size = blocksize; - - retval = get_block(inode, blocknr, &bh, rw == READ ? 0 : 1); - if (retval) - goto out; - - if (rw == READ) { - if (buffer_new(&bh)) - BUG(); - if (!buffer_mapped(&bh)) { - /* there was an hole in the filesystem */ - blocks[i] = -1UL; - continue; - } - } else { - if (buffer_new(&bh)) - unmap_underlying_metadata(&bh); - if (!buffer_mapped(&bh)) - BUG(); - } - blocks[i] = bh.b_blocknr; - } - - retval = brw_kiovec(rw, 1, &iobuf, inode->i_dev, iobuf->blocks, blocksize); - - out: - return retval; -} - /* * IO completion routine for a buffer_head being used for kiobuf IO: we * can't dispatch the kiobuf callback until io_count reaches 0. @@ -2311,7 +2067,6 @@ } tmp = bhs[bhind++]; - tmp->b_dev = B_FREE; tmp->b_size = size; set_bh_page(tmp, map, offset); tmp->b_this_page = tmp; @@ -2447,67 +2202,129 @@ return err; } +static inline void link_dev_buffers(struct page * page, struct buffer_head *head) +{ + struct buffer_head *bh, *tail; + + bh = head; + do { + tail = bh; + bh = bh->b_this_page; + } while (bh); + tail->b_this_page = head; + page->buffers = head; + page_cache_get(page); +} + +/* + * Create the page-cache page that contains the requested block + */ +static struct page * grow_dev_page(struct block_device *bdev, unsigned long index, int size) +{ + struct page * page; + struct buffer_head *bh; + + page = find_or_create_page(bdev->bd_inode->i_mapping, index, GFP_NOFS); + if (IS_ERR(page)) + return NULL; + + if (!PageLocked(page)) + BUG(); + + bh = page->buffers; + if (bh) { + if (bh->b_size == size) + return page; + if (!try_to_free_buffers(page, GFP_NOFS)) + goto failed; + } + + bh = create_buffers(page, size, 0); + if (!bh) + goto failed; + link_dev_buffers(page, bh); + return page; + +failed: + UnlockPage(page); + page_cache_release(page); + return NULL; +} + +static void hash_page_buffers(struct page *page, kdev_t dev, int block, int size) +{ + struct buffer_head *head = page->buffers; + struct buffer_head *bh = head; + unsigned int uptodate; + + uptodate = 1 << BH_Mapped; + if (Page_Uptodate(page)) + uptodate |= 1 << BH_Uptodate; + + write_lock(&hash_table_lock); + do { + if (!(bh->b_state & (1 << BH_Mapped))) { + init_buffer(bh, NULL, NULL); + bh->b_dev = dev; + bh->b_blocknr = block; + bh->b_state = uptodate; + } + + /* Insert the buffer into the hash lists if necessary */ + if (!bh->b_pprev) + __insert_into_hash_list(bh); + + block++; + bh = bh->b_this_page; + } while (bh != head); + write_unlock(&hash_table_lock); +} + /* * Try to increase the number of buffers available: the size argument * is used to determine what kind of buffers we want. 
*/ -static int grow_buffers(int size) +static int grow_buffers(kdev_t dev, unsigned long block, int size) { struct page * page; - struct buffer_head *bh, *tmp; - struct buffer_head * insert_point; - int isize; + struct block_device *bdev; + unsigned long index; + int sizebits; if ((size & 511) || (size > PAGE_SIZE)) { printk(KERN_ERR "VFS: grow_buffers: size = %d\n",size); return 0; } + sizebits = -1; + do { + sizebits++; + } while ((size << sizebits) < PAGE_SIZE); - page = alloc_page(GFP_NOFS); - if (!page) - goto out; - LockPage(page); - bh = create_buffers(page, size, 0); - if (!bh) - goto no_buffer_head; - - isize = BUFSIZE_INDEX(size); + index = block >> sizebits; + block = index << sizebits; - spin_lock(&free_list[isize].lock); - insert_point = free_list[isize].list; - tmp = bh; - while (1) { - if (insert_point) { - tmp->b_next_free = insert_point->b_next_free; - tmp->b_prev_free = insert_point; - insert_point->b_next_free->b_prev_free = tmp; - insert_point->b_next_free = tmp; - } else { - tmp->b_prev_free = tmp; - tmp->b_next_free = tmp; - } - insert_point = tmp; - if (tmp->b_this_page) - tmp = tmp->b_this_page; - else - break; + bdev = bdget(kdev_t_to_nr(dev)); + if (!bdev) { + printk("No block device for %s\n", kdevname(dev)); + BUG(); } - tmp->b_this_page = bh; - free_list[isize].list = bh; - spin_unlock(&free_list[isize].lock); - - page->buffers = bh; - page->flags &= ~(1 << PG_referenced); - lru_cache_add(page); - UnlockPage(page); - atomic_inc(&buffermem_pages); - return 1; -no_buffer_head: + /* Create a page with the proper size buffers.. */ + page = grow_dev_page(bdev, index, size); + + /* This is "wrong" - talk to Al Viro */ + atomic_dec(&bdev->bd_count); + if (!page) + return 0; + + /* Hash in the buffers on the hash list */ + hash_page_buffers(page, dev, block, size); UnlockPage(page); page_cache_release(page); -out: - return 0; + + /* We hashed up this page, so increment buffermem */ + atomic_inc(&buffermem_pages); + return 1; } static int sync_page_buffers(struct buffer_head *bh, unsigned int gfp_mask) @@ -2522,7 +2339,7 @@ ll_rw_block(WRITE, 1, &p); tryagain = 0; } else if (buffer_locked(p)) { - if (gfp_mask & __GFP_WAIT) { + if (gfp_mask & __GFP_WAITBUF) { wait_on_buffer(p); tryagain = 1; } else @@ -2557,12 +2374,10 @@ int try_to_free_buffers(struct page * page, unsigned int gfp_mask) { struct buffer_head * tmp, * bh = page->buffers; - int index = BUFSIZE_INDEX(bh->b_size); cleaned_buffers_try_again: spin_lock(&lru_list_lock); write_lock(&hash_table_lock); - spin_lock(&free_list[index].lock); tmp = bh; do { if (buffer_busy(tmp)) @@ -2572,18 +2387,18 @@ spin_lock(&unused_list_lock); tmp = bh; + + /* if this buffer was hashed, this page counts as buffermem */ + if (bh->b_pprev) + atomic_dec(&buffermem_pages); do { struct buffer_head * p = tmp; tmp = tmp->b_this_page; - /* The buffer can be either on the regular - * queues or on the free list.. 
- */ - if (p->b_dev != B_FREE) { - remove_inode_queue(p); - __remove_from_queues(p); - } else - __remove_from_free_list(p, index); + if (p->b_dev == B_FREE) BUG(); + + remove_inode_queue(p); + __remove_from_queues(p); __put_unused_buffer_head(p); } while (tmp != bh); spin_unlock(&unused_list_lock); @@ -2594,14 +2409,12 @@ /* And free the page */ page->buffers = NULL; page_cache_release(page); - spin_unlock(&free_list[index].lock); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); return 1; busy_buffer_page: /* Uhhuh, start writeback so that we don't end up with all dirty pages */ - spin_unlock(&free_list[index].lock); write_unlock(&hash_table_lock); spin_unlock(&lru_list_lock); if (gfp_mask & __GFP_IO) { @@ -2713,12 +2526,6 @@ /* Setup hash chains. */ for(i = 0; i < nr_hash; i++) hash_table[i] = NULL; - - /* Setup free lists. */ - for(i = 0; i < NR_SIZES; i++) { - free_list[i].list = NULL; - free_list[i].lock = SPIN_LOCK_UNLOCKED; - } /* Setup lru lists. */ for(i = 0; i < NR_LIST; i++) diff -u --recursive --new-file v2.4.10/linux/fs/ext2/inode.c linux/fs/ext2/inode.c --- v2.4.10/linux/fs/ext2/inode.c Sun Sep 23 11:41:00 2001 +++ linux/fs/ext2/inode.c Mon Sep 24 22:25:20 2001 @@ -586,10 +586,6 @@ { return generic_block_bmap(mapping,block,ext2_get_block); } -static int ext2_direct_IO(int rw, struct inode * inode, struct kiobuf * iobuf, unsigned long blocknr, int blocksize) -{ - return generic_direct_IO(rw, inode, iobuf, blocknr, blocksize, ext2_get_block); -} struct address_space_operations ext2_aops = { readpage: ext2_readpage, writepage: ext2_writepage, @@ -597,7 +593,6 @@ prepare_write: ext2_prepare_write, commit_write: generic_commit_write, bmap: ext2_bmap, - direct_IO: ext2_direct_IO, }; /* diff -u --recursive --new-file v2.4.10/linux/fs/ext2/super.c linux/fs/ext2/super.c --- v2.4.10/linux/fs/ext2/super.c Wed Jul 25 17:10:24 2001 +++ linux/fs/ext2/super.c Sat Sep 29 12:46:47 2001 @@ -408,7 +408,6 @@ unsigned long offset = 0; kdev_t dev = sb->s_dev; int blocksize = BLOCK_SIZE; - int hblock; int db_count; int i, j; @@ -429,7 +428,10 @@ return NULL; } - set_blocksize (dev, blocksize); + if (set_blocksize(dev, blocksize) < 0) { + printk ("EXT2-fs: unable to set blocksize %d\n", blocksize); + return NULL; + } /* * If the superblock doesn't start on a sector boundary, @@ -488,24 +490,19 @@ sb->s_maxbytes = ext2_max_size(sb->s_blocksize_bits); - if (sb->s_blocksize != BLOCK_SIZE && - (sb->s_blocksize == 1024 || sb->s_blocksize == 2048 || - sb->s_blocksize == 4096)) { - /* - * Make sure the blocksize for the filesystem is larger - * than the hardware sectorsize for the machine. - */ - hblock = get_hardsect_size(dev); - if (sb->s_blocksize < hblock) { + /* If the blocksize doesn't match, re-read the thing.. 
*/ + if (sb->s_blocksize != blocksize) { + blocksize = sb->s_blocksize; + brelse(bh); + + if (set_blocksize(dev, blocksize) < 0) { printk(KERN_ERR "EXT2-fs: blocksize too small for device.\n"); - goto failed_mount; + return NULL; } - brelse (bh); - set_blocksize (dev, sb->s_blocksize); - logic_sb_block = (sb_block*BLOCK_SIZE) / sb->s_blocksize; - offset = (sb_block*BLOCK_SIZE) % sb->s_blocksize; - bh = bread (dev, logic_sb_block, sb->s_blocksize); + logic_sb_block = (sb_block*BLOCK_SIZE) / blocksize; + offset = (sb_block*BLOCK_SIZE) % blocksize; + bh = bread (dev, logic_sb_block, blocksize); if(!bh) { printk("EXT2-fs: Couldn't read superblock on " "2nd try.\n"); @@ -518,6 +515,7 @@ goto failed_mount; } } + if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV) { sb->u.ext2_sb.s_inode_size = EXT2_GOOD_OLD_INODE_SIZE; sb->u.ext2_sb.s_first_ino = EXT2_GOOD_OLD_FIRST_INO; diff -u --recursive --new-file v2.4.10/linux/fs/inode.c linux/fs/inode.c --- v2.4.10/linux/fs/inode.c Sun Sep 23 11:41:00 2001 +++ linux/fs/inode.c Fri Sep 28 18:03:48 2001 @@ -816,6 +816,7 @@ list_add(&inode->i_list, &inode_in_use); inode->i_sb = NULL; inode->i_dev = 0; + inode->i_blkbits = 0; inode->i_ino = ++last_ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); @@ -849,6 +850,7 @@ list_add(&inode->i_hash, head); inode->i_sb = sb; inode->i_dev = sb->s_dev; + inode->i_blkbits = sb->s_blocksize_bits; inode->i_ino = ino; inode->i_flags = 0; atomic_set(&inode->i_count, 1); diff -u --recursive --new-file v2.4.10/linux/fs/ntfs/Makefile linux/fs/ntfs/Makefile --- v2.4.10/linux/fs/ntfs/Makefile Sun Sep 23 11:41:00 2001 +++ linux/fs/ntfs/Makefile Sun Sep 30 11:42:44 2001 @@ -5,7 +5,7 @@ obj-y := fs.o sysctl.o support.o util.o inode.o dir.o super.o attr.o unistr.o obj-m := $(O_TARGET) # New version format started 3 February 2001. -EXTRA_CFLAGS = -DNTFS_VERSION=\"1.1.19\" #-DDEBUG +EXTRA_CFLAGS = -DNTFS_VERSION=\"1.1.20\" #-DDEBUG include $(TOPDIR)/Rules.make diff -u --recursive --new-file v2.4.10/linux/fs/ntfs/fs.c linux/fs/ntfs/fs.c --- v2.4.10/linux/fs/ntfs/fs.c Sun Sep 23 11:41:00 2001 +++ linux/fs/ntfs/fs.c Sun Sep 30 11:42:44 2001 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1012,22 +1013,27 @@ { ntfs_volume *vol; struct buffer_head *bh; - int i, to_read; + int i, to_read, blocksize; ntfs_debug(DEBUG_OTHER, "ntfs_read_super\n"); vol = NTFS_SB2VOL(sb); init_ntfs_super_block(vol); if (!parse_options(vol, (char*)options)) goto ntfs_read_super_vol; - /* Assume a 512 bytes block device for now. */ - set_blocksize(sb->s_dev, 512); + blocksize = get_hardsect_size(sb->s_dev); + if (blocksize < 512) + blocksize = 512; + if (set_blocksize(sb->s_dev, blocksize) < 0) { + ntfs_error("Unable to set blocksize %d.\n", blocksize); + goto ntfs_read_super_vol; + } /* Read the super block (boot block). */ - if (!(bh = bread(sb->s_dev, 0, 512))) { + if (!(bh = bread(sb->s_dev, 0, blocksize))) { ntfs_error("Reading super block failed\n"); goto ntfs_read_super_unl; } ntfs_debug(DEBUG_OTHER, "Done reading boot block\n"); - /* Check for 'NTFS' magic number */ + /* Check for valid 'NTFS' boot sector. 
*/ if (!is_boot_sector_ntfs(bh->b_data)) { ntfs_debug(DEBUG_OTHER, "Not a NTFS volume\n"); bforget(bh); @@ -1040,7 +1046,7 @@ goto ntfs_read_super_unl; } ntfs_debug(DEBUG_OTHER, "$Mft at cluster 0x%lx\n", vol->mft_lcn); - bforget(bh); + brelse(bh); NTFS_SB(vol) = sb; if (vol->cluster_size > PAGE_SIZE) { ntfs_error("Partition cluster size is not supported yet (it " @@ -1050,9 +1056,12 @@ ntfs_debug(DEBUG_OTHER, "Done to init volume\n"); /* Inform the kernel that a device block is a NTFS cluster. */ sb->s_blocksize = vol->cluster_size; - for (i = sb->s_blocksize, sb->s_blocksize_bits = 0; i != 1; i >>= 1) - sb->s_blocksize_bits++; - set_blocksize(sb->s_dev, sb->s_blocksize); + sb->s_blocksize_bits = vol->cluster_size_bits; + if (blocksize != vol->cluster_size && + set_blocksize(sb->s_dev, sb->s_blocksize) < 0) { + ntfs_error("Cluster size too small for device.\n"); + goto ntfs_read_super_unl; + } ntfs_debug(DEBUG_OTHER, "set_blocksize\n"); /* Allocate an MFT record (MFT record can be smaller than a cluster). */ i = vol->cluster_size; diff -u --recursive --new-file v2.4.10/linux/fs/ntfs/inode.c linux/fs/ntfs/inode.c --- v2.4.10/linux/fs/ntfs/inode.c Sun Sep 23 11:41:00 2001 +++ linux/fs/ntfs/inode.c Sun Sep 30 11:42:44 2001 @@ -592,18 +592,23 @@ * If write extends beyond _allocated_ size, extend attribute, * updating attr->allocated and attr->size in the process. (AIA) */ - if (offset + l > attr->allocated) { + if ((!attr->resident && offset + l > attr->allocated) || + (attr->resident && offset + l > attr->size)) { error = ntfs_resize_attr(ino, attr, offset + l); if (error) return error; - } else if (offset + l > attr->size) - /* If amount of data has increased: update. */ - attr->size = offset + l; - /* If amount of initialised data has increased: update. */ - if (offset + l > attr->initialized) { - /* FIXME: Zero-out the section between the old - * initialised length and the write start. (AIA) */ - attr->initialized = offset + l; + } + if (!attr->resident) { + /* Has amount of data increased? */ + if (offset + l > attr->size) + attr->size = offset + l; + /* Has amount of initialised data increased? */ + if (offset + l > attr->initialized) { + /* FIXME: Clear the section between the old + * initialised length and the write start. + * (AIA) */ + attr->initialized = offset + l; + } } } if (attr->resident) { @@ -619,10 +624,11 @@ if (offset >= attr->initialized) return ntfs_read_zero(dest, l); if (offset + l > attr->initialized) { - dest->size = chunk = offset + l - attr->initialized; + dest->size = chunk = attr->initialized - offset; error = ntfs_readwrite_attr(ino, attr, offset, dest); - if (error) + if (error || (dest->size != chunk && (error = -EIO, 1))) return error; + dest->size += l - chunk; return ntfs_read_zero(dest, l - chunk); } if (attr->flags & ATTR_IS_COMPRESSED) @@ -707,31 +713,25 @@ return ntfs_readwrite_attr(ino, attr, offset, buf); } +/* -2 = error, -1 = hole, >= 0 means real disk cluster (lcn). */ int ntfs_vcn_to_lcn(ntfs_inode *ino, int vcn) { int rnum; ntfs_attribute *data; data = ntfs_find_attr(ino, ino->vol->at_data, 0); - /* It's hard to give an error code. */ if (!data || data->resident || data->flags & (ATTR_IS_COMPRESSED | ATTR_IS_ENCRYPTED)) - return -1; + return -2; if (data->size <= (__s64)vcn << ino->vol->cluster_size_bits) - return -1; - /* - * For Linux, block number 0 represents a hole. - No problem as we do - * not support bmap in any form whatsoever. 
The FIBMAP sys call is - * deprecated anyway and NTFS is not a block based file system so - * allowing bmapping is complete and utter garbage IMO. Use mmap once - * we implement it... (AIA) - */ + return -2; if (data->initialized <= (__s64)vcn << ino->vol->cluster_size_bits) - return 0; + return -1; for (rnum = 0; rnum < data->d.r.len && - vcn >= data->d.r.runlist[rnum].len; rnum++) + vcn >= data->d.r.runlist[rnum].len; rnum++) vcn -= data->d.r.runlist[rnum].len; - /* We need to cope with sparse runs. (AIA) */ + if (data->d.r.runlist[rnum].lcn >= 0) + return data->d.r.runlist[rnum].lcn + vcn; return data->d.r.runlist[rnum].lcn + vcn; } diff -u --recursive --new-file v2.4.10/linux/fs/pipe.c linux/fs/pipe.c --- v2.4.10/linux/fs/pipe.c Sun Sep 23 11:41:00 2001 +++ linux/fs/pipe.c Fri Sep 28 18:03:48 2001 @@ -476,7 +476,7 @@ static struct inode * get_pipe_inode(void) { - struct inode *inode = get_empty_inode(); + struct inode *inode = new_inode(pipe_mnt->mnt_sb); if (!inode) goto fail_inode; @@ -485,7 +485,6 @@ goto fail_iput; PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; inode->i_fop = &rdwr_pipe_fops; - inode->i_sb = pipe_mnt->mnt_sb; /* * Mark the inode dirty from the very beginning, diff -u --recursive --new-file v2.4.10/linux/fs/reiserfs/inode.c linux/fs/reiserfs/inode.c --- v2.4.10/linux/fs/reiserfs/inode.c Sun Sep 23 11:41:00 2001 +++ linux/fs/reiserfs/inode.c Fri Sep 28 18:03:48 2001 @@ -1428,7 +1428,6 @@ } sb = dir->i_sb; - inode->i_sb = sb; inode->i_flags = 0;//inode->i_sb->s_flags; /* item head of new item */ diff -u --recursive --new-file v2.4.10/linux/fs/reiserfs/namei.c linux/fs/reiserfs/namei.c --- v2.4.10/linux/fs/reiserfs/namei.c Sun Sep 23 11:41:00 2001 +++ linux/fs/reiserfs/namei.c Fri Sep 28 18:03:48 2001 @@ -533,7 +533,7 @@ struct reiserfs_transaction_handle th ; - inode = get_empty_inode() ; + inode = new_inode(dir->i_sb) ; if (!inode) { return -ENOMEM ; } @@ -586,7 +586,7 @@ struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; - inode = get_empty_inode() ; + inode = new_inode(dir->i_sb) ; if (!inode) { return -ENOMEM ; } @@ -638,7 +638,7 @@ struct reiserfs_transaction_handle th ; int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; - inode = get_empty_inode() ; + inode = new_inode(dir->i_sb) ; if (!inode) { return -ENOMEM ; } @@ -859,7 +859,7 @@ int jbegin_count = JOURNAL_PER_BALANCE_CNT * 3; - inode = get_empty_inode() ; + inode = new_inode(dir->i_sb) ; if (!inode) { return -ENOMEM ; } diff -u --recursive --new-file v2.4.10/linux/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h --- v2.4.10/linux/include/asm-i386/unistd.h Fri Aug 11 14:39:23 2000 +++ linux/include/asm-i386/unistd.h Sat Sep 29 12:59:47 2001 @@ -227,6 +227,7 @@ #define __NR_madvise1 219 /* delete when C lib stub is removed */ #define __NR_getdents64 220 #define __NR_fcntl64 221 +#define __NR_security 223 /* syscall for security modules */ /* user-visible error numbers are in the range -1 - -124: see */ diff -u --recursive --new-file v2.4.10/linux/include/linux/blkdev.h linux/include/linux/blkdev.h --- v2.4.10/linux/include/linux/blkdev.h Sun Sep 23 11:41:01 2001 +++ linux/include/linux/blkdev.h Sun Sep 30 12:09:18 2001 @@ -203,15 +203,4 @@ #define blk_finished_io(nsects) do { } while (0) #define blk_started_io(nsects) do { } while (0) -static inline int buffered_blk_size(kdev_t dev) -{ - int ret = INT_MAX; - int major = MAJOR(dev); - - if (blk_size[major]) - ret = blk_size[major][MINOR(dev)] + ((BUFFERED_BLOCKSIZE-1) >> BLOCK_SIZE_BITS); - - return ret; -} - #endif 
diff -u --recursive --new-file v2.4.10/linux/include/linux/fs.h linux/include/linux/fs.h --- v2.4.10/linux/include/linux/fs.h Sun Sep 23 11:41:01 2001 +++ linux/include/linux/fs.h Sun Sep 30 12:07:30 2001 @@ -46,10 +46,6 @@ #define BLOCK_SIZE_BITS 10 #define BLOCK_SIZE (1<b_page) +#define touch_buffer(bh) mark_page_accessed(bh->b_page) #include @@ -437,6 +433,7 @@ time_t i_atime; time_t i_mtime; time_t i_ctime; + unsigned int i_blkbits; unsigned long i_blksize; unsigned long i_blocks; unsigned long i_version; @@ -1174,14 +1171,10 @@ extern void invalidate_inode_pages(struct inode *); extern void invalidate_inode_pages2(struct address_space *); extern void invalidate_inode_buffers(struct inode *); -#define invalidate_buffers(dev) __invalidate_buffers((dev), 0, 0) -#define destroy_buffers(dev) __invalidate_buffers((dev), 1, 0) -#define update_buffers(dev) \ -do { \ - __invalidate_buffers((dev), 0, 1); \ - __invalidate_buffers((dev), 0, 2); \ -} while (0) -extern void __invalidate_buffers(kdev_t dev, int, int); +#define invalidate_buffers(dev) __invalidate_buffers((dev), 0) +#define destroy_buffers(dev) __invalidate_buffers((dev), 1) +extern void invalidate_bdev(struct block_device *, int); +extern void __invalidate_buffers(kdev_t dev, int); extern void sync_inodes(kdev_t); extern void sync_unlocked_inodes(void); extern void write_inode_now(struct inode *, int); @@ -1312,12 +1305,14 @@ extern void clear_inode(struct inode *); extern struct inode * get_empty_inode(void); + static inline struct inode * new_inode(struct super_block *sb) { struct inode *inode = get_empty_inode(); if (inode) { inode->i_sb = sb; inode->i_dev = sb->s_dev; + inode->i_blkbits = sb->s_blocksize_bits; } return inode; } @@ -1344,7 +1339,7 @@ if (buf) __bforget(buf); } -extern void set_blocksize(kdev_t, int); +extern int set_blocksize(kdev_t, int); extern struct buffer_head * bread(kdev_t, int, int); extern void wakeup_bdflush(void); @@ -1362,12 +1357,12 @@ extern int block_prepare_write(struct page*, unsigned, unsigned, get_block_t*); extern int cont_prepare_write(struct page*, unsigned, unsigned, get_block_t*, unsigned long *); +extern int block_commit_write(struct page *page, unsigned from, unsigned to); extern int block_sync_page(struct page *); int generic_block_bmap(struct address_space *, long, get_block_t *); int generic_commit_write(struct file *, struct page *, unsigned, unsigned); int block_truncate_page(struct address_space *, loff_t, get_block_t *); -extern int generic_direct_IO(int, struct inode *, struct kiobuf *, unsigned long, int, get_block_t *); extern void create_empty_buffers(struct page *, kdev_t, unsigned long); extern int waitfor_one_page(struct page*); diff -u --recursive --new-file v2.4.10/linux/include/linux/list.h linux/include/linux/list.h --- v2.4.10/linux/include/linux/list.h Sun Sep 23 11:41:01 2001 +++ linux/include/linux/list.h Sun Sep 30 12:07:30 2001 @@ -92,7 +92,6 @@ static __inline__ void list_del(struct list_head *entry) { __list_del(entry->prev, entry->next); - entry->next = entry->prev = 0; } /** diff -u --recursive --new-file v2.4.10/linux/include/linux/mm.h linux/include/linux/mm.h --- v2.4.10/linux/include/linux/mm.h Sun Sep 23 11:41:01 2001 +++ linux/include/linux/mm.h Sun Sep 30 12:07:34 2001 @@ -550,16 +550,17 @@ #define __GFP_IO 0x40 /* Can start low memory physical IO? */ #define __GFP_HIGHIO 0x80 /* Can start high mem physical IO? */ #define __GFP_FS 0x100 /* Can call down to low-level FS? */ +#define __GFP_WAITBUF 0x200 /* Can we wait for buffers to complete? 
diff -u --recursive --new-file v2.4.10/linux/include/linux/mm.h linux/include/linux/mm.h
--- v2.4.10/linux/include/linux/mm.h	Sun Sep 23 11:41:01 2001
+++ linux/include/linux/mm.h	Sun Sep 30 12:07:34 2001
@@ -550,16 +550,17 @@
 #define __GFP_IO	0x40	/* Can start low memory physical IO? */
 #define __GFP_HIGHIO	0x80	/* Can start high mem physical IO? */
 #define __GFP_FS	0x100	/* Can call down to low-level FS? */
+#define __GFP_WAITBUF	0x200	/* Can we wait for buffers to complete? */
 
 #define GFP_NOHIGHIO	(__GFP_HIGH | __GFP_WAIT | __GFP_IO)
 #define GFP_NOIO	(__GFP_HIGH | __GFP_WAIT)
-#define GFP_NOFS	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
+#define GFP_NOFS	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF)
 #define GFP_ATOMIC	(__GFP_HIGH)
-#define GFP_USER	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_HIGHUSER	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM)
-#define GFP_KERNEL	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_NFS	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_KSWAPD	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
+#define GFP_USER	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF | __GFP_FS)
+#define GFP_HIGHUSER	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_KERNEL	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF | __GFP_FS)
+#define GFP_NFS	(__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF | __GFP_FS)
+#define GFP_KSWAPD	( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_WAITBUF | __GFP_FS)
 
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
diff -u --recursive --new-file v2.4.10/linux/include/linux/pagemap.h linux/include/linux/pagemap.h
--- v2.4.10/linux/include/linux/pagemap.h	Sun Sep 23 11:41:01 2001
+++ linux/include/linux/pagemap.h	Sun Sep 30 12:07:41 2001
@@ -76,6 +76,9 @@
 	__find_get_page(mapping, index, page_hash(mapping, index))
 extern struct page * __find_lock_page (struct address_space * mapping,
				unsigned long index, struct page **hash);
+extern struct page * find_or_create_page(struct address_space *mapping,
+				unsigned long index, unsigned int gfp_mask);
+
 extern void lock_page(struct page *page);
 #define find_lock_page(mapping, index) \
 	__find_lock_page(mapping, index, page_hash(mapping, index))
diff -u --recursive --new-file v2.4.10/linux/include/linux/slab.h linux/include/linux/slab.h
--- v2.4.10/linux/include/linux/slab.h	Sun Sep 23 11:41:01 2001
+++ linux/include/linux/slab.h	Sun Sep 30 12:07:34 2001
@@ -24,7 +24,7 @@
 #define	SLAB_NFS		GFP_NFS
 #define	SLAB_DMA		GFP_DMA
 
-#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_HIGHIO|__GFP_FS)
+#define SLAB_LEVEL_MASK		(__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_HIGHIO|__GFP_WAITBUF|__GFP_FS)
 
 #define	SLAB_NO_GROW		0x00001000UL	/* don't grow a cache */
 
 /* flags to pass to kmem_cache_create().
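The new __GFP_WAITBUF bit separates "may sleep waiting for buffer I/O" from "may start I/O" (__GFP_IO) and "may recurse into the filesystem" (__GFP_FS); note that GFP_NOFS keeps __GFP_WAITBUF while dropping only __GFP_FS. The consumer of the bit is not visible in these hunks, so the sketch below only models how such a mask would plausibly be consulted; __GFP_WAITBUF and __GFP_IO match the defines above, everything else is invented:

	#include <stdio.h>

	#define __GFP_WAIT	0x10u	/* illustrative value */
	#define __GFP_IO	0x40u
	#define __GFP_WAITBUF	0x200u

	/* Hypothetical reclaim predicate: only callers that set
	 * __GFP_WAITBUF may block waiting for buffer completion. */
	static int can_wait_on_buffers(unsigned int gfp_mask)
	{
		return (gfp_mask & __GFP_WAITBUF) != 0;
	}

	int main(void)
	{
		unsigned int noio_like = __GFP_WAIT;
		unsigned int kernel_like = __GFP_WAIT | __GFP_IO | __GFP_WAITBUF;

		printf("NOIO-style caller may wait on buffers: %d\n",
		       can_wait_on_buffers(noio_like));
		printf("KERNEL-style caller may wait on buffers: %d\n",
		       can_wait_on_buffers(kernel_like));
		return 0;
	}
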
diff -u --recursive --new-file v2.4.10/linux/include/linux/swap.h linux/include/linux/swap.h
--- v2.4.10/linux/include/linux/swap.h	Sun Sep 23 11:41:01 2001
+++ linux/include/linux/swap.h	Sun Sep 30 12:07:30 2001
@@ -131,6 +131,7 @@
 extern void oom_kill(void);
 
 /* linux/mm/swapfile.c */
+extern int total_swap_pages;
 extern unsigned int nr_swapfiles;
 extern struct swap_info_struct swap_info[];
 extern int is_swap_partition(kdev_t);
diff -u --recursive --new-file v2.4.10/linux/kernel/ksyms.c linux/kernel/ksyms.c
--- v2.4.10/linux/kernel/ksyms.c	Sun Sep 23 11:41:01 2001
+++ linux/kernel/ksyms.c	Fri Sep 28 11:21:40 2001
@@ -172,6 +172,7 @@
 EXPORT_SYMBOL(files_lock);
 EXPORT_SYMBOL(check_disk_change);
 EXPORT_SYMBOL(__invalidate_buffers);
+EXPORT_SYMBOL(invalidate_bdev);
 EXPORT_SYMBOL(invalidate_inodes);
 EXPORT_SYMBOL(invalidate_device);
 EXPORT_SYMBOL(invalidate_inode_pages);
@@ -210,7 +211,6 @@
 EXPORT_SYMBOL(generic_file_read);
 EXPORT_SYMBOL(do_generic_file_read);
 EXPORT_SYMBOL(generic_file_write);
-EXPORT_SYMBOL(generic_direct_IO);
 EXPORT_SYMBOL(generic_file_mmap);
 EXPORT_SYMBOL(generic_ro_fops);
 EXPORT_SYMBOL(generic_buffer_fdatasync);
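In the mm/filemap.c diff that follows, invalidate_inode_pages() stops rejecting every page with an elevated reference count up front; it now locks the page, tries to strip its buffers, and only then re-checks the count, so buffer-backed pages (which now sit in the page cache) can be invalidated too. A compilable userspace model of that ordering, with all types and helpers reduced to stand-ins:

	#include <stdio.h>

	struct page { int locked, dirty, count, has_buffers; };

	static int try_lock(struct page *p) { return p->locked ? 0 : (p->locked = 1); }
	static void unlock(struct page *p) { p->locked = 0; }
	static int try_to_free_buffers(struct page *p) { p->has_buffers = 0; return 1; }

	/* Returns 1 if the page could be invalidated, 0 if it must be kept. */
	static int invalidate_one(struct page *p)
	{
		if (p->dirty)
			return 0;		/* never invalidate dirty data */
		if (!try_lock(p))
			return 0;
		if (p->has_buffers && !try_to_free_buffers(p)) {
			unlock(p);
			return 0;
		}
		if (p->count != 1) {		/* re-check only after locking */
			unlock(p);
			return 0;
		}
		unlock(p);
		return 1;
	}

	int main(void)
	{
		struct page busy = { 0, 0, 2, 0 };	/* someone holds a reference */
		struct page idle = { 0, 0, 1, 1 };	/* clean, only buffers attached */
		printf("busy: %d, idle: %d\n", invalidate_one(&busy), invalidate_one(&idle));
		return 0;
	}
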
diff -u --recursive --new-file v2.4.10/linux/mm/filemap.c linux/mm/filemap.c
--- v2.4.10/linux/mm/filemap.c	Sun Sep 23 11:41:01 2001
+++ linux/mm/filemap.c	Sat Sep 29 12:33:49 2001
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -56,6 +57,7 @@
 #define CLUSTER_PAGES		(1 << page_cluster)
 #define CLUSTER_OFFSET(x)	(((x) >> page_cluster) << page_cluster)
 
+static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
 static void add_page_to_hash_queue(struct page * page, struct page **p)
 {
 	struct page *next = *p;
@@ -170,11 +172,7 @@
 		page = list_entry(curr, struct page, list);
 		curr = curr->next;
 
-		/* We cannot invalidate something in use.. */
-		if (page_count(page) != 1)
-			continue;
-
-		/* ..or dirty.. */
+		/* We cannot invalidate something in dirty.. */
 		if (PageDirty(page))
 			continue;
@@ -182,10 +180,20 @@
 		if (TryLockPage(page))
 			continue;
 
+		if (page->buffers && !try_to_free_buffers(page, 0))
+			goto unlock;
+
+		if (page_count(page) != 1)
+			goto unlock;
+
 		__lru_cache_del(page);
 		__remove_inode_page(page);
 		UnlockPage(page);
 		page_cache_release(page);
+		continue;
+unlock:
+		UnlockPage(page);
+		continue;
 	}
 
 	spin_unlock(&pagemap_lru_lock);
@@ -792,11 +800,13 @@
 }
 
 /*
- * Same as the above, but lock the page too, verifying that
- * it's still valid once we own it.
- */
-struct page * __find_lock_page (struct address_space *mapping,
-				unsigned long offset, struct page **hash)
+ * Must be called with the pagecache lock held,
+ * will return with it held (but it may be dropped
+ * during blocking operations..
+ */
+static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
+static struct page * __find_lock_page_helper(struct address_space *mapping,
+					unsigned long offset, struct page *hash)
 {
 	struct page *page;
 
@@ -805,27 +815,79 @@
 	 * the hash-list needs a held write-lock.
 	 */
 repeat:
-	spin_lock(&pagecache_lock);
-	page = __find_page_nolock(mapping, offset, *hash);
+	page = __find_page_nolock(mapping, offset, hash);
 	if (page) {
 		page_cache_get(page);
-		spin_unlock(&pagecache_lock);
+		if (TryLockPage(page)) {
+			spin_unlock(&pagecache_lock);
+			lock_page(page);
+			spin_lock(&pagecache_lock);
 
-		lock_page(page);
+			/* Has the page been re-allocated while we slept? */
+			if (page->mapping != mapping || page->index != offset) {
+				UnlockPage(page);
+				page_cache_release(page);
+				goto repeat;
+			}
+		}
+	}
+	return page;
+}
 
-		/* Is the page still hashed? Ok, good.. */
-		if (page->mapping == mapping && page->index == offset)
-			return page;
+/*
+ * Same as the above, but lock the page too, verifying that
+ * it's still valid once we own it.
+ */
+struct page * __find_lock_page (struct address_space *mapping,
+				unsigned long offset, struct page **hash)
+{
+	struct page *page;
 
-		/* Nope: we raced. Release and try again.. */
-		UnlockPage(page);
-		page_cache_release(page);
-		goto repeat;
-	}
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, offset, *hash);
 	spin_unlock(&pagecache_lock);
-	return NULL;
+	return page;
 }
 
+/*
+ * Same as above, but create the page if required..
+ */
+struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
+{
+	struct page *page;
+	struct page **hash = page_hash(mapping, index);
+
+	spin_lock(&pagecache_lock);
+	page = __find_lock_page_helper(mapping, index, *hash);
+	spin_unlock(&pagecache_lock);
+	if (!page) {
+		struct page *newpage = alloc_page(gfp_mask);
+		page = ERR_PTR(-ENOMEM);
+		if (newpage) {
+			spin_lock(&pagecache_lock);
+			page = __find_lock_page_helper(mapping, index, *hash);
+			if (likely(!page)) {
+				page = newpage;
+				__add_to_page_cache(page, mapping, index, hash);
+				newpage = NULL;
+			}
+			spin_unlock(&pagecache_lock);
+			if (unlikely(newpage != NULL))
+				page_cache_release(newpage);
+		}
+	}
+	return page;
+}
+
+/*
+ * Returns locked page at given index in given cache, creating it if needed.
+ */
+struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
+{
+	return find_or_create_page(mapping, index, mapping->gfp_mask);
+}
+
+
 #if 0
 #define PROFILE_READAHEAD
 #define DEBUG_READAHEAD
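find_or_create_page() above follows the classic optimistic-insert shape: look up with the lock held, allocate with the lock dropped, then re-check under the lock and discard the freshly allocated page if another thread won the race. The same pattern in self-contained pthread form, with the page cache shrunk to a single slot:

	#include <pthread.h>
	#include <stdio.h>
	#include <stdlib.h>

	static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
	static int *slot;			/* one-entry "page cache" */

	static int *find_or_create(void)
	{
		pthread_mutex_lock(&cache_lock);
		int *obj = slot;
		pthread_mutex_unlock(&cache_lock);
		if (!obj) {
			int *fresh = malloc(sizeof *fresh);	/* no lock held here */
			if (!fresh)
				return NULL;
			pthread_mutex_lock(&cache_lock);
			if (!slot) {		/* re-check: did we race? */
				slot = fresh;
				fresh = NULL;	/* the cache took ownership */
			}
			obj = slot;
			pthread_mutex_unlock(&cache_lock);
			free(fresh);		/* NULL if we won, our copy if we lost */
		}
		return obj;
	}

	int main(void)
	{
		printf("%p == %p\n", (void *)find_or_create(), (void *)find_or_create());
		return 0;
	}

Note that on allocation failure the kernel version above leaves ERR_PTR(-ENOMEM) in page rather than NULL, so callers are expected to distinguish the two cases.
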
@@ -956,30 +1018,6 @@
 	return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
 }
 
-static inline unsigned long calc_end_index(struct inode * inode)
-{
-	unsigned long end_index;
-
-	if (!S_ISBLK(inode->i_mode))
-		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
-	else
-		end_index = buffered_blk_size(inode->i_rdev) >> (PAGE_CACHE_SHIFT - BLOCK_SIZE_BITS);
-
-	return end_index;
-}
-
-static inline loff_t calc_rsize(struct inode * inode)
-{
-	loff_t rsize;
-
-	if (!S_ISBLK(inode->i_mode))
-		rsize = inode->i_size;
-	else
-		rsize = (loff_t) buffered_blk_size(inode->i_rdev) << BLOCK_SIZE_BITS;
-
-	return rsize;
-}
-
 static void generic_file_readahead(int reada_ok,
 	struct file * filp, struct inode * inode,
 	struct page * page)
@@ -990,7 +1028,7 @@
 	unsigned long raend;
 	int max_readahead = get_max_readahead(inode);
 
-	end_index = calc_end_index(inode);
+	end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 
 	raend = filp->f_raend;
 	max_ahead = 0;
@@ -1114,8 +1152,8 @@
  */
 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
 {
-	struct inode *inode = filp->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
+	struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
+	struct inode *inode = mapping->host;
 	unsigned long index, offset;
 	struct page *cached_page;
 	int reada_ok;
@@ -1169,13 +1207,13 @@
 		struct page *page, **hash;
 		unsigned long end_index, nr, ret;
 
-		end_index = calc_end_index(inode);
+		end_index = inode->i_size >> PAGE_CACHE_SHIFT;
 
 		if (index > end_index)
 			break;
 		nr = PAGE_CACHE_SIZE;
 		if (index == end_index) {
-			nr = calc_rsize(inode) & ~PAGE_CACHE_MASK;
+			nr = inode->i_size & ~PAGE_CACHE_MASK;
 			if (nr <= offset)
 				break;
 		}
@@ -1207,6 +1245,13 @@
 		flush_dcache_page(page);
 
 		/*
+		 * Mark the page accessed if we read the
+		 * beginning or we just did an lseek.
+		 */
+		if (!offset || !filp->f_reada)
+			mark_page_accessed(page);
+
+		/*
 		 * Ok, we have the page, and it's up-to-date, so
 		 * now we can copy it to user space...
 		 *
@@ -1221,7 +1266,6 @@
 		index += offset >> PAGE_CACHE_SHIFT;
 		offset &= ~PAGE_CACHE_MASK;
 
-		mark_page_accessed(page);
 		page_cache_release(page);
 		if (ret == nr && desc->count)
 			continue;
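Two behavioural tweaks in the read path above: a page is now aged with mark_page_accessed() only when the copy starts at the beginning of the page or right after an lseek (instead of once per copied chunk), and do_generic_file_read() takes its inode from mapping->host rather than from the file. The second matters once several inodes can share one address_space; a compilable toy of the indirection (sizes invented):

	#include <stdio.h>

	struct inode;
	struct address_space { struct inode *host; };
	struct inode { struct address_space *i_mapping; long i_size; };

	int main(void)
	{
		struct address_space cache;
		struct inode owner = { &cache, 4096 };	/* the cache's real owner */
		struct inode alias = { &cache, 0 };	/* another inode, same cache */
		cache.host = &owner;

		/* old style read the size from the file's own inode (0 here);
		 * new style asks the owner of the shared cache (4096) */
		printf("via file: %ld, via mapping->host: %ld\n",
		       alias.i_size, alias.i_mapping->host->i_size);
		return 0;
	}
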
@@ -1316,92 +1360,6 @@
 	UPDATE_ATIME(inode);
 }
 
-static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
-{
-	ssize_t retval;
-	int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits, iosize, progress;
-	struct kiobuf * iobuf;
-	struct inode * inode = filp->f_dentry->d_inode;
-	struct address_space * mapping = inode->i_mapping;
-
-	new_iobuf = 0;
-	iobuf = filp->f_iobuf;
-	if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
-		/*
-		 * A parallel read/write is using the preallocated iobuf
-		 * so just run slow and allocate a new one.
-		 */
-		retval = alloc_kiovec(1, &iobuf);
-		if (retval)
-			goto out;
-		new_iobuf = 1;
-	}
-
-	if (!S_ISBLK(inode->i_mode)) {
-		blocksize = inode->i_sb->s_blocksize;
-		blocksize_bits = inode->i_sb->s_blocksize_bits;
-	} else {
-		blocksize = BUFFERED_BLOCKSIZE;
-		blocksize_bits = BUFFERED_BLOCKSIZE_BITS;
-	}
-	blocksize_mask = blocksize - 1;
-	chunk_size = KIO_MAX_ATOMIC_IO << 10;
-
-	retval = -EINVAL;
-	if ((offset & blocksize_mask) || (count & blocksize_mask))
-		goto out_free;
-	if (!mapping->a_ops->direct_IO)
-		goto out_free;
-
-	/*
-	 * Flush to disk exlusively the _data_, metadata must remains
-	 * completly asynchronous or performance will go to /dev/null.
-	 */
-	filemap_fdatasync(mapping);
-	retval = fsync_inode_data_buffers(inode);
-	filemap_fdatawait(mapping);
-	if (retval < 0)
-		goto out_free;
-
-	progress = retval = 0;
-	while (count > 0) {
-		iosize = count;
-		if (iosize > chunk_size)
-			iosize = chunk_size;
-
-		retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
-		if (retval)
-			break;
-
-		retval = mapping->a_ops->direct_IO(rw, inode, iobuf, (offset+progress) >> blocksize_bits, blocksize);
-
-		if (rw == READ && retval > 0)
-			mark_dirty_kiobuf(iobuf, retval);
-
-		if (retval >= 0) {
-			count -= retval;
-			buf += retval;
-			progress += retval;
-		}
-
-		unmap_kiobuf(iobuf);
-
-		if (retval != iosize)
-			break;
-	}
-
-	if (progress)
-		retval = progress;
-
- out_free:
-	if (!new_iobuf)
-		clear_bit(0, &filp->f_iobuf_lock);
-	else
-		free_kiovec(1, &iobuf);
- out:
-	return retval;
-}
-
 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
 {
 	char *kaddr;
@@ -1435,9 +1393,6 @@
 	if ((ssize_t) count < 0)
 		return -EINVAL;
 
-	if (filp->f_flags & O_DIRECT)
-		goto o_direct;
-
 	retval = -EFAULT;
 	if (access_ok(VERIFY_WRITE, buf, count)) {
 		retval = 0;
@@ -1456,28 +1411,7 @@
 			retval = desc.error;
 		}
 	}
- out:
 	return retval;
-
- o_direct:
-	{
-		loff_t pos = *ppos, size;
-		struct inode * inode = filp->f_dentry->d_inode;
-
-		retval = 0;
-		if (!count)
-			goto out; /* skip atime */
-		size = calc_rsize(inode);
-		if (pos < size) {
-			if (pos + count > size)
-				count = size - pos;
-			retval = generic_file_direct_IO(READ, filp, buf, count, pos);
-			if (retval > 0)
-				*ppos = pos + retval;
-		}
-		UPDATE_ATIME(filp->f_dentry->d_inode);
-		goto out;
-	}
 }
 
 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
@@ -1662,7 +1596,6 @@
 	struct address_space *mapping = inode->i_mapping;
 	struct page *page, **hash, *old_page;
 	unsigned long size, pgoff;
-	loff_t rsize;
 
 	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
 
@@ -1671,8 +1604,7 @@
 	 * An external ptracer can access pages that normally aren't
 	 * accessible..
 	 */
-	rsize = calc_rsize(inode);
-	size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	if ((pgoff >= size) && (area->vm_mm == current->mm))
 		return NULL;
@@ -2171,14 +2103,13 @@
 	long error = -EBADF;
 	struct file * file;
 	unsigned long size, rlim_rss;
-	loff_t rsize;
 
 	/* Doesn't work if there's no mapped file. */
 	if (!vma->vm_file)
 		return error;
 	file = vma->vm_file;
-	rsize = calc_rsize(file->f_dentry->d_inode);
-	size = (rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	size = (file->f_dentry->d_inode->i_size + PAGE_CACHE_SIZE - 1) >>
+							PAGE_CACHE_SHIFT;
 
 	start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 	if (end > vma->vm_end)
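The removed generic_file_direct_IO() carried the generic O_DIRECT ground rules: the file offset and byte count both had to be multiples of the block size, and the transfer was chunked through a kiobuf. With the whole helper gone from filemap.c, O_DIRECT is no longer handled by the generic read and write paths in this prepatch. A minimal model of just the alignment gate it used to enforce (sizes illustrative, blocksize assumed a power of two):

	#include <stdio.h>

	static int direct_io_ok(unsigned long offset, unsigned long count,
				unsigned long blocksize)
	{
		unsigned long mask = blocksize - 1;	/* works for powers of two */
		return !(offset & mask) && !(count & mask);
	}

	int main(void)
	{
		printf("aligned: %d, unaligned: %d\n",
		       direct_io_ok(4096, 8192, 4096),	/* 1 */
		       direct_io_ok(100, 8192, 4096));	/* 0 */
		return 0;
	}
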
@@ -2616,19 +2547,6 @@
 	return page;
 }
 
-/*
- * Returns locked page at given index in given cache, creating it if needed.
- */
-
-struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
-{
-	struct page *cached_page = NULL;
-	struct page *page = __grab_cache_page(mapping,index,&cached_page);
-	if (cached_page)
-		page_cache_release(cached_page);
-	return page;
-}
-
 inline void remove_suid(struct inode *inode)
 {
 	unsigned int mode;
@@ -2662,8 +2580,8 @@
 ssize_t
 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
 {
-	struct inode	*inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
+	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
+	struct inode	*inode = mapping->host;
 	unsigned long	limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
 	loff_t		pos;
 	struct page	*page, *cached_page;
@@ -2695,8 +2613,7 @@
 
 	written = 0;
 
-	/* FIXME: this is for backwards compatibility with 2.4 */
-	if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
+	if (file->f_flags & O_APPEND)
 		pos = inode->i_size;
 
 	/*
@@ -2757,17 +2674,15 @@
 			err = -EPERM;
 			goto out;
 		}
-		if (pos >= calc_rsize(inode)) {
-			if (count || pos > calc_rsize(inode)) {
-				/* FIXME: this is for backwards compatibility with 2.4 */
+		if (pos >= inode->i_size) {
+			if (count || pos > inode->i_size) {
 				err = -ENOSPC;
 				goto out;
 			}
-			/* zero-length writes at blkdev end are OK */
 		}
 
-		if (pos + count > calc_rsize(inode))
-			count = calc_rsize(inode) - pos;
+		if (pos + count > inode->i_size)
+			count = inode->i_size - pos;
 	}
 
 	err = 0;
@@ -2778,9 +2693,6 @@
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 	mark_inode_dirty_sync(inode);
 
-	if (file->f_flags & O_DIRECT)
-		goto o_direct;
-
 	do {
 		unsigned long index, offset;
 		long page_fault;
@@ -2855,7 +2767,6 @@
 	if ((status >= 0) && (file->f_flags & O_SYNC))
 		status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
 
-out_status:
 	err = written ? written : status;
 out:
@@ -2864,25 +2775,6 @@
 fail_write:
 	status = -EFAULT;
 	goto unlock;
-
-o_direct:
-	written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
-	if (written > 0) {
-		loff_t end = pos + written;
-		if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
-			inode->i_size = end;
-			mark_inode_dirty(inode);
-		}
-		*ppos = end;
-		invalidate_inode_pages2(mapping);
-	}
-	/*
-	 * Sync the fs metadata but not the minor inode changes and
-	 * of course not the data as we did direct DMA for the IO.
-	 */
-	if (written >= 0 && file->f_flags & O_SYNC)
-		status = generic_osync_inode(inode, OSYNC_METADATA);
-	goto out_status;
 }
 
 void __init page_cache_init(unsigned long mempages)
diff -u --recursive --new-file v2.4.10/linux/mm/memory.c linux/mm/memory.c
--- v2.4.10/linux/mm/memory.c	Sun Sep 23 11:41:01 2001
+++ linux/mm/memory.c	Thu Sep 27 08:41:16 2001
@@ -319,7 +319,9 @@
 		if (pte_none(pte))
 			continue;
 		if (pte_present(pte)) {
-			freed ++;
+			struct page *page = pte_page(pte);
+			if (VALID_PAGE(page) && !PageReserved(page))
+				freed ++;
 			/* This will eventually call __free_pte on the pte. */
 			tlb_remove_page(tlb, ptep, address + offset);
 		} else {
@@ -1101,6 +1103,10 @@
 	return;
 }
 
+/* Swap 80% full? Release the pages as they are paged in.. */
+#define vm_swap_full() \
+	(swapper_space.nrpages*5 > total_swap_pages*4)
+
 /*
  * We hold the mm semaphore and the page_table_lock on entry and exit.
  */
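vm_swap_full() is the consumer of the total_swap_pages export seen earlier in this patch. The comparison is integer arithmetic that avoids a division: nrpages*5 > total*4 is exactly "the swap cache exceeds 80% of total swap". Worked through as standalone C, with the kernel globals passed in as parameters:

	#include <stdio.h>

	static int vm_swap_full(unsigned long swapcache_pages,
				unsigned long total_swap_pages)
	{
		/* true once the swap cache exceeds 4/5 of total swap */
		return swapcache_pages * 5 > total_swap_pages * 4;
	}

	int main(void)
	{
		printf("799/1000: %d\n", vm_swap_full(799, 1000));	/* 0 */
		printf("801/1000: %d\n", vm_swap_full(801, 1000));	/* 1 */
		return 0;
	}

When it reports true, do_swap_page() (next hunk) drops the page from the swap cache even on a read fault, freeing the swap slot early.
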
@@ -1158,10 +1164,12 @@
 	swap_free(entry);
 	mark_page_accessed(page);
 	if (exclusive_swap_page(page)) {
-		if (vma->vm_flags & VM_WRITE)
-			pte = pte_mkwrite(pte);
-		pte = pte_mkdirty(pte);
-		delete_from_swap_cache(page);
+		if (write_access || vm_swap_full()) {
+			pte = pte_mkdirty(pte);
+			if (vma->vm_flags & VM_WRITE)
+				pte = pte_mkwrite(pte);
+			delete_from_swap_cache(page);
+		}
 	}
 	UnlockPage(page);
diff -u --recursive --new-file v2.4.10/linux/mm/mmap.c linux/mm/mmap.c
--- v2.4.10/linux/mm/mmap.c	Sun Sep 23 11:41:01 2001
+++ linux/mm/mmap.c	Sun Sep 30 11:05:40 2001
@@ -67,8 +67,8 @@
 	if (sysctl_overcommit_memory)
 	    return 1;
 
-	free = atomic_read(&buffermem_pages);
-	free += atomic_read(&page_cache_size);
+	/* The page cache contains buffer pages these days.. */
+	free = atomic_read(&page_cache_size);
 	free += nr_free_pages();
 	free += nr_swap_pages;
diff -u --recursive --new-file v2.4.10/linux/mm/page_alloc.c linux/mm/page_alloc.c
--- v2.4.10/linux/mm/page_alloc.c	Sun Sep 23 11:41:01 2001
+++ linux/mm/page_alloc.c	Thu Sep 27 14:37:23 2001
@@ -480,7 +480,7 @@
 	zone_t **zonep, *zone;
 
 	do {
-		zonelist = pgdat->node_zonelists + __GFP_HIGHMEM;
+		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
 		zonep = zonelist->zones;
 
 		for (zone = *zonep++; zone; zone = *zonep++)
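The mmap.c hunk drops buffermem_pages from the overcommit estimate because buffer pages now live inside the page cache, so adding both figures would count the same memory twice; the vmscan.c hunk below removes the matching buffermem_pages accounting from reclaim. The arithmetic of the double count, with invented numbers:

	#include <stdio.h>

	int main(void)
	{
		long page_cache_size = 1000;	/* now already includes buffer pages */
		long buffermem_pages = 200;	/* a subset of the 1000 above */

		long old_estimate = buffermem_pages + page_cache_size;	/* 1200 */
		long new_estimate = page_cache_size;			/* 1000 */
		printf("old: %ld, new: %ld\n", old_estimate, new_estimate);
		return 0;
	}
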
diff -u --recursive --new-file v2.4.10/linux/mm/vmscan.c linux/mm/vmscan.c
--- v2.4.10/linux/mm/vmscan.c	Sun Sep 23 11:41:01 2001
+++ linux/mm/vmscan.c	Sat Sep 29 12:44:24 2001
@@ -407,12 +407,6 @@
 		if (try_to_free_buffers(page, gfp_mask)) {
 			if (!page->mapping) {
 				/*
-				 * Account we successfully freed a page
-				 * of buffer cache.
-				 */
-				atomic_dec(&buffermem_pages);
-
-				/*
 				 * We must not allow an anon page
 				 * with no buffers to be visible on
 				 * the LRU, so we unlock the page after
@@ -536,16 +530,20 @@
 static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
 static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
 {
-	int max_scan = nr_inactive_pages / priority;
+	int max_scan;
+	int chunk_size = nr_pages;
+	unsigned long ratio;
 
 	nr_pages -= kmem_cache_reap(gfp_mask);
 	if (nr_pages <= 0)
 		return 0;
 
-	/* Do we want to age the active list? */
-	if (nr_inactive_pages < nr_active_pages*2)
-		refill_inactive(nr_pages);
-
+	nr_pages = chunk_size;
+	/* try to keep the active list 2/3 of the size of the cache */
+	ratio = (unsigned long) nr_pages * nr_active_pages / ((nr_inactive_pages + 1) * 2);
+	refill_inactive(ratio);
+
+	max_scan = nr_inactive_pages / priority;
 	nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
 	if (nr_pages <= 0)
 		return 0;
@@ -558,17 +556,28 @@
 
 int try_to_free_pages(zone_t * classzone, unsigned int gfp_mask, unsigned int order)
 {
-	int priority = DEF_PRIORITY;
 	int ret = 0;
 
-	do {
+	for (;;) {
+		int priority = DEF_PRIORITY;
 		int nr_pages = SWAP_CLUSTER_MAX;
 
-		nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
-		if (nr_pages <= 0)
-			return 1;
-		ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
-	} while (--priority);
+		do {
+			nr_pages = shrink_caches(priority, classzone, gfp_mask, nr_pages);
+			if (nr_pages <= 0)
+				return 1;
+
+			ret |= swap_out(priority, classzone, gfp_mask, SWAP_CLUSTER_MAX << 2);
+		} while (--priority);
+
+		if (likely(ret))
+			break;
+		if (likely(current->pid != 1))
+			break;
+		current->policy |= SCHED_YIELD;
+		__set_current_state(TASK_RUNNING);
+		schedule();
+	}
 
 	return ret;
 }
diff -u --recursive --new-file v2.4.10/linux/net/socket.c linux/net/socket.c
--- v2.4.10/linux/net/socket.c	Sun Sep 23 11:41:02 2001
+++ linux/net/socket.c	Fri Sep 28 18:03:48 2001
@@ -440,11 +440,10 @@
 	struct inode * inode;
 	struct socket * sock;
 
-	inode = get_empty_inode();
+	inode = new_inode(sock_mnt->mnt_sb);
 	if (!inode)
 		return NULL;
 
-	inode->i_sb = sock_mnt->mnt_sb;
 	sock = socki_lookup(inode);
 
 	inode->i_mode = S_IFSOCK|S_IRWXUGO;