diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/s390/kernel/compat_exec.c x/arch/s390/kernel/compat_exec.c --- x-ref/arch/s390/kernel/compat_exec.c 2004-04-04 06:51:44.523633000 +0200 +++ x/arch/s390/kernel/compat_exec.c 2004-04-04 06:51:49.448884248 +0200 @@ -71,7 +71,7 @@ int setup_arg_pages32(struct linux_binpr mpnt->vm_ops = NULL; mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); /* insert_vm_struct takes care of anon_vma_node */ mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/arch/x86_64/ia32/ia32_binfmt.c x/arch/x86_64/ia32/ia32_binfmt.c --- x-ref/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 06:51:44.524632848 +0200 +++ x/arch/x86_64/ia32/ia32_binfmt.c 2004-04-04 06:51:49.415889264 +0200 @@ -360,7 +360,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); /* insert_vm_struct takes care of anon_vma_node */ mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/exec.c x/fs/exec.c --- x-ref/fs/exec.c 2004-04-04 06:51:44.530631936 +0200 +++ x/fs/exec.c 2004-04-04 06:51:49.417888960 +0200 @@ -428,7 +428,7 @@ int setup_arg_pages(struct linux_binprm mpnt->vm_ops = NULL; mpnt->vm_pgoff = mpnt->vm_start >> PAGE_SHIFT; mpnt->vm_file = NULL; - INIT_LIST_HEAD(&mpnt->shared); + INIT_VMA_SHARED(mpnt); /* insert_vm_struct takes care of anon_vma_node */ mpnt->anon_vma = NULL; mpnt->vm_private_data = (void *) 0; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/hugetlbfs/inode.c x/fs/hugetlbfs/inode.c --- x-ref/fs/hugetlbfs/inode.c 2004-04-04 06:51:42.748902800 +0200 +++ x/fs/hugetlbfs/inode.c 2004-04-04 06:51:49.418888808 +0200 @@ -265,11 +265,13 @@ static void hugetlbfs_drop_inode(struct * vma->vm_pgoff is in PAGE_SIZE units. 
*/ static void -hugetlb_vmtruncate_list(struct list_head *list, unsigned long h_pgoff) +hugetlb_vmtruncate_list(struct prio_tree_root *root, unsigned long h_pgoff) { struct vm_area_struct *vma; + struct prio_tree_iter iter; - list_for_each_entry(vma, list, shared) { + vma = __vma_prio_tree_first(root, &iter, h_pgoff, ULONG_MAX); + while (vma) { unsigned long h_vm_pgoff; unsigned long v_length; unsigned long h_length; @@ -301,6 +303,8 @@ hugetlb_vmtruncate_list(struct list_head zap_hugepage_range(vma, vma->vm_start + v_offset, v_length - v_offset); + + vma = __vma_prio_tree_next(vma, root, &iter, h_pgoff, ULONG_MAX); } } @@ -320,9 +324,11 @@ static int hugetlb_vmtruncate(struct ino inode->i_size = offset; down(&mapping->i_shared_sem); - if (!list_empty(&mapping->i_mmap)) + /* Protect against page fault */ + atomic_inc(&mapping->truncate_count); + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); - if (!list_empty(&mapping->i_mmap_shared)) + if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared))) hugetlb_vmtruncate_list(&mapping->i_mmap_shared, pgoff); up(&mapping->i_shared_sem); truncate_hugepages(mapping, offset); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/inode.c x/fs/inode.c --- x-ref/fs/inode.c 2004-04-04 06:51:43.639767368 +0200 +++ x/fs/inode.c 2004-04-04 06:51:49.420888504 +0200 @@ -185,8 +185,9 @@ void inode_init_once(struct inode *inode atomic_set(&inode->i_data.truncate_count, 0); INIT_LIST_HEAD(&inode->i_data.private_list); spin_lock_init(&inode->i_data.private_lock); - INIT_LIST_HEAD(&inode->i_data.i_mmap); - INIT_LIST_HEAD(&inode->i_data.i_mmap_shared); + INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap); + INIT_PRIO_TREE_ROOT(&inode->i_data.i_mmap_shared); + INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear); spin_lock_init(&inode->i_lock); i_size_ordered_init(inode); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/locks.c x/fs/locks.c --- x-ref/fs/locks.c 2004-03-11 08:27:41.000000000 +0100 +++ x/fs/locks.c 2004-04-04 06:51:49.422888200 +0200 @@ -1455,8 +1455,8 @@ int fcntl_setlk(struct file *filp, unsig if (IS_MANDLOCK(inode) && (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) { struct address_space *mapping = filp->f_mapping; - - if (!list_empty(&mapping->i_mmap_shared)) { + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) { error = -EAGAIN; goto out; } @@ -1593,8 +1593,8 @@ int fcntl_setlk64(struct file *filp, uns if (IS_MANDLOCK(inode) && (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID) { struct address_space *mapping = filp->f_mapping; - - if (!list_empty(&mapping->i_mmap_shared)) { + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) { error = -EAGAIN; goto out; } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/proc/task_mmu.c x/fs/proc/task_mmu.c --- x-ref/fs/proc/task_mmu.c 2004-02-04 16:07:00.000000000 +0100 +++ x/fs/proc/task_mmu.c 2004-04-04 06:51:49.423888048 +0200 @@ -65,7 +65,7 @@ int task_statm(struct mm_struct *mm, int *shared += pages; continue; } - if (vma->vm_flags & VM_SHARED || !list_empty(&vma->shared)) + if (vma->vm_flags & VM_SHARED || !vma_shared_empty(vma)) *shared += pages; if (vma->vm_flags & VM_EXECUTABLE) *text += pages; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/fs/xfs/linux/xfs_vnode.h x/fs/xfs/linux/xfs_vnode.h --- 
x-ref/fs/xfs/linux/xfs_vnode.h 2004-04-04 06:51:43.457795032 +0200 +++ x/fs/xfs/linux/xfs_vnode.h 2004-04-04 06:51:49.424887896 +0200 @@ -597,8 +597,9 @@ static __inline__ void vn_flagclr(struct * Some useful predicates. */ #define VN_MAPPED(vp) \ - (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \ - (!list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)))) + (!prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap)) || \ + !prio_tree_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_shared)) || \ + !list_empty(&(LINVFS_GET_IP(vp)->i_mapping->i_mmap_nonlinear))) #define VN_CACHED(vp) (LINVFS_GET_IP(vp)->i_mapping->nrpages) #define VN_DIRTY(vp) mapping_tagged(LINVFS_GET_IP(vp)->i_mapping, \ PAGECACHE_TAG_DIRTY) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/fs.h x/include/linux/fs.h --- x-ref/include/linux/fs.h 2004-04-04 06:51:43.653765240 +0200 +++ x/include/linux/fs.h 2004-04-04 06:51:49.426887592 +0200 @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -325,8 +326,9 @@ struct address_space { spinlock_t tree_lock; /* and spinlock protecting it */ unsigned long nrpages; /* number of total pages */ struct address_space_operations *a_ops; /* methods */ - struct list_head i_mmap; /* list of private mappings */ - struct list_head i_mmap_shared; /* list of shared mappings */ + struct prio_tree_root i_mmap; /* tree of private mappings */ + struct prio_tree_root i_mmap_shared; /* tree of shared mappings */ + struct list_head i_mmap_nonlinear;/*list of nonlinear mappings */ struct semaphore i_shared_sem; /* protect both above lists */ atomic_t truncate_count; /* Cover race condition with truncate */ unsigned long flags; /* error bits/gfp mask */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/mm.h x/include/linux/mm.h --- x-ref/include/linux/mm.h 2004-04-04 06:51:44.547629352 +0200 +++ x/include/linux/mm.h 2004-04-04 06:51:49.428887288 +0200 @@ -11,6 +11,7 @@ #include #include #include +#include #include #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ @@ -83,7 +84,28 @@ struct vm_area_struct { * one of the address_space->i_mmap{,shared} lists, * for shm areas, the list of attaches, otherwise unused. */ - struct list_head shared; + union { + struct { + struct list_head list; + void *parent; + } vm_set; + + struct prio_tree_node prio_tree_node; + + struct { + void *first; + void *second; + void *parent; + } both; + } shared; + + /* + * shared.vm_set : list of vmas that map exactly the same set of pages + * vm_set_head : head of the vm_set list + * + * TODO: try to shove the following field into vm_private_data ?? 
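A standalone sketch (not part of the patch) of why this three-way union works: the vm_set, prio_tree_node and both views all overlay the same three pointer-sized slots, so code that only needs a type-neutral view can test shared.both.first / shared.both.parent no matter which form the vma is currently in. The struct names below are local stand-ins for the kernel ones, under the usual assumption that all object pointers share one size and alignment:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* local stand-ins, only to show the field overlay */
struct list_head { struct list_head *next, *prev; };
struct prio_tree_node { struct prio_tree_node *left, *right, *parent; };

union shared_demo {
	struct { struct list_head list; void *parent; } vm_set;
	struct prio_tree_node prio_tree_node;
	struct { void *first; void *second; void *parent; } both;
};

int main(void)
{
	/* both.first aliases vm_set.list.next and prio_tree_node.left ... */
	assert(offsetof(union shared_demo, both.first) ==
	       offsetof(union shared_demo, vm_set.list.next));
	assert(offsetof(union shared_demo, both.first) ==
	       offsetof(union shared_demo, prio_tree_node.left));
	/* ... and both.parent aliases vm_set.parent and prio_tree_node.parent */
	assert(offsetof(union shared_demo, both.parent) ==
	       offsetof(union shared_demo, vm_set.parent));
	assert(offsetof(union shared_demo, both.parent) ==
	       offsetof(union shared_demo, prio_tree_node.parent));
	printf("shared.both gives a type-neutral view of the union\n");
	return 0;
}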
+ */ + struct vm_area_struct *vm_set_head; /* * The same vma can be both queued into the i_mmap and in a @@ -161,6 +183,150 @@ struct vm_area_struct { #define VM_RandomReadHint(v) ((v)->vm_flags & VM_RAND_READ) /* + * The following macros are used for implementing prio_tree for i_mmap{_shared} + */ + +#define RADIX_INDEX(vma) ((vma)->vm_pgoff) +#define VMA_SIZE(vma) (((vma)->vm_end - (vma)->vm_start) >> PAGE_SHIFT) +/* avoid overflow */ +#define HEAP_INDEX(vma) ((vma)->vm_pgoff + (VMA_SIZE(vma) - 1)) + +#define GET_INDEX_VMA(vma, radix, heap) \ +do { \ + radix = RADIX_INDEX(vma); \ + heap = HEAP_INDEX(vma); \ +} while (0) + +#define GET_INDEX(node, radix, heap) \ +do { \ + struct vm_area_struct *__tmp = \ + prio_tree_entry(node, struct vm_area_struct, shared.prio_tree_node);\ + GET_INDEX_VMA(__tmp, radix, heap); \ +} while (0) + +#define INIT_VMA_SHARED_LIST(vma) \ +do { \ + INIT_LIST_HEAD(&(vma)->shared.vm_set.list); \ + (vma)->shared.vm_set.parent = NULL; \ + (vma)->vm_set_head = NULL; \ +} while (0) + +#define INIT_VMA_SHARED(vma) \ +do { \ + (vma)->shared.both.first = NULL; \ + (vma)->shared.both.second = NULL; \ + (vma)->shared.both.parent = NULL; \ + (vma)->vm_set_head = NULL; \ +} while (0) + +extern void __vma_prio_tree_insert(struct prio_tree_root *, + struct vm_area_struct *); + +extern void __vma_prio_tree_remove(struct prio_tree_root *, + struct vm_area_struct *); + +static inline int vma_shared_empty(struct vm_area_struct *vma) +{ + return vma->shared.both.first == NULL; +} + +/* + * Helps to add a new vma that maps the same (identical) set of pages as the + * old vma to an i_mmap tree. + */ +static inline void __vma_prio_tree_add(struct vm_area_struct *vma, + struct vm_area_struct *old) +{ + INIT_VMA_SHARED_LIST(vma); + + /* Leave these BUG_ONs till prio_tree patch stabilizes */ + BUG_ON(RADIX_INDEX(vma) != RADIX_INDEX(old)); + BUG_ON(HEAP_INDEX(vma) != HEAP_INDEX(old)); + + if (old->shared.both.parent) { + if (old->vm_set_head) { + list_add_tail(&vma->shared.vm_set.list, + &old->vm_set_head->shared.vm_set.list); + return; + } + else { + old->vm_set_head = vma; + vma->vm_set_head = old; + } + } + else + list_add(&vma->shared.vm_set.list, &old->shared.vm_set.list); +} + +/* + * We cannot modify vm_start, vm_end, vm_pgoff fields of a vma that has been + * already present in an i_mmap{_shared} tree without modifying the tree. The + * following helper function should be used when such modifications are + * necessary. We should hold the mapping's i_shared_sem. + * + * This function can be (micro)optimized for some special cases (maybe later). + */ +static inline void __vma_modify(struct prio_tree_root *root, + struct vm_area_struct *vma, unsigned long start, unsigned long end, + unsigned long pgoff) +{ + if (root) + __vma_prio_tree_remove(root, vma); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + if (root) + __vma_prio_tree_insert(root, vma); +} + +/* + * Helper functions to enumerate vmas that map a given file page or a set of + * contiguous file pages. The functions return vmas that at least map a single + * page in the given range of contiguous file pages. 
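A rough usage sketch of the lookup helpers this comment introduces (a sketch against the patch's API, not standalone code; 'mapping', 'first' and 'last' are assumed to be a struct address_space * and a file-page range in PAGE_SIZE units with first <= last). This is the same walk the hugetlbfs, memory.c and objrmap.c hunks below switch to:

	struct vm_area_struct *vma;
	struct prio_tree_iter iter;

	down(&mapping->i_shared_sem);
	/* each vma stored in the tree covers file pages
	 * [RADIX_INDEX(vma), HEAP_INDEX(vma)]; the tree hands back every
	 * vma whose interval overlaps [first, last] */
	vma = __vma_prio_tree_first(&mapping->i_mmap_shared, &iter,
				    first, last);
	while (vma) {
		/* vma maps at least one page of [first, last]: act on it */
		vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared,
					   &iter, first, last);
	}
	up(&mapping->i_shared_sem);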
+ */ +static inline struct vm_area_struct *__vma_prio_tree_first( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long begin, unsigned long end) +{ + struct prio_tree_node *ptr; + + ptr = prio_tree_first(root, iter, begin, end); + + if (ptr) + return prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + else + return NULL; +} + +static inline struct vm_area_struct *__vma_prio_tree_next( + struct vm_area_struct *vma, struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long begin, unsigned long end) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *next; + + if (vma->shared.both.parent) { + if (vma->vm_set_head) + return vma->vm_set_head; + } + else { + next = list_entry(vma->shared.vm_set.list.next, + struct vm_area_struct, shared.vm_set.list); + if (!(next->vm_set_head)) + return next; + } + + ptr = prio_tree_next(root, iter, begin, end); + + if (ptr) + return prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + else + return NULL; +} + +/* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/include/linux/prio_tree.h x/include/linux/prio_tree.h --- x-ref/include/linux/prio_tree.h 1970-01-01 01:00:00.000000000 +0100 +++ x/include/linux/prio_tree.h 2004-04-04 06:51:49.429887136 +0200 @@ -0,0 +1,78 @@ +#ifndef _LINUX_PRIO_TREE_H +#define _LINUX_PRIO_TREE_H + +struct prio_tree_node { + struct prio_tree_node *left; + struct prio_tree_node *right; + struct prio_tree_node *parent; +}; + +struct prio_tree_root { + struct prio_tree_node *prio_tree_node; + unsigned int index_bits; +}; + +struct prio_tree_iter { + struct prio_tree_node *cur; + unsigned long mask; + unsigned long value; + int size_level; +}; + +#define PRIO_TREE_ROOT (struct prio_tree_root) {NULL, 1} + +#define PRIO_TREE_ROOT_INIT {NULL, 1} + +#define INIT_PRIO_TREE_ROOT(ptr) \ +do { \ + (ptr)->prio_tree_node = NULL; \ + (ptr)->index_bits = 1; \ +} while (0) + +#define PRIO_TREE_NODE_INIT(name) {&(name), &(name), &(name)} + +#define PRIO_TREE_NODE(name) \ + struct prio_tree_node name = PRIO_TREE_NODE_INIT(name) + +#define INIT_PRIO_TREE_NODE(ptr) \ +do { \ + (ptr)->left = (ptr)->right = (ptr)->parent = (ptr); \ +} while (0) + +#define prio_tree_entry(ptr, type, member) \ + ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member))) + +#define PRIO_TREE_ITER (struct prio_tree_iter) {NULL, 0UL, 0UL, 0} + +static inline int prio_tree_empty(const struct prio_tree_root *root) +{ + return root->prio_tree_node == NULL; +} + +static inline int prio_tree_root(const struct prio_tree_node *node) +{ + return node->parent == node; +} + +static inline int prio_tree_left_empty(const struct prio_tree_node *node) +{ + return node->left == node; +} + +static inline int prio_tree_right_empty(const struct prio_tree_node *node) +{ + return node->right == node; +} + +extern struct prio_tree_node *prio_tree_insert(struct prio_tree_root *, + struct prio_tree_node *); + +extern void prio_tree_remove(struct prio_tree_root *, struct prio_tree_node *); + +extern struct prio_tree_node *prio_tree_first(struct prio_tree_root *, + struct prio_tree_iter *, unsigned long, unsigned long); + +extern struct prio_tree_node *prio_tree_next(struct prio_tree_root *, + struct prio_tree_iter *, unsigned long, unsigned long); + +#endif diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/init/main.c 
x/init/main.c --- x-ref/init/main.c 2004-04-04 06:51:44.554628288 +0200 +++ x/init/main.c 2004-04-04 06:51:49.430886984 +0200 @@ -86,6 +86,7 @@ extern void pidhash_init(void); extern void pidmap_init(void); extern void anon_vma_init(void); extern void radix_tree_init(void); +extern void prio_tree_init(void); extern void free_initmem(void); extern void populate_rootfs(void); extern void driver_init(void); @@ -460,6 +461,7 @@ asmlinkage void __init start_kernel(void calibrate_delay(); pidmap_init(); pgtable_cache_init(); + prio_tree_init(); anon_vma_init(); #ifdef CONFIG_X86 if (efi_enabled) diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/kernel/fork.c x/kernel/fork.c --- x-ref/kernel/fork.c 2004-04-04 06:51:44.555628136 +0200 +++ x/kernel/fork.c 2004-04-04 06:51:49.431886832 +0200 @@ -314,7 +314,7 @@ static inline int dup_mmap(struct mm_str tmp->vm_mm = mm; tmp->vm_next = NULL; file = tmp->vm_file; - INIT_LIST_HEAD(&tmp->shared); + INIT_VMA_SHARED(tmp); if (file) { struct inode *inode = file->f_dentry->d_inode; get_file(file); @@ -323,7 +323,7 @@ static inline int dup_mmap(struct mm_str /* insert tmp into the share list, just after mpnt */ down(&file->f_mapping->i_shared_sem); - list_add_tail(&tmp->shared, &mpnt->shared); + __vma_prio_tree_add(tmp, mpnt); up(&file->f_mapping->i_shared_sem); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/filemap.c x/mm/filemap.c --- x-ref/mm/filemap.c 2004-04-04 06:51:44.558627680 +0200 +++ x/mm/filemap.c 2004-04-04 06:51:49.433886528 +0200 @@ -651,7 +651,8 @@ page_ok: * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) flush_dcache_page(page); /* diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/fremap.c x/mm/fremap.c --- x-ref/mm/fremap.c 2004-04-04 06:51:44.559627528 +0200 +++ x/mm/fremap.c 2004-04-04 06:51:49.434886376 +0200 @@ -151,6 +151,8 @@ asmlinkage long sys_remap_file_pages(uns unsigned long __prot, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; + struct address_space *mapping; + unsigned long linear_pgoff; unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; @@ -187,9 +189,19 @@ asmlinkage long sys_remap_file_pages(uns end > start && start >= vma->vm_start && end <= vma->vm_end) { + linear_pgoff = vma->vm_pgoff; + linear_pgoff += ((start - vma->vm_start) >> PAGE_SHIFT); /* Must set VM_NONLINEAR before any pages are populated. */ - if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + if (pgoff != linear_pgoff && !(vma->vm_flags & VM_NONLINEAR)) { + mapping = vma->vm_file->f_mapping; + down(&mapping->i_shared_sem); vma->vm_flags |= VM_NONLINEAR; + __vma_prio_tree_remove(&mapping->i_mmap_shared, vma); + INIT_VMA_SHARED_LIST(vma); + list_add_tail(&vma->shared.vm_set.list, + &mapping->i_mmap_nonlinear); + up(&mapping->i_shared_sem); + } /* ->populate can take a long time, so downgrade the lock. 
*/ downgrade_write(&mm->mmap_sem); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/Makefile x/mm/Makefile --- x-ref/mm/Makefile 2004-04-04 06:51:44.559627528 +0200 +++ x/mm/Makefile 2004-04-04 06:51:49.435886224 +0200 @@ -9,6 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ - slab.o swap.o truncate.o vmscan.o $(mmu-y) + slab.o swap.o truncate.o vmscan.o prio_tree.o \ + $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/memory.c x/mm/memory.c --- x-ref/mm/memory.c 2004-04-04 06:51:44.565626616 +0200 +++ x/mm/memory.c 2004-04-04 06:51:49.436886072 +0200 @@ -1074,11 +1074,11 @@ no_new_page: * An hlen of zero blows away the entire portion file after hba. */ static void -invalidate_mmap_range_list(struct list_head *head, +invalidate_mmap_range_list(struct prio_tree_root *root, unsigned long const hba, unsigned long const hlen) { - struct list_head *curr; + struct prio_tree_iter iter; unsigned long hea; /* last page of hole. */ unsigned long vba; unsigned long vea; /* last page of corresponding uva hole. */ @@ -1089,17 +1089,16 @@ invalidate_mmap_range_list(struct list_h hea = hba + hlen - 1; /* avoid overflow. */ if (hea < hba) hea = ULONG_MAX; - list_for_each(curr, head) { - vp = list_entry(curr, struct vm_area_struct, shared); + vp = __vma_prio_tree_first(root, &iter, hba, hea); + while(vp) { vba = vp->vm_pgoff; vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1; - if (hea < vba || vea < hba) - continue; /* Mapping disjoint from hole. */ zba = (hba <= vba) ? vba : hba; zea = (vea <= hea) ? 
vea : hea; zap_page_range(vp, ((zba - vba) << PAGE_SHIFT) + vp->vm_start, (zea - zba + 1) << PAGE_SHIFT); + vp = __vma_prio_tree_next(vp, root, &iter, hba, hea); } } @@ -1134,9 +1133,9 @@ void invalidate_mmap_range(struct addres down(&mapping->i_shared_sem); /* Protect against page fault */ atomic_inc(&mapping->truncate_count); - if (unlikely(!list_empty(&mapping->i_mmap))) + if (unlikely(!prio_tree_empty(&mapping->i_mmap))) invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen); - if (unlikely(!list_empty(&mapping->i_mmap_shared))) + if (unlikely(!prio_tree_empty(&mapping->i_mmap_shared))) invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen); up(&mapping->i_shared_sem); } diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/mmap.c x/mm/mmap.c --- x-ref/mm/mmap.c 2004-04-04 06:51:44.568626160 +0200 +++ x/mm/mmap.c 2004-04-04 06:52:28.698917336 +0200 @@ -77,12 +77,20 @@ EXPORT_SYMBOL(vm_committed_space); * Requires inode->i_mapping->i_shared_sem */ static void -__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode) +__remove_shared_vm_struct(struct vm_area_struct *vma, struct inode *inode, + struct address_space * mapping) { if (inode) { if (vma->vm_flags & VM_DENYWRITE) atomic_inc(&inode->i_writecount); - list_del_init(&vma->shared); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + list_del_init(&vma->shared.vm_set.list); + INIT_VMA_SHARED(vma); + } + else if (vma->vm_flags & VM_SHARED) + __vma_prio_tree_remove(&mapping->i_mmap_shared, vma); + else + __vma_prio_tree_remove(&mapping->i_mmap, vma); } } @@ -96,7 +104,8 @@ static void remove_shared_vm_struct(stru if (file) { struct address_space *mapping = file->f_mapping; down(&mapping->i_shared_sem); - __remove_shared_vm_struct(vma, file->f_dentry->d_inode); + __remove_shared_vm_struct(vma, file->f_dentry->d_inode, + mapping); up(&mapping->i_shared_sem); } } @@ -270,10 +279,15 @@ static inline void __vma_link_file(struc if (vma->vm_flags & VM_DENYWRITE) atomic_dec(&file->f_dentry->d_inode->i_writecount); - if (vma->vm_flags & VM_SHARED) - list_add_tail(&vma->shared, &mapping->i_mmap_shared); + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + INIT_VMA_SHARED_LIST(vma); + list_add_tail(&vma->shared.vm_set.list, + &mapping->i_mmap_nonlinear); + } + else if (vma->vm_flags & VM_SHARED) + __vma_prio_tree_insert(&mapping->i_mmap_shared, vma); else - list_add_tail(&vma->shared, &mapping->i_mmap); + __vma_prio_tree_insert(&mapping->i_mmap, vma); } } @@ -351,17 +365,6 @@ static inline int is_mergeable_vma(struc } /* - * Requires that the relevant i_shared_sem and anon_vma_lock - * be held by the caller. - */ -static void move_vma_start(struct vm_area_struct *vma, unsigned long addr) -{ - /* we must update pgoff even if no vm_file for the anon_vma */ - vma->vm_pgoff += (long) (addr - vma->vm_start) >> PAGE_SHIFT; - vma->vm_start = addr; -} - -/* * Return true if we can merge this (vm_flags,file,vm_pgoff,size) * in front of (at a lower virtual address and file offset than) the vma. * @@ -424,7 +427,9 @@ static int vma_merge(struct mm_struct *m struct file *file, unsigned long pgoff) { struct inode *inode = file ? file->f_dentry->d_inode : NULL; + struct address_space *mapping = file ? file->f_mapping : NULL; struct semaphore *i_shared_sem; + struct prio_tree_root *root = NULL; /* * We later require that vma->vm_flags == vm_flags, so this tests @@ -435,6 +440,15 @@ static int vma_merge(struct mm_struct *m i_shared_sem = file ? 
&file->f_mapping->i_shared_sem : NULL; + if (mapping) { + if (vm_flags & VM_SHARED) { + if (likely(!(vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } + else + root = &mapping->i_mmap; + } + if (!prev) { prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); goto merge_next; @@ -448,37 +462,31 @@ static int vma_merge(struct mm_struct *m struct vm_area_struct *next; /* - * this can happen outside the i_shared_sem and outside - * the anon_vma_lock since it only enlarge the size of - * the vma, there are no ptes mapped in this new extended - * region anyways. - */ - prev->vm_end = end; - - /* * OK, it did. Can we now merge in the successor as well? */ next = prev->vm_next; /* next cannot change under us, it's serialized by the mmap_sem */ - if (next && prev->vm_end == next->vm_start && + if (next && end == next->vm_start && can_vma_merge_before(prev, next, vm_flags, file, pgoff, (end - addr) >> PAGE_SHIFT)) { - /* - * the vm_end extension on the right can happen as usual - * outside the i_shared_sem/anon_vma_lock. - */ - prev->vm_end = next->vm_end; - /* serialized by the mmap_sem */ __vma_unlink(mm, next, prev); if (file) down(i_shared_sem); - __remove_shared_vm_struct(next, inode); + __vma_modify(root, prev, prev->vm_start, + next->vm_end, prev->vm_pgoff); + + __remove_shared_vm_struct(next, inode, mapping); if (file) up(i_shared_sem); - /* the anon_vma_lock is taken inside */ + /* + * The anon_vma_lock is taken inside and + * we can race with the vm_end move on the right, + * that will not be a problem, moves on the right + * of vm_end are controlled races. + */ anon_vma_merge(prev, next); if (file) @@ -488,6 +496,19 @@ static int vma_merge(struct mm_struct *m kmem_cache_free(vm_area_cachep, next); return 1; } + + /* + * this can happen outside the anon_vma_lock since it only + * enlarge the size of the vma, there are no ptes mapped in + * this new extended region anyways. As usual this is a move + * on the right of the vm_end. 
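The pattern the vma_merge() and do_mremap() changes follow boils down to the sketch below (illustrative only, not code the patch adds; 'vma' is assumed file-backed and 'new_end' is the grown end address). The point is that vm_start/vm_end/vm_pgoff of a vma already linked into an i_mmap{_shared} tree may only change via __vma_modify(), under i_shared_sem, because those fields are the tree indices:

	struct address_space *mapping = NULL;
	struct prio_tree_root *root = NULL;

	if (vma->vm_file) {
		mapping = vma->vm_file->f_mapping;
		if (vma->vm_flags & VM_SHARED) {
			if (!(vma->vm_flags & VM_NONLINEAR))
				root = &mapping->i_mmap_shared;
		} else
			root = &mapping->i_mmap;
		down(&mapping->i_shared_sem);
	}
	/* remove from the tree, update the indices, re-insert */
	__vma_modify(root, vma, vma->vm_start, new_end, vma->vm_pgoff);
	if (mapping)
		up(&mapping->i_shared_sem);

A nonlinear vma keeps root == NULL, so __vma_modify() degenerates to the plain field assignments, matching the patch's behaviour.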
+ */ + if (file) + down(i_shared_sem); + __vma_modify(root, prev, prev->vm_start, end, prev->vm_pgoff); + if (file) + up(i_shared_sem); + return 1; } @@ -504,7 +525,8 @@ static int vma_merge(struct mm_struct *m if (file) down(i_shared_sem); anon_vma_lock(prev); - move_vma_start(prev, addr); + __vma_modify(root, prev, addr, prev->vm_end, + prev->vm_pgoff - ((end - addr) >> PAGE_SHIFT)); anon_vma_unlock(prev); if (file) up(i_shared_sem); @@ -696,7 +718,7 @@ munmap_back: vma->vm_file = NULL; vma->vm_private_data = NULL; vma->vm_next = NULL; - INIT_LIST_HEAD(&vma->shared); + INIT_VMA_SHARED(vma); vma->anon_vma = NULL; if (file) { @@ -1238,6 +1260,7 @@ int split_vma(struct mm_struct * mm, str { struct vm_area_struct *new; struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; if (mm->map_count >= sysctl_max_map_count) return -ENOMEM; @@ -1249,7 +1272,7 @@ int split_vma(struct mm_struct * mm, str /* most fields are the same, copy all, and then fixup */ *new = *vma; - INIT_LIST_HEAD(&new->shared); + INIT_VMA_SHARED(new); if (new_below) new->vm_end = addr; @@ -1264,18 +1287,27 @@ int split_vma(struct mm_struct * mm, str if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; + if (vma->vm_flags & VM_SHARED) { + if (likely(!(vma->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } + else + root = &mapping->i_mmap; + } + if (mapping) down(&mapping->i_shared_sem); spin_lock(&mm->page_table_lock); anon_vma_lock(vma); if (new_below) - move_vma_start(vma, addr); + __vma_modify(root, vma, addr, vma->vm_end, + vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT)); else - vma->vm_end = addr; + __vma_modify(root, vma, vma->vm_start, addr, vma->vm_pgoff); __insert_vm_struct(mm, new); @@ -1452,7 +1484,7 @@ unsigned long do_brk(unsigned long addr, vma->vm_pgoff = pgoff; vma->vm_file = NULL; vma->vm_private_data = NULL; - INIT_LIST_HEAD(&vma->shared); + INIT_VMA_SHARED(vma); vma->anon_vma = NULL; vma_link(mm, vma, prev, rb_link, rb_parent); diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/mremap.c x/mm/mremap.c --- x-ref/mm/mremap.c 2004-04-04 06:51:44.570625856 +0200 +++ x/mm/mremap.c 2004-04-04 06:51:49.440885464 +0200 @@ -243,7 +243,7 @@ static unsigned long move_vma(struct vm_ if (allocated_vma) { *new_vma = *vma; - INIT_LIST_HEAD(&new_vma->shared); + INIT_VMA_SHARED(new_vma); new_vma->vm_start = new_addr; new_vma->vm_end = new_addr+new_len; new_vma->vm_pgoff += (addr-vma->vm_start) >> PAGE_SHIFT; @@ -301,6 +301,8 @@ unsigned long do_mremap(unsigned long ad unsigned long flags, unsigned long new_addr) { struct vm_area_struct *vma; + struct address_space *mapping = NULL; + struct prio_tree_root *root = NULL; unsigned long ret = -EINVAL; unsigned long charged = 0; @@ -408,9 +410,26 @@ unsigned long do_mremap(unsigned long ad /* can we just expand the current mapping? 
*/ if (max_addr - addr >= new_len) { int pages = (new_len - old_len) >> PAGE_SHIFT; + + if (vma->vm_file) { + mapping = vma->vm_file->f_mapping; + if (vma->vm_flags & VM_SHARED) { + if (likely(!(vma->vm_flags & VM_NONLINEAR))) + root = &mapping->i_mmap_shared; + } + else + root = &mapping->i_mmap; + down(&mapping->i_shared_sem); + } + spin_lock(&vma->vm_mm->page_table_lock); - vma->vm_end = addr + new_len; + __vma_modify(root, vma, vma->vm_start, + addr + new_len, vma->vm_pgoff); spin_unlock(&vma->vm_mm->page_table_lock); + + if(mapping) + up(&mapping->i_shared_sem); + current->mm->total_vm += pages; if (vma->vm_flags & VM_LOCKED) { current->mm->locked_vm += pages; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/objrmap.c x/mm/objrmap.c --- x-ref/mm/objrmap.c 2004-04-04 06:51:44.574625248 +0200 +++ x/mm/objrmap.c 2004-04-04 06:51:49.442885160 +0200 @@ -79,8 +79,8 @@ find_pte(struct vm_area_struct *vma, str loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); address = vma->vm_start + ((loffset - vma->vm_pgoff) << PAGE_SHIFT); - if (address < vma->vm_start || address >= vma->vm_end) - goto out; + if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + goto out_wrong_vma; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -106,6 +106,10 @@ out_unmap: pte_unmap(pte); out: return NULL; + +out_wrong_vma: + BUG_ON(!PageAnon(page)); + goto out; } /** @@ -129,8 +133,10 @@ page_referenced_one(struct vm_area_struc * Tracking the referenced info is too expensive * for nonlinear mappings. */ - if (vma->vm_flags & VM_NONLINEAR) + if (unlikely(vma->vm_flags & VM_NONLINEAR)) { + BUG(); goto out; + } if (unlikely(!spin_trylock(&mm->page_table_lock))) goto out; @@ -166,6 +172,8 @@ page_referenced_inode(struct page *page) { struct address_space *mapping = page->mapping; struct vm_area_struct *vma; + struct prio_tree_iter iter; + unsigned long loffset; int referenced = 0; BUG_ON(PageSwapCache(page)); @@ -173,11 +181,22 @@ page_referenced_inode(struct page *page) if (unlikely(down_trylock(&mapping->i_shared_sem))) goto out; - list_for_each_entry(vma, &mapping->i_mmap, shared) + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + + vma = __vma_prio_tree_first(&mapping->i_mmap, &iter, loffset, loffset); + while (vma) { referenced += page_referenced_one(vma, page); + vma = __vma_prio_tree_next(vma, &mapping->i_mmap, &iter, + loffset, loffset); + } - list_for_each_entry(vma, &mapping->i_mmap_shared, shared) + vma = __vma_prio_tree_first(&mapping->i_mmap_shared, &iter, loffset, + loffset); + while (vma) { referenced += page_referenced_one(vma, page); + vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared, &iter, + loffset, loffset); + } up(&mapping->i_shared_sem); out: @@ -588,6 +607,8 @@ try_to_unmap_inode(struct page *page) { struct address_space *mapping = page->mapping; struct vm_area_struct *vma; + struct prio_tree_iter iter; + unsigned long loffset; int ret = SWAP_AGAIN, young = 0; BUG_ON(PageSwapCache(page)); @@ -595,13 +616,28 @@ try_to_unmap_inode(struct page *page) if (unlikely(down_trylock(&mapping->i_shared_sem))) return ret; - list_for_each_entry(vma, &mapping->i_mmap, shared) { + loffset = (page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT)); + + vma = __vma_prio_tree_first(&mapping->i_mmap, &iter, loffset, loffset); + while (vma) { + ret = try_to_unmap_one(vma, page, &young); + if (ret == SWAP_FAIL || !page->mapcount) + goto out; + vma = __vma_prio_tree_next(vma, &mapping->i_mmap, &iter, + loffset, loffset); + } + + vma = 
__vma_prio_tree_first(&mapping->i_mmap_shared, &iter, loffset, + loffset); + while (vma) { ret = try_to_unmap_one(vma, page, &young); if (ret == SWAP_FAIL || !page->mapcount) goto out; + vma = __vma_prio_tree_next(vma, &mapping->i_mmap_shared, &iter, + loffset, loffset); } - list_for_each_entry(vma, &mapping->i_mmap_shared, shared) { + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { ret = try_to_unmap_one(vma, page, &young); if (ret == SWAP_FAIL || !page->mapcount) goto out; diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/prio_tree.c x/mm/prio_tree.c --- x-ref/mm/prio_tree.c 1970-01-01 01:00:00.000000000 +0100 +++ x/mm/prio_tree.c 2004-04-04 06:51:49.444884856 +0200 @@ -0,0 +1,577 @@ +/* + * mm/prio_tree.c - priority search tree for mapping->i_mmap{,_shared} + * + * Copyright (C) 2004, Rajesh Venkatasubramanian + * + * Based on the radix priority search tree proposed by Edward M. McCreight + * SIAM Journal of Computing, vol. 14, no.2, pages 257-276, May 1985 + * + * 02Feb2004 Initial version + */ + +#include +#include +#include +#include + +/* + * A clever mix of heap and radix trees forms a radix priority search tree (PST) + * which is useful for storing intervals, e.g, we can consider a vma as a closed + * interval of file pages [offset_begin, offset_end], and store all vmas that + * map a file in a PST. Then, using the PST, we can answer a stabbing query, + * i.e., selecting a set of stored intervals (vmas) that overlap with (map) a + * given input interval X (a set of consecutive file pages), in "O(log n + m)" + * time where 'log n' is the height of the PST, and 'm' is the number of stored + * intervals (vmas) that overlap (map) with the input interval X (the set of + * consecutive file pages). + * + * In our implementation, we store closed intervals of the form [radix_index, + * heap_index]. We assume that always radix_index <= heap_index. McCreight's PST + * is designed for storing intervals with unique radix indices, i.e., each + * interval have different radix_index. However, this limitation can be easily + * overcome by using the size, i.e., heap_index - radix_index, as part of the + * index, so we index the tree using [(radix_index,size), heap_index]. + * + * When the above-mentioned indexing scheme is used, theoretically, in a 32 bit + * machine, the maximum height of a PST can be 64. We can use a balanced version + * of the priority search tree to optimize the tree height, but the balanced + * tree proposed by McCreight is too complex and memory-hungry for our purpose. + */ + +static unsigned long index_bits_to_maxindex[BITS_PER_LONG]; + +/* + * Maximum heap_index that can be stored in a PST with index_bits bits + */ +static inline unsigned long prio_tree_maxindex(unsigned int bits) +{ + return index_bits_to_maxindex[bits - 1]; +} + +/* + * Extend a priority search tree so that it can store a node with heap_index + * max_heap_index. In the worst case, this algorithm takes O((log n)^2). + * However, this function is used rarely and the common case performance is + * not bad. 
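For reference, a standalone sketch (not part of the patch) of the index_bits to maximum-heap_index mapping that prio_tree_init(), at the end of this file, precomputes into index_bits_to_maxindex[]: with index_bits bits the tree can hold any heap_index up to 2^index_bits - 1, and the last slot saturates to ~0UL:

#include <stdio.h>

#define DEMO_BITS_PER_LONG (8 * sizeof(unsigned long))

static unsigned long index_bits_to_maxindex[DEMO_BITS_PER_LONG];

int main(void)
{
	unsigned int i;

	/* same loop as the patch's prio_tree_init() */
	for (i = 0; i < DEMO_BITS_PER_LONG - 1; i++)
		index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1;
	index_bits_to_maxindex[DEMO_BITS_PER_LONG - 1] = ~0UL;

	/* index_bits = 1 -> 1, 2 -> 3, 3 -> 7, ... */
	for (i = 1; i <= 3; i++)
		printf("index_bits=%u covers heap_index <= %lu\n",
		       i, index_bits_to_maxindex[i - 1]);
	return 0;
}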
+ */ +static struct prio_tree_node *prio_tree_expand(struct prio_tree_root *root, + struct prio_tree_node *node, unsigned long max_heap_index) +{ + struct prio_tree_node *first = NULL, *prev, *last = NULL; + + if (max_heap_index > prio_tree_maxindex(root->index_bits)) + root->index_bits++; + + while (max_heap_index > prio_tree_maxindex(root->index_bits)) { + root->index_bits++; + + if (prio_tree_empty(root)) + continue; + + if (first == NULL) { + first = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(first); + last = first; + } + else { + prev = last; + last = root->prio_tree_node; + prio_tree_remove(root, root->prio_tree_node); + INIT_PRIO_TREE_NODE(last); + prev->left = last; + last->parent = prev; + } + } + + INIT_PRIO_TREE_NODE(node); + + if (first) { + node->left = first; + first->parent = node; + } + else + last = node; + + if (!prio_tree_empty(root)) { + last->left = root->prio_tree_node; + last->left->parent = last; + } + + root->prio_tree_node = node; + return node; +} + +/* + * Replace a prio_tree_node with a new node and return the old node + */ +static inline struct prio_tree_node *prio_tree_replace( + struct prio_tree_root *root, struct prio_tree_node *old, + struct prio_tree_node *node) +{ + INIT_PRIO_TREE_NODE(node); + + if (prio_tree_root(old)) { + BUG_ON(root->prio_tree_node != old); + /* + * We can reduce root->index_bits here. However, it is complex + * and does not help much to improve performance (IMO). + */ + node->parent = node; + root->prio_tree_node = node; + } + else { + node->parent = old->parent; + if (old->parent->left == old) + old->parent->left = node; + else { + BUG_ON(old->parent->right != old); + old->parent->right = node; + } + } + + if (!prio_tree_left_empty(old)) { + node->left = old->left; + old->left->parent = node; + } + + if (!prio_tree_right_empty(old)) { + node->right = old->right; + old->right->parent = node; + } + + return old; +} + +#undef swap +#define swap(x,y,z) do {z = x; x = y; y = z; } while (0) + +/* + * Insert a prio_tree_node @node into a radix priority search tree @root. The + * algorithm typically takes O(log n) time where 'log n' is the number of bits + * required to represent the maximum heap_index. In the worst case, the algo + * can take O((log n)^2) - check prio_tree_expand. + * + * If a prior node with same radix_index and heap_index is already found in + * the tree, then returns the address of the prior node. Otherwise, inserts + * @node into the tree and returns @node. 
+ */ + +struct prio_tree_node *prio_tree_insert(struct prio_tree_root *root, + struct prio_tree_node *node) +{ + struct prio_tree_node *cur, *res = node; + unsigned long radix_index, heap_index; + unsigned long r_index, h_index, index, mask; + int size_flag = 0; + + GET_INDEX(node, radix_index, heap_index); + + if (prio_tree_empty(root) || + heap_index > prio_tree_maxindex(root->index_bits)) + return prio_tree_expand(root, node, heap_index); + + cur = root->prio_tree_node; + mask = 1UL << (root->index_bits - 1); + + while (mask) { + GET_INDEX(cur, r_index, h_index); + + if (r_index == radix_index && h_index == heap_index) + return cur; + + if (h_index < heap_index || (h_index == heap_index && + r_index > radix_index)) + { + struct prio_tree_node *tmp = node; + node = prio_tree_replace(root, cur, node); + cur = tmp; + swap(r_index, radix_index, index); + swap(h_index, heap_index, index); + } + + if (size_flag) + index = heap_index - radix_index; + else + index = radix_index; + + if (index & mask) { + if (prio_tree_right_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->right = node; + node->parent = cur; + return res; + } + else + cur = cur->right; + } + else { + if (prio_tree_left_empty(cur)) { + INIT_PRIO_TREE_NODE(node); + cur->left = node; + node->parent = cur; + return res; + } + else + cur = cur->left; + } + + mask >>= 1; + + if (!mask) { + mask = 1UL << (root->index_bits - 1); + size_flag = 1; + } + } + /* Should not reach here */ + BUG(); + return NULL; +} + +/* + * Remove a prio_tree_node @node from a radix priority search tree @root. The + * algorithm takes O(log n) time where 'log n' is the number of bits required + * to represent the maximum heap_index. + */ + +void prio_tree_remove(struct prio_tree_root *root, struct prio_tree_node *node) +{ + struct prio_tree_node *cur; + unsigned long r_index, h_index_right, h_index_left; + + cur = node; + + while (!prio_tree_left_empty(cur) || !prio_tree_right_empty(cur)) { + if (!prio_tree_left_empty(cur)) + GET_INDEX(cur->left, r_index, h_index_left); + else { + cur = cur->right; + continue; + } + + if (!prio_tree_right_empty(cur)) + GET_INDEX(cur->right, r_index, h_index_right); + else { + cur = cur->left; + continue; + } + + /* both h_index_left and h_index_right cannot be 0 */ + if (h_index_left >= h_index_right) + cur = cur->left; + else + cur = cur->right; + } + + if (prio_tree_root(cur)) { + BUG_ON(root->prio_tree_node != cur); + *root = PRIO_TREE_ROOT; + return; + } + + if (cur->parent->right == cur) + cur->parent->right = cur->parent; + else { + BUG_ON(cur->parent->left != cur); + cur->parent->left = cur->parent; + } + + while (cur != node) + cur = prio_tree_replace(root, cur->parent, cur); + + return; +} + +/* + * Following functions help to enumerate all prio_tree_nodes in the tree that + * overlap with the input interval X [radix_index, heap_index]. The enumeration + * takes O(log n + m) time where 'log n' is the height of the tree (which is + * proportional to # of bits required to represent the maximum heap_index) and + * 'm' is the number of prio_tree_nodes that overlap the interval X. 
+ */ + +static inline struct prio_tree_node *__prio_tree_left( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long radix_index, unsigned long heap_index, + unsigned long *r_index, unsigned long *h_index) +{ + if (prio_tree_left_empty(iter->cur)) + return NULL; + + GET_INDEX(iter->cur->left, *r_index, *h_index); + + if (radix_index <= *h_index) { + iter->cur = iter->cur->left; + iter->mask >>= 1; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } + else { + iter->size_level = 1; + iter->mask = 1UL << (root->index_bits - 1); + } + return iter->cur; + } + + return NULL; +} + + +static inline struct prio_tree_node *__prio_tree_right( + struct prio_tree_root *root, struct prio_tree_iter *iter, + unsigned long radix_index, unsigned long heap_index, + unsigned long *r_index, unsigned long *h_index) +{ + unsigned long value; + + if (prio_tree_right_empty(iter->cur)) + return NULL; + + if (iter->size_level) + value = iter->value; + else + value = iter->value | iter->mask; + + if (heap_index < value) + return NULL; + + GET_INDEX(iter->cur->right, *r_index, *h_index); + + if (radix_index <= *h_index) { + iter->cur = iter->cur->right; + iter->mask >>= 1; + iter->value = value; + if (iter->mask) { + if (iter->size_level) + iter->size_level++; + } + else { + iter->size_level = 1; + iter->mask = 1UL << (root->index_bits - 1); + } + return iter->cur; + } + + return NULL; +} + +static inline struct prio_tree_node *__prio_tree_parent( + struct prio_tree_iter *iter) +{ + iter->cur = iter->cur->parent; + iter->mask <<= 1; + if (iter->size_level) { + if (iter->size_level == 1) + iter->mask = 1UL; + iter->size_level--; + } + else if (iter->value & iter->mask) + iter->value ^= iter->mask; + return iter->cur; +} + +static inline int overlap(unsigned long radix_index, unsigned long heap_index, + unsigned long r_index, unsigned long h_index) +{ + if (heap_index < r_index || radix_index > h_index) + return 0; + + return 1; +} + +/* + * prio_tree_first: + * + * Get the first prio_tree_node that overlaps with the interval [radix_index, + * heap_index]. Note that always radix_index <= heap_index. We do a pre-order + * traversal of the tree. 
+ */ +struct prio_tree_node *prio_tree_first(struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long radix_index, + unsigned long heap_index) +{ + unsigned long r_index, h_index; + + *iter = PRIO_TREE_ITER; + + if (prio_tree_empty(root)) + return NULL; + + GET_INDEX(root->prio_tree_node, r_index, h_index); + + if (radix_index > h_index) + return NULL; + + iter->mask = 1UL << (root->index_bits - 1); + iter->cur = root->prio_tree_node; + + while (1) { + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + if (__prio_tree_left(root, iter, radix_index, heap_index, + &r_index, &h_index)) + continue; + + if (__prio_tree_right(root, iter, radix_index, heap_index, + &r_index, &h_index)) + continue; + + break; + } + return NULL; +} +EXPORT_SYMBOL(prio_tree_first); + +/* Get the next prio_tree_node that overlaps with the input interval in iter */ +struct prio_tree_node *prio_tree_next(struct prio_tree_root *root, + struct prio_tree_iter *iter, unsigned long radix_index, + unsigned long heap_index) +{ + unsigned long r_index, h_index; + +repeat: + while (__prio_tree_left(root, iter, radix_index, heap_index, + &r_index, &h_index)) + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + while (!__prio_tree_right(root, iter, radix_index, heap_index, + &r_index, &h_index)) { + while (!prio_tree_root(iter->cur) && + iter->cur->parent->right == iter->cur) + __prio_tree_parent(iter); + + if (prio_tree_root(iter->cur)) + return NULL; + + __prio_tree_parent(iter); + } + + if (overlap(radix_index, heap_index, r_index, h_index)) + return iter->cur; + + goto repeat; +} +EXPORT_SYMBOL(prio_tree_next); + +/* + * Radix priority search tree for address_space->i_mmap_{_shared} + * + * For each vma that map a unique set of file pages i.e., unique [radix_index, + * heap_index] value, we have a corresponing priority search tree node. If + * multiple vmas have identical [radix_index, heap_index] value, then one of + * them is used as a tree node and others are stored in a vm_set list. The tree + * node points to the first vma (head) of the list using vm_set_head. + * + * prio_tree_root + * | + * A vm_set_head + * / \ / + * L R -> H-I-J-K-M-N-O-P-Q-S + * ^ ^ <-- vm_set.list --> + * tree nodes + * + * We need some way to identify whether a vma is a tree node, head of a vm_set + * list, or just a member of a vm_set list. We cannot use vm_flags to store + * such information. The reason is, in the above figure, it is possible that + * vm_flags' of R and H are covered by the different mmap_sems. When R is + * removed under R->mmap_sem, H replaces R as a tree node. Since we do not hold + * H->mmap_sem, we cannot use H->vm_flags for marking that H is a tree node now. + * That's why some trick involving shared.both.parent is used for identifying + * tree nodes and list head nodes. We can possibly use the least significant + * bit of the vm_set_head field to mark tree and list head nodes. I was worried + * about the alignment of vm_area_struct in various architectures. 
+ * + * vma radix priority search tree node rules: + * + * vma->shared.both.parent != NULL ==> a tree node + * + * vma->shared.both.parent == NULL + * vma->vm_set_head != NULL ==> list head of vmas that map same pages + * vma->vm_set_head == NULL ==> a list node + */ + +void __vma_prio_tree_insert(struct prio_tree_root *root, + struct vm_area_struct *vma) +{ + struct prio_tree_node *ptr; + struct vm_area_struct *old; + + ptr = prio_tree_insert(root, &vma->shared.prio_tree_node); + + if (ptr == &vma->shared.prio_tree_node) { + vma->vm_set_head = NULL; + return; + } + + old = prio_tree_entry(ptr, struct vm_area_struct, + shared.prio_tree_node); + + __vma_prio_tree_add(vma, old); +} + +void __vma_prio_tree_remove(struct prio_tree_root *root, + struct vm_area_struct *vma) +{ + struct vm_area_struct *node, *head, *new_head; + + if (vma->shared.both.parent == NULL && vma->vm_set_head == NULL) { + list_del_init(&vma->shared.vm_set.list); + INIT_VMA_SHARED(vma); + return; + } + + if (vma->vm_set_head) { + /* Leave this BUG_ON till prio_tree patch stabilizes */ + BUG_ON(vma->vm_set_head->vm_set_head != vma); + if (vma->shared.both.parent) { + head = vma->vm_set_head; + if (!list_empty(&head->shared.vm_set.list)) { + new_head = list_entry( + head->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&head->shared.vm_set.list); + } + else + new_head = NULL; + + prio_tree_replace(root, &vma->shared.prio_tree_node, + &head->shared.prio_tree_node); + head->vm_set_head = new_head; + if (new_head) + new_head->vm_set_head = head; + + } + else { + node = vma->vm_set_head; + if (!list_empty(&vma->shared.vm_set.list)) { + new_head = list_entry( + vma->shared.vm_set.list.next, + struct vm_area_struct, + shared.vm_set.list); + list_del_init(&vma->shared.vm_set.list); + node->vm_set_head = new_head; + new_head->vm_set_head = node; + } + else + node->vm_set_head = NULL; + } + INIT_VMA_SHARED(vma); + return; + } + + prio_tree_remove(root, &vma->shared.prio_tree_node); + INIT_VMA_SHARED(vma); +} + +void __init prio_tree_init(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(index_bits_to_maxindex) - 1; i++) + index_bits_to_maxindex[i] = (1UL << (i + 1)) - 1; + index_bits_to_maxindex[ARRAY_SIZE(index_bits_to_maxindex) - 1] = ~0UL; +} diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/shmem.c x/mm/shmem.c --- x-ref/mm/shmem.c 2004-04-04 06:47:29.609385856 +0200 +++ x/mm/shmem.c 2004-04-04 06:51:49.446884552 +0200 @@ -1328,7 +1328,8 @@ static void do_shmem_file_read(struct fi * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared) || + !list_empty(&mapping->i_mmap_nonlinear)) flush_dcache_page(page); /* * Mark the page accessed if we read the beginning. 
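The node/head/member rules documented at the top of this file could be wrapped in the hypothetical predicates sketched below (not something the patch adds, just a restatement of the rules). Note that a vma detached by INIT_VMA_SHARED() also has a NULL parent and NULL vm_set_head, so vma_shared_empty() is what distinguishes it from a real list member:

static inline int vma_is_tree_node(struct vm_area_struct *vma)
{
	return vma->shared.both.parent != NULL;
}

static inline int vma_is_vm_set_head(struct vm_area_struct *vma)
{
	return vma->shared.both.parent == NULL && vma->vm_set_head != NULL;
}

static inline int vma_is_vm_set_member(struct vm_area_struct *vma)
{
	return vma->shared.both.parent == NULL && vma->vm_set_head == NULL &&
	       !vma_shared_empty(vma);
}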
diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/swap_state.c x/mm/swap_state.c --- x-ref/mm/swap_state.c 2004-04-04 06:51:44.584623728 +0200 +++ x/mm/swap_state.c 2004-04-04 06:51:49.446884552 +0200 @@ -28,8 +28,9 @@ struct address_space swapper_space = { .tree_lock = SPIN_LOCK_UNLOCKED, .a_ops = &swap_aops, .backing_dev_info = &swap_backing_dev_info, - .i_mmap = LIST_HEAD_INIT(swapper_space.i_mmap), - .i_mmap_shared = LIST_HEAD_INIT(swapper_space.i_mmap_shared), + .i_mmap = PRIO_TREE_ROOT_INIT, + .i_mmap_shared = PRIO_TREE_ROOT_INIT, + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), .i_shared_sem = __MUTEX_INITIALIZER(swapper_space.i_shared_sem), .truncate_count = ATOMIC_INIT(0), .private_lock = SPIN_LOCK_UNLOCKED, diff -urNp --exclude CVS --exclude BitKeeper --exclude {arch} --exclude .arch-ids x-ref/mm/vmscan.c x/mm/vmscan.c --- x-ref/mm/vmscan.c 2004-04-04 06:51:44.586623424 +0200 +++ x/mm/vmscan.c 2004-04-04 06:51:49.448884248 +0200 @@ -191,9 +191,11 @@ static inline int page_mapping_inuse(str return 1; /* File is mmap'd by somebody. */ - if (!list_empty(&mapping->i_mmap)) + if (!prio_tree_empty(&mapping->i_mmap)) return 1; - if (!list_empty(&mapping->i_mmap_shared)) + if (!prio_tree_empty(&mapping->i_mmap_shared)) + return 1; + if (!list_empty(&mapping->i_mmap_nonlinear)) return 1; return 0;
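Taken together, the page_mapping_inuse() and VN_MAPPED() changes mean the complete "is this file mapped anywhere?" test after this patch is the three-way check sketched below (mapping_mapped_anywhere is a hypothetical helper name, used only for illustration):

static inline int mapping_mapped_anywhere(struct address_space *mapping)
{
	return !prio_tree_empty(&mapping->i_mmap) ||
	       !prio_tree_empty(&mapping->i_mmap_shared) ||
	       !list_empty(&mapping->i_mmap_nonlinear);
}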