From: "V. Rajesh" If a vma is already present in an i_mmap list of a mapping, then it is racy to update the vm_start, vm_end, and vm_pgoff members of the vma without holding the mapping's i_shared_sem. This is because the updates can race with invalidate_mmap_range_list. I audited all the places that assign vm_start, vm_end, and vm_pgoff. AFAIK, the following is the list of questionable places: 1) This patch fixes the racy split_vma. Kernel 2.4 does the right thing, but the following changesets introduced a race. http://linux.bkbits.net:8080/linux-2.5/patch@1.536.34.4 http://linux.bkbits.net:8080/linux-2.5/patch@1.536.34.5 You can use the patch and programs in the following URL to trigger the race. http://www-personal.engin.umich.edu/~vrajesh/linux/truncate-race/ 2) This patch also closes a small racy window in vma_merge. 3) In a few cases vma_merge and do_mremap expand a vma by adding extra length to vm_end without holding i_shared_sem. I think that's fine. 4) In arch/sparc64, vm_end is updated without holding i_shared_sem. Check make_hugetlb_page_present. I hope that is fine, but I am not sure. mm/filemap.c | 3 ++ mm/mmap.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 57 insertions(+), 11 deletions(-) diff -puN mm/mmap.c~vma-split-truncate-race-fix-2 mm/mmap.c --- 25/mm/mmap.c~vma-split-truncate-race-fix-2 2003-10-17 17:52:12.000000000 -0700 +++ 25-akpm/mm/mmap.c 2003-10-18 16:26:29.000000000 -0700 @@ -280,6 +280,26 @@ static void vma_link(struct mm_struct *m } /* + * Insert vm structure into process list sorted by address and into the inode's + * i_mmap ring. The caller should hold mm->page_table_lock and + * ->f_mapping->i_shared_sem if vm_file is non-NULL. 
+ */ +static void +__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma) +{ + struct vm_area_struct * __vma, * prev; + struct rb_node ** rb_link, * rb_parent; + + __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); + if (__vma && __vma->vm_start < vma->vm_end) + BUG(); + __vma_link(mm, vma, prev, rb_link, rb_parent); + mark_mm_hugetlb(mm, vma); + mm->map_count++; + validate_mm(mm); +} + +/* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ @@ -351,7 +371,9 @@ static int vma_merge(struct mm_struct *m unsigned long end, unsigned long vm_flags, struct file *file, unsigned long pgoff) { - spinlock_t * lock = &mm->page_table_lock; + spinlock_t *lock = &mm->page_table_lock; + struct inode *inode = file ? file->f_dentry->d_inode : NULL; + struct semaphore *i_shared_sem; /* * We later require that vma->vm_flags == vm_flags, so this tests @@ -360,6 +382,8 @@ static int vma_merge(struct mm_struct *m if (vm_flags & VM_SPECIAL) return 0; + i_shared_sem = file ? &inode->i_mapping->i_shared_sem : NULL; + if (!prev) { prev = rb_entry(rb_parent, struct vm_area_struct, vm_rb); goto merge_next; @@ -372,12 +396,11 @@ static int vma_merge(struct mm_struct *m is_mergeable_vma(prev, file, vm_flags) && can_vma_merge_after(prev, vm_flags, file, pgoff)) { struct vm_area_struct *next; - struct inode *inode = file ? 
file->f_dentry->d_inode : NULL; int need_up = 0; if (unlikely(file && prev->vm_next && prev->vm_next->vm_file == file)) { - down(&inode->i_mapping->i_shared_sem); + down(i_shared_sem); need_up = 1; } spin_lock(lock); @@ -395,7 +418,7 @@ static int vma_merge(struct mm_struct *m __remove_shared_vm_struct(next, inode); spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + up(i_shared_sem); if (file) fput(file); @@ -405,7 +428,7 @@ static int vma_merge(struct mm_struct *m } spin_unlock(lock); if (need_up) - up(&inode->i_mapping->i_shared_sem); + up(i_shared_sem); return 1; } @@ -419,10 +442,14 @@ static int vma_merge(struct mm_struct *m pgoff, (end - addr) >> PAGE_SHIFT)) return 0; if (end == prev->vm_start) { + if (file) + down(i_shared_sem); spin_lock(lock); prev->vm_start = addr; prev->vm_pgoff -= (end - addr) >> PAGE_SHIFT; spin_unlock(lock); + if (file) + up(i_shared_sem); return 1; } } @@ -1142,6 +1169,7 @@ int split_vma(struct mm_struct * mm, str unsigned long addr, int new_below) { struct vm_area_struct *new; + struct address_space *mapping = NULL; if (mm->map_count >= MAX_MAP_COUNT) return -ENOMEM; @@ -1155,12 +1183,9 @@ int split_vma(struct mm_struct * mm, str INIT_LIST_HEAD(&new->shared); - if (new_below) { + if (new_below) new->vm_end = addr; - vma->vm_start = addr; - vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); - } else { - vma->vm_end = addr; + else { new->vm_start = addr; new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } @@ -1171,7 +1196,25 @@ int split_vma(struct mm_struct * mm, str if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - insert_vm_struct(mm, new); + if (vma->vm_file) + mapping = vma->vm_file->f_dentry->d_inode->i_mapping; + + if (mapping) + down(&mapping->i_shared_sem); + spin_lock(&mm->page_table_lock); + + if (new_below) { + vma->vm_start = addr; + vma->vm_pgoff += ((addr - new->vm_start) >> PAGE_SHIFT); + } else + vma->vm_end = addr; + + __insert_vm_struct(mm, new); + + 
spin_unlock(&mm->page_table_lock); + if (mapping) + up(&mapping->i_shared_sem); + return 0; } diff -puN mm/filemap.c~vma-split-truncate-race-fix-2 mm/filemap.c --- 25/mm/filemap.c~vma-split-truncate-race-fix-2 2003-10-17 17:52:12.000000000 -0700 +++ 25-akpm/mm/filemap.c 2003-10-18 16:26:15.000000000 -0700 @@ -61,6 +61,9 @@ * ->swap_device_lock (exclusive_swap_page, others) * ->mapping->page_lock * + * ->i_sem + * ->i_shared_sem (truncate->invalidate_mmap_range) + * * ->mmap_sem * ->i_shared_sem (various places) * _