diff -urN 2.3.27pre4/include/linux/swap.h shm/include/linux/swap.h
--- 2.3.27pre4/include/linux/swap.h	Thu Nov 11 18:23:09 1999
+++ shm/include/linux/swap.h	Thu Nov 11 21:20:36 1999
@@ -112,8 +112,10 @@
 extern struct swap_info_struct swap_info[];
 extern int is_swap_partition(kdev_t);
 extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t get_swap_page(void);
-extern void swap_free(swp_entry_t);
+extern swp_entry_t __get_swap_page(unsigned short);
+#define get_swap_page() __get_swap_page(1)
+extern void __swap_free(swp_entry_t, unsigned short);
+#define swap_free(entry) __swap_free((entry), 1)
 
 struct swap_list_t {
 	int head;	/* head of priority-ordered swapfile list */
 	int next;	/* swapfile to be used next */
diff -urN 2.3.27pre4/ipc/shm.c shm/ipc/shm.c
--- 2.3.27pre4/ipc/shm.c	Wed Nov 10 16:59:27 1999
+++ shm/ipc/shm.c	Thu Nov 11 21:58:28 1999
@@ -36,6 +36,7 @@
 	pte_t **shm_dir;	/* ptr to array of ptrs to frames -> SHMMAX */
 	struct vm_area_struct *attaches;	/* descriptors for attaches */
 	int id;			/* backreference to id for shm_close */
+	struct semaphore sem;
 };
 
 static int findkey (key_t key);
@@ -61,6 +62,9 @@
 static unsigned int num_segs = 0;
 static unsigned short shm_seq = 0;	/* incremented, for recognizing stale ids */
 
+/* lock order:
+	shm_lock -> pagecache_lock (end of shm_swap)
+	shp->sem -> other spinlocks (shm_nopage) */
 spinlock_t shm_lock = SPIN_LOCK_UNLOCKED;
 
 /* some statistics */
@@ -260,6 +264,7 @@
 	shp->u.shm_ctime = CURRENT_TIME;
 	shp->shm_npages = numpages;
 	shp->id = id;
+	init_MUTEX(&shp->sem);
 
 	spin_lock(&shm_lock);
 
@@ -770,10 +775,13 @@
 	idx = (address - shmd->vm_start) >> PAGE_SHIFT;
 	idx += shmd->vm_pgoff;
 
+	down(&shp->sem);
 	spin_lock(&shm_lock);
-again:
 	pte = SHM_ENTRY(shp,idx);
 	if (!pte_present(pte)) {
+		/* the page is not present, so shm_swap can't race with us,
+		   and the semaphore protects us from other tasks that could
+		   fault on our pte under us */
 		if (pte_none(pte)) {
 			spin_unlock(&shm_lock);
 			page = get_free_highpage(GFP_HIGHUSER);
@@ -781,8 +789,6 @@
 				goto oom;
 			clear_highpage(page);
 			spin_lock(&shm_lock);
-			if (pte_val(pte) != pte_val(SHM_ENTRY(shp, idx)))
-				goto changed;
 		} else {
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
@@ -803,9 +809,6 @@
 			unlock_kernel();
 			spin_lock(&shm_lock);
 			shm_swp--;
-			pte = SHM_ENTRY(shp, idx);
-			if (pte_present(pte))
-				goto present;
 		}
 		shm_rss++;
 		pte = pte_mkdirty(mk_pte(page, PAGE_SHARED));
@@ -813,21 +816,15 @@
 	} else
 		--current->maj_flt;	/* was incremented in do_no_page */
 
-done:	/* pte_val(pte) == SHM_ENTRY (shp, idx) */
 	get_page(pte_page(pte));
 	spin_unlock(&shm_lock);
+	up(&shp->sem);
 	current->min_flt++;
 	return pte_page(pte);
 
-changed:
-	__free_page(page);
-	goto again;
-present:
-	if (page)
-		free_page_and_swap_cache(page);
-	goto done;
 oom:
+	up(&shp->sem);
 	return NOPAGE_OOM;
 }
 
@@ -851,7 +848,11 @@
 	if (!counter)
 		return 0;
 	lock_kernel();
-	swap_entry = get_swap_page();
+	/* subtle: preload the swap count for the swap cache. We can't
+	   increase the count inside the critical section because we can't
+	   release the shm_lock there, and we can't acquire the big kernel
+	   lock with the shm_lock held (we would deadlock too easily). */
+	swap_entry = __get_swap_page(2);
 	if (!swap_entry.val) {
 		unlock_kernel();
 		return 0;
@@ -893,7 +894,7 @@
 failed:
 	spin_unlock(&shm_lock);
 	lock_kernel();
-	swap_free(swap_entry);
+	__swap_free(swap_entry, 2);
 	unlock_kernel();
 	return 0;
 }
@@ -905,11 +906,16 @@
 	swap_successes++;
 	shm_swp++;
 	shm_rss--;
+
+	/* add the locked page to the swap cache before allowing
+	   the swapin path to run lookup_swap_cache(). This avoids
+	   reading a not-yet-uptodate block from disk.
+	   NOTE: we already accounted the swap-space reference for this
+	   swap cache page at __get_swap_page() time. */
+	add_to_swap_cache(page_map, swap_entry);
 	spin_unlock(&shm_lock);
 
 	lock_kernel();
-	swap_duplicate(swap_entry);
-	add_to_swap_cache(page_map, swap_entry);
 	rw_swap_page(WRITE, page_map, 0);
 	unlock_kernel();
 
diff -urN 2.3.27pre4/mm/swapfile.c shm/mm/swapfile.c
--- 2.3.27pre4/mm/swapfile.c	Sun Nov  7 17:33:38 1999
+++ shm/mm/swapfile.c	Thu Nov 11 21:38:01 1999
@@ -25,7 +25,7 @@
 
 #define SWAPFILE_CLUSTER 256
 
-static inline int scan_swap_map(struct swap_info_struct *si)
+static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
 {
 	unsigned long offset;
 	/*
@@ -73,7 +73,7 @@
 		si->lowest_bit++;
 	if (offset == si->highest_bit)
 		si->highest_bit--;
-	si->swap_map[offset] = 1;
+	si->swap_map[offset] = count;
 	nr_swap_pages--;
 	si->cluster_next = offset+1;
 	return offset;
@@ -81,7 +81,7 @@
 	}
 	return 0;
 }
-swp_entry_t get_swap_page(void)
+swp_entry_t __get_swap_page(unsigned short count)
 {
 	struct swap_info_struct * p;
 	unsigned long offset;
@@ -94,11 +94,13 @@
 		goto out;
 	if (nr_swap_pages == 0)
 		goto out;
+	if (count >= SWAP_MAP_MAX)
+		goto bad_count;
 
 	while (1) {
 		p = &swap_info[type];
 		if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
-			offset = scan_swap_map(p);
+			offset = scan_swap_map(p, count);
 			if (offset) {
 				entry = SWP_ENTRY(type,offset);
 				type = swap_info[type].next;
@@ -123,10 +125,15 @@
 	}
 out:
 	return entry;
+
+bad_count:
+	printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
+	       count, __builtin_return_address(0));
+	goto out;
 }
 
-void swap_free(swp_entry_t entry)
+void __swap_free(swp_entry_t entry, unsigned short count)
 {
 	struct swap_info_struct * p;
 	unsigned long offset, type;
@@ -148,7 +155,9 @@
 	if (!p->swap_map[offset])
 		goto bad_free;
 	if (p->swap_map[offset] < SWAP_MAP_MAX) {
-		if (!--p->swap_map[offset]) {
+		if (p->swap_map[offset] < count)
+			goto bad_count;
+		if (!(p->swap_map[offset] -= count)) {
 			if (offset < p->lowest_bit)
 				p->lowest_bit = offset;
 			if (offset > p->highest_bit)
@@ -170,6 +179,9 @@
 	goto out;
 bad_free:
 	printk("VM: Bad swap entry %08lx\n", entry.val);
+	goto out;
+bad_count:
+	printk(KERN_ERR "VM: Bad count %hd current count %hd\n", count, p->swap_map[offset]);
 	goto out;
 }
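For reference, here is a minimal user-space sketch of the count-preloading idea the
patch introduces: __get_swap_page(2) hands out a swap entry whose reference count
already includes the swap cache's reference, so shm_swap() never has to reacquire the
big kernel lock just to call swap_duplicate(), and the failure path can return both
references with one __swap_free(entry, 2). Everything below (slot_alloc, slot_put,
NSLOTS, SLOT_MAX, the main() driver) is a made-up illustration, not kernel code; only
the counting scheme mirrors the patch.

/*
 * Toy model of the preloaded reference counts used by
 * __get_swap_page()/__swap_free() in this patch.
 */
#include <stdio.h>

#define NSLOTS   8
#define SLOT_MAX 0x7fff			/* plays the role of SWAP_MAP_MAX */

static unsigned short slot_map[NSLOTS];	/* 0 = free, N = N references */

/* Like scan_swap_map(si, count): hand out a free slot with its
 * reference count preloaded to `count` in a single locked operation. */
static int slot_alloc(unsigned short count)
{
	int i;

	if (count >= SLOT_MAX)
		return -1;		/* the bad_count check */
	for (i = 0; i < NSLOTS; i++) {
		if (!slot_map[i]) {
			slot_map[i] = count;	/* swap_map[offset] = count */
			return i;
		}
	}
	return -1;			/* no free slot */
}

/* Like __swap_free(entry, count): drop `count` references at once;
 * the slot becomes free again when its count reaches zero. */
static int slot_put(int slot, unsigned short count)
{
	if (slot_map[slot] < count)
		return -1;		/* the bad_count check */
	slot_map[slot] -= count;
	return slot_map[slot];
}

int main(void)
{
	/* shm_swap() takes two references up front: one that ends up in
	   the shm pte, one that add_to_swap_cache() logically owns. */
	int slot = slot_alloc(2);

	if (slot < 0)
		return 1;
	printf("slot %d allocated, count %hu\n", slot, slot_map[slot]);

	/* failure path of shm_swap(): nothing consumed the references,
	   so both are returned in one call, like __swap_free(entry, 2). */
	slot_put(slot, 2);
	printf("slot %d count after release: %hu\n", slot, slot_map[slot]);
	return 0;
}

The point of preloading is purely a locking one: the second reference is taken while
the allocator's lock is already held, so no later lock acquisition (in the kernel's
case, the big kernel lock for swap_duplicate()) is needed between deciding to swap
the page out and inserting it into the swap cache.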