From: Manfred Spraul Description: Right now kmem_cache_create automatically decides about the alignment of allocated objects. The automatic decisions are sometimes wrong: - for some objects, it's better to keep them as small as possible to reduce the memory usage. Ingo already added a parameter to kmem_cache_create for the sigqueue cache, but it wasn't implemented. - for s390, normal kmalloc must be 8-byte aligned. With debugging enabled, the default allocation was 4-bytes. This means that s390 cannot enable slab debugging. - arm26 needs 1 kB aligned objects. Previously this was impossible to generate, therefore arm has its own allocator in arm26/machine/small_page.c - most objects should be cache line aligned, to avoid false sharing. But the cache line size was set at compile time, often to 128 bytes for generic kernels. This wastes memory. The new code uses the runtime determined cache line size instead. - some caches want an explicit alignment. One example are the pte_chain objects: they must find the start of the object with addr&mask. Right now pte_chain objects are scaled to the cache line size, because that was the only alignment that could be generated reliably. The implementation reuses the "offset" parameter of kmem_cache_create and now uses it to pass in the requested alignment. offset was ignored by the current implementation, and the only user I found is sigqueue, which intended to set the alignment. In the long run, it might be interesting for the main tree: due to the 128 byte alignment, only 7 inodes fit into one page, with 64-byte alignment, 9 inodes - 20% memory recovered for Athlon systems. For generic kernels running on P6 cpus (i.e. 32 byte cachelines), it means Number of objects per page: ext2_inode_cache: 8 instead of 7 ext3_inode_cache: 8 instead of 7 fat_inode_cache: 9 instead of 7 rpc_tasks: 24 instead of 15 tcp_tw_bucket: 40 instead of 30 arp_cache: 40 instead of 30 nfs_write_data: 9 instead of 7 DESC slab-alignment-rework merge fix EDESC From: Manfred Spraul The actual bug is that you've dropped one L1_CACHE_ALIGN/ALIGN change in kmem_cache_create: This increased the size of the control structure in each slab, which caused cache_grow to place 4112 bytes payload into each page. This overwrote the next page, and caused random crashes. Nasty one - it disappeared after I enabled slab debugging, because that changed the object size. --- 25-akpm/arch/i386/mm/init.c | 14 +-- 25-akpm/include/asm-i386/processor.h | 2 25-akpm/kernel/fork.c | 7 + 25-akpm/mm/rmap.c | 2 25-akpm/mm/slab.c | 135 ++++++++++++++++++++--------------- 5 files changed, 94 insertions(+), 66 deletions(-) diff -puN arch/i386/mm/init.c~slab-alignment-rework arch/i386/mm/init.c --- 25/arch/i386/mm/init.c~slab-alignment-rework 2004-03-17 10:33:58.678087960 -0800 +++ 25-akpm/arch/i386/mm/init.c 2004-03-17 10:34:07.528742456 -0800 @@ -528,13 +528,13 @@ struct kmem_cache_s *pae_pgd_cachep; void __init pgtable_cache_init(void) { - /* - * PAE pgds must be 16-byte aligned: - */ - pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0, - SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); - if (!pae_pgd_cachep) - panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); + /* + * PAE pgds must be 32-byte aligned: + */ + pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 32, 0, + NULL, NULL); + if (!pae_pgd_cachep) + panic("init_pae(): Cannot alloc pae_pgd SLAB cache"); } #endif diff -puN include/asm-i386/processor.h~slab-alignment-rework include/asm-i386/processor.h --- 25/include/asm-i386/processor.h~slab-alignment-rework 2004-03-17 10:33:58.680087656 -0800 +++ 25-akpm/include/asm-i386/processor.h 2004-03-17 10:33:58.688086440 -0800 @@ -403,6 +403,8 @@ struct tss_struct { unsigned long stack[64]; } __attribute__((packed)); +#define ARCH_MIN_TASKALIGN 16 + struct thread_struct { /* cached TLS descriptors. */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; diff -puN kernel/fork.c~slab-alignment-rework kernel/fork.c --- 25/kernel/fork.c~slab-alignment-rework 2004-03-17 10:33:58.682087352 -0800 +++ 25-akpm/kernel/fork.c 2004-03-17 10:33:58.689086288 -0800 @@ -210,11 +210,14 @@ EXPORT_SYMBOL(autoremove_wake_function); void __init fork_init(unsigned long mempages) { #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR +#ifndef ARCH_MIN_TASKALIGN +#define ARCH_MIN_TASKALIGN 0 +#endif /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", - sizeof(struct task_struct),0, - SLAB_MUST_HWCACHE_ALIGN, NULL, NULL); + sizeof(struct task_struct),ARCH_MIN_TASKALIGN, + 0, NULL, NULL); if (!task_struct_cachep) panic("fork_init(): cannot create task_struct SLAB cache"); #endif diff -puN mm/rmap.c~slab-alignment-rework mm/rmap.c --- 25/mm/rmap.c~slab-alignment-rework 2004-03-17 10:33:58.683087200 -0800 +++ 25-akpm/mm/rmap.c 2004-03-17 10:33:58.689086288 -0800 @@ -525,8 +525,8 @@ void __init pte_chain_init(void) { pte_chain_cache = kmem_cache_create( "pte_chain", sizeof(struct pte_chain), + sizeof(struct pte_chain), 0, - SLAB_MUST_HWCACHE_ALIGN, pte_chain_ctor, NULL); diff -puN mm/slab.c~slab-alignment-rework mm/slab.c --- 25/mm/slab.c~slab-alignment-rework 2004-03-17 10:33:58.685086896 -0800 +++ 25-akpm/mm/slab.c 2004-03-17 10:34:07.530742152 -0800 @@ -121,6 +121,14 @@ /* Shouldn't this be in a header file somewhere? */ #define BYTES_PER_WORD sizeof(void *) +#ifndef cache_line_size +#define cache_line_size() L1_CACHE_BYTES +#endif + +#ifndef ARCH_KMALLOC_MINALIGN +#define ARCH_KMALLOC_MINALIGN 0 +#endif + /* Legal flag mask for kmem_cache_create(). */ #if DEBUG # define CREATE_MASK (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \ @@ -268,6 +276,7 @@ struct kmem_cache_s { unsigned int colour_off; /* colour offset */ unsigned int colour_next; /* cache colouring */ kmem_cache_t *slabp_cache; + unsigned int slab_size; unsigned int dflags; /* dynamic flags */ /* constructor func */ @@ -490,8 +499,10 @@ static kmem_cache_t cache_cache = { .objsize = sizeof(kmem_cache_t), .flags = SLAB_NO_REAP, .spinlock = SPIN_LOCK_UNLOCKED, - .colour_off = L1_CACHE_BYTES, .name = "kmem_cache", +#if DEBUG + .reallen = sizeof(kmem_cache_t), +#endif }; /* Guard access to the cache-chain. */ @@ -535,7 +546,7 @@ static inline struct array_cache *ac_dat } /* Cal the num objs, wastage, and bytes left over for a given slab size. */ -static void cache_estimate (unsigned long gfporder, size_t size, +static void cache_estimate (unsigned long gfporder, size_t size, size_t align, int flags, size_t *left_over, unsigned int *num) { int i; @@ -548,7 +559,7 @@ static void cache_estimate (unsigned lon extra = sizeof(kmem_bufctl_t); } i = 0; - while (i*size + L1_CACHE_ALIGN(base+i*extra) <= wastage) + while (i*size + ALIGN(base+i*extra, align) <= wastage) i++; if (i > 0) i--; @@ -558,7 +569,7 @@ static void cache_estimate (unsigned lon *num = i; wastage -= i*size; - wastage -= L1_CACHE_ALIGN(base+i*extra); + wastage -= ALIGN(base+i*extra, align); *left_over = wastage; } @@ -688,16 +699,20 @@ void __init kmem_cache_init(void) init_MUTEX(&cache_chain_sem); INIT_LIST_HEAD(&cache_chain); list_add(&cache_cache.next, &cache_chain); + cache_cache.colour_off = cache_line_size(); cache_cache.array[smp_processor_id()] = &initarray_cache.cache; - cache_estimate(0, cache_cache.objsize, 0, - &left_over, &cache_cache.num); + cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size()); + + cache_estimate(0, cache_cache.objsize, cache_line_size(), 0, + &left_over, &cache_cache.num); if (!cache_cache.num) BUG(); cache_cache.colour = left_over/cache_cache.colour_off; cache_cache.colour_next = 0; - + cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), cache_line_size()); /* 2+3) create the kmalloc caches */ sizes = malloc_sizes; @@ -711,7 +726,7 @@ void __init kmem_cache_init(void) * allow tighter packing of the smaller caches. */ sizes->cs_cachep = kmem_cache_create( names->name, sizes->cs_size, - 0, SLAB_HWCACHE_ALIGN, NULL, NULL); + ARCH_KMALLOC_MINALIGN, 0, NULL, NULL); if (!sizes->cs_cachep) BUG(); @@ -723,7 +738,7 @@ void __init kmem_cache_init(void) sizes->cs_dmacachep = kmem_cache_create( names->name_dma, sizes->cs_size, - 0, SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL); + ARCH_KMALLOC_MINALIGN, SLAB_CACHE_DMA, NULL, NULL); if (!sizes->cs_dmacachep) BUG(); @@ -1039,7 +1054,7 @@ static void slab_destroy (kmem_cache_t * * kmem_cache_create - Create a cache. * @name: A string which is used in /proc/slabinfo to identify this cache. * @size: The size of objects to be created in this cache. - * @offset: The offset to use within the page. + * @align: The required alignment for the objects. * @flags: SLAB flags * @ctor: A constructor for the objects. * @dtor: A destructor for the objects. @@ -1064,16 +1079,15 @@ static void slab_destroy (kmem_cache_t * * %SLAB_NO_REAP - Don't automatically reap this cache when we're under * memory pressure. * - * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware - * cacheline. This can be beneficial if you're counting cycles as closely - * as davem. + * %SLAB_HWCACHE_ALIGN - This flag has no effect and will be removed soon. + * */ kmem_cache_t * -kmem_cache_create (const char *name, size_t size, size_t offset, +kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long), void (*dtor)(void*, kmem_cache_t *, unsigned long)) { - size_t left_over, align, slab_size; + size_t left_over, slab_size; kmem_cache_t *cachep = NULL; /* @@ -1084,7 +1098,7 @@ kmem_cache_create (const char *name, siz (size < BYTES_PER_WORD) || (size > (1< size)) { + (align < 0)) { printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__, name); BUG(); @@ -1101,22 +1115,16 @@ kmem_cache_create (const char *name, siz #if FORCED_DEBUG /* - * Enable redzoning and last user accounting, except - * - for caches with forced alignment: redzoning would violate the - * alignment - * - for caches with large objects, if the increased size would - * increase the object size above the next power of two: caches - * with object sizes just above a power of two have a significant - * amount of internal fragmentation + * Enable redzoning and last user accounting, except for caches with + * large objects, if the increased size would increase the object size + * above the next power of two: caches with object sizes just above a + * power of two have a significant amount of internal fragmentation. */ - if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)) - && !(flags & SLAB_MUST_HWCACHE_ALIGN)) { + if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD))) flags |= SLAB_RED_ZONE|SLAB_STORE_USER; - } flags |= SLAB_POISON; #endif #endif - /* * Always checks flags, a caller might be expecting debug * support which isn't available. @@ -1124,15 +1132,23 @@ kmem_cache_create (const char *name, siz if (flags & ~CREATE_MASK) BUG(); + if (align) { + /* minimum supported alignment: */ + if (align < BYTES_PER_WORD) + align = BYTES_PER_WORD; + + /* combinations of forced alignment and advanced debugging is + * not yet implemented. + */ + flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER); + } + /* Get cache's description obj. */ cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL); if (!cachep) goto opps; memset(cachep, 0, sizeof(kmem_cache_t)); -#if DEBUG - cachep->reallen = size; -#endif /* Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. @@ -1143,30 +1159,31 @@ kmem_cache_create (const char *name, siz } #if DEBUG + cachep->reallen = size; + if (flags & SLAB_RED_ZONE) { - /* - * There is no point trying to honour cache alignment - * when redzoning. - */ - flags &= ~SLAB_HWCACHE_ALIGN; + /* redzoning only works with word aligned caches */ + align = BYTES_PER_WORD; + /* add space for red zone words */ cachep->dbghead += BYTES_PER_WORD; size += 2*BYTES_PER_WORD; } if (flags & SLAB_STORE_USER) { - flags &= ~SLAB_HWCACHE_ALIGN; - size += BYTES_PER_WORD; /* add space */ + /* user store requires word alignment and + * one word storage behind the end of the real + * object. + */ + align = BYTES_PER_WORD; + size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size > 128 && cachep->reallen > L1_CACHE_BYTES && size < PAGE_SIZE) { + if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) { cachep->dbghead += PAGE_SIZE - size; size = PAGE_SIZE; } #endif #endif - align = BYTES_PER_WORD; - if (flags & SLAB_HWCACHE_ALIGN) - align = L1_CACHE_BYTES; /* Determine if the slab management is 'on' or 'off' slab. */ if (size >= (PAGE_SIZE>>3)) @@ -1176,13 +1193,16 @@ kmem_cache_create (const char *name, siz */ flags |= CFLGS_OFF_SLAB; - if (flags & SLAB_HWCACHE_ALIGN) { - /* Need to adjust size so that objs are cache aligned. */ - /* Small obj size, can get at least two per cache line. */ + if (!align) { + /* Default alignment: compile time specified l1 cache size. + * Except if an object is really small, then squeeze multiple + * into one cacheline. + */ + align = cache_line_size(); while (size <= align/2) align /= 2; - size = (size+align-1)&(~(align-1)); } + size = ALIGN(size, align); /* Cal size (in pages) of slabs, and the num of objs per slab. * This could be made much more intelligent. For now, try to avoid @@ -1192,7 +1212,7 @@ kmem_cache_create (const char *name, siz do { unsigned int break_flag = 0; cal_wastage: - cache_estimate(cachep->gfporder, size, flags, + cache_estimate(cachep->gfporder, size, align, flags, &left_over, &cachep->num); if (break_flag) break; @@ -1226,7 +1246,8 @@ next: cachep = NULL; goto opps; } - slab_size = L1_CACHE_ALIGN(cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab)); + slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t) + + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then @@ -1237,14 +1258,17 @@ next: left_over -= slab_size; } - /* Offset must be a multiple of the alignment. */ - offset += (align-1); - offset &= ~(align-1); - if (!offset) - offset = L1_CACHE_BYTES; - cachep->colour_off = offset; - cachep->colour = left_over/offset; + if (flags & CFLGS_OFF_SLAB) { + /* really off slab. No need for manual alignment */ + slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab); + } + cachep->colour_off = cache_line_size(); + /* Offset must be a multiple of the alignment. */ + if (cachep->colour_off < align) + cachep->colour_off = align; + cachep->colour = left_over/cachep->colour_off; + cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; if (flags & SLAB_CACHE_DMA) @@ -1512,8 +1536,7 @@ static inline struct slab* alloc_slabmgm return NULL; } else { slabp = objp+colour_off; - colour_off += L1_CACHE_ALIGN(cachep->num * - sizeof(kmem_bufctl_t) + sizeof(struct slab)); + colour_off += cachep->slab_size; } slabp->inuse = 0; slabp->colouroff = colour_off; _