The slab mechanism in Linux memory management: cache creation


The Linux kernel creates a cache node with the function kmem_cache_create(). The function proceeds in the following steps (a minimal usage sketch follows the list):
1. Allocate the new cache descriptor from the global cache_cache. cache_cache is initialised with an object size of sizeof(struct kmem_cache), so the pointer it returns can be used directly as a cache descriptor; this is the call kmem_cache_zalloc(&cache_cache, gfp).
2. Work out the slab layout: the page order, the number of objects per slab, and the leftover (fragment) space; implemented by calculate_slab_order().
3. Compute and initialise the remaining attributes of the cache. If the slab management structure is kept off-slab, kmem_find_general_cachep(slab_size, 0u) selects cachep->slabp_cache, the general cache that will hold the struct slab together with the kmem_bufctl_t[] array.
4. Set up the per-CPU local caches with setup_cpu_cache().
5. Creation is complete; add the cache to the global slab cache chain (cache_chain).
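
Before walking through the implementation, here is a minimal usage sketch of the API discussed below. The "foo" structure and names are hypothetical and only for illustration; the calls themselves match the prototype shown in section I:

/* Minimal usage sketch: create a cache, allocate and free one object,
 * then destroy the cache. */
#include <linux/slab.h>
#include <linux/errno.h>

struct foo {
	int id;
	char payload[56];
};

static struct kmem_cache *foo_cachep;

static int foo_cache_example(void)
{
	struct foo *obj;

	/* objects are sizeof(struct foo) bytes, aligned to the hardware
	 * cache line, with no constructor */
	foo_cachep = kmem_cache_create("foo_cache", sizeof(struct foo),
				       0, SLAB_HWCACHE_ALIGN, NULL);
	if (!foo_cachep)
		return -ENOMEM;

	/* take one object from the cache and give it back */
	obj = kmem_cache_alloc(foo_cachep, GFP_KERNEL);
	if (obj)
		kmem_cache_free(foo_cachep, obj);

	/* the cache must be destroyed before its owner goes away */
	kmem_cache_destroy(foo_cachep);
	return 0;
}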
I. Main implementation
/**
 * kmem_cache_create - Create a cache.
 * @name: A string which is used in /proc/slabinfo to identify this cache.
 * @size: The size of objects to be created in this cache.
 * @align: The required alignment for the objects.
 * @flags: SLAB flags
 * @ctor: A constructor for the objects.
 *
 * Returns a ptr to the cache on success, NULL on failure.
 * Cannot be called within a int, but can be interrupted.
 * The @ctor is run when new pages are allocated by the cache.
 *
 * @name must be valid until the cache is destroyed. This implies that
 * the module calling this has to destroy the cache before getting unloaded.
 * Note that kmem_cache_name() is not guaranteed to return the same pointer,
 * therefore applications must manage it themselves.
 *
 * The flags are
 *
 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
 * to catch references to uninitialised memory.
 *
 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
 * for buffer overruns.
 *
 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
 * cacheline.  This can be beneficial if you're counting cycles as closely
 * as davem.
 */
 /*
  * Create a slab cache.  At creation time the cache contains no slabs and
  * therefore no objects; slabs are only allocated later, when objects are
  * first requested from the cache and it has to grow.
  */
struct kmem_cache *
kmem_cache_create (const char *name, size_t size, size_t align,
	unsigned long flags, void (*ctor)(void *))
{
	size_t left_over, slab_size, ralign;
	struct kmem_cache *cachep = NULL, *pc;
	gfp_t gfp;

	/*
	 * Sanity checks... these are all serious usage bugs.
	 */
	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
	    size > KMALLOC_MAX_SIZE) {
		printk(KERN_ERR "%s: Early error in slab %s
", __func__, name); BUG(); } /* * We use cache_chain_mutex to ensure a consistent view of * cpu_online_mask as well. Please see cpuup_callback */ /* slab , , cpu slab , , */ if (slab_is_available()) { get_online_cpus(); mutex_lock(&cache_chain_mutex); } /* cache , */ list_for_each_entry(pc, &cache_chain, next) { char tmp; int res; /* * This happens when the module gets unloaded and doesn't * destroy its slab cache and no-one else reuses the vmalloc * area of the module. Print a warning. */ /* cache cache */ res = probe_kernel_address(pc->name, tmp); if (res) {/* , */ printk(KERN_ERR "SLAB: cache with size %d has lost its name
", pc->buffer_size); continue; } /* cache cache */ if (!strcmp(pc->name, name)) { printk(KERN_ERR "kmem_cache_create: duplicate cache %s
", name); dump_stack(); goto oops; } } #if DEBUG WARN_ON(strchr(name, ' ')); /* It confuses parsers */ #if FORCED_DEBUG /* * Enable redzoning and last user accounting, except for caches with * large objects, if the increased size would increase the object size * above the next power of two: caches with object sizes just above a * power of two have a significant amount of internal fragmentation. */ if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; if (!(flags & SLAB_DESTROY_BY_RCU)) flags |= SLAB_POISON; #endif if (flags & SLAB_DESTROY_BY_RCU) BUG_ON(flags & SLAB_POISON); #endif /* * Always checks flags, a caller might be expecting debug support which * isn't available. */ BUG_ON(flags & ~CREATE_MASK); /* * Check that size is in terms of words. This is needed to avoid * unaligned accesses for some archs when redzoning is used, and makes * sure any on-slab bufctl's are also correctly aligned. */ if (size & (BYTES_PER_WORD - 1)) { size += (BYTES_PER_WORD - 1); size &= ~(BYTES_PER_WORD - 1); } /* calculate the final buffer alignment: */ /* 1) arch recommendation: can be overridden for debug */ if (flags & SLAB_HWCACHE_ALIGN) { /* * Default alignment: as specified by the arch code. Except if * an object is really small, then squeeze multiple objects into * one cacheline. */ ralign = cache_line_size(); while (size <= ralign / 2) ralign /= 2; } else { ralign = BYTES_PER_WORD; } /* * Redzoning and user store require word alignment or possibly larger. * Note this will be overridden by architecture or caller mandated * alignment if either is greater than BYTES_PER_WORD. */ if (flags & SLAB_STORE_USER) ralign = BYTES_PER_WORD; if (flags & SLAB_RED_ZONE) { ralign = REDZONE_ALIGN; /* If redzoning, ensure that the second redzone is suitably * aligned, by adjusting the object size accordingly. */ size += REDZONE_ALIGN - 1; size &= ~(REDZONE_ALIGN - 1); } /* 2) arch mandated alignment */ if (ralign < ARCH_SLAB_MINALIGN) { ralign = ARCH_SLAB_MINALIGN; } /* 3) caller mandated alignment */ if (ralign < align) { ralign = align; } /* disable debug if necessary */ if (ralign > __alignof__(unsigned long long)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. */ align = ralign; /* slab */ if (slab_is_available()) gfp = GFP_KERNEL; else /* slab , , */ gfp = GFP_NOWAIT; /* Get cache's description obj. */ /* struct kmem_cache , cache kmem_cache , cache_cache kmem_cache */ cachep = kmem_cache_zalloc(&cache_cache, gfp); if (!cachep) goto oops; #if DEBUG cachep->obj_size = size; /* * Both debugging options require word-alignment which is calculated * into align above. */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ cachep->obj_offset += sizeof(unsigned long long); size += 2 * sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of * the real object. But if the second red zone needs to be * aligned to 64 bits, we must allow that much space. */ if (flags & SLAB_RED_ZONE) size += REDZONE_ALIGN; else size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { cachep->obj_offset += PAGE_SIZE - size; size = PAGE_SIZE; } #endif #endif /* * Determine if the slab management is 'on' or 'off' slab. * (bootstrapping cannot cope with offslab caches so don't do * it too early on.) 
*/ /* slab : 。 , 512 , 。 。 slab_early_init kmem_cache_init */ if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). */ flags |= CFLGS_OFF_SLAB; size = ALIGN(size, align); /* slab */ left_over = calculate_slab_order(cachep, size, align, flags); /* cachep->num cache slab , 0, cache */ if (!cachep->num) { printk(KERN_ERR "kmem_cache_create: couldn't create cache %s.
", name); kmem_cache_free(&cache_cache, cachep); cachep = NULL; goto oops; } /* slab , struct slab kmem_bufctl_t */ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab), align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ /* slab, slab , slab slab , slab */ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { /* off-slab */ flags &= ~CFLGS_OFF_SLAB; /* */ left_over -= slab_size; } if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ /* align slab , slab , slab , */ slab_size = cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() * poisoning, then it's going to smash the contents of * the redzone and userword anyhow, so switch them off. */ if (size % PAGE_SIZE == 0 && flags & SLAB_POISON) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); #endif } /* cache */ cachep->colour_off = cache_line_size(); /* Offset must be a multiple of the alignment. */ /* */ if (cachep->colour_off < align) cachep->colour_off = align; /* */ cachep->colour = left_over / cachep->colour_off; /* slab */ cachep->slab_size = slab_size; cachep->flags = flags; cachep->gfpflags = 0; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->gfpflags |= GFP_DMA; /* slab */ cachep->buffer_size = size; /* slab , obj_to_index */ cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { /* slab , slabp_cache , slab_size, slab_size cache , slab , slab , kmem_bufctl_t[] , slab, */ cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. * But since we go off slab only for object size greater than * PAGE_SIZE/8, and malloc_sizes gets created in ascending order, * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } cachep->ctor = ctor; cachep->name = name; /* cpu local cache */ if (setup_cpu_cache(cachep, gfp)) { __kmem_cache_destroy(cachep); cachep = NULL; goto oops; } /* cache setup completed, link it into the list */ /* cache , slab cache */ list_add(&cachep->next, &cache_chain); oops: if (!cachep && (flags & SLAB_PANIC)) panic("kmem_cache_create(): failed to create slab `%s'
", name); if (slab_is_available()) { mutex_unlock(&cache_chain_mutex); put_online_cpus(); } return cachep; }

For reference, cache_cache is defined as:
/* internal cache of cache description objs */
static struct kmem_cache cache_cache = {
	.batchcount = 1,
	.limit = BOOT_CPUCACHE_ENTRIES,
	.shared = 1,
	.buffer_size = sizeof(struct kmem_cache),	/* objects of this cache are cache descriptors, cache_cache included */
	.name = "kmem_cache",
};

II. Computing the slab order and leftover space
/**
 * calculate_slab_order - calculate size (page order) of slabs
 * @cachep: pointer to the cache that is being created
 * @size: size of objects to be created in this cache.
 * @align: required alignment for the objects.
 * @flags: slab allocation flags
 *
 * Also calculates the number of objects per slab.
 *
 * This could be made much more intelligent.  For now, try to avoid using
 * high order pages for slabs.  When the gfp() functions are more friendly
 * towards high-order requests, this should be changed.
 */
 /* Compute the page order used for this cache's slabs and the number of
  * objects per slab; the return value is the leftover (unused) space. */
static size_t calculate_slab_order(struct kmem_cache *cachep,
			size_t size, size_t align, unsigned long flags)
{
	unsigned long offslab_limit;
	size_t left_over = 0;
	int gfporder;

	for (gfporder = 0; gfporder <= KMALLOC_MAX_ORDER; gfporder++) {
		unsigned int num;
		size_t remainder;
		/* estimate objects per slab and wasted space at this order */
		cache_estimate(gfporder, size, align, flags, &remainder, &num);
		/* not even one object fits at this order: try the next larger one */
		if (!num)
			continue;

		if (flags & CFLGS_OFF_SLAB) {
			/*
			 * Max number of objs-per-slab for caches which
			 * use off-slab slabs. Needed to avoid a possible
			 * looping condition in cache_grow().
			 */
			 /*
			  * For an off-slab cache the management area (a
			  * struct slab plus one kmem_bufctl_t per object) is
			  * itself allocated from another cache, along the path
			  * kmem_cache_alloc -> __cache_alloc ->
			  * __do_cache_alloc -> ____cache_alloc ->
			  * cache_alloc_refill -> cache_grow ->
			  * alloc_slabmgmt -> kmem_cache_alloc_node ->
			  * kmem_cache_alloc.  If that cache also keeps its
			  * management off-slab and needs to grow as well, the
			  * recursion through cache_grow() could loop forever.
			  * Capping the object count so that the management
			  * area always fits inside a single object of this
			  * size guarantees that the chain of off-slab caches
			  * terminates in an on-slab one.
			  */
			offslab_limit = size - sizeof(struct slab);
			offslab_limit /= sizeof(kmem_bufctl_t);
			/* the off-slab limit is exceeded: stop here and keep
			 * the values saved for the previous (smaller) order */
 			if (num > offslab_limit)
				break;
		}

		/* Found something acceptable - save it away */
		/* number of objects held by each slab */
		cachep->num = num;
		 /* page order: each slab spans 2^gfporder pages */
		cachep->gfporder = gfporder;
		 /* unused bytes left in the slab (later used for colouring) */
		left_over = remainder;

		/*
		 * A VFS-reclaimable slab tends to have most allocations
		 * as GFP_NOFS and we really don't want to have to be allocating
		 * higher-order pages when we are unable to shrink dcache.
		 */
		 /*
		  * Reclaimable slabs (SLAB_RECLAIM_ACCOUNT) are mostly
		  * allocated under GFP_NOFS, so take the first workable order
		  * instead of trying higher ones; their pages are eventually
		  * handed back through kmem_freepages().
		  */
		if (flags & SLAB_RECLAIM_ACCOUNT)
			break;

		/*
		 * Large number of objects is good, but very large slabs are
		 * currently bad for the gfp()s.
		 */
		 /* slab_break_gfp_order is the order a slab should normally
		  * not exceed; once it is reached, stop trying larger orders */
		if (gfporder >= slab_break_gfp_order)
			break;

		/*
		 * Acceptable internal fragmentation?
		 */
		 /* accept this order once the wasted space is no more than
		  * 1/8 of the slab */
		if (left_over * 8 <= (PAGE_SIZE << gfporder))
			break;
	}
	/* return the leftover space; the caller turns it into colouring */
	return left_over;
}
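
To make the order/leftover arithmetic concrete, the following self-contained user-space sketch reproduces the on-slab estimate that cache_estimate() performs for calculate_slab_order(), plus the colouring computation done afterwards in kmem_cache_create(). The struct sizes are assumptions for a 32-bit build (sizeof(struct slab) taken as 28 bytes, sizeof(kmem_bufctl_t) as 4) and debug padding is ignored; the numbers are illustrative only:

#include <stdio.h>

#define PAGE_SZ		4096UL
#define SLAB_HDR	28UL	/* assumed sizeof(struct slab) */
#define BUFCTL_SZ	4UL	/* assumed sizeof(kmem_bufctl_t) */
#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned long size = 256;		/* object size after alignment */
	unsigned long align = 64;		/* e.g. the L1 cache line */
	unsigned long slab_bytes = PAGE_SZ << 0;	/* gfporder 0 */

	/* on-slab case: each object also costs one kmem_bufctl_t entry,
	 * and the slab starts with a struct slab header */
	unsigned long num = (slab_bytes - SLAB_HDR) / (size + BUFCTL_SZ);
	unsigned long mgmt = ALIGN_UP(SLAB_HDR + num * BUFCTL_SZ, align);
	if (mgmt + num * size > slab_bytes) {
		/* the kernel drops one object if the aligned management
		 * area no longer fits */
		num--;
		mgmt = ALIGN_UP(SLAB_HDR + num * BUFCTL_SZ, align);
	}
	unsigned long left_over = slab_bytes - num * size - mgmt;

	/* colouring, as done at the end of kmem_cache_create(): the colour
	 * unit is the cache line size, raised to align if smaller */
	unsigned long colour_off = align > 64 ? align : 64;
	unsigned long colour = left_over / colour_off;

	printf("objs/slab=%lu mgmt=%lu left_over=%lu colours=%lu\n",
	       num, mgmt, left_over, colour);	/* 15, 128, 128, 2 */
	return 0;
}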

III. Looking up a general cache of a given size
/* Return the general-purpose (kmalloc) cache whose objects are large enough
 * for the requested size; a thin wrapper around __find_general_cachep(). */
static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
{
	return __find_general_cachep(size, gfpflags);
}
static inline struct kmem_cache *__find_general_cachep(size_t size,
							gfp_t gfpflags)
{
	struct cache_sizes *csizep = malloc_sizes;

#if DEBUG
	/* This happens if someone tries to call
	 * kmem_cache_create(), or __kmalloc(), before
	 * the generic caches are initialized.
	 */
	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
#endif
	if (!size)
		return ZERO_SIZE_PTR;
	/* walk malloc_sizes until an entry at least as large as size is found */
	while (size > csizep->cs_size)
		csizep++;

	/*
	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
	 * has cs_{dma,}cachep==NULL. Thus no special case
	 * for large kmalloc calls required.
	 */
#ifdef CONFIG_ZONE_DMA
	if (unlikely(gfpflags & GFP_DMA))
		return csizep->cs_dmacachep;
#endif
	/* return the general cache matching that size */
	return csizep->cs_cachep;
}
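
The lookup is simply a linear walk over an ascending size table terminated by a huge sentinel. A simplified user-space analogue is shown below; the sizes and names are illustrative, not the kernel's actual malloc_sizes table:

#include <stdio.h>
#include <stddef.h>

struct cache_size { size_t cs_size; const char *name; };

static const struct cache_size sizes[] = {
	{ 32, "size-32" }, { 64, "size-64" }, { 128, "size-128" },
	{ 256, "size-256" },
	{ (size_t)-1, NULL },	/* sentinel, like the ULONG_MAX entry */
};

static const char *find_general(size_t size)
{
	const struct cache_size *p = sizes;

	/* advance until the entry is large enough for the request */
	while (size > p->cs_size)
		p++;
	return p->name;		/* NULL for oversized requests, as in the kernel */
}

int main(void)
{
	printf("%s\n", find_general(100));	/* prints "size-128" */
	return 0;
}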

IV. Setting up the per-CPU local caches
/* Set up the per-CPU local caches and the per-node slab lists for a newly
 * created cache. */
static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
{
	/* all general caches are initialised: set up the per-CPU caches the
	 * normal way */
	if (g_cpucache_up == FULL)
		return enable_cpucache(cachep, gfp);
	/*
	 * During bootstrap, g_cpucache_up records how far the general caches
	 * have come: PARTIAL_AC means the cache backing struct array_cache
	 * exists, PARTIAL_L3 means the one backing struct kmem_list3 exists
	 * as well.  Until the required cache exists, the per-CPU arrays and
	 * node lists below must come from static bootstrap objects.
	 */
	if (g_cpucache_up == NONE) {
		/*
		 * Note: the first kmem_cache_create must create the cache
		 * that's used by kmalloc(24), otherwise the creation of
		 * further caches will BUG().
		 */
		 /*
		  * The general cache backing struct array_cache does not
		  * exist yet, so fall back to the static bootstrap local
		  * cache initarray_generic.
		  */
		cachep->array[smp_processor_id()] = &initarray_generic.cache;

		/*
		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
		 * the first cache, then we need to set up all its list3s,
		 * otherwise the creation of further caches will BUG().
		 */
		 /*
		  * set_up_list3s() points the node lists at the static
		  * bootstrap kmem_list3 structures, since they cannot be
		  * kmalloc'ed yet.
		  */
		set_up_list3s(cachep, SIZE_AC);
		/*
		 * The cache created here is the one that will back
		 * struct array_cache; if the same general cache also backs
		 * struct kmem_list3, bootstrapping has advanced one step
		 * further, so record the new state in g_cpucache_up.
		 */
		if (INDEX_AC == INDEX_L3)
			g_cpucache_up = PARTIAL_L3;
		else
			g_cpucache_up = PARTIAL_AC;
	} else {
		/* g_cpucache_up is at least PARTIAL_AC here: the general
		 * cache backing struct array_cache exists, so the per-CPU
		 * array can simply be kmalloc'ed */
		cachep->array[smp_processor_id()] =
			kmalloc(sizeof(struct arraycache_init), gfp);

		if (g_cpucache_up == PARTIAL_AC) {
			/* the cache backing struct kmem_list3 is not ready
			 * yet: use the static bootstrap lists */
			set_up_list3s(cachep, SIZE_L3);
			/*
			 * This path is taken during kmem_cache_init(): the
			 * cache being created will back struct kmem_list3,
			 * so from now on kmem_list3 can be kmalloc'ed;
			 * advance g_cpucache_up.
			 */
			g_cpucache_up = PARTIAL_L3;
		} else {
			int node;
			for_each_online_node(node) {
				/* both bootstrap stages are done: kmalloc a
				 * struct kmem_list3 for every online node */
				cachep->nodelists[node] =
				    kmalloc_node(sizeof(struct kmem_list3),
						gfp, node);
				BUG_ON(!cachep->nodelists[node]);
				/* initialise the node's three slab lists */
				kmem_list3_init(cachep->nodelists[node]);
			}
		}
	}
	/* schedule the first reap (object-ageing) pass for this node */
	cachep->nodelists[numa_node_id()]->next_reap =
			jiffies + REAPTIMEOUT_LIST3 +
			((unsigned long)cachep) % REAPTIMEOUT_LIST3;

	cpu_cache_get(cachep)->avail = 0;
	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
	cpu_cache_get(cachep)->batchcount = 1;
	cpu_cache_get(cachep)->touched = 0;
	cachep->batchcount = 1;
	cachep->limit = BOOT_CPUCACHE_ENTRIES;
	return 0;
}
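
For reference, the bootstrap states tested above progress roughly as sketched below. This mirrors the g_cpucache_up enum in mm/slab.c of this kernel generation; treat it as a sketch of the progression rather than an exact copy, since some versions insert an additional EARLY state before FULL:

/*
 * Sketch of the bootstrap progression that setup_cpu_cache() relies on:
 *   NONE       - no general cache usable yet: use the static bootstrap
 *                array (initarray_generic) and the static kmem_list3 lists
 *   PARTIAL_AC - the general cache backing struct array_cache is usable
 *   PARTIAL_L3 - the general cache backing struct kmem_list3 is usable too
 *   FULL       - all general caches are up: enable_cpucache() is used
 */
static enum {
	NONE,
	PARTIAL_AC,
	PARTIAL_L3,
	FULL
} g_cpucache_up;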