RHEL4/mm/slab.c
<<
>>
Prefs
   1/*
   2 * linux/mm/slab.c
   3 * Written by Mark Hemment, 1996/97.
   4 * (markhe@nextd.demon.co.uk)
   5 *
   6 * kmem_cache_destroy() + some cleanup - 1999 Andrea Arcangeli
   7 *
   8 * Major cleanup, different bufctl logic, per-cpu arrays
   9 *      (c) 2000 Manfred Spraul
  10 *
  11 * Cleanup, make the head arrays unconditional, preparation for NUMA
  12 *      (c) 2002 Manfred Spraul
  13 *
  14 * An implementation of the Slab Allocator as described in outline in;
  15 *      UNIX Internals: The New Frontiers by Uresh Vahalia
  16 *      Pub: Prentice Hall      ISBN 0-13-101908-2
  17 * or with a little more detail in;
  18 *      The Slab Allocator: An Object-Caching Kernel Memory Allocator
  19 *      Jeff Bonwick (Sun Microsystems).
  20 *      Presented at: USENIX Summer 1994 Technical Conference
  21 *
  22 * The memory is organized in caches, one cache for each object type.
  23 * (e.g. inode_cache, dentry_cache, buffer_head, vm_area_struct)
   24 * Each cache consists of many slabs (they are small, usually one
   25 * page long, and always contiguous), and each slab contains multiple
   26 * initialized objects.
  27 *
   28 * This means that your constructor is used only for newly allocated
   29 * slabs and you must pass objects with the same initializations to
   30 * kmem_cache_free.
  31 *
   32 * Each cache can only support one memory type (GFP_DMA, GFP_HIGHMEM,
   33 * normal). If you need a special memory type, then you must create a new
   34 * cache for that memory type.
  35 *
  36 * In order to reduce fragmentation, the slabs are sorted in 3 groups:
  37 *   full slabs with 0 free objects
  38 *   partial slabs
  39 *   empty slabs with no allocated objects
  40 *
  41 * If partial slabs exist, then new allocations come from these slabs,
  42 * otherwise from empty slabs or new slabs are allocated.
  43 *
  44 * kmem_cache_destroy() CAN CRASH if you try to allocate from the cache
  45 * during kmem_cache_destroy(). The caller must prevent concurrent allocs.
  46 *
  47 * Each cache has a short per-cpu head array, most allocs
  48 * and frees go into that array, and if that array overflows, then 1/2
  49 * of the entries in the array are given back into the global cache.
  50 * The head array is strictly LIFO and should improve the cache hit rates.
  51 * On SMP, it additionally reduces the spinlock operations.
  52 *
  53 * The c_cpuarray may not be read with enabled local interrupts - 
  54 * it's changed with a smp_call_function().
  55 *
  56 * SMP synchronization:
  57 *  constructors and destructors are called without any locking.
  58 *  Several members in kmem_cache_t and struct slab never change, they
  59 *      are accessed without any locking.
  60 *  The per-cpu arrays are never accessed from the wrong cpu, no locking,
  61 *      and local interrupts are disabled so slab code is preempt-safe.
  62 *  The non-constant members are protected with a per-cache irq spinlock.
  63 *
  64 * Many thanks to Mark Hemment, who wrote another per-cpu slab patch
  65 * in 2000 - many ideas in the current implementation are derived from
  66 * his patch.
  67 *
  68 * Further notes from the original documentation:
  69 *
  70 * 11 April '97.  Started multi-threading - markhe
  71 *      The global cache-chain is protected by the semaphore 'cache_chain_sem'.
  72 *      The sem is only needed when accessing/extending the cache-chain, which
  73 *      can never happen inside an interrupt (kmem_cache_create(),
  74 *      kmem_cache_shrink() and kmem_cache_reap()).
  75 *
  76 *      At present, each engine can be growing a cache.  This should be blocked.
  77 *
  78 */
  79
  80#include        <linux/config.h>
  81#include        <linux/slab.h>
  82#include        <linux/mm.h>
  83#include        <linux/swap.h>
  84#include        <linux/cache.h>
  85#include        <linux/interrupt.h>
  86#include        <linux/init.h>
  87#include        <linux/compiler.h>
  88#include        <linux/seq_file.h>
  89#include        <linux/notifier.h>
  90#include        <linux/kallsyms.h>
  91#include        <linux/cpu.h>
  92#include        <linux/sysctl.h>
  93#include        <linux/module.h>
  94#include        <linux/rcupdate.h>
  95
  96#include        <asm/uaccess.h>
  97#include        <asm/cacheflush.h>
  98#include        <asm/tlbflush.h>
  99#include        <asm/page.h>
 100
 101/*
 102 * DEBUG        - 1 for kmem_cache_create() to honour; SLAB_DEBUG_INITIAL,
 103 *                SLAB_RED_ZONE & SLAB_POISON.
 104 *                0 for faster, smaller code (especially in the critical paths).
 105 *
 106 * STATS        - 1 to collect stats for /proc/slabinfo.
 107 *                0 for faster, smaller code (especially in the critical paths).
 108 *
 109 * FORCED_DEBUG - 1 enables SLAB_RED_ZONE and SLAB_POISON (if possible)
 110 */
 111
 112#ifdef CONFIG_DEBUG_SLAB
 113#define DEBUG           1
 114#define STATS           1
 115#define FORCED_DEBUG    1
 116#else
 117#define DEBUG           0
 118#define STATS           0
 119#define FORCED_DEBUG    0
 120#endif
 121
 122
 123/* Shouldn't this be in a header file somewhere? */
 124#define BYTES_PER_WORD          sizeof(void *)
 125
 126#ifndef cache_line_size
 127#define cache_line_size()       L1_CACHE_BYTES
 128#endif
 129
 130#ifndef ARCH_KMALLOC_MINALIGN
 131#define ARCH_KMALLOC_MINALIGN 0
 132#endif
 133
 134#ifndef ARCH_KMALLOC_FLAGS
 135#define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN
 136#endif
 137
 138/* Legal flag mask for kmem_cache_create(). */
 139#if DEBUG
 140# define CREATE_MASK    (SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 141                         SLAB_POISON | SLAB_HWCACHE_ALIGN | \
 142                         SLAB_NO_REAP | SLAB_CACHE_DMA | \
 143                         SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 144                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 145                         SLAB_DESTROY_BY_RCU)
 146#else
 147# define CREATE_MASK    (SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
 148                         SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 149                         SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 150                         SLAB_DESTROY_BY_RCU)
 151#endif
 152
 153/*
 154 * kmem_bufctl_t:
 155 *
  156 * Bufctls are used for linking the objs within a slab by means of
  157 * linked offsets.
 158 *
 159 * This implementation relies on "struct page" for locating the cache &
 160 * slab an object belongs to.
 161 * This allows the bufctl structure to be small (one int), but limits
 162 * the number of objects a slab (not a cache) can contain when off-slab
 163 * bufctls are used. The limit is the size of the largest general cache
 164 * that does not use off-slab slabs.
  165 * For 32bit archs with 4 kB pages, this is 56.
 166 * This is not serious, as it is only for large objects, when it is unwise
 167 * to have too many per slab.
 168 * Note: This limit can be raised by introducing a general cache whose size
 169 * is less than 512 (PAGE_SIZE<<3), but greater than 256.
 170 */
 171
 172#define BUFCTL_END      (((kmem_bufctl_t)(~0U))-0)
 173#define BUFCTL_FREE     (((kmem_bufctl_t)(~0U))-1)
 174#define SLAB_LIMIT      (((kmem_bufctl_t)(~0U))-2)
 175
 176/* Max number of objs-per-slab for caches which use off-slab slabs.
 177 * Needed to avoid a possible looping condition in cache_grow().
 178 */
 179static unsigned long offslab_limit;
 180
  181/*
  182 * struct slab
  183 *
  184 * Manages the objs in a slab. Placed either at the beginning of mem allocated
  185 * for a slab, or allocated from a general cache.
  186 * Slabs are chained into three lists: fully used, partial, fully free slabs.
  187 */
  188struct slab {
  189        struct list_head        list;           /* chains the slab into one of the three slab lists */
  190        unsigned long           colouroff;      /* NOTE(review): presumably the colour offset chosen for this slab - confirm where slabs are set up */
  191        void                    *s_mem;         /* including colour offset */
  192        unsigned int            inuse;          /* num of objs active in slab */
  193        kmem_bufctl_t           free;           /* NOTE(review): presumably the bufctl index of the first free obj - confirm */
  194};
 195
 196/*
 197 * struct slab_rcu
 198 *
 199 * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
 200 * arrange for kmem_freepages to be called via RCU.  This is useful if
 201 * we need to approach a kernel structure obliquely, from its address
 202 * obtained without the usual locking.  We can lock the structure to
 203 * stabilize it and check it's still at the given address, only if we
 204 * can be sure that the memory has not been meanwhile reused for some
 205 * other kind of object (which our subsystem's lock might corrupt).
 206 *
 207 * rcu_read_lock before reading the address, then rcu_read_unlock after
 208 * taking the spinlock within the structure expected at that address.
 209 *
 210 * We assume struct slab_rcu can overlay struct slab when destroying.
 211 */
  212struct slab_rcu {
  213        struct rcu_head         head;           /* RCU callback bookkeeping */
  214        kmem_cache_t            *cachep;        /* NOTE(review): presumably the cache the pages belong to - confirm in slab_destroy() */
  215        void                    *addr;          /* NOTE(review): presumably the page address later given to kmem_freepages() - confirm */
  216};
 217
  218/*
  219 * struct array_cache
  220 *
  221 * Per cpu structures
  222 * Purpose:
  223 * - LIFO ordering, to hand out cache-warm objects from _alloc
  224 * - reduce the number of linked list operations
  225 * - reduce spinlock operations
  226 *
  227 * The limit is stored in the per-cpu structure to reduce the data cache
  228 * footprint.
  229 *
      * The cached object pointers are not a member of this struct: they are
      * stored in the memory directly behind it and reached via ac_entry().
      * alloc_arraycache() sizes the allocation accordingly.
  230 */
  231struct array_cache {
  232        unsigned int avail;             /* number of object pointers held; passed as the count to free_block() */
  233        unsigned int limit;             /* capacity of the pointer array (the "entries" given to alloc_arraycache()) */
  234        unsigned int batchcount;        /* NOTE(review): presumably objs moved per refill/flush - confirm in the alloc/free paths */
  235        unsigned int touched;           /* NOTE(review): presumably a recent-use marker for the reaper - confirm in cache_reap() */
  236};
 237
  238/* bootstrap: The caches do not work without cpuarrays anymore,
  239 * but the cpuarrays are allocated from the generic caches...
      *
      * struct arraycache_init is a statically allocatable head array: an
      * array_cache header followed by room for BOOT_CPUCACHE_ENTRIES object
      * pointers, mirroring the header-plus-trailing-pointers layout that
      * ac_entry() relies on.
  240 */
  241#define BOOT_CPUCACHE_ENTRIES   1
  242struct arraycache_init {
  243        struct array_cache cache;                       /* header; must stay the first member */
  244        void * entries[BOOT_CPUCACHE_ENTRIES];          /* storage ac_entry(&cache) is expected to point at */
  245};
 246
 247/*
 248 * The slab lists of all objects.
 249 * Hopefully reduce the internal fragmentation
 250 * NUMA: The spinlock could be moved from the kmem_cache_t
 251 * into this structure, too. Figure out what causes
 252 * fewer cross-node spinlock operations.
 253 */
  254struct kmem_list3 {
  255        struct list_head        slabs_partial;  /* partial list first, better asm code */
  256        struct list_head        slabs_full;     /* slabs with 0 free objects */
  257        struct list_head        slabs_free;     /* slabs with no allocated objects */
  258        unsigned long   free_objects;           /* NOTE(review): presumably the number of free objs on the lists - confirm */
  259        int             free_touched;           /* NOTE(review): presumably a recent-use marker for the free list - confirm */
  260        unsigned long   next_reap;              /* NOTE(review): presumably the jiffies value of the next reap - confirm */
  261        struct array_cache      *shared;        /* NOTE(review): presumably an array cache shared between cpus - confirm */
  262};
 263
 264#define LIST3_INIT(parent) \
 265        { \
 266                .slabs_full     = LIST_HEAD_INIT(parent.slabs_full), \
 267                .slabs_partial  = LIST_HEAD_INIT(parent.slabs_partial), \
 268                .slabs_free     = LIST_HEAD_INIT(parent.slabs_free) \
 269        }
 270#define list3_data(cachep) \
 271        (&(cachep)->lists)
 272
 273/* NUMA: per-node */
 274#define list3_data_ptr(cachep, ptr) \
 275                list3_data(cachep)
 276
 277/*
 278 * kmem_cache_t
 279 *
 280 * manages a cache.
 281 */
 282        
  283struct kmem_cache_s {
  284/* 1) per-cpu data, touched during every alloc/free */
  285        struct array_cache      *array[NR_CPUS];        /* head array of each cpu, indexed by cpu number (see ac_data()) */
  286        unsigned int            batchcount;     /* batchcount handed to alloc_arraycache() for new head arrays */
  287        unsigned int            limit;          /* number of entries handed to alloc_arraycache() */
  288/* 2) touched by every alloc & free from the backend */
  289        struct kmem_list3       lists;
  290        /* NUMA: kmem_3list_t   *nodelists[MAX_NUMNODES] */
  291        unsigned int            objsize;
  292        unsigned int            flags;  /* constant flags */
  293        unsigned int            num;    /* # of objs per slab */
  294        unsigned int            free_limit; /* upper limit of objects in the lists */
  295        spinlock_t              spinlock;       /* taken with spin_lock_irq() around updates of array[] and free_limit */
  296
  297/* 3) cache_grow/shrink */
  298        /* order of pgs per slab (2^n) */
  299        unsigned int            gfporder;
  300
  301        /* force GFP flags, e.g. GFP_DMA */
  302        unsigned int            gfpflags;
  303
  304        size_t                  colour;         /* cache colouring range */
  305        unsigned int            colour_off;     /* colour offset */
  306        unsigned int            colour_next;    /* cache colouring */
  307        kmem_cache_t            *slabp_cache;   /* NOTE(review): presumably the cache that off-slab struct slab descriptors come from - confirm */
  308        unsigned int            slab_size;      /* struct slab plus num bufctls, aligned (as computed for cache_cache in kmem_cache_init()) */
  309        unsigned int            dflags;         /* dynamic flags */
  310
  311        /* constructor func */
  312        void (*ctor)(void *, kmem_cache_t *, unsigned long);
  313
  314        /* de-constructor func */
  315        void (*dtor)(void *, kmem_cache_t *, unsigned long);
  316
  317/* 4) cache creation/removal */
  318        const char              *name;          /* printed in slab error messages */
  319        struct list_head        next;           /* link in the global cache_chain */
  320
  321/* 5) statistics */
  322#if STATS
  323        unsigned long           num_active;
  324        unsigned long           num_allocations;
  325        unsigned long           high_mark;
  326        unsigned long           grown;
  327        unsigned long           reaped;
  328        unsigned long           errors;
  329        unsigned long           max_freeable;
  330        atomic_t                allochit;
  331        atomic_t                allocmiss;
  332        atomic_t                freehit;
  333        atomic_t                freemiss;
  334#endif
  335#if DEBUG
  336        int                     dbghead;        /* byte offset from the start of an obj slot to the real object */
  337        int                     reallen;        /* NOTE(review): presumably the object size without debug padding - confirm in kmem_cache_create() */
  338#endif
  339};
 340
 341#define CFLGS_OFF_SLAB          (0x80000000UL)
 342#define OFF_SLAB(x)     ((x)->flags & CFLGS_OFF_SLAB)
 343
 344#define BATCHREFILL_LIMIT       16
  345/* Optimization question: fewer reaps means less
  346 * probability for unnecessary cpucache drain/refill cycles.
  347 *
  348 * OTOH the cpuarrays can contain lots of objects,
  349 * which could lock up otherwise freeable slabs.
  350 */
 351#define REAPTIMEOUT_CPUC        (2*HZ)
 352#define REAPTIMEOUT_LIST3       (4*HZ)
 353
 354#if STATS
 355#define STATS_INC_ACTIVE(x)     ((x)->num_active++)
 356#define STATS_DEC_ACTIVE(x)     ((x)->num_active--)
 357#define STATS_INC_ALLOCED(x)    ((x)->num_allocations++)
 358#define STATS_INC_GROWN(x)      ((x)->grown++)
 359#define STATS_INC_REAPED(x)     ((x)->reaped++)
 360#define STATS_SET_HIGH(x)       do { if ((x)->num_active > (x)->high_mark) \
 361                                        (x)->high_mark = (x)->num_active; \
 362                                } while (0)
 363#define STATS_INC_ERR(x)        ((x)->errors++)
 364#define STATS_SET_FREEABLE(x, i) \
 365                                do { if ((x)->max_freeable < i) \
 366                                        (x)->max_freeable = i; \
 367                                } while (0)
 368
 369#define STATS_INC_ALLOCHIT(x)   atomic_inc(&(x)->allochit)
 370#define STATS_INC_ALLOCMISS(x)  atomic_inc(&(x)->allocmiss)
 371#define STATS_INC_FREEHIT(x)    atomic_inc(&(x)->freehit)
 372#define STATS_INC_FREEMISS(x)   atomic_inc(&(x)->freemiss)
 373#else
 374#define STATS_INC_ACTIVE(x)     do { } while (0)
 375#define STATS_DEC_ACTIVE(x)     do { } while (0)
 376#define STATS_INC_ALLOCED(x)    do { } while (0)
 377#define STATS_INC_GROWN(x)      do { } while (0)
 378#define STATS_INC_REAPED(x)     do { } while (0)
 379#define STATS_SET_HIGH(x)       do { } while (0)
 380#define STATS_INC_ERR(x)        do { } while (0)
 381#define STATS_SET_FREEABLE(x, i) \
 382                                do { } while (0)
 383
 384#define STATS_INC_ALLOCHIT(x)   do { } while (0)
 385#define STATS_INC_ALLOCMISS(x)  do { } while (0)
 386#define STATS_INC_FREEHIT(x)    do { } while (0)
 387#define STATS_INC_FREEMISS(x)   do { } while (0)
 388#endif
 389
 390#if DEBUG
 391/* Magic nums for obj red zoning.
 392 * Placed in the first word before and the first word after an obj.
 393 */
 394#define RED_INACTIVE    0x5A2CF071UL    /* when obj is inactive */
 395#define RED_ACTIVE      0x170FC2A5UL    /* when obj is active */
 396
 397/* ...and for poisoning */
 398#define POISON_INUSE    0x5a    /* for use-uninitialised poisoning */
 399#define POISON_FREE     0x6b    /* for use-after-free poisoning */
 400#define POISON_END      0xa5    /* end-byte of poisoning */
 401
 402/* memory layout of objects:
 403 * 0            : objp
 404 * 0 .. cachep->dbghead - BYTES_PER_WORD - 1: padding. This ensures that
 405 *              the end of an object is aligned with the end of the real
 406 *              allocation. Catches writes behind the end of the allocation.
 407 * cachep->dbghead - BYTES_PER_WORD .. cachep->dbghead - 1:
 408 *              redzone word.
 409 * cachep->dbghead: The real object.
 410 * cachep->objsize - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
 411 * cachep->objsize - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
 412 */
 413static int obj_dbghead(kmem_cache_t *cachep)
 414{
 415        return cachep->dbghead;
 416}
 417
 418static int obj_reallen(kmem_cache_t *cachep)
 419{
 420        return cachep->reallen;
 421}
 422
 423static unsigned long *dbg_redzone1(kmem_cache_t *cachep, void *objp)
 424{
 425        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 426        return (unsigned long*) (objp+obj_dbghead(cachep)-BYTES_PER_WORD);
 427}
 428
 429static unsigned long *dbg_redzone2(kmem_cache_t *cachep, void *objp)
 430{
 431        BUG_ON(!(cachep->flags & SLAB_RED_ZONE));
 432        if (cachep->flags & SLAB_STORE_USER)
 433                return (unsigned long*) (objp+cachep->objsize-2*BYTES_PER_WORD);
 434        return (unsigned long*) (objp+cachep->objsize-BYTES_PER_WORD);
 435}
 436
 437static void **dbg_userword(kmem_cache_t *cachep, void *objp)
 438{
 439        BUG_ON(!(cachep->flags & SLAB_STORE_USER));
 440        return (void**)(objp+cachep->objsize-BYTES_PER_WORD);
 441}
 442
 443#else
 444
 445#define obj_dbghead(x)                  0
 446#define obj_reallen(cachep)             (cachep->objsize)
 447#define dbg_redzone1(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 448#define dbg_redzone2(cachep, objp)      ({BUG(); (unsigned long *)NULL;})
 449#define dbg_userword(cachep, objp)      ({BUG(); (void **)NULL;})
 450
 451#endif
 452
 453/*
 454 * Maximum size of an obj (in 2^order pages)
 455 * and absolute limit for the gfp order.
 456 */
 457#if defined(CONFIG_LARGE_ALLOCS)
 458#define MAX_OBJ_ORDER   13      /* up to 32Mb */
 459#define MAX_GFP_ORDER   13      /* up to 32Mb */
 460#elif defined(CONFIG_MMU)
 461#define MAX_OBJ_ORDER   5       /* 32 pages */
 462#define MAX_GFP_ORDER   5       /* 32 pages */
 463#else
 464#define MAX_OBJ_ORDER   8       /* up to 1Mb */
 465#define MAX_GFP_ORDER   8       /* up to 1Mb */
 466#endif
 467
 468/*
 469 * Do not go above this order unless 0 objects fit into the slab.
 470 */
 471#define BREAK_GFP_ORDER_HI      1
 472#define BREAK_GFP_ORDER_LO      0
 473static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 474
 475/* Macros for storing/retrieving the cachep and or slab from the
 476 * global 'mem_map'. These are used to find the slab an obj belongs to.
 477 * With kfree(), these are used to find the cache which an obj belongs to.
 478 */
 479#define SET_PAGE_CACHE(pg,x)  ((pg)->lru.next = (struct list_head *)(x))
 480#define GET_PAGE_CACHE(pg)    ((kmem_cache_t *)(pg)->lru.next)
 481#define SET_PAGE_SLAB(pg,x)   ((pg)->lru.prev = (struct list_head *)(x))
 482#define GET_PAGE_SLAB(pg)     ((struct slab *)(pg)->lru.prev)
 483
  484/* These are the default caches for kmalloc. Custom caches can have other sizes.
      *
      * The table is generated by expanding CACHE(x) once per entry of
      * linux/kmalloc_sizes.h; only cs_size is initialized here, the cache
      * pointers are filled in by kmem_cache_init().  The final entry has
      * cs_size == 0 and acts as the terminator that the lookup loops test for.
      */
  485struct cache_sizes malloc_sizes[] = {
  486#define CACHE(x) { .cs_size = (x) },
  487#include <linux/kmalloc_sizes.h>
  488        { 0, }
  489#undef CACHE
  490};
 491
 492EXPORT_SYMBOL(malloc_sizes);
 493
  494/* Must match cache_sizes above. Out of line to keep cache footprint low.
      *
      * One pair of names per kmalloc size class: the normal cache and its DMA
      * counterpart.  kmem_cache_init() walks this table in lockstep with
      * malloc_sizes[].
      */
  495struct cache_names {
  496        char *name;             /* name of the normal cache, "size-<x>" */
  497        char *name_dma;         /* name of the DMA cache, "size-<x>(DMA)" */
  498};
  499
      /* Built from the same linux/kmalloc_sizes.h list as malloc_sizes[], so both
      * tables have matching entries; #x stringifies the size into the name.
      */
  500static struct cache_names __initdata cache_names[] = {
  501#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" },
  502#include <linux/kmalloc_sizes.h>
  503        { NULL, }
  504#undef CACHE
  505};
 506
 507static struct arraycache_init initarray_cache __initdata =
 508        { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 509static struct arraycache_init initarray_generic =
 510        { { 0, BOOT_CPUCACHE_ENTRIES, 1, 0} };
 511
  512/* internal cache of cache description objs
      *
      * Statically allocated because it has to exist before any cache can be
      * created.  Only the fields that are known at compile time are set here;
      * colour_off, colour, num, slab_size and the boot cpu's head array are
      * filled in by kmem_cache_init(), which also rounds objsize up to the
      * cache line size.
      */
  513static kmem_cache_t cache_cache = {
  514        .lists          = LIST3_INIT(cache_cache.lists),
  515        .batchcount     = 1,
  516        .limit          = BOOT_CPUCACHE_ENTRIES,
  517        .objsize        = sizeof(kmem_cache_t),
  518        .flags          = SLAB_NO_REAP,
  519        .spinlock       = SPIN_LOCK_UNLOCKED,
  520        .name           = "kmem_cache",
  521#if DEBUG
  522        .reallen        = sizeof(kmem_cache_t),
  523#endif
  524};
 525
 526/* Guard access to the cache-chain. */
 527static struct semaphore cache_chain_sem;
 528static struct list_head cache_chain;
 529
 530/*
 531 * vm_enough_memory() looks at this to determine how many
 532 * slab-allocated pages are possibly freeable under pressure
 533 *
 534 * SLAB_RECLAIM_ACCOUNT turns this on per-slab
 535 */
 536atomic_t slab_reclaim_pages;
 537EXPORT_SYMBOL(slab_reclaim_pages);
 538
  539/*
  540 * chicken and egg problem: delay the per-cpu array allocation
  541 * until the general caches are up.
      *
      * g_cpucache_up records how far the bootstrap has progressed.
  542 */
  543static enum {
  544        NONE,           /* value 0, i.e. the state of this static variable at boot */
  545        PARTIAL,        /* NOTE(review): presumably set once the first general cache exists - confirm in kmem_cache_create() */
  546        FULL            /* set at the end of kmem_cache_init(), after all head arrays were resized */
  547} g_cpucache_up;
 548
 549static DEFINE_PER_CPU(struct work_struct, reap_work);
 550
 551static void free_block(kmem_cache_t* cachep, void** objpp, int len);
 552static void enable_cpucache (kmem_cache_t *cachep);
 553static void cache_reap (void *unused);
 554
 555static inline void ** ac_entry(struct array_cache *ac)
 556{
 557        return (void**)(ac+1);
 558}
 559
 560static inline struct array_cache *ac_data(kmem_cache_t *cachep)
 561{
 562        return cachep->array[smp_processor_id()];
 563}
 564
 565static kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
 566{
 567        struct cache_sizes *csizep = malloc_sizes;
 568
 569        /* This function could be moved to the header file, and
 570         * made inline so consumers can quickly determine what
 571         * cache pointer they require.
 572         */
 573        for ( ; csizep->cs_size; csizep++) {
 574                if (size > csizep->cs_size)
 575                        continue;
 576                break;
 577        }
 578        return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep;
 579}
 580
 581/* Cal the num objs, wastage, and bytes left over for a given slab size. */
 582static void cache_estimate (unsigned long gfporder, size_t size, size_t align,
 583                 int flags, size_t *left_over, unsigned int *num)
 584{
 585        int i;
 586        size_t wastage = PAGE_SIZE<<gfporder;
 587        size_t extra = 0;
 588        size_t base = 0;
 589
 590        if (!(flags & CFLGS_OFF_SLAB)) {
 591                base = sizeof(struct slab);
 592                extra = sizeof(kmem_bufctl_t);
 593        }
 594        i = 0;
 595        while (i*size + ALIGN(base+i*extra, align) <= wastage)
 596                i++;
 597        if (i > 0)
 598                i--;
 599
 600        if (i > SLAB_LIMIT)
 601                i = SLAB_LIMIT;
 602
 603        *num = i;
 604        wastage -= i*size;
 605        wastage -= ALIGN(base+i*extra, align);
 606        *left_over = wastage;
 607}
 608
 609#define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 610
 611static void __slab_error(const char *function, kmem_cache_t *cachep, char *msg)
 612{
 613        printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 614                function, cachep->name, msg);
 615        dump_stack();
 616}
 617
 618/*
 619 * Initiate the reap timer running on the target CPU.  We run at around 1 to 2Hz
 620 * via the workqueue/eventd.
 621 * Add the CPU number into the expiration time to minimize the possibility of
 622 * the CPUs getting into lockstep and contending for the global cache chain
 623 * lock.
 624 */
 625static void __devinit start_cpu_timer(int cpu)
 626{
 627        struct work_struct *reap_work = &per_cpu(reap_work, cpu);
 628
 629        /*
 630         * When this gets called from do_initcalls via cpucache_init(),
 631         * init_workqueues() has already run, so keventd will be setup
 632         * at that time.
 633         */
 634        if (keventd_up() && reap_work->func == NULL) {
 635                INIT_WORK(reap_work, cache_reap, NULL);
 636                schedule_delayed_work_on(cpu, reap_work, HZ + 3 * cpu);
 637        }
 638}
 639
 640static struct array_cache *alloc_arraycache(int cpu, int entries, int batchcount)
 641{
 642        int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
 643        struct array_cache *nc = NULL;
 644
 645        if (cpu != -1) {
 646                nc = kmem_cache_alloc_node(kmem_find_general_cachep(memsize,
 647                                        GFP_KERNEL), cpu_to_node(cpu));
 648        }
 649        if (!nc)
 650                nc = kmalloc(memsize, GFP_KERNEL);
 651        if (nc) {
 652                nc->avail = 0;
 653                nc->limit = entries;
 654                nc->batchcount = batchcount;
 655                nc->touched = 0;
 656        }
 657        return nc;
 658}
 659
 660static int __devinit cpuup_callback(struct notifier_block *nfb,
 661                                  unsigned long action,
 662                                  void *hcpu)
 663{
 664        long cpu = (long)hcpu;
 665        kmem_cache_t* cachep;
 666
 667        switch (action) {
 668        case CPU_UP_PREPARE:
 669                down(&cache_chain_sem);
 670                list_for_each_entry(cachep, &cache_chain, next) {
 671                        struct array_cache *nc;
 672
 673                        nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
 674                        if (!nc)
 675                                goto bad;
 676
 677                        spin_lock_irq(&cachep->spinlock);
 678                        cachep->array[cpu] = nc;
 679                        cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
 680                                                + cachep->num;
 681                        spin_unlock_irq(&cachep->spinlock);
 682
 683                }
 684                up(&cache_chain_sem);
 685                break;
 686        case CPU_ONLINE:
 687                start_cpu_timer(cpu);
 688                break;
 689#ifdef CONFIG_HOTPLUG_CPU
 690        case CPU_DEAD:
 691                /* fall thru */
 692        case CPU_UP_CANCELED:
 693                down(&cache_chain_sem);
 694
 695                list_for_each_entry(cachep, &cache_chain, next) {
 696                        struct array_cache *nc;
 697
 698                        spin_lock_irq(&cachep->spinlock);
 699                        /* cpu is dead; no one can alloc from it. */
 700                        nc = cachep->array[cpu];
 701                        cachep->array[cpu] = NULL;
 702                        cachep->free_limit -= cachep->batchcount;
 703                        free_block(cachep, ac_entry(nc), nc->avail);
 704                        spin_unlock_irq(&cachep->spinlock);
 705                        kfree(nc);
 706                }
 707                up(&cache_chain_sem);
 708                break;
 709#endif
 710        }
 711        return NOTIFY_OK;
 712bad:
 713        up(&cache_chain_sem);
 714        return NOTIFY_BAD;
 715}
 716
 717static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 718
 719/* Initialisation.
 720 * Called after the gfp() functions have been enabled, and before smp_init().
 721 */
 722void __init kmem_cache_init(void)
 723{
 724        size_t left_over;
 725        struct cache_sizes *sizes;
 726        struct cache_names *names;
 727
 728        /*
 729         * Fragmentation resistance on low memory - only use bigger
 730         * page orders on machines with more than 32MB of memory.
 731         */
 732        if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 733                slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 734
 735        
 736        /* Bootstrap is tricky, because several objects are allocated
 737         * from caches that do not exist yet:
 738         * 1) initialize the cache_cache cache: it contains the kmem_cache_t
 739         *    structures of all caches, except cache_cache itself: cache_cache
 740         *    is statically allocated.
 741         *    Initially an __init data area is used for the head array, it's
 742         *    replaced with a kmalloc allocated array at the end of the bootstrap.
 743         * 2) Create the first kmalloc cache.
 744         *    The kmem_cache_t for the new cache is allocated normally. An __init
 745         *    data area is used for the head array.
 746         * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
 747         * 4) Replace the __init data head arrays for cache_cache and the first
 748         *    kmalloc cache with kmalloc allocated arrays.
 749         * 5) Resize the head arrays of the kmalloc caches to their final sizes.
 750         */
 751
 752        /* 1) create the cache_cache */
 753        init_MUTEX(&cache_chain_sem);
 754        INIT_LIST_HEAD(&cache_chain);
 755        list_add(&cache_cache.next, &cache_chain);
 756        cache_cache.colour_off = cache_line_size();
 757        cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 758
 759        cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 760
 761        cache_estimate(0, cache_cache.objsize, cache_line_size(), 0,
 762                                &left_over, &cache_cache.num);
 763        if (!cache_cache.num)
 764                BUG();
 765
 766        cache_cache.colour = left_over/cache_cache.colour_off;
 767        cache_cache.colour_next = 0;
 768        cache_cache.slab_size = ALIGN(cache_cache.num*sizeof(kmem_bufctl_t) +
 769                                sizeof(struct slab), cache_line_size());
 770
 771        /* 2+3) create the kmalloc caches */
 772        sizes = malloc_sizes;
 773        names = cache_names;
 774
 775        while (sizes->cs_size) {
 776                /* For performance, all the general caches are L1 aligned.
 777                 * This should be particularly beneficial on SMP boxes, as it
 778                 * eliminates "false sharing".
 779                 * Note for systems short on memory removing the alignment will
 780                 * allow tighter packing of the smaller caches. */
 781                sizes->cs_cachep = kmem_cache_create(names->name,
 782                        sizes->cs_size, ARCH_KMALLOC_MINALIGN,
 783                        (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
 784
 785                /* Inc off-slab bufctl limit until the ceiling is hit. */
 786                if (!(OFF_SLAB(sizes->cs_cachep))) {
 787                        offslab_limit = sizes->cs_size-sizeof(struct slab);
 788                        offslab_limit /= sizeof(kmem_bufctl_t);
 789                }
 790
 791                sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
 792                        sizes->cs_size, ARCH_KMALLOC_MINALIGN,
 793                        (ARCH_KMALLOC_FLAGS | SLAB_CACHE_DMA | SLAB_PANIC),
 794                        NULL, NULL);
 795
 796                sizes++;
 797                names++;
 798        }
 799        /* 4) Replace the bootstrap head arrays */
 800        {
 801                void * ptr;
 802                
 803                ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 804                local_irq_disable();
 805                BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
 806                memcpy(ptr, ac_data(&cache_cache), sizeof(struct arraycache_init));
 807                cache_cache.array[smp_processor_id()] = ptr;
 808                local_irq_enable();
 809        
 810                ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
 811                local_irq_disable();
 812                BUG_ON(ac_data(malloc_sizes[0].cs_cachep) != &initarray_generic.cache);
 813                memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep),
 814                                sizeof(struct arraycache_init));
 815                malloc_sizes[0].cs_cachep->array[smp_processor_id()] = ptr;
 816                local_irq_enable();
 817        }
 818
 819        /* 5) resize the head arrays to their final sizes */
 820        {
 821                kmem_cache_t *cachep;
 822                down(&cache_chain_sem);
 823                list_for_each_entry(cachep, &cache_chain, next)
 824                        enable_cpucache(cachep);
 825                up(&cache_chain_sem);
 826        }
 827
 828        /* Done! */
 829        g_cpucache_up = FULL;
 830
 831        /* Register a cpu startup notifier callback
 832         * that initializes ac_data for all new cpus
 833         */
 834        register_cpu_notifier(&cpucache_notifier);
 835        
 836
 837        /* The reap timers are started later, with a module init call:
 838         * That part of the kernel is not yet operational.
 839         */
 840}
 841
 842static int __init cpucache_init(void)
 843{
 844        int cpu;
 845
 846        /* 
 847         * Register the timers that return unneeded
 848         * pages to gfp.
 849         */
 850        for (cpu = 0; cpu < NR_CPUS; cpu++) {
 851                if (cpu_online(cpu))
 852                        start_cpu_timer(cpu);
 853        }
 854
 855        return 0;
 856}
 857
 858__initcall(cpucache_init);
 859
 860/*
 861 * Interface to system's page allocator. No need to hold the cache-lock.
 862 *
 863 * If we requested dmaable memory, we will get it. Even if we
 864 * did not request dmaable memory, we might get it, but that
 865 * would be relatively rare and ignorable.
 866 */
 867static void *kmem_getpages(kmem_cache_t *cachep, int flags, int nodeid)
 868{
 869        struct page *page;
 870        void *addr;
 871        int i;
 872
 873        flags |= cachep->gfpflags;
 874        if (likely(nodeid == -1)) {
 875                addr = (void*)__get_free_pages(flags, cachep->gfporder);
 876                if (!addr)
 877                        return NULL;
 878                page = virt_to_page(addr);
 879        } else {
 880                page = alloc_pages_node(nodeid, flags, cachep->gfporder);
 881                if (!page)
 882                        return NULL;
 883                addr = page_address(page);
 884        }
 885
 886        i = (1 << cachep->gfporder);
 887        if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 888                atomic_add(i, &slab_reclaim_pages);
 889        add_page_state(nr_slab, i);
 890        while (i--) {
 891                SetPageSlab(page);
 892                page++;
 893        }
 894        return addr;
 895}
 896
 897/*
 898 * Interface to system's page release.
 899 */
 900static void kmem_freepages(kmem_cache_t *cachep, void *addr)
 901{
 902        unsigned long i = (1<<cachep->gfporder);
 903        struct page *page = virt_to_page(addr);
 904        const unsigned long nr_freed = i;
 905
 906        while (i--) {
 907                if (!TestClearPageSlab(page))
 908                        BUG();
 909                page++;
 910        }
 911        sub_page_state(nr_slab, nr_freed);
 912        if (current->reclaim_state)
 913                current->reclaim_state->reclaimed_slab += nr_freed;
 914        free_pages((unsigned long)addr, cachep->gfporder);
 915        if (cachep->flags & SLAB_RECLAIM_ACCOUNT) 
 916                atomic_sub(1<<cachep->gfporder, &slab_reclaim_pages);
 917}
 918
 919static void kmem_rcu_free(struct rcu_head *head)
 920{
 921        struct slab_rcu *slab_rcu = (struct slab_rcu *) head;
 922        kmem_cache_t *cachep = slab_rcu->cachep;
 923
 924        kmem_freepages(cachep, slab_rcu->addr);
 925        if (OFF_SLAB(cachep))
 926                kmem_cache_free(cachep->slabp_cache, slab_rcu);
 927}
 928
 929#if DEBUG
 930
 931#ifdef CONFIG_DEBUG_PAGEALLOC
 932static void store_stackinfo(kmem_cache_t *cachep, unsigned long *addr, unsigned long caller)
 933{
 934        int size = obj_reallen(cachep);
 935
 936        addr = (unsigned long *)&((char*)addr)[obj_dbghead(cachep)];
 937
 938        if (size < 5*sizeof(unsigned long))
 939                return;
 940
 941        *addr++=0x12345678;
 942        *addr++=caller;
 943        *addr++=smp_processor_id();
 944        size -= 3*sizeof(unsigned long);
 945        {
 946                unsigned long *sptr = &caller;
 947                unsigned long svalue;
 948
 949                while (!kstack_end(sptr)) {
 950                        svalue = *sptr++;
 951                        if (kernel_text_address(svalue)) {
 952                                *addr++=svalue;
 953                                size -= sizeof(unsigned long);
 954                                if (size <= sizeof(unsigned long))
 955                                        break;
 956                        }
 957                }
 958
 959        }
 960        *addr++=0x87654321;
 961}
 962#endif
 963
 964static void poison_obj(kmem_cache_t *cachep, void *addr, unsigned char val)
 965{
 966        int size = obj_reallen(cachep);
 967        addr = &((char*)addr)[obj_dbghead(cachep)];
 968
 969        memset(addr, val, size);
 970        *(unsigned char *)(addr+size-1) = POISON_END;
 971}
 972
 973static void dump_line(char *data, int offset, int limit)
 974{
 975        int i;
 976        printk(KERN_ERR "%03x:", offset);
 977        for (i=0;i<limit;i++) {
 978                printk(" %02x", (unsigned char)data[offset+i]);
 979        }
 980        printk("\n");
 981}
 982#endif
 983
 984#if DEBUG
 985
 986static void print_objinfo(kmem_cache_t *cachep, void *objp, int lines)
 987{
 988        int i, size;
 989        char *realobj;
 990
 991        if (cachep->flags & SLAB_RED_ZONE) {
 992                printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
 993                        *dbg_redzone1(cachep, objp),
 994                        *dbg_redzone2(cachep, objp));
 995        }
 996
 997        if (cachep->flags & SLAB_STORE_USER) {
 998                printk(KERN_ERR "Last user: [<%p>]",
 999                                *dbg_userword(cachep, objp));
1000                print_symbol("(%s)",
1001                                (unsigned long)*dbg_userword(cachep, objp));
1002                printk("\n");
1003        }
1004        realobj = (char*)objp+obj_dbghead(cachep);
1005        size = obj_reallen(cachep);
1006        for (i=0; i<size && lines;i+=16, lines--) {
1007                int limit;
1008                limit = 16;
1009                if (i+limit > size)
1010                        limit = size-i;
1011                dump_line(realobj, i, limit);
1012        }
1013}
1014
1015static void check_poison_obj(kmem_cache_t *cachep, void *objp)
1016{
1017        char *realobj;
1018        int size, i;
1019        int lines = 0;
1020
1021        realobj = (char*)objp+obj_dbghead(cachep);
1022        size = obj_reallen(cachep);
1023
1024        for (i=0;i<size;i++) {
1025                char exp = POISON_FREE;
1026                if (i == size-1)
1027                        exp = POISON_END;
1028                if (realobj[i] != exp) {
1029                        int limit;
1030                        /* Mismatch ! */
1031                        /* Print header */
1032                        if (lines == 0) {
1033                                printk(KERN_ERR "Slab corruption: start=%p, len=%d\n",
1034                                                realobj, size);
1035                                print_objinfo(cachep, objp, 0);
1036                        }
1037                        /* Hexdump the affected line */
1038                        i = (i/16)*16;
1039                        limit = 16;
1040                        if (i+limit > size)
1041                                limit = size-i;
1042                        dump_line(realobj, i, limit);
1043                        i += 16;
1044                        lines++;
1045                        /* Limit to 5 lines */
1046                        if (lines > 5)
1047                                break;
1048                }
1049        }
1050        if (lines != 0) {
1051                /* Print some data about the neighboring objects, if they
1052                 * exist:
1053                 */
1054                struct slab *slabp = GET_PAGE_SLAB(virt_to_page(objp));
1055                int objnr;
1056
1057                objnr = (objp-slabp->s_mem)/cachep->objsize;
1058                if (objnr) {
1059                        objp = slabp->s_mem+(objnr-1)*cachep->objsize;
1060                        realobj = (char*)objp+obj_dbghead(cachep);
1061                        printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
1062                                                realobj, size);
1063                        print_objinfo(cachep, objp, 2);
1064                }
1065                if (objnr+1 < cachep->num) {
1066                        objp = slabp->s_mem+(objnr+1)*cachep->objsize;
1067                        realobj = (char*)objp+obj_dbghead(cachep);
1068                        printk(KERN_ERR "Next obj: start=%p, len=%d\n",
1069                                                realobj, size);
1070                        print_objinfo(cachep, objp, 2);
1071                }
1072        }
1073}
1074#endif
1075
1076/* Destroy all the objs in a slab, and release the mem back to the system.
1077 * Before calling the slab must have been unlinked from the cache.
1078 * The cache-lock is not held/needed.
1079 */
1080static void slab_destroy (kmem_cache_t *cachep, struct slab *slabp)
1081{
1082        void *addr = slabp->s_mem - slabp->colouroff;
1083
1084#if DEBUG
1085        int i;
1086        for (i = 0; i < cachep->num; i++) {
1087                void *objp = slabp->s_mem + cachep->objsize * i;
1088
1089                if (cachep->flags & SLAB_POISON) {
1090#ifdef CONFIG_DEBUG_PAGEALLOC
1091                        if ((cachep->objsize%PAGE_SIZE)==0 && OFF_SLAB(cachep))
1092                                kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE,1);
1093                        else
1094                                check_poison_obj(cachep, objp);
1095#else
1096                        check_poison_obj(cachep, objp);
1097#endif
1098                }
1099                if (cachep->flags & SLAB_RED_ZONE) {
1100                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1101                                slab_error(cachep, "start of a freed object "
1102                                                        "was overwritten");
1103                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1104                                slab_error(cachep, "end of a freed object "
1105                                                        "was overwritten");
1106                }
1107                if (cachep->dtor && !(cachep->flags & SLAB_POISON))
1108                        (cachep->dtor)(objp+obj_dbghead(cachep), cachep, 0);
1109        }
1110#else
1111        if (cachep->dtor) {
1112                int i;
1113                for (i = 0; i < cachep->num; i++) {
1114                        void* objp = slabp->s_mem+cachep->objsize*i;
1115                        (cachep->dtor)(objp, cachep, 0);
1116                }
1117        }
1118#endif
1119
1120        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
1121                struct slab_rcu *slab_rcu;
1122
1123                slab_rcu = (struct slab_rcu *) slabp;
1124                slab_rcu->cachep = cachep;
1125                slab_rcu->addr = addr;
1126                call_rcu(&slab_rcu->head, kmem_rcu_free);
1127        } else {
1128                kmem_freepages(cachep, addr);
1129                if (OFF_SLAB(cachep))
1130                        kmem_cache_free(cachep->slabp_cache, slabp);
1131        }
1132}
1133
1134/**
1135 * kmem_cache_create - Create a cache.
1136 * @name: A string which is used in /proc/slabinfo to identify this cache.
1137 * @size: The size of objects to be created in this cache.
1138 * @align: The required alignment for the objects.
1139 * @flags: SLAB flags
1140 * @ctor: A constructor for the objects.
1141 * @dtor: A destructor for the objects.
1142 *
1143 * Returns a ptr to the cache on success, NULL on failure.
1144 * Cannot be called from interrupt context, but can be interrupted.
1145 * The @ctor is run when new pages are allocated by the cache
1146 * and the @dtor is run before the pages are handed back.
1147 *
1148 * @name must be valid until the cache is destroyed. This implies that
1149 * the module calling this has to destroy the cache before getting 
1150 * unloaded.
1151 * 
1152 * The flags are
1153 *
1154 * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
1155 * to catch references to uninitialised memory.
1156 *
1157 * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
1158 * for buffer overruns.
1159 *
1160 * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
1161 * memory pressure.
1162 *
1163 * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
1164 * cacheline.  This can be beneficial if you're counting cycles as closely
1165 * as davem.
1166 */
1167kmem_cache_t *
1168kmem_cache_create (const char *name, size_t size, size_t align,
1169        unsigned long flags, void (*ctor)(void*, kmem_cache_t *, unsigned long),
1170        void (*dtor)(void*, kmem_cache_t *, unsigned long))
1171{
1172        size_t left_over, slab_size;
1173        kmem_cache_t *cachep = NULL;
1174
1175        /*
1176         * Sanity checks... these are all serious usage bugs.
1177         */
1178        if ((!name) ||
1179                in_interrupt() ||
1180                (size < BYTES_PER_WORD) ||
1181                (size > (1<<MAX_OBJ_ORDER)*PAGE_SIZE) ||
1182                (dtor && !ctor)) {
1183                        printk(KERN_ERR "%s: Early error in slab %s\n",
1184                                        __FUNCTION__, name);
1185                        BUG();
1186                }
1187
1188#if DEBUG
1189        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
1190        if ((flags & SLAB_DEBUG_INITIAL) && !ctor) {
1191                /* No constructor, but initial state check requested */
1192                printk(KERN_ERR "%s: No con, but init state check "
1193                                "requested - %s\n", __FUNCTION__, name);
1194                flags &= ~SLAB_DEBUG_INITIAL;
1195        }
1196
1197#if FORCED_DEBUG
1198        /*
1199         * Enable redzoning and last user accounting, except for caches with
1200         * large objects, if the increased size would increase the object size
1201         * above the next power of two: caches with object sizes just above a
1202         * power of two have a significant amount of internal fragmentation.
1203         */
1204        if ((size < 4096 || fls(size-1) == fls(size-1+3*BYTES_PER_WORD)))
1205                flags |= SLAB_RED_ZONE|SLAB_STORE_USER;
1206        if (!(flags & SLAB_DESTROY_BY_RCU))
1207                flags |= SLAB_POISON;
1208#endif
            /* Poisoning and RCU-deferred destruction are mutually exclusive. */
1209        if (flags & SLAB_DESTROY_BY_RCU)
1210                BUG_ON(flags & SLAB_POISON);
1211#endif
            /* A cache destroyed by RCU must not have a destructor. */
1212        if (flags & SLAB_DESTROY_BY_RCU)
1213                BUG_ON(dtor);
1214
1215        /*
1216         * Always checks flags, a caller might be expecting debug
1217         * support which isn't available.
1218         */
1219        if (flags & ~CREATE_MASK)
1220                BUG();
1221
1222        if (align) {
1223                /* combinations of forced alignment and advanced debugging is
1224                 * not yet implemented.
1225                 */
1226                flags &= ~(SLAB_RED_ZONE|SLAB_STORE_USER);
1227        } else {
1228                if (flags & SLAB_HWCACHE_ALIGN) {
1229                        /* Default alignment: as specified by the arch code.
1230                         * Except if an object is really small, then squeeze multiple
1231                         * into one cacheline.
1232                         */
1233                        align = cache_line_size();
1234                        while (size <= align/2)
1235                                align /= 2;
1236                } else {
1237                        align = BYTES_PER_WORD;
1238                }
1239        }
1240
1241        /* Get cache's description obj. */
1242        cachep = (kmem_cache_t *) kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
1243        if (!cachep)
1244                goto opps;
1245        memset(cachep, 0, sizeof(kmem_cache_t));
1246
1247        /* Check that size is in terms of words.  This is needed to avoid
1248         * unaligned accesses for some archs when redzoning is used, and makes
1249         * sure any on-slab bufctl's are also correctly aligned.
1250         */
1251        if (size & (BYTES_PER_WORD-1)) {
1252                size += (BYTES_PER_WORD-1);
1253                size &= ~(BYTES_PER_WORD-1);
1254        }
1255        
1256#if DEBUG
            /* Remember the payload size before any debug words are added. */
1257        cachep->reallen = size;
1258
1259        if (flags & SLAB_RED_ZONE) {
1260                /* redzoning only works with word aligned caches */
1261                align = BYTES_PER_WORD;
1262
1263                /* add space for red zone words */
1264                cachep->dbghead += BYTES_PER_WORD;
1265                size += 2*BYTES_PER_WORD;
1266        }
1267        if (flags & SLAB_STORE_USER) {
1268                /* user store requires word alignment and
1269                 * one word storage behind the end of the real
1270                 * object.
1271                 */
1272                align = BYTES_PER_WORD;
1273                size += BYTES_PER_WORD;
1274        }
1275#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
            /*
             * Grow qualifying objects to exactly one page; the extra bytes
             * are added to dbghead, i.e. they sit in front of the payload.
             */
1276        if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
1277                cachep->dbghead += PAGE_SIZE - size;
1278                size = PAGE_SIZE;
1279        }
1280#endif
1281#endif
1282
1283        /* Determine if the slab management is 'on' or 'off' slab. */
1284        if (size >= (PAGE_SIZE>>3))
1285                /*
1286                 * Size is large, assume best to place the slab management obj
1287                 * off-slab (should allow better packing of objs).
1288                 */
1289                flags |= CFLGS_OFF_SLAB;
1290
1291        size = ALIGN(size, align);
1292
1293        if ((flags & SLAB_RECLAIM_ACCOUNT) && size <= PAGE_SIZE) {
1294                /*
1295                 * A VFS-reclaimable slab tends to have most allocations
1296                 * as GFP_NOFS and we really don't want to have to be allocating
1297                 * higher-order pages when we are unable to shrink dcache.
1298                 */
1299                cachep->gfporder = 0;
1300                cache_estimate(cachep->gfporder, size, align, flags,
1301                                        &left_over, &cachep->num);
1302        } else {
1303                /*
1304                 * Calculate size (in pages) of slabs, and the num of objs per
1305                 * slab.  This could be made much more intelligent.  For now,
1306                 * try to avoid using high page-orders for slabs.  When the
1307                 * gfp() funcs are more friendly towards high-order requests,
1308                 * this should be changed.
1309                 */
                    /*
                     * Loop shape: gfporder starts at 0 (the descriptor was
                     * zeroed above) and is raised at "next" until one of the
                     * break conditions holds.  break_flag is set only when the
                     * order has just been stepped back down because an
                     * off-slab cache exceeded offslab_limit; control then
                     * jumps to cal_wastage to redo the estimate once for the
                     * lower order and leaves the loop.
                     */
1310                do {
1311                        unsigned int break_flag = 0;
1312cal_wastage:
1313                        cache_estimate(cachep->gfporder, size, align, flags,
1314                                                &left_over, &cachep->num);
1315                        if (break_flag)
1316                                break;
1317                        if (cachep->gfporder >= MAX_GFP_ORDER)
1318                                break;
1319                        if (!cachep->num)
1320                                goto next;
1321                        if (flags & CFLGS_OFF_SLAB &&
1322                                        cachep->num > offslab_limit) {
1323                                /* This num of objs will cause problems. */
1324                                cachep->gfporder--;
1325                                break_flag++;
1326                                goto cal_wastage;
1327                        }
1328
1329                        /*
1330                         * Large num of objs is good, but v. large slabs are
1331                         * currently bad for the gfp()s.
1332                         */
1333                        if (cachep->gfporder >= slab_break_gfp_order)
1334                                break;
1335
1336                        if ((left_over*8) <= (PAGE_SIZE<<cachep->gfporder))
1337                                break;  /* Acceptable internal fragmentation. */
1338next:
1339                        cachep->gfporder++;
1340                } while (1);
1341        }
1342
            /* The estimate ended with zero objects per slab: give up. */
1343        if (!cachep->num) {
1344                printk("kmem_cache_create: couldn't create cache %s.\n", name);
1345                kmem_cache_free(&cache_cache, cachep);
1346                cachep = NULL;
1347                goto opps;
1348        }
1349        slab_size = ALIGN(cachep->num*sizeof(kmem_bufctl_t)
1350                                + sizeof(struct slab), align);
1351
1352        /*
1353         * If the slab has been placed off-slab, and we have enough space then
1354         * move it on-slab. This is at the expense of any extra colouring.
1355         */
1356        if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
1357                flags &= ~CFLGS_OFF_SLAB;
1358                left_over -= slab_size;
1359        }
1360
1361        if (flags & CFLGS_OFF_SLAB) {
1362                /* really off slab. No need for manual alignment */
1363                slab_size = cachep->num*sizeof(kmem_bufctl_t)+sizeof(struct slab);
1364        }
1365
1366        cachep->colour_off = cache_line_size();
1367        /* Offset must be a multiple of the alignment. */
1368        if (cachep->colour_off < align)
1369                cachep->colour_off = align;
1370        cachep->colour = left_over/cachep->colour_off;
1371        cachep->slab_size = slab_size;
1372        cachep->flags = flags;
1373        cachep->gfpflags = 0;
1374        if (flags & SLAB_CACHE_DMA)
1375                cachep->gfpflags |= GFP_DMA;
1376        spin_lock_init(&cachep->spinlock);
1377        cachep->objsize = size;
1378        /* NUMA */
1379        INIT_LIST_HEAD(&cachep->lists.slabs_full);
1380        INIT_LIST_HEAD(&cachep->lists.slabs_partial);
1381        INIT_LIST_HEAD(&cachep->lists.slabs_free);
1382
            /* Off-slab descriptors are allocated from a general cache. */
1383        if (flags & CFLGS_OFF_SLAB)
1384                cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
1385        cachep->ctor = ctor;
1386        cachep->dtor = dtor;
1387        cachep->name = name;
1388
1389        /* Don't let CPUs come and go */
1390        lock_cpu_hotplug();
1391
            /*
             * Once g_cpucache_up is FULL the regular enable_cpucache() path is
             * used.  Before that, a minimal per-CPU array is set up by hand
             * for the current CPU only: the static initarray_generic for the
             * very first cache, a kmalloc'ed struct arraycache_init for the
             * following ones.
             */
1392        if (g_cpucache_up == FULL) {
1393                enable_cpucache(cachep);
1394        } else {
1395                if (g_cpucache_up == NONE) {
1396                        /* Note: the first kmem_cache_create must create
1397                         * the cache that's used by kmalloc(24), otherwise
1398                         * the creation of further caches will BUG().
1399                         */
1400                        cachep->array[smp_processor_id()] =
1401                                        &initarray_generic.cache;
1402                        g_cpucache_up = PARTIAL;
1403                } else {
1404                        cachep->array[smp_processor_id()] =
1405                                kmalloc(sizeof(struct arraycache_init),
1406                                        GFP_KERNEL);
1407                }
1408                BUG_ON(!ac_data(cachep));
1409                ac_data(cachep)->avail = 0;
1410                ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
1411                ac_data(cachep)->batchcount = 1;
1412                ac_data(cachep)->touched = 0;
1413                cachep->batchcount = 1;
1414                cachep->limit = BOOT_CPUCACHE_ENTRIES;
1415                cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
1416                                        + cachep->num;
1417        } 
1418
            /*
             * The first reap time gets an offset derived from the descriptor
             * address (presumably so that caches do not all reap at once).
             */
1419        cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
1420                                ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
1421
1422        /* Need the semaphore to access the chain. */
1423        down(&cache_chain_sem);
1424        {
1425                struct list_head *p;
1426                mm_segment_t old_fs;
1427
                    /*
                     * NOTE(review): the address limit is switched to KERNEL_DS
                     * around the loop, presumably so that the __get_user()
                     * probe of pc->name accepts a kernel pointer - confirm.
                     */
1428                old_fs = get_fs();
1429                set_fs(KERNEL_DS);
1430                list_for_each(p, &cache_chain) {
1431                        kmem_cache_t *pc = list_entry(p, kmem_cache_t, next);
1432                        char tmp;
1433
1434                        /*
1435                         * This happens when the module gets unloaded and
1436                         * doesn't destroy its slab cache and no one else reuses
1437                         * the vmalloc area of the module. Print a warning.
1438                         */
1439#ifdef CONFIG_X86_UACCESS_INDIRECT
1440                        if (__direct_get_user(tmp,pc->name)) {
1441#else
1442                        if (__get_user(tmp,pc->name)) {
1443#endif
1444                                printk("SLAB: cache with size %d has lost its "
1445                                                "name\n", pc->objsize);
1446                                continue; 
1447                        }       
                            /* Two caches with the same name are a fatal usage bug. */
1448                        if (!strcmp(pc->name,name)) { 
1449                                printk("kmem_cache_create: duplicate "
1450                                                "cache %s\n",name);
1451                                up(&cache_chain_sem); 
1452                                unlock_cpu_hotplug();
1453                                BUG(); 
1454                        }       
1455                }
1456                set_fs(old_fs);
1457        }
1458
1459        /* cache setup completed, link it into the list */
1460        list_add(&cachep->next, &cache_chain);
1461        up(&cache_chain_sem);
1462        unlock_cpu_hotplug();
1463opps:
            /* cachep is NULL here only when one of the failure paths was taken. */
1464        if (!cachep && (flags & SLAB_PANIC))
1465                panic("kmem_cache_create(): failed to create slab `%s'\n",
1466                        name);
1467        return cachep;
1468}
1469EXPORT_SYMBOL(kmem_cache_create);
1470
#if DEBUG
/* Assert that local interrupts are currently disabled. */
static void check_irq_off(void)
{
        if (unlikely(!irqs_disabled()))
                BUG();
}

/* Assert that local interrupts are currently enabled. */
static void check_irq_on(void)
{
        if (unlikely(irqs_disabled()))
                BUG();
}

/*
 * Assert (SMP builds only) that interrupts are off and that the cache
 * spinlock is already taken: a successful trylock means it was not.
 */
static void check_spinlock_acquired(kmem_cache_t *cachep)
{
#ifdef CONFIG_SMP
        check_irq_off();
        if (unlikely(spin_trylock(&cachep->spinlock)))
                BUG();
#endif
}
#else
/* Non-debug builds: the checks compile away. */
#define check_irq_off() do { } while(0)
#define check_irq_on()  do { } while(0)
#define check_spinlock_acquired(x) do { } while(0)
#endif
1494
/*
 * Run func(arg) on every CPU and wait until all of them are done.
 *
 * The local CPU runs it directly with interrupts disabled; the others
 * are reached through smp_call_function().  Must be entered with
 * interrupts enabled; preemption is held off for the duration.
 */
static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg)
{
        int failed;

        check_irq_on();
        preempt_disable();

        /* This CPU first. */
        local_irq_disable();
        func(arg);
        local_irq_enable();

        /* Then the others; a non-zero return is treated as fatal. */
        failed = smp_call_function(func, arg, 1, 1);
        if (failed)
                BUG();

        preempt_enable();
}
1512
1513static void drain_array_locked(kmem_cache_t* cachep,
1514                                struct array_cache *ac, int force);
1515
1516static void do_drain(void *arg)
1517{
1518        kmem_cache_t *cachep = (kmem_cache_t*)arg;
1519        struct array_cache *ac;
1520
1521        check_irq_off();
1522        ac = ac_data(cachep);
1523        spin_lock(&cachep->spinlock);
1524        free_block(cachep, &ac_entry(ac)[0], ac->avail);
1525        spin_unlock(&cachep->spinlock);
1526        ac->avail = 0;
1527}
1528
1529static void drain_cpu_caches(kmem_cache_t *cachep)
1530{
1531        smp_call_function_all_cpus(do_drain, cachep);
1532        check_irq_on();
1533        spin_lock_irq(&cachep->spinlock);
1534        if (cachep->lists.shared)
1535                drain_array_locked(cachep, cachep->lists.shared, 1);
1536        spin_unlock_irq(&cachep->spinlock);
1537}
1538
1539
1540/* NUMA shrink all list3s */
1541static int __cache_shrink(kmem_cache_t *cachep)
1542{
1543        struct slab *slabp;
1544        int ret;
1545
1546        drain_cpu_caches(cachep);
1547
1548        check_irq_on();
1549        spin_lock_irq(&cachep->spinlock);
1550
1551        for(;;) {
1552                struct list_head *p;
1553
1554                p = cachep->lists.slabs_free.prev;
1555                if (p == &cachep->lists.slabs_free)
1556                        break;
1557
1558                slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
1559#if DEBUG
1560                if (slabp->inuse)
1561                        BUG();
1562#endif
1563                list_del(&slabp->list);
1564
1565                cachep->lists.free_objects -= cachep->num;
1566                spin_unlock_irq(&cachep->spinlock);
1567                slab_destroy(cachep, slabp);
1568                spin_lock_irq(&cachep->spinlock);
1569        }
1570        ret = !list_empty(&cachep->lists.slabs_full) ||
1571                !list_empty(&cachep->lists.slabs_partial);
1572        spin_unlock_irq(&cachep->spinlock);
1573        return ret;
1574}
1575
1576/**
1577 * kmem_cache_shrink - Shrink a cache.
1578 * @cachep: The cache to shrink.
1579 *
1580 * Releases as many slabs as possible for a cache.
1581 * To help debugging, a zero exit status indicates all slabs were released.
1582 */
1583int kmem_cache_shrink(kmem_cache_t *cachep)
1584{
1585        if (!cachep || in_interrupt())
1586                BUG();
1587
1588        return __cache_shrink(cachep);
1589}
1590
1591EXPORT_SYMBOL(kmem_cache_shrink);
1592
1593/**
1594 * kmem_cache_destroy - delete a cache
1595 * @cachep: the cache to destroy
1596 *
1597 * Remove a kmem_cache_t object from the slab cache.
1598 * Returns 0 on success.
1599 *
1600 * It is expected this function will be called by a module when it is
1601 * unloaded.  This will remove the cache completely, and avoid a duplicate
1602 * cache being allocated each time a module is loaded and unloaded, if the
1603 * module doesn't have persistent in-kernel storage across loads and unloads.
1604 *
1605 * The cache must be empty before calling this function.
1606 *
1607 * The caller must guarantee that noone will allocate memory from the cache
1608 * during the kmem_cache_destroy().
1609 */
1610int kmem_cache_destroy (kmem_cache_t * cachep)
1611{
1612        int i;
1613
1614        if (!cachep || in_interrupt())
1615                BUG();
1616
1617        /* Don't let CPUs to come and go */
1618        lock_cpu_hotplug();
1619
1620        /* Find the cache in the chain of caches. */
1621        down(&cache_chain_sem);
1622        /*
1623         * the chain is never empty, cache_cache is never destroyed
1624         */
1625        list_del(&cachep->next);
1626        up(&cache_chain_sem);
1627
1628        if (__cache_shrink(cachep)) {
1629                slab_error(cachep, "Can't free all objects");
1630                down(&cache_chain_sem);
1631                list_add(&cachep->next,&cache_chain);
1632                up(&cache_chain_sem);
1633                unlock_cpu_hotplug();
1634                return 1;
1635        }
1636
1637        if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
1638                synchronize_kernel();
1639
1640        /* no cpu_online check required here since we clear the percpu
1641         * array on cpu offline and set this to NULL.
1642         */
1643        for (i = 0; i < NR_CPUS; i++)
1644                kfree(cachep->array[i]);
1645
1646        /* NUMA: free the list3 structures */
1647        kfree(cachep->lists.shared);
1648        cachep->lists.shared = NULL;
1649        kmem_cache_free(&cache_cache, cachep);
1650
1651        unlock_cpu_hotplug();
1652
1653        return 0;
1654}
1655
1656EXPORT_SYMBOL(kmem_cache_destroy);
1657
1658/* Get the memory for a slab management obj. */
1659static struct slab* alloc_slabmgmt (kmem_cache_t *cachep,
1660                        void *objp, int colour_off, int local_flags)
1661{
1662        struct slab *slabp;
1663        
1664        if (OFF_SLAB(cachep)) {
1665                /* Slab management obj is off-slab. */
1666                slabp = kmem_cache_alloc(cachep->slabp_cache, local_flags);
1667                if (!slabp)
1668                        return NULL;
1669        } else {
1670                slabp = objp+colour_off;
1671                colour_off += cachep->slab_size;
1672        }
1673        slabp->inuse = 0;
1674        slabp->colouroff = colour_off;
1675        slabp->s_mem = objp+colour_off;
1676
1677        return slabp;
1678}
1679
1680static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
1681{
1682        return (kmem_bufctl_t *)(slabp+1);
1683}
1684
1685static void cache_init_objs (kmem_cache_t * cachep,
1686                        struct slab * slabp, unsigned long ctor_flags)
1687{
1688        int i;
1689
1690        for (i = 0; i < cachep->num; i++) {
1691                void* objp = slabp->s_mem+cachep->objsize*i;
1692#if DEBUG
1693                /* need to poison the objs? */
1694                if (cachep->flags & SLAB_POISON)
1695                        poison_obj(cachep, objp, POISON_FREE);
1696                if (cachep->flags & SLAB_STORE_USER)
1697                        *dbg_userword(cachep, objp) = NULL;
1698
1699                if (cachep->flags & SLAB_RED_ZONE) {
1700                        *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1701                        *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1702                }
1703                /*
1704                 * Constructors are not allowed to allocate memory from
1705                 * the same cache which they are a constructor for.
1706                 * Otherwise, deadlock. They must also be threaded.
1707                 */
1708                if (cachep->ctor && !(cachep->flags & SLAB_POISON))
1709                        cachep->ctor(objp+obj_dbghead(cachep), cachep, ctor_flags);
1710
1711                if (cachep->flags & SLAB_RED_ZONE) {
1712                        if (*dbg_redzone2(cachep, objp) != RED_INACTIVE)
1713                                slab_error(cachep, "constructor overwrote the"
1714                                                        " end of an object");
1715                        if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
1716                                slab_error(cachep, "constructor overwrote the"
1717                                                        " start of an object");
1718                }
1719                if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
1720                        kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1721#else
1722                if (cachep->ctor)
1723                        cachep->ctor(objp, cachep, ctor_flags);
1724#endif
1725                slab_bufctl(slabp)[i] = i+1;
1726        }
1727        slab_bufctl(slabp)[i-1] = BUFCTL_END;
1728        slabp->free = 0;
1729}
1730
1731static void kmem_flagcheck(kmem_cache_t *cachep, int flags)
1732{
1733        if (flags & SLAB_DMA) {
1734                if (!(cachep->gfpflags & GFP_DMA))
1735                        BUG();
1736        } else {
1737                if (cachep->gfpflags & GFP_DMA)
1738                        BUG();
1739        }
1740}
1741
1742static void set_slab_attr(kmem_cache_t *cachep, struct slab *slabp, void *objp)
1743{
1744        int i;
1745        struct page *page;
1746
1747        /* Nasty!!!!!! I hope this is OK. */
1748        i = 1 << cachep->gfporder;
1749        page = virt_to_page(objp);
1750        do {
1751                SET_PAGE_CACHE(page, cachep);
1752                SET_PAGE_SLAB(page, slabp);
1753                page++;
1754        } while (--i);
1755}
1756
1757/*
1758 * Grow (by 1) the number of slabs within a cache.  This is called by
1759 * kmem_cache_alloc() when there are no active objs left in a cache.
1760 */
1761static int cache_grow (kmem_cache_t * cachep, int flags)
1762{
1763        struct slab     *slabp;
1764        void            *objp;
1765        size_t           offset;
1766        int              local_flags;
1767        unsigned long    ctor_flags;
1768
1769        /* Be lazy and only check for valid flags here,
1770         * keeping it out of the critical path in kmem_cache_alloc().
1771         */
1772        if (flags & ~(SLAB_DMA|SLAB_LEVEL_MASK|SLAB_NO_GROW))
1773                BUG();
1774        if (flags & SLAB_NO_GROW)
1775                return 0;
1776
1777        ctor_flags = SLAB_CTOR_CONSTRUCTOR;
1778        local_flags = (flags & SLAB_LEVEL_MASK);
1779        if (!(local_flags & __GFP_WAIT))
1780                /*
1781                 * Not allowed to sleep.  Need to tell a constructor about
1782                 * this - it might need to know...
1783                 */
1784                ctor_flags |= SLAB_CTOR_ATOMIC;
1785
1786        /* About to mess with non-constant members - lock. */
1787        check_irq_off();
1788        spin_lock(&cachep->spinlock);
1789
1790        /* Get colour for the slab, and cal the next value. */
1791        offset = cachep->colour_next;
1792        cachep->colour_next++;
1793        if (cachep->colour_next >= cachep->colour)
1794                cachep->colour_next = 0;
1795        offset *= cachep->colour_off;
1796
1797        spin_unlock(&cachep->spinlock);
1798
1799        if (local_flags & __GFP_WAIT)
1800                local_irq_enable();
1801
1802        /*
1803         * The test for missing atomic flag is performed here, rather than
1804         * the more obvious place, simply to reduce the critical path length
1805         * in kmem_cache_alloc(). If a caller is seriously mis-behaving they
1806         * will eventually be caught here (where it matters).
1807         */
1808        kmem_flagcheck(cachep, flags);
1809
1810
1811        /* Get mem for the objs. */
1812        if (!(objp = kmem_getpages(cachep, flags, -1)))
1813                goto failed;
1814
1815        /* Get slab management. */
1816        if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
1817                goto opps1;
1818
1819        set_slab_attr(cachep, slabp, objp);
1820
1821        cache_init_objs(cachep, slabp, ctor_flags);
1822
1823        if (local_flags & __GFP_WAIT)
1824                local_irq_disable();
1825        check_irq_off();
1826        spin_lock(&cachep->spinlock);
1827
1828        /* Make slab active. */
1829        list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
1830        STATS_INC_GROWN(cachep);
1831        list3_data(cachep)->free_objects += cachep->num;
1832        spin_unlock(&cachep->spinlock);
1833        return 1;
1834opps1:
1835        kmem_freepages(cachep, objp);
1836failed:
1837        if (local_flags & __GFP_WAIT)
1838                local_irq_disable();
1839        return 0;
1840}
1841
1842#if DEBUG
1843
1844/*
1845 * Perform extra freeing checks:
1846 * - detect bad pointers.
1847 * - POISON/RED_ZONE checking
1848 * - destructor calls, for caches with POISON+dtor
1849 */
1850static void kfree_debugcheck(const void *objp)
1851{
1852        struct page *page;
1853
1854        if (!virt_addr_valid(objp)) {
1855                printk(KERN_ERR "kfree_debugcheck: out of range ptr %lxh.\n",
1856                        (unsigned long)objp);   
1857                BUG();  
1858        }
1859        page = virt_to_page(objp);
1860        if (!PageSlab(page)) {
1861                printk(KERN_ERR "kfree_debugcheck: bad ptr %lxh.\n", (unsigned long)objp);
1862                BUG();
1863        }
1864}
1865
1866static void *cache_free_debugcheck (kmem_cache_t * cachep, void * objp, void *caller)
1867{
1868        struct page *page;
1869        unsigned int objnr;
1870        struct slab *slabp;
1871
1872        objp -= obj_dbghead(cachep);
1873        kfree_debugcheck(objp);
1874        page = virt_to_page(objp);
1875
1876        if (GET_PAGE_CACHE(page) != cachep) {
1877                printk(KERN_ERR "mismatch in kmem_cache_free: expected cache %p, got %p\n",
1878                                GET_PAGE_CACHE(page),cachep);
1879                printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
1880                printk(KERN_ERR "%p is %s.\n", GET_PAGE_CACHE(page), GET_PAGE_CACHE(page)->name);
1881                WARN_ON(1);
1882        }
1883        slabp = GET_PAGE_SLAB(page);
1884
1885        if (cachep->flags & SLAB_RED_ZONE) {
1886                if (*dbg_redzone1(cachep, objp) != RED_ACTIVE || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
1887                        slab_error(cachep, "double free, or memory outside"
1888                                                " object was overwritten");
1889                        printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
1890                                        objp, *dbg_redzone1(cachep, objp), *dbg_redzone2(cachep, objp));
1891                }
1892                *dbg_redzone1(cachep, objp) = RED_INACTIVE;
1893                *dbg_redzone2(cachep, objp) = RED_INACTIVE;
1894        }
1895        if (cachep->flags & SLAB_STORE_USER)
1896                *dbg_userword(cachep, objp) = caller;
1897
1898        objnr = (objp-slabp->s_mem)/cachep->objsize;
1899
1900        BUG_ON(objnr >= cachep->num);
1901        BUG_ON(objp != slabp->s_mem + objnr*cachep->objsize);
1902
1903        if (cachep->flags & SLAB_DEBUG_INITIAL) {
1904                /* Need to call the slab's constructor so the
1905                 * caller can perform a verify of its state (debugging).
1906                 * Called without the cache-lock held.
1907                 */
1908                cachep->ctor(objp+obj_dbghead(cachep),
1909                                        cachep, SLAB_CTOR_CONSTRUCTOR|SLAB_CTOR_VERIFY);
1910        }
1911        if (cachep->flags & SLAB_POISON && cachep->dtor) {
1912                /* we want to cache poison the object,
1913                 * call the destruction callback
1914                 */
1915                cachep->dtor(objp+obj_dbghead(cachep), cachep, 0);
1916        }
1917        if (cachep->flags & SLAB_POISON) {
1918#ifdef CONFIG_DEBUG_PAGEALLOC
1919                if ((cachep->objsize % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
1920                        store_stackinfo(cachep, objp, (unsigned long)caller);
1921                        kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 0);
1922                } else {
1923                        poison_obj(cachep, objp, POISON_FREE);
1924                }
1925#else
1926                poison_obj(cachep, objp, POISON_FREE);
1927#endif
1928        }
1929        return objp;
1930}
1931
1932static void check_slabp(kmem_cache_t *cachep, struct slab *slabp)
1933{
1934        int i;
1935        int entries = 0;
1936        
1937        check_spinlock_acquired(cachep);
1938        /* Check slab's freelist to see if this obj is there. */
1939        for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
1940                entries++;
1941                if (entries > cachep->num || i < 0 || i >= cachep->num)
1942                        goto bad;
1943        }
1944        if (entries != cachep->num - slabp->inuse) {
1945                int i;
1946bad:
1947                printk(KERN_ERR "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
1948                                cachep->name, cachep->num, slabp, slabp->inuse);
1949                for (i=0;i<sizeof(slabp)+cachep->num*sizeof(kmem_bufctl_t);i++) {
1950                        if ((i%16)==0)
1951                                printk("\n%03x:", i);
1952                        printk(" %02x", ((unsigned char*)slabp)[i]);
1953                }
1954                printk("\n");
1955                BUG();
1956        }
1957}
1958#else
/*
 * Non-DEBUG builds: the free-side checks compile away.
 * cache_free_debugcheck() still yields the object pointer unchanged.
 */
#define kfree_debugcheck(x)			do { } while (0)
#define cache_free_debugcheck(x, objp, z)	(objp)
#define check_slabp(x, y)			do { } while (0)
1962#endif
1963
1964static void* cache_alloc_refill(kmem_cache_t* cachep, int flags)
1965{
1966        int batchcount;
1967        struct kmem_list3 *l3;
1968        struct array_cache *ac;
1969
1970        check_irq_off();
1971        ac = ac_data(cachep);
1972retry:
1973        batchcount = ac->batchcount;
1974        if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
1975                /* if there was little recent activity on this
1976                 * cache, then perform only a partial refill.
1977                 * Otherwise we could generate refill bouncing.
1978                 */
1979                batchcount = BATCHREFILL_LIMIT;
1980        }
1981        l3 = list3_data(cachep);
1982
1983        BUG_ON(ac->avail > 0);
1984        spin_lock(&cachep->spinlock);
1985        if (l3->shared) {
1986                struct array_cache *shared_array = l3->shared;
1987                if (shared_array->avail) {
1988                        if (batchcount > shared_array->avail)
1989                                batchcount = shared_array->avail;
1990                        shared_array->avail -= batchcount;
1991                        ac->avail = batchcount;
1992                        memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
1993                                        sizeof(void*)*batchcount);
1994                        shared_array->touched = 1;
1995                        goto alloc_done;
1996                }
1997        }
1998        while (batchcount > 0) {
1999                struct list_head *entry;
2000                struct slab *slabp;
2001                /* Get slab alloc is to come from. */
2002                entry = l3->slabs_partial.next;
2003                if (entry == &l3->slabs_partial) {
2004                        l3->free_touched = 1;
2005                        entry = l3->slabs_free.next;
2006                        if (entry == &l3->slabs_free)
2007                                goto must_grow;
2008                }
2009
2010                slabp = list_entry(entry, struct slab, list);
2011                check_slabp(cachep, slabp);
2012                check_spinlock_acquired(cachep);
2013                while (slabp->inuse < cachep->num && batchcount--) {
2014                        kmem_bufctl_t next;
2015                        STATS_INC_ALLOCED(cachep);
2016                        STATS_INC_ACTIVE(cachep);
2017                        STATS_SET_HIGH(cachep);
2018
2019                        /* get obj pointer */
2020                        ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
2021
2022                        slabp->inuse++;
2023                        next = slab_bufctl(slabp)[slabp->free];
2024#if DEBUG
2025                        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2026#endif
2027                        slabp->free = next;
2028                }
2029                check_slabp(cachep, slabp);
2030
2031                /* move slabp to correct slabp list: */
2032                list_del(&slabp->list);
2033                if (slabp->free == BUFCTL_END)
2034                        list_add(&slabp->list, &l3->slabs_full);
2035                else
2036                        list_add(&slabp->list, &l3->slabs_partial);
2037        }
2038
2039must_grow:
2040        l3->free_objects -= ac->avail;
2041alloc_done:
2042        spin_unlock(&cachep->spinlock);
2043
2044        if (unlikely(!ac->avail)) {
2045                int x;
2046                x = cache_grow(cachep, flags);
2047                
2048                // cache_grow can reenable interrupts, then ac could change.
2049                ac = ac_data(cachep);
2050                if (!x && ac->avail == 0)       // no objects in sight? abort
2051                        return NULL;
2052
2053                if (!ac->avail)         // objects refilled by interrupt?
2054                        goto retry;
2055        }
2056        ac->touched = 1;
2057        return ac_entry(ac)[--ac->avail];
2058}
2059
2060static inline void
2061cache_alloc_debugcheck_before(kmem_cache_t *cachep, int flags)
2062{
2063        might_sleep_if(flags & __GFP_WAIT);
2064#if DEBUG
2065        kmem_flagcheck(cachep, flags);
2066#endif
2067}
2068
#if DEBUG
/*
 * cache_alloc_debugcheck_after - debug-time processing of a freshly
 * allocated object.
 * @cachep: cache the object came from
 * @flags:  allocation flags of the request
 * @objp:   object as stored in the cache (may be NULL)
 * @caller: return address recorded when SLAB_STORE_USER is set
 *
 * Depending on the cache flags this verifies and replaces the poison
 * pattern, records the caller, checks and activates the red zones and,
 * for poisoned caches with a constructor, runs the constructor now.
 *
 * Returns the pointer to hand to the caller (advanced past the debug
 * head), or NULL if @objp was NULL.
 */
static void *
cache_alloc_debugcheck_after(kmem_cache_t *cachep,
			unsigned long flags, void *objp, void *caller)
{
	if (objp == NULL)
		return NULL;

	if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
		if (OFF_SLAB(cachep) && (cachep->objsize % PAGE_SIZE) == 0)
			kernel_map_pages(virt_to_page(objp), cachep->objsize/PAGE_SIZE, 1);
		else
			check_poison_obj(cachep, objp);
#else
		check_poison_obj(cachep, objp);
#endif
		poison_obj(cachep, objp, POISON_INUSE);
	}
	if (cachep->flags & SLAB_STORE_USER)
		*dbg_userword(cachep, objp) = caller;

	if (cachep->flags & SLAB_RED_ZONE) {
		unsigned long *zone1 = dbg_redzone1(cachep, objp);
		unsigned long *zone2 = dbg_redzone2(cachep, objp);

		if (*zone1 != RED_INACTIVE || *zone2 != RED_INACTIVE) {
			slab_error(cachep, "double free, or memory outside"
						" object was overwritten");
			printk(KERN_ERR "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
					objp, *zone1, *zone2);
		}
		/* The object is in use from now on. */
		*zone1 = RED_ACTIVE;
		*zone2 = RED_ACTIVE;
	}

	objp += obj_dbghead(cachep);
	if (cachep->ctor && (cachep->flags & SLAB_POISON)) {
		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;

		if (!(flags & __GFP_WAIT))
			ctor_flags |= SLAB_CTOR_ATOMIC;

		cachep->ctor(objp, cachep, ctor_flags);
	}
	return objp;
}
#else
/* Non-DEBUG builds: the object is passed through untouched. */
#define cache_alloc_debugcheck_after(a, b, objp, d)	(objp)
#endif
2114
2115
2116static inline void * __cache_alloc (kmem_cache_t *cachep, int flags)
2117{
2118        unsigned long save_flags;
2119        void* objp;
2120        struct array_cache *ac;
2121
2122        cache_alloc_debugcheck_before(cachep, flags);
2123
2124        local_irq_save(save_flags);
2125        ac = ac_data(cachep);
2126        if (likely(ac->avail)) {
2127                STATS_INC_ALLOCHIT(cachep);
2128                ac->touched = 1;
2129                objp = ac_entry(ac)[--ac->avail];
2130        } else {
2131                STATS_INC_ALLOCMISS(cachep);
2132                objp = cache_alloc_refill(cachep, flags);
2133        }
2134        local_irq_restore(save_flags);
2135        objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
2136        return objp;
2137}
2138
2139/* 
2140 * NUMA: different approach needed if the spinlock is moved into
2141 * the l3 structure
2142 */
2143
2144static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
2145{
2146        int i;
2147
2148        check_spinlock_acquired(cachep);
2149
2150        /* NUMA: move add into loop */
2151        cachep->lists.free_objects += nr_objects;
2152
2153        for (i = 0; i < nr_objects; i++) {
2154                void *objp = objpp[i];
2155                struct slab *slabp;
2156                unsigned int objnr;
2157
2158                slabp = GET_PAGE_SLAB(virt_to_page(objp));
2159                list_del(&slabp->list);
2160                objnr = (objp - slabp->s_mem) / cachep->objsize;
2161                check_slabp(cachep, slabp);
2162#if DEBUG
2163                if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
2164                        printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
2165                                                cachep->name, objp);
2166                        BUG();
2167                }
2168#endif
2169                slab_bufctl(slabp)[objnr] = slabp->free;
2170                slabp->free = objnr;
2171                STATS_DEC_ACTIVE(cachep);
2172                slabp->inuse--;
2173                check_slabp(cachep, slabp);
2174
2175                /* fixup slab chains */
2176                if (slabp->inuse == 0) {
2177                        if (cachep->lists.free_objects > cachep->free_limit) {
2178                                cachep->lists.free_objects -= cachep->num;
2179                                slab_destroy(cachep, slabp);
2180                        } else {
2181                                list_add(&slabp->list,
2182                                &list3_data_ptr(cachep, objp)->slabs_free);
2183                        }
2184                } else {
2185                        /* Unconditionally move a slab to the end of the
2186                         * partial list on free - maximum time for the
2187                         * other objects to be freed, too.
2188                         */
2189                        list_add_tail(&slabp->list,
2190                                &list3_data_ptr(cachep, objp)->slabs_partial);
2191                }
2192        }
2193}
2194
2195static void cache_flusharray (kmem_cache_t* cachep, struct array_cache *ac)
2196{
2197        int batchcount;
2198
2199        batchcount = ac->batchcount;
2200#if DEBUG
2201        BUG_ON(!batchcount || batchcount > ac->avail);
2202#endif
2203        check_irq_off();
2204        spin_lock(&cachep->spinlock);
2205        if (cachep->lists.shared) {
2206                struct array_cache *shared_array = cachep->lists.shared;
2207                int max = shared_array->limit-shared_array->avail;
2208                if (max) {
2209                        if (batchcount > max)
2210                                batchcount = max;
2211                        memcpy(&ac_entry(shared_array)[shared_array->avail],
2212                                        &ac_entry(ac)[0],
2213                                        sizeof(void*)*batchcount);
2214                        shared_array->avail += batchcount;
2215                        goto free_done;
2216                }
2217        }
2218
2219        free_block(cachep, &ac_entry(ac)[0], batchcount);
2220free_done:
2221#if STATS
2222        {
2223                int i = 0;
2224                struct list_head *p;
2225
2226                p = list3_data(cachep)->slabs_free.next;
2227                while (p != &(list3_data(cachep)->slabs_free)) {
2228                        struct slab *slabp;
2229
2230                        slabp = list_entry(p, struct slab, list);
2231                        BUG_ON(slabp->inuse);
2232
2233                        i++;
2234                        p = p->next;
2235                }
2236                STATS_SET_FREEABLE(cachep, i);
2237        }
2238#endif
2239        spin_unlock(&cachep->spinlock);
2240        ac->avail -= batchcount;
2241        memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
2242                        sizeof(void*)*ac->avail);
2243}
2244
2245/*
2246 * __cache_free
2247 * Release an obj back to its cache. If the obj has a constructed
2248 * state, it must be in this state _before_ it is released.
2249 *
2250 * Called with disabled ints.
2251 */
2252static inline void __cache_free (kmem_cache_t *cachep, void* objp)
2253{
2254        struct array_cache *ac = ac_data(cachep);
2255
2256        check_irq_off();
2257        objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
2258
2259        if (likely(ac->avail < ac->limit)) {
2260                STATS_INC_FREEHIT(cachep);
2261                ac_entry(ac)[ac->avail++] = objp;
2262                return;
2263        } else {
2264                STATS_INC_FREEMISS(cachep);
2265                cache_flusharray(cachep, ac);
2266                ac_entry(ac)[ac->avail++] = objp;
2267        }
2268}
2269
2270/**
2271 * kmem_cache_alloc - Allocate an object
2272 * @cachep: The cache to allocate from.
2273 * @flags: See kmalloc().
2274 *
2275 * Allocate an object from this cache.  The flags are only relevant
2276 * if the cache has no available objects.
2277 */
2278void * kmem_cache_alloc (kmem_cache_t *cachep, int flags)
2279{
2280        return __cache_alloc(cachep, flags);
2281}
2282
2283EXPORT_SYMBOL(kmem_cache_alloc);
2284
2285/**
2286 * kmem_ptr_validate - check if an untrusted pointer might
2287 *      be a slab entry.
2288 * @cachep: the cache we're checking against
2289 * @ptr: pointer to validate
2290 *
2291 * This verifies that the untrusted pointer looks sane:
2292 * it is _not_ a guarantee that the pointer is actually
2293 * part of the slab cache in question, but it at least
2294 * validates that the pointer can be dereferenced and
2295 * looks half-way sane.
2296 *
2297 * Currently only used for dentry validation.
2298 */
2299int fastcall kmem_ptr_validate(kmem_cache_t *cachep, void *ptr)
2300{
2301        unsigned long addr = (unsigned long) ptr;
2302        unsigned long min_addr = PAGE_OFFSET;
2303        unsigned long align_mask = BYTES_PER_WORD-1;
2304        unsigned long size = cachep->objsize;
2305        struct page *page;
2306
2307        if (unlikely(addr < min_addr))
2308                goto out;
2309        if (unlikely(addr > (unsigned long)high_memory - size))
2310                goto out;
2311        if (unlikely(addr & align_mask))
2312                goto out;
2313        if (unlikely(!kern_addr_valid(addr)))
2314                goto out;
2315        if (unlikely(!kern_addr_valid(addr + size - 1)))
2316                goto out;
2317        page = virt_to_page(ptr);
2318        if (unlikely(!PageSlab(page)))
2319                goto out;
2320        if (unlikely(GET_PAGE_CACHE(page) != cachep))
2321                goto out;
2322        return 1;
2323out:
2324        return 0;
2325}
2326
2327/**
2328 * kmem_cache_alloc_node - Allocate an object on the specified node
2329 * @cachep: The cache to allocate from.
2330 * @flags: See kmalloc().
2331 * @nodeid: node number of the target node.
2332 *
2333 * Identical to kmem_cache_alloc, except that this function is slow
2334 * and can sleep. And it will allocate memory on the given node, which
2335 * can improve the performance for cpu bound structures.
2336 */
2337void *kmem_cache_alloc_node(kmem_cache_t *cachep, int nodeid)
2338{
2339        size_t offset;
2340        void *objp;
2341        struct slab *slabp;
2342        kmem_bufctl_t next;
2343
2344        /* The main algorithms are not node aware, thus we have to cheat:
2345         * We bypass all caches and allocate a new slab.
2346         * The following code is a streamlined copy of cache_grow().
2347         */
2348
2349        /* Get colour for the slab, and update the next value. */
2350        spin_lock_irq(&cachep->spinlock);
2351        offset = cachep->colour_next;
2352        cachep->colour_next++;
2353        if (cachep->colour_next >= cachep->colour)
2354                cachep->colour_next = 0;
2355        offset *= cachep->colour_off;
2356        spin_unlock_irq(&cachep->spinlock);
2357
2358        /* Get mem for the objs. */
2359        if (!(objp = kmem_getpages(cachep, GFP_KERNEL, nodeid)))
2360                goto failed;
2361
2362        /* Get slab management. */
2363        if (!(slabp = alloc_slabmgmt(cachep, objp, offset, GFP_KERNEL)))
2364                goto opps1;
2365
2366        set_slab_attr(cachep, slabp, objp);
2367        cache_init_objs(cachep, slabp, SLAB_CTOR_CONSTRUCTOR);
2368
2369        /* The first object is ours: */
2370        objp = slabp->s_mem + slabp->free*cachep->objsize;
2371        slabp->inuse++;
2372        next = slab_bufctl(slabp)[slabp->free];
2373#if DEBUG
2374        slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
2375#endif
2376        slabp->free = next;
2377
2378        /* add the remaining objects into the cache */
2379        spin_lock_irq(&cachep->spinlock);
2380        check_slabp(cachep, slabp);
2381        STATS_INC_GROWN(cachep);
2382        /* Make slab active. */
2383        if (slabp->free == BUFCTL_END) {
2384                list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_full));
2385        } else {
2386                list_add_tail(&slabp->list,
2387                                &(list3_data(cachep)->slabs_partial));
2388                list3_data(cachep)->free_objects += cachep->num-1;
2389        }
2390        spin_unlock_irq(&cachep->spinlock);
2391        objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
2392                                        __builtin_return_address(0));
2393        return objp;
2394opps1:
2395        kmem_freepages(cachep, objp);
2396failed:
2397        return NULL;
2398
2399}
2400EXPORT_SYMBOL(kmem_cache_alloc_node);
2401
2402/**
2403 * kmalloc - allocate memory
2404 * @size: how many bytes of memory are required.
2405 * @flags: the type of memory to allocate.
2406 *
2407 * kmalloc is the normal method of allocating memory
2408 * in the kernel.
2409 *
2410 * The @flags argument may be one of:
2411 *
2412 * %GFP_USER - Allocate memory on behalf of user.  May sleep.
2413 *
2414 * %GFP_KERNEL - Allocate normal kernel ram.  May sleep.
2415 *
2416 * %GFP_ATOMIC - Allocation will not sleep.  Use inside interrupt handlers.
2417 *
2418 * Additionally, the %GFP_DMA flag may be set to indicate the memory
2419 * must be suitable for DMA.  This can mean different things on different
2420 * platforms.  For example, on i386, it means that the memory must come
2421 * from the first 16MB.
2422 */
2423void * __kmalloc (size_t size, int flags)
2424{
2425        struct cache_sizes *csizep = malloc_sizes;
2426
2427        for (; csizep->cs_size; csizep++) {
2428                if (size > csizep->cs_size)
2429                        continue;
2430#if DEBUG
2431                /* This happens if someone tries to call
2432                 * kmem_cache_create(), or kmalloc(), before
2433                 * the generic caches are initialized.
2434                 */
2435                BUG_ON(csizep->cs_cachep == NULL);
2436#endif
2437                return __cache_alloc(flags & GFP_DMA ?
2438                         csizep->cs_dmacachep : csizep->cs_cachep, flags);
2439        }
2440        return NULL;
2441}
2442
2443EXPORT_SYMBOL(__kmalloc);
2444
#ifdef CONFIG_SMP
/**
 * __alloc_percpu - allocate one copy of the object for every possible
 * cpu in the system, zeroing them.
 * Objects should be dereferenced using the per_cpu_ptr macro only.
 *
 * @size: how many bytes of memory are required.
 * @align: the alignment, which can't be greater than SMP_CACHE_BYTES.
 *         (Not referenced by this function's body.)
 *
 * Each copy is allocated with kmem_cache_alloc_node() on the node of
 * its cpu.  Returns the bitwise complement of the bookkeeping pointer,
 * so that dereferences without the wrappers are caught, or NULL if any
 * allocation failed (all copies made so far are released again).
 */
void *__alloc_percpu(size_t size, size_t align)
{
	struct percpu_data *pdata;
	int cpu;

	pdata = kmalloc(sizeof (*pdata), GFP_KERNEL);
	if (pdata == NULL)
		return NULL;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		void *copy;

		if (!cpu_possible(cpu))
			continue;

		copy = kmem_cache_alloc_node(
				kmem_find_general_cachep(size, GFP_KERNEL),
				cpu_to_node(cpu));
		pdata->ptrs[cpu] = copy;
		if (copy == NULL)
			goto unwind_oom;

		memset(copy, 0, size);
	}

	/* Catch derefs w/o wrappers */
	return (void *) (~(unsigned long) pdata);

unwind_oom:
	/* Release the copies made for the cpus before the failing one. */
	for (cpu--; cpu >= 0; cpu--) {
		if (cpu_possible(cpu))
			kfree(pdata->ptrs[cpu]);
	}
	kfree(pdata);
	return NULL;
}

EXPORT_SYMBOL(__alloc_percpu);
#endif
2489
2490/**
2491 * kmem_cache_free - Deallocate an object
2492 * @cachep: The cache the allocation was from.
2493 * @objp: The previously allocated object.
2494 *
2495 * Free an object which was previously allocated from this
2496 * cache.
2497 */
2498void kmem_cache_free (kmem_cache_t *cachep, void *objp)
2499{
2500        unsigned long flags;
2501
2502        local_irq_save(flags);
2503        __cache_free(cachep, objp);
2504        local_irq_restore(flags);
2505}
2506
2507EXPORT_SYMBOL(kmem_cache_free);
2508
/**
 * kzalloc - allocate zero-filled memory.
 * @size: how many bytes of memory are required.
 * @flags: the type of memory to allocate.
 *
 * Same as kmalloc(), except that the first @size bytes of a successful
 * allocation are cleared.  Returns NULL if kmalloc() fails.
 */
void *kzalloc(size_t size, int flags)
{
	void *mem = kmalloc(size, flags);

	if (!mem)
		return NULL;

	memset(mem, 0, size);
	return mem;
}
EXPORT_SYMBOL(kzalloc);
2522
/**
 * kcalloc - allocate zero-filled memory for an array.
 * @n: number of elements.
 * @size: element size.
 * @flags: the type of memory to allocate.
 *
 * Returns NULL, without allocating, when @n is non-zero and @size
 * exceeds INT_MAX / @n; otherwise returns whatever kmalloc() returns
 * for @n * @size bytes, cleared on success.
 */
void *kcalloc(size_t n, size_t size, int flags)
{
	size_t total;
	void *mem;

	/* Reject element counts/sizes whose product would pass INT_MAX. */
	if (n != 0 && size > INT_MAX / n)
		return NULL;

	total = n * size;
	mem = kmalloc(total, flags);
	if (mem)
		memset(mem, 0, total);
	return mem;
}

EXPORT_SYMBOL(kcalloc);
2543
2544/**
2545 * kfree - free previously allocated memory
2546 * @objp: pointer returned by kmalloc.
2547 *
2548 * Don't free memory not originally allocated by kmalloc()
2549 * or you will run into trouble.
2550 */
2551void kfree (const void *objp)
2552{
2553        kmem_cache_t *c;
2554        unsigned long flags;
2555
2556        if (!objp)
2557                return;
2558        local_irq_save(flags);
2559        kfree_debugcheck(objp);
2560        c = GET_PAGE_CACHE(virt_to_page(objp));
2561        __cache_free(c, (void*)objp);
2562        local_irq_restore(flags);
2563}
2564
2565EXPORT_SYMBOL(kfree);
2566
#ifdef CONFIG_SMP
/**
 * free_percpu - free previously allocated percpu memory
 * @objp: pointer returned by alloc_percpu, or NULL.
 *
 * A NULL @objp is ignored, matching kfree().  __alloc_percpu() returns
 * NULL on failure, so error paths can pass its result here unchecked;
 * without the guard a NULL argument would be complemented into an
 * all-ones pointer and dereferenced.
 *
 * Don't free memory not originally allocated by alloc_percpu()
 * The complemented objp is to check for that.
 */
void
free_percpu(const void *objp)
{
	struct percpu_data *p;
	int i;

	if (!objp)
		return;

	/* Undo the complement applied by __alloc_percpu(). */
	p = (struct percpu_data *) (~(unsigned long) objp);

	/* Copies exist for every possible cpu, so free all of them. */
	for (i = 0; i < NR_CPUS; i++) {
		if (!cpu_possible(i))
			continue;
		kfree(p->ptrs[i]);
	}
	kfree(p);
}

EXPORT_SYMBOL(free_percpu);
#endif
2591
2592unsigned int kmem_cache_size(kmem_cache_t *cachep)
2593{
2594        return obj_reallen(cachep);
2595}
2596
2597EXPORT_SYMBOL(kmem_cache_size);
2598
2599struct ccupdate_struct {
2600        kmem_cache_t *cachep;
2601        struct array_cache *new[NR_CPUS];
2602};
2603
2604static void do_ccupdate_local(void *info)
2605{
2606        struct ccupdate_struct *new = (struct ccupdate_struct *)info;
2607        struct array_cache *old;
2608
2609        check_irq_off();
2610        old = ac_data(new->cachep);
2611        
2612        new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
2613        new->new[smp_processor_id()] = old;
2614}
2615
2616
2617static int do_tune_cpucache (kmem_cache_t* cachep, int limit, int batchcount, int shared)
2618{
2619        struct ccupdate_struct new;
2620        struct array_cache *new_shared;
2621        int i;
2622
2623        memset(&new.new,0,sizeof(new.new));
2624        for (i = 0; i < NR_CPUS; i++) {
2625                if (cpu_online(i)) {
2626                        new.new[i] = alloc_arraycache(i, limit, batchcount);
2627                        if (!new.new[i]) {
2628                                for (i--; i >= 0; i--) kfree(new.new[i]);
2629                                return -ENOMEM;
2630                        }
2631                } else {
2632                        new.new[i] = NULL;
2633                }
2634        }
2635        new.cachep = cachep;
2636
2637        smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
2638        
2639        check_irq_on();
2640        spin_lock_irq(&cachep->spinlock);
2641        cachep->batchcount = batchcount;
2642        cachep->limit = limit;
2643        cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
2644        spin_unlock_irq(&cachep->spinlock);
2645
2646        for (i = 0; i < NR_CPUS; i++) {
2647                struct array_cache *ccold = new.new[i];
2648                if (!ccold)
2649                        continue;
2650                spin_lock_irq(&cachep->spinlock);
2651                free_block(cachep, ac_entry(ccold), ccold->avail);
2652                spin_unlock_irq(&cachep->spinlock);
2653                kfree(ccold);
2654        }
2655        new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
2656        if (new_shared) {
2657                struct array_cache *old;
2658
2659                spin_lock_irq(&cachep->spinlock);
2660                old = cachep->lists.shared;
2661                cachep->lists.shared = new_shared;
2662                if (old)
2663                        free_block(cachep, ac_entry(old), old->avail);
2664                spin_unlock_irq(&cachep->spinlock);
2665                kfree(old);
2666        }
2667
2668        return 0;
2669}
2670
2671
2672static void enable_cpucache (kmem_cache_t *cachep)
2673{
2674        int err;
2675        int limit, shared;
2676
2677        /* The head array serves three purposes:
2678         * - create a LIFO ordering, i.e. return objects that are cache-warm
2679         * - reduce the number of spinlock operations.
2680         * - reduce the number of linked list operations on the slab and 
2681         *   bufctl chains: array operations are cheaper.
2682         * The numbers are guessed, we should auto-tune as described by
2683         * Bonwick.
2684         */
2685        if (cachep->objsize > 131072)
2686                limit = 1;
2687        else if (cachep->objsize > PAGE_SIZE)
2688                limit = 8;
2689        else if (cachep->objsize > 1024)
2690                limit = 24;
2691        else if (cachep->objsize > 256)
2692                limit = 54;
2693        else
2694                limit = 120;
2695
2696        /* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
2697         * allocation behaviour: Most allocs on one cpu, most free operations
2698         * on another cpu. For these cases, an efficient object passing between
2699         * cpus is necessary. This is provided by a shared array. The array
2700         * replaces Bonwick's magazine layer.
2701         * On uniprocessor, it's functionally equivalent (but less efficient)
2702         * to a larger limit. Thus disabled by default.
2703         */
2704        shared = 0;
2705#ifdef CONFIG_SMP
2706        if (cachep->objsize <= PAGE_SIZE)
2707                shared = 8;
2708#endif
2709
2710#if DEBUG
2711        /* With debugging enabled, large batchcount lead to excessively
2712         * long periods with disabled local interrupts. Limit the 
2713         * batchcount
2714         */
2715        if (limit > 32)
2716                limit = 32;
2717#endif
2718        err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
2719        if (err)
2720                printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
2721                                        cachep->name, -err);
2722}
2723
2724static void drain_array_locked(kmem_cache_t *cachep,
2725                                struct array_cache *ac, int force)
2726{
2727        int tofree;
2728
2729        check_spinlock_acquired(cachep);
2730        if (ac->touched && !force) {
2731                ac->touched = 0;
2732        } else if (ac->avail) {
2733                tofree = force ? ac->avail : (ac->limit+4)/5;
2734                if (tofree > ac->avail) {
2735                        tofree = (ac->avail+1)/2;
2736                }
2737                free_block(cachep, ac_entry(ac), tofree);
2738                ac->avail -= tofree;
2739                memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
2740                                        sizeof(void*)*ac->avail);
2741        }
2742}
2743
2744/**
2745 * cache_reap - Reclaim memory from caches.
2746 *
2747 * Called from workqueue/eventd every few seconds.
2748 * Purpose:
2749 * - clear the per-cpu caches for this CPU.
2750 * - return freeable pages to the main free memory pool.
2751 *
2752 * If we cannot acquire the cache chain semaphore then just give up - we'll
2753 * try again on the next iteration.
2754 */
2755static void cache_reap(void *unused)
2756{
2757        struct list_head *walk;
2758
2759        if (down_trylock(&cache_chain_sem)) {
2760                /* Give up. Setup the next iteration. */
2761                schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());