RHEL4/mm/oom_kill.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/oom_kill.c
   3 * 
   4 *  Copyright (C)  1998,2000  Rik van Riel
   5 *      Thanks go out to Claus Fischer for some serious inspiration and
   6 *      for goading me into coding this file...
   7 *
   8 *  The routines in this file are used to kill a process when
   9 *  we're seriously out of memory. This gets called from kswapd()
  10 *  in linux/mm/vmscan.c when we really run out of memory.
  11 *
  12 *  Since we won't call these routines often (on a well-configured
  13 *  machine) this file will double as a 'coding guide' and a signpost
  14 *  for newbie kernel hackers. It features several pointers to major
  15 *  kernel subsystems and hints as to where to find out what things do.
  16 */
  17
  18#include <linux/mm.h>
  19#include <linux/sched.h>
  20#include <linux/swap.h>
  21#include <linux/timex.h>
  22#include <linux/jiffies.h>
  23#include <linux/module.h>
  24#include <linux/notifier.h>
  25
/*
 * Master switch for the OOM killer. When zero, oom_kill() only logs that
 * it would have killed something (the message refers to this knob as
 * /proc/sys/vm/oom-kill) and returns without touching any task.
 */
int oom_kill_enabled = 1;
/*
 * When non-zero (and the killer is enabled), oom_kill() panics the
 * machine instead of selecting a task to kill.
 */
int sysctl_panic_on_oom;
  28
  29/* #define DEBUG */
  30
  31/**
  32 * oom_badness - calculate a numeric value for how bad this task has been
  33 * @p: task struct of which task we should calculate
  34 * @p: current uptime in seconds
  35 *
  36 * The formula used is relatively simple and documented inline in the
  37 * function. The main rationale is that we want to select a good task
  38 * to kill when we run out of memory.
  39 *
  40 * Good in this context means that:
  41 * 1) we lose the minimum amount of work done
  42 * 2) we recover a large amount of memory
  43 * 3) we don't kill anything innocent of eating tons of memory
  44 * 4) we want to kill the minimum amount of processes (one)
  45 * 5) we try to kill the process the user expects us to kill, this
  46 *    algorithm has been meticulously tuned to meet the principle
  47 *    of least surprise ... (be careful when you change it)
  48 */
  49
  50static unsigned long badness(struct task_struct *p, unsigned long uptime)
  51{
  52        unsigned long points, cpu_time, run_time, s;
  53
  54        if (!p->mm)
  55                return 0;
  56
  57        if (p->flags & PF_MEMDIE)
  58                return 0;
  59        /*
  60         * The memory size of the process is the basis for the badness.
  61         */
  62        points = p->mm->total_vm;
  63
  64        /*
  65         * CPU time is in tens of seconds and run time is in thousands
  66         * of seconds. There is no particular reason for this other than
  67         * that it turned out to work very well in practice.
  68         */
  69        cpu_time = (p->utime + p->stime) >> (SHIFT_HZ + 3);
  70
  71        if (uptime >= p->start_time.tv_sec)
  72                run_time = (uptime - p->start_time.tv_sec) >> 10;
  73        else
  74                run_time = 0;
  75
  76        s = int_sqrt(cpu_time);
  77        if (s)
  78                points /= s;
  79        s = int_sqrt(int_sqrt(run_time));
  80        if (s)
  81                points /= s;
  82
  83        /*
  84         * Niced processes are most likely less important, so double
  85         * their badness points.
  86         */
  87        if (task_nice(p) > 0)
  88                points *= 2;
  89
  90        /*
  91         * Superuser processes are usually more important, so we make it
  92         * less likely that we kill those.
  93         */
  94        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
  95                                p->uid == 0 || p->euid == 0)
  96                points /= 4;
  97
  98        /*
  99         * We don't want to kill a process with direct hardware access.
 100         * Not only could that mess up the hardware, but usually users
 101         * tend to only have this flag set on applications they think
 102         * of as important.
 103         */
 104        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
 105                points /= 4;
 106#ifdef DEBUG
 107        printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
 108        p->pid, p->comm, points);
 109#endif
 110        return points;
 111}
 112
 113/*
 114 * Simple selection loop. We chose the process with the highest
 115 * number of 'points'. We expect the caller will lock the tasklist.
 116 *
 117 * (not docbooked, we don't want this one cluttering up the manual)
 118 */
 119static struct task_struct * select_bad_process(void)
 120{
 121        unsigned long maxpoints = 0;
 122        struct task_struct *g, *p;
 123        struct task_struct *chosen = NULL;
 124        struct timespec uptime;
 125
 126        do_posix_clock_monotonic_gettime(&uptime);
 127        do_each_thread(g, p)
 128                if (p->pid) {
 129                        unsigned long points = badness(p, uptime.tv_sec);
 130                        if (points > maxpoints) {
 131                                chosen = p;
 132                                maxpoints = points;
 133                        }
 134                        if (p->flags & PF_SWAPOFF)
 135                                return p;
 136                }
 137        while_each_thread(g, p);
 138        return chosen;
 139}
 140
 141/**
 142 * We must be careful though to never send SIGKILL a process with
 143 * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
 144 * we select a process with CAP_SYS_RAW_IO set).
 145 */
 146static void __oom_kill_task(task_t *p)
 147{
 148        task_lock(p);
 149        if (!p->mm || p->mm == &init_mm) {
 150                WARN_ON(1);
 151                printk(KERN_WARNING "tried to kill an mm-less task!\n");
 152                task_unlock(p);
 153                return;
 154        }
 155        task_unlock(p);
 156        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
 157
 158        /*
 159         * We give our sacrificial lamb high priority and access to
 160         * all the memory it needs. That way it should be able to
 161         * exit() and clear out its resources quickly...
 162         */
 163        p->time_slice = HZ;
 164        p->flags |= PF_MEMALLOC | PF_MEMDIE;
 165
 166        /* This process has hardware access, be more careful. */
 167        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) {
 168                force_sig(SIGTERM, p);
 169        } else {
 170                force_sig(SIGKILL, p);
 171        }
 172}
 173
 174static struct mm_struct *oom_kill_task(task_t *p)
 175{
 176        struct mm_struct *mm = get_task_mm(p);
 177        if (!mm || mm == &init_mm)
 178                return NULL;
 179        __oom_kill_task(p);
 180        return mm;
 181}
 182
 183
 184/**
 185 * oom_kill - kill the "best" process when we run out of memory
 186 *
 187 * If we run out of memory, we have the choice between either
 188 * killing a random task (bad), letting the system crash (worse)
 189 * OR try to be smart about which process to kill. Note that we
 190 * don't have to be perfect here, we just have to be good.
 191 */
 192static void oom_kill(void)
 193{
 194        struct mm_struct *mm;
 195        struct task_struct *g, *p, *q;
 196
 197        /* print the memory stats whenever we OOM kill */
 198        show_mem();
 199
 200        if (!oom_kill_enabled) {
 201                printk(KERN_INFO "Would have oom-killed but /proc/sys/vm/oom-kill is disabled\n");
 202                return;
 203        }
 204
 205        if (sysctl_panic_on_oom)
 206                panic("out of memory. panic_on_oom is selected\n");
 207
 208        read_lock(&tasklist_lock);
 209retry:
 210        p = select_bad_process();
 211
 212        /* Found nothing?!?! Either we hang forever, or we panic. */
 213        if (!p) {
 214                show_free_areas();
 215                panic("Out of memory and no killable processes...\n");
 216        }
 217
 218        mm = oom_kill_task(p);
 219        if (!mm)
 220                goto retry;
 221        /*
 222         * kill all processes that share the ->mm (i.e. all threads),
 223         * but are in a different thread group
 224         */
 225        do_each_thread(g, q)
 226                if (q->mm == mm && q->tgid != p->tgid)
 227                        __oom_kill_task(q);
 228        while_each_thread(g, q);
 229        if (!p->mm)
 230                printk(KERN_INFO "Fixed up OOM kill of mm-less task\n");
 231        read_unlock(&tasklist_lock);
 232        mmput(mm);
 233
 234        /*
 235         * Make kswapd go out of the way, so "p" has a good chance of
 236         * killing itself before someone else gets the chance to ask
 237         * for more memory.
 238         */
 239        yield();
 240        return;
 241}
 242
/*
 * Head of the notifier chain that out_of_memory() calls before it
 * considers killing anything. Entries are added and removed through
 * register_oom_notifier() and unregister_oom_notifier().
 */
static struct notifier_block *oom_notify_list;
 244
 245int register_oom_notifier(struct notifier_block *nb)
 246{
 247        return notifier_chain_register(&oom_notify_list, nb);
 248}
 249EXPORT_SYMBOL_GPL(register_oom_notifier);
 250
 251int unregister_oom_notifier(struct notifier_block *nb)
 252{
 253        return notifier_chain_unregister(&oom_notify_list, nb);
 254}
 255EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 256
 257/**
 258 * out_of_memory - is the system out of memory?
 259 */
 260void out_of_memory(int gfp_mask)
 261{
 262        /*
 263         * oom_lock protects out_of_memory()'s static variables.
 264         * It's a global lock; this is not performance-critical.
 265         */
 266        static spinlock_t oom_lock = SPIN_LOCK_UNLOCKED;
 267        static unsigned long first, last, count, lastkill;
 268        unsigned long now, since;
 269        unsigned long freed = 0;
 270
 271        notifier_call_chain(&oom_notify_list, 0, &freed);
 272        if (freed > 0)
 273                /* Got some memory back in the last second. */
 274                return;
 275
 276        spin_lock(&oom_lock);
 277        now = jiffies;
 278        since = now - last;
 279        last = now;
 280
 281        /*
 282         * If it's been a long time since last failure,
 283         * we're not oom.
 284         */
 285        if (since > 5*HZ)
 286                goto reset;
 287
 288        /*
 289         * If we haven't tried for at least one second,
 290         * we're not really oom.
 291         */
 292        since = now - first;
 293        if (since < HZ)
 294                goto out_unlock;
 295
 296        /*
 297         * If we have gotten only a few failures,
 298         * we're not really oom. 
 299         */
 300        if (++count < 10)
 301                goto out_unlock;
 302
 303        /*
 304         * If we just killed a process, wait a while
 305         * to give that task a chance to exit. This
 306         * avoids killing multiple processes needlessly.
 307         */
 308        since = now - lastkill;
 309        if (since < HZ*5)
 310                goto out_unlock;
 311
 312        /*
 313         * Ok, really out of memory. Kill something.
 314         */
 315        lastkill = now;
 316
 317        printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
 318
 319        /* oom_kill() sleeps */
 320        spin_unlock(&oom_lock);
 321        oom_kill();
 322        spin_lock(&oom_lock);
 323
 324reset:
 325        /*
 326         * We dropped the lock above, so check to be sure the variable
 327         * first only ever increases to prevent false OOM's.
 328         */
 329        if (time_after(now, first))
 330                first = now;
 331        count = 0;
 332
 333out_unlock:
 334        spin_unlock(&oom_lock);
 335}
 336