RHEL4/kernel/timer.c
<<
>>
Prefs
   1/*
   2 *  linux/kernel/timer.c
   3 *
   4 *  Kernel internal timers, kernel timekeeping, basic process system calls
   5 *
   6 *  Copyright (C) 1991, 1992  Linus Torvalds
   7 *
   8 *  1997-01-28  Modified by Finn Arne Gangstad to make timers scale better.
   9 *
  10 *  1997-09-10  Updated NTP code according to technical memorandum Jan '96
  11 *              "A Kernel Model for Precision Timekeeping" by Dave Mills
  12 *  1998-12-24  Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
  13 *              serialize accesses to xtime/lost_ticks).
  14 *                              Copyright (C) 1998  Andrea Arcangeli
  15 *  1999-03-10  Improved NTP compatibility by Ulrich Windl
  16 *  2002-05-31  Move sys_sysinfo here and make its locking sane, Robert Love
  17 *  2000-10-05  Implemented scalable SMP per-CPU timer handling.
  18 *                              Copyright (C) 2000, 2001, 2002  Ingo Molnar
  19 *              Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
  20 */
  21
  22#include <linux/kernel_stat.h>
  23#include <linux/module.h>
  24#include <linux/interrupt.h>
  25#include <linux/percpu.h>
  26#include <linux/init.h>
  27#include <linux/mm.h>
  28#include <linux/swap.h>
  29#include <linux/notifier.h>
  30#include <linux/thread_info.h>
  31#include <linux/time.h>
  32#include <linux/jiffies.h>
  33#include <linux/cpu.h>
  34#include <linux/delay.h>
  35#include <linux/diskdump.h>
  36
  37#include <asm/uaccess.h>
  38#include <asm/unistd.h>
  39#include <asm/div64.h>
  40#include <asm/timex.h>
  41#include <asm/io.h>
  42
  43#ifdef CONFIG_TIME_INTERPOLATION
  44static void time_interpolator_update(long delta_nsec);
  45#else
  46#define time_interpolator_update(x)
  47#endif
  48
  49/*
  50 * per-CPU timer vector definitions:
  51 */
  52#define TVN_BITS 6
  53#define TVR_BITS 8
  54#define TVN_SIZE (1 << TVN_BITS)
  55#define TVR_SIZE (1 << TVR_BITS)
  56#define TVN_MASK (TVN_SIZE - 1)
  57#define TVR_MASK (TVR_SIZE - 1)
  58
  59typedef struct tvec_s {
  60        struct list_head vec[TVN_SIZE];
  61} tvec_t;
  62
  63typedef struct tvec_root_s {
  64        struct list_head vec[TVR_SIZE];
  65} tvec_root_t;
  66
/*
 * Per-CPU timer base: a root vector (tv1) and four outer vectors
 * (tv2..tv5) holding the pending timers queued on that CPU.
 */
struct tvec_t_base_s {
        /* Held by the functions in this file while they touch the vectors. */
        spinlock_t lock;
        /* Next jiffy whose tv1 bucket __run_timers() will process. */
        unsigned long timer_jiffies;
        /* Written through set_running_timer() (SMP builds only). */
        struct timer_list *running_timer;
        tvec_root_t tv1;
        tvec_t tv2;
        tvec_t tv3;
        tvec_t tv4;
        tvec_t tv5;
} ____cacheline_aligned_in_smp;

typedef struct tvec_t_base_s tvec_base_t;
  79
  80static inline void set_running_timer(tvec_base_t *base,
  81                                        struct timer_list *timer)
  82{
  83#ifdef CONFIG_SMP
  84        base->running_timer = timer;
  85#endif
  86}
  87
/*
 * The per-CPU timer bases.
 *
 * Fake initialization: the initializer only names a value for the first
 * member (the spinlock).
 */
static DEFINE_PER_CPU(tvec_base_t, tvec_bases) = { SPIN_LOCK_UNLOCKED };
  90
  91static void check_timer_failed(struct timer_list *timer)
  92{
  93        static int whine_count;
  94        if (whine_count < 16) {
  95                whine_count++;
  96                printk("Uninitialised timer!\n");
  97                printk("This is just a warning.  Your computer is OK\n");
  98                printk("function=0x%p, data=0x%lx\n",
  99                        timer->function, timer->data);
 100                dump_stack();
 101        }
 102        /*
 103         * Now fix it up
 104         */
 105        spin_lock_init(&timer->lock);
 106        timer->magic = TIMER_MAGIC;
 107}
 108
 109static inline void check_timer(struct timer_list *timer)
 110{
 111        if (timer->magic != TIMER_MAGIC)
 112                check_timer_failed(timer);
 113}
 114
 115
 116static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
 117{
 118        unsigned long expires = timer->expires;
 119        unsigned long idx = expires - base->timer_jiffies;
 120        struct list_head *vec;
 121
 122        if (idx < TVR_SIZE) {
 123                int i = expires & TVR_MASK;
 124                vec = base->tv1.vec + i;
 125        } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
 126                int i = (expires >> TVR_BITS) & TVN_MASK;
 127                vec = base->tv2.vec + i;
 128        } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
 129                int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
 130                vec = base->tv3.vec + i;
 131        } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
 132                int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
 133                vec = base->tv4.vec + i;
 134        } else if ((signed long) idx < 0) {
 135                /*
 136                 * Can happen if you add a timer with expires == jiffies,
 137                 * or you set a timer to go off in the past
 138                 */
 139                vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK);
 140        } else {
 141                int i;
 142                /* If the timeout is larger than 0xffffffff on 64-bit
 143                 * architectures then we use the maximum timeout:
 144                 */
 145                if (idx > 0xffffffffUL) {
 146                        idx = 0xffffffffUL;
 147                        expires = idx + base->timer_jiffies;
 148                }
 149                i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 150                vec = base->tv5.vec + i;
 151        }
 152        /*
 153         * Timers are FIFO:
 154         */
 155        list_add_tail(&timer->entry, vec);
 156}
 157
 158int __mod_timer(struct timer_list *timer, unsigned long expires)
 159{
 160        tvec_base_t *old_base, *new_base;
 161        unsigned long flags;
 162        int ret = 0;
 163
 164        BUG_ON(!timer->function);
 165
 166        check_timer(timer);
 167
 168        spin_lock_irqsave(&timer->lock, flags);
 169        new_base = &__get_cpu_var(tvec_bases);
 170repeat:
 171        old_base = timer->base;
 172
 173        /*
 174         * Prevent deadlocks via ordering by old_base < new_base.
 175         */
 176        if (old_base && (new_base != old_base)) {
 177                if (old_base < new_base) {
 178                        spin_lock(&new_base->lock);
 179                        spin_lock(&old_base->lock);
 180                } else {
 181                        spin_lock(&old_base->lock);
 182                        spin_lock(&new_base->lock);
 183                }
 184                /*
 185                 * The timer base might have been cancelled while we were
 186                 * trying to take the lock(s):
 187                 */
 188                if (timer->base != old_base) {
 189                        spin_unlock(&new_base->lock);
 190                        spin_unlock(&old_base->lock);
 191                        goto repeat;
 192                }
 193        } else {
 194                spin_lock(&new_base->lock);
 195                if (timer->base != old_base) {
 196                        spin_unlock(&new_base->lock);
 197                        goto repeat;
 198                }
 199        }
 200
 201        /*
 202         * Delete the previous timeout (if there was any), and install
 203         * the new one:
 204         */
 205        if (old_base) {
 206                list_del(&timer->entry);
 207                ret = 1;
 208        }
 209        timer->expires = expires;
 210        internal_add_timer(new_base, timer);
 211        timer->base = new_base;
 212
 213        if (old_base && (new_base != old_base))
 214                spin_unlock(&old_base->lock);
 215        spin_unlock(&new_base->lock);
 216        spin_unlock_irqrestore(&timer->lock, flags);
 217
 218        return ret;
 219}
 220
 221EXPORT_SYMBOL(__mod_timer);
 222
 223/***
 224 * add_timer_on - start a timer on a particular CPU
 225 * @timer: the timer to be added
 226 * @cpu: the CPU to start it on
 227 *
 228 * This is not very scalable on SMP. Double adds are not possible.
 229 */
 230void add_timer_on(struct timer_list *timer, int cpu)
 231{
 232        tvec_base_t *base = &per_cpu(tvec_bases, cpu);
 233        unsigned long flags;
 234  
 235        BUG_ON(timer_pending(timer) || !timer->function);
 236
 237        check_timer(timer);
 238
 239        spin_lock_irqsave(&base->lock, flags);
 240        internal_add_timer(base, timer);
 241        timer->base = base;
 242        spin_unlock_irqrestore(&base->lock, flags);
 243}
 244
 245EXPORT_SYMBOL_GPL(add_timer_on);
 246
 247/***
 248 * mod_timer - modify a timer's timeout
 249 * @timer: the timer to be modified
 250 *
 251 * mod_timer is a more efficient way to update the expire field of an
 252 * active timer (if the timer is inactive it will be activated)
 253 *
 254 * mod_timer(timer, expires) is equivalent to:
 255 *
 256 *     del_timer(timer); timer->expires = expires; add_timer(timer);
 257 *
 258 * Note that if there are multiple unserialized concurrent users of the
 259 * same timer, then mod_timer() is the only safe way to modify the timeout,
 260 * since add_timer() cannot modify an already running timer.
 261 *
 262 * The function returns whether it has modified a pending timer or not.
 263 * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
 264 * active timer returns 1.)
 265 */
 266int mod_timer(struct timer_list *timer, unsigned long expires)
 267{
 268        BUG_ON(!timer->function);
 269
 270        check_timer(timer);
 271
 272        /*
 273         * This is a common optimization triggered by the
 274         * networking code - if the timer is re-modified
 275         * to be the same thing then just return:
 276         */
 277        if (timer->expires == expires && timer_pending(timer))
 278                return 1;
 279
 280        return __mod_timer(timer, expires);
 281}
 282
 283EXPORT_SYMBOL(mod_timer);
 284
 285/***
 286 * del_timer - deactive a timer.
 287 * @timer: the timer to be deactivated
 288 *
 289 * del_timer() deactivates a timer - this works on both active and inactive
 290 * timers.
 291 *
 292 * The function returns whether it has deactivated a pending timer or not.
 293 * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
 294 * active timer returns 1.)
 295 */
 296int del_timer(struct timer_list *timer)
 297{
 298        unsigned long flags;
 299        tvec_base_t *base;
 300
 301        check_timer(timer);
 302
 303repeat:
 304        base = timer->base;
 305        if (!base)
 306                return 0;
 307        spin_lock_irqsave(&base->lock, flags);
 308        if (base != timer->base) {
 309                spin_unlock_irqrestore(&base->lock, flags);
 310                goto repeat;
 311        }
 312        list_del(&timer->entry);
 313        smp_wmb(); /* the list del must have taken effect before timer->base
 314                    * change is visible to other CPUs, or a concurrent mod_timer
 315                    * would cause a race with list_add
 316                    */
 317        timer->base = NULL;
 318        spin_unlock_irqrestore(&base->lock, flags);
 319
 320        return 1;
 321}
 322
 323EXPORT_SYMBOL(del_timer);
 324
 325#ifdef CONFIG_SMP
 326/***
 327 * del_timer_sync - deactivate a timer and wait for the handler to finish.
 328 * @timer: the timer to be deactivated
 329 *
 330 * This function only differs from del_timer() on SMP: besides deactivating
 331 * the timer it also makes sure the handler has finished executing on other
 332 * CPUs.
 333 *
 334 * Synchronization rules: callers must prevent restarting of the timer,
 335 * otherwise this function is meaningless. It must not be called from
 336 * interrupt contexts. The caller must not hold locks which would prevent
 337 * completion of the timer's handler.  Upon exit the timer is not queued and
 338 * the handler is not running on any CPU.
 339 *
 340 * The function returns whether it has deactivated a pending timer or not.
 341 *
 342 * del_timer_sync() is slow and complicated because it copes with timer
 343 * handlers which re-arm the timer (periodic timers).  If the timer handler
 344 * is known to not do this (a single shot timer) then use
 345 * del_singleshot_timer_sync() instead.
 346 */
 347int del_timer_sync(struct timer_list *timer)
 348{
 349        tvec_base_t *base;
 350        int i, ret = 0;
 351
 352        check_timer(timer);
 353
 354del_again:
 355        ret += del_timer(timer);
 356
 357        for_each_online_cpu(i) {
 358                base = &per_cpu(tvec_bases, i);
 359                if (base->running_timer == timer) {
 360                        while (base->running_timer == timer) {
 361                                cpu_relax();
 362                                preempt_check_resched();
 363                        }
 364                        break;
 365                }
 366        }
 367        smp_rmb();
 368        if (timer_pending(timer))
 369                goto del_again;
 370
 371        return ret;
 372}
 373EXPORT_SYMBOL(del_timer_sync);
 374
 375/***
 376 * del_singleshot_timer_sync - deactivate a non-recursive timer
 377 * @timer: the timer to be deactivated
 378 *
 379 * This function is an optimization of del_timer_sync for the case where the
 380 * caller can guarantee the timer does not reschedule itself in its timer
 381 * function.
 382 *
 383 * Synchronization rules: callers must prevent restarting of the timer,
 384 * otherwise this function is meaningless. It must not be called from
 385 * interrupt contexts. The caller must not hold locks which wold prevent
 386 * completion of the timer's handler.  Upon exit the timer is not queued and
 387 * the handler is not running on any CPU.
 388 *
 389 * The function returns whether it has deactivated a pending timer or not.
 390 */
 391int del_singleshot_timer_sync(struct timer_list *timer)
 392{
 393        int ret = del_timer(timer);
 394
 395        if (!ret) {
 396                ret = del_timer_sync(timer);
 397                BUG_ON(ret);
 398        }
 399
 400        return ret;
 401}
 402EXPORT_SYMBOL(del_singleshot_timer_sync);
 403#endif
 404
 405static int cascade(tvec_base_t *base, tvec_t *tv, int index)
 406{
 407        /* cascade all the timers from tv up one level */
 408        struct list_head *head, *curr;
 409
 410        head = tv->vec + index;
 411        curr = head->next;
 412        /*
 413         * We are removing _all_ timers from the list, so we don't  have to
 414         * detach them individually, just clear the list afterwards.
 415         */
 416        while (curr != head) {
 417                struct timer_list *tmp;
 418
 419                tmp = list_entry(curr, struct timer_list, entry);
 420                BUG_ON(tmp->base != base);
 421                curr = curr->next;
 422                internal_add_timer(base, tmp);
 423        }
 424        INIT_LIST_HEAD(head);
 425
 426        return index;
 427}
 428
 429/***
 430 * __run_timers - run all expired timers (if any) on this CPU.
 431 * @base: the timer vector to be processed.
 432 *
 433 * This function cascades all vectors and executes all expired timer
 434 * vectors.
 435 */
 436#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
 437
 438static inline void __run_timers(tvec_base_t *base)
 439{
 440        struct timer_list *timer;
 441        unsigned long flags;
 442
 443        spin_lock_irqsave(&base->lock, flags);
 444        while (time_after_eq(jiffies, base->timer_jiffies)) {
 445                struct list_head work_list = LIST_HEAD_INIT(work_list);
 446                struct list_head *head = &work_list;
 447                int index = base->timer_jiffies & TVR_MASK;
 448 
 449                /*
 450                 * Cascade timers:
 451                 */
 452                if (!index &&
 453                        (!cascade(base, &base->tv2, INDEX(0))) &&
 454                                (!cascade(base, &base->tv3, INDEX(1))) &&
 455                                        !cascade(base, &base->tv4, INDEX(2)))
 456                        cascade(base, &base->tv5, INDEX(3));
 457                ++base->timer_jiffies; 
 458                list_splice_init(base->tv1.vec + index, &work_list);
 459repeat:
 460                if (!list_empty(head)) {
 461                        void (*fn)(unsigned long);
 462                        unsigned long data;
 463
 464                        timer = list_entry(head->next,struct timer_list,entry);
 465                        fn = timer->function;
 466                        data = timer->data;
 467
 468                        list_del(&timer->entry);
 469                        set_running_timer(base, timer);
 470                        smp_wmb(); /* the list del must have taken effect before timer->base
 471                                    * change is visible to other CPUs, or a concurrent mod_timer
 472                                    * would cause a race with list_add
 473                                    */
 474                        timer->base = NULL;
 475                        spin_unlock_irqrestore(&base->lock, flags);
 476                        fn(data);
 477                        spin_lock_irq(&base->lock);
 478                        goto repeat;
 479                }
 480        }
 481        set_running_timer(base, NULL);
 482        spin_unlock_irqrestore(&base->lock, flags);
 483}
 484
 485#ifdef CONFIG_NO_IDLE_HZ
 486/*
 487 * Find out when the next timer event is due to happen. This
 488 * is used on S/390 to stop all activity when a cpus is idle.
 489 * This functions needs to be called disabled.
 490 */
 491unsigned long next_timer_interrupt(void)
 492{
 493        tvec_base_t *base;
 494        struct list_head *list;
 495        struct timer_list *nte;
 496        unsigned long expires;
 497        tvec_t *varray[4];
 498        int i, j;
 499
 500        base = &__get_cpu_var(tvec_bases);
 501        spin_lock(&base->lock);
 502        expires = base->timer_jiffies + (LONG_MAX >> 1);
 503        list = 0;
 504
 505        /* Look for timer events in tv1. */
 506        j = base->timer_jiffies & TVR_MASK;
 507        do {
 508                list_for_each_entry(nte, base->tv1.vec + j, entry) {
 509                        expires = nte->expires;
 510                        if (j < (base->timer_jiffies & TVR_MASK))
 511                                list = base->tv2.vec + (INDEX(0));
 512                        goto found;
 513                }
 514                j = (j + 1) & TVR_MASK;
 515        } while (j != (base->timer_jiffies & TVR_MASK));
 516
 517        /* Check tv2-tv5. */
 518        varray[0] = &base->tv2;
 519        varray[1] = &base->tv3;
 520        varray[2] = &base->tv4;
 521        varray[3] = &base->tv5;
 522        for (i = 0; i < 4; i++) {
 523                j = INDEX(i);
 524                do {
 525                        if (list_empty(varray[i]->vec + j)) {
 526                                j = (j + 1) & TVN_MASK;
 527                                continue;
 528                        }
 529                        list_for_each_entry(nte, varray[i]->vec + j, entry)
 530                                if (time_before(nte->expires, expires))
 531                                        expires = nte->expires;
 532                        if (j < (INDEX(i)) && i < 3)
 533                                list = varray[i + 1]->vec + (INDEX(i + 1));
 534                        goto found;
 535                } while (j != (INDEX(i)));
 536        }
 537found:
 538        if (list) {
 539                /*
 540                 * The search wrapped. We need to look at the next list
 541                 * from next tv element that would cascade into tv element
 542                 * where we found the timer element.
 543                 */
 544                list_for_each_entry(nte, list, entry) {
 545                        if (time_before(nte->expires, expires))
 546                                expires = nte->expires;
 547                }
 548        }
 549        spin_unlock(&base->lock);
 550        return expires;
 551}
 552#endif
 553
 554/******************************************************************/
 555
/*
 * Timekeeping variables
 */
unsigned long tick_usec = TICK_USEC;            /* USER_HZ period (usec) */
unsigned long tick_nsec = TICK_NSEC;            /* ACTHZ period (nsec) */

/*
 * The current time.
 * wall_to_monotonic is what we need to add to xtime (or xtime corrected
 * for sub-jiffy times) to get to monotonic time.  Monotonic is pegged
 * at zero at system boot time, so wall_to_monotonic will be negative;
 * however, we will ALWAYS keep the tv_nsec part positive so we can use
 * the usual normalization.
 */
struct timespec xtime __attribute__ ((aligned (16)));
struct timespec wall_to_monotonic __attribute__ ((aligned (16)));

EXPORT_SYMBOL(xtime);

/*
 * Bound on the adjtime step applied per tick (see
 * update_wall_time_one_tick()).  Don't completely fail for HZ > 500:
 * when 500/HZ evaluates to 0 the value falls back to 1.
 */
int tickadj = 500/HZ ? : 1;             /* microsecs */
 577
 578
/*
 * phase-lock loop variables
 */
/* TIME_ERROR prevents overwriting the CMOS clock */
int time_state = TIME_OK;               /* clock synchronization status */
int time_status = STA_UNSYNC;           /* clock status bits            */
long time_offset;                       /* time adjustment (us)         */
long time_constant = 2;                 /* pll time constant            */
long time_tolerance = MAXFREQ;          /* frequency tolerance (ppm)    */
long time_precision = 1;                /* clock precision (us)         */
long time_maxerror = NTP_PHASE_LIMIT;   /* maximum error (us)           */
long time_esterror = NTP_PHASE_LIMIT;   /* estimated error (us)         */
long time_phase;                        /* phase offset (scaled us)     */
/*
 * Initialised from the signed remainder of NSEC_PER_SEC / HZ (rounded to
 * the nearest multiple of HZ), scaled by SHIFT_USEC.
 */
long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
                                        /* frequency offset (scaled ppm)*/
long time_adj;                          /* tick adjust (scaled 1 / HZ)  */
long time_reftime;                      /* time at last adjustment (s)  */
/*
 * Remaining adjtime correction; update_wall_time_one_tick() consumes at
 * most tickadj of it per tick.
 */
long time_adjust;
/* Copied into time_adjust (and cleared) at the end of a tick if non-zero. */
long time_next_adjust;

/* Set to TIME_INS / TIME_DEL by second_overflow() when it applies a leap second. */
unsigned long leap_second = TIME_OK;
 600/*
 601 * this routine handles the overflow of the microsecond field
 602 *
 603 * The tricky bits of code to handle the accurate clock support
 604 * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
 605 * They were originally developed for SUN and DEC kernels.
 606 * All the kudos should go to Dave for this stuff.
 607 *
 608 */
 609static void second_overflow(void)
 610{
 611    long ltemp;
 612
 613    /* Bump the maxerror field */
 614    time_maxerror += time_tolerance >> SHIFT_USEC;
 615    if ( time_maxerror > NTP_PHASE_LIMIT ) {
 616        time_maxerror = NTP_PHASE_LIMIT;
 617        time_status |= STA_UNSYNC;
 618    }
 619
 620    /*
 621     * Leap second processing. If in leap-insert state at
 622     * the end of the day, the system clock is set back one
 623     * second; if in leap-delete state, the system clock is
 624     * set ahead one second. The microtime() routine or
 625     * external clock driver will insure that reported time
 626     * is always monotonic. The ugly divides should be
 627     * replaced.
 628     */
 629    switch (time_state) {
 630
 631    case TIME_OK:
 632        if (time_status & STA_INS)
 633            time_state = TIME_INS;
 634        else if (time_status & STA_DEL)
 635            time_state = TIME_DEL;
 636        break;
 637
 638    case TIME_INS:
 639        if (xtime.tv_sec % 86400 == 0) {
 640            xtime.tv_sec--;
 641            wall_to_monotonic.tv_sec++;
 642            /* The timer interpolator will make time change gradually instead
 643             * of an immediate jump by one second.
 644             */
 645            time_interpolator_update(-NSEC_PER_SEC);
 646            time_state = TIME_OOP;
 647            clock_was_set();
 648            leap_second = TIME_INS;
 649        }
 650        break;
 651
 652    case TIME_DEL:
 653        if ((xtime.tv_sec + 1) % 86400 == 0) {
 654            xtime.tv_sec++;
 655            wall_to_monotonic.tv_sec--;
 656            /* Use of time interpolator for a gradual change of time */
 657            time_interpolator_update(NSEC_PER_SEC);
 658            time_state = TIME_WAIT;
 659            clock_was_set();
 660            leap_second = TIME_DEL;
 661        }
 662        break;
 663
 664    case TIME_OOP:
 665        time_state = TIME_WAIT;
 666        break;
 667
 668    case TIME_WAIT:
 669        if (!(time_status & (STA_INS | STA_DEL)))
 670            time_state = TIME_OK;
 671    }
 672
 673    /*
 674     * Compute the phase adjustment for the next second. In
 675     * PLL mode, the offset is reduced by a fixed factor
 676     * times the time constant. In FLL mode the offset is
 677     * used directly. In either mode, the maximum phase
 678     * adjustment for each second is clamped so as to spread
 679     * the adjustment over not more than the number of
 680     * seconds between updates.
 681     */
 682    if (time_offset < 0) {
 683        ltemp = -time_offset;
 684        if (!(time_status & STA_FLL))
 685            ltemp >>= SHIFT_KG + time_constant;
 686        if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 687            ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 688        time_offset += ltemp;
 689        time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 690    } else {
 691        ltemp = time_offset;
 692        if (!(time_status & STA_FLL))
 693            ltemp >>= SHIFT_KG + time_constant;
 694        if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
 695            ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
 696        time_offset -= ltemp;
 697        time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
 698    }
 699
 700    /*
 701     * Compute the frequency estimate and additional phase
 702     * adjustment due to frequency error for the next
 703     * second. When the PPS signal is engaged, gnaw on the
 704     * watchdog counter and update the frequency computed by
 705     * the pll and the PPS signal.
 706     */
 707    pps_valid++;
 708    if (pps_valid == PPS_VALID) {       /* PPS signal lost */
 709        pps_jitter = MAXTIME;
 710        pps_stabil = MAXFREQ;
 711        time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
 712                         STA_PPSWANDER | STA_PPSERROR);
 713    }
 714    ltemp = time_freq + pps_freq;
 715    if (ltemp < 0)
 716        time_adj -= -ltemp >>
 717            (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 718    else
 719        time_adj += ltemp >>
 720            (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
 721
 722#if HZ == 100
 723    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
 724     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
 725     */
 726    if (time_adj < 0)
 727        time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
 728    else
 729        time_adj += (time_adj >> 2) + (time_adj >> 5);
 730#endif
 731#if HZ == 1000
 732    /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
 733     * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
 734     */
 735    if (time_adj < 0)
 736        time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
 737    else
 738        time_adj += (time_adj >> 6) + (time_adj >> 7);
 739#endif
 740}
 741
 742/* in the NTP reference this is called "hardclock()" */
 743static void update_wall_time_one_tick(void)
 744{
 745        long time_adjust_step, delta_nsec;
 746
 747        if ( (time_adjust_step = time_adjust) != 0 ) {
 748            /* We are doing an adjtime thing. 
 749             *
 750             * Prepare time_adjust_step to be within bounds.
 751             * Note that a positive time_adjust means we want the clock
 752             * to run faster.
 753             *
 754             * Limit the amount of the step to be in the range
 755             * -tickadj .. +tickadj
 756             */
 757             if (time_adjust > tickadj)
 758                time_adjust_step = tickadj;
 759             else if (time_adjust < -tickadj)
 760                time_adjust_step = -tickadj;
 761
 762            /* Reduce by this step the amount of time left  */
 763            time_adjust -= time_adjust_step;
 764        }
 765        delta_nsec = tick_nsec + time_adjust_step * 1000;
 766        /*
 767         * Advance the phase, once it gets to one microsecond, then
 768         * advance the tick more.
 769         */
 770        time_phase += time_adj;
 771        if (time_phase <= -FINENSEC) {
 772                long ltemp = -time_phase >> (SHIFT_SCALE - 10);
 773                time_phase += ltemp << (SHIFT_SCALE - 10);
 774                delta_nsec -= ltemp;
 775        }
 776        else if (time_phase >= FINENSEC) {
 777                long ltemp = time_phase >> (SHIFT_SCALE - 10);
 778                time_phase -= ltemp << (SHIFT_SCALE - 10);
 779                delta_nsec += ltemp;
 780        }
 781        xtime.tv_nsec += delta_nsec;
 782        time_interpolator_update(delta_nsec);
 783
 784        /* Changes by adjtime() do not take effect till next tick. */
 785        if (time_next_adjust != 0) {
 786                time_adjust = time_next_adjust;
 787                time_next_adjust = 0;
 788        }
 789}
 790
 791/*
 792 * Using a loop looks inefficient, but "ticks" is
 793 * usually just one (we shouldn't be losing ticks,
 794 * we're doing this this way mainly for interrupt
 795 * latency reasons, not because we think we'll
 796 * have lots of lost timer ticks
 797 */
 798static void update_wall_time(unsigned long ticks)
 799{
 800        do {
 801                ticks--;
 802                update_wall_time_one_tick();
 803                if (xtime.tv_nsec >= 1000000000) {
 804                        xtime.tv_nsec -= 1000000000;
 805                        xtime.tv_sec++;
 806                        second_overflow();
 807                }
 808        } while (ticks);
 809}
 810
 811static inline void do_process_times(struct task_struct *p,
 812        unsigned long user, unsigned long system)
 813{
 814        unsigned long psecs;
 815
 816        psecs = (p->utime += user);
 817        psecs += (p->stime += system);
 818        if (p->signal && !unlikely(p->state & (EXIT_DEAD|EXIT_ZOMBIE)) &&
 819                        (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_cur)) {
 820                /* Send SIGXCPU every second.. */
 821                if (!(psecs % HZ))
 822                        send_sig(SIGXCPU, p, 1);
 823                /* and SIGKILL when we go over max.. */
 824                if (psecs / HZ >= p->rlim[RLIMIT_CPU].rlim_max)
 825                        send_sig(SIGKILL, p, 1);
 826        }
 827}
 828
 829static inline void do_it_virt(struct task_struct * p, unsigned long ticks)
 830{
 831        unsigned long it_virt = p->it_virt_value;
 832
 833        if (it_virt) {
 834                it_virt -= ticks;
 835                if (!it_virt) {
 836                        it_virt = p->it_virt_incr;
 837                        send_sig(SIGVTALRM, p, 1);
 838                }
 839                p->it_virt_value = it_virt;
 840        }
 841}
 842
 843static inline void do_it_prof(struct task_struct *p)
 844{
 845        unsigned long it_prof = p->it_prof_value;
 846
 847        if (it_prof) {
 848                if (--it_prof == 0) {
 849                        it_prof = p->it_prof_incr;
 850                        send_sig(SIGPROF, p, 1);
 851                }
 852                p->it_prof_value = it_prof;
 853        }
 854}
 855
 856static void update_one_process(struct task_struct *p, unsigned long user,
 857                        unsigned long system, int cpu)
 858{
 859        do_process_times(p, user, system);
 860        do_it_virt(p, user);
 861        do_it_prof(p);
 862}       
 863
 864/*
 865 * Called from the timer interrupt handler to charge one tick to the current 
 866 * process.  user_tick is 1 if the tick is user time, 0 for system.
 867 */
 868void update_process_times(int user_tick)
 869{
 870        struct task_struct *p = current;
 871        int cpu = smp_processor_id(), system = user_tick ^ 1;
 872
 873        update_one_process(p, user_tick, system, cpu);
 874        run_local_timers();
 875        scheduler_tick(user_tick, system);
 876}
 877
 878/*
 879 * Nr of active tasks - counted in fixed-point numbers
 880 */
 881static unsigned long count_active_tasks(void)
 882{
 883        return (nr_running() + nr_uninterruptible()) * FIXED_1;
 884}
 885
/*
 * Load-average estimates.  calc_load() updates slot 0 with EXP_1, slot 1
 * with EXP_5 and slot 2 with EXP_15.
 *
 * The name avenrun[] was chosen because the GNU make sources (load.c)
 * seem to imply that it is the standard name for this kind of thing.
 * Nothing else seems to be standardized: the fractional size etc. all
 * seem to differ between machines.
 *
 * Requires xtime_lock to access.
 */
unsigned long avenrun[3];
 895
 896/*
 897 * calc_load - given tick count, update the avenrun load estimates.
 898 * This is called while holding a write_lock on xtime_lock.
 899 */
 900static inline void calc_load(unsigned long ticks)
 901{
 902        unsigned long active_tasks; /* fixed-point */
 903        static int count = LOAD_FREQ;
 904
 905        count -= ticks;
 906        if (count < 0) {
 907                count += LOAD_FREQ;
 908                active_tasks = count_active_tasks();
 909                CALC_LOAD(avenrun[0], EXP_1, active_tasks);
 910                CALC_LOAD(avenrun[1], EXP_5, active_tasks);
 911                CALC_LOAD(avenrun[2], EXP_15, active_tasks);
 912        }
 913}
 914
/*
 * jiffies at the most recent update of wall time.  update_times() advances
 * this by the number of ticks it passes to update_wall_time().
 */
unsigned long wall_jiffies = INITIAL_JIFFIES;
 917
/*
 * This seqlock protects us from races in SMP while playing with xtime
 * and avenrun.  It is only defined here when the architecture does not
 * provide its own (ARCH_HAVE_XTIME_LOCK).
 */
#ifndef ARCH_HAVE_XTIME_LOCK
seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;

EXPORT_SYMBOL(xtime_lock);
#endif
 927
 928/*
 929 * This function runs timers and the timer-tq in bottom half context.
 930 */
 931static void run_timer_softirq(struct softirq_action *h)
 932{
 933        tvec_base_t *base = &__get_cpu_var(tvec_bases);
 934
 935        if (time_after_eq(jiffies, base->timer_jiffies))
 936                __run_timers(base);
 937}
 938
 939/*
 940 * Called by the local, per-CPU timer interrupt on SMP.
 941 */
 942void run_local_timers(void)
 943{
 944        raise_softirq(TIMER_SOFTIRQ);
 945}
 946
 947/*
 948 * Called by the timer interrupt. xtime_lock must already be taken
 949 * by the timer IRQ!
 950 */
 951static inline void update_times(void)
 952{
 953        unsigned long ticks;
 954
 955        ticks = jiffies - wall_jiffies;
 956        if (ticks) {
 957                wall_jiffies += ticks;
 958                update_wall_time(ticks);
 959        }
 960        calc_load(ticks);
 961}
 962  
 963/*
 964 * The 64-bit jiffies value is not atomic - you MUST NOT read it
 965 * without sampling the sequence number in xtime_lock.
 966 * jiffies is defined in the linker script...
 967 */
 968
 969void do_timer(struct pt_regs *regs)
 970{
 971        jiffies_64++;
 972#ifndef CONFIG_SMP
 973        /* SMP process accounting uses the local APIC timer */
 974
 975        update_process_times(user_mode(regs));
 976#endif
 977        update_times();
 978}
 979
 980#ifdef __ARCH_WANT_SYS_ALARM
 981
 982/*
 983 * For backwards compatibility?  This can be done in libc so Alpha
 984 * and all newer ports shouldn't need it.
 985 */
 986asmlinkage unsigned long sys_alarm(unsigned int seconds)
 987{
 988        struct itimerval it_new, it_old;
 989        unsigned int oldalarm;
 990
 991        it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0;
 992        it_new.it_value.tv_sec = seconds;
 993        it_new.it_value.tv_usec = 0;
 994        do_setitimer(ITIMER_REAL, &it_new, &it_old);
 995        oldalarm = it_old.it_value.tv_sec;
 996        /* ehhh.. We can't return 0 if we have an alarm pending.. */
 997        /* And we'd better return too much than too little anyway */
 998        if ((!oldalarm && it_old.it_value.tv_usec) || it_old.it_value.tv_usec >= 500000)
 999                oldalarm++;
1000        return oldalarm;
1001}
1002
1003#endif
1004
1005#ifndef __alpha__
1006
1007/*
1008 * The Alpha uses getxpid, getxuid, and getxgid instead.  Maybe this
1009 * should be moved into arch/i386 instead?
1010 */
1011
1012/**
1013 * sys_getpid - return the thread group id of the current process
1014 *
1015 * Note, despite the name, this returns the tgid not the pid.  The tgid and
1016 * the pid are identical unless CLONE_THREAD was specified on clone() in
1017 * which case the tgid is the same in all threads of the same group.
1018 *
1019 * This is SMP safe as current->tgid does not change.
1020 */
1021asmlinkage long sys_getpid(void)
1022{
1023        return current->tgid;
1024}
1025
1026/*
1027 * Accessing ->group_leader->real_parent is not SMP-safe, it could
1028 * change from under us. However, rather than getting any lock
1029 * we can use an optimistic algorithm: get the parent
1030 * pid, and go back and check that the parent is still
1031 * the same. If it has changed (which is extremely unlikely
1032 * indeed), we just try again..
1033 *
1034 * NOTE! This depends on the fact that even if we _do_
1035 * get an old value of "parent", we can happily dereference
1036 * the pointer (it was and remains a dereferencable kernel pointer
1037 * no matter what): we just can't necessarily trust the result
1038 * until we know that the parent pointer is valid.
1039 *
1040 * NOTE2: ->group_leader never changes from under us.
1041 */
1042asmlinkage long sys_getppid(void)
1043{
1044        int pid;
1045        struct task_struct *me = current;
1046        struct task_struct *parent;
1047
1048        parent = me->group_leader->real_parent;
1049        for (;;) {
1050                pid = parent->tgid;
1051#ifdef CONFIG_SMP
1052{
1053                struct task_struct *old = parent;
1054
1055                /*
1056                 * Make sure we read the pid before re-reading the
1057                 * parent pointer:
1058                 */
1059                rmb();
1060                parent = me->group_leader->real_parent;
1061                if (old != parent)
1062                        continue;
1063}
1064#endif
1065                break;
1066        }
1067        return pid;
1068}
1069
1070asmlinkage long sys_getuid(void)
1071{
1072        /* Only we change this so SMP safe */
1073        return current->uid;
1074}
1075
1076asmlinkage long sys_geteuid(void)
1077{
1078        /* Only we change this so SMP safe */
1079        return current->euid;
1080}
1081
1082asmlinkage long sys_getgid(void)
1083{
1084        /* Only we change this so SMP safe */
1085        return current->gid;
1086}
1087
1088asmlinkage long sys_getegid(void)
1089{
1090        /* Only we change this so SMP safe */
1091        return  current->egid;
1092}
1093
1094#endif
1095
/*
 * Argument block handed to process_timeout() through timer.data by
 * schedule_timeout(), which builds it on its own stack and initializes
 * it positionally as { current, 0 }.
 */
struct ptimeout {
        struct task_struct *tsk;        /* task to wake when the timer fires */
        int timer_ran;                  /* set to 1 by process_timeout() after the wake-up */
};
1100
1101static void process_timeout(unsigned long __data)
1102{
1103        struct ptimeout *p = (struct ptimeout *) __data;
1104        wake_up_process(p->tsk);
1105        /* make sure the wake-up has completed */
1106        smp_wmb();
1107        p->timer_ran = 1;
1108}
1109
1110/**
1111 * schedule_timeout - sleep until timeout
1112 * @timeout: timeout value in jiffies
1113 *
1114 * Make the current task sleep until @timeout jiffies have
1115 * elapsed. The routine will return immediately unless
1116 * the current task state has been set (see set_current_state()).
1117 *
1118 * You can set the task state as follows -
1119 *
1120 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
1121 * pass before the routine returns. The routine will return 0
1122 *
1123 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1124 * delivered to the current task. In this case the remaining time
1125 * in jiffies will be returned, or 0 if the timer expired in time
1126 *
1127 * The current task state is guaranteed to be TASK_RUNNING when this
1128 * routine returns.
1129 *
1130 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
1131 * the CPU away without a bound on the timeout. In this case the return
1132 * value will be %MAX_SCHEDULE_TIMEOUT.
1133 *
1134 * In all cases the return value is guaranteed to be non-negative.
1135 */
1136fastcall signed long __sched schedule_timeout(signed long timeout)
1137{
1138        struct timer_list timer;
1139        unsigned long expire;
1140        struct ptimeout p = { current, 0 };
1141
1142        if (crashdump_mode()) {
1143                diskdump_mdelay(timeout);
1144                set_current_state(TASK_RUNNING);
1145                return timeout;
1146        }
1147
1148        switch (timeout)
1149        {
1150        case MAX_SCHEDULE_TIMEOUT:
1151                /*
1152                 * These two special cases are useful to be comfortable
1153                 * in the caller. Nothing more. We could take
1154                 * MAX_SCHEDULE_TIMEOUT from one of the negative value
1155                 * but I' d like to return a valid offset (>=0) to allow
1156                 * the caller to do everything it want with the retval.
1157                 */
1158                schedule();
1159                goto out;
1160        default:
1161                /*
1162                 * Another bit of PARANOID. Note that the retval will be
1163                 * 0 since no piece of kernel is supposed to do a check
1164                 * for a negative retval of schedule_timeout() (since it
1165                 * should never happens anyway). You just have the printk()
1166                 * that will tell you if something is gone wrong and where.
1167                 */
1168                if (timeout < 0)
1169                {
1170                        printk(KERN_ERR "schedule_timeout: wrong timeout "
1171                               "value %lx from %p\n", timeout,
1172                               __builtin_return_address(0));
1173                        current->state = TASK_RUNNING;
1174                        goto out;
1175                }
1176        }
1177
1178        expire = timeout + jiffies;
1179
1180        init_timer(&timer);
1181        timer.expires = expire;
1182        timer.data = (unsigned long) &p;
1183        timer.function = process_timeout;
1184
1185        add_timer(&timer);
1186        schedule();
1187        if (!p.timer_ran)
1188                del_singleshot_timer_sync(&timer);
1189
1190        timeout = expire - jiffies;
1191
1192 out:
1193        return timeout < 0 ? 0 : timeout;
1194}
1195
1196EXPORT_SYMBOL(schedule_timeout);
1197
1198signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1199{
1200        __set_current_state(TASK_UNINTERRUPTIBLE);
1201        return schedule_timeout(timeout);
1202}
1203EXPORT_SYMBOL(schedule_timeout_uninterruptible);
1204
1205/* Thread ID - the internal kernel "pid" */
1206asmlinkage long sys_gettid(void)
1207{
1208        return current->pid;
1209}
1210
1211static long __sched nanosleep_restart(struct restart_block *restart)
1212{
1213        unsigned long expire = restart->arg0, now = jiffies;
1214        struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
1215        long ret;
1216
1217        /* Did it expire while we handled signals? */
1218        if (!time_after(expire, now))
1219                return 0;
1220
1221        current->state = TASK_INTERRUPTIBLE;
1222        expire = schedule_timeout(expire - now);
1223
1224        ret = 0;
1225        if (expire) {
1226                struct timespec t;
1227                jiffies_to_timespec(expire, &t);
1228
1229                ret = -ERESTART_RESTARTBLOCK;
1230                if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1231                        ret = -EFAULT;
1232                /* The 'restart' block is already filled in */
1233        }
1234        return ret;
1235}
1236
1237asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1238{
1239        struct timespec t;
1240        unsigned long expire;
1241        long ret;
1242
1243        if (copy_from_user(&t, rqtp, sizeof(t)))
1244                return -EFAULT;
1245
1246        if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
1247                return -EINVAL;
1248
1249        expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
1250        current->state = TASK_INTERRUPTIBLE;
1251        expire = schedule_timeout(expire);
1252
1253        ret = 0;
1254        if (expire) {
1255                struct restart_block *restart;
1256                jiffies_to_timespec(expire, &t);
1257                if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
1258                        return -EFAULT;
1259
1260                restart = &current_thread_info()->restart_block;
1261                restart->fn = nanosleep_restart;
1262                restart->arg0 = jiffies + expire;
1263                restart->arg1 = (unsigned long) rmtp;
1264                ret = -ERESTART_RESTARTBLOCK;
1265        }
1266        return ret;
1267}
1268
1269/*
1270 * sys_sysinfo - fill in sysinfo struct
1271 */ 
1272asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1273{
1274        struct sysinfo val;
1275        unsigned long mem_total, sav_total;
1276        unsigned int mem_unit, bitcount;
1277        unsigned long seq;
1278
1279        memset((char *)&val, 0, sizeof(struct sysinfo));
1280
1281        do {
1282                struct timespec tp;
1283                seq = read_seqbegin(&xtime_lock);
1284
1285                /*
1286                 * This is annoying.  The below is the same thing
1287                 * posix_get_clock_monotonic() does, but it wants to
1288                 * take the lock which we want to cover the loads stuff
1289                 * too.
1290                 */
1291
1292                getnstimeofday(&tp);
1293                tp.tv_sec += wall_to_monotonic.tv_sec;
1294                tp.tv_nsec += wall_to_monotonic.tv_nsec;
1295                if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
1296                        tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
1297                        tp.tv_sec++;
1298                }
1299                val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
1300
1301                val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
1302                val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
1303                val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
1304
1305                val.procs = nr_threads;
1306        } while (read_seqretry(&xtime_lock, seq));
1307
1308        si_meminfo(&val);
1309        si_swapinfo(&val);
1310
1311        /*
1312         * If the sum of all the available memory (i.e. ram + swap)
1313         * is less than can be stored in a 32 bit unsigned long then
1314         * we can be binary compatible with 2.2.x kernels.  If not,
1315         * well, in that case 2.2.x was broken anyways...
1316         *
1317         *  -Erik Andersen <andersee@debian.org>
1318         */
1319
1320        mem_total = val.totalram + val.totalswap;
1321        if (mem_total < val.totalram || mem_total < val.totalswap)
1322                goto out;
1323        bitcount = 0;
1324        mem_unit = val.mem_unit;
1325        while (mem_unit > 1) {
1326                bitcount++;
1327                mem_unit >>= 1;
1328                sav_total = mem_total;
1329                mem_total <<= 1;
1330                if (mem_total < sav_total)
1331                        goto out;
1332        }
1333
1334        /*
1335         * If mem_total did not overflow, multiply all memory values by
1336         * val.mem_unit and set it to 1.  This leaves things compatible
1337         * with 2.2.x, and also retains compatibility with earlier 2.4.x
1338         * kernels...
1339         */
1340
1341        val.mem_unit = 1;
1342        val.totalram <<= bitcount;
1343        val.freeram <<= bitcount;
1344        val.sharedram <<= bitcount;
1345        val.bufferram <<= bitcount;
1346        val.totalswap <<= bitcount;
1347        val.freeswap <<= bitcount;
1348        val.totalhigh <<= bitcount;
1349        val.freehigh <<= bitcount;
1350
1351 out:
1352        if (copy_to_user(info, &val, sizeof(struct sysinfo)))
1353                return -EFAULT;
1354
1355        return 0;
1356}
1357
1358static void /* __devinit */ init_timers_cpu(int cpu)
1359{
1360        int j;
1361        tvec_base_t *base;
1362       
1363        base = &per_cpu(tvec_bases, cpu);
1364        spin_lock_init(&base->lock);
1365        for (j = 0; j < TVN_SIZE; j++) {
1366                INIT_LIST_HEAD(base->tv5.vec + j);
1367                INIT_LIST_HEAD(base->tv4.vec + j);
1368                INIT_LIST_HEAD(base->tv3.vec + j);
1369                INIT_LIST_HEAD(base->tv2.vec + j);
1370        }
1371        for (j = 0; j < TVR_SIZE; j++)
1372                INIT_LIST_HEAD(base->tv1.vec + j);
1373
1374        base->timer_jiffies = jiffies;
1375}
1376
/* Copy of the local CPU's timer base taken by dump_clear_timers(). */
static tvec_base_t saved_tvec_base;
1378
1379void dump_clear_timers(void)
1380{
1381        tvec_base_t *base = &per_cpu(tvec_bases, smp_processor_id());
1382
1383        memcpy(&saved_tvec_base, base, sizeof(saved_tvec_base));
1384        init_timers_cpu(smp_processor_id());
1385}
1386
1387EXPORT_SYMBOL_GPL(dump_clear_timers);
1388
1389void dump_run_timers(void)
1390{
1391        tvec_base_t *base = &__get_cpu_var(tvec_bases);
1392
1393        __run_timers(base);
1394}
1395
1396EXPORT_SYMBOL_GPL(dump_run_timers);
1397
1398#ifdef CONFIG_HOTPLUG_CPU
1399static int migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1400{
1401        struct timer_list *timer;
1402
1403        while (!list_empty(head)) {
1404                timer = list_entry(head->next, struct timer_list, entry);
1405                /* We're locking backwards from __mod_timer order here,
1406                   beware deadlock. */
1407                if (!spin_trylock(&timer->lock))
1408                        return 0;
1409                list_del(&timer->entry);
1410                internal_add_timer(new_base, timer);
1411                timer->base = new_base;
1412                spin_unlock(&timer->lock);
1413        }
1414        return 1;
1415}
1416
1417static void __devinit migrate_timers(int cpu)
1418{
1419        tvec_base_t *old_base;
1420        tvec_base_t *new_base;
1421        int i;
1422
1423        BUG_ON(cpu_online(cpu));
1424        old_base = &per_cpu(tvec_bases, cpu);
1425        new_base = &get_cpu_var(tvec_bases);
1426
1427        local_irq_disable();
1428again:
1429        /* Prevent deadlocks via ordering by old_base < new_base. */
1430        if (old_base < new_base) {
1431                spin_lock(&new_base->lock);
1432                spin_lock(&old_base->lock);
1433        } else {
1434                spin_lock(&old_base->lock);
1435                spin_lock(&new_base->lock);
1436        }
1437
1438        if (old_base->running_timer)
1439                BUG();
1440        for (i = 0; i < TVR_SIZE; i++)
1441                if (!migrate_timer_list(new_base, old_base->tv1.vec + i))
1442                        goto unlock_again;
1443        for (i = 0; i < TVN_SIZE; i++)
1444                if (!migrate_timer_list(new_base, old_base->tv2.vec + i)
1445                    || !migrate_timer_list(new_base, old_base->tv3.vec + i)
1446                    || !migrate_timer_list(new_base, old_base->tv4.vec + i)
1447                    || !migrate_timer_list(new_base, old_base->tv5.vec + i))
1448                        goto unlock_again;
1449        spin_unlock(&old_base->lock);
1450        spin_unlock(&new_base->lock);
1451        local_irq_enable();
1452        put_cpu_var(tvec_bases);
1453        return;
1454
1455unlock_again:
1456        /* Avoid deadlock with __mod_timer, by backing off. */
1457        spin_unlock(&old_base->lock);
1458        spin_unlock(&new_base->lock);
1459        cpu_relax();
1460        goto again;
1461}
1462#endif /* CONFIG_HOTPLUG_CPU */
1463
1464static int __devinit timer_cpu_notify(struct notifier_block *self, 
1465                                unsigned long action, void *hcpu)
1466{
1467        long cpu = (long)hcpu;
1468        switch(action) {
1469        case CPU_UP_PREPARE:
1470                init_timers_cpu(cpu);
1471                break;
1472#ifdef CONFIG_HOTPLUG_CPU
1473        case CPU_DEAD:
1474                migrate_timers(cpu);
1475                break;
1476#endif
1477        default:
1478                break;
1479        }
1480        return NOTIFY_OK;
1481}
1482
/* Notifier block that init_timers() registers with register_cpu_notifier(). */
static struct notifier_block __devinitdata timers_nb = {
        .notifier_call  = timer_cpu_notify,
};
1486
1487
1488void __init init_timers(void)
1489{
1490        timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1491                                (void *)(long)smp_processor_id());
1492        register_cpu_notifier(&timers_nb);
1493        open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1494}
1495
1496#ifdef CONFIG_TIME_INTERPOLATION
1497
/* Interpolator currently in use; NULL when none is set up. */
struct time_interpolator *time_interpolator;
/* Singly linked list (via ->next) of every registered interpolator. */
static struct time_interpolator *time_interpolator_list;
/* Held around registration and unregistration of interpolators. */
static spinlock_t time_interpolator_lock = SPIN_LOCK_UNLOCKED;
1501
1502static inline u64 time_interpolator_get_cycles(unsigned int src)
1503{
1504        unsigned long (*x)(void);
1505
1506        switch (src)
1507        {
1508                case TIME_SOURCE_FUNCTION:
1509                        x = time_interpolator->addr;
1510                        return x();
1511
1512                case TIME_SOURCE_MMIO64 :
1513                        return readq(time_interpolator->addr);
1514
1515                case TIME_SOURCE_MMIO32 :
1516                        return readl(time_interpolator->addr);
1517
1518                default: return get_cycles();
1519        }
1520}
1521
1522static inline u64 time_interpolator_get_counter(int writelock)
1523{
1524        unsigned int src = time_interpolator->source;
1525
1526        if (time_interpolator->jitter)
1527        {
1528                u64 lcycle;
1529                u64 now;
1530
1531                do {
1532                        lcycle = time_interpolator->last_cycle;
1533                        now = time_interpolator_get_cycles(src);
1534                        if (lcycle && time_after(lcycle, now))
1535                                return lcycle;
1536
1537                        /* When holding the xtime write lock, there's no need
1538                         * to add the overhead of the cmpxchg.  Readers are
1539                         * force to retry until the write lock is released.
1540                         */
1541                        if (writelock) {
1542                                time_interpolator->last_cycle = now;
1543                                return now;
1544                        }
1545                        /* Keep track of the last timer value returned. The use of cmpxchg here
1546                         * will cause contention in an SMP environment.
1547                         */
1548                } while (unlikely(cmpxchg(&time_interpolator->last_cycle, lcycle, now) != lcycle));
1549                return now;
1550        }
1551        else
1552                return time_interpolator_get_cycles(src);
1553}
1554
1555void time_interpolator_reset(void)
1556{
1557        time_interpolator->offset = 0;
1558        time_interpolator->last_counter = time_interpolator_get_counter(1);
1559}
1560
/*
 * Nanoseconds represented by the counter advance from (i)->last_counter
 * to @count: the difference is masked with (i)->mask, multiplied by
 * (i)->nsec_per_cyc and shifted right by (i)->shift.  Every use of @i is
 * parenthesized so that any pointer expression may be passed.
 */
#define GET_TI_NSECS(count,i) (((((count) - (i)->last_counter) & (i)->mask) * (i)->nsec_per_cyc) >> (i)->shift)
1562
1563unsigned long time_interpolator_get_offset(void)
1564{
1565        /* If we do not have a time interpolator set up then just return zero */
1566        if (!time_interpolator)
1567                return 0;
1568
1569        return time_interpolator->offset +
1570                GET_TI_NSECS(time_interpolator_get_counter(0), time_interpolator);
1571}
1572
/*
 * Tuning parameters used by time_interpolator_update(): tuning runs when
 * jiffies is a multiple of INTERPOLATOR_ADJUST, and the clock is sped up
 * when more than INTERPOLATOR_MAX_SKIP nanoseconds were skipped.  The
 * product is parenthesized so it expands safely inside any expression.
 */
#define INTERPOLATOR_ADJUST 65536
#define INTERPOLATOR_MAX_SKIP (10*INTERPOLATOR_ADJUST)
1575
1576static void time_interpolator_update(long delta_nsec)
1577{
1578        u64 counter;
1579        unsigned long offset;
1580
1581        /* If there is no time interpolator set up then do nothing */
1582        if (!time_interpolator)
1583                return;
1584
1585        /* The interpolator compensates for late ticks by accumulating
1586         * the late time in time_interpolator->offset. A tick earlier than
1587         * expected will lead to a reset of the offset and a corresponding
1588         * jump of the clock forward. Again this only works if the
1589         * interpolator clock is running slightly slower than the regular clock
1590         * and the tuning logic insures that.
1591         */
1592
1593        counter = time_interpolator_get_counter(1);
1594        offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
1595
1596        if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
1597                time_interpolator->offset = offset - delta_nsec;
1598        else {
1599                time_interpolator->skips++;
1600                time_interpolator->ns_skipped += delta_nsec - offset;
1601                time_interpolator->offset = 0;
1602        }
1603        time_interpolator->last_counter = counter;
1604
1605        /* Tuning logic for time interpolator invoked every minute or so.
1606         * Decrease interpolator clock speed if no skips occurred and an offset is carried.
1607         * Increase interpolator clock speed if we skip too much time.
1608         */
1609        if (jiffies % INTERPOLATOR_ADJUST == 0)
1610        {
1611                if (time_interpolator->skips == 0 && time_interpolator->offset > TICK_NSEC)
1612                        time_interpolator->nsec_per_cyc--;
1613                if (time_interpolator->ns_skipped > INTERPOLATOR_MAX_SKIP && time_interpolator->offset == 0)
1614                        time_interpolator->nsec_per_cyc++;
1615                time_interpolator->skips = 0;
1616                time_interpolator->ns_skipped = 0;
1617        }
1618}
1619
1620static inline int
1621is_better_time_interpolator(struct time_interpolator *new)
1622{
1623        if (!time_interpolator)
1624                return 1;
1625        return new->frequency > 2*time_interpolator->frequency ||
1626            (unsigned long)new->drift < (unsigned long)time_interpolator->drift;
1627}
1628
1629void
1630register_time_interpolator(struct time_interpolator *ti)
1631{
1632        unsigned long flags;
1633
1634        /* Sanity check */
1635        if (ti->frequency == 0 || ti->mask == 0)
1636                BUG();
1637
1638        ti->nsec_per_cyc = ((u64)NSEC_PER_SEC << ti->shift) / ti->frequency;
1639        spin_lock(&time_interpolator_lock);
1640        write_seqlock_irqsave(&xtime_lock, flags);
1641        if (is_better_time_interpolator(ti)) {
1642                time_interpolator = ti;
1643                time_interpolator_reset();
1644        }
1645        write_sequnlock_irqrestore(&xtime_lock, flags);
1646
1647        ti->next = time_interpolator_list;
1648        time_interpolator_list = ti;
1649        spin_unlock(&time_interpolator_lock);
1650}
1651
1652void
1653unregister_time_interpolator(struct time_interpolator *ti)
1654{
1655        struct time_interpolator *curr, **prev;
1656        unsigned long flags;
1657
1658        spin_lock(&time_interpolator_lock);
1659        prev = &time_interpolator_list;
1660        for (curr = *prev; curr; curr = curr->next) {
1661                if (curr == ti) {
1662                        *prev = curr->next;
1663                        break;
1664                }
1665                prev = &curr->next;
1666        }
1667
1668        write_seqlock_irqsave(&xtime_lock, flags);
1669        if (ti == time_interpolator) {
1670                /* we lost the best time-interpolator: */
1671                time_interpolator = NULL;
1672                /* find the next-best interpolator */
1673                for (curr = time_interpolator_list; curr; curr = curr->next)
1674                        if (is_better_time_interpolator(curr))
1675                                time_interpolator = curr;
1676                time_interpolator_reset();
1677        }
1678        write_sequnlock_irqrestore(&xtime_lock, flags);
1679        spin_unlock(&time_interpolator_lock);
1680}
1681#endif /* CONFIG_TIME_INTERPOLATION */
1682
1683/**
1684 * msleep - sleep safely even with waitqueue interruptions
1685 * @msecs: Time in milliseconds to sleep for
1686 */
1687void msleep(unsigned int msecs)
1688{
1689        unsigned long timeout = msecs_to_jiffies(msecs);
1690
1691        if (unlikely(crashdump_mode())) {
1692                while (msecs--) udelay(1000);
1693                return;
1694        }
1695
1696        while (timeout) {
1697                set_current_state(TASK_UNINTERRUPTIBLE);
1698                timeout = schedule_timeout(timeout);
1699        }
1700}
1701
1702EXPORT_SYMBOL(msleep);
1703
1704/**
1705 * msleep_interruptible - sleep waiting for waitqueue interruptions
1706 * @msecs: Time in milliseconds to sleep for
1707 */
1708unsigned long msleep_interruptible(unsigned int msecs)
1709{
1710        unsigned long timeout = msecs_to_jiffies(msecs);
1711
1712        while (timeout && !signal_pending(current)) {
1713                set_current_state(TASK_INTERRUPTIBLE);
1714                timeout = schedule_timeout(timeout);
1715        }
1716        return jiffies_to_msecs(timeout);
1717}
1718
1719EXPORT_SYMBOL(msleep_interruptible);
1720