kernel/sched.c
26041 /*
26042 * linux/kernel/sched.c
26043 *
26044 * Copyright (C) 1991, 1992 Linus Torvalds
26045 *
26046 * 1996-12-23 Modified by Dave Grothe to fix bugs in
26047 * semaphores and make semaphores SMP safe
26048 * 1997-01-28 Modified by Finn Arne Gangstad to make
26049 * timers scale better.
26050 * 1997-09-10 Updated NTP code according to technical
26051 * memorandum Jan '96 "A Kernel Model for Precision
26052 * Timekeeping" by Dave Mills
26053 * 1998-11-19 Implemented schedule_timeout() and related
26054 * stuff by Andrea Arcangeli
26055 * 1998-12-24 Fixed a xtime SMP race (we need the
26056 * xtime_lock rw spinlock to serialize accesses to
26057 * xtime/lost_ticks). Copyright (C) 1998 Andrea
26058 * Arcangeli
26059 * 1998-12-28 Implemented better SMP scheduling by Ingo
26060 * Molnar
26061 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
26062 */
26063
26064 /* 'sched.c' is the main kernel file. It contains
26065 * scheduling primitives (sleep_on, wakeup, schedule etc)
26066 * as well as a number of simple system call functions
26067 * (such as getpid()), which just extract a field from
26068 * the current task. */
26069
26070 #include <linux/mm.h>
26071 #include <linux/kernel_stat.h>
26072 #include <linux/fdreg.h>
26073 #include <linux/delay.h>
26074 #include <linux/interrupt.h>
26075 #include <linux/smp_lock.h>
26076 #include <linux/init.h>
26077
26078 #include <asm/io.h>
26079 #include <asm/uaccess.h>
26080 #include <asm/pgtable.h>
26081 #include <asm/mmu_context.h>
26082 #include <asm/semaphore-helper.h>
26083
26084 #include <linux/timex.h>
26085
26086 /* kernel variables */
26087
26088 /* systemwide security settings */
26089 unsigned securebits = SECUREBITS_DEFAULT;
26090
26091 /* timer interrupt period */
26092 long tick = (1000000 + HZ/2) / HZ;
26093
26094 /* The current time */
26095 volatile struct timeval
26096 xtime __attribute__ ((aligned (16)));
26097
26098 /* Don't completely fail for HZ > 500. */
26099 int tickadj = 500/HZ ? : 1; /* microsecs */
26100
26101 DECLARE_TASK_QUEUE(tq_timer);
26102 DECLARE_TASK_QUEUE(tq_immediate);
26103 DECLARE_TASK_QUEUE(tq_scheduler);
26104
26105 /* phase-lock loop variables */
26106 /* TIME_ERROR prevents overwriting the CMOS clock */
26107 /* clock synchronization status */
26108 int time_state = TIME_OK;
26109 /* clock status bits */
26110 int time_status = STA_UNSYNC;
26111 /* time adjustment (us) */
26112 long time_offset = 0;
26113 /* pll time constant */
26114 long time_constant = 2;
26115 /* frequency tolerance (ppm) */
26116 long time_tolerance = MAXFREQ;
26117 /* clock precision (us) */
26118 long time_precision = 1;
26119 /* maximum error (us) */
26120 long time_maxerror = NTP_PHASE_LIMIT;
26121 /* estimated error (us) */
26122 long time_esterror = NTP_PHASE_LIMIT;
26123 /* phase offset (scaled us) */
26124 long time_phase = 0;
26125 /* frequency offset (scaled ppm) */
26126 long time_freq =
26127 ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
26128 /* tick adjust (scaled 1 / HZ) */
26129 long time_adj = 0;
26130 /* time at last adjustment (s) */
26131 long time_reftime = 0;
26132
26133 long time_adjust = 0;
26134 long time_adjust_step = 0;
26135
26136 unsigned long event = 0;
26137
26138 extern int do_setitimer(int, struct itimerval *,
26139 struct itimerval *);
26140 unsigned int * prof_buffer = NULL;
26141 unsigned long prof_len = 0;
26142 unsigned long prof_shift = 0;
26143
26144 extern void mem_use(void);
26145
26146 unsigned long volatile jiffies=0;
26147
26148 /* Init task must be ok at boot for the ix86 as we will
26149 * check its signals via the SMP irq return path. */
26150 struct task_struct * task[NR_TASKS] = {&init_task, };
26151
26152 struct kernel_stat kstat = { 0 };
26153
26154 void scheduling_functions_start_here(void) { }
26155
26156 #ifdef __SMP__
26157 static void reschedule_idle_slow(struct task_struct * p)
26158 {
26159 /* (see reschedule_idle() for an explanation first ...)
26160 *
26161 * Pass #2
26162 *
26163 * We try to find another (idle) CPU for this woken-up
26164 * process.
26165 *
26166 * On SMP, we mostly try to see if the CPU the task used
26167 * to run on is idle.. but we will use another idle CPU
26168 * too, at this point we already know that this CPU is
26169 * not willing to reschedule in the near future.
26170 *
26171 * An idle CPU is definitely wasted, especially if this
26172 * CPU is running long-timeslice processes. The following
26173 * algorithm is pretty good at finding the best idle CPU
26174 * to send this process to.
26175 *
26176 * [We can try to preempt low-priority processes on other
26177 * CPUs in 2.3. Also we can try to use the avg_slice
26178 * value to predict 'likely reschedule' events even on
26179 * other CPUs.] */
26180 int best_cpu = p->processor,
26181 this_cpu = smp_processor_id();
26182 struct task_struct **idle = task, *tsk, *target_tsk;
26183 int i = smp_num_cpus;
26184
26185 target_tsk = NULL;
26186 do {
26187 tsk = *idle;
26188 idle++;
26189 if (tsk->has_cpu) {
26190 if (tsk->processor == this_cpu)
26191 continue;
26192 target_tsk = tsk;
26193 if (tsk->processor == best_cpu) {
26194 /* bingo, we couldn't get a better CPU, activate
26195 * it. */
26196 goto send; /* this one helps GCC ... */
26197 }
26198 }
26199 } while (--i > 0);
26200
26201 /* found any idle CPU? */
26202 if (target_tsk) {
26203 send:
26204 target_tsk->need_resched = 1;
26205 smp_send_reschedule(target_tsk->processor);
26206 return;
26207 }
26208 }
26209 #endif /* __SMP__ */
26210
26211 /* If there is a dependency between p1 and p2, don't be
26212 * too eager to go into the slow schedule. In
26213 * particular, if p1 and p2 both want the kernel lock,
26214 * there is no point in trying to make them extremely
26215 * parallel..
26216 *
26217 * (No lock - lock_depth < 0) */
26218 #define related(p1,p2) \
26219 ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)
26220
26221 static inline void reschedule_idle(
26222 struct task_struct * p)
26223 {
26224
26225 if (p->policy != SCHED_OTHER ||
26226 p->counter > current->counter + 3) {
26227 current->need_resched = 1;
26228 return;
26229 }
26230
26231 #ifdef __SMP__
26232 /* ("wakeup()" should not be called before we've
26233 * initialized SMP completely. Basically a not-yet
26234 * initialized SMP subsystem can be considered as a
26235 * not-yet working scheduler; simply don't use it before
26236 * it's up and running ...)
26237 *
26238 * SMP rescheduling is done in 2 passes:
26239 * - pass #1: faster: quick decisions
26240 * - pass #2: slower: let's try to find another CPU */
26241
26242 /* Pass #1
26243 *
26244 * There are two metrics here:
26245 *
26246 * first, a 'cutoff' interval, currently 0-200 usecs on
26247 * x86 CPUs, depending on the size of the 'SMP-local
26248 * cache'. If the current process has longer average
26249 * timeslices than this, then we utilize the idle CPU.
26250 *
26251 * second, if the wakeup comes from a process context,
26252 * then the two processes are 'related'. (they form a
26253 * 'gang')
26254 *
26255 * An idle CPU is almost always a bad thing, thus we
26256 * skip the idle-CPU utilization only if both these
26257 * conditions are true. (ie. a 'process-gang'
26258 * rescheduling with rather high frequency should stay
26259 * on the same CPU).
26260 *
26261 * [We can switch to something more finegrained in
26262 * 2.3.] */
26263 if ((current->avg_slice < cacheflush_time) &&
26264 related(current, p))
26265 return;
26266
26267 reschedule_idle_slow(p);
26268 #endif /* __SMP__ */
26269 }
26270
26271 /* Careful!
26272 *
26273 * This has to add the process to the _beginning_ of the
26274 * run-queue, not the end. See the comment about "This is
26275 * subtle" in the scheduler proper.. */
26276 static inline void add_to_runqueue(struct task_struct *p)
26277 {
26278 struct task_struct *next = init_task.next_run;
26279
26280 p->prev_run = &init_task;
26281 init_task.next_run = p;
26282 p->next_run = next;
26283 next->prev_run = p;
26284 nr_running++;
26285 }
26286
26287 static inline void del_from_runqueue(
26288 struct task_struct * p)
26289 {
26290 struct task_struct *next = p->next_run;
26291 struct task_struct *prev = p->prev_run;
26292
26293 nr_running--;
26294 next->prev_run = prev;
26295 prev->next_run = next;
26296 p->next_run = NULL;
26297 p->prev_run = NULL;
26298 }
26299
26300 static inline void move_last_runqueue(
26301 struct task_struct * p)
26302 {
26303 struct task_struct *next = p->next_run;
26304 struct task_struct *prev = p->prev_run;
26305
26306 /* remove from list */
26307 next->prev_run = prev;
26308 prev->next_run = next;
26309 /* add back to list */
26310 p->next_run = &init_task;
26311 prev = init_task.prev_run;
26312 init_task.prev_run = p;
26313 p->prev_run = prev;
26314 prev->next_run = p;
26315 }
26316
26317 static inline void
26318 move_first_runqueue(struct task_struct * p)
26319 {
26320 struct task_struct *next = p->next_run;
26321 struct task_struct *prev = p->prev_run;
26322
26323 /* remove from list */
26324 next->prev_run = prev;
26325 prev->next_run = next;
26326 /* add back to list */
26327 p->prev_run = &init_task;
26328 next = init_task.next_run;
26329 init_task.next_run = p;
26330 p->next_run = next;
26331 next->prev_run = p;
26332 }
26333
26334 /* The tasklist_lock protects the linked list of
26335 * processes.
26336 *
26337 * The scheduler lock is protecting against multiple
26338 * entry into the scheduling code, and doesn't need to
26339 * worry about interrupts (because interrupts cannot call
26340 * the scheduler).
26341 *
26342 * The run-queue lock locks the parts that actually
26343 * access and change the run-queues, and have to be
26344 * interrupt-safe. */
26345 /* should be acquired first */
26346 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED;
26347 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* 2nd */
26348 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* 3rd */
26349
26350 /* Wake up a process. Put it on the run-queue if it's not
26351 * already there. The "current" process is always on the
26352 * run-queue (except when the actual re-schedule is in
26353 * progress), and as such you're allowed to do the
26354 * simpler "current->state = TASK_RUNNING" to mark
26355 * yourself runnable without the overhead of this. */
26356 void wake_up_process(struct task_struct * p)
26357 {
26358 unsigned long flags;
26359
26360 spin_lock_irqsave(&runqueue_lock, flags);
26361 p->state = TASK_RUNNING;
26362 if (!p->next_run) {
26363 add_to_runqueue(p);
26364 reschedule_idle(p);
26365 }
26366 spin_unlock_irqrestore(&runqueue_lock, flags);
26367 }
26368
26369 static void process_timeout(unsigned long __data)
26370 {
26371 struct task_struct * p = (struct task_struct *) __data;
26372
26373 wake_up_process(p);
26374 }
26375
26376 /* This is the function that decides how desirable a
26377 * process is.. You can weigh different processes
26378 * against each other depending on what CPU they've run
26379 * on lately etc to try to handle cache and TLB miss
26380 * penalties.
26381 *
26382 * Return values:
26383 * -1000: never select this
26384 * 0: out of time, recalculate counters
26385 * (but it might still be selected)
26386 * +ve: "goodness" value (the larger, the better)
26387 * +1000: realtime process, select this. */
26388 static inline int goodness(struct task_struct * p,
26389 struct task_struct * prev, int this_cpu)
26390 {
26391 int policy = p->policy;
26392 int weight;
26393
26394 if (policy & SCHED_YIELD) {
26395 p->policy = policy & ~SCHED_YIELD;
26396 return 0;
26397 }
26398
26399 /* Realtime process, select the first one on the
26400 * runqueue (taking priorities within processes into
26401 * account). */
26402 if (policy != SCHED_OTHER)
26403 return 1000 + p->rt_priority;
26404
26405 /* Give the process a first-approximation goodness
26406 * value according to the number of clock-ticks it has
26407 * left.
26408 *
26409 * Don't do any other calculations if the time slice is
26410 * over.. */
26411 weight = p->counter;
26412 if (weight) {
26413
26414 #ifdef __SMP__
26415 /* Give a largish advantage to the same processor...
26416 * (this is equivalent to penalizing other
26417 * processors) */
26418 if (p->processor == this_cpu)
26419 weight += PROC_CHANGE_PENALTY;
26420 #endif
26421
26422 /* .. and a slight advantage to the current thread */
26423 if (p->mm == prev->mm)
26424 weight += 1;
26425 weight += p->priority;
26426 }
26427
26428 return weight;
26429 }
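
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming a uniprocessor build and the usual DEF_PRIORITY
 * of 20: a SCHED_OTHER task with counter = 5 and priority = 20 that
 * shares its mm with the previous task scores
 *
 *	weight = 5 (counter) + 1 (same mm) + 20 (priority) = 26
 *
 * whereas the same task with an exhausted timeslice (counter = 0)
 * scores 0, and a SCHED_FIFO task always scores 1000 + rt_priority
 * and wins. On SMP, PROC_CHANGE_PENALTY is added on top when the
 * task last ran on this CPU. */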
26430
26431 /* Event timer code */
26432 #define TVN_BITS 6
26433 #define TVR_BITS 8
26434 #define TVN_SIZE (1 << TVN_BITS)
26435 #define TVR_SIZE (1 << TVR_BITS)
26436 #define TVN_MASK (TVN_SIZE - 1)
26437 #define TVR_MASK (TVR_SIZE - 1)
26438
26439 struct timer_vec {
26440 int index;
26441 struct timer_list *vec[TVN_SIZE];
26442 };
26443
26444 struct timer_vec_root {
26445 int index;
26446 struct timer_list *vec[TVR_SIZE];
26447 };
26448
26449 static struct timer_vec tv5 = { 0 };
26450 static struct timer_vec tv4 = { 0 };
26451 static struct timer_vec tv3 = { 0 };
26452 static struct timer_vec tv2 = { 0 };
26453 static struct timer_vec_root tv1 = { 0 };
26454
26455 static struct timer_vec * const tvecs[] = {
26456 (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
26457 };
26458
26459 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
26460
26461 static unsigned long timer_jiffies = 0;
26462
26463 static inline void insert_timer(struct timer_list *timer,
26464 struct timer_list **vec, int idx)
26465 {
26466 if ((timer->next = vec[idx]))
26467 vec[idx]->prev = timer;
26468 vec[idx] = timer;
26469 timer->prev = (struct timer_list *)&vec[idx];
26470 }
26471
26472 static inline void internal_add_timer(
26473 struct timer_list *timer)
26474 {
26475 /* must be cli-ed when calling this */
26476 unsigned long expires = timer->expires;
26477 unsigned long idx = expires - timer_jiffies;
26478
26479 if (idx < TVR_SIZE) {
26480 int i = expires & TVR_MASK;
26481 insert_timer(timer, tv1.vec, i);
26482 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
26483 int i = (expires >> TVR_BITS) & TVN_MASK;
26484 insert_timer(timer, tv2.vec, i);
26485 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
26486 int i =(expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
26487 insert_timer(timer, tv3.vec, i);
26488 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
26489 int i =
26490 (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
26491 insert_timer(timer, tv4.vec, i);
26492 } else if ((signed long) idx < 0) {
26493 /* can happen if you add a timer with expires ==
26494 * jiffies, or you set a timer to go off in the past
26495 */
26496 insert_timer(timer, tv1.vec, tv1.index);
26497 } else if (idx <= 0xffffffffUL) {
26498 int i =
26499 (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
26500 insert_timer(timer, tv5.vec, i);
26501 } else {
26502 /* Can only get here on architectures with 64-bit
26503 * jiffies */
26504 timer->next = timer->prev = timer;
26505 }
26506 }
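
/* Illustrative worked example (editor's sketch, not from the kernel
 * source). With TVR_BITS = 8 and TVN_BITS = 6 as defined above,
 * suppose timer_jiffies = 1000 and a timer is added with
 * expires = 1300:
 *
 *	idx = 1300 - 1000 = 300, which is >= TVR_SIZE (256) but
 *	< 1 << (TVR_BITS + TVN_BITS) = 16384, so the timer lands in
 *	tv2 at slot (1300 >> 8) & TVN_MASK = 5.
 *
 * Once tv1 wraps, cascade_timers() re-adds that tv2 bucket, and by
 * then timer_jiffies has advanced far enough that the timers fall
 * into the fine-grained tv1 slots. */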
26507
26508 spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
26509
26510 void add_timer(struct timer_list *timer)
26511 {
26512 unsigned long flags;
26513
26514 spin_lock_irqsave(&timerlist_lock, flags);
26515 if (timer->prev)
26516 goto bug;
26517 internal_add_timer(timer);
26518 out:
26519 spin_unlock_irqrestore(&timerlist_lock, flags);
26520 return;
26521
26522 bug:
26523 printk("bug: kernel timer added twice at %p.\n",
26524 __builtin_return_address(0));
26525 goto out;
26526 }
26527
26528 static inline int detach_timer(struct timer_list *timer)
26529 {
26530 struct timer_list *prev = timer->prev;
26531 if (prev) {
26532 struct timer_list *next = timer->next;
26533 prev->next = next;
26534 if (next)
26535 next->prev = prev;
26536 return 1;
26537 }
26538 return 0;
26539 }
26540
26541 void mod_timer(struct timer_list *timer,
26542 unsigned long expires)
26543 {
26544 unsigned long flags;
26545
26546 spin_lock_irqsave(&timerlist_lock, flags);
26547 timer->expires = expires;
26548 detach_timer(timer);
26549 internal_add_timer(timer);
26550 spin_unlock_irqrestore(&timerlist_lock, flags);
26551 }
26552
26553 int del_timer(struct timer_list * timer)
26554 {
26555 int ret;
26556 unsigned long flags;
26557
26558 spin_lock_irqsave(&timerlist_lock, flags);
26559 ret = detach_timer(timer);
26560 timer->next = timer->prev = 0;
26561 spin_unlock_irqrestore(&timerlist_lock, flags);
26562 return ret;
26563 }
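
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the typical way a 2.2-era driver would use the timer API above.
 * The my_* identifiers are hypothetical. */
static struct timer_list my_timer;

static void my_timeout(unsigned long data)
{
	/* called from run_timer_list() once 'expires' has passed */
	printk(KERN_DEBUG "example timer fired, data=%lu\n", data);
}

static void my_start_timer(void)
{
	init_timer(&my_timer);
	my_timer.expires = jiffies + HZ;	/* one second from now */
	my_timer.data = 42;
	my_timer.function = my_timeout;
	add_timer(&my_timer);
	/* later: mod_timer(&my_timer, jiffies + 2*HZ) to re-arm,
	 * or del_timer(&my_timer) to cancel it. */
}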
26564
26565 #ifdef __SMP__
26566
26567 #define idle_task (task[cpu_number_map[this_cpu]])
26568 #define can_schedule(p) (!(p)->has_cpu)
26569
26570 #else
26571
26572 #define idle_task (&init_task)
26573 #define can_schedule(p) (1)
26574
26575 #endif
26576
26577 signed long schedule_timeout(signed long timeout)
26578 {
26579 struct timer_list timer;
26580 unsigned long expire;
26581
26582 switch (timeout)
26583 {
26584 case MAX_SCHEDULE_TIMEOUT:
26585 /* These two special cases are useful to be
26586 * comfortable in the caller. Nothing more. We could
26587 * take MAX_SCHEDULE_TIMEOUT from one of the negative
26588 * values, but I'd like to return a valid offset (>=0)
26589 * to allow the caller to do everything it wants with
26590 * the retval. */
26591 schedule();
26592 goto out;
26593 default:
26594 /* Another bit of paranoia. Note that the retval will
26595 * be 0, since no piece of the kernel is supposed to
26596 * check for a negative retval of schedule_timeout()
26597 * (it should never happen anyway). You just have the
26598 * printk() that will tell you if something has gone
26599 * wrong, and where. */
26600 if (timeout < 0)
26601 {
26602 printk(KERN_ERR "schedule_timeout: wrong timeout "
26603 "value %lx from %p\n", timeout,
26604 __builtin_return_address(0));
26605 goto out;
26606 }
26607 }
26608
26609 expire = timeout + jiffies;
26610
26611 init_timer(&timer);
26612 timer.expires = expire;
26613 timer.data = (unsigned long) current;
26614 timer.function = process_timeout;
26615
26616 add_timer(&timer);
26617 schedule();
26618 del_timer(&timer);
26619
26620 timeout = expire - jiffies;
26621
26622 out:
26623 return timeout < 0 ? 0 : timeout;
26624 }
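
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the usual calling pattern for schedule_timeout(), the same one
 * sys_nanosleep() uses below. The caller sets its own state first;
 * the return value is the number of jiffies left if it was woken
 * early. my_sleep_a_bit is a hypothetical name. */
static void my_sleep_a_bit(void)
{
	signed long remaining;

	current->state = TASK_INTERRUPTIBLE;
	remaining = schedule_timeout(HZ / 10);	/* ~100 ms at HZ=100 */
	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n",
			remaining);
}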
26625
26626 /* This one aligns per-CPU data on cacheline boundaries.
26627 */
26628 static union {
26629 struct schedule_data {
26630 struct task_struct * prev;
26631 long prevstate;
26632 cycles_t last_schedule;
26633 } schedule_data;
26634 char __pad [SMP_CACHE_BYTES];
26635 } aligned_data [NR_CPUS] __cacheline_aligned =
26636 { {{&init_task,0}}};
26637
26638 static inline void __schedule_tail (void)
26639 {
26640 #ifdef __SMP__
26641 struct schedule_data * sched_data;
26642
26643 /* We might have switched CPUs: */
26644 sched_data =
26645 &aligned_data[smp_processor_id()].schedule_data;
26646
26647 /* Subtle. In the rare event that we got a wakeup to
26648 * 'prev' just during the reschedule (this is possible,
26649 * the scheduler is pretty parallel), we should do
26650 * another reschedule in the next task's
26651 * context. schedule() will do the right thing next
26652 * time around. This is equivalent to 'delaying' the
26653 * wakeup until the reschedule has finished. */
26654 if (sched_data->prev->state != sched_data->prevstate)
26655 current->need_resched = 1;
26656
26657 /* Release the previous process ...
26658 *
26659 * We have dropped all locks, and we must make sure
26660 * that we only mark the previous process as no longer
26661 * having a CPU after all other state has been seen by
26662 * other CPUs. Thus the write memory barrier! */
26663 wmb();
26664 sched_data->prev->has_cpu = 0;
26665 #endif /* __SMP__ */
26666 }
26667
26668 /* schedule_tail() is getting called from the fork return
26669 * path. This cleans up all remaining scheduler things,
26670 * without impacting the common case. */
26671 void schedule_tail (void)
26672 {
26673 __schedule_tail();
26674 }
26675
26676 /* 'schedule()' is the scheduler function. It's a very
26677 * simple and nice scheduler: it's not perfect, but
26678 * certainly works for most things.
26679 *
26680 * The goto is "interesting".
26681 *
26682 * NOTE!! Task 0 is the 'idle' task, which gets called
26683 * when no other tasks can run. It can not be killed, and
26684 * it cannot sleep. The 'state' information in task[0] is
26685 * never used. */
26686 asmlinkage void schedule(void)
26687 {
26688 struct schedule_data * sched_data;
26689 struct task_struct * prev, * next;
26690 int this_cpu;
26691
26692 run_task_queue(&tq_scheduler);
26693
26694 prev = current;
26695 this_cpu = prev->processor;
26696 /* 'sched_data' is protected by the fact that we can
26697 * run only one process per CPU. */
26698 sched_data = & aligned_data[this_cpu].schedule_data;
26699
26700 if (in_interrupt())
26701 goto scheduling_in_interrupt;
26702 release_kernel_lock(prev, this_cpu);
26703
26704 /* Do "administrative" work here while we don't hold
26705 * any locks */
26706 if (bh_active & bh_mask)
26707 do_bottom_half();
26708
26709 spin_lock(&scheduler_lock);
26710 spin_lock_irq(&runqueue_lock);
26711
26712 /* move an exhausted RR process to be last.. */
26713 prev->need_resched = 0;
26714
26715 if (!prev->counter && prev->policy == SCHED_RR) {
26716 prev->counter = prev->priority;
26717 move_last_runqueue(prev);
26718 }
26719
26720 switch (prev->state) {
26721 case TASK_INTERRUPTIBLE:
26722 if (signal_pending(prev)) {
26723 prev->state = TASK_RUNNING;
26724 break;
26725 }
26726 default:
26727 del_from_runqueue(prev);
26728 case TASK_RUNNING:
26729 }
26730
26731 sched_data->prevstate = prev->state;
26732
26733 /* this is the scheduler proper: */
26734 {
26735 struct task_struct * p = init_task.next_run;
26736 int c = -1000;
26737
26738 /* Default process to select.. */
26739 next = idle_task;
26740 if (prev->state == TASK_RUNNING) {
26741 c = goodness(prev, prev, this_cpu);
26742 next = prev;
26743 }
26744
26745 /* This is subtle. Note how we can enable interrupts
26746 * here, even though interrupts can add processes to
26747 * the run-queue. This is because any new processes
26748 * will be added to the front of the queue, so "p"
26749 * above is a safe starting point. run-queue
26750 * deletion and re-ordering is protected by the
26751 * scheduler lock */
26752 spin_unlock_irq(&runqueue_lock);
26753 /* Note! there may appear new tasks on the run-queue
26754 * during this, as interrupts are enabled. However, they
26755 * will be put on front of the list, so our list starting
26756 * at "p" is essentially fixed. */
26757 while (p != &init_task) {
26758 if (can_schedule(p)) {
26759 int weight = goodness(p, prev, this_cpu);
26760 if (weight > c)
26761 c = weight, next = p;
26762 }
26763 p = p->next_run;
26764 }
26765
26766 /* Do we need to re-calculate counters? */
26767 if (!c) {
26768 struct task_struct *p;
26769 read_lock(&tasklist_lock);
26770 for_each_task(p)
26771 p->counter = (p->counter >> 1) + p->priority;
26772 read_unlock(&tasklist_lock);
26773 }
26774 }
26775
26776 /* maintain the per-process 'average timeslice' value.
26777 * (this has to be recalculated even if we reschedule
26778 * to the same process) Currently this is only used on
26779 * SMP: */
26780 #ifdef __SMP__
26781 {
26782 cycles_t t, this_slice;
26783
26784 t = get_cycles();
26785 this_slice = t - sched_data->last_schedule;
26786 sched_data->last_schedule = t;
26787
26788 /* Simple, exponentially fading average calculation:
26789 */
26790 prev->avg_slice = this_slice + prev->avg_slice;
26791 prev->avg_slice >>= 1;
26792 }
26793
26794 /* We drop the scheduler lock early (it's a global
26795 * spinlock), thus we have to lock the previous process
26796 * from getting rescheduled during switch_to(). */
26797 next->processor = this_cpu;
26798 next->has_cpu = 1;
26799 spin_unlock(&scheduler_lock);
26800 #endif /* __SMP__ */
26801 if (prev != next) {
26802 #ifdef __SMP__
26803 sched_data->prev = prev;
26804 #endif
26805 kstat.context_swtch++;
26806 get_mmu_context(next);
26807 switch_to(prev,next);
26808
26809 __schedule_tail();
26810 }
26811
26812 reacquire_kernel_lock(current);
26813 return;
26814
26815 scheduling_in_interrupt:
26816 printk("Scheduling in interrupt\n");
26817 *(int *)0 = 0;
26818 }
26819
26820 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
26821
26822 /* wake_up doesn't wake up stopped processes - they have
26823 * to be awakened with signals or similar.
26824 *
26825 * Note that we only need a read lock for the wait queue
26826 * (and thus do not have to protect against interrupts),
26827 * as the actual removal from the queue is handled by the
26828 * process itself. */
26829 void __wake_up(struct wait_queue **q, unsigned int mode)
26830 {
26831 struct wait_queue *next;
26832
26833 read_lock(&waitqueue_lock);
26834 if (q && (next = *q)) {
26835 struct wait_queue *head;
26836
26837 head = WAIT_QUEUE_HEAD(q);
26838 while (next != head) {
26839 struct task_struct *p = next->task;
26840 next = next->next;
26841 if (p->state & mode)
26842 wake_up_process(p);
26843 }
26844 }
26845 read_unlock(&waitqueue_lock);
26846 }
26847
26848 /* Semaphores are implemented using a two-way counter:
26849 * The "count" variable is decremented for each process
26850 * that tries to sleep, while the "waking" variable is
26851 * incremented when the "up()" code goes to wake up
26852 * waiting processes.
26853 *
26854 * Notably, the inline "up()" and "down()" functions can
26855 * efficiently test if they need to do any extra work (up
26856 * needs to do something only if count was negative
26857 * before the increment operation).
26858 *
26859 * waking_non_zero() (from asm/semaphore.h) must execute
26860 * atomically.
26861 *
26862 * When __up() is called, the count was negative before
26863 * incrementing it, and we need to wake up somebody.
26864 *
26865 * This routine adds one to the count of processes that
26866 * need to wake up and exit. ALL waiting processes
26867 * actually wake up but only the one that gets to the
26868 * "waking" field first will gate through and acquire the
26869 * semaphore. The others will go back to sleep.
26870 *
26871 * Note that these functions are only called when there
26872 * is contention on the lock, and as such all this is the
26873 * "non-critical" part of the whole semaphore
26874 * business. The critical part is the inline stuff in
26875 * <asm/semaphore.h> where we want to avoid any extra
26876 * jumps and calls. */
26877 void __up(struct semaphore *sem)
26878 {
26879 wake_one_more(sem);
26880 wake_up(&sem->wait);
26881 }
26882
26883 /* Perform the "down" function. Return zero for
26884 * semaphore acquired, return negative for signalled out
26885 * of the function.
26886 *
26887 * If called from __down, the return is ignored and the
26888 * wait loop is not interruptible. This means that a
26889 * task waiting on a semaphore using "down()" cannot be
26890 * killed until someone does an "up()" on the semaphore.
26891 *
26892 * If called from __down_interruptible, the return value
26893 * gets checked upon return. If the return value is
26894 * negative then the task continues with the negative
26895 * value in the return register (it can be tested by the
26896 * caller).
26897 *
26898 * Either form may be used in conjunction with "up()". */
26899
26900 #define DOWN_VAR \
26901 struct task_struct *tsk = current; \
26902 struct wait_queue wait = { tsk, NULL };
26903
26904 #define DOWN_HEAD(task_state) \
26905 \
26906 tsk->state = (task_state); \
26907 add_wait_queue(&sem->wait, &wait); \
26908 \
26909 /* Ok, we're set up. sem->count is known to be less \
26910 * than zero so we must wait. \
26911 * \
26912 * We can let go the lock for purposes of waiting. \
26913 * We re-acquire it after awaking so as to protect \
26914 * all semaphore operations. \
26915 * \
26916 * If "up()" is called before we call \
26917 * waking_non_zero() then we will catch it right away.\
26918 * If it is called later then we will have to go \
26919 * through a wakeup cycle to catch it. \
26920 * \
26921 * Multiple waiters contend for the semaphore lock to \
26922 * see who gets to gate through and who has to wait \
26923 * some more. */ \
26924 for (;;) {
26925
26926 #define DOWN_TAIL(task_state) \
26927 tsk->state = (task_state); \
26928 } \
26929 tsk->state = TASK_RUNNING; \
26930 remove_wait_queue(&sem->wait, &wait);
26931
26932 void __down(struct semaphore * sem)
26933 {
26934 DOWN_VAR
26935 DOWN_HEAD(TASK_UNINTERRUPTIBLE)
26936 if (waking_non_zero(sem))
26937 break;
26938 schedule();
26939 DOWN_TAIL(TASK_UNINTERRUPTIBLE)
26940 }
26941
26942 int __down_interruptible(struct semaphore * sem)
26943 {
26944 DOWN_VAR
26945 int ret = 0;
26946 DOWN_HEAD(TASK_INTERRUPTIBLE)
26947
26948 ret = waking_non_zero_interruptible(sem, tsk);
26949 if (ret)
26950 {
26951 if (ret == 1)
26952 /* ret != 0 only if we get interrupted -arca */
26953 ret = 0;
26954 break;
26955 }
26956 schedule();
26957 DOWN_TAIL(TASK_INTERRUPTIBLE)
26958 return ret;
26959 }
26960
26961 int __down_trylock(struct semaphore * sem)
26962 {
26963 return waking_non_zero_trylock(sem);
26964 }
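
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the contended paths above are normally reached through the inline
 * down()/up() wrappers in <asm/semaphore.h>; the MUTEX initializer
 * is assumed to match this kernel era, and the my_* names are
 * hypothetical. */
static struct semaphore my_sem = MUTEX;		/* count starts at 1 */

static int my_critical_section(void)
{
	if (down_interruptible(&my_sem))
		return -EINTR;	/* a signal interrupted the wait */
	/* ... exclusive access; a second caller sleeps in
	 * __down_interruptible() above until we release ... */
	up(&my_sem);		/* calls __up() only if someone waits */
	return 0;
}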
26965
26966 #define SLEEP_ON_VAR \
26967 unsigned long flags; \
26968 struct wait_queue wait;
26969
26970 #define SLEEP_ON_HEAD \
26971 wait.task = current; \
26972 write_lock_irqsave(&waitqueue_lock, flags); \
26973 __add_wait_queue(p, &wait); \
26974 write_unlock(&waitqueue_lock);
26975
26976 #define SLEEP_ON_TAIL \
26977 write_lock_irq(&waitqueue_lock); \
26978 __remove_wait_queue(p, &wait); \
26979 write_unlock_irqrestore(&waitqueue_lock, flags);
26980
26981 void interruptible_sleep_on(struct wait_queue **p)
26982 {
26983 SLEEP_ON_VAR
26984
26985 current->state = TASK_INTERRUPTIBLE;
26986
26987 SLEEP_ON_HEAD
26988 schedule();
26989 SLEEP_ON_TAIL
26990 }
26991
26992 long interruptible_sleep_on_timeout(
26993 struct wait_queue **p, long timeout)
26994 {
26995 SLEEP_ON_VAR
26996
26997 current->state = TASK_INTERRUPTIBLE;
26998
26999 SLEEP_ON_HEAD
27000 timeout = schedule_timeout(timeout);
27001 SLEEP_ON_TAIL
27002
27003 return timeout;
27004 }
27005
27006 void sleep_on(struct wait_queue **p)
27007 {
27008 SLEEP_ON_VAR
27009
27010 current->state = TASK_UNINTERRUPTIBLE;
27011
27012 SLEEP_ON_HEAD
27013 schedule();
27014 SLEEP_ON_TAIL
27015 }
27016
27017 long sleep_on_timeout(struct wait_queue **p,
27018 long timeout)
27019 {
27020 SLEEP_ON_VAR
27021
27022 current->state = TASK_UNINTERRUPTIBLE;
27023
27024 SLEEP_ON_HEAD
27025 timeout = schedule_timeout(timeout);
27026 SLEEP_ON_TAIL
27027
27028 return timeout;
27029 }
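
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the classic 2.2-era pairing of the sleep_on helpers above with the
 * wake_up_interruptible() macro from <linux/sched.h>, which ends up
 * in __wake_up(). The my_* names are hypothetical; note that the
 * check/sleep window makes this pattern safe only when a missed
 * wakeup is tolerable or excluded by other means. */
static struct wait_queue *my_waitq = NULL;
static int my_condition = 0;

static void my_wait_for_event(void)
{
	while (!my_condition)		/* re-check after every wakeup */
		interruptible_sleep_on(&my_waitq);
}

static void my_post_event(void)
{
	my_condition = 1;
	wake_up_interruptible(&my_waitq);	/* wakes all sleepers */
}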
27030
27031 void scheduling_functions_end_here(void) { }
27032
27033 static inline void cascade_timers(struct timer_vec *tv)
27034 {
27035 /* cascade all the timers from tv up one level */
27036 struct timer_list *timer;
27037 timer = tv->vec[tv->index];
27038 /* We are removing _all_ timers from the list, so we
27039 * don't have to detach them individually, just clear
27040 * the list afterwards. */
27041 while (timer) {
27042 struct timer_list *tmp = timer;
27043 timer = timer->next;
27044 internal_add_timer(tmp);
27045 }
27046 tv->vec[tv->index] = NULL;
27047 tv->index = (tv->index + 1) & TVN_MASK;
27048 }
27049
27050 static inline void run_timer_list(void)
27051 {
27052 spin_lock_irq(&timerlist_lock);
27053 while ((long)(jiffies - timer_jiffies) >= 0) {
27054 struct timer_list *timer;
27055 if (!tv1.index) {
27056 int n = 1;
27057 do {
27058 cascade_timers(tvecs[n]);
27059 } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
27060 }
27061 while ((timer = tv1.vec[tv1.index])) {
27062 void (*fn)(unsigned long) = timer->function;
27063 unsigned long data = timer->data;
27064 detach_timer(timer);
27065 timer->next = timer->prev = NULL;
27066 spin_unlock_irq(&timerlist_lock);
27067 fn(data);
27068 spin_lock_irq(&timerlist_lock);
27069 }
27070 ++timer_jiffies;
27071 tv1.index = (tv1.index + 1) & TVR_MASK;
27072 }
27073 spin_unlock_irq(&timerlist_lock);
27074 }
27075
27076
27077 static inline void run_old_timers(void)
27078 {
27079 struct timer_struct *tp;
27080 unsigned long mask;
27081
27082 for (mask = 1, tp = timer_table+0; mask;
27083 tp++,mask += mask) {
27084 if (mask > timer_active)
27085 break;
27086 if (!(mask & timer_active))
27087 continue;
27088 if (time_after(tp->expires, jiffies))
27089 continue;
27090 timer_active &= ~mask;
27091 tp->fn();
27092 sti();
27093 }
27094 }
27095
27096 spinlock_t tqueue_lock;
27097
27098 void tqueue_bh(void)
27099 {
27100 run_task_queue(&tq_timer);
27101 }
27102
27103 void immediate_bh(void)
27104 {
27105 run_task_queue(&tq_immediate);
27106 }
27107
27108 unsigned long timer_active = 0;
27109 struct timer_struct timer_table[32];
27110
27111 /* Hmm.. Changed this, as the GNU make sources (load.c)
27112 * seem to imply that avenrun[] is the standard name for
27113 * this kind of thing. Nothing else seems to be
27114 * standardized: the fractional size etc all seem to
27115 * differ on different machines. */
27116 unsigned long avenrun[3] = { 0,0,0 };
27117
27118 /* Nr of active tasks - counted in fixed-point numbers */
27119 static unsigned long count_active_tasks(void)
27120 {
27121 struct task_struct *p;
27122 unsigned long nr = 0;
27123
27124 read_lock(&tasklist_lock);
27125 for_each_task(p) {
27126 if ((p->state == TASK_RUNNING ||
27127 p->state == TASK_UNINTERRUPTIBLE ||
27128 p->state == TASK_SWAPPING))
27129 nr += FIXED_1;
27130 }
27131 read_unlock(&tasklist_lock);
27132 return nr;
27133 }
27134
27135 static inline void calc_load(unsigned long ticks)
27136 {
27137 unsigned long active_tasks; /* fixed-point */
27138 static int count = LOAD_FREQ;
27139
27140 count -= ticks;
27141 if (count < 0) {
27142 count += LOAD_FREQ;
27143 active_tasks = count_active_tasks();
27144 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
27145 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
27146 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
27147 }
27148 }
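
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming the usual <linux/sched.h> constants FSHIFT = 11,
 * FIXED_1 = 2048, LOAD_FREQ = 5*HZ and EXP_1 = 1884. CALC_LOAD
 * expands to load = (load*exp + n*(FIXED_1 - exp)) >> FSHIFT, so
 * starting from avenrun[0] = 0 with one runnable task (n = 2048),
 * the first 5-second sample gives (2048 * 164) >> 11 = 164, i.e. a
 * 1-minute load average of 164/2048 ~= 0.08, which then converges
 * exponentially toward 1.0 while the task stays runnable. */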
27149
27150 /* this routine handles the overflow of the microsecond
27151 * field
27152 *
27153 * The tricky bits of code to handle the accurate clock
27154 * support were provided by Dave Mills (Mills@UDEL.EDU)
27155 * of NTP fame. They were originally developed for SUN
27156 * and DEC kernels. All the kudos should go to Dave for
27157 * this stuff. */
27158 static void second_overflow(void)
27159 {
27160 long ltemp;
27161
27162 /* Bump the maxerror field */
27163 time_maxerror += time_tolerance >> SHIFT_USEC;
27164 if ( time_maxerror > NTP_PHASE_LIMIT ) {
27165 time_maxerror = NTP_PHASE_LIMIT;
27166 time_status |= STA_UNSYNC;
27167 }
27168
27169 /* Leap second processing. If in leap-insert state at
27170 * the end of the day, the system clock is set back one
27171 * second; if in leap-delete state, the system clock is
27172 * set ahead one second. The microtime() routine or
27173 * external clock driver will insure that reported time
27174 * is always monotonic. The ugly divides should be
27175 * replaced. */
27176 switch (time_state) {
27177
27178 case TIME_OK:
27179 if (time_status & STA_INS)
27180 time_state = TIME_INS;
27181 else if (time_status & STA_DEL)
27182 time_state = TIME_DEL;
27183 break;
27184
27185 case TIME_INS:
27186 if (xtime.tv_sec % 86400 == 0) {
27187 xtime.tv_sec--;
27188 time_state = TIME_OOP;
27189 printk(KERN_NOTICE "Clock: "
27190 "inserting leap second 23:59:60 UTC\n");
27191 }
27192 break;
27193
27194 case TIME_DEL:
27195 if ((xtime.tv_sec + 1) % 86400 == 0) {
27196 xtime.tv_sec++;
27197 time_state = TIME_WAIT;
27198 printk(KERN_NOTICE "Clock: "
27199 "deleting leap second 23:59:59 UTC\n");
27200 }
27201 break;
27202
27203 case TIME_OOP:
27204 time_state = TIME_WAIT;
27205 break;
27206
27207 case TIME_WAIT:
27208 if (!(time_status & (STA_INS | STA_DEL)))
27209 time_state = TIME_OK;
27210 }
27211
27212 /* Compute the phase adjustment for the next second. In
27213 * PLL mode, the offset is reduced by a fixed factor
27214 * times the time constant. In FLL mode the offset is
27215 * used directly. In either mode, the maximum phase
27216 * adjustment for each second is clamped so as to
27217 * spread the adjustment over not more than the number
27218 * of seconds between updates. */
27219 if (time_offset < 0) {
27220 ltemp = -time_offset;
27221 if (!(time_status & STA_FLL))
27222 ltemp >>= SHIFT_KG + time_constant;
27223 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
27224 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
27225 time_offset += ltemp;
27226 time_adj = -ltemp <<
27227 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
27228 } else {
27229 ltemp = time_offset;
27230 if (!(time_status & STA_FLL))
27231 ltemp >>= SHIFT_KG + time_constant;
27232 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
27233 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
27234 time_offset -= ltemp;
27235 time_adj = ltemp <<
27236 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
27237 }
27238
27239 /* Compute the frequency estimate and additional phase
27240 * adjustment due to frequency error for the next
27241 * second. When the PPS signal is engaged, gnaw on the
27242 * watchdog counter and update the frequency computed
27243 * by the pll and the PPS signal. */
27244 pps_valid++;
27245 if (pps_valid == PPS_VALID) { /* PPS signal lost */
27246 pps_jitter = MAXTIME;
27247 pps_stabil = MAXFREQ;
27248 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
27249 STA_PPSWANDER | STA_PPSERROR);
27250 }
27251 ltemp = time_freq + pps_freq;
27252 if (ltemp < 0)
27253 time_adj -= -ltemp >>
27254 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
27255 else
27256 time_adj += ltemp >>
27257 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
27258
27259 #if HZ == 100
27260 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). Add
27261 * 25% and 3.125% to get 128.125; => only 0.125% error
27262 * (p. 14) */
27263 if (time_adj < 0)
27264 time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
27265 else
27266 time_adj += (time_adj >> 2) + (time_adj >> 5);
27267 #endif
27268 }
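
/* Editor's note (not from the kernel source): the HZ==100 block
 * above compensates for the fact that the shifts in this function
 * divide by 2^SHIFT_HZ = 128 rather than by HZ = 100 (SHIFT_HZ is 7
 * for HZ=100 in <linux/timex.h>). Multiplying by
 * 1 + 1/4 + 1/32 = 1.28125 changes the effective divisor to
 * 128/1.28125 ~= 99.9, within a small fraction of a percent of the
 * true HZ; that is the residual error the comment quotes. */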
27269
27270 /* in the NTP reference this is called "hardclock()" */
27271 static void update_wall_time_one_tick(void)
27272 {
27273 if ( (time_adjust_step = time_adjust) != 0 ) {
27274 /* We are doing an adjtime thing.
27275 *
27276 * Prepare time_adjust_step to be within bounds.
27277 * Note that a positive time_adjust means we want the
27278 * clock to run faster.
27279 *
27280 * Limit the amount of the step to be in the range
27281 * -tickadj .. +tickadj */
27282 if (time_adjust > tickadj)
27283 time_adjust_step = tickadj;
27284 else if (time_adjust < -tickadj)
27285 time_adjust_step = -tickadj;
27286
27287 /* Reduce by this step the amount of time left */
27288 time_adjust -= time_adjust_step;
27289 }
27290 xtime.tv_usec += tick + time_adjust_step;
27291 /* Advance the phase, once it gets to one microsecond,
27292 * then advance the tick more. */
27293 time_phase += time_adj;
27294 if (time_phase <= -FINEUSEC) {
27295 long ltemp = -time_phase >> SHIFT_SCALE;
27296 time_phase += ltemp << SHIFT_SCALE;
27297 xtime.tv_usec -= ltemp;
27298 }
27299 else if (time_phase >= FINEUSEC) {
27300 long ltemp = time_phase >> SHIFT_SCALE;
27301 time_phase -= ltemp << SHIFT_SCALE;
27302 xtime.tv_usec += ltemp;
27303 }
27304 }
27305
27306 /* Using a loop looks inefficient, but "ticks" is usually
27307 * just one (we shouldn't be losing ticks, we're doing
27308 * it this way mainly for interrupt latency reasons,
27309 * not because we think we'll have lots of lost timer
27310 * ticks). */
27311 static void update_wall_time(unsigned long ticks)
27312 {
27313 do {
27314 ticks--;
27315 update_wall_time_one_tick();
27316 } while (ticks);
27317
27318 if (xtime.tv_usec >= 1000000) {
27319 xtime.tv_usec -= 1000000;
27320 xtime.tv_sec++;
27321 second_overflow();
27322 }
27323 }
27324
27325 static inline void do_process_times(
27326 struct task_struct *p, unsigned long user,
27327 unsigned long system)
27328 {
27329 long psecs;
27330
27331 psecs = (p->times.tms_utime += user);
27332 psecs += (p->times.tms_stime += system);
27333 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
27334 /* Send SIGXCPU every second.. */
27335 if (!(psecs % HZ))
27336 send_sig(SIGXCPU, p, 1);
27337 /* and SIGKILL when we go over max.. */
27338 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
27339 send_sig(SIGKILL, p, 1);
27340 }
27341 }
27342
27343 static inline void do_it_virt(struct task_struct * p,
27344 unsigned long ticks)
27345 {
27346 unsigned long it_virt = p->it_virt_value;
27347
27348 if (it_virt) {
27349 if (it_virt <= ticks) {
27350 it_virt = ticks + p->it_virt_incr;
27351 send_sig(SIGVTALRM, p, 1);
27352 }
27353 p->it_virt_value = it_virt - ticks;
27354 }
27355 }
27356
27357 static inline void do_it_prof(struct task_struct * p,
27358 unsigned long ticks)
27359 {
27360 unsigned long it_prof = p->it_prof_value;
27361
27362 if (it_prof) {
27363 if (it_prof <= ticks) {
27364 it_prof = ticks + p->it_prof_incr;
27365 send_sig(SIGPROF, p, 1);
27366 }
27367 p->it_prof_value = it_prof - ticks;
27368 }
27369 }
27370
27371 void update_one_process(struct task_struct *p,
27372 unsigned long ticks, unsigned long user,
27373 unsigned long system, int cpu)
27374 {
27375 p->per_cpu_utime[cpu] += user;
27376 p->per_cpu_stime[cpu] += system;
27377 do_process_times(p, user, system);
27378 do_it_virt(p, user);
27379 do_it_prof(p, ticks);
27380 }
27381
27382 static void update_process_times(unsigned long ticks,
27383 unsigned long system)
27384 {
27385 /* SMP does this on a per-CPU basis elsewhere */
27386 #ifndef __SMP__
27387 struct task_struct * p = current;
27388 unsigned long user = ticks - system;
27389 if (p->pid) {
27390 p->counter -= ticks;
27391 if (p->counter < 0) {
27392 p->counter = 0;
27393 p->need_resched = 1;
27394 }
27395 if (p->priority < DEF_PRIORITY)
27396 kstat.cpu_nice += user;
27397 else
27398 kstat.cpu_user += user;
27399 kstat.cpu_system += system;
27400 }
27401 update_one_process(p, ticks, user, system, 0);
27402 #endif
27403 }
27404
27405 volatile unsigned long lost_ticks = 0;
27406 static unsigned long lost_ticks_system = 0;
27407
27408 /* This lock protects us from races in SMP while
27409 * playing with xtime. -arca */
27410 rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
27411
27412 static inline void update_times(void)
27413 {
27414 unsigned long ticks;
27415
27416 /* update_times() is run from the raw timer_bh handler
27417 * so we just know that the irqs are locally enabled
27418 * and so we don't need to save/restore the flags of
27419 * the local CPU here. -arca */
27420 write_lock_irq(&xtime_lock);
27421
27422 ticks = lost_ticks;
27423 lost_ticks = 0;
27424
27425 if (ticks) {
27426 unsigned long system;
27427 system = xchg(&lost_ticks_system, 0);
27428
27429 calc_load(ticks);
27430 update_wall_time(ticks);
27431 write_unlock_irq(&xtime_lock);
27432
27433 update_process_times(ticks, system);
27434
27435 } else
27436 write_unlock_irq(&xtime_lock);
27437 }
27438
27439 static void timer_bh(void)
27440 {
27441 update_times();
27442 run_old_timers();
27443 run_timer_list();
27444 }
27445
27446 void do_timer(struct pt_regs * regs)
27447 {
27448 (*(unsigned long *)&jiffies)++;
27449 lost_ticks++;
27450 mark_bh(TIMER_BH);
27451 if (!user_mode(regs))
27452 lost_ticks_system++;
27453 if (tq_timer)
27454 mark_bh(TQUEUE_BH);
27455 }
27456
27457 #ifndef __alpha__
27458
27459 /* For backwards compatibility? This can be done in libc
27460 * so Alpha and all newer ports shouldn't need it. */
27461 asmlinkage unsigned int sys_alarm(unsigned int seconds)
27462 {
27463 struct itimerval it_new, it_old;
27464 unsigned int oldalarm;
27465
27466 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec
27467 = 0;
27468 it_new.it_value.tv_sec = seconds;
27469 it_new.it_value.tv_usec = 0;
27470 do_setitimer(ITIMER_REAL, &it_new, &it_old);
27471 oldalarm = it_old.it_value.tv_sec;
27472 /* ehhh.. We can't return 0 if we have an alarm
27473 * pending.. And we'd better return too much than too
27474 * little anyway */
27475 if (it_old.it_value.tv_usec)
27476 oldalarm++;
27477 return oldalarm;
27478 }
27479
27480 /* The Alpha uses getxpid, getxuid, and getxgid instead.
27481 * Maybe this should be moved into arch/i386 instead? */
27482
27483 asmlinkage int sys_getpid(void)
27484 {
27485 /* This is SMP safe - current->pid doesn't change */
27486 return current->pid;
27487 }
27488
27489 /* This is not strictly SMP safe: p_opptr could change
27490 * from under us. However, rather than getting any lock
27491 * we can use an optimistic algorithm: get the parent
27492 * pid, and go back and check that the parent is still
27493 * the same. If it has changed (which is extremely
27494 * unlikely indeed), we just try again..
27495 *
27496 * NOTE! This depends on the fact that even if we _do_
27497 * get an old value of "parent", we can happily
27498 * dereference the pointer: we just can't necessarily
27499 * trust the result until we know that the parent pointer
27500 * is valid.
27501 *
27502 * The "mb()" macro is a memory barrier - a synchronizing
27503 * event. It also makes sure that gcc doesn't optimize
27504 * away the necessary memory references.. The barrier
27505 * doesn't have to have all that strong semantics: on x86
27506 * we don't really require a synchronizing instruction,
27507 * for example. The barrier is more important for code
27508 * generation than for any real memory ordering semantics
27509 * (even if there is a small window for a race, using the
27510 * old pointer is harmless for a while). */
27511 asmlinkage int sys_getppid(void)
27512 {
27513 int pid;
27514 struct task_struct * me = current;
27515 struct task_struct * parent;
27516
27517 parent = me->p_opptr;
27518 for (;;) {
27519 pid = parent->pid;
27520 #if __SMP__
27521 {
27522 struct task_struct *old = parent;
27523 mb();
27524 parent = me->p_opptr;
27525 if (old != parent)
27526 continue;
27527 }
27528 #endif
27529 break;
27530 }
27531 return pid;
27532 }
27533
27534 asmlinkage int sys_getuid(void)
27535 {
27536 /* Only we change this so SMP safe */
27537 return current->uid;
27538 }
27539
27540 asmlinkage int sys_geteuid(void)
27541 {
27542 /* Only we change this so SMP safe */
27543 return current->euid;
27544 }
27545
27546 asmlinkage int sys_getgid(void)
27547 {
27548 /* Only we change this so SMP safe */
27549 return current->gid;
27550 }
27551
27552 asmlinkage int sys_getegid(void)
27553 {
27554 /* Only we change this so SMP safe */
27555 return current->egid;
27556 }
27557
27558 /* This has been replaced by sys_setpriority. Maybe it
27559 * should be moved into the arch dependent tree for those
27560 * ports that require it for backward compatibility? */
27561
27562 asmlinkage int sys_nice(int increment)
27563 {
27564 unsigned long newprio;
27565 int increase = 0;
27566
27567 /* Setpriority might change our priority at the same
27568 * moment. We don't have to worry. Conceptually one
27569 * call occurs first and we have a single winner. */
27570
27571 newprio = increment;
27572 if (increment < 0) {
27573 if (!capable(CAP_SYS_NICE))
27574 return -EPERM;
27575 newprio = -increment;
27576 increase = 1;
27577 }
27578
27579 if (newprio > 40)
27580 newprio = 40;
27581 /* Do a "normalization" of the priority: traditionally
27582 * Unix nice values are -20 to 20; Linux doesn't really
27583 * use that kind of thing, but uses the length of the
27584 * timeslice instead (default 210 ms). The rounding is
27585 * why we want to avoid negative values. */
27586 newprio = (newprio * DEF_PRIORITY + 10) / 20;
27587 increment = newprio;
27588 if (increase)
27589 increment = -increment;
27590 /* Current->priority can change between this point and
27591 * the assignment. We are assigning, not doing adds or
27592 * subtracts, so that's OK. Conceptually a process might
27593 * instantaneously read the value we stomp over. I
27594 * don't think that is an issue unless posix makes it
27595 * one. If so we can loop on changes to
27596 * current->priority. */
27597 newprio = current->priority - increment;
27598 if ((signed) newprio < 1)
27599 newprio = 1;
27600 if (newprio > DEF_PRIORITY*2)
27601 newprio = DEF_PRIORITY*2;
27602 current->priority = newprio;
27603 return 0;
27604 }
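
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming HZ=100 so that DEF_PRIORITY is 20: nice(10)
 * gives newprio = (10*20 + 10) / 20 = 10, so a task at the default
 * priority of 20 drops to 20 - 10 = 10, i.e. half the default
 * timeslice. nice(-10) requires CAP_SYS_NICE and raises it to 30;
 * the final clamp keeps priority within 1..DEF_PRIORITY*2 (40). */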
27605
27606 #endif
27607
27608 static inline struct task_struct *
27609 find_process_by_pid(pid_t pid)
27610 {
27611 struct task_struct *tsk = current;
27612
27613 if (pid)
27614 tsk = find_task_by_pid(pid);
27615 return tsk;
27616 }
27617
27618 static int setscheduler(pid_t pid, int policy,
27619 struct sched_param *param)
27620 {
27621 struct sched_param lp;
27622 struct task_struct *p;
27623 int retval;
27624
27625 retval = -EINVAL;
27626 if (!param || pid < 0)
27627 goto out_nounlock;
27628
27629 retval = -EFAULT;
27630 if (copy_from_user(&lp, param,
27631 sizeof(struct sched_param)))
27632 goto out_nounlock;
27633
27634 /* We play safe to avoid deadlocks. */
27635 spin_lock(&scheduler_lock);
27636 spin_lock_irq(&runqueue_lock);
27637 read_lock(&tasklist_lock);
27638
27639 p = find_process_by_pid(pid);
27640
27641 retval = -ESRCH;
27642 if (!p)
27643 goto out_unlock;
27644
27645 if (policy < 0)
27646 policy = p->policy;
27647 else {
27648 retval = -EINVAL;
27649 if (policy != SCHED_FIFO && policy != SCHED_RR &&
27650 policy != SCHED_OTHER)
27651 goto out_unlock;
27652 }
27653
27654 /* Valid priorities for SCHED_FIFO and SCHED_RR are
27655 * 1..99, valid priority for SCHED_OTHER is 0. */
27656 retval = -EINVAL;
27657 if (lp.sched_priority < 0 || lp.sched_priority > 99)
27658 goto out_unlock;
27659 if((policy == SCHED_OTHER) != (lp.sched_priority == 0))
27660 goto out_unlock;
27661
27662 retval = -EPERM;
27663 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
27664 !capable(CAP_SYS_NICE))
27665 goto out_unlock;
27666 if ((current->euid != p->euid) &&
27667 (current->euid != p->uid) &&
27668 !capable(CAP_SYS_NICE))
27669 goto out_unlock;
27670
27671 retval = 0;
27672 p->policy = policy;
27673 p->rt_priority = lp.sched_priority;
27674 if (p->next_run)
27675 move_first_runqueue(p);
27676
27677 current->need_resched = 1;
27678
27679 out_unlock:
27680 read_unlock(&tasklist_lock);
27681 spin_unlock_irq(&runqueue_lock);
27682 spin_unlock(&scheduler_lock);
27683
27684 out_nounlock:
27685 return retval;
27686 }
27687
27688 asmlinkage int sys_sched_setscheduler(pid_t pid,
27689 int policy, struct sched_param *param)
27690 {
27691 return setscheduler(pid, policy, param);
27692 }
27693
27694 asmlinkage int sys_sched_setparam(pid_t pid,
27695 struct sched_param *param)
27696 {
27697 return setscheduler(pid, -1, param);
27698 }
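
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the user-space view of the two system calls above, via the POSIX
 * wrappers declared in <sched.h>. Priority 50 is an arbitrary value
 * in the 1..99 range that setscheduler() accepts for the real-time
 * policies.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param param;
 *	param.sched_priority = 50;
 *	if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
 *		perror("sched_setscheduler");
 */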
27699
27700 asmlinkage int sys_sched_getscheduler(pid_t pid)
27701 {
27702 struct task_struct *p;
27703 int retval;
27704
27705 retval = -EINVAL;
27706 if (pid < 0)
27707 goto out_nounlock;
27708
27709 read_lock(&tasklist_lock);
27710
27711 retval = -ESRCH;
27712 p = find_process_by_pid(pid);
27713 if (!p)
27714 goto out_unlock;
27715
27716 retval = p->policy;
27717
27718 out_unlock:
27719 read_unlock(&tasklist_lock);
27720
27721 out_nounlock:
27722 return retval;
27723 }
27724
27725 asmlinkage int sys_sched_getparam(pid_t pid,
27726 struct sched_param *param)
27727 {
27728 struct task_struct *p;
27729 struct sched_param lp;
27730 int retval;
27731
27732 retval = -EINVAL;
27733 if (!param || pid < 0)
27734 goto out_nounlock;
27735
27736 read_lock(&tasklist_lock);
27737 p = find_process_by_pid(pid);
27738 retval = -ESRCH;
27739 if (!p)
27740 goto out_unlock;
27741 lp.sched_priority = p->rt_priority;
27742 read_unlock(&tasklist_lock);
27743
27744 /* This one might sleep, we cannot do it with a
27745 * spinlock held ... */
27746 retval = copy_to_user(param, &lp,
27747 sizeof(*param)) ? -EFAULT : 0;
27748
27749 out_nounlock:
27750 return retval;
27751
27752 out_unlock:
27753 read_unlock(&tasklist_lock);
27754 return retval;
27755 }
27756
27757 asmlinkage int sys_sched_yield(void)
27758 {
27759 spin_lock(&scheduler_lock);
27760 spin_lock_irq(&runqueue_lock);
27761 if (current->policy == SCHED_OTHER)
27762 current->policy |= SCHED_YIELD;
27763 current->need_resched = 1;
27764 move_last_runqueue(current);
27765 spin_unlock_irq(&runqueue_lock);
27766 spin_unlock(&scheduler_lock);
27767 return 0;
27768 }
27769
27770 asmlinkage int sys_sched_get_priority_max(int policy)
27771 {
27772 int ret = -EINVAL;
27773
27774 switch (policy) {
27775 case SCHED_FIFO:
27776 case SCHED_RR:
27777 ret = 99;
27778 break;
27779 case SCHED_OTHER:
27780 ret = 0;
27781 break;
27782 }
27783 return ret;
27784 }
27785
27786 asmlinkage int sys_sched_get_priority_min(int policy)
27787 {
27788 int ret = -EINVAL;
27789
27790 switch (policy) {
27791 case SCHED_FIFO:
27792 case SCHED_RR:
27793 ret = 1;
27794 break;
27795 case SCHED_OTHER:
27796 ret = 0;
27797 }
27798 return ret;
27799 }
27800
27801 asmlinkage int sys_sched_rr_get_interval(pid_t pid,
27802 struct timespec *interval)
27803 {
27804 struct timespec t;
27805
27806 t.tv_sec = 0;
27807 t.tv_nsec = 150000;
27808 if (copy_to_user(interval, &t,
27809 sizeof(struct timespec)))
27810 return -EFAULT;
27811 return 0;
27812 }
27813
27814 asmlinkage int sys_nanosleep(struct timespec *rqtp,
27815 struct timespec *rmtp)
27816 {
27817 struct timespec t;
27818 unsigned long expire;
27819
27820 if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
27821 return -EFAULT;
27822
27823 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 ||
27824 t.tv_sec < 0)
27825 return -EINVAL;
27826
27827
27828 if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
27829 current->policy != SCHED_OTHER)
27830 {
27831 /* Short delay requests up to 2 ms will be handled
27832 * with high precision by a busy wait for all
27833 * real-time processes.
27834 *
27835 * It's important on SMP not to do this holding
27836 * locks. */
27837 udelay((t.tv_nsec + 999) / 1000);
27838 return 0;
27839 }
27840
27841 expire =
27842 timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
27843
27844 current->state = TASK_INTERRUPTIBLE;
27845 expire = schedule_timeout(expire);
27846
27847 if (expire) {
27848 if (rmtp) {
27849 jiffies_to_timespec(expire, &t);
27850 if (copy_to_user(rmtp, &t,sizeof(struct timespec)))
27851 return -EFAULT;
27852 }
27853 return -EINTR;
27854 }
27855 return 0;
27856 }
27857
27858 static void show_task(int nr,struct task_struct * p)
27859 {
27860 unsigned long free = 0;
27861 int state;
27862 static const char * stat_nam[] =
27863 { "R", "S", "D", "Z", "T", "W" };
27864
27865 printk("%-8s %3d ",
27866 p->comm, (p == current) ? -nr : nr);
27867 state = p->state ? ffz(~p->state) + 1 : 0;
27868 if (((unsigned) state) <
27869 sizeof(stat_nam)/sizeof(char *))
27870 printk(stat_nam[state]);
27871 else
27872 printk(" ");
27873 #if (BITS_PER_LONG == 32)
27874 if (p == current)
27875 printk(" current ");
27876 else
27877 printk(" %08lX ", thread_saved_pc(&p->tss));
27878 #else
27879 if (p == current)
27880 printk(" current task ");
27881 else
27882 printk(" %016lx ", thread_saved_pc(&p->tss));
27883 #endif
27884 {
27885 unsigned long * n = (unsigned long *) (p+1);
27886 while (!*n)
27887 n++;
27888 free = (unsigned long) n - (unsigned long)(p+1);
27889 }
27890 printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
27891 if (p->p_cptr)
27892 printk("%5d ", p->p_cptr->pid);
27893 else
27894 printk(" ");
27895 if (p->p_ysptr)
27896 printk("%7d", p->p_ysptr->pid);
27897 else
27898 printk(" ");
27899 if (p->p_osptr)
27900 printk(" %5d\n", p->p_osptr->pid);
27901 else
27902 printk("\n");
27903
27904 {
27905 struct signal_queue *q;
27906 char s[sizeof(sigset_t)*2+1],b[sizeof(sigset_t)*2+1];
27907
27908 render_sigset_t(&p->signal, s);
27909 render_sigset_t(&p->blocked, b);
27910 printk(" sig: %d %s %s :",
27911 signal_pending(p), s, b);
27912 for (q = p->sigqueue; q ; q = q->next)
27913 printk(" %d", q->info.si_signo);
27914 printk(" X\n");
27915 }
27916 }
27917
27918 char * render_sigset_t(sigset_t *set, char *buffer)
27919 {
27920 int i = _NSIG, x;
27921 do {
27922 i -= 4, x = 0;
27923 if (sigismember(set, i+1)) x |= 1;
27924 if (sigismember(set, i+2)) x |= 2;
27925 if (sigismember(set, i+3)) x |= 4;
27926 if (sigismember(set, i+4)) x |= 8;
27927 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
27928 } while (i >= 4);
27929 *buffer = 0;
27930 return buffer;
27931 }
27932
27933 void show_state(void)
27934 {
27935 struct task_struct *p;
27936
27937 #if (BITS_PER_LONG == 32)
27938 printk("\n"
27939 " free "
27940 " sibling\n");
27941 printk(" task PC stack pid father "
27942 "child younger older\n");
27943 #else
27944 printk("\n"
27945 " free "
27946 " sibling\n");
27947 printk(" task PC stack pid "
27948 "father child younger older\n");
27949 #endif
27950 read_lock(&tasklist_lock);
27951 for_each_task(p)
27952 show_task((p->tarray_ptr - &task[0]),p);
27953 read_unlock(&tasklist_lock);
27954 }
27955
27956 void __init sched_init(void)
27957 {
27958 /* We have to do a little magic to get the first
27959 * process right in SMP mode. */
27960 int cpu=hard_smp_processor_id();
27961 int nr = NR_TASKS;
27962
27963 init_task.processor=cpu;
27964
27965 /* Init task array free list and pidhash table. */
27966 while(--nr > 0)
27967 add_free_taskslot(&task[nr]);
27968
27969 for(nr = 0; nr < PIDHASH_SZ; nr++)
27970 pidhash[nr] = NULL;
27971
27972 init_bh(TIMER_BH, timer_bh);
27973 init_bh(TQUEUE_BH, tqueue_bh);
27974 init_bh(IMMEDIATE_BH, immediate_bh);
27975 }