kernel/sched.c
26041 /*
26042 * linux/kernel/sched.c
26043 *
26044 * Copyright (C) 1991, 1992 Linus Torvalds
26045 *
26046 * 1996-12-23 Modified by Dave Grothe to fix bugs in
26047 * semaphores and make semaphores SMP safe
26048 * 1997-01-28 Modified by Finn Arne Gangstad to make
26049 * timers scale better.
26050 * 1997-09-10 Updated NTP code according to technical
26051 * memorandum Jan '96 "A Kernel Model for Precision
26052 * Timekeeping" by Dave Mills
26053 * 1998-11-19 Implemented schedule_timeout() and related
26054 * stuff by Andrea Arcangeli
26055 * 1998-12-24 Fixed a xtime SMP race (we need the
26056 * xtime_lock rw spinlock to serialize accesses to
26057 * xtime/lost_ticks). Copyright (C) 1998 Andrea
26058 * Arcangeli
26059 * 1998-12-28 Implemented better SMP scheduling by Ingo
26060 * Molnar
26061 * 1999-03-10 Improved NTP compatibility by Ulrich Windl
26062 */
26063
26064 /* 'sched.c' is the main kernel file. It contains
26065 * scheduling primitives (sleep_on, wakeup, schedule etc)
26066 * as well as a number of simple system call functions
26067 * (such as getpid()), which just extract a field from
26068 * the current task. */
26069
26070 #include <linux/mm.h>
26071 #include <linux/kernel_stat.h>
26072 #include <linux/fdreg.h>
26073 #include <linux/delay.h>
26074 #include <linux/interrupt.h>
26075 #include <linux/smp_lock.h>
26076 #include <linux/init.h>
26077
26078 #include <asm/io.h>
26079 #include <asm/uaccess.h>
26080 #include <asm/pgtable.h>
26081 #include <asm/mmu_context.h>
26082 #include <asm/semaphore-helper.h>
26083
26084 #include <linux/timex.h>
26085
26086 /* kernel variables */
26087
26088 /* systemwide security settings */
26089 unsigned securebits = SECUREBITS_DEFAULT;
26090
26091 /* timer interrupt period */
26092 long tick = (1000000 + HZ/2) / HZ;
26093
26094 /* The current time */
26095 volatile struct timeval
26096 xtime __attribute__ ((aligned (16)));
26097
26098 /* Don't completely fail for HZ > 500. */
26099 int tickadj = 500/HZ ? : 1; /* microsecs */
26100
26101 DECLARE_TASK_QUEUE(tq_timer);
26102 DECLARE_TASK_QUEUE(tq_immediate);
26103 DECLARE_TASK_QUEUE(tq_scheduler);
26104
26105 /* phase-lock loop variables */
26106 /* TIME_ERROR prevents overwriting the CMOS clock */
26107 /* clock synchronization status */
26108 int time_state = TIME_OK;
26109 /* clock status bits */
26110 int time_status = STA_UNSYNC;
26111 /* time adjustment (us) */
26112 long time_offset = 0;
26113 /* pll time constant */
26114 long time_constant = 2;
26115 /* frequency tolerance (ppm) */
26116 long time_tolerance = MAXFREQ;
26117 /* clock precision (us) */
26118 long time_precision = 1;
26119 /* maximum error (us) */
26120 long time_maxerror = NTP_PHASE_LIMIT;
26121 /* estimated error (us) */
26122 long time_esterror = NTP_PHASE_LIMIT;
26123 /* phase offset (scaled us) */
26124 long time_phase = 0;
26125 /* frequency offset (scaled ppm) */
26126 long time_freq =
26127 ((1000000 + HZ/2) % HZ - HZ/2) << SHIFT_USEC;
26128 /* tick adjust (scaled 1 / HZ) */
26129 long time_adj = 0;
26130 /* time at last adjustment (s) */
26131 long time_reftime = 0;
26132
26133 long time_adjust = 0;
26134 long time_adjust_step = 0;
26135
26136 unsigned long event = 0;
26137
26138 extern int do_setitimer(int, struct itimerval *,
26139 struct itimerval *);
26140 unsigned int * prof_buffer = NULL;
26141 unsigned long prof_len = 0;
26142 unsigned long prof_shift = 0;
26143
26144 extern void mem_use(void);
26145
26146 unsigned long volatile jiffies=0;
26147
26148 /* Init task must be ok at boot for the ix86 as we will
26149 * check its signals via the SMP irq return path. */
26150 struct task_struct * task[NR_TASKS] = {&init_task, };
26151
26152 struct kernel_stat kstat = { 0 };
26153
26154 void scheduling_functions_start_here(void) { }
26155
26156 #ifdef __SMP__
26157 static void reschedule_idle_slow(struct task_struct * p)
26158 {
26159 /* (see reschedule_idle() for an explanation first ...)
26160 *
26161 * Pass #2
26162 *
26163 * We try to find another (idle) CPU for this woken-up
26164 * process.
26165 *
26166 * On SMP, we mostly try to see if the CPU the task used
26167 * to run on is idle.. but we will use another idle CPU
26168 * too, at this point we already know that this CPU is
26169 * not willing to reschedule in the near future.
26170 *
26171 * An idle CPU is definitely wasted, especially if this
26172 * CPU is running long-timeslice processes. The following
26173 * algorithm is pretty good at finding the best idle CPU
26174 * to send this process to.
26175 *
26176 * [We can try to preempt low-priority processes on other
26177 * CPUs in 2.3. Also we can try to use the avg_slice
26178 * value to predict 'likely reschedule' events even on
26179 * other CPUs.] */
26180 int best_cpu = p->processor,
26181 this_cpu = smp_processor_id();
26182 struct task_struct **idle = task, *tsk, *target_tsk;
26183 int i = smp_num_cpus;
26184
26185 target_tsk = NULL;
26186 do {
26187 tsk = *idle;
26188 idle++;
26189 if (tsk->has_cpu) {
26190 if (tsk->processor == this_cpu)
26191 continue;
26192 target_tsk = tsk;
26193 if (tsk->processor == best_cpu) {
26194 /* bingo, we couldn't get a better CPU, activate
26195 * it. */
26196 goto send; /* this one helps GCC ... */
26197 }
26198 }
26199 } while (--i > 0);
26200
26201 /* found any idle CPU? */
26202 if (target_tsk) {
26203 send:
26204 target_tsk->need_resched = 1;
26205 smp_send_reschedule(target_tsk->processor);
26206 return;
26207 }
26208 }
26209 #endif /* __SMP__ */
26210
26211 /* If there is a dependency between p1 and p2, don't be
26212 * too eager to go into the slow schedule. In
26213 * particular, if p1 and p2 both want the kernel lock,
26214 * there is no point in trying to make them extremely
26215 * parallel..
26216 *
26217 * (No lock - lock_depth < 0) */
26218 #define related(p1,p2) \
26219 ((p1)->lock_depth >= 0 && (p2)->lock_depth >= 0)
26220
26221 static inline void reschedule_idle(
26222 struct task_struct * p)
26223 {
26224
26225 if (p->policy != SCHED_OTHER ||
26226 p->counter > current->counter + 3) {
26227 current->need_resched = 1;
26228 return;
26229 }
26230
26231 #ifdef __SMP__
26232 /* ("wakeup()" should not be called before we've
26233 * initialized SMP completely. Basically a not-yet
26234 * initialized SMP subsystem can be considered as a
26235 * not-yet working scheduler; simply don't use it before
26236 * it's up and running ...)
26237 *
26238 * SMP rescheduling is done in 2 passes:
26239 * - pass #1: faster: quick decisions
26240 * - pass #2: slower: let's try to find another CPU */
26241
26242 /* Pass #1
26243 *
26244 * There are two metrics here:
26245 *
26246 * first, a 'cutoff' interval, currently 0-200 usecs on
26247 * x86 CPUs, depending on the size of the 'SMP-local
26248 * cache'. If the current process has longer average
26249 * timeslices than this, then we utilize the idle CPU.
26250 *
26251 * second, if the wakeup comes from a process context,
26252 * then the two processes are 'related'. (they form a
26253 * 'gang')
26254 *
26255 * An idle CPU is almost always a bad thing, thus we
26256 * skip the idle-CPU utilization only if both these
26257 * conditions are true. (ie. a 'process-gang'
26258 * rescheduling with rather high frequency should stay
26259 * on the same CPU).
26260 *
26261 * [We can switch to something more finegrained in
26262 * 2.3.] */
26263 if ((current->avg_slice < cacheflush_time) &&
26264 related(current, p))
26265 return;
26266
26267 reschedule_idle_slow(p);
26268 #endif /* __SMP__ */
26269 }
26270
26271 /* Careful!
26272 *
26273 * This has to add the process to the _beginning_ of the
26274 * run-queue, not the end. See the comment about "This is
26275 * subtle" in the scheduler proper.. */
26276 static inline void add_to_runqueue(struct task_struct *p)
26277 {
26278 struct task_struct *next = init_task.next_run;
26279
26280 p->prev_run = &init_task;
26281 init_task.next_run = p;
26282 p->next_run = next;
26283 next->prev_run = p;
26284 nr_running++;
26285 }
26286
26287 static inline void del_from_runqueue(
26288 struct task_struct * p)
26289 {
26290 struct task_struct *next = p->next_run;
26291 struct task_struct *prev = p->prev_run;
26292
26293 nr_running--;
26294 next->prev_run = prev;
26295 prev->next_run = next;
26296 p->next_run = NULL;
26297 p->prev_run = NULL;
26298 }
26299
26300 static inline void move_last_runqueue(
26301 struct task_struct * p)
26302 {
26303 struct task_struct *next = p->next_run;
26304 struct task_struct *prev = p->prev_run;
26305
26306 /* remove from list */
26307 next->prev_run = prev;
26308 prev->next_run = next;
26309 /* add back to list */
26310 p->next_run = &init_task;
26311 prev = init_task.prev_run;
26312 init_task.prev_run = p;
26313 p->prev_run = prev;
26314 prev->next_run = p;
26315 }
26316
26317 static inline void
26318 move_first_runqueue(struct task_struct * p)
26319 {
26320 struct task_struct *next = p->next_run;
26321 struct task_struct *prev = p->prev_run;
26322
26323 /* remove from list */
26324 next->prev_run = prev;
26325 prev->next_run = next;
26326 /* add back to list */
26327 p->prev_run = &init_task;
26328 next = init_task.next_run;
26329 init_task.next_run = p;
26330 p->next_run = next;
26331 next->prev_run = p;
26332 }
26333
26334 /* The tasklist_lock protects the linked list of
26335 * processes.
26336 *
26337 * The scheduler lock is protecting against multiple
26338 * entry into the scheduling code, and doesn't need to
26339 * worry about interrupts (because interrupts cannot call
26340 * the scheduler).
26341 *
26342 * The run-queue lock locks the parts that actually
26343 * access and change the run-queues, and have to be
26344 * interrupt-safe. */
26345 /* should be acquired first */
26346 spinlock_t scheduler_lock = SPIN_LOCK_UNLOCKED;
26347 spinlock_t runqueue_lock = SPIN_LOCK_UNLOCKED; /* 2nd */
26348 rwlock_t tasklist_lock = RW_LOCK_UNLOCKED; /* 3rd */
26349
26350 /* Wake up a process. Put it on the run-queue if it's not
26351 * already there. The "current" process is always on the
26352 * run-queue (except when the actual re-schedule is in
26353 * progress), and as such you're allowed to do the
26354 * simpler "current->state = TASK_RUNNING" to mark
26355 * yourself runnable without the overhead of this. */
26356 void wake_up_process(struct task_struct * p)
26357 {
26358 unsigned long flags;
26359
26360 spin_lock_irqsave(&runqueue_lock, flags);
26361 p->state = TASK_RUNNING;
26362 if (!p->next_run) {
26363 add_to_runqueue(p);
26364 reschedule_idle(p);
26365 }
26366 spin_unlock_irqrestore(&runqueue_lock, flags);
26367 }
26368
26369 static void process_timeout(unsigned long __data)
26370 {
26371 struct task_struct * p = (struct task_struct *) __data;
26372
26373 wake_up_process(p);
26374 }
26375
26376 /* This is the function that decides how desirable a
26377 * process is.. You can weigh different processes
26378 * against each other depending on what CPU they've run
26379 * on lately etc to try to handle cache and TLB miss
26380 * penalties.
26381 *
26382 * Return values:
26383 * -1000: never select this
26384 * 0: out of time, recalculate counters
26385 * (but it might still be selected)
26386 * +ve: "goodness" value (the larger, the better)
26387 * +1000: realtime process, select this. */
26388 static inline int goodness(struct task_struct * p,
26389 struct task_struct * prev, int this_cpu)
26390 {
26391 int policy = p->policy;
26392 int weight;
26393
26394 if (policy & SCHED_YIELD) {
26395 p->policy = policy & ~SCHED_YIELD;
26396 return 0;
26397 }
26398
26399 /* Realtime process, select the first one on the
26400 * runqueue (taking priorities within processes into
26401 * account). */
26402 if (policy != SCHED_OTHER)
26403 return 1000 + p->rt_priority;
26404
26405 /* Give the process a first-approximation goodness
26406 * value according to the number of clock-ticks it has
26407 * left.
26408 *
26409 * Don't do any other calculations if the time slice is
26410 * over.. */
26411 weight = p->counter;
26412 if (weight) {
26413
26414 #ifdef __SMP__
26415 /* Give a largish advantage to the same processor...
26416 * (this is equivalent to penalizing other
26417 * processors) */
26418 if (p->processor == this_cpu)
26419 weight += PROC_CHANGE_PENALTY;
26420 #endif
26421
26422 /* .. and a slight advantage to the current thread */
26423 if (p->mm == prev->mm)
26424 weight += 1;
26425 weight += p->priority;
26426 }
26427
26428 return weight;
26429 }
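
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming a uniprocessor build and the usual DEF_PRIORITY
 * of 20: a SCHED_OTHER task with counter = 5 and priority = 20 that
 * shares its mm with the previous task scores
 *
 *	weight = 5 (counter) + 1 (same mm) + 20 (priority) = 26
 *
 * whereas the same task with an exhausted timeslice (counter = 0)
 * scores 0, and a SCHED_FIFO task always scores 1000 + rt_priority
 * and wins. On SMP, PROC_CHANGE_PENALTY is added on top when the
 * task last ran on this CPU. */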
26430
26431 /* Event timer code */
26432 #define TVN_BITS 6
26433 #define TVR_BITS 8
26434 #define TVN_SIZE (1 << TVN_BITS)
26435 #define TVR_SIZE (1 << TVR_BITS)
26436 #define TVN_MASK (TVN_SIZE - 1)
26437 #define TVR_MASK (TVR_SIZE - 1)
26438
26439 struct timer_vec {
26440 int index;
26441 struct timer_list *vec[TVN_SIZE];
26442 };
26443
26444 struct timer_vec_root {
26445 int index;
26446 struct timer_list *vec[TVR_SIZE];
26447 };
26448
26449 static struct timer_vec tv5 = { 0 };
26450 static struct timer_vec tv4 = { 0 };
26451 static struct timer_vec tv3 = { 0 };
26452 static struct timer_vec tv2 = { 0 };
26453 static struct timer_vec_root tv1 = { 0 };
26454
26455 static struct timer_vec * const tvecs[] = {
26456 (struct timer_vec *)&tv1, &tv2, &tv3, &tv4, &tv5
26457 };
26458
26459 #define NOOF_TVECS (sizeof(tvecs) / sizeof(tvecs[0]))
26460
26461 static unsigned long timer_jiffies = 0;
26462
26463 static inline void insert_timer(struct timer_list *timer,
26464 struct timer_list **vec, int idx)
26465 {
26466 if ((timer->next = vec[idx]))
26467 vec[idx]->prev = timer;
26468 vec[idx] = timer;
26469 timer->prev = (struct timer_list *)&vec[idx];
26470 }
26471
26472 static inline void internal_add_timer(
26473 struct timer_list *timer)
26474 {
26475 /* must be cli-ed when calling this */
26476 unsigned long expires = timer->expires;
26477 unsigned long idx = expires - timer_jiffies;
26478
26479 if (idx < TVR_SIZE) {
26480 int i = expires & TVR_MASK;
26481 insert_timer(timer, tv1.vec, i);
26482 } else if (idx < 1 << (TVR_BITS + TVN_BITS)) {
26483 int i = (expires >> TVR_BITS) & TVN_MASK;
26484 insert_timer(timer, tv2.vec, i);
26485 } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) {
26486 int i =(expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK;
26487 insert_timer(timer, tv3.vec, i);
26488 } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) {
26489 int i =
26490 (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK;
26491 insert_timer(timer, tv4.vec, i);
26492 } else if ((signed long) idx < 0) {
26493 /* can happen if you add a timer with expires ==
26494 * jiffies, or you set a timer to go off in the past
26495 */
26496 insert_timer(timer, tv1.vec, tv1.index);
26497 } else if (idx <= 0xffffffffUL) {
26498 int i =
26499 (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
26500 insert_timer(timer, tv5.vec, i);
26501 } else {
26502 /* Can only get here on architectures with 64-bit
26503 * jiffies */
26504 timer->next = timer->prev = timer;
26505 }
26506 }
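
/* Illustrative worked example (editor's sketch, not from the kernel
 * source). With TVR_BITS = 8 and TVN_BITS = 6 as defined above,
 * suppose timer_jiffies = 1000 and a timer is added with
 * expires = 1300:
 *
 *	idx = 1300 - 1000 = 300, which is >= TVR_SIZE (256) but
 *	< 1 << (TVR_BITS + TVN_BITS) = 16384, so the timer lands in
 *	tv2 at slot (1300 >> 8) & TVN_MASK = 5.
 *
 * Once tv1 wraps, cascade_timers() re-adds that tv2 bucket, and by
 * then timer_jiffies has advanced far enough that the timers fall
 * into the fine-grained tv1 slots. */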
26507
26508 spinlock_t timerlist_lock = SPIN_LOCK_UNLOCKED;
26509
26510 void add_timer(struct timer_list *timer)
26511 {
26512 unsigned long flags;
26513
26514 spin_lock_irqsave(&timerlist_lock, flags);
26515 if (timer->prev)
26516 goto bug;
26517 internal_add_timer(timer);
26518 out:
26519 spin_unlock_irqrestore(&timerlist_lock, flags);
26520 return;
26521
26522 bug:
26523 printk("bug: kernel timer added twice at %p.\n",
26524 __builtin_return_address(0));
26525 goto out;
26526 }
26527
26528 static inline int detach_timer(struct timer_list *timer)
26529 {
26530 struct timer_list *prev = timer->prev;
26531 if (prev) {
26532 struct timer_list *next = timer->next;
26533 prev->next = next;
26534 if (next)
26535 next->prev = prev;
26536 return 1;
26537 }
26538 return 0;
26539 }
26540
26541 void mod_timer(struct timer_list *timer,
26542 unsigned long expires)
26543 {
26544 unsigned long flags;
26545
26546 spin_lock_irqsave(&timerlist_lock, flags);
26547 timer->expires = expires;
26548 detach_timer(timer);
26549 internal_add_timer(timer);
26550 spin_unlock_irqrestore(&timerlist_lock, flags);
26551 }
26552
26553 int del_timer(struct timer_list * timer)
26554 {
26555 int ret;
26556 unsigned long flags;
26557
26558 spin_lock_irqsave(&timerlist_lock, flags);
26559 ret = detach_timer(timer);
26560 timer->next = timer->prev = 0;
26561 spin_unlock_irqrestore(&timerlist_lock, flags);
26562 return ret;
26563 }
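
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the typical way a 2.2-era driver would use the timer API above.
 * The my_* identifiers are hypothetical. */
static struct timer_list my_timer;

static void my_timeout(unsigned long data)
{
	/* called from run_timer_list() once 'expires' has passed */
	printk(KERN_DEBUG "example timer fired, data=%lu\n", data);
}

static void my_start_timer(void)
{
	init_timer(&my_timer);
	my_timer.expires = jiffies + HZ;	/* one second from now */
	my_timer.data = 42;
	my_timer.function = my_timeout;
	add_timer(&my_timer);
	/* later: mod_timer(&my_timer, jiffies + 2*HZ) to re-arm,
	 * or del_timer(&my_timer) to cancel it. */
}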
26564
26565 #ifdef __SMP__
26566
26567 #define idle_task (task[cpu_number_map[this_cpu]])
26568 #define can_schedule(p) (!(p)->has_cpu)
26569
26570 #else
26571
26572 #define idle_task (&init_task)
26573 #define can_schedule(p) (1)
26574
26575 #endif
26576
26577 signed long schedule_timeout(signed long timeout)
26578 {
26579 struct timer_list timer;
26580 unsigned long expire;
26581
26582 switch (timeout)
26583 {
26584 case MAX_SCHEDULE_TIMEOUT:
26585 /* These two special cases are useful to be
26586 * comfortable in the caller. Nothing more. We could
26587 * take MAX_SCHEDULE_TIMEOUT from one of the negative
26588 * values, but I'd like to return a valid offset (>=0)
26589 * to allow the caller to do everything it wants with
26590 * the retval. */
26591 schedule();
26592 goto out;
26593 default:
26594 /* Another bit of paranoia. Note that the retval will
26595 * be 0, since no piece of the kernel is supposed to
26596 * check for a negative retval of schedule_timeout()
26597 * (it should never happen anyway). You just have the
26598 * printk() that will tell you if something has gone
26599 * wrong, and where. */
26600 if (timeout < 0)
26601 {
26602 printk(KERN_ERR "schedule_timeout: wrong timeout "
26603 "value %lx from %p\n", timeout,
26604 __builtin_return_address(0));
26605 goto out;
26606 }
26607 }
26608
26609 expire = timeout + jiffies;
26610
26611 init_timer(&timer);
26612 timer.expires = expire;
26613 timer.data = (unsigned long) current;
26614 timer.function = process_timeout;
26615
26616 add_timer(&timer);
26617 schedule();
26618 del_timer(&timer);
26619
26620 timeout = expire - jiffies;
26621
26622 out:
26623 return timeout < 0 ? 0 : timeout;
26624 }
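
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the usual calling pattern for schedule_timeout(), the same one
 * sys_nanosleep() uses below. The caller sets its own state first;
 * the return value is the number of jiffies left if it was woken
 * early. my_sleep_a_bit is a hypothetical name. */
static void my_sleep_a_bit(void)
{
	signed long remaining;

	current->state = TASK_INTERRUPTIBLE;
	remaining = schedule_timeout(HZ / 10);	/* ~100 ms at HZ=100 */
	if (remaining)
		printk(KERN_DEBUG "woken early, %ld jiffies left\n",
			remaining);
}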
26625
26626 /* This one aligns per-CPU data on cacheline boundaries.
26627 */
26628 static union {
26629 struct schedule_data {
26630 struct task_struct * prev;
26631 long prevstate;
26632 cycles_t last_schedule;
26633 } schedule_data;
26634 char __pad [SMP_CACHE_BYTES];
26635 } aligned_data [NR_CPUS] __cacheline_aligned =
26636 { {{&init_task,0}}};
26637
26638 static inline void __schedule_tail (void)
26639 {
26640 #ifdef __SMP__
26641 struct schedule_data * sched_data;
26642
26643 /* We might have switched CPUs: */
26644 sched_data =
26645 &aligned_data[smp_processor_id()].schedule_data;
26646
26647 /* Subtle. In the rare event that we got a wakeup to
26648 * 'prev' just during the reschedule (this is possible,
26649 * the scheduler is pretty parallel), we should do
26650 * another reschedule in the next task's
26651 * context. schedule() will do the right thing next
26652 * time around. This is equivalent to 'delaying' the
26653 * wakeup until the reschedule has finished. */
26654 if (sched_data->prev->state != sched_data->prevstate)
26655 current->need_resched = 1;
26656
26657 /* Release the previous process ...
26658 *
26659 * We have dropped all locks, and we must make sure
26660 * that we only mark the previous process as no longer
26661 * having a CPU after all other state has been seen by
26662 * other CPUs. Thus the write memory barrier! */
26663 wmb();
26664 sched_data->prev->has_cpu = 0;
26665 #endif /* __SMP__ */
26666 }
26667
26668 /* schedule_tail() is getting called from the fork return
26669 * path. This cleans up all remaining scheduler things,
26670 * without impacting the common case. */
26671 void schedule_tail (void)
26672 {
26673 __schedule_tail();
26674 }
26675
26676 /* 'schedule()' is the scheduler function. It's a very
26677 * simple and nice scheduler: it's not perfect, but
26678 * certainly works for most things.
26679 *
26680 * The goto is "interesting".
26681 *
26682 * NOTE!! Task 0 is the 'idle' task, which gets called
26683 * when no other tasks can run. It can not be killed, and
26684 * it cannot sleep. The 'state' information in task[0] is
26685 * never used. */
26686 asmlinkage void schedule(void)
26687 {
26688 struct schedule_data * sched_data;
26689 struct task_struct * prev, * next;
26690 int this_cpu;
26691
26692 run_task_queue(&tq_scheduler);
26693
26694 prev = current;
26695 this_cpu = prev->processor;
26696 /* 'sched_data' is protected by the fact that we can
26697 * run only one process per CPU. */
26698 sched_data = & aligned_data[this_cpu].schedule_data;
26699
26700 if (in_interrupt())
26701 goto scheduling_in_interrupt;
26702 release_kernel_lock(prev, this_cpu);
26703
26704 /* Do "administrative" work here while we don't hold
26705 * any locks */
26706 if (bh_active & bh_mask)
26707 do_bottom_half();
26708
26709 spin_lock(&scheduler_lock);
26710 spin_lock_irq(&runqueue_lock);
26711
26712 /* move an exhausted RR process to be last.. */
26713 prev->need_resched = 0;
26714
26715 if (!prev->counter && prev->policy == SCHED_RR) {
26716 prev->counter = prev->priority;
26717 move_last_runqueue(prev);
26718 }
26719
26720 switch (prev->state) {
26721 case TASK_INTERRUPTIBLE:
26722 if (signal_pending(prev)) {
26723 prev->state = TASK_RUNNING;
26724 break;
26725 }
26726 default:
26727 del_from_runqueue(prev);
26728 case TASK_RUNNING:
26729 }
26730
26731 sched_data->prevstate = prev->state;
26732
26733 /* this is the scheduler proper: */
26734 {
26735 struct task_struct * p = init_task.next_run;
26736 int c = -1000;
26737
26738 /* Default process to select.. */
26739 next = idle_task;
26740 if (prev->state == TASK_RUNNING) {
26741 c = goodness(prev, prev, this_cpu);
26742 next = prev;
26743 }
26744
26745 /* This is subtle. Note how we can enable interrupts
26746 * here, even though interrupts can add processes to
26747 * the run-queue. This is because any new processes
26748 * will be added to the front of the queue, so "p"
26749 * above is a safe starting point. run-queue
26750 * deletion and re-ordering is protected by the
26751 * scheduler lock */
26752 spin_unlock_irq(&runqueue_lock);
26753 /* Note! there may appear new tasks on the run-queue
26754 * during this, as interrupts are enabled. However, they
26755 * will be put on front of the list, so our list starting
26756 * at "p" is essentially fixed. */
26757 while (p != &init_task) {
26758 if (can_schedule(p)) {
26759 int weight = goodness(p, prev, this_cpu);
26760 if (weight > c)
26761 c = weight, next = p;
26762 }
26763 p = p->next_run;
26764 }
26765
26766 /* Do we need to re-calculate counters? */
26767 if (!c) {
26768 struct task_struct *p;
26769 read_lock(&tasklist_lock);
26770 for_each_task(p)
26771 p->counter = (p->counter >> 1) + p->priority;
26772 read_unlock(&tasklist_lock);
26773 }
26774 }
26775
26776 /* maintain the per-process 'average timeslice' value.
26777 * (this has to be recalculated even if we reschedule
26778 * to the same process) Currently this is only used on
26779 * SMP: */
26780 #ifdef __SMP__
26781 {
26782 cycles_t t, this_slice;
26783
26784 t = get_cycles();
26785 this_slice = t - sched_data->last_schedule;
26786 sched_data->last_schedule = t;
26787
26788 /* Simple, exponentially fading average calculation:
26789 */
26790 prev->avg_slice = this_slice + prev->avg_slice;
26791 prev->avg_slice >>= 1;
26792 }
26793
26794 /* We drop the scheduler lock early (it's a global
26795 * spinlock), thus we have to lock the previous process
26796 * from getting rescheduled during switch_to(). */
26797 next->processor = this_cpu;
26798 next->has_cpu = 1;
26799 spin_unlock(&scheduler_lock);
26800 #endif /* __SMP__ */
26801 if (prev != next) {
26802 #ifdef __SMP__
26803 sched_data->prev = prev;
26804 #endif
26805 kstat.context_swtch++;
26806 get_mmu_context(next);
26807 switch_to(prev,next);
26808
26809 __schedule_tail();
26810 }
26811
26812 reacquire_kernel_lock(current);
26813 return;
26814
26815 scheduling_in_interrupt:
26816 printk("Scheduling in interrupt\n");
26817 *(int *)0 = 0;
26818 }
26819
26820 rwlock_t waitqueue_lock = RW_LOCK_UNLOCKED;
26821
26822 /* wake_up doesn't wake up stopped processes - they have
26823 * to be awakened with signals or similar.
26824 *
26825 * Note that we only need a read lock for the wait queue
26826 * (and thus do not have to protect against interrupts),
26827 * as the actual removal from the queue is handled by the
26828 * process itself. */
26829 void __wake_up(struct wait_queue **q, unsigned int mode)
26830 {
26831 struct wait_queue *next;
26832
26833 read_lock(&waitqueue_lock);
26834 if (q && (next = *q)) {
26835 struct wait_queue *head;
26836
26837 head = WAIT_QUEUE_HEAD(q);
26838 while (next != head) {
26839 struct task_struct *p = next->task;
26840 next = next->next;
26841 if (p->state & mode)
26842 wake_up_process(p);
26843 }
26844 }
26845 read_unlock(&waitqueue_lock);
26846 }
26847
26848 /* Semaphores are implemented using a two-way counter:
26849 * The "count" variable is decremented for each process
26850 * that tries to sleep, while the "waking" variable is
26851 * incremented when the "up()" code goes to wake up
26852 * waiting processes.
26853 *
26854 * Notably, the inline "up()" and "down()" functions can
26855 * efficiently test if they need to do any extra work (up
26856 * needs to do something only if count was negative
26857 * before the increment operation).
26858 *
26859 * waking_non_zero() (from asm/semaphore.h) must execute
26860 * atomically.
26861 *
26862 * When __up() is called, the count was negative before
26863 * incrementing it, and we need to wake up somebody.
26864 *
26865 * This routine adds one to the count of processes that
26866 * need to wake up and exit. ALL waiting processes
26867 * actually wake up but only the one that gets to the
26868 * "waking" field first will gate through and acquire the
26869 * semaphore. The others will go back to sleep.
26870 *
26871 * Note that these functions are only called when there
26872 * is contention on the lock, and as such all this is the
26873 * "non-critical" part of the whole semaphore
26874 * business. The critical part is the inline stuff in
26875 * <asm/semaphore.h> where we want to avoid any extra
26876 * jumps and calls. */
26877 void __up(struct semaphore *sem)
26878 {
26879 wake_one_more(sem);
26880 wake_up(&sem->wait);
26881 }
26882
26883 /* Perform the "down" function. Return zero for
26884 * semaphore acquired, return negative for signalled out
26885 * of the function.
26886 *
26887 * If called from __down, the return is ignored and the
26888 * wait loop is not interruptible. This means that a
26889 * task waiting on a semaphore using "down()" cannot be
26890 * killed until someone does an "up()" on the semaphore.
26891 *
26892 * If called from __down_interruptible, the return value
26893 * gets checked upon return. If the return value is
26894 * negative then the task continues with the negative
26895 * value in the return register (it can be tested by the
26896 * caller).
26897 *
26898 * Either form may be used in conjunction with "up()". */
26899
26900 #define DOWN_VAR \
26901 struct task_struct *tsk = current; \
26902 struct wait_queue wait = { tsk, NULL };
26903
26904 #define DOWN_HEAD(task_state) \
26905 \
26906 tsk->state = (task_state); \
26907 add_wait_queue(&sem->wait, &wait); \
26908 \
26909 /* Ok, we're set up. sem->count is known to be less \
26910 * than zero so we must wait. \
26911 * \
26912 * We can let go the lock for purposes of waiting. \
26913 * We re-acquire it after awaking so as to protect \
26914 * all semaphore operations. \
26915 * \
26916 * If "up()" is called before we call \
26917 * waking_non_zero() then we will catch it right away.\
26918 * If it is called later then we will have to go \
26919 * through a wakeup cycle to catch it. \
26920 * \
26921 * Multiple waiters contend for the semaphore lock to \
26922 * see who gets to gate through and who has to wait \
26923 * some more. */ \
26924 for (;;) {
26925
26926 #define DOWN_TAIL(task_state) \
26927 tsk->state = (task_state); \
26928 } \
26929 tsk->state = TASK_RUNNING; \
26930 remove_wait_queue(&sem->wait, &wait);
26931
26932 void __down(struct semaphore * sem)
26933 {
26934 DOWN_VAR
26935 DOWN_HEAD(TASK_UNINTERRUPTIBLE)
26936 if (waking_non_zero(sem))
26937 break;
26938 schedule();
26939 DOWN_TAIL(TASK_UNINTERRUPTIBLE)
26940 }
26941
26942 int __down_interruptible(struct semaphore * sem)
26943 {
26944 DOWN_VAR
26945 int ret = 0;
26946 DOWN_HEAD(TASK_INTERRUPTIBLE)
26947
26948 ret = waking_non_zero_interruptible(sem, tsk);
26949 if (ret)
26950 {
26951 if (ret == 1)
26952 /* ret != 0 only if we get interrupted -arca */
26953 ret = 0;
26954 break;
26955 }
26956 schedule();
26957 DOWN_TAIL(TASK_INTERRUPTIBLE)
26958 return ret;
26959 }
26960
26961 int __down_trylock(struct semaphore * sem)
26962 {
26963 return waking_non_zero_trylock(sem);
26964 }
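
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the contended paths above are normally reached through the inline
 * down()/up() wrappers in <asm/semaphore.h>; the MUTEX initializer
 * is assumed to match this kernel era, and the my_* names are
 * hypothetical. */
static struct semaphore my_sem = MUTEX;		/* count starts at 1 */

static int my_critical_section(void)
{
	if (down_interruptible(&my_sem))
		return -EINTR;	/* a signal interrupted the wait */
	/* ... exclusive access; a second caller sleeps in
	 * __down_interruptible() above until we release ... */
	up(&my_sem);		/* calls __up() only if someone waits */
	return 0;
}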
26965
26966 #define SLEEP_ON_VAR \
26967 unsigned long flags; \
26968 struct wait_queue wait;
26969
26970 #define SLEEP_ON_HEAD \
26971 wait.task = current; \
26972 write_lock_irqsave(&waitqueue_lock, flags); \
26973 __add_wait_queue(p, &wait); \
26974 write_unlock(&waitqueue_lock);
26975
26976 #define SLEEP_ON_TAIL \
26977 write_lock_irq(&waitqueue_lock); \
26978 __remove_wait_queue(p, &wait); \
26979 write_unlock_irqrestore(&waitqueue_lock, flags);
26980
26981 void interruptible_sleep_on(struct wait_queue **p)
26982 {
26983 SLEEP_ON_VAR
26984
26985 current->state = TASK_INTERRUPTIBLE;
26986
26987 SLEEP_ON_HEAD
26988 schedule();
26989 SLEEP_ON_TAIL
26990 }
26991
26992 long interruptible_sleep_on_timeout(
26993 struct wait_queue **p, long timeout)
26994 {
26995 SLEEP_ON_VAR
26996
26997 current->state = TASK_INTERRUPTIBLE;
26998
26999 SLEEP_ON_HEAD
27000 timeout = schedule_timeout(timeout);
27001 SLEEP_ON_TAIL
27002
27003 return timeout;
27004 }
27005
27006 void sleep_on(struct wait_queue **p)
27007 {
27008 SLEEP_ON_VAR
27009
27010 current->state = TASK_UNINTERRUPTIBLE;
27011
27012 SLEEP_ON_HEAD
27013 schedule();
27014 SLEEP_ON_TAIL
27015 }
27016
27017 long sleep_on_timeout(struct wait_queue **p,
27018 long timeout)
27019 {
27020 SLEEP_ON_VAR
27021
27022 current->state = TASK_UNINTERRUPTIBLE;
27023
27024 SLEEP_ON_HEAD
27025 timeout = schedule_timeout(timeout);
27026 SLEEP_ON_TAIL
27027
27028 return timeout;
27029 }
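
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the classic 2.2-era pairing of the sleep_on helpers above with the
 * wake_up_interruptible() macro from <linux/sched.h>, which ends up
 * in __wake_up(). The my_* names are hypothetical; note that the
 * check/sleep window makes this pattern safe only when a missed
 * wakeup is tolerable or excluded by other means. */
static struct wait_queue *my_waitq = NULL;
static int my_condition = 0;

static void my_wait_for_event(void)
{
	while (!my_condition)		/* re-check after every wakeup */
		interruptible_sleep_on(&my_waitq);
}

static void my_post_event(void)
{
	my_condition = 1;
	wake_up_interruptible(&my_waitq);	/* wakes all sleepers */
}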
27030
27031 void scheduling_functions_end_here(void) { }
27032
27033 static inline void cascade_timers(struct timer_vec *tv)
27034 {
27035 /* cascade all the timers from tv up one level */
27036 struct timer_list *timer;
27037 timer = tv->vec[tv->index];
27038 /* We are removing _all_ timers from the list, so we
27039 * don't have to detach them individually, just clear
27040 * the list afterwards. */
27041 while (timer) {
27042 struct timer_list *tmp = timer;
27043 timer = timer->next;
27044 internal_add_timer(tmp);
27045 }
27046 tv->vec[tv->index] = NULL;
27047 tv->index = (tv->index + 1) & TVN_MASK;
27048 }
27049
27050 static inline void run_timer_list(void)
27051 {
27052 spin_lock_irq(&timerlist_lock);
27053 while ((long)(jiffies - timer_jiffies) >= 0) {
27054 struct timer_list *timer;
27055 if (!tv1.index) {
27056 int n = 1;
27057 do {
27058 cascade_timers(tvecs[n]);
27059 } while (tvecs[n]->index == 1 && ++n < NOOF_TVECS);
27060 }
27061 while ((timer = tv1.vec[tv1.index])) {
27062 void (*fn)(unsigned long) = timer->function;
27063 unsigned long data = timer->data;
27064 detach_timer(timer);
27065 timer->next = timer->prev = NULL;
27066 spin_unlock_irq(&timerlist_lock);
27067 fn(data);
27068 spin_lock_irq(&timerlist_lock);
27069 }
27070 ++timer_jiffies;
27071 tv1.index = (tv1.index + 1) & TVR_MASK;
27072 }
27073 spin_unlock_irq(&timerlist_lock);
27074 }
27075
27076
27077 static inline void run_old_timers(void)
27078 {
27079 struct timer_struct *tp;
27080 unsigned long mask;
27081
27082 for (mask = 1, tp = timer_table+0; mask;
27083 tp++,mask += mask) {
27084 if (mask > timer_active)
27085 break;
27086 if (!(mask & timer_active))
27087 continue;
27088 if (time_after(tp->expires, jiffies))
27089 continue;
27090 timer_active &= ~mask;
27091 tp->fn();
27092 sti();
27093 }
27094 }
27095
27096 spinlock_t tqueue_lock;
27097
27098 void tqueue_bh(void)
27099 {
27100 run_task_queue(&tq_timer);
27101 }
27102
27103 void immediate_bh(void)
27104 {
27105 run_task_queue(&tq_immediate);
27106 }
27107
27108 unsigned long timer_active = 0;
27109 struct timer_struct timer_table[32];
27110
27111 /* Hmm.. Changed this, as the GNU make sources (load.c)
27112 * seem to imply that avenrun[] is the standard name for
27113 * this kind of thing. Nothing else seems to be
27114 * standardized: the fractional size etc all seem to
27115 * differ on different machines. */
27116 unsigned long avenrun[3] = { 0,0,0 };
27117
27118 /* Nr of active tasks - counted in fixed-point numbers */
27119 static unsigned long count_active_tasks(void)
27120 {
27121 struct task_struct *p;
27122 unsigned long nr = 0;
27123
27124 read_lock(&tasklist_lock);
27125 for_each_task(p) {
27126 if ((p->state == TASK_RUNNING ||
27127 p->state == TASK_UNINTERRUPTIBLE ||
27128 p->state == TASK_SWAPPING))
27129 nr += FIXED_1;
27130 }
27131 read_unlock(&tasklist_lock);
27132 return nr;
27133 }
27134
27135 static inline void calc_load(unsigned long ticks)
27136 {
27137 unsigned long active_tasks; /* fixed-point */
27138 static int count = LOAD_FREQ;
27139
27140 count -= ticks;
27141 if (count < 0) {
27142 count += LOAD_FREQ;
27143 active_tasks = count_active_tasks();
27144 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
27145 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
27146 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
27147 }
27148 }
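
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming the usual <linux/sched.h> constants FSHIFT = 11,
 * FIXED_1 = 2048, LOAD_FREQ = 5*HZ and EXP_1 = 1884. CALC_LOAD
 * expands to load = (load*exp + n*(FIXED_1 - exp)) >> FSHIFT, so
 * starting from avenrun[0] = 0 with one runnable task (n = 2048),
 * the first 5-second sample gives (2048 * 164) >> 11 = 164, i.e. a
 * 1-minute load average of 164/2048 ~= 0.08, which then converges
 * exponentially toward 1.0 while the task stays runnable. */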
27149
27150 /* this routine handles the overflow of the microsecond
27151 * field
27152 *
27153 * The tricky bits of code to handle the accurate clock
27154 * support were provided by Dave Mills (Mills@UDEL.EDU)
27155 * of NTP fame. They were originally developed for SUN
27156 * and DEC kernels. All the kudos should go to Dave for
27157 * this stuff. */
27158 static void second_overflow(void)
27159 {
27160 long ltemp;
27161
27162 /* Bump the maxerror field */
27163 time_maxerror += time_tolerance >> SHIFT_USEC;
27164 if ( time_maxerror > NTP_PHASE_LIMIT ) {
27165 time_maxerror = NTP_PHASE_LIMIT;
27166 time_status |= STA_UNSYNC;
27167 }
27168
27169 /* Leap second processing. If in leap-insert state at
27170 * the end of the day, the system clock is set back one
27171 * second; if in leap-delete state, the system clock is
27172 * set ahead one second. The microtime() routine or
27173 * external clock driver will insure that reported time
27174 * is always monotonic. The ugly divides should be
27175 * replaced. */
27176 switch (time_state) {
27177
27178 case TIME_OK:
27179 if (time_status & STA_INS)
27180 time_state = TIME_INS;
27181 else if (time_status & STA_DEL)
27182 time_state = TIME_DEL;
27183 break;
27184
27185 case TIME_INS:
27186 if (xtime.tv_sec % 86400 == 0) {
27187 xtime.tv_sec--;
27188 time_state = TIME_OOP;
27189 printk(KERN_NOTICE "Clock: "
27190 "inserting leap second 23:59:60 UTC\n");
27191 }
27192 break;
27193
27194 case TIME_DEL:
27195 if ((xtime.tv_sec + 1) % 86400 == 0) {
27196 xtime.tv_sec++;
27197 time_state = TIME_WAIT;
27198 printk(KERN_NOTICE "Clock: "
27199 "deleting leap second 23:59:59 UTC\n");
27200 }
27201 break;
27202
27203 case TIME_OOP:
27204 time_state = TIME_WAIT;
27205 break;
27206
27207 case TIME_WAIT:
27208 if (!(time_status & (STA_INS | STA_DEL)))
27209 time_state = TIME_OK;
27210 }
27211
27212 /* Compute the phase adjustment for the next second. In
27213 * PLL mode, the offset is reduced by a fixed factor
27214 * times the time constant. In FLL mode the offset is
27215 * used directly. In either mode, the maximum phase
27216 * adjustment for each second is clamped so as to
27217 * spread the adjustment over not more than the number
27218 * of seconds between updates. */
27219 if (time_offset < 0) {
27220 ltemp = -time_offset;
27221 if (!(time_status & STA_FLL))
27222 ltemp >>= SHIFT_KG + time_constant;
27223 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
27224 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
27225 time_offset += ltemp;
27226 time_adj = -ltemp <<
27227 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
27228 } else {
27229 ltemp = time_offset;
27230 if (!(time_status & STA_FLL))
27231 ltemp >>= SHIFT_KG + time_constant;
27232 if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
27233 ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
27234 time_offset -= ltemp;
27235 time_adj = ltemp <<
27236 (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
27237 }
27238
27239 /* Compute the frequency estimate and additional phase
27240 * adjustment due to frequency error for the next
27241 * second. When the PPS signal is engaged, gnaw on the
27242 * watchdog counter and update the frequency computed
27243 * by the pll and the PPS signal. */
27244 pps_valid++;
27245 if (pps_valid == PPS_VALID) { /* PPS signal lost */
27246 pps_jitter = MAXTIME;
27247 pps_stabil = MAXFREQ;
27248 time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
27249 STA_PPSWANDER | STA_PPSERROR);
27250 }
27251 ltemp = time_freq + pps_freq;
27252 if (ltemp < 0)
27253 time_adj -= -ltemp >>
27254 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
27255 else
27256 time_adj += ltemp >>
27257 (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
27258
27259 #if HZ == 100
27260 /* Compensate for (HZ==100) != (1 << SHIFT_HZ). Add
27261 * 25% and 3.125% to get 128.125; => only 0.125% error
27262 * (p. 14) */
27263 if (time_adj < 0)
27264 time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
27265 else
27266 time_adj += (time_adj >> 2) + (time_adj >> 5);
27267 #endif
27268 }
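
/* Editor's note (not from the kernel source): the HZ==100 block
 * above compensates for the fact that the shifts in this function
 * divide by 2^SHIFT_HZ = 128 rather than by HZ = 100 (SHIFT_HZ is 7
 * for HZ=100 in <linux/timex.h>). Multiplying by
 * 1 + 1/4 + 1/32 = 1.28125 changes the effective divisor to
 * 128/1.28125 ~= 99.9, within a small fraction of a percent of the
 * true HZ; that is the residual error the comment quotes. */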
27269
27270 /* in the NTP reference this is called "hardclock()" */
27271 static void update_wall_time_one_tick(void)
27272 {
27273 if ( (time_adjust_step = time_adjust) != 0 ) {
27274 /* We are doing an adjtime thing.
27275 *
27276 * Prepare time_adjust_step to be within bounds.
27277 * Note that a positive time_adjust means we want the
27278 * clock to run faster.
27279 *
27280 * Limit the amount of the step to be in the range
27281 * -tickadj .. +tickadj */
27282 if (time_adjust > tickadj)
27283 time_adjust_step = tickadj;
27284 else if (time_adjust < -tickadj)
27285 time_adjust_step = -tickadj;
27286
27287 /* Reduce by this step the amount of time left */
27288 time_adjust -= time_adjust_step;
27289 }
27290 xtime.tv_usec += tick + time_adjust_step;
27291 /* Advance the phase, once it gets to one microsecond,
27292 * then advance the tick more. */
27293 time_phase += time_adj;
27294 if (time_phase <= -FINEUSEC) {
27295 long ltemp = -time_phase >> SHIFT_SCALE;
27296 time_phase += ltemp << SHIFT_SCALE;
27297 xtime.tv_usec -= ltemp;
27298 }
27299 else if (time_phase >= FINEUSEC) {
27300 long ltemp = time_phase >> SHIFT_SCALE;
27301 time_phase -= ltemp << SHIFT_SCALE;
27302 xtime.tv_usec += ltemp;
27303 }
27304 }
27305
27306 /* Using a loop looks inefficient, but "ticks" is usually
27307 * just one (we shouldn't be losing ticks, we're doing
27308 * it this way mainly for interrupt latency reasons,
27309 * not because we think we'll have lots of lost timer
27310 * ticks). */
27311 static void update_wall_time(unsigned long ticks)
27312 {
27313 do {
27314 ticks--;
27315 update_wall_time_one_tick();
27316 } while (ticks);
27317
27318 if (xtime.tv_usec >= 1000000) {
27319 xtime.tv_usec -= 1000000;
27320 xtime.tv_sec++;
27321 second_overflow();
27322 }
27323 }
27324
27325 static inline void do_process_times(
27326 struct task_struct *p, unsigned long user,
27327 unsigned long system)
27328 {
27329 long psecs;
27330
27331 psecs = (p->times.tms_utime += user);
27332 psecs += (p->times.tms_stime += system);
27333 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_cur) {
27334 /* Send SIGXCPU every second.. */
27335 if (!(psecs % HZ))
27336 send_sig(SIGXCPU, p, 1);
27337 /* and SIGKILL when we go over max.. */
27338 if (psecs / HZ > p->rlim[RLIMIT_CPU].rlim_max)
27339 send_sig(SIGKILL, p, 1);
27340 }
27341 }
27342
27343 static inline void do_it_virt(struct task_struct * p,
27344 unsigned long ticks)
27345 {
27346 unsigned long it_virt = p->it_virt_value;
27347
27348 if (it_virt) {
27349 if (it_virt <= ticks) {
27350 it_virt = ticks + p->it_virt_incr;
27351 send_sig(SIGVTALRM, p, 1);
27352 }
27353 p->it_virt_value = it_virt - ticks;
27354 }
27355 }
27356
27357 static inline void do_it_prof(struct task_struct * p,
27358 unsigned long ticks)
27359 {
27360 unsigned long it_prof = p->it_prof_value;
27361
27362 if (it_prof) {
27363 if (it_prof <= ticks) {
27364 it_prof = ticks + p->it_prof_incr;
27365 send_sig(SIGPROF, p, 1);
27366 }
27367 p->it_prof_value = it_prof - ticks;
27368 }
27369 }
27370
27371 void update_one_process(struct task_struct *p,
27372 unsigned long ticks, unsigned long user,
27373 unsigned long system, int cpu)
27374 {
27375 p->per_cpu_utime[cpu] += user;
27376 p->per_cpu_stime[cpu] += system;
27377 do_process_times(p, user, system);
27378 do_it_virt(p, user);
27379 do_it_prof(p, ticks);
27380 }
27381
27382 static void update_process_times(unsigned long ticks,
27383 unsigned long system)
27384 {
27385 /* SMP does this on a per-CPU basis elsewhere */
27386 #ifndef __SMP__
27387 struct task_struct * p = current;
27388 unsigned long user = ticks - system;
27389 if (p->pid) {
27390 p->counter -= ticks;
27391 if (p->counter < 0) {
27392 p->counter = 0;
27393 p->need_resched = 1;
27394 }
27395 if (p->priority < DEF_PRIORITY)
27396 kstat.cpu_nice += user;
27397 else
27398 kstat.cpu_user += user;
27399 kstat.cpu_system += system;
27400 }
27401 update_one_process(p, ticks, user, system, 0);
27402 #endif
27403 }
27404
27405 volatile unsigned long lost_ticks = 0;
27406 static unsigned long lost_ticks_system = 0;
27407
27408 /* This lock protects us from races in SMP while
27409 * playing with xtime. -arca */
27410 rwlock_t xtime_lock = RW_LOCK_UNLOCKED;
27411
27412 static inline void update_times(void)
27413 {
27414 unsigned long ticks;
27415
27416 /* update_times() is run from the raw timer_bh handler
27417 * so we just know that the irqs are locally enabled
27418 * and so we don't need to save/restore the flags of
27419 * the local CPU here. -arca */
27420 write_lock_irq(&xtime_lock);
27421
27422 ticks = lost_ticks;
27423 lost_ticks = 0;
27424
27425 if (ticks) {
27426 unsigned long system;
27427 system = xchg(&lost_ticks_system, 0);
27428
27429 calc_load(ticks);
27430 update_wall_time(ticks);
27431 write_unlock_irq(&xtime_lock);
27432
27433 update_process_times(ticks, system);
27434
27435 } else
27436 write_unlock_irq(&xtime_lock);
27437 }
27438
27439 static void timer_bh(void)
27440 {
27441 update_times();
27442 run_old_timers();
27443 run_timer_list();
27444 }
27445
27446 void do_timer(struct pt_regs * regs)
27447 {
27448 (*(unsigned long *)&jiffies)++;
27449 lost_ticks++;
27450 mark_bh(TIMER_BH);
27451 if (!user_mode(regs))
27452 lost_ticks_system++;
27453 if (tq_timer)
27454 mark_bh(TQUEUE_BH);
27455 }
27456
27457 #ifndef __alpha__
27458
27459 /* For backwards compatibility? This can be done in libc
27460 * so Alpha and all newer ports shouldn't need it. */
27461 asmlinkage unsigned int sys_alarm(unsigned int seconds)
27462 {
27463 struct itimerval it_new, it_old;
27464 unsigned int oldalarm;
27465
27466 it_new.it_interval.tv_sec = it_new.it_interval.tv_usec
27467 = 0;
27468 it_new.it_value.tv_sec = seconds;
27469 it_new.it_value.tv_usec = 0;
27470 do_setitimer(ITIMER_REAL, &it_new, &it_old);
27471 oldalarm = it_old.it_value.tv_sec;
27472 /* ehhh.. We can't return 0 if we have an alarm
27473 * pending.. And we'd better return too much than too
27474 * little anyway */
27475 if (it_old.it_value.tv_usec)
27476 oldalarm++;
27477 return oldalarm;
27478 }
27479
27480 /* The Alpha uses getxpid, getxuid, and getxgid instead.
27481 * Maybe this should be moved into arch/i386 instead? */
27482
27483 asmlinkage int sys_getpid(void)
27484 {
27485 /* This is SMP safe - current->pid doesn't change */
27486 return current->pid;
27487 }
27488
27489 /* This is not strictly SMP safe: p_opptr could change
27490 * from under us. However, rather than getting any lock
27491 * we can use an optimistic algorithm: get the parent
27492 * pid, and go back and check that the parent is still
27493 * the same. If it has changed (which is extremely
27494 * unlikely indeed), we just try again..
27495 *
27496 * NOTE! This depends on the fact that even if we _do_
27497 * get an old value of "parent", we can happily
27498 * dereference the pointer: we just can't necessarily
27499 * trust the result until we know that the parent pointer
27500 * is valid.
27501 *
27502 * The "mb()" macro is a memory barrier - a synchronizing
27503 * event. It also makes sure that gcc doesn't optimize
27504 * away the necessary memory references.. The barrier
27505 * doesn't have to have all that strong semantics: on x86
27506 * we don't really require a synchronizing instruction,
27507 * for example. The barrier is more important for code
27508 * generation than for any real memory ordering semantics
27509 * (even if there is a small window for a race, using the
27510 * old pointer is harmless for a while). */
27511 asmlinkage int sys_getppid(void)
27512 {
27513 int pid;
27514 struct task_struct * me = current;
27515 struct task_struct * parent;
27516
27517 parent = me->p_opptr;
27518 for (;;) {
27519 pid = parent->pid;
27520 #if __SMP__
27521 {
27522 struct task_struct *old = parent;
27523 mb();
27524 parent = me->p_opptr;
27525 if (old != parent)
27526 continue;
27527 }
27528 #endif
27529 break;
27530 }
27531 return pid;
27532 }
27533
27534 asmlinkage int sys_getuid(void)
27535 {
27536 /* Only we change this so SMP safe */
27537 return current->uid;
27538 }
27539
27540 asmlinkage int sys_geteuid(void)
27541 {
27542 /* Only we change this so SMP safe */
27543 return current->euid;
27544 }
27545
27546 asmlinkage int sys_getgid(void)
27547 {
27548 /* Only we change this so SMP safe */
27549 return current->gid;
27550 }
27551
27552 asmlinkage int sys_getegid(void)
27553 {
27554 /* Only we change this so SMP safe */
27555 return current->egid;
27556 }
27557
27558 /* This has been replaced by sys_setpriority. Maybe it
27559 * should be moved into the arch dependent tree for those
27560 * ports that require it for backward compatibility? */
27561
27562 asmlinkage int sys_nice(int increment)
27563 {
27564 unsigned long newprio;
27565 int increase = 0;
27566
27567 /* Setpriority might change our priority at the same
27568 * moment. We don't have to worry. Conceptually one
27569 * call occurs first and we have a single winner. */
27570
27571 newprio = increment;
27572 if (increment < 0) {
27573 if (!capable(CAP_SYS_NICE))
27574 return -EPERM;
27575 newprio = -increment;
27576 increase = 1;
27577 }
27578
27579 if (newprio > 40)
27580 newprio = 40;
27581 /* Do a "normalization" of the priority: traditionally
27582 * Unix nice values are -20 to 20; Linux doesn't really
27583 * use that kind of thing, but uses the length of the
27584 * timeslice instead (default 210 ms). The rounding is
27585 * why we want to avoid negative values. */
27586 newprio = (newprio * DEF_PRIORITY + 10) / 20;
27587 increment = newprio;
27588 if (increase)
27589 increment = -increment;
27590 /* Current->priority can change between this point and
27591 * the assignment. We are assigning, not doing adds or
27592 * subtracts, so that's OK. Conceptually a process might
27593 * instantaneously read the value we stomp over. I
27594 * don't think that is an issue unless posix makes it
27595 * one. If so we can loop on changes to
27596 * current->priority. */
27597 newprio = current->priority - increment;
27598 if ((signed) newprio < 1)
27599 newprio = 1;
27600 if (newprio > DEF_PRIORITY*2)
27601 newprio = DEF_PRIORITY*2;
27602 current->priority = newprio;
27603 return 0;
27604 }
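
/* Illustrative worked example (editor's sketch, not from the kernel
 * source), assuming HZ=100 so that DEF_PRIORITY is 20: nice(10)
 * gives newprio = (10*20 + 10) / 20 = 10, so a task at the default
 * priority of 20 drops to 20 - 10 = 10, i.e. half the default
 * timeslice. nice(-10) requires CAP_SYS_NICE and raises it to 30;
 * the final clamp keeps priority within 1..DEF_PRIORITY*2 (40). */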
27605
27606 #endif
27607
27608 static inline struct task_struct *
27609 find_process_by_pid(pid_t pid)
27610 {
27611 struct task_struct *tsk = current;
27612
27613 if (pid)
27614 tsk = find_task_by_pid(pid);
27615 return tsk;
27616 }
27617
27618 static int setscheduler(pid_t pid, int policy,
27619 struct sched_param *param)
27620 {
27621 struct sched_param lp;
27622 struct task_struct *p;
27623 int retval;
27624
27625 retval = -EINVAL;
27626 if (!param || pid < 0)
27627 goto out_nounlock;
27628
27629 retval = -EFAULT;
27630 if (copy_from_user(&lp, param,
27631 sizeof(struct sched_param)))
27632 goto out_nounlock;
27633
27634 /* We play safe to avoid deadlocks. */
27635 spin_lock(&scheduler_lock);
27636 spin_lock_irq(&runqueue_lock);
27637 read_lock(&tasklist_lock);
27638
27639 p = find_process_by_pid(pid);
27640
27641 retval = -ESRCH;
27642 if (!p)
27643 goto out_unlock;
27644
27645 if (policy < 0)
27646 policy = p->policy;
27647 else {
27648 retval = -EINVAL;
27649 if (policy != SCHED_FIFO && policy != SCHED_RR &&
27650 policy != SCHED_OTHER)
27651 goto out_unlock;
27652 }
27653
27654 /* Valid priorities for SCHED_FIFO and SCHED_RR are
27655 * 1..99, valid priority for SCHED_OTHER is 0. */
27656 retval = -EINVAL;
27657 if (lp.sched_priority < 0 || lp.sched_priority > 99)
27658 goto out_unlock;
27659 if((policy == SCHED_OTHER) != (lp.sched_priority == 0))
27660 goto out_unlock;
27661
27662 retval = -EPERM;
27663 if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
27664 !capable(CAP_SYS_NICE))
27665 goto out_unlock;
27666 if ((current->euid != p->euid) &&
27667 (current->euid != p->uid) &&
27668 !capable(CAP_SYS_NICE))
27669 goto out_unlock;
27670
27671 retval = 0;
27672 p->policy = policy;
27673 p->rt_priority = lp.sched_priority;
27674 if (p->next_run)
27675 move_first_runqueue(p);
27676
27677 current->need_resched = 1;
27678
27679 out_unlock:
27680 read_unlock(&tasklist_lock);
27681 spin_unlock_irq(&runqueue_lock);
27682 spin_unlock(&scheduler_lock);
27683
27684 out_nounlock:
27685 return retval;
27686 }
27687
27688 asmlinkage int sys_sched_setscheduler(pid_t pid,
27689 int policy, struct sched_param *param)
27690 {
27691 return setscheduler(pid, policy, param);
27692 }
27693
27694 asmlinkage int sys_sched_setparam(pid_t pid,
27695 struct sched_param *param)
27696 {
27697 return setscheduler(pid, -1, param);
27698 }
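
/* Illustrative usage (editor's sketch, not from the kernel source):
 * the user-space view of the two system calls above, via the POSIX
 * wrappers declared in <sched.h>. Priority 50 is an arbitrary value
 * in the 1..99 range that setscheduler() accepts for the real-time
 * policies.
 *
 *	#include <sched.h>
 *	#include <stdio.h>
 *
 *	struct sched_param param;
 *	param.sched_priority = 50;
 *	if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
 *		perror("sched_setscheduler");
 */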
27699
27700 asmlinkage int sys_sched_getscheduler(pid_t pid)
27701 {
27702 struct task_struct *p;
27703 int retval;
27704
27705 retval = -EINVAL;
27706 if (pid < 0)
27707 goto out_nounlock;
27708
27709 read_lock(&tasklist_lock);
27710
27711 retval = -ESRCH;
27712 p = find_process_by_pid(pid);
27713 if (!p)
27714 goto out_unlock;
27715
27716 retval = p->policy;
27717
27718 out_unlock:
27719 read_unlock(&tasklist_lock);
27720
27721 out_nounlock:
27722 return retval;
27723 }
27724
27725 asmlinkage int sys_sched_getparam(pid_t pid,
27726 struct sched_param *param)
27727 {
27728 struct task_struct *p;
27729 struct sched_param lp;
27730 int retval;
27731
27732 retval = -EINVAL;
27733 if (!param || pid < 0)
27734 goto out_nounlock;
27735
27736 read_lock(&tasklist_lock);
27737 p = find_process_by_pid(pid);
27738 retval = -ESRCH;
27739 if (!p)
27740 goto out_unlock;
27741 lp.sched_priority = p->rt_priority;
27742 read_unlock(&tasklist_lock);
27743
27744 /* This one might sleep, we cannot do it with a
27745 * spinlock held ... */
27746 retval = copy_to_user(param, &lp,
27747 sizeof(*param)) ? -EFAULT : 0;
27748
27749 out_nounlock:
27750 return retval;
27751
27752 out_unlock:
27753 read_unlock(&tasklist_lock);
27754 return retval;
27755 }
27756
27757 asmlinkage int sys_sched_yield(void)
27758 {
27759 spin_lock(&scheduler_lock);
27760 spin_lock_irq(&runqueue_lock);
27761 if (current->policy == SCHED_OTHER)
27762 current->policy |= SCHED_YIELD;
27763 current->need_resched = 1;
27764 move_last_runqueue(current);
27765 spin_unlock_irq(&runqueue_lock);
27766 spin_unlock(&scheduler_lock);
27767 return 0;
27768 }
27769
27770 asmlinkage int sys_sched_get_priority_max(int policy)
27771 {
27772 int ret = -EINVAL;
27773
27774 switch (policy) {
27775 case SCHED_FIFO:
27776 case SCHED_RR:
27777 ret = 99;
27778 break;
27779 case SCHED_OTHER:
27780 ret = 0;
27781 break;
27782 }
27783 return ret;
27784 }
27785
27786 asmlinkage int sys_sched_get_priority_min(int policy)
27787 {
27788 int ret = -EINVAL;
27789
27790 switch (policy) {
27791 case SCHED_FIFO:
27792 case SCHED_RR:
27793 ret = 1;
27794 break;
27795 case SCHED_OTHER:
27796 ret = 0;
27797 }
27798 return ret;
27799 }
27800
27801 asmlinkage int sys_sched_rr_get_interval(pid_t pid,
27802 struct timespec *interval)
27803 {
27804 struct timespec t;
27805
27806 t.tv_sec = 0;
27807 t.tv_nsec = 150000;
27808 if (copy_to_user(interval, &t,
27809 sizeof(struct timespec)))
27810 return -EFAULT;
27811 return 0;
27812 }
27813
27814 asmlinkage int sys_nanosleep(struct timespec *rqtp,
27815 struct timespec *rmtp)
27816 {
27817 struct timespec t;
27818 unsigned long expire;
27819
27820 if (copy_from_user(&t, rqtp, sizeof(struct timespec)))
27821 return -EFAULT;
27822
27823 if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0 ||
27824 t.tv_sec < 0)
27825 return -EINVAL;
27826
27827
27828 if (t.tv_sec == 0 && t.tv_nsec <= 2000000L &&
27829 current->policy != SCHED_OTHER)
27830 {
27831 /* Short delay requests up to 2 ms will be handled
27832 * with high precision by a busy wait for all
27833 * real-time processes.
27834 *
27835 * It's important on SMP not to do this holding
27836 * locks. */
27837 udelay((t.tv_nsec + 999) / 1000);
27838 return 0;
27839 }
27840
27841 expire =
27842 timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
27843
27844 current->state = TASK_INTERRUPTIBLE;
27845 expire = schedule_timeout(expire);
27846
27847 if (expire) {
27848 if (rmtp) {
27849 jiffies_to_timespec(expire, &t);
27850 if (copy_to_user(rmtp, &t,sizeof(struct timespec)))
27851 return -EFAULT;
27852 }
27853 return -EINTR;
27854 }
27855 return 0;
27856 }
27857
27858 static void show_task(int nr,struct task_struct * p)
27859 {
27860 unsigned long free = 0;
27861 int state;
27862 static const char * stat_nam[] =
27863 { "R", "S", "D", "Z", "T", "W" };
27864
27865 printk("%-8s %3d ",
27866 p->comm, (p == current) ? -nr : nr);
27867 state = p->state ? ffz(~p->state) + 1 : 0;
27868 if (((unsigned) state) <
27869 sizeof(stat_nam)/sizeof(char *))
27870 printk(stat_nam[state]);
27871 else
27872 printk(" ");
27873 #if (BITS_PER_LONG == 32)
27874 if (p == current)
27875 printk(" current ");
27876 else
27877 printk(" %08lX ", thread_saved_pc(&p->tss));
27878 #else
27879 if (p == current)
27880 printk(" current task ");
27881 else
27882 printk(" %016lx ", thread_saved_pc(&p->tss));
27883 #endif
27884 {
27885 unsigned long * n = (unsigned long *) (p+1);
27886 while (!*n)
27887 n++;
27888 free = (unsigned long) n - (unsigned long)(p+1);
27889 }
27890 printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid);
27891 if (p->p_cptr)
27892 printk("%5d ", p->p_cptr->pid);
27893 else
27894 printk(" ");
27895 if (p->p_ysptr)
27896 printk("%7d", p->p_ysptr->pid);
27897 else
27898 printk(" ");
27899 if (p->p_osptr)
27900 printk(" %5d\n", p->p_osptr->pid);
27901 else
27902 printk("\n");
27903
27904 {
27905 struct signal_queue *q;
27906 char s[sizeof(sigset_t)*2+1],b[sizeof(sigset_t)*2+1];
27907
27908 render_sigset_t(&p->signal, s);
27909 render_sigset_t(&p->blocked, b);
27910 printk(" sig: %d %s %s :",
27911 signal_pending(p), s, b);
27912 for (q = p->sigqueue; q ; q = q->next)
27913 printk(" %d", q->info.si_signo);
27914 printk(" X\n");
27915 }
27916 }
27917
27918 char * render_sigset_t(sigset_t *set, char *buffer)
27919 {
27920 int i = _NSIG, x;
27921 do {
27922 i -= 4, x = 0;
27923 if (sigismember(set, i+1)) x |= 1;
27924 if (sigismember(set, i+2)) x |= 2;
27925 if (sigismember(set, i+3)) x |= 4;
27926 if (sigismember(set, i+4)) x |= 8;
27927 *buffer++ = (x < 10 ? '0' : 'a' - 10) + x;
27928 } while (i >= 4);
27929 *buffer = 0;
27930 return buffer;
27931 }
27932
27933 void show_state(void)
27934 {
27935 struct task_struct *p;
27936
27937 #if (BITS_PER_LONG == 32)
27938 printk("\n"
27939 " free "
27940 " sibling\n");
27941 printk(" task PC stack pid father "
27942 "child younger older\n");
27943 #else
27944 printk("\n"
27945 " free "
27946 " sibling\n");
27947 printk(" task PC stack pid "
27948 "father child younger older\n");
27949 #endif
27950 read_lock(&tasklist_lock);
27951 for_each_task(p)
27952 show_task((p->tarray_ptr - &task[0]),p);
27953 read_unlock(&tasklist_lock);
27954 }
27955
27956 void __init sched_init(void)
27957 {
27958 /* We have to do a little magic to get the first
27959 * process right in SMP mode. */
27960 int cpu=hard_smp_processor_id();
27961 int nr = NR_TASKS;
27962
27963 init_task.processor=cpu;
27964
27965 /* Init task array free list and pidhash table. */
27966 while(--nr > 0)
27967 add_free_taskslot(&task[nr]);
27968
27969 for(nr = 0; nr < PIDHASH_SZ; nr++)
27970 pidhash[nr] = NULL;
27971
27972 init_bh(TIMER_BH, timer_bh);
27973 init_bh(TQUEUE_BH, tqueue_bh);
27974 init_bh(IMMEDIATE_BH, immediate_bh);
27975 }