kernel/fork.c
23435 /*
23436 * linux/kernel/fork.c
23437 *
23438 * Copyright (C) 1991, 1992 Linus Torvalds
23439 */
23440
23441 /* 'fork.c' contains the help-routines for the 'fork'
23442 * system call (see also system_call.s). Fork is rather
23443 * simple, once you get the hang of it, but the memory
23444 * management can be a bitch. See 'mm/mm.c':
23445 * 'copy_page_tables()' */
23446
23447 #include <linux/malloc.h>
23448 #include <linux/init.h>
23449 #include <linux/unistd.h>
23450 #include <linux/smp_lock.h>
23451 #include <linux/module.h>
23452 #include <linux/vmalloc.h>
23453
23454 #include <asm/pgtable.h>
23455 #include <asm/mmu_context.h>
23456 #include <asm/uaccess.h>
23457
23458 /* The idle tasks do not count.. */
23459 int nr_tasks=0;
23460 int nr_running=0;
23461
23462 /* Handle normal Linux uptimes. */
23463 unsigned long int total_forks=0;
23464 int last_pid=0;
23465
23466 /* SLAB cache for mm_struct's. */
23467 kmem_cache_t *mm_cachep;
23468
23469 /* SLAB cache for files structs */
23470 kmem_cache_t *files_cachep;
23471
23472 struct task_struct *pidhash[PIDHASH_SZ];
23473
23474 struct task_struct **tarray_freelist = NULL;
23475 spinlock_t taskslot_lock = SPIN_LOCK_UNLOCKED;
23476
23477 /* UID task count cache, to prevent walking entire
23478 * process list every single fork() operation. */
23479 #define UIDHASH_SZ (PIDHASH_SZ >> 2)
23480
23481 static struct user_struct {
23482 atomic_t count;
23483 struct user_struct *next, **pprev;
23484 unsigned int uid;
23485 } *uidhash[UIDHASH_SZ];
23486
23487 spinlock_t uidhash_lock = SPIN_LOCK_UNLOCKED;
23488
23489 kmem_cache_t *uid_cachep;
23490
23491 #define uidhashfn(uid) \
23492 (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))
23493
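The hash folds the high byte of the uid down onto the low bits before masking, so uids that collide in their low bits alone still spread across buckets. A quick stand-alone check of the arithmetic (a 32-bucket table is assumed purely for illustration; the real size derives from PIDHASH_SZ):

#include <stdio.h>

#define UIDHASH_SZ 32   /* illustrative size, not the kernel's */
#define uidhashfn(uid) (((uid >> 8) ^ uid) & (UIDHASH_SZ - 1))

int main(void)
{
        /* 1000 and 1032 share their low five bits, but the fold
         * lands them in different buckets (11 and 12 here) */
        printf("%u %u %u\n", uidhashfn(1000u), uidhashfn(1032u),
               uidhashfn(0u));
        return 0;
}
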
23494 /* These routines must be called with the uidhash
23495 * spinlock held! */
23496 static inline void uid_hash_insert(
23497 struct user_struct *up, unsigned int hashent)
23498 {
23499 if ((up->next = uidhash[hashent]) != NULL)
23500 uidhash[hashent]->pprev = &up->next;
23501 up->pprev = &uidhash[hashent];
23502 uidhash[hashent] = up;
23503 }
23504
23505 static inline void uid_hash_remove(
23506 struct user_struct *up)
23507 {
23508 if (up->next)
23509 up->next->pprev = up->pprev;
23510 *up->pprev = up->next;
23511 }
23512
23513 static inline struct user_struct *uid_hash_find(
23514 unsigned short uid, unsigned int hashent)
23515 {
23516 struct user_struct *up, *next;
23517
23518 next = uidhash[hashent];
23519 for (;;) {
23520 up = next;
23521 if (next) {
23522 next = up->next;
23523 if (up->uid != uid)
23524 continue;
23525 atomic_inc(&up->count);
23526 }
23527 break;
23528 }
23529 return up;
23530 }
23531
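The pprev field in these routines points at whatever pointer currently points at the node, whether that is the bucket head itself or another node's next field. That is what lets uid_hash_remove() unlink in O(1) with no bucket scan and no special case for the first element. A minimal user-space sketch of the same pattern (a single list stands in for one hash bucket; the names are illustrative):

#include <stdio.h>

struct node {
        int key;
        struct node *next, **pprev; /* points at whatever points at us */
};

static struct node *head;

static void hash_insert(struct node *n)
{
        if ((n->next = head) != NULL)
                head->pprev = &n->next;
        n->pprev = &head;
        head = n;
}

static void hash_remove(struct node *n)
{
        /* *pprev is either 'head' or some node's next field --
         * either way, no head special case is needed */
        if (n->next)
                n->next->pprev = n->pprev;
        *n->pprev = n->next;
}

int main(void)
{
        struct node a = { 1 }, b = { 2 };
        hash_insert(&a);
        hash_insert(&b);
        hash_remove(&a);                     /* unlink a non-head node */
        printf("head key: %d\n", head->key); /* prints 2 */
        return 0;
}
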
23532 void free_uid(struct task_struct *p)
23533 {
23534 struct user_struct *up = p->user;
23535
23536 if (up) {
23537 p->user = NULL;
23538 if (atomic_dec_and_test(&up->count)) {
23539 spin_lock(&uidhash_lock);
23540 uid_hash_remove(up);
23541 spin_unlock(&uidhash_lock);
23542 kmem_cache_free(uid_cachep, up);
23543 }
23544 }
23545 }
23546
23547 int alloc_uid(struct task_struct *p)
23548 {
23549 unsigned int hashent = uidhashfn(p->uid);
23550 struct user_struct *up;
23551
23552 spin_lock(&uidhash_lock);
23553 up = uid_hash_find(p->uid, hashent);
23554 spin_unlock(&uidhash_lock);
23555
23556 if (!up) {
23557 struct user_struct *new;
23558
23559 new = kmem_cache_alloc(uid_cachep, SLAB_KERNEL);
23560 if (!new)
23561 return -EAGAIN;
23562 new->uid = p->uid;
23563 atomic_set(&new->count, 1);
23564
23565 /* Before adding this, check whether we raced on
23566 * adding the same user already.. */
23567 spin_lock(&uidhash_lock);
23568 up = uid_hash_find(p->uid, hashent);
23569 if (up) {
23570 kmem_cache_free(uid_cachep, new);
23571 } else {
23572 uid_hash_insert(new, hashent);
23573 up = new;
23574 }
23575 spin_unlock(&uidhash_lock);
23576
23577 }
23578 p->user = up;
23579 return 0;
23580 }
23581
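Note the shape of alloc_uid(): the spinlock is dropped around the allocation, since kmem_cache_alloc() with SLAB_KERNEL may sleep, and the hash is searched a second time before inserting. Two processes forking for a brand-new uid can both miss in the first lookup; only the re-check under the lock decides whose allocation survives. A user-space sketch of this lock/allocate/re-check idiom (single bucket, plain counter instead of atomic_t, illustrative names):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct user {
        unsigned int uid;
        int count;
        struct user *next;
};

static struct user *bucket;            /* one bucket stands in for uidhash[] */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static struct user *find_locked(unsigned int uid)
{
        struct user *u;
        for (u = bucket; u; u = u->next)
                if (u->uid == uid) {
                        u->count++;
                        return u;
                }
        return NULL;
}

struct user *get_user(unsigned int uid)
{
        pthread_mutex_lock(&lock);
        struct user *u = find_locked(uid);
        pthread_mutex_unlock(&lock);
        if (u)
                return u;

        /* allocate with the lock dropped, as alloc_uid() does ... */
        struct user *fresh = malloc(sizeof(*fresh));
        if (!fresh)
                return NULL;
        fresh->uid = uid;
        fresh->count = 1;

        /* ... then re-check under the lock in case another forker raced us */
        pthread_mutex_lock(&lock);
        u = find_locked(uid);
        if (u)
                free(fresh);            /* lost the race, drop our copy */
        else {
                fresh->next = bucket;
                bucket = u = fresh;
        }
        pthread_mutex_unlock(&lock);
        return u;
}

int main(void)
{
        struct user *a = get_user(1000), *b = get_user(1000);
        printf("same struct: %d, count: %d\n", a == b, a->count); /* 1, 2 */
        return 0;
}
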
23582 void __init uidcache_init(void)
23583 {
23584 int i;
23585
23586 uid_cachep =
23587 kmem_cache_create("uid_cache",
23588 sizeof(struct user_struct),
23589 0, SLAB_HWCACHE_ALIGN, NULL, NULL);
23590 if (!uid_cachep)
23591 panic("Cannot create uid taskcount SLAB cache\n");
23592
23593 for (i = 0; i < UIDHASH_SZ; i++)
23594 uidhash[i] = 0;
23595 }
23596
23597 static inline struct task_struct **
23598 find_empty_process(void)
23599 {
23600 struct task_struct **tslot = NULL;
23601
23602 if ((nr_tasks < NR_TASKS - MIN_TASKS_LEFT_FOR_ROOT) ||
23603 !current->uid)
23604 tslot = get_free_taskslot();
23605 return tslot;
23606 }
23607
23608 /* Protects next_safe and last_pid. */
23609 spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED;
23610
23611 static int get_pid(unsigned long flags)
23612 {
23613 static int next_safe = PID_MAX;
23614 struct task_struct *p;
23615
23616 if (flags & CLONE_PID)
23617 return current->pid;
23618
23619 spin_lock(&lastpid_lock);
23620 if((++last_pid) & 0xffff8000) {
23621 last_pid = 300; /* Skip daemons etc. */
23622 goto inside;
23623 }
23624 if(last_pid >= next_safe) {
23625 inside:
23626 next_safe = PID_MAX;
23627 read_lock(&tasklist_lock);
23628 repeat:
23629 for_each_task(p) {
23630 if(p->pid == last_pid ||
23631 p->pgrp == last_pid ||
23632 p->session == last_pid) {
23633 if(++last_pid >= next_safe) {
23634 if(last_pid & 0xffff8000)
23635 last_pid = 300;
23636 next_safe = PID_MAX;
23637 }
23638 goto repeat;
23639 }
23640 if(p->pid > last_pid && next_safe > p->pid)
23641 next_safe = p->pid;
23642 if(p->pgrp > last_pid && next_safe > p->pgrp)
23643 next_safe = p->pgrp;
23644 if(p->session > last_pid && next_safe > p->session)
23645 next_safe = p->session;
23646 }
23647 read_unlock(&tasklist_lock);
23648 }
23649 spin_unlock(&lastpid_lock);
23650
23651 return last_pid;
23652 }
23653
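The next_safe watermark is what keeps get_pid() cheap: the full tasklist walk runs only when last_pid crosses the recorded safe window, and each walk both skips every pid, pgrp, and session in use and records the nearest in-use value above last_pid as the new window edge. A self-contained model of the algorithm over a plain array (the "in use" values are made up):

#include <stdio.h>

#define PID_MAX 0x8000

static int used[] = { 300, 301, 302, 512, 513, 700, 1, 2 };
static int nused = 8;
static int last_pid = 299, next_safe = 0;   /* 0 forces an initial scan */

static int get_pid_model(void)
{
        int i;

        if (++last_pid >= PID_MAX) {        /* wrapped: skip low "daemon" pids */
                last_pid = 300;
                goto inside;
        }
        if (last_pid >= next_safe) {
inside:
                next_safe = PID_MAX;
repeat:
                for (i = 0; i < nused; i++) {
                        if (used[i] == last_pid) {      /* collision: bump, rescan */
                                if (++last_pid >= next_safe) {
                                        if (last_pid >= PID_MAX)
                                                last_pid = 300;
                                        next_safe = PID_MAX;
                                }
                                goto repeat;
                        }
                        if (used[i] > last_pid && used[i] < next_safe)
                                next_safe = used[i];    /* shrink the window */
                }
        }
        return last_pid;
}

int main(void)
{
        int i;
        /* one scan yields 303, then 304..307 come straight from the window */
        for (i = 0; i < 5; i++)
                printf("allocated pid %d\n", get_pid_model());
        return 0;
}
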
23654 static inline int dup_mmap(struct mm_struct * mm)
23655 {
23656 struct vm_area_struct * mpnt, *tmp, **pprev;
23657 int retval;
23658
23659 flush_cache_mm(current->mm);
23660 pprev = &mm->mmap;
23661 for (mpnt = current->mm->mmap; mpnt;
23662 mpnt = mpnt->vm_next) {
23663 struct file *file;
23664
23665 retval = -ENOMEM;
23666 tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
23667 if (!tmp)
23668 goto fail_nomem;
23669 *tmp = *mpnt;
23670 tmp->vm_flags &= ~VM_LOCKED;
23671 tmp->vm_mm = mm;
23672 mm->map_count++;
23673 tmp->vm_next = NULL;
23674 file = tmp->vm_file;
23675 if (file) {
23676 file->f_count++;
23677 if (tmp->vm_flags & VM_DENYWRITE)
23678 file->f_dentry->d_inode->i_writecount--;
23679
23680 /* insert tmp into the share list, just after mpnt */
23681 if ((tmp->vm_next_share = mpnt->vm_next_share) !=
23682 NULL)
23683 mpnt->vm_next_share->vm_pprev_share =
23684 &tmp->vm_next_share;
23685 mpnt->vm_next_share = tmp;
23686 tmp->vm_pprev_share = &mpnt->vm_next_share;
23687 }
23688
23689 /* Copy the pages, but defer checking for errors */
23690 retval = copy_page_range(mm, current->mm, tmp);
23691 if (!retval && tmp->vm_ops && tmp->vm_ops->open)
23692 tmp->vm_ops->open(tmp);
23693
23694 /* Link in the new vma even if an error occurred, so
23695 * that exit_mmap() can clean up the mess. */
23696 tmp->vm_next = *pprev;
23697 *pprev = tmp;
23698
23699 pprev = &tmp->vm_next;
23700 if (retval)
23701 goto fail_nomem;
23702 }
23703 retval = 0;
23704 if (mm->map_count >= AVL_MIN_MAP_COUNT)
23705 build_mmap_avl(mm);
23706
23707 fail_nomem:
23708 flush_tlb_mm(current->mm);
23709 return retval;
23710 }
23711
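One detail worth noticing above: when copy_page_range() fails, the new vma is still linked into the child's list before the goto, precisely so that the caller's eventual exit_mmap() pass frees the half-copied mapping along with everything else. The shape of that idiom, reduced to a toy list copy (the forced failure is contrived):

#include <stdio.h>
#include <stdlib.h>

struct vma { struct vma *next; int id; };

static void destroy_all(struct vma *head)       /* plays the exit_mmap() role */
{
        while (head) {
                struct vma *n = head->next;
                printf("freeing vma %d\n", head->id);
                free(head);
                head = n;
        }
}

static int dup_list(struct vma *src, struct vma **dst)
{
        struct vma **pprev = dst;

        for (; src; src = src->next) {
                struct vma *tmp = malloc(sizeof(*tmp));
                if (!tmp)
                        return -1;              /* nothing new linked yet */
                tmp->id = src->id;
                tmp->next = NULL;

                /* contrived stand-in for copy_page_range() failing on vma 2 */
                int err = (src->id == 2) ? -1 : 0;

                *pprev = tmp;                   /* link in even on error ... */
                pprev = &tmp->next;
                if (err)
                        return err;             /* ... so cleanup can find it */
        }
        return 0;
}

int main(void)
{
        struct vma c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
        struct vma *copy = NULL;

        if (dup_list(&a, &copy) != 0)
                destroy_all(copy);              /* frees vmas 1 and 2 */
        return 0;
}
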
23712 /* Allocate and initialize an mm_struct.
23713 *
23714 * NOTE! The mm mutex will be locked until the caller
23715 * decides that all systems are go.. */
23716 struct mm_struct * mm_alloc(void)
23717 {
23718 struct mm_struct * mm;
23719
23720 mm = kmem_cache_alloc(mm_cachep, SLAB_KERNEL);
23721 if (mm) {
23722 *mm = *current->mm;
23723 init_new_context(mm);
23724 atomic_set(&mm->count, 1);
23725 mm->map_count = 0;
23726 mm->def_flags = 0;
23727 mm->mmap_sem = MUTEX_LOCKED;
23728 /* Leave mm->pgd set to the parent's pgd so that
23729 * pgd_offset() is always valid. */
23730 mm->mmap = mm->mmap_avl = mm->mmap_cache = NULL;
23731
23732 /* It has not run yet, so cannot be present in
23733 * anyone's cache or tlb. */
23734 mm->cpu_vm_mask = 0;
23735 }
23736 return mm;
23737 }
23738
23739 /* Please note the differences between mmput and
23740 * mm_release. mmput is called whenever we stop holding
23741 * onto a mm_struct, error success whatever.
23742 *
23743 * mm_release is called after a mm_struct has been
23744 * removed from the current process.
23745 *
23746 * This difference is important for error handling, when
23747 * we only half set up a mm_struct for a new process and
23748 * need to restore the old one. Because we mmput the new
23749 * mm_struct before restoring the old one. . . Eric
23750 * Biederman 10 January 1998 */
23751 void mm_release(void)
23752 {
23753 struct task_struct *tsk = current;
23754 forget_segments();
23755 /* notify parent sleeping on vfork() */
23756 if (tsk->flags & PF_VFORK) {
23757 tsk->flags &= ~PF_VFORK;
23758 up(tsk->p_opptr->vfork_sem);
23759 }
23760 }
23761
23762 /* Decrement the use count and release all resources for
23763 * an mm. */
23764 void mmput(struct mm_struct *mm)
23765 {
23766 if (atomic_dec_and_test(&mm->count)) {
23767 release_segments(mm);
23768 exit_mmap(mm);
23769 free_page_tables(mm);
23770 kmem_cache_free(mm_cachep, mm);
23771 }
23772 }
23773
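Both free_uid() and mmput() follow the classic last-reference pattern: decrement atomically, and only the caller that observes the count reaching zero performs the teardown, so the expensive work happens exactly once and the common path takes no lock. In C11 atomics the same shape looks like this (toy struct, illustrative names):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct blob {
        atomic_int count;
        /* ... payload ... */
};

static struct blob *blob_get(struct blob *b)
{
        atomic_fetch_add(&b->count, 1);
        return b;
}

static void blob_put(struct blob *b)
{
        /* fetch_sub returns the old value: old == 1 means we hit zero */
        if (atomic_fetch_sub(&b->count, 1) == 1) {
                printf("last reference gone, freeing\n");
                free(b);
        }
}

int main(void)
{
        struct blob *b = malloc(sizeof(*b));
        atomic_init(&b->count, 1);
        blob_get(b);    /* a second holder */
        blob_put(b);    /* drops to 1, nothing freed */
        blob_put(b);    /* drops to 0, freed */
        return 0;
}
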
23774 static inline int copy_mm(int nr,
23775 unsigned long clone_flags, struct task_struct * tsk)
23776 {
23777 struct mm_struct * mm;
23778 int retval;
23779
23780 if (clone_flags & CLONE_VM) {
23781 mmget(current->mm);
23782 /* Set up the LDT descriptor for the clone task. */
23783 copy_segments(nr, tsk, NULL);
23784 SET_PAGE_DIR(tsk, current->mm->pgd);
23785 return 0;
23786 }
23787
23788 retval = -ENOMEM;
23789 mm = mm_alloc();
23790 if (!mm)
23791 goto fail_nomem;
23792
23793 tsk->mm = mm;
23794 tsk->min_flt = tsk->maj_flt = 0;
23795 tsk->cmin_flt = tsk->cmaj_flt = 0;
23796 tsk->nswap = tsk->cnswap = 0;
23797 copy_segments(nr, tsk, mm);
23798 retval = new_page_tables(tsk);
23799 if (retval)
23800 goto free_mm;
23801 retval = dup_mmap(mm);
23802 if (retval)
23803 goto free_pt;
23804 up(&mm->mmap_sem);
23805 return 0;
23806
23807 free_mm:
23808 mm->pgd = NULL;
23809 free_pt:
23810 tsk->mm = NULL;
23811 mmput(mm);
23812 fail_nomem:
23813 return retval;
23814 }
23815
23816 static inline int copy_fs(unsigned long clone_flags,
23817 struct task_struct * tsk)
23818 {
23819 if (clone_flags & CLONE_FS) {
23820 atomic_inc(&current->fs->count);
23821 return 0;
23822 }
23823 tsk->fs = kmalloc(sizeof(*tsk->fs), GFP_KERNEL);
23824 if (!tsk->fs)
23825 return -1;
23826 atomic_set(&tsk->fs->count, 1);
23827 tsk->fs->umask = current->fs->umask;
23828 tsk->fs->root = dget(current->fs->root);
23829 tsk->fs->pwd = dget(current->fs->pwd);
23830 return 0;
23831 }
23832
23833 /* Copy a fd_set and compute the maximum fd it contains.
23834 */
23835 static inline int __copy_fdset(unsigned long *d,
23836 unsigned long *src)
23837 {
23838 int i;
23839 unsigned long *p = src;
23840 unsigned long *max = src;
23841
23842 for (i = __FDSET_LONGS; i; --i) {
23843 if ((*d++ = *p++) != 0)
23844 max = p;
23845 }
23846 return (max - src)*sizeof(long)*8;
23847 }
23848
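__copy_fdset() copies the whole bitmap but also remembers one past the last nonzero word, so the fd-copy loop in copy_files() below only touches slots that could actually be open. A stand-alone check of the arithmetic (four words are assumed here; the kernel's __FDSET_LONGS is sized from NR_OPEN):

#include <stdio.h>

#define FDSET_LONGS 4   /* illustrative; not the kernel's value */

static int copy_fdset_bound(unsigned long *d, const unsigned long *src)
{
        const unsigned long *p = src, *max = src;
        int i;

        for (i = FDSET_LONGS; i; --i) {
                if ((*d++ = *p++) != 0)
                        max = p;   /* one past the last nonzero word so far */
        }
        return (max - src) * sizeof(long) * 8;
}

int main(void)
{
        /* a few low fds open, plus one stray fd up in the third word */
        unsigned long src[FDSET_LONGS] = { 0x7, 0, 0x1, 0 };
        unsigned long dst[FDSET_LONGS];

        /* 192 on a 64-bit box: three words' worth of fd slots need
         * copying, and the empty fourth word is skipped entirely */
        printf("copy bound: %d fd slots\n", copy_fdset_bound(dst, src));
        return 0;
}
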
23849 static inline int copy_fdset(fd_set *dst, fd_set *src)
23850 {
23851 return __copy_fdset(dst->fds_bits, src->fds_bits);
23852 }
23853
23854 static int copy_files(unsigned long clone_flags,
23855 struct task_struct * tsk)
23856 {
23857 struct files_struct *oldf, *newf;
23858 struct file **old_fds, **new_fds;
23859 int size, i, error = 0;
23860
23861 /* A background process may not have any files ... */
23862 oldf = current->files;
23863 if (!oldf)
23864 goto out;
23865
23866 if (clone_flags & CLONE_FILES) {
23867 atomic_inc(&oldf->count);
23868 goto out;
23869 }
23870
23871 tsk->files = NULL;
23872 error = -ENOMEM;
23873 newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL);
23874 if (!newf)
23875 goto out;
23876
23877 /* Allocate the fd array, using get_free_page() if
23878 * possible. Eventually we want to make the array size
23879 * variable ... */
23880 size = NR_OPEN * sizeof(struct file *);
23881 if (size == PAGE_SIZE)
23882 new_fds =
23883 (struct file **) __get_free_page(GFP_KERNEL);
23884 else
23885 new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
23886 if (!new_fds)
23887 goto out_release;
23888
23889 atomic_set(&newf->count, 1);
23890 newf->max_fds = NR_OPEN;
23891 newf->fd = new_fds;
23892 newf->close_on_exec = oldf->close_on_exec;
23893 i = copy_fdset(&newf->open_fds, &oldf->open_fds);
23894
23895 old_fds = oldf->fd;
23896 for (; i != 0; i--) {
23897 struct file *f = *old_fds++;
23898 *new_fds = f;
23899 if (f)
23900 f->f_count++;
23901 new_fds++;
23902 }
23903 /* This is long word aligned thus could use an optimized
23904 * version */
23905 memset(new_fds, 0,
23906 (char *)newf->fd + size - (char *)new_fds);
23907
23908 tsk->files = newf;
23909 error = 0;
23910 out:
23911 return error;
23912
23913 out_release:
23914 kmem_cache_free(files_cachep, newf);
23915 goto out;
23916 }
23917
23918 static inline int copy_sighand(unsigned long clone_flags,
23919 struct task_struct * tsk)
23920 {
23921 if (clone_flags & CLONE_SIGHAND) {
23922 atomic_inc(&current->sig->count);
23923 return 0;
23924 }
23925 tsk->sig = kmalloc(sizeof(*tsk->sig), GFP_KERNEL);
23926 if (!tsk->sig)
23927 return -1;
23928 spin_lock_init(&tsk->sig->siglock);
23929 atomic_set(&tsk->sig->count, 1);
23930 memcpy(tsk->sig->action, current->sig->action,
23931 sizeof(tsk->sig->action));
23932 return 0;
23933 }
23934
23935 static inline void copy_flags(unsigned long clone_flags,
23936 struct task_struct *p)
23937 {
23938 unsigned long new_flags = p->flags;
23939
23940 new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU | PF_VFORK);
23941 new_flags |= PF_FORKNOEXEC;
23942 if (!(clone_flags & CLONE_PTRACE))
23943 new_flags &= ~(PF_PTRACED|PF_TRACESYS);
23944 if (clone_flags & CLONE_VFORK)
23945 new_flags |= PF_VFORK;
23946 p->flags = new_flags;
23947 }
23948
23949 /* Ok, this is the main fork-routine. It copies the
23950 * system process information (task[nr]) and sets up the
23951 * necessary registers. It also copies the data segment
23952 * in its entirety. */
23953 int do_fork(unsigned long clone_flags, unsigned long usp,
23954 struct pt_regs *regs)
23955 {
23956 int nr;
23957 int retval = -ENOMEM;
23958 struct task_struct *p;
23959 struct semaphore sem = MUTEX_LOCKED;
23960
23961 current->vfork_sem = &sem;
23962
23963 p = alloc_task_struct();
23964 if (!p)
23965 goto fork_out;
23966
23967 *p = *current;
23968
23969 down(&current->mm->mmap_sem);
23970 lock_kernel();
23971
23972 retval = -EAGAIN;
23973 if (p->user) {
23974 if (atomic_read(&p->user->count) >=
23975 p->rlim[RLIMIT_NPROC].rlim_cur)
23976 goto bad_fork_free;
23977 }
23978
23979 {
23980 struct task_struct **tslot;
23981 tslot = find_empty_process();
23982 if (!tslot)
23983 goto bad_fork_free;
23984 p->tarray_ptr = tslot;
23985 *tslot = p;
23986 nr = tslot - &task[0];
23987 }
23988
23989 if (p->exec_domain && p->exec_domain->module)
23990 __MOD_INC_USE_COUNT(p->exec_domain->module);
23991 if (p->binfmt && p->binfmt->module)
23992 __MOD_INC_USE_COUNT(p->binfmt->module);
23993
23994 p->did_exec = 0;
23995 p->swappable = 0;
23996 p->state = TASK_UNINTERRUPTIBLE;
23997
23998 copy_flags(clone_flags, p);
23999 p->pid = get_pid(clone_flags);
24000
24001 /* This is a "shadow run" state. The process is marked
24002 * runnable, but isn't actually on any run queue
24003 * yet.. (that happens at the very end). */
24004 p->state = TASK_RUNNING;
24005 p->next_run = p;
24006 p->prev_run = p;
24007
24008 p->p_pptr = p->p_opptr = current;
24009 p->p_cptr = NULL;
24010 init_waitqueue(&p->wait_chldexit);
24011 p->vfork_sem = NULL;
24012
24013 p->sigpending = 0;
24014 sigemptyset(&p->signal);
24015 p->sigqueue = NULL;
24016 p->sigqueue_tail = &p->sigqueue;
24017
24018 p->it_real_value = p->it_virt_value = p->it_prof_value
24019 = 0;
24020 p->it_real_incr = p->it_virt_incr = p->it_prof_incr
24021 = 0;
24022 init_timer(&p->real_timer);
24023 p->real_timer.data = (unsigned long) p;
24024
24025 p->leader = 0; /* session leadership doesn't inherit */
24026 p->tty_old_pgrp = 0;
24027 p->times.tms_utime = p->times.tms_stime = 0;
24028 p->times.tms_cutime = p->times.tms_cstime = 0;
24029 #ifdef __SMP__
24030 {
24031 int i;
24032 p->has_cpu = 0;
24033 p->processor = NO_PROC_ID;
24034 /* ?? should we just memset this ?? */
24035 for(i = 0; i < smp_num_cpus; i++)
24036 p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0;
24037 spin_lock_init(&p->sigmask_lock);
24038 }
24039 #endif
24040 p->lock_depth = -1; /* -1 = no lock */
24041 p->start_time = jiffies;
24042
24043 retval = -ENOMEM;
24044 /* copy all the process information */
24045 if (copy_files(clone_flags, p))
24046 goto bad_fork_cleanup;
24047 if (copy_fs(clone_flags, p))
24048 goto bad_fork_cleanup_files;
24049 if (copy_sighand(clone_flags, p))
24050 goto bad_fork_cleanup_fs;
24051 if (copy_mm(nr, clone_flags, p))
24052 goto bad_fork_cleanup_sighand;
24053 retval = copy_thread(nr, clone_flags, usp, p, regs);
24054 if (retval)
24055 goto bad_fork_cleanup_sighand;
24056 p->semundo = NULL;
24057
24058 /* ok, now we should be set up.. */
24059 p->swappable = 1;
24060 p->exit_signal = clone_flags & CSIGNAL;
24061 p->pdeath_signal = 0;
24062
24063 /* "share" dynamic priority between parent and child,
24064 * thus the total amount of dynamic priorities in the
24065 * system doesn't change, more scheduling fairness. This
24066 * is only important in the first timeslice, on the
24067 * long run the scheduling behaviour is unchanged. */
24068 current->counter >>= 1;
24069 p->counter = current->counter;
24070
24071 /* OK, add it to the run-queues and make it visible to
24072 * the rest of the system.
24073 *
24074 * Let it rip! */
24075 retval = p->pid;
24076 if (retval) {
24077 write_lock_irq(&tasklist_lock);
24078 SET_LINKS(p);
24079 hash_pid(p);
24080 write_unlock_irq(&tasklist_lock);
24081
24082 nr_tasks++;
24083 if (p->user)
24084 atomic_inc(&p->user->count);
24085
24086 p->next_run = NULL;
24087 p->prev_run = NULL;
24088 wake_up_process(p); /* do this last */
24089 }
24090 ++total_forks;
24091 bad_fork:
24092 unlock_kernel();
24093 up(&current->mm->mmap_sem);
24094 fork_out:
24095 if ((clone_flags & CLONE_VFORK) && (retval > 0))
24096 down(&sem);
24097 return retval;
24098
24099 bad_fork_cleanup_sighand:
24100 exit_sighand(p);
24101 bad_fork_cleanup_fs:
24102 exit_fs(p); /* blocking */
24103 bad_fork_cleanup_files:
24104 exit_files(p); /* blocking */
24105 bad_fork_cleanup:
24106 if (p->exec_domain && p->exec_domain->module)
24107 __MOD_DEC_USE_COUNT(p->exec_domain->module);
24108 if (p->binfmt && p->binfmt->module)
24109 __MOD_DEC_USE_COUNT(p->binfmt->module);
24110
24111 add_free_taskslot(p->tarray_ptr);
24112 bad_fork_free:
24113 free_task_struct(p);
24114 goto bad_fork;
24115 }
24116
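From user space, the clone_flags decoded throughout do_fork() are exactly what clone(2) passes down. A small glibc example, sharing the VM the way CLONE_VM short-circuits copy_mm() above (error handling trimmed to the essentials):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

static int shared_counter;

static int child_fn(void *arg)
{
        shared_counter++;       /* CLONE_VM: this writes the parent's memory */
        return 0;
}

int main(void)
{
        const int stack_size = 64 * 1024;
        char *stack = malloc(stack_size);
        if (!stack)
                return 1;

        /* stacks grow down on most arches: hand clone() the top of the region */
        int pid = clone(child_fn, stack + stack_size, CLONE_VM | SIGCHLD, NULL);
        if (pid < 0) {
                perror("clone");
                return 1;
        }
        waitpid(pid, NULL, 0);
        printf("shared_counter = %d\n", shared_counter);  /* prints 1 */
        free(stack);
        return 0;
}
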
24117 void __init filescache_init(void)
24118 {
24119 files_cachep = kmem_cache_create("files_cache",
24120 sizeof(struct files_struct),
24121 0,
24122 SLAB_HWCACHE_ALIGN,
24123 NULL, NULL);
24124 if (!files_cachep)
24125 panic("Cannot create files cache");
24126 }