mm/vmscan.c
38830 /*
38831 * linux/mm/vmscan.c
38832 *
38833 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
38834 *
38835 * Swap reorganised 29.12.95, Stephen Tweedie.
38836 * kswapd added: 7.1.96 sct
38837 * Removed kswapd_ctl limits, and swap out as many pages
38838 * as needed to bring the system back to freepages.high:
38839 * 2.4.97, Rik van Riel.
38840 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct
38841 * Exp $ */
38842
38843 #include <linux/slab.h>
38844 #include <linux/kernel_stat.h>
38845 #include <linux/swap.h>
38846 #include <linux/swapctl.h>
38847 #include <linux/smp_lock.h>
38848 #include <linux/pagemap.h>
38849 #include <linux/init.h>
38850
38851 #include <asm/pgtable.h>
38852
38853 /* The swap-out functions return 1 if they successfully
38854 * threw something out, and we got a free page. It
38855 * returns zero if it couldn't do anything, and any other
38856 * value indicates it decreased rss, but the page was
38857 * shared.
38858 *
38859 * NOTE! If it sleeps, it *must* return 1 to make sure we
38860 * don't continue with the swap-out. Otherwise we may be
38861 * using a process that no longer actually exists (it
38862 * might have died while we slept). */
38863 static int try_to_swap_out(struct task_struct * tsk,
38864 struct vm_area_struct* vma, unsigned long address,
38865 pte_t * page_table, int gfp_mask)
38866 {
38867 pte_t pte;
38868 unsigned long entry;
38869 unsigned long page;
38870 struct page * page_map;
38871
38872 pte = *page_table;
38873 if (!pte_present(pte))
38874 return 0;
38875 page = pte_page(pte);
38876 if (MAP_NR(page) >= max_mapnr)
38877 return 0;
38878
38879 page_map = mem_map + MAP_NR(page);
38880 if (PageReserved(page_map)
38881 || PageLocked(page_map)
38882 || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
38883 return 0;
38884
38885 if (pte_young(pte)) {
38886 /* Transfer the "accessed" bit from the page tables
38887 * to the global page map. */
38888 set_pte(page_table, pte_mkold(pte));
38889 set_bit(PG_referenced, &page_map->flags);
38890 return 0;
38891 }
38892
38893 /* Is the page already in the swap cache? If so, then
38894 * we can just drop our reference to it without doing
38895 * any IO - it's already up-to-date on disk.
38896 *
38897 * Return 0, as we didn't actually free any real
38898 * memory, and we should just continue our scan. */
38899 if (PageSwapCache(page_map)) {
38900 entry = page_map->offset;
38901 swap_duplicate(entry);
38902 set_pte(page_table, __pte(entry));
38903 drop_pte:
38904 vma->vm_mm->rss--;
38905 flush_tlb_page(vma, address);
38906 __free_page(page_map);
38907 return 0;
38908 }
38909
38910 /* Is it a clean page? Then it must be recoverable by
38911 * just paging it in again, and we can just drop it..
38912 *
38913 * However, this won't actually free any real memory,
38914 * as the page will just be in the page cache
38915 * somewhere, and as such we should just continue our
38916 * scan.
38917 *
38918 * Basically, this just makes it possible for us to do
38919 * some real work in the future in "shrink_mmap()". */
38920 if (!pte_dirty(pte)) {
38921 pte_clear(page_table);
38922 goto drop_pte;
38923 }
38924
38925 /* Don't go down into the swap-out stuff if we cannot
38926 * do I/O! Avoid recursing on FS locks etc. */
38927 if (!(gfp_mask & __GFP_IO))
38928 return 0;
38929
38930 /* Ok, it's really dirty. That means that we should
38931 * either create a new swap cache entry for it, or we
38932 * should write it back to its own backing store.
38933 *
38934 * Note that in neither case do we actually know that
38935 * we make a page available, but as we potentially
38936 * sleep we can no longer continue scanning, so we
38937 * might as well assume we free'd something.
38938 *
38939 * NOTE NOTE NOTE! This should just set a dirty bit in
38940 * page_map, and just drop the pte. All the hard work
38941 * would be done by shrink_mmap().
38942 *
38943 * That would get rid of a lot of problems. */
38944 flush_cache_page(vma, address);
38945 if (vma->vm_ops && vma->vm_ops->swapout) {
38946 pid_t pid = tsk->pid;
38947 pte_clear(page_table);
38948 flush_tlb_page(vma, address);
38949 vma->vm_mm->rss--;
38950
38951 if (vma->vm_ops->swapout(vma, page_map))
38952 kill_proc(pid, SIGBUS, 1);
38953 __free_page(page_map);
38954 return 1;
38955 }
38956
38957 /* This is a dirty, swappable page. First of all, get
38958 * a suitable swap entry for it, and make sure we have
38959 * the swap cache set up to associate the page with
38960 * that swap entry. */
38961 entry = get_swap_page();
38962 if (!entry)
38963 return 0; /* No swap space left */
38964
38965 vma->vm_mm->rss--;
38966 tsk->nswap++;
38967 set_pte(page_table, __pte(entry));
38968 flush_tlb_page(vma, address);
38969 /* One for the process, one for the swap cache */
38970 swap_duplicate(entry);
38971 add_to_swap_cache(page_map, entry);
38972 /* We checked we were unlocked way up above, and we
38973 * have been careful not to stall until here */
38974 set_bit(PG_locked, &page_map->flags);
38975
38976 /* OK, do a physical asynchronous write to swap. */
38977 rw_swap_page(WRITE, entry, (char *) page, 0);
38978
38979 __free_page(page_map);
38980 return 1;
38981 }
38982
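/* A sketch, not part of the 2.2 source: the "NOTE NOTE NOTE" comment in
 * try_to_swap_out() above suggests that the function should only mark
 * the page dirty and drop the pte, leaving the actual write-out to
 * shrink_mmap().  The fragment below illustrates that idea.  It assumes
 * a PG_dirty bit in page->flags and that the page has already been put
 * in the swap cache (so the cache holds its own reference); the helper
 * name is invented and nothing here is called by the shipped kernel. */
static int sketch_defer_dirty_writeout(struct vm_area_struct * vma,
	unsigned long address, pte_t * page_table,
	struct page * page_map)
{
	set_bit(PG_dirty, &page_map->flags);	/* shrink_mmap() writes it later */
	pte_clear(page_table);			/* drop this process's mapping */
	flush_tlb_page(vma, address);
	vma->vm_mm->rss--;
	__free_page(page_map);			/* drop the process's reference */
	return 0;				/* nothing freed yet - keep scanning */
}
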
38983 /* A new implementation of swap_out(). We do not swap
38984 * complete processes, but only a small number of blocks,
38985 * before we continue with the next process. The number
38986 * of blocks actually swapped is determined by the number
38987 * of page faults this process has taken recently, so
38988 * we won't swap heavily used processes all
38989 * the time ...
38990 *
38991 * Note: the priority argument is a hint about how much CPU
38992 * to waste on the swap block search, not a hint about how
38993 * many blocks to swap from each process.
38994 *
38995 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */
38996 static inline int swap_out_pmd(struct task_struct * tsk,
38997 struct vm_area_struct * vma, pmd_t * dir,
38998 unsigned long address, unsigned long end, int gfp_mask)
38999 {
39000 pte_t * pte;
39001 unsigned long pmd_end;
39002
39003 if (pmd_none(*dir))
39004 return 0;
39005 if (pmd_bad(*dir)) {
39006 printk("swap_out_pmd: bad pmd (%08lx)\n",
39007 pmd_val(*dir));
39008 pmd_clear(dir);
39009 return 0;
39010 }
39011
39012 pte = pte_offset(dir, address);
39013
39014 pmd_end = (address + PMD_SIZE) & PMD_MASK;
39015 if (end > pmd_end)
39016 end = pmd_end;
39017
39018 do {
39019 int result;
39020 tsk->mm->swap_address = address + PAGE_SIZE;
39021 result = try_to_swap_out(tsk, vma, address, pte,
39022 gfp_mask);
39023 if (result)
39024 return result;
39025 address += PAGE_SIZE;
39026 pte++;
39027 } while (address < end);
39028 return 0;
39029 }
39030
39031 static inline int swap_out_pgd(struct task_struct * tsk,
39032 struct vm_area_struct * vma, pgd_t * dir,
39033 unsigned long address, unsigned long end, int gfp_mask)
39034 {
39035 pmd_t * pmd;
39036 unsigned long pgd_end;
39037
39038 if (pgd_none(*dir))
39039 return 0;
39040 if (pgd_bad(*dir)) {
39041 printk("swap_out_pgd: bad pgd (%08lx)\n",
39042 pgd_val(*dir));
39043 pgd_clear(dir);
39044 return 0;
39045 }
39046
39047 pmd = pmd_offset(dir, address);
39048
39049 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
39050 if (end > pgd_end)
39051 end = pgd_end;
39052
39053 do {
39054 int result = swap_out_pmd(tsk, vma, pmd, address,
39055 end, gfp_mask);
39056 if (result)
39057 return result;
39058 address = (address + PMD_SIZE) & PMD_MASK;
39059 pmd++;
39060 } while (address < end);
39061 return 0;
39062 }
39063
39064 static int swap_out_vma(struct task_struct * tsk,
39065 struct vm_area_struct * vma, unsigned long address,
39066 int gfp_mask)
39067 {
39068 pgd_t *pgdir;
39069 unsigned long end;
39070
39071 /* Don't swap out areas like shared memory which have
39072 * their own separate swapping mechanism or areas which
39073 * are locked down */
39074 if (vma->vm_flags & (VM_SHM | VM_LOCKED))
39075 return 0;
39076
39077 pgdir = pgd_offset(tsk->mm, address);
39078
39079 end = vma->vm_end;
39080 while (address < end) {
39081 int result = swap_out_pgd(tsk, vma, pgdir, address,
39082 end, gfp_mask);
39083 if (result)
39084 return result;
39085 address = (address + PGDIR_SIZE) & PGDIR_MASK;
39086 pgdir++;
39087 }
39088 return 0;
39089 }
39090
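/* The walkers above advance with the idiom
 *	address = (address + PMD_SIZE) & PMD_MASK;
 * which rounds address up to the start of the next pmd (or, with
 * PGDIR_SIZE/PGDIR_MASK, the next pgd) region instead of merely adding
 * the region size.  A minimal standalone sketch of the arithmetic,
 * assuming the i386 non-PAE layout where each entry spans 4 MB: */

#include <stdio.h>

#define SKETCH_PMD_SIZE (1UL << 22)		/* 4 MB span (assumed) */
#define SKETCH_PMD_MASK (~(SKETCH_PMD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x080a1000UL;	/* somewhere inside a region */

	address = (address + SKETCH_PMD_SIZE) & SKETCH_PMD_MASK;
	printf("next boundary: %#lx\n", address);	/* prints 0x8400000 */
	return 0;
}
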
39091 static int swap_out_process(struct task_struct * p,
39092 int gfp_mask)
39093 {
39094 unsigned long address;
39095 struct vm_area_struct* vma;
39096
39097 /* Go through process' page directory. */
39098 address = p->mm->swap_address;
39099
39100 /* Find the proper vm-area */
39101 vma = find_vma(p->mm, address);
39102 if (vma) {
39103 if (address < vma->vm_start)
39104 address = vma->vm_start;
39105
39106 for (;;) {
39107 int result =
39108 swap_out_vma(p, vma, address, gfp_mask);
39109 if (result)
39110 return result;
39111 vma = vma->vm_next;
39112 if (!vma)
39113 break;
39114 address = vma->vm_start;
39115 }
39116 }
39117
39118 /* We didn't find anything for the process */
39119 p->mm->swap_cnt = 0;
39120 p->mm->swap_address = 0;
39121 return 0;
39122 }
39123
39124 /* Select the task with maximal swap_cnt and try to swap
39125 * out a page. N.B. This function returns only 0 or 1.
39126 * Return values != 1 from the lower-level routines
39127 * result in continued processing. */
39128 static int swap_out(unsigned int priority, int gfp_mask)
39129 {
39130 struct task_struct * p, * pbest;
39131 int counter, assign, max_cnt;
39132
39133 /* We make one or two passes through the task list,
39134 * indexed by assign = {0, 1}:
39135 * Pass 1: select the swappable task with maximal
39136 * RSS that has not yet been swapped out.
39137 * Pass 2: re-assign rss swap_cnt values, then
39138 * select as above.
39139 *
39140 * With this approach, there's no need to remember the
39141 * last task swapped out. If the swap-out fails, we
39142 * clear swap_cnt so the task won't be selected again
39143 * until all others have been tried.
39144 *
39145 * Think of swap_cnt as a "shadow rss" - it tells us
39146 * which process we want to page out (always try
39147 * largest first). */
39148 counter = nr_tasks / (priority+1);
39149 if (counter < 1)
39150 counter = 1;
39151 if (counter > nr_tasks)
39152 counter = nr_tasks;
39153
39154 for (; counter >= 0; counter--) {
39155 assign = 0;
39156 max_cnt = 0;
39157 pbest = NULL;
39158 select:
39159 read_lock(&tasklist_lock);
39160 p = init_task.next_task;
39161 for (; p != &init_task; p = p->next_task) {
39162 if (!p->swappable)
39163 continue;
39164 if (p->mm->rss <= 0)
39165 continue;
39166 /* Refresh swap_cnt? */
39167 if (assign)
39168 p->mm->swap_cnt = p->mm->rss;
39169 if (p->mm->swap_cnt > max_cnt) {
39170 max_cnt = p->mm->swap_cnt;
39171 pbest = p;
39172 }
39173 }
39174 read_unlock(&tasklist_lock);
39175 if (!pbest) {
39176 if (!assign) {
39177 assign = 1;
39178 goto select;
39179 }
39180 goto out;
39181 }
39182
39183 if (swap_out_process(pbest, gfp_mask))
39184 return 1;
39185 }
39186 out:
39187 return 0;
39188 }
39189
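/* A self-contained sketch (not kernel code) of the selection policy that
 * the comment inside swap_out() calls a "shadow rss": pass 1 picks the
 * swappable task with the largest remaining swap_cnt; once every
 * swap_cnt has been run down to zero, pass 2 refreshes swap_cnt from rss
 * and selects again.  In the real swap_out() this runs over the
 * init_task list under tasklist_lock; the struct and helper below are
 * invented purely for illustration. */

#include <stddef.h>

struct sketch_task {
	unsigned long rss;		/* resident pages */
	unsigned long swap_cnt;		/* "shadow rss", counted down as we swap */
	int swappable;
};

struct sketch_task *sketch_pick_victim(struct sketch_task *tasks, int n)
{
	int assign = 0;

	for (;;) {
		struct sketch_task *best = NULL;
		unsigned long max_cnt = 0;
		int i;

		for (i = 0; i < n; i++) {
			if (!tasks[i].swappable || tasks[i].rss == 0)
				continue;
			if (assign)	/* pass 2: refresh the shadow rss */
				tasks[i].swap_cnt = tasks[i].rss;
			if (tasks[i].swap_cnt > max_cnt) {
				max_cnt = tasks[i].swap_cnt;
				best = &tasks[i];
			}
		}
		if (best || assign)
			return best;	/* NULL: nothing left to swap */
		assign = 1;		/* go around again, refreshing counts */
	}
}
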
39190 /* We need to make the locks finer granularity, but right
39191 * now we need this so that we can do page allocations
39192 * without holding the kernel lock etc.
39193 *
39194 * We want to try to free "count" pages, and we need to
39195 * cluster them so that we get good swap-out
39196 * behaviour. See the "free_memory()" macro for details.
39197 */
39198 static int do_try_to_free_pages(unsigned int gfp_mask)
39199 {
39200 int priority;
39201 int count = SWAP_CLUSTER_MAX;
39202
39203 lock_kernel();
39204
39205 /* Always trim SLAB caches when memory gets low. */
39206 kmem_cache_reap(gfp_mask);
39207
39208 priority = 6;
39209 do {
39210 while (shrink_mmap(priority, gfp_mask)) {
39211 if (!--count)
39212 goto done;
39213 }
39214
39215 /* Try to get rid of some shared memory pages.. */
39216 if (gfp_mask & __GFP_IO) {
39217 while (shm_swap(priority, gfp_mask)) {
39218 if (!--count)
39219 goto done;
39220 }
39221 }
39222
39223 /* Then, try to page stuff out.. */
39224 while (swap_out(priority, gfp_mask)) {
39225 if (!--count)
39226 goto done;
39227 }
39228
39229 shrink_dcache_memory(priority, gfp_mask);
39230 } while (--priority >= 0);
39231 done:
39232 unlock_kernel();
39233
39234 return priority >= 0;
39235 }
39236
39237 /* Before we start the kernel thread, print out the
39238 * kswapd initialization message (otherwise the init
39239 * message may be printed in the middle of another
39240 * driver's init message). It looks very bad when that
39241 * happens. */
39242 void __init kswapd_setup(void)
39243 {
39244 int i;
39245 char *revision="$Revision: 1.5 $", *s, *e;
39246
39247 swap_setup();
39248
39249 if ((s = strchr(revision, ':')) &&
39250 (e = strchr(s, '$')))
39251 s++, i = e - s;
39252 else
39253 s = revision, i = -1;
39254 printk ("Starting kswapd v%.*s\n", i, s);
39255 }
39256
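/* What the $Revision$ parsing in kswapd_setup() prints: a minimal
 * userspace sketch of the same strchr() arithmetic, with printf standing
 * in for printk. */

#include <stdio.h>
#include <string.h>

int main(void)
{
	char *revision = "$Revision: 1.5 $", *s, *e;
	int i;

	if ((s = strchr(revision, ':')) && (e = strchr(s, '$')))
		s++, i = e - s;			/* the " 1.5 " between ':' and '$' */
	else
		s = revision, i = -1;		/* fall back to the raw string */

	printf("Starting kswapd v%.*s\n", i, s);	/* "Starting kswapd v 1.5 " */
	return 0;
}
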
39257 static struct task_struct *kswapd_process;
39258
39259 /* The background pageout daemon, started as a kernel
39260 * thread from the init process.
39261 *
39262 * This basically executes once a second, trickling out
39263 * pages so that we have _some_ free memory available
39264 * even if there is no other activity that frees anything
39265 * up. This is needed for things like routing etc, where
39266 * we otherwise might have all activity going on in
39267 * asynchronous contexts that cannot page things out.
39268 *
39269 * If there are applications that are active
39270 * memory-allocators (most normal use), this basically
39271 * shouldn't matter. */
39272 int kswapd(void *unused)
39273 {
39274 struct task_struct *tsk = current;
39275
39276 kswapd_process = tsk;
39277 tsk->session = 1;
39278 tsk->pgrp = 1;
39279 strcpy(tsk->comm, "kswapd");
39280 sigfillset(&tsk->blocked);
39281
39282 /* Tell the memory management that we're a "memory
39283 * allocator", and that if we need more memory we
39284 * should get access to it regardless (see
39285 * "__get_free_pages()"). "kswapd" should never get
39286 * caught in the normal page freeing logic.
39287 *
39288 * (Kswapd normally doesn't need memory anyway, but
39289 * sometimes you need a small amount of memory in order
39290 * to be able to page out something else, and this flag
39291 * essentially protects us from recursively trying to
39292 * free more memory as we're trying to free the first
39293 * piece of memory in the first place). */
39294 tsk->flags |= PF_MEMALLOC;
39295
39296 while (1) {
39297 /* Wake up once a second to see if we need to make
39298 * more memory available.
39299 *
39300 * If we actually get into a low-memory situation,
39301 * the processes needing more memory will wake us up
39302 * on a more timely basis. */
39303 do {
39304 if (nr_free_pages >= freepages.high)
39305 break;
39306
39307 if (!do_try_to_free_pages(GFP_KSWAPD))
39308 break;
39309 } while (!tsk->need_resched);
39310 run_task_queue(&tq_disk);
39311 tsk->state = TASK_INTERRUPTIBLE;
39312 schedule_timeout(HZ);
39313 }
39314 }
39315
39316 /* Called by non-kswapd processes when they want more
39317 * memory.
39318 *
39319 * In a perfect world, this should just wake up kswapd
39320 * and return. We don't actually want to swap stuff out
39321 * from user processes, because the locking issues are
39322 * nasty to the extreme (file write locks, and MM
39323 * locking)
39324 *
39325 * One option might be to let kswapd do all the page-out
39326 * and VM page table scanning that needs locking, and
39327 * this process thread could do just the mmap shrink
39328 * stage that can be done by just dropping cached pages
39329 * without having any deadlock issues. */
39330 int try_to_free_pages(unsigned int gfp_mask)
39331 {
39332 int retval = 1;
39333
39334 wake_up_process(kswapd_process);
39335 if (gfp_mask & __GFP_WAIT)
39336 retval = do_try_to_free_pages(gfp_mask);
39337 return retval;
39338 }
39339
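/* A sketch of how an allocation slow path might call try_to_free_pages()
 * when free memory runs low, per the comment above.  The helper name and
 * the freepages.min threshold test are illustrative assumptions; this is
 * not the actual __get_free_pages() code. */
static int sketch_alloc_slow_path(int gfp_mask)
{
	if (nr_free_pages < freepages.min &&
	    !(current->flags & PF_MEMALLOC)) {
		if (!try_to_free_pages(gfp_mask))
			return 0;	/* could not reclaim anything */
	}
	return 1;			/* ok to go ahead and allocate */
}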