mm/vmscan.c
38830 /*
38831 * linux/mm/vmscan.c
38832 *
38833 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
38834 *
38835 * Swap reorganised 29.12.95, Stephen Tweedie.
38836 * kswapd added: 7.1.96 sct
38837 * Removed kswapd_ctl limits, and swap out as many pages
38838 * as needed to bring the system back to freepages.high:
38839 * 2.4.97, Rik van Riel.
38840 * Version: $Id: vmscan.c,v 1.5 1998/02/23 22:14:28 sct
38841 * Exp $ */
38842
38843 #include <linux/slab.h>
38844 #include <linux/kernel_stat.h>
38845 #include <linux/swap.h>
38846 #include <linux/swapctl.h>
38847 #include <linux/smp_lock.h>
38848 #include <linux/pagemap.h>
38849 #include <linux/init.h>
38850
38851 #include <asm/pgtable.h>
38852
38853 /* The swap-out functions return 1 if they successfully
38854 * threw something out, and we got a free page. It
38855 * returns zero if it couldn't do anything, and any other
38856 * value indicates it decreased rss, but the page was
38857 * shared.
38858 *
38859 * NOTE! If it sleeps, it *must* return 1 to make sure we
38860 * don't continue with the swap-out. Otherwise we may be
38861 * using a process that no longer actually exists (it
38862 * might have died while we slept). */
38863 static int try_to_swap_out(struct task_struct * tsk,
38864 struct vm_area_struct* vma, unsigned long address,
38865 pte_t * page_table, int gfp_mask)
38866 {
38867 pte_t pte;
38868 unsigned long entry;
38869 unsigned long page;
38870 struct page * page_map;
38871
38872 pte = *page_table;
38873 if (!pte_present(pte))
38874 return 0;
38875 page = pte_page(pte);
38876 if (MAP_NR(page) >= max_mapnr)
38877 return 0;
38878
38879 page_map = mem_map + MAP_NR(page);
38880 if (PageReserved(page_map)
38881 || PageLocked(page_map)
38882 || ((gfp_mask & __GFP_DMA) && !PageDMA(page_map)))
38883 return 0;
38884
38885 if (pte_young(pte)) {
38886 /* Transfer the "accessed" bit from the page tables
38887 * to the global page map. */
38888 set_pte(page_table, pte_mkold(pte));
38889 set_bit(PG_referenced, &page_map->flags);
38890 return 0;
38891 }
38892
38893 /* Is the page already in the swap cache? If so, then
38894 * we can just drop our reference to it without doing
38895 * any IO - it's already up-to-date on disk.
38896 *
38897 * Return 0, as we didn't actually free any real
38898 * memory, and we should just continue our scan. */
38899 if (PageSwapCache(page_map)) {
38900 entry = page_map->offset;
38901 swap_duplicate(entry);
38902 set_pte(page_table, __pte(entry));
38903 drop_pte:
38904 vma->vm_mm->rss--;
38905 flush_tlb_page(vma, address);
38906 __free_page(page_map);
38907 return 0;
38908 }
38909
38910 /* Is it a clean page? Then it must be recoverable by
38911 * just paging it in again, and we can just drop it..
38912 *
38913 * However, this won't actually free any real memory,
38914 * as the page will just be in the page cache
38915 * somewhere, and as such we should just continue our
38916 * scan.
38917 *
38918 * Basically, this just makes it possible for us to do
38919 * some real work in the future in "shrink_mmap()". */
38920 if (!pte_dirty(pte)) {
38921 pte_clear(page_table);
38922 goto drop_pte;
38923 }
38924
38925 /* Don't go down into the swap-out stuff if we cannot
38926 * do I/O! Avoid recursing on FS locks etc. */
38927 if (!(gfp_mask & __GFP_IO))
38928 return 0;
38929
38930 /* Ok, it's really dirty. That means that we should
38931 * either create a new swap cache entry for it, or we
38932 * should write it back to its own backing store.
38933 *
38934 * Note that in neither case do we actually know that
38935 * we make a page available, but as we potentially
38936 * sleep we can no longer continue scanning, so we
38937 * might as well assume we free'd something.
38938 *
38939 * NOTE NOTE NOTE! This should just set a dirty bit in
38940 * page_map, and just drop the pte. All the hard work
38941 * would be done by shrink_mmap().
38942 *
38943 * That would get rid of a lot of problems. */
38944 flush_cache_page(vma, address);
38945 if (vma->vm_ops && vma->vm_ops->swapout) {
38946 pid_t pid = tsk->pid;
38947 pte_clear(page_table);
38948 flush_tlb_page(vma, address);
38949 vma->vm_mm->rss--;
38950
38951 if (vma->vm_ops->swapout(vma, page_map))
38952 kill_proc(pid, SIGBUS, 1);
38953 __free_page(page_map);
38954 return 1;
38955 }
38956
38957 /* This is a dirty, swappable page. First of all, get
38958 * a suitable swap entry for it, and make sure we have
38959 * the swap cache set up to associate the page with
38960 * that swap entry. */
38961 entry = get_swap_page();
38962 if (!entry)
38963 return 0; /* No swap space left */
38964
38965 vma->vm_mm->rss--;
38966 tsk->nswap++;
38967 set_pte(page_table, __pte(entry));
38968 flush_tlb_page(vma, address);
38969 /* One for the process, one for the swap cache */
38970 swap_duplicate(entry);
38971 add_to_swap_cache(page_map, entry);
38972 /* We checked we were unlocked way up above, and we
38973 * have been careful not to stall until here */
38974 set_bit(PG_locked, &page_map->flags);
38975
38976 /* OK, do a physical asynchronous write to swap. */
38977 rw_swap_page(WRITE, entry, (char *) page, 0);
38978
38979 __free_page(page_map);
38980 return 1;
38981 }
38982
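/* A sketch, not part of the 2.2 source: the "NOTE NOTE NOTE" comment in
 * try_to_swap_out() above suggests that the function should only mark
 * the page dirty and drop the pte, leaving the actual write-out to
 * shrink_mmap().  The fragment below illustrates that idea.  It assumes
 * a PG_dirty bit in page->flags and that the page has already been put
 * in the swap cache (so the cache holds its own reference); the helper
 * name is invented and nothing here is called by the shipped kernel. */
static int sketch_defer_dirty_writeout(struct vm_area_struct * vma,
	unsigned long address, pte_t * page_table,
	struct page * page_map)
{
	set_bit(PG_dirty, &page_map->flags);	/* shrink_mmap() writes it later */
	pte_clear(page_table);			/* drop this process's mapping */
	flush_tlb_page(vma, address);
	vma->vm_mm->rss--;
	__free_page(page_map);			/* drop the process's reference */
	return 0;				/* nothing freed yet - keep scanning */
}
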
38983 /* A new implementation of swap_out(). We do not swap
38984 * complete processes, but only a small number of blocks,
38985 * before we continue with the next process. The number
38986 * of blocks actually swapped is determined by the number
38987 * of page faults this process has taken recently, so
38988 * we won't swap heavily used processes all
38989 * the time ...
38990 *
38991 * Note: the priority argument is a hint about how much CPU
38992 * to waste on the swap block search, not a hint about how
38993 * many blocks to swap from each process.
38994 *
38995 * (C) 1993 Kai Petzke, wpp@marie.physik.tu-berlin.de */
38996 static inline int swap_out_pmd(struct task_struct * tsk,
38997 struct vm_area_struct * vma, pmd_t * dir,
38998 unsigned long address, unsigned long end, int gfp_mask)
38999 {
39000 pte_t * pte;
39001 unsigned long pmd_end;
39002
39003 if (pmd_none(*dir))
39004 return 0;
39005 if (pmd_bad(*dir)) {
39006 printk("swap_out_pmd: bad pmd (%08lx)\n",
39007 pmd_val(*dir));
39008 pmd_clear(dir);
39009 return 0;
39010 }
39011
39012 pte = pte_offset(dir, address);
39013
39014 pmd_end = (address + PMD_SIZE) & PMD_MASK;
39015 if (end > pmd_end)
39016 end = pmd_end;
39017
39018 do {
39019 int result;
39020 tsk->mm->swap_address = address + PAGE_SIZE;
39021 result = try_to_swap_out(tsk, vma, address, pte,
39022 gfp_mask);
39023 if (result)
39024 return result;
39025 address += PAGE_SIZE;
39026 pte++;
39027 } while (address < end);
39028 return 0;
39029 }
39030
39031 static inline int swap_out_pgd(struct task_struct * tsk,
39032 struct vm_area_struct * vma, pgd_t * dir,
39033 unsigned long address, unsigned long end, int gfp_mask)
39034 {
39035 pmd_t * pmd;
39036 unsigned long pgd_end;
39037
39038 if (pgd_none(*dir))
39039 return 0;
39040 if (pgd_bad(*dir)) {
39041 printk("swap_out_pgd: bad pgd (%08lx)\n",
39042 pgd_val(*dir));
39043 pgd_clear(dir);
39044 return 0;
39045 }
39046
39047 pmd = pmd_offset(dir, address);
39048
39049 pgd_end = (address + PGDIR_SIZE) & PGDIR_MASK;
39050 if (end > pgd_end)
39051 end = pgd_end;
39052
39053 do {
39054 int result = swap_out_pmd(tsk, vma, pmd, address,
39055 end, gfp_mask);
39056 if (result)
39057 return result;
39058 address = (address + PMD_SIZE) & PMD_MASK;
39059 pmd++;
39060 } while (address < end);
39061 return 0;
39062 }
39063
39064 static int swap_out_vma(struct task_struct * tsk,
39065 struct vm_area_struct * vma, unsigned long address,
39066 int gfp_mask)
39067 {
39068 pgd_t *pgdir;
39069 unsigned long end;
39070
39071 /* Don't swap out areas like shared memory which have
39072 * their own separate swapping mechanism or areas which
39073 * are locked down */
39074 if (vma->vm_flags & (VM_SHM | VM_LOCKED))
39075 return 0;
39076
39077 pgdir = pgd_offset(tsk->mm, address);
39078
39079 end = vma->vm_end;
39080 while (address < end) {
39081 int result = swap_out_pgd(tsk, vma, pgdir, address,
39082 end, gfp_mask);
39083 if (result)
39084 return result;
39085 address = (address + PGDIR_SIZE) & PGDIR_MASK;
39086 pgdir++;
39087 }
39088 return 0;
39089 }
39090
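/* The walkers above advance with the idiom
 *	address = (address + PMD_SIZE) & PMD_MASK;
 * which rounds address up to the start of the next pmd (or, with
 * PGDIR_SIZE/PGDIR_MASK, the next pgd) region instead of merely adding
 * the region size.  A minimal standalone sketch of the arithmetic,
 * assuming the i386 non-PAE layout where each entry spans 4 MB: */

#include <stdio.h>

#define SKETCH_PMD_SIZE (1UL << 22)		/* 4 MB span (assumed) */
#define SKETCH_PMD_MASK (~(SKETCH_PMD_SIZE - 1))

int main(void)
{
	unsigned long address = 0x080a1000UL;	/* somewhere inside a region */

	address = (address + SKETCH_PMD_SIZE) & SKETCH_PMD_MASK;
	printf("next boundary: %#lx\n", address);	/* prints 0x8400000 */
	return 0;
}
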
39091 static int swap_out_process(struct task_struct * p,
39092 int gfp_mask)
39093 {
39094 unsigned long address;
39095 struct vm_area_struct* vma;
39096
39097 /* Go through process' page directory. */
39098 address = p->mm->swap_address;
39099
39100 /* Find the proper vm-area */
39101 vma = find_vma(p->mm, address);
39102 if (vma) {
39103 if (address < vma->vm_start)
39104 address = vma->vm_start;
39105
39106 for (;;) {
39107 int result =
39108 swap_out_vma(p, vma, address, gfp_mask);
39109 if (result)
39110 return result;
39111 vma = vma->vm_next;
39112 if (!vma)
39113 break;
39114 address = vma->vm_start;
39115 }
39116 }
39117
39118 /* We didn't find anything for the process */
39119 p->mm->swap_cnt = 0;
39120 p->mm->swap_address = 0;
39121 return 0;
39122 }
39123
39124 /* Select the task with maximal swap_cnt and try to swap
39125 * out a page. N.B. This function returns only 0 or 1.
39126 * Return values != 1 from the lower-level routines
39127 * result in continued processing. */
39128 static int swap_out(unsigned int priority, int gfp_mask)
39129 {
39130 struct task_struct * p, * pbest;
39131 int counter, assign, max_cnt;
39132
39133 /* We make one or two passes through the task list,
39134 * indexed by assign = {0, 1}:
39135 * Pass 1: select the swappable task with maximal
39136 * RSS that has not yet been swapped out.
39137 * Pass 2: re-assign rss swap_cnt values, then
39138 * select as above.
39139 *
39140 * With this approach, there's no need to remember the
39141 * last task swapped out. If the swap-out fails, we
39142 * clear swap_cnt so the task won't be selected again
39143 * until all others have been tried.
39144 *
39145 * Think of swap_cnt as a "shadow rss" - it tells us
39146 * which process we want to page out (always try
39147 * largest first). */
39148 counter = nr_tasks / (priority+1);
39149 if (counter < 1)
39150 counter = 1;
39151 if (counter > nr_tasks)
39152 counter = nr_tasks;
39153
39154 for (; counter >= 0; counter--) {
39155 assign = 0;
39156 max_cnt = 0;
39157 pbest = NULL;
39158 select:
39159 read_lock(&tasklist_lock);
39160 p = init_task.next_task;
39161 for (; p != &init_task; p = p->next_task) {
39162 if (!p->swappable)
39163 continue;
39164 if (p->mm->rss <= 0)
39165 continue;
39166 /* Refresh swap_cnt? */
39167 if (assign)
39168 p->mm->swap_cnt = p->mm->rss;
39169 if (p->mm->swap_cnt > max_cnt) {
39170 max_cnt = p->mm->swap_cnt;
39171 pbest = p;
39172 }
39173 }
39174 read_unlock(&tasklist_lock);
39175 if (!pbest) {
39176 if (!assign) {
39177 assign = 1;
39178 goto select;
39179 }
39180 goto out;
39181 }
39182
39183 if (swap_out_process(pbest, gfp_mask))
39184 return 1;
39185 }
39186 out:
39187 return 0;
39188 }
39189
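/* A self-contained sketch (not kernel code) of the selection policy that
 * the comment inside swap_out() calls a "shadow rss": pass 1 picks the
 * swappable task with the largest remaining swap_cnt; once every
 * swap_cnt has been run down to zero, pass 2 refreshes swap_cnt from rss
 * and selects again.  In the real swap_out() this runs over the
 * init_task list under tasklist_lock; the struct and helper below are
 * invented purely for illustration. */

#include <stddef.h>

struct sketch_task {
	unsigned long rss;		/* resident pages */
	unsigned long swap_cnt;		/* "shadow rss", counted down as we swap */
	int swappable;
};

struct sketch_task *sketch_pick_victim(struct sketch_task *tasks, int n)
{
	int assign = 0;

	for (;;) {
		struct sketch_task *best = NULL;
		unsigned long max_cnt = 0;
		int i;

		for (i = 0; i < n; i++) {
			if (!tasks[i].swappable || tasks[i].rss == 0)
				continue;
			if (assign)	/* pass 2: refresh the shadow rss */
				tasks[i].swap_cnt = tasks[i].rss;
			if (tasks[i].swap_cnt > max_cnt) {
				max_cnt = tasks[i].swap_cnt;
				best = &tasks[i];
			}
		}
		if (best || assign)
			return best;	/* NULL: nothing left to swap */
		assign = 1;		/* go around again, refreshing counts */
	}
}
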
39190 /* We need to make the locks finer granularity, but right
39191 * now we need this so that we can do page allocations
39192 * without holding the kernel lock etc.
39193 *
39194 * We want to try to free "count" pages, and we need to
39195 * cluster them so that we get good swap-out
39196 * behaviour. See the "free_memory()" macro for details.
39197 */
39198 static int do_try_to_free_pages(unsigned int gfp_mask)
39199 {
39200 int priority;
39201 int count = SWAP_CLUSTER_MAX;
39202
39203 lock_kernel();
39204
39205 /* Always trim SLAB caches when memory gets low. */
39206 kmem_cache_reap(gfp_mask);
39207
39208 priority = 6;
39209 do {
39210 while (shrink_mmap(priority, gfp_mask)) {
39211 if (!--count)
39212 goto done;
39213 }
39214
39215 /* Try to get rid of some shared memory pages.. */
39216 if (gfp_mask & __GFP_IO) {
39217 while (shm_swap(priority, gfp_mask)) {
39218 if (!--count)
39219 goto done;
39220 }
39221 }
39222
39223 /* Then, try to page stuff out.. */
39224 while (swap_out(priority, gfp_mask)) {
39225 if (!--count)
39226 goto done;
39227 }
39228
39229 shrink_dcache_memory(priority, gfp_mask);
39230 } while (--priority >= 0);
39231 done:
39232 unlock_kernel();
39233
39234 return priority >= 0;
39235 }
39236
39237 /* Before we start the kernel thread, print out the
39238 * kswapd initialization message (otherwise the init
39239 * message may be printed in the middle of another
39240 * driver's init message). It looks very bad when that
39241 * happens. */
39242 void __init kswapd_setup(void)
39243 {
39244 int i;
39245 char *revision="$Revision: 1.5 $", *s, *e;
39246
39247 swap_setup();
39248
39249 if ((s = strchr(revision, ':')) &&
39250 (e = strchr(s, '$')))
39251 s++, i = e - s;
39252 else
39253 s = revision, i = -1;
39254 printk ("Starting kswapd v%.*s\n", i, s);
39255 }
39256
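/* What the $Revision$ parsing in kswapd_setup() prints: a minimal
 * userspace sketch of the same strchr() arithmetic, with printf standing
 * in for printk. */

#include <stdio.h>
#include <string.h>

int main(void)
{
	char *revision = "$Revision: 1.5 $", *s, *e;
	int i;

	if ((s = strchr(revision, ':')) && (e = strchr(s, '$')))
		s++, i = e - s;			/* the " 1.5 " between ':' and '$' */
	else
		s = revision, i = -1;		/* fall back to the raw string */

	printf("Starting kswapd v%.*s\n", i, s);	/* "Starting kswapd v 1.5 " */
	return 0;
}
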
39257 static struct task_struct *kswapd_process;
39258
39259 /* The background pageout daemon, started as a kernel
39260 * thread from the init process.
39261 *
39262 * This basically executes once a second, trickling out
39263 * pages so that we have _some_ free memory available
39264 * even if there is no other activity that frees anything
39265 * up. This is needed for things like routing etc, where
39266 * we otherwise might have all activity going on in
39267 * asynchronous contexts that cannot page things out.
39268 *
39269 * If there are applications that are active
39270 * memory-allocators (most normal use), this basically
39271 * shouldn't matter. */
39272 int kswapd(void *unused)
39273 {
39274 struct task_struct *tsk = current;
39275
39276 kswapd_process = tsk;
39277 tsk->session = 1;
39278 tsk->pgrp = 1;
39279 strcpy(tsk->comm, "kswapd");
39280 sigfillset(&tsk->blocked);
39281
39282 /* Tell the memory management that we're a "memory
39283 * allocator", and that if we need more memory we
39284 * should get access to it regardless (see
39285 * "__get_free_pages()"). "kswapd" should never get
39286 * caught in the normal page freeing logic.
39287 *
39288 * (Kswapd normally doesn't need memory anyway, but
39289 * sometimes you need a small amount of memory in order
39290 * to be able to page out something else, and this flag
39291 * essentially protects us from recursively trying to
39292 * free more memory as we're trying to free the first
39293 * piece of memory in the first place). */
39294 tsk->flags |= PF_MEMALLOC;
39295
39296 while (1) {
39297 /* Wake up once a second to see if we need to make
39298 * more memory available.
39299 *
39300 * If we actually get into a low-memory situation,
39301 * the processes needing more memory will wake us up
39302 * on a more timely basis. */
39303 do {
39304 if (nr_free_pages >= freepages.high)
39305 break;
39306
39307 if (!do_try_to_free_pages(GFP_KSWAPD))
39308 break;
39309 } while (!tsk->need_resched);
39310 run_task_queue(&tq_disk);
39311 tsk->state = TASK_INTERRUPTIBLE;
39312 schedule_timeout(HZ);
39313 }
39314 }
39315
39316 /* Called by non-kswapd processes when they want more
39317 * memory.
39318 *
39319 * In a perfect world, this should just wake up kswapd
39320 * and return. We don't actually want to swap stuff out
39321 * from user processes, because the locking issues are
39322 * nasty to the extreme (file write locks, and MM
39323 * locking)
39324 *
39325 * One option might be to let kswapd do all the page-out
39326 * and VM page table scanning that needs locking, and
39327 * this process thread could do just the mmap shrink
39328 * stage that can be done by just dropping cached pages
39329 * without having any deadlock issues. */
39330 int try_to_free_pages(unsigned int gfp_mask)
39331 {
39332 int retval = 1;
39333
39334 wake_up_process(kswapd_process);
39335 if (gfp_mask & __GFP_WAIT)
39336 retval = do_try_to_free_pages(gfp_mask);
39337 return retval;
39338 }
39339
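/* A sketch of how an allocation slow path might call try_to_free_pages()
 * when free memory runs low, per the comment above.  The helper name and
 * the freepages.min threshold test are illustrative assumptions; this is
 * not the actual __get_free_pages() code. */
static int sketch_alloc_slow_path(int gfp_mask)
{
	if (nr_free_pages < freepages.min &&
	    !(current->flags & PF_MEMALLOC)) {
		if (!try_to_free_pages(gfp_mask))
			return 0;	/* could not reclaim anything */
	}
	return 1;			/* ok to go ahead and allocate */
}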