
mm/memory.c

31766 /*
31767  *  linux/mm/memory.c
31768  *
31769  *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
31770  */
31771 
31772 /* demand-loading started 01.12.91 - seems it is high on
31773  * the list of things wanted, and it should be easy to
31774  * implement. - Linus */
31775 
31776 /* Ok, demand-loading was easy, shared pages a little bit
31777  * tricker. Shared pages started 02.12.91, seems to
31778  * work. - Linus.
31779  *
31780  * Tested sharing by executing about 30 /bin/sh: under
31781  * the old kernel it would have taken more than the 6M I
31782  * have free, but it worked well as far as I could see.
31783  *
31784  * Also corrected some "invalidate()"s - I wasn't doing
31785  * enough of them.  */
31786 
31787 /* Real VM (paging to/from disk) started 18.12.91. Much
31788  * more work and thought has to go into this. Oh, well..
31789  * 19.12.91 - works, somewhat. Sometimes I get faults,
31790  * don't know why.  Found it. Everything seems to work
31791  * now.
31792  * 20.12.91 - Ok, making the swap-device changeable like
31793  * the root.  */
31794 /* 05.04.94 - Multi-pg memory management added for v1.1.
31795  *            Idea by Alex Bligh (alex@cconcepts.co.uk)
31796  */
31797 
31798 #include <linux/mm.h>
31799 #include <linux/mman.h>
31800 #include <linux/swap.h>
31801 #include <linux/smp_lock.h>
31802 
31803 #include <asm/uaccess.h>
31804 #include <asm/pgtable.h>
31805 
31806 unsigned long max_mapnr = 0;
31807 unsigned long num_physpages = 0;
31808 void * high_memory = NULL;
31809 
31810 /* We special-case the C-O-W ZERO_PAGE, because it's such
31811  * a common occurrence (no need to read the page to know
31812  * that it's zero - better for the cache and memory
31813  * subsystem).  */
31814 static inline void copy_cow_page(unsigned long from,
31815                                  unsigned long to)
31816 {
31817   if (from == ZERO_PAGE) {
31818     clear_page(to);
31819     return;
31820   }
31821   copy_page(to, from);
31822 }
31823 
31824 mem_map_t * mem_map = NULL;
31825 
31826 /* oom() prints a message (so that the user knows why the
31827  * process died), and gives the process an untrappable
31828  * SIGKILL.  */
31829 void oom(struct task_struct * task)
31830 {
31831   printk("\nOut of memory for %s.\n", task->comm);
31832   force_sig(SIGKILL, task);
31833 }
31834 
31835 /* Note: this doesn't free the actual pages
31836  * themselves. That has been handled earlier when
31837  * unmapping all the memory regions.  */
31838 static inline void free_one_pmd(pmd_t * dir)
31839 {
31840   pte_t * pte;
31841 
31842   if (pmd_none(*dir))
31843     return;
31844   if (pmd_bad(*dir)) {
31845     printk("free_one_pmd: bad directory entry %08lx\n",
31846            pmd_val(*dir));
31847     pmd_clear(dir);
31848     return;
31849   }
31850   pte = pte_offset(dir, 0);
31851   pmd_clear(dir);
31852   pte_free(pte);
31853 }
31854 
31855 static inline void free_one_pgd(pgd_t * dir)
31856 {
31857   int j;
31858   pmd_t * pmd;
31859 
31860   if (pgd_none(*dir))
31861     return;
31862   if (pgd_bad(*dir)) {
31863     printk("free_one_pgd: bad directory entry %08lx\n",
31864            pgd_val(*dir));
31865     pgd_clear(dir);
31866     return;
31867   }
31868   pmd = pmd_offset(dir, 0);
31869   pgd_clear(dir);
31870   for (j = 0; j < PTRS_PER_PMD ; j++)
31871     free_one_pmd(pmd+j);
31872   pmd_free(pmd);
31873 }
31874 
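The two helpers above walk the generic three-level layout even though i386 in this kernel has only two hardware levels; the middle (pmd) level is folded away by the header macros. The numbers below are the usual i386 values, given only to make the loop bounds concrete:

    /* i386 with 4 KB pages (pmd folded into the pgd):
     *   PTRS_PER_PGD = 1024, PTRS_PER_PMD = 1, PTRS_PER_PTE = 1024
     *   PGDIR_SIZE   = 1024 * 4 KB = 4 MB per pgd entry
     * so the PTRS_PER_PMD loop in free_one_pgd() executes exactly
     * once, and free_one_pmd() releases one 4 KB pte table. */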
31875 /* Low and high watermarks for page table cache.  The
31876  * system should try to have pgt_water[0] <= cache
31877  * elements <= pgt_water[1] */
31878 int pgt_cache_water[2] = { 25, 50 };
31879 
31880 /* Returns the number of pages freed */
31881 int check_pgt_cache(void)
31882 {
31883   return do_check_pgt_cache(pgt_cache_water[0],
31884                             pgt_cache_water[1]);
31885 }
31886 
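do_check_pgt_cache() itself is architecture-specific and not shown in this file; schematically it trims the cache of preallocated page-table pages back toward the low watermark once it has grown past the high one. A rough sketch of that policy, with invented helper names (cache_size, pop_cached_table, release_table), not the real implementation:

    /* schematic only -- the real routine lives in the arch headers */
    int do_check_pgt_cache_sketch(int low, int high)
    {
            int freed = 0;

            if (cache_size() > high) {
                    do {
                            release_table(pop_cached_table());
                            freed++;
                    } while (cache_size() > low);
            }
            return freed;           /* number of pages given back */
    }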
31887 
31888 /* This function clears all user-level page tables of a
31889  * process - this is needed by execve(), so that old
31890  * pages aren't in the way.  */
31891 void clear_page_tables(struct mm_struct *mm,
31892                        unsigned long first, int nr)
31893 {
31894   pgd_t * page_dir = mm->pgd;
31895 
31896   if (page_dir && page_dir != swapper_pg_dir) {
31897     page_dir += first;
31898     do {
31899       free_one_pgd(page_dir);
31900       page_dir++;
31901     } while (--nr);
31902 
31903     /* keep the page table cache within bounds */
31904     check_pgt_cache();
31905   }
31906 }
31907 
31908 /* This function just free's the page directory - the
31909  * page tables themselves have been freed earlier by
31910  * clear_page_tables().  */
31911 void free_page_tables(struct mm_struct * mm)
31912 {
31913   pgd_t * page_dir = mm->pgd;
31914 
31915   if (page_dir) {
31916     if (page_dir == swapper_pg_dir)
31917       goto out_bad;
31918     pgd_free(page_dir);
31919   }
31920   return;
31921 
31922 out_bad:
31923   printk(KERN_ERR
31924     "free_page_tables: Trying to free kernel pgd\n");
31925   return;
31926 }
31927 
31928 int new_page_tables(struct task_struct * tsk)
31929 {
31930   pgd_t * new_pg;
31931 
31932   if (!(new_pg = pgd_alloc()))
31933     return -ENOMEM;
31934   SET_PAGE_DIR(tsk, new_pg);
31935   tsk->mm->pgd = new_pg;
31936   return 0;
31937 }
31938 
31939 #define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
31940 #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
31941 
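These masks let the copy loops below detect the end of a page-table page from the pte/pmd pointer itself, with no separate index variable. A worked example for the usual i386 values (PTRS_PER_PTE = 1024, sizeof(pte_t) = 4):

    /* PTE_TABLE_MASK = (1024 - 1) * 4 = 0xffc
     * A pte table fills one 4 KB page, so the low 12 bits of a pte
     * pointer are its byte offset within the table.  After the last
     * entry is copied, src_pte++ wraps that offset to zero, the test
     * "(unsigned long)src_pte & PTE_TABLE_MASK" becomes false, and
     * the inner do/while in copy_page_range() terminates. */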
31942 /* copy one vm_area from one task to the other. Assumes
31943  * the page tables already present in the new task to be
31944  * cleared in the whole range covered by this vma.
31945  *
31946  * 08Jan98 Merged into one routine from several inline
31947  * routines to reduce variable count and make things
31948  * faster. -jj */
31949 int copy_page_range(struct mm_struct *dst,
31950                     struct mm_struct *src,
31951                     struct vm_area_struct *vma)
31952 {
31953   pgd_t * src_pgd, * dst_pgd;
31954   unsigned long address = vma->vm_start;
31955   unsigned long end = vma->vm_end;
31956   unsigned long cow =
31957     (vma->vm_flags & (VM_SHARED | VM_MAYWRITE))
31958     == VM_MAYWRITE;
31959 
31960   src_pgd = pgd_offset(src, address)-1;
31961   dst_pgd = pgd_offset(dst, address)-1;
31962 
31963   for (;;) {
31964     pmd_t * src_pmd, * dst_pmd;
31965 
31966     src_pgd++; dst_pgd++;
31967 
31968     /* copy_pmd_range */
31969 
31970     if (pgd_none(*src_pgd))
31971       goto skip_copy_pmd_range;
31972     if (pgd_bad(*src_pgd)) {
31973       printk("copy_pmd_range: bad pgd (%08lx)\n",
31974         pgd_val(*src_pgd));
31975       pgd_clear(src_pgd);
31976 skip_copy_pmd_range:
31977       address = (address + PGDIR_SIZE) & PGDIR_MASK;
31978       if (address >= end)
31979         goto out;
31980       continue;
31981     }
31982     if (pgd_none(*dst_pgd)) {
31983       if (!pmd_alloc(dst_pgd, 0))
31984         goto nomem;
31985     }
31986 
31987     src_pmd = pmd_offset(src_pgd, address);
31988     dst_pmd = pmd_offset(dst_pgd, address);
31989 
31990     do {
31991       pte_t * src_pte, * dst_pte;
31992 
31993       /* copy_pte_range */
31994 
31995       if (pmd_none(*src_pmd))
31996         goto skip_copy_pte_range;
31997       if (pmd_bad(*src_pmd)) {
31998         printk("copy_pte_range: bad pmd (%08lx)\n",
31999                pmd_val(*src_pmd));
32000         pmd_clear(src_pmd);
32001 skip_copy_pte_range:
32002         address = (address + PMD_SIZE) & PMD_MASK;
32003         if (address >= end)
32004           goto out;
32005         goto cont_copy_pmd_range;
32006       }
32007       if (pmd_none(*dst_pmd)) {
32008         if (!pte_alloc(dst_pmd, 0))
32009           goto nomem;
32010       }
32011 
32012       src_pte = pte_offset(src_pmd, address);
32013       dst_pte = pte_offset(dst_pmd, address);
32014 
32015       do {
32016         pte_t pte = *src_pte;
32017         unsigned long page_nr;
32018 
32019         /* copy_one_pte */
32020 
32021         if (pte_none(pte))
32022           goto cont_copy_pte_range;
32023         if (!pte_present(pte)) {
32024           swap_duplicate(pte_val(pte));
32025           set_pte(dst_pte, pte);
32026           goto cont_copy_pte_range;
32027         }
32028         page_nr = MAP_NR(pte_page(pte));
32029         if (page_nr >= max_mapnr ||
32030             PageReserved(mem_map+page_nr)) {
32031           set_pte(dst_pte, pte);
32032           goto cont_copy_pte_range;
32033         }
32034         /* If it's a COW mapping, write protect it both
32035          * in the parent and the child */
32036         if (cow) {
32037           pte = pte_wrprotect(pte);
32038           set_pte(src_pte, pte);
32039         }
32040         /* If it's a shared mapping, mark it clean in the
32041          * child */
32042         if (vma->vm_flags & VM_SHARED)
32043           pte = pte_mkclean(pte);
32044         set_pte(dst_pte, pte_mkold(pte));
32045         atomic_inc(&mem_map[page_nr].count);
32046 
32047 cont_copy_pte_range:
32048         address += PAGE_SIZE;
32049         if (address >= end)
32050           goto out;
32051         src_pte++;
32052         dst_pte++;
32053       } while ((unsigned long)src_pte & PTE_TABLE_MASK);
32054 
32055 cont_copy_pmd_range:
32056       src_pmd++;
32057       dst_pmd++;
32058     } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
32059   }
32060 out:
32061   return 0;
32062 
32063 nomem:
32064   return -ENOMEM;
32065 }
32066 
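The cow flag computed at the top of copy_page_range() encodes the classic fork() rule: only private mappings that may ever be written need copy-on-write treatment. Spelled out for the four flag combinations:

    /* cow = (vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE
     *
     *   VM_SHARED  VM_MAYWRITE   cow
     *       0          0          0   private, never writable: share as-is
     *       0          1          1   private, maybe written: write-protect
     *                                 both the parent and the child pte
     *       1          0          0   shared read-only
     *       1          1          0   shared writable: both see one page,
     *                                 writes must not be trapped */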
32067 /* Return indicates whether a page was freed so caller
32068  * can adjust rss */
32069 static inline int free_pte(pte_t page)
32070 {
32071   if (pte_present(page)) {
32072     unsigned long addr = pte_page(page);
32073     if (MAP_NR(addr) >= max_mapnr ||
32074         PageReserved(mem_map+MAP_NR(addr)))
32075       return 0;
32076     /* free_page() used to be able to clear swap cache
32077      * entries.  We may now have to do it manually.  */
32078     free_page_and_swap_cache(addr);
32079     return 1;
32080   }
32081   swap_free(pte_val(page));
32082   return 0;
32083 }
32084 
32085 static inline void forget_pte(pte_t page)
32086 {
32087   if (!pte_none(page)) {
32088     printk("forget_pte: old mapping existed!\n");
32089     free_pte(page);
32090   }
32091 }
32092 
32093 static inline int zap_pte_range(pmd_t * pmd,
32094   unsigned long address, unsigned long size)
32095 {
32096   pte_t * pte;
32097   int freed;
32098 
32099   if (pmd_none(*pmd))
32100     return 0;
32101   if (pmd_bad(*pmd)) {
32102     printk("zap_pte_range: bad pmd (%08lx)\n",
32103            pmd_val(*pmd));
32104     pmd_clear(pmd);
32105     return 0;
32106   }
32107   pte = pte_offset(pmd, address);
32108   address &= ~PMD_MASK;
32109   if (address + size > PMD_SIZE)
32110     size = PMD_SIZE - address;
32111   size >>= PAGE_SHIFT;
32112   freed = 0;
32113   for (;;) {
32114     pte_t page;
32115     if (!size)
32116       break;
32117     page = *pte;
32118     pte++;
32119     size--;
32120     if (pte_none(page))
32121       continue;
32122     pte_clear(pte-1);
32123     freed += free_pte(page);
32124   }
32125   return freed;
32126 }
32127 
32128 static inline int zap_pmd_range(pgd_t * dir,
32129   unsigned long address, unsigned long size)
32130 {
32131   pmd_t * pmd;
32132   unsigned long end;
32133   int freed;
32134 
32135   if (pgd_none(*dir))
32136     return 0;
32137   if (pgd_bad(*dir)) {
32138     printk("zap_pmd_range: bad pgd (%08lx)\n",
32139            pgd_val(*dir));
32140     pgd_clear(dir);
32141     return 0;
32142   }
32143   pmd = pmd_offset(dir, address);
32144   address &= ~PGDIR_MASK;
32145   end = address + size;
32146   if (end > PGDIR_SIZE)
32147     end = PGDIR_SIZE;
32148   freed = 0;
32149   do {
32150     freed += zap_pte_range(pmd, address, end - address);
32151     address = (address + PMD_SIZE) & PMD_MASK;
32152     pmd++;
32153   } while (address < end);
32154   return freed;
32155 }
32156 
32157 /* remove user pages in a given range. */
32158 void zap_page_range(struct mm_struct *mm,
32159   unsigned long address, unsigned long size)
32160 {
32161   pgd_t * dir;
32162   unsigned long end = address + size;
32163   int freed = 0;
32164 
32165   dir = pgd_offset(mm, address);
32166   while (address < end) {
32167     freed += zap_pmd_range(dir, address, end - address);
32168     address = (address + PGDIR_SIZE) & PGDIR_MASK;
32169     dir++;
32170   }
32171   /* Update rss for the mm_struct (not necessarily
32172    * current->mm) */
32173   if (mm->rss > 0) {
32174     mm->rss -= freed;
32175     if (mm->rss < 0)
32176       mm->rss = 0;
32177   }
32178 }
32179 
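zap_page_range() only tears down page-table entries and adjusts rss; the caller is expected to bracket it with the cache and TLB flushes, exactly as vmtruncate() does further down. The calling pattern, schematically:

    /* schematic caller -- start/len are whatever range is being unmapped */
    flush_cache_range(mm, start, start + len);
    zap_page_range(mm, start, len);
    flush_tlb_range(mm, start, start + len);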
32180 static inline void zeromap_pte_range(pte_t * pte,
32181   unsigned long address, unsigned long size,
32182   pte_t zero_pte)
32183 {
32184   unsigned long end;
32185 
32186   address &= ~PMD_MASK;
32187   end = address + size;
32188   if (end > PMD_SIZE)
32189     end = PMD_SIZE;
32190   do {
32191     pte_t oldpage = *pte;
32192     set_pte(pte, zero_pte);
32193     forget_pte(oldpage);
32194     address += PAGE_SIZE;
32195     pte++;
32196   } while (address < end);
32197 }
32198 
32199 static inline int zeromap_pmd_range(pmd_t * pmd,
32200   unsigned long address, unsigned long size,
32201   pte_t zero_pte)
32202 {
32203   unsigned long end;
32204 
32205   address &= ~PGDIR_MASK;
32206   end = address + size;
32207   if (end > PGDIR_SIZE)
32208     end = PGDIR_SIZE;
32209   do {
32210     pte_t * pte = pte_alloc(pmd, address);
32211     if (!pte)
32212       return -ENOMEM;
32213     zeromap_pte_range(pte, address, end - address,
32214                       zero_pte);
32215     address = (address + PMD_SIZE) & PMD_MASK;
32216     pmd++;
32217   } while (address < end);
32218   return 0;
32219 }
32220 
32221 int zeromap_page_range(unsigned long address,
32222                        unsigned long size, pgprot_t prot)
32223 {
32224   int error = 0;
32225   pgd_t * dir;
32226   unsigned long beg = address;
32227   unsigned long end = address + size;
32228   pte_t zero_pte;
32229 
32230   zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
32231   dir = pgd_offset(current->mm, address);
32232   flush_cache_range(current->mm, beg, end);
32233   while (address < end) {
32234     pmd_t *pmd = pmd_alloc(dir, address);
32235     error = -ENOMEM;
32236     if (!pmd)
32237       break;
32238     error = zeromap_pmd_range(pmd, address,
32239                               end - address, zero_pte);
32240     if (error)
32241       break;
32242     address = (address + PGDIR_SIZE) & PGDIR_MASK;
32243     dir++;
32244   }
32245   flush_tlb_range(current->mm, beg, end);
32246   return error;
32247 }
32248 
32249 /* maps a range of physical memory into the requested
32250  * pages. the old mappings are removed. any references to
32251  * nonexistent pages results in null mappings (currently
32252  * treated as "copy-on-access") */
32253 static inline void remap_pte_range(pte_t * pte,
32254   unsigned long address, unsigned long size,
32255   unsigned long phys_addr, pgprot_t prot)
32256 {
32257   unsigned long end;
32258 
32259   address &= ~PMD_MASK;
32260   end = address + size;
32261   if (end > PMD_SIZE)
32262     end = PMD_SIZE;
32263   do {
32264     unsigned long mapnr;
32265     pte_t oldpage = *pte;
32266     pte_clear(pte);
32267 
32268     mapnr = MAP_NR(__va(phys_addr));
32269     if (mapnr >= max_mapnr ||
32270         PageReserved(mem_map+mapnr))
32271       set_pte(pte, mk_pte_phys(phys_addr, prot));
32272     forget_pte(oldpage);
32273     address += PAGE_SIZE;
32274     phys_addr += PAGE_SIZE;
32275     pte++;
32276   } while (address < end);
32277 }
32278 
32279 static inline int remap_pmd_range(pmd_t * pmd,
32280   unsigned long address, unsigned long size,
32281   unsigned long phys_addr, pgprot_t prot)
32282 {
32283   unsigned long end;
32284 
32285   address &= ~PGDIR_MASK;
32286   end = address + size;
32287   if (end > PGDIR_SIZE)
32288     end = PGDIR_SIZE;
32289   phys_addr -= address;
32290   do {
32291     pte_t * pte = pte_alloc(pmd, address);
32292     if (!pte)
32293       return -ENOMEM;
32294     remap_pte_range(pte, address, end - address,
32295                     address + phys_addr, prot);
32296     address = (address + PMD_SIZE) & PMD_MASK;
32297     pmd++;
32298   } while (address < end);
32299   return 0;
32300 }
32301 
32302 int remap_page_range(unsigned long from,
32303   unsigned long phys_addr, unsigned long size,
32304   pgprot_t prot)
32305 {
32306   int error = 0;
32307   pgd_t * dir;
32308   unsigned long beg = from;
32309   unsigned long end = from + size;
32310 
32311   phys_addr -= from;
32312   dir = pgd_offset(current->mm, from);
32313   flush_cache_range(current->mm, beg, end);
32314   while (from < end) {
32315     pmd_t *pmd = pmd_alloc(dir, from);
32316     error = -ENOMEM;
32317     if (!pmd)
32318       break;
32319     error = remap_pmd_range(pmd, from, end - from,
32320                             phys_addr + from, prot);
32321     if (error)
32322       break;
32323     from = (from + PGDIR_SIZE) & PGDIR_MASK;
32324     dir++;
32325   }
32326   flush_tlb_range(current->mm, beg, end);
32327   return error;
32328 }
32329 
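remap_page_range() is what device drivers call from their mmap method to map a physical aperture (frame-buffer or register space, say) into user space. A minimal sketch against the 2.2 file_operations signature; foo_mmap and FOO_PHYS_BASE are invented for the illustration:

    /* illustrative sketch, not a real driver */
    static int foo_mmap(struct file * file, struct vm_area_struct * vma)
    {
            unsigned long size = vma->vm_end - vma->vm_start;

            if (remap_page_range(vma->vm_start,
                                 FOO_PHYS_BASE + vma->vm_offset,
                                 size, vma->vm_page_prot))
                    return -EAGAIN;
            return 0;
    }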
32330 /* sanity-check function.. */
32331 static void put_page(pte_t * page_table, pte_t pte)
32332 {
32333   if (!pte_none(*page_table)) {
32334     free_page_and_swap_cache(pte_page(pte));
32335     return;
32336   }
32337 /* no need for flush_tlb */
32338   set_pte(page_table, pte);
32339 }
32340 
32341 /* This routine is used to map in a page into an address
32342  * space: needed by execve() for the initial stack and
32343  * environment pages.  */
32344 unsigned long put_dirty_page(struct task_struct * tsk,
32345   unsigned long page, unsigned long address)
32346 {
32347   pgd_t * pgd;
32348   pmd_t * pmd;
32349   pte_t * pte;
32350 
32351   if (MAP_NR(page) >= max_mapnr)
32352     printk("put_dirty_page: trying to put page %08lx at "
32353            "%08lx\n",page,address);
32354   if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
32355     printk("mem_map disagrees with %08lx at %08lx\n",
32356            page,address);
32357   pgd = pgd_offset(tsk->mm,address);
32358   pmd = pmd_alloc(pgd, address);
32359   if (!pmd) {
32360     free_page(page);
32361     oom(tsk);
32362     return 0;
32363   }
32364   pte = pte_alloc(pmd, address);
32365   if (!pte) {
32366     free_page(page);
32367     oom(tsk);
32368     return 0;
32369   }
32370   if (!pte_none(*pte)) {
32371     printk("put_dirty_page: page already exists\n");
32372     free_page(page);
32373     return 0;
32374   }
32375   flush_page_to_ram(page);
32376   set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page,
32377                                            PAGE_COPY))));
32378   /* no need for flush_tlb */
32379   return page;
32380 }
32381 
32382 /* This routine handles present pages, when users try to
32383  * write to a shared page. It is done by copying the page
32384  * to a new address and decrementing the shared-page
32385  * counter for the old page.
32386  *
32387  * Goto-purists beware: the only reason for goto's here
32388  * is that it results in better assembly code.. The
32389  * "default" path will see no jumps at all.
32390  *
32391  * Note that this routine assumes that the protection
32392  * checks have been done by the caller (the low-level
32393  * page fault routine in most cases).  Thus we can safely
32394  * just mark it writable once we've done any necessary
32395  * COW.
32396  *
32397  * We also mark the page dirty at this point even though
32398  * the page will change only once the write actually
32399  * happens. This avoids a few races, and potentially
32400  * makes it more efficient.  */

32401 static int do_wp_page(struct task_struct * tsk,
32402   struct vm_area_struct * vma, unsigned long address,
32403   pte_t *page_table)
32404 {
32405   pte_t pte;
32406   unsigned long old_page, new_page;
32407   struct page * page_map;
32408 
32409   pte = *page_table;
32410   new_page = __get_free_page(GFP_USER);
32411   /* Did someone else copy this page for us while we
32412    * slept? */
32413   if (pte_val(*page_table) != pte_val(pte))
32414     goto end_wp_page;
32415   if (!pte_present(pte))
32416     goto end_wp_page;
32417   if (pte_write(pte))
32418     goto end_wp_page;
32419   old_page = pte_page(pte);
32420   if (MAP_NR(old_page) >= max_mapnr)
32421     goto bad_wp_page;
32422   tsk->min_flt++;
32423   page_map = mem_map + MAP_NR(old_page);
32424 
32425   /* We can avoid the copy if:
32426    * - we're the only user (count == 1)
32427    * - the only other user is the swap cache,
32428    *   and the only swap cache user is itself,
32429    *   in which case we can remove the page
32430    *   from the swap cache.
32431    */
32432   switch (atomic_read(&page_map->count)) {
32433   case 2:
32434     if (!PageSwapCache(page_map))
32435       break;
32436     if (swap_count(page_map->offset) != 1)
32437       break;
32438     delete_from_swap_cache(page_map);
32439     /* FallThrough */
32440   case 1:
32441     /* We can release the kernel lock now.. */
32442     unlock_kernel();
32443 
32444     flush_cache_page(vma, address);
32445     set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
32446     flush_tlb_page(vma, address);
32447 end_wp_page:
32448     if (new_page)
32449       free_page(new_page);
32450     return 1;
32451   }
32452 
32453   unlock_kernel();
32454   if (!new_page)
32455     return 0;
32456 
32457   if (PageReserved(mem_map + MAP_NR(old_page)))
32458     ++vma->vm_mm->rss;
32459   copy_cow_page(old_page,new_page);
32460   flush_page_to_ram(old_page);
32461   flush_page_to_ram(new_page);
32462   flush_cache_page(vma, address);
32463   set_pte(page_table,
32464           pte_mkwrite(pte_mkdirty(mk_pte(new_page,
32465                                    vma->vm_page_prot))));
32466   free_page(old_page);
32467   flush_tlb_page(vma, address);
32468   return 1;
32469 
32470 bad_wp_page:
32471   printk("do_wp_page: bogus page at address "
32472          "%08lx (%08lx)\n", address, old_page);
32473   send_sig(SIGKILL, tsk, 1);
32474   if (new_page)
32475     free_page(new_page);
32476   return 0;
32477 }
32478 
32479 /* This function zeroes out partial mmap'ed pages at
32480  truncation time..  */
32481 static void partial_clear(struct vm_area_struct *vma,
32482                           unsigned long address)
32483 {
32484   pgd_t *page_dir;
32485   pmd_t *page_middle;
32486   pte_t *page_table, pte;
32487 
32488   page_dir = pgd_offset(vma->vm_mm, address);
32489   if (pgd_none(*page_dir))
32490     return;
32491   if (pgd_bad(*page_dir)) {
32492     printk("bad page table directory entry %p:[%lx]\n",
32493            page_dir, pgd_val(*page_dir));
32494     pgd_clear(page_dir);
32495     return;
32496   }
32497   page_middle = pmd_offset(page_dir, address);
32498   if (pmd_none(*page_middle))
32499     return;
32500   if (pmd_bad(*page_middle)) {
32501     printk("bad page table directory entry %p:[%lx]\n",
32502            page_dir, pgd_val(*page_dir));
32503     pmd_clear(page_middle);
32504     return;
32505   }
32506   page_table = pte_offset(page_middle, address);
32507   pte = *page_table;
32508   if (!pte_present(pte))
32509     return;
32510   flush_cache_page(vma, address);
32511   address &= ~PAGE_MASK;
32512   address += pte_page(pte);
32513   if (MAP_NR(address) >= max_mapnr)
32514     return;
32515   memset((void *) address, 0,
32516          PAGE_SIZE - (address & ~PAGE_MASK));
32517   flush_page_to_ram(pte_page(pte));
32518 }
32519 
32520 /* Handle all mappings that got truncated by a
32521  * "truncate()" system call.
32522  *
32523  * NOTE! We have to be ready to update the memory sharing
32524  * between the file and the memory map for a potential
32525  * last incomplete page.  Ugly, but necessary.  */
32526 void vmtruncate(struct inode * inode,
32527                 unsigned long offset)
32528 {
32529   struct vm_area_struct * mpnt;
32530 
32531   truncate_inode_pages(inode, offset);
32532   if (!inode->i_mmap)
32533     return;
32534   mpnt = inode->i_mmap;
32535   do {
32536     struct mm_struct *mm = mpnt->vm_mm;
32537     unsigned long start = mpnt->vm_start;
32538     unsigned long end = mpnt->vm_end;
32539     unsigned long len = end - start;
32540     unsigned long diff;
32541 
32542     /* mapping wholly truncated? */
32543     if (mpnt->vm_offset >= offset) {
32544       flush_cache_range(mm, start, end);
32545       zap_page_range(mm, start, len);
32546       flush_tlb_range(mm, start, end);
32547       continue;
32548     }
32549     /* mapping wholly unaffected? */
32550     diff = offset - mpnt->vm_offset;
32551     if (diff >= len)
32552       continue;
32553     /* Ok, partially affected.. */
32554     start += diff;
32555     len = (len - diff) & PAGE_MASK;
32556     if (start & ~PAGE_MASK) {
32557       partial_clear(mpnt, start);
32558       start = (start + ~PAGE_MASK) & PAGE_MASK;
32559     }
32560     flush_cache_range(mm, start, end);
32561     zap_page_range(mm, start, len);
32562     flush_tlb_range(mm, start, end);
32563   } while ((mpnt = mpnt->vm_next_share) != NULL);
32564 }
32565 
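The rounding in the partially-affected case is easiest to follow on concrete numbers (4 KB pages). Suppose a vma maps file offsets 0..0x8000 at start = 0x40000000 and the file is truncated to offset 0x5234:

    /* diff  = 0x5234 - 0                     = 0x5234
     * start = 0x40000000 + 0x5234            = 0x40005234
     * len   = (0x8000 - 0x5234) & PAGE_MASK  = 0x2000
     * start is not page-aligned, so partial_clear() zeroes
     * 0x40005234..0x40005fff, start rounds up to 0x40006000, and
     * zap_page_range() then drops the two whole pages that remain
     * mapped beyond the new end of the file. */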
32566 
32567 /* This is called with the kernel lock held, we need to
32568  * return without it.  */
32569 static int do_swap_page(struct task_struct * tsk,
32570   struct vm_area_struct * vma, unsigned long address,
32571   pte_t * page_table, pte_t entry, int write_access)
32572 {
32573   if (!vma->vm_ops || !vma->vm_ops->swapin) {
32574     swap_in(tsk, vma, page_table, pte_val(entry),
32575             write_access);
32576     flush_page_to_ram(pte_page(*page_table));
32577   } else {
32578     pte_t page =
32579       vma->vm_ops->swapin(vma,
32580         address - vma->vm_start + vma->vm_offset,
32581         pte_val(entry));
32582     if (pte_val(*page_table) != pte_val(entry)) {
32583       free_page(pte_page(page));
32584     } else {
32585       if (atomic_read(&mem_map[MAP_NR(pte_page(page))].
32586                       count) > 1 &&
32587           !(vma->vm_flags & VM_SHARED))
32588         page = pte_wrprotect(page);
32589       ++vma->vm_mm->rss;
32590       ++tsk->maj_flt;
32591       flush_page_to_ram(pte_page(page));
32592       set_pte(page_table, page);
32593     }
32594   }
32595   unlock_kernel();
32596   return 1;
32597 }
32598 
32599 /* This only needs the MM semaphore */
32600 static int do_anonymous_page(struct task_struct * tsk,
32601   struct vm_area_struct * vma, pte_t *page_table,
32602   int write_access)
32603 {
32604   pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE,
32605                                      vma->vm_page_prot));
32606   if (write_access) {
32607     unsigned long page = __get_free_page(GFP_USER);
32608     if (!page)
32609       return 0;
32610     clear_page(page);
32611     entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
32612                                     vma->vm_page_prot)));
32613     vma->vm_mm->rss++;
32614     tsk->min_flt++;
32615     flush_page_to_ram(page);
32616   }
32617   put_page(page_table, entry);
32618   return 1;
32619 }
32620 
32621 /* do_no_page() tries to create a new page mapping. It
32622  * aggressively tries to share with existing pages, but
32623  * makes a separate copy if the "write_access" parameter
32624  * is true in order to avoid the next page fault.
32625  *
32626  * As this is called only for pages that do not currently
32627  * exist, we do not need to flush old virtual caches or
32628  * the TLB.
32629  *
32630  * This is called with the MM semaphore and the kernel
32631  * lock held.  We need to release the kernel lock as soon
32632  * as possible..  */
32633 static int do_no_page(struct task_struct * tsk,
32634   struct vm_area_struct * vma, unsigned long address,
32635   int write_access, pte_t *page_table)
32636 {
32637   unsigned long page;
32638   pte_t entry;
32639 
32640   if (!vma->vm_ops || !vma->vm_ops->nopage) {
32641     unlock_kernel();
32642     return do_anonymous_page(tsk, vma, page_table,
32643                              write_access);
32644   }
32645 
32646   /* The third argument is "no_share", which tells the
32647    * low-level code to copy, not share the page even if
32648    * sharing is possible.  It's essentially an early COW
32649    * detection.  */
32650   page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
32651     (vma->vm_flags & VM_SHARED)?0:write_access);
32652 
32653   unlock_kernel();
32654   if (!page)
32655     return 0;
32656 
32657   ++tsk->maj_flt;
32658   ++vma->vm_mm->rss;
32659   /* This silly early PAGE_DIRTY setting removes a race
32660    * due to the bad i386 page protection. But it's valid
32661    * for other architectures too.
32662    *
32663    * Note that if write_access is true, we either now
32664    * have an exclusive copy of the page, or this is a
32665    * shared mapping, so we can make it writable and dirty
32666    * to avoid having to handle that later.  */
32667   flush_page_to_ram(page);
32668   entry = mk_pte(page, vma->vm_page_prot);
32669   if (write_access) {
32670     entry = pte_mkwrite(pte_mkdirty(entry));
32671   } else if (atomic_read(&mem_map[MAP_NR(page)].
32672                          count) > 1 &&
32673              !(vma->vm_flags & VM_SHARED))
32674     entry = pte_wrprotect(entry);
32675   put_page(page_table, entry);
32676   /* no need to invalidate: a not-present page shouldn't
32677    * be cached */
32678   return 1;
32679 }
32680 
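The nopage method called above has the 2.2 signature: it receives the faulting vma, the page-aligned address and the no_share flag, and returns the address of a page (as obtained from __get_free_page()), or 0 on failure. A minimal sketch of such a method; bar_nopage is invented, and a real implementation such as filemap_nopage() would look the page up in the page cache instead of always allocating a fresh one:

    /* illustrative sketch only */
    static unsigned long bar_nopage(struct vm_area_struct * vma,
                                    unsigned long address, int no_share)
    {
            unsigned long page = __get_free_page(GFP_USER);

            if (!page)
                    return 0;       /* do_no_page() turns this into failure */
            clear_page(page);       /* a real method would fill the page
                                     * from its backing object here */
            return page;
    }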
32681 /* These routines also need to handle stuff like marking
32682  * pages dirty and/or accessed for architectures that
32683  * don't do it in hardware (most RISC architectures).
32684  * The early dirtying is also good on the i386.
32685  *
32686  * There is also a hook called "update_mmu_cache()" that
32687  * architectures with external mmu caches can use to
32688  * update those (ie the Sparc or PowerPC hashed page
32689  * tables that act as extended TLBs).  */
32690 static inline int handle_pte_fault(
32691   struct task_struct *tsk,
32692   struct vm_area_struct * vma, unsigned long address,
32693   int write_access, pte_t * pte)
32694 {
32695   pte_t entry;
32696 
32697   lock_kernel();
32698   entry = *pte;
32699 
32700   if (!pte_present(entry)) {
32701     if (pte_none(entry))
32702       return do_no_page(tsk, vma, address, write_access,
32703                         pte);
32704     return do_swap_page(tsk, vma, address, pte, entry,
32705                         write_access);
32706   }
32707 
32708   entry = pte_mkyoung(entry);
32709   set_pte(pte, entry);
32710   flush_tlb_page(vma, address);
32711   if (write_access) {
32712     if (!pte_write(entry))
32713       return do_wp_page(tsk, vma, address, pte);
32714 
32715     entry = pte_mkdirty(entry);
32716     set_pte(pte, entry);
32717     flush_tlb_page(vma, address);
32718   }
32719   unlock_kernel();
32720   return 1;
32721 }
32722 
32723 /* By the time we get here, we already hold the mm
32724  * semaphore */
32725 int handle_mm_fault(struct task_struct *tsk,
32726   struct vm_area_struct * vma, unsigned long address,
32727   int write_access)
32728 {
32729   pgd_t *pgd;
32730   pmd_t *pmd;
32731 
32732   pgd = pgd_offset(vma->vm_mm, address);
32733   pmd = pmd_alloc(pgd, address);
32734   if (pmd) {
32735     pte_t * pte = pte_alloc(pmd, address);
32736     if (pte) {
32737       if (handle_pte_fault(tsk, vma, address,
32738                            write_access, pte)) {
32739         update_mmu_cache(vma, address, *pte);
32740         return 1;
32741       }
32742     }
32743   }
32744   return 0;
32745 }
32746 
32747 /* Simplistic page force-in.. */
32748 void make_pages_present(unsigned long addr,
32749                         unsigned long end)
32750 {
32751   int write;
32752   struct vm_area_struct * vma;
32753 
32754   vma = find_vma(current->mm, addr);
32755   write = (vma->vm_flags & VM_WRITE) != 0;
32756   while (addr < end) {
32757     handle_mm_fault(current, vma, addr, write);
32758     addr += PAGE_SIZE;
32759   }
32760 }

