mm/memory.c
31766 /*
31767 * linux/mm/memory.c
31768 *
31769 * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
31770 */
31771
31772 /* demand-loading started 01.12.91 - seems it is high on
31773 * the list of things wanted, and it should be easy to
31774 * implement. - Linus */
31775
31776 /* Ok, demand-loading was easy, shared pages a little bit
31778 * trickier. Shared pages started 02.12.91, seems to
31778 * work. - Linus.
31779 *
31780 * Tested sharing by executing about 30 /bin/sh: under
31781 * the old kernel it would have taken more than the 6M I
31782 * have free, but it worked well as far as I could see.
31783 *
31784 * Also corrected some "invalidate()"s - I wasn't doing
31785 * enough of them. */
31786
31787 /* Real VM (paging to/from disk) started 18.12.91. Much
31788 * more work and thought has to go into this. Oh, well..
31789 * 19.12.91 - works, somewhat. Sometimes I get faults,
31790 * don't know why. Found it. Everything seems to work
31791 * now.
31792 * 20.12.91 - Ok, making the swap-device changeable like
31793 * the root. */
31794 /* 05.04.94 - Multi-page memory management added for v1.1.
31795 * Idea by Alex Bligh (alex@cconcepts.co.uk)
31796 */
31797
31798 #include <linux/mm.h>
31799 #include <linux/mman.h>
31800 #include <linux/swap.h>
31801 #include <linux/smp_lock.h>
31802
31803 #include <asm/uaccess.h>
31804 #include <asm/pgtable.h>
31805
31806 unsigned long max_mapnr = 0;
31807 unsigned long num_physpages = 0;
31808 void * high_memory = NULL;
31809
31810 /* We special-case the C-O-W ZERO_PAGE, because it's such
31811 * a common occurrence (no need to read the page to know
31812 * that it's zero - better for the cache and memory
31813 * subsystem). */
31814 static inline void copy_cow_page(unsigned long from,
31815 unsigned long to)
31816 {
31817 if (from == ZERO_PAGE) {
31818 clear_page(to);
31819 return;
31820 }
31821 copy_page(to, from);
31822 }
31823
31824 mem_map_t * mem_map = NULL;
31825
31826 /* oom() prints a message (so that the user knows why the
31827 * process died), and gives the process an untrappable
31828 * SIGKILL. */
31829 void oom(struct task_struct * task)
31830 {
31831 printk("\nOut of memory for %s.\n", task->comm);
31832 force_sig(SIGKILL, task);
31833 }
31834
31835 /* Note: this doesn't free the actual pages
31836 * themselves. That has been handled earlier when
31837 * unmapping all the memory regions. */
31838 static inline void free_one_pmd(pmd_t * dir)
31839 {
31840 pte_t * pte;
31841
31842 if (pmd_none(*dir))
31843 return;
31844 if (pmd_bad(*dir)) {
31845 printk("free_one_pmd: bad directory entry %08lx\n",
31846 pmd_val(*dir));
31847 pmd_clear(dir);
31848 return;
31849 }
31850 pte = pte_offset(dir, 0);
31851 pmd_clear(dir);
31852 pte_free(pte);
31853 }
31854
31855 static inline void free_one_pgd(pgd_t * dir)
31856 {
31857 int j;
31858 pmd_t * pmd;
31859
31860 if (pgd_none(*dir))
31861 return;
31862 if (pgd_bad(*dir)) {
31863 printk("free_one_pgd: bad directory entry %08lx\n",
31864 pgd_val(*dir));
31865 pgd_clear(dir);
31866 return;
31867 }
31868 pmd = pmd_offset(dir, 0);
31869 pgd_clear(dir);
31870 for (j = 0; j < PTRS_PER_PMD ; j++)
31871 free_one_pmd(pmd+j);
31872 pmd_free(pmd);
31873 }
31874
31875 /* Low and high watermarks for page table cache. The
31876 * system should try to have pgt_cache_water[0] <=
31877 * cache elements <= pgt_cache_water[1] */
31878 int pgt_cache_water[2] = { 25, 50 };
31879
31880 /* Returns the number of pages freed */
31881 int check_pgt_cache(void)
31882 {
31883 return do_check_pgt_cache(pgt_cache_water[0],
31884 pgt_cache_water[1]);
31885 }
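
The watermark policy described above is enforced by do_check_pgt_cache(), which is architecture-specific and not shown in this file. The user-space sketch below only illustrates the low/high watermark idea; the free list, trim_cache() and the exact policy (shrink back to the low mark once the high mark is exceeded) are assumptions for illustration, not the kernel's implementation.

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical cache node standing in for a cached page table page. */
    struct cached_table {
            struct cached_table *next;
    };

    static struct cached_table *cache_head;
    static int cache_size;

    /* Keep low <= cache_size <= high: once the cache grows past the high
     * watermark, shrink it back down to the low one.  Returns the number
     * of entries freed, as check_pgt_cache() returns pages freed. */
    static int trim_cache(int low, int high)
    {
            int freed = 0;

            if (cache_size <= high)
                    return 0;
            while (cache_size > low) {
                    struct cached_table *t = cache_head;
                    cache_head = t->next;
                    free(t);
                    cache_size--;
                    freed++;
            }
            return freed;
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 100; i++) {     /* simulate 100 cached page tables */
                    struct cached_table *t = malloc(sizeof(*t));
                    t->next = cache_head;
                    cache_head = t;
                    cache_size++;
            }
            printf("freed %d, %d left\n", trim_cache(25, 50), cache_size);
            return 0;
    }

With the watermarks of 25 and 50 used above, this prints "freed 75, 25 left".
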
31886
31887
31888 /* This function clears all user-level page tables of a
31889 * process - this is needed by execve(), so that old
31890 * pages aren't in the way. */
31891 void clear_page_tables(struct mm_struct *mm,
31892 unsigned long first, int nr)
31893 {
31894 pgd_t * page_dir = mm->pgd;
31895
31896 if (page_dir && page_dir != swapper_pg_dir) {
31897 page_dir += first;
31898 do {
31899 free_one_pgd(page_dir);
31900 page_dir++;
31901 } while (--nr);
31902
31903 /* keep the page table cache within bounds */
31904 check_pgt_cache();
31905 }
31906 }
31907
31908 /* This function just frees the page directory - the
31909 * page tables themselves have been freed earlier by
31910 * clear_page_tables(). */
31911 void free_page_tables(struct mm_struct * mm)
31912 {
31913 pgd_t * page_dir = mm->pgd;
31914
31915 if (page_dir) {
31916 if (page_dir == swapper_pg_dir)
31917 goto out_bad;
31918 pgd_free(page_dir);
31919 }
31920 return;
31921
31922 out_bad:
31923 printk(KERN_ERR
31924 "free_page_tables: Trying to free kernel pgd\n");
31925 return;
31926 }
31927
31928 int new_page_tables(struct task_struct * tsk)
31929 {
31930 pgd_t * new_pg;
31931
31932 if (!(new_pg = pgd_alloc()))
31933 return -ENOMEM;
31934 SET_PAGE_DIR(tsk, new_pg);
31935 tsk->mm->pgd = new_pg;
31936 return 0;
31937 }
31938
31939 #define PTE_TABLE_MASK ((PTRS_PER_PTE-1) * sizeof(pte_t))
31940 #define PMD_TABLE_MASK ((PTRS_PER_PMD-1) * sizeof(pmd_t))
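
These two masks are what terminate the inner copy loops in copy_page_range() below: a pte (or pmd) table occupies exactly one page-aligned block, so once the incremented table pointer steps past the last entry its low-order bits become zero and the do/while test fails, whatever entry the loop started at. A small stand-alone sketch of that arithmetic, assuming i386 values from this kernel generation (PTRS_PER_PTE = 1024 and a 4-byte pte_t, so PTE_TABLE_MASK = 0xffc):

    #include <stdio.h>

    typedef unsigned int pte_t;             /* 4-byte stand-in, same size as on i386 */
    #define PTRS_PER_PTE    1024
    #define PTE_TABLE_MASK  ((PTRS_PER_PTE-1) * sizeof(pte_t))

    int main(void)
    {
            unsigned long table = 0xc0123000UL;     /* a page-aligned pte table */
            unsigned long p = table;                /* plays the role of src_pte */
            int entries = 0;

            do {
                    entries++;
                    p += sizeof(pte_t);             /* src_pte++ */
            } while (p & PTE_TABLE_MASK);           /* zero again only at the next table */

            printf("mask = %#lx, loop covered %d entries\n",
                   (unsigned long) PTE_TABLE_MASK, entries);
            return 0;
    }
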
31941
31942 /* copy one vm_area from one task to the other. Assumes
31943 * that the page tables already present in the new task
31944 * have been cleared in the whole range covered by this vma.
31945 *
31946 * 08Jan98 Merged into one routine from several inline
31947 * routines to reduce variable count and make things
31948 * faster. -jj */
31949 int copy_page_range(struct mm_struct *dst,
31950 struct mm_struct *src,
31951 struct vm_area_struct *vma)
31952 {
31953 pgd_t * src_pgd, * dst_pgd;
31954 unsigned long address = vma->vm_start;
31955 unsigned long end = vma->vm_end;
31956 unsigned long cow =
31957 (vma->vm_flags & (VM_SHARED | VM_MAYWRITE))
31958 == VM_MAYWRITE;
31959
31960 src_pgd = pgd_offset(src, address)-1;
31961 dst_pgd = pgd_offset(dst, address)-1;
31962
31963 for (;;) {
31964 pmd_t * src_pmd, * dst_pmd;
31965
31966 src_pgd++; dst_pgd++;
31967
31968 /* copy_pmd_range */
31969
31970 if (pgd_none(*src_pgd))
31971 goto skip_copy_pmd_range;
31972 if (pgd_bad(*src_pgd)) {
31973 printk("copy_pmd_range: bad pgd (%08lx)\n",
31974 pgd_val(*src_pgd));
31975 pgd_clear(src_pgd);
31976 skip_copy_pmd_range:
31977 address = (address + PGDIR_SIZE) & PGDIR_MASK;
31978 if (address >= end)
31979 goto out;
31980 continue;
31981 }
31982 if (pgd_none(*dst_pgd)) {
31983 if (!pmd_alloc(dst_pgd, 0))
31984 goto nomem;
31985 }
31986
31987 src_pmd = pmd_offset(src_pgd, address);
31988 dst_pmd = pmd_offset(dst_pgd, address);
31989
31990 do {
31991 pte_t * src_pte, * dst_pte;
31992
31993 /* copy_pte_range */
31994
31995 if (pmd_none(*src_pmd))
31996 goto skip_copy_pte_range;
31997 if (pmd_bad(*src_pmd)) {
31998 printk("copy_pte_range: bad pmd (%08lx)\n",
31999 pmd_val(*src_pmd));
32000 pmd_clear(src_pmd);
32001 skip_copy_pte_range:
32002 address = (address + PMD_SIZE) & PMD_MASK;
32003 if (address >= end)
32004 goto out;
32005 goto cont_copy_pmd_range;
32006 }
32007 if (pmd_none(*dst_pmd)) {
32008 if (!pte_alloc(dst_pmd, 0))
32009 goto nomem;
32010 }
32011
32012 src_pte = pte_offset(src_pmd, address);
32013 dst_pte = pte_offset(dst_pmd, address);
32014
32015 do {
32016 pte_t pte = *src_pte;
32017 unsigned long page_nr;
32018
32019 /* copy_one_pte */
32020
32021 if (pte_none(pte))
32022 goto cont_copy_pte_range;
32023 if (!pte_present(pte)) {
32024 swap_duplicate(pte_val(pte));
32025 set_pte(dst_pte, pte);
32026 goto cont_copy_pte_range;
32027 }
32028 page_nr = MAP_NR(pte_page(pte));
32029 if (page_nr >= max_mapnr ||
32030 PageReserved(mem_map+page_nr)) {
32031 set_pte(dst_pte, pte);
32032 goto cont_copy_pte_range;
32033 }
32034 /* If it's a COW mapping, write protect it both
32035 * in the parent and the child */
32036 if (cow) {
32037 pte = pte_wrprotect(pte);
32038 set_pte(src_pte, pte);
32039 }
32040 /* If it's a shared mapping, mark it clean in the
32041 * child */
32042 if (vma->vm_flags & VM_SHARED)
32043 pte = pte_mkclean(pte);
32044 set_pte(dst_pte, pte_mkold(pte));
32045 atomic_inc(&mem_map[page_nr].count);
32046
32047 cont_copy_pte_range:
32048 address += PAGE_SIZE;
32049 if (address >= end)
32050 goto out;
32051 src_pte++;
32052 dst_pte++;
32053 } while ((unsigned long)src_pte & PTE_TABLE_MASK);
32054
32055 cont_copy_pmd_range:
32056 src_pmd++;
32057 dst_pmd++;
32058 } while ((unsigned long)src_pmd & PMD_TABLE_MASK);
32059 }
32060 out:
32061 return 0;
32062
32063 nomem:
32064 return -ENOMEM;
32065 }
32066
32067 /* Return indicates whether a page was freed so caller
32068 * can adjust rss */
32069 static inline int free_pte(pte_t page)
32070 {
32071 if (pte_present(page)) {
32072 unsigned long addr = pte_page(page);
32073 if (MAP_NR(addr) >= max_mapnr ||
32074 PageReserved(mem_map+MAP_NR(addr)))
32075 return 0;
32076 /* free_page() used to be able to clear swap cache
32077 * entries. We may now have to do it manually. */
32078 free_page_and_swap_cache(addr);
32079 return 1;
32080 }
32081 swap_free(pte_val(page));
32082 return 0;
32083 }
32084
32085 static inline void forget_pte(pte_t page)
32086 {
32087 if (!pte_none(page)) {
32088 printk("forget_pte: old mapping existed!\n");
32089 free_pte(page);
32090 }
32091 }
32092
32093 static inline int zap_pte_range(pmd_t * pmd,
32094 unsigned long address, unsigned long size)
32095 {
32096 pte_t * pte;
32097 int freed;
32098
32099 if (pmd_none(*pmd))
32100 return 0;
32101 if (pmd_bad(*pmd)) {
32102 printk("zap_pte_range: bad pmd (%08lx)\n",
32103 pmd_val(*pmd));
32104 pmd_clear(pmd);
32105 return 0;
32106 }
32107 pte = pte_offset(pmd, address);
32108 address &= ~PMD_MASK;
32109 if (address + size > PMD_SIZE)
32110 size = PMD_SIZE - address;
32111 size >>= PAGE_SHIFT;
32112 freed = 0;
32113 for (;;) {
32114 pte_t page;
32115 if (!size)
32116 break;
32117 page = *pte;
32118 pte++;
32119 size--;
32120 if (pte_none(page))
32121 continue;
32122 pte_clear(pte-1);
32123 freed += free_pte(page);
32124 }
32125 return freed;
32126 }
32127
32128 static inline int zap_pmd_range(pgd_t * dir,
32129 unsigned long address, unsigned long size)
32130 {
32131 pmd_t * pmd;
32132 unsigned long end;
32133 int freed;
32134
32135 if (pgd_none(*dir))
32136 return 0;
32137 if (pgd_bad(*dir)) {
32138 printk("zap_pmd_range: bad pgd (%08lx)\n",
32139 pgd_val(*dir));
32140 pgd_clear(dir);
32141 return 0;
32142 }
32143 pmd = pmd_offset(dir, address);
32144 address &= ~PGDIR_MASK;
32145 end = address + size;
32146 if (end > PGDIR_SIZE)
32147 end = PGDIR_SIZE;
32148 freed = 0;
32149 do {
32150 freed += zap_pte_range(pmd, address, end - address);
32151 address = (address + PMD_SIZE) & PMD_MASK;
32152 pmd++;
32153 } while (address < end);
32154 return freed;
32155 }
32156
32157 /* remove user pages in a given range. */
32158 void zap_page_range(struct mm_struct *mm,
32159 unsigned long address, unsigned long size)
32160 {
32161 pgd_t * dir;
32162 unsigned long end = address + size;
32163 int freed = 0;
32164
32165 dir = pgd_offset(mm, address);
32166 while (address < end) {
32167 freed += zap_pmd_range(dir, address, end - address);
32168 address = (address + PGDIR_SIZE) & PGDIR_MASK;
32169 dir++;
32170 }
32171 /* Update rss for the mm_struct (not necessarily
32172 * current->mm) */
32173 if (mm->rss > 0) {
32174 mm->rss -= freed;
32175 if (mm->rss < 0)
32176 mm->rss = 0;
32177 }
32178 }
32179
32180 static inline void zeromap_pte_range(pte_t * pte,
32181 unsigned long address, unsigned long size,
32182 pte_t zero_pte)
32183 {
32184 unsigned long end;
32185
32186 address &= ~PMD_MASK;
32187 end = address + size;
32188 if (end > PMD_SIZE)
32189 end = PMD_SIZE;
32190 do {
32191 pte_t oldpage = *pte;
32192 set_pte(pte, zero_pte);
32193 forget_pte(oldpage);
32194 address += PAGE_SIZE;
32195 pte++;
32196 } while (address < end);
32197 }
32198
32199 static inline int zeromap_pmd_range(pmd_t * pmd,
32200 unsigned long address, unsigned long size,
32201 pte_t zero_pte)
32202 {
32203 unsigned long end;
32204
32205 address &= ~PGDIR_MASK;
32206 end = address + size;
32207 if (end > PGDIR_SIZE)
32208 end = PGDIR_SIZE;
32209 do {
32210 pte_t * pte = pte_alloc(pmd, address);
32211 if (!pte)
32212 return -ENOMEM;
32213 zeromap_pte_range(pte, address, end - address,
32214 zero_pte);
32215 address = (address + PMD_SIZE) & PMD_MASK;
32216 pmd++;
32217 } while (address < end);
32218 return 0;
32219 }
32220
32221 int zeromap_page_range(unsigned long address,
32222 unsigned long size, pgprot_t prot)
32223 {
32224 int error = 0;
32225 pgd_t * dir;
32226 unsigned long beg = address;
32227 unsigned long end = address + size;
32228 pte_t zero_pte;
32229
32230 zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE, prot));
32231 dir = pgd_offset(current->mm, address);
32232 flush_cache_range(current->mm, beg, end);
32233 while (address < end) {
32234 pmd_t *pmd = pmd_alloc(dir, address);
32235 error = -ENOMEM;
32236 if (!pmd)
32237 break;
32238 error = zeromap_pmd_range(pmd, address,
32239 end - address, zero_pte);
32240 if (error)
32241 break;
32242 address = (address + PGDIR_SIZE) & PGDIR_MASK;
32243 dir++;
32244 }
32245 flush_tlb_range(current->mm, beg, end);
32246 return error;
32247 }
32248
32249 /* maps a range of physical memory into the requested
32250 * pages. the old mappings are removed. any references to
32251 * nonexistent pages result in null mappings (currently
32252 * treated as "copy-on-access") */
32253 static inline void remap_pte_range(pte_t * pte,
32254 unsigned long address, unsigned long size,
32255 unsigned long phys_addr, pgprot_t prot)
32256 {
32257 unsigned long end;
32258
32259 address &= ~PMD_MASK;
32260 end = address + size;
32261 if (end > PMD_SIZE)
32262 end = PMD_SIZE;
32263 do {
32264 unsigned long mapnr;
32265 pte_t oldpage = *pte;
32266 pte_clear(pte);
32267
32268 mapnr = MAP_NR(__va(phys_addr));
32269 if (mapnr >= max_mapnr ||
32270 PageReserved(mem_map+mapnr))
32271 set_pte(pte, mk_pte_phys(phys_addr, prot));
32272 forget_pte(oldpage);
32273 address += PAGE_SIZE;
32274 phys_addr += PAGE_SIZE;
32275 pte++;
32276 } while (address < end);
32277 }
32278
32279 static inline int remap_pmd_range(pmd_t * pmd,
32280 unsigned long address, unsigned long size,
32281 unsigned long phys_addr, pgprot_t prot)
32282 {
32283 unsigned long end;
32284
32285 address &= ~PGDIR_MASK;
32286 end = address + size;
32287 if (end > PGDIR_SIZE)
32288 end = PGDIR_SIZE;
32289 phys_addr -= address;
32290 do {
32291 pte_t * pte = pte_alloc(pmd, address);
32292 if (!pte)
32293 return -ENOMEM;
32294 remap_pte_range(pte, address, end - address,
32295 address + phys_addr, prot);
32296 address = (address + PMD_SIZE) & PMD_MASK;
32297 pmd++;
32298 } while (address < end);
32299 return 0;
32300 }
32301
32302 int remap_page_range(unsigned long from,
32303 unsigned long phys_addr, unsigned long size,
32304 pgprot_t prot)
32305 {
32306 int error = 0;
32307 pgd_t * dir;
32308 unsigned long beg = from;
32309 unsigned long end = from + size;
32310
32311 phys_addr -= from;
32312 dir = pgd_offset(current->mm, from);
32313 flush_cache_range(current->mm, beg, end);
32314 while (from < end) {
32315 pmd_t *pmd = pmd_alloc(dir, from);
32316 error = -ENOMEM;
32317 if (!pmd)
32318 break;
32319 error = remap_pmd_range(pmd, from, end - from,
32320 phys_addr + from, prot);
32321 if (error)
32322 break;
32323 from = (from + PGDIR_SIZE) & PGDIR_MASK;
32324 dir++;
32325 }
32326 flush_tlb_range(current->mm, beg, end);
32327 return error;
32328 }
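
remap_page_range() is the routine a driver of this era would typically call from its mmap() file operation to map device or reserved physical memory into a process. The sketch below is hypothetical: MYDEV_PHYS, MYDEV_SIZE and mydev_mmap are made-up names, and the file_operations prototype is assumed to be the 2.2 (struct file *, struct vm_area_struct *) form.

    #include <linux/fs.h>
    #include <linux/mm.h>

    #define MYDEV_PHYS  0xe0000000UL        /* assumed physical base of the device */
    #define MYDEV_SIZE  0x00100000UL        /* assumed size of its memory window   */

    static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
    {
            unsigned long size = vma->vm_end - vma->vm_start;

            if (vma->vm_offset + size > MYDEV_SIZE)
                    return -EINVAL;         /* refuse to map past the device window */

            /* Build all the page tables for the range in one go;
             * remap_page_range() returns 0 on success, -ENOMEM otherwise. */
            if (remap_page_range(vma->vm_start, MYDEV_PHYS + vma->vm_offset,
                                 size, vma->vm_page_prot))
                    return -EAGAIN;
            return 0;
    }

Such a method would then be plugged into the driver's file_operations table, so every page of the mapping is set up at mmap() time rather than on demand.
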
32329
32330 /* sanity-check function.. */
32331 static void put_page(pte_t * page_table, pte_t pte)
32332 {
32333 if (!pte_none(*page_table)) {
32334 free_page_and_swap_cache(pte_page(pte));
32335 return;
32336 }
32337 /* no need for flush_tlb */
32338 set_pte(page_table, pte);
32339 }
32340
32341 /* This routine is used to map in a page into an address
32342 * space: needed by execve() for the initial stack and
32343 * environment pages. */
32344 unsigned long put_dirty_page(struct task_struct * tsk,
32345 unsigned long page, unsigned long address)
32346 {
32347 pgd_t * pgd;
32348 pmd_t * pmd;
32349 pte_t * pte;
32350
32351 if (MAP_NR(page) >= max_mapnr)
32352 printk("put_dirty_page: trying to put page %08lx at "
32353 "%08lx\n",page,address);
32354 if (atomic_read(&mem_map[MAP_NR(page)].count) != 1)
32355 printk("mem_map disagrees with %08lx at %08lx\n",
32356 page,address);
32357 pgd = pgd_offset(tsk->mm,address);
32358 pmd = pmd_alloc(pgd, address);
32359 if (!pmd) {
32360 free_page(page);
32361 oom(tsk);
32362 return 0;
32363 }
32364 pte = pte_alloc(pmd, address);
32365 if (!pte) {
32366 free_page(page);
32367 oom(tsk);
32368 return 0;
32369 }
32370 if (!pte_none(*pte)) {
32371 printk("put_dirty_page: page already exists\n");
32372 free_page(page);
32373 return 0;
32374 }
32375 flush_page_to_ram(page);
32376 set_pte(pte, pte_mkwrite(pte_mkdirty(mk_pte(page,
32377 PAGE_COPY))));
32378 /* no need for flush_tlb */
32379 return page;
32380 }
32381
32382 /* This routine handles present pages, when users try to
32383 * write to a shared page. It is done by copying the page
32384 * to a new address and decrementing the shared-page
32385 * counter for the old page.
32386 *
32387 * Goto-purists beware: the only reason for goto's here
32388 * is that it results in better assembly code.. The
32389 * "default" path will see no jumps at all.
32390 *
32391 * Note that this routine assumes that the protection
32392 * checks have been done by the caller (the low-level
32393 * page fault routine in most cases). Thus we can safely
32394 * just mark it writable once we've done any necessary
32395 * COW.
32396 *
32397 * We also mark the page dirty at this point even though
32398 * the page will change only once the write actually
32399 * happens. This avoids a few races, and potentially
32400 * makes it more efficient. */
32401 static int do_wp_page(struct task_struct * tsk,
32402 struct vm_area_struct * vma, unsigned long address,
32403 pte_t *page_table)
32404 {
32405 pte_t pte;
32406 unsigned long old_page, new_page;
32407 struct page * page_map;
32408
32409 pte = *page_table;
32410 new_page = __get_free_page(GFP_USER);
32411 /* Did someone else copy this page for us while we
32412 * slept? */
32413 if (pte_val(*page_table) != pte_val(pte))
32414 goto end_wp_page;
32415 if (!pte_present(pte))
32416 goto end_wp_page;
32417 if (pte_write(pte))
32418 goto end_wp_page;
32419 old_page = pte_page(pte);
32420 if (MAP_NR(old_page) >= max_mapnr)
32421 goto bad_wp_page;
32422 tsk->min_flt++;
32423 page_map = mem_map + MAP_NR(old_page);
32424
32425 /* We can avoid the copy if:
32426 * - we're the only user (count == 1)
32427 * - the only other user is the swap cache,
32428 * and the only swap cache user is itself,
32429 * in which case we can remove the page
32430 * from the swap cache.
32431 */
32432 switch (atomic_read(&page_map->count)) {
32433 case 2:
32434 if (!PageSwapCache(page_map))
32435 break;
32436 if (swap_count(page_map->offset) != 1)
32437 break;
32438 delete_from_swap_cache(page_map);
32439 /* FallThrough */
32440 case 1:
32441 /* We can release the kernel lock now.. */
32442 unlock_kernel();
32443
32444 flush_cache_page(vma, address);
32445 set_pte(page_table, pte_mkdirty(pte_mkwrite(pte)));
32446 flush_tlb_page(vma, address);
32447 end_wp_page:
32448 if (new_page)
32449 free_page(new_page);
32450 return 1;
32451 }
32452
32453 unlock_kernel();
32454 if (!new_page)
32455 return 0;
32456
32457 if (PageReserved(mem_map + MAP_NR(old_page)))
32458 ++vma->vm_mm->rss;
32459 copy_cow_page(old_page,new_page);
32460 flush_page_to_ram(old_page);
32461 flush_page_to_ram(new_page);
32462 flush_cache_page(vma, address);
32463 set_pte(page_table,
32464 pte_mkwrite(pte_mkdirty(mk_pte(new_page,
32465 vma->vm_page_prot))));
32466 free_page(old_page);
32467 flush_tlb_page(vma, address);
32468 return 1;
32469
32470 bad_wp_page:
32471 printk("do_wp_page: bogus page at address "
32472 "%08lx (%08lx)\n", address, old_page);
32473 send_sig(SIGKILL, tsk, 1);
32474 if (new_page)
32475 free_page(new_page);
32476 return 0;
32477 }
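
The effect of do_wp_page() is easy to observe from user space: after fork(), parent and child share the same write-protected anonymous pages, and the first write from either side faults into this routine and gets a private copy. A minimal, runnable demonstration follows; the behaviour shown is generic copy-on-write, not specific to this kernel version.

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/wait.h>
    #include <unistd.h>

    int main(void)
    {
            /* Private anonymous memory: after fork() both processes map the
             * same physical page, write-protected, until one of them writes. */
            char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            pid_t pid;

            if (p == MAP_FAILED)
                    return 1;
            strcpy(p, "parent data");

            pid = fork();
            if (pid == 0) {
                    /* Child: this store is the write fault that triggers the
                     * copy-on-write path; only the child's copy changes. */
                    strcpy(p, "child data");
                    _exit(0);
            }
            waitpid(pid, NULL, 0);
            printf("parent still sees: %s\n", p);   /* prints "parent data" */
            return 0;
    }
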
32478
32479 /* This function zeroes out partial mmap'ed pages at
32480 truncation time.. */
32481 static void partial_clear(struct vm_area_struct *vma,
32482 unsigned long address)
32483 {
32484 pgd_t *page_dir;
32485 pmd_t *page_middle;
32486 pte_t *page_table, pte;
32487
32488 page_dir = pgd_offset(vma->vm_mm, address);
32489 if (pgd_none(*page_dir))
32490 return;
32491 if (pgd_bad(*page_dir)) {
32492 printk("bad page table directory entry %p:[%lx]\n",
32493 page_dir, pgd_val(*page_dir));
32494 pgd_clear(page_dir);
32495 return;
32496 }
32497 page_middle = pmd_offset(page_dir, address);
32498 if (pmd_none(*page_middle))
32499 return;
32500 if (pmd_bad(*page_middle)) {
32501 printk("bad page middle entry %p:[%lx]\n",
32502 page_middle, pmd_val(*page_middle));
32503 pmd_clear(page_middle);
32504 return;
32505 }
32506 page_table = pte_offset(page_middle, address);
32507 pte = *page_table;
32508 if (!pte_present(pte))
32509 return;
32510 flush_cache_page(vma, address);
32511 address &= ~PAGE_MASK;
32512 address += pte_page(pte);
32513 if (MAP_NR(address) >= max_mapnr)
32514 return;
32515 memset((void *) address, 0,
32516 PAGE_SIZE - (address & ~PAGE_MASK));
32517 flush_page_to_ram(pte_page(pte));
32518 }
32519
32520 /* Handle all mappings that got truncated by a
32521 * "truncate()" system call.
32522 *
32523 * NOTE! We have to be ready to update the memory sharing
32524 * between the file and the memory map for a potential
32525 * last incomplete page. Ugly, but necessary. */
32526 void vmtruncate(struct inode * inode,
32527 unsigned long offset)
32528 {
32529 struct vm_area_struct * mpnt;
32530
32531 truncate_inode_pages(inode, offset);
32532 if (!inode->i_mmap)
32533 return;
32534 mpnt = inode->i_mmap;
32535 do {
32536 struct mm_struct *mm = mpnt->vm_mm;
32537 unsigned long start = mpnt->vm_start;
32538 unsigned long end = mpnt->vm_end;
32539 unsigned long len = end - start;
32540 unsigned long diff;
32541
32542 /* mapping wholly truncated? */
32543 if (mpnt->vm_offset >= offset) {
32544 flush_cache_range(mm, start, end);
32545 zap_page_range(mm, start, len);
32546 flush_tlb_range(mm, start, end);
32547 continue;
32548 }
32549 /* mapping wholly unaffected? */
32550 diff = offset - mpnt->vm_offset;
32551 if (diff >= len)
32552 continue;
32553 /* Ok, partially affected.. */
32554 start += diff;
32555 len = (len - diff) & PAGE_MASK;
32556 if (start & ~PAGE_MASK) {
32557 partial_clear(mpnt, start);
32558 start = (start + ~PAGE_MASK) & PAGE_MASK;
32559 }
32560 flush_cache_range(mm, start, end);
32561 zap_page_range(mm, start, len);
32562 flush_tlb_range(mm, start, end);
32563 } while ((mpnt = mpnt->vm_next_share) != NULL);
32564 }
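
From user space, the page-zapping half of vmtruncate() shows up as SIGBUS: once a file is truncated, touching a MAP_SHARED page that now lies wholly beyond the new end of file hits a mapping that has been removed. A small runnable demonstration (the temporary file name is arbitrary):

    #include <fcntl.h>
    #include <signal.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void on_sigbus(int sig)
    {
            /* Reached when we touch a mapped page that vmtruncate() removed. */
            (void) sig;
            write(1, "SIGBUS past new EOF\n", 20);
            _exit(0);
    }

    int main(void)
    {
            int fd = open("/tmp/trunc-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);
            char *p;

            if (fd < 0)
                    return 1;
            unlink("/tmp/trunc-demo");              /* clean up on exit        */
            ftruncate(fd, 2 * 4096);                /* two pages of file data  */
            p = mmap(NULL, 2 * 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (p == MAP_FAILED)
                    return 1;
            p[0] = 'x';                             /* page 0: stays valid     */
            p[4096] = 'y';                          /* page 1: about to go     */

            signal(SIGBUS, on_sigbus);
            ftruncate(fd, 4096);                    /* kernel runs vmtruncate  */
            p[0] = 'z';                             /* first page still fine   */
            p[4096] = 'w';                          /* faults: page was zapped */
            printf("no fault?\n");                  /* not normally reached    */
            return 0;
    }
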
32565
32566
32567 /* This is called with the kernel lock held, we need to
32568 * return without it. */
32569 static int do_swap_page(struct task_struct * tsk,
32570 struct vm_area_struct * vma, unsigned long address,
32571 pte_t * page_table, pte_t entry, int write_access)
32572 {
32573 if (!vma->vm_ops || !vma->vm_ops->swapin) {
32574 swap_in(tsk, vma, page_table, pte_val(entry),
32575 write_access);
32576 flush_page_to_ram(pte_page(*page_table));
32577 } else {
32578 pte_t page =
32579 vma->vm_ops->swapin(vma,
32580 address - vma->vm_start + vma->vm_offset,
32581 pte_val(entry));
32582 if (pte_val(*page_table) != pte_val(entry)) {
32583 free_page(pte_page(page));
32584 } else {
32585 if (atomic_read(&mem_map[MAP_NR(pte_page(page))].
32586 count) > 1 &&
32587 !(vma->vm_flags & VM_SHARED))
32588 page = pte_wrprotect(page);
32589 ++vma->vm_mm->rss;
32590 ++tsk->maj_flt;
32591 flush_page_to_ram(pte_page(page));
32592 set_pte(page_table, page);
32593 }
32594 }
32595 unlock_kernel();
32596 return 1;
32597 }
32598
32599 /* This only needs the MM semaphore */
32600 static int do_anonymous_page(struct task_struct * tsk,
32601 struct vm_area_struct * vma, pte_t *page_table,
32602 int write_access)
32603 {
32604 pte_t entry = pte_wrprotect(mk_pte(ZERO_PAGE,
32605 vma->vm_page_prot));
32606 if (write_access) {
32607 unsigned long page = __get_free_page(GFP_USER);
32608 if (!page)
32609 return 0;
32610 clear_page(page);
32611 entry = pte_mkwrite(pte_mkdirty(mk_pte(page,
32612 vma->vm_page_prot)));
32613 vma->vm_mm->rss++;
32614 tsk->min_flt++;
32615 flush_page_to_ram(page);
32616 }
32617 put_page(page_table, entry);
32618 return 1;
32619 }
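
The zero-page optimisation above is visible from user space: read faults on untouched anonymous memory map the shared ZERO_PAGE and add nothing to the process's resident size, while the first write to each page allocates a real zeroed page. A runnable sketch, using /proc/self/statm as a rough stand-in for mm->rss and assuming 4 KB pages:

    #include <stdio.h>
    #include <sys/mman.h>

    /* Resident page count: second field of /proc/self/statm. */
    static long resident_pages(void)
    {
            long size = 0, rss = 0;
            FILE *f = fopen("/proc/self/statm", "r");

            if (f) {
                    if (fscanf(f, "%ld %ld", &size, &rss) != 2)
                            rss = 0;
                    fclose(f);
            }
            return rss;
    }

    int main(void)
    {
            size_t len = 1024 * 4096;       /* 1024 pages */
            volatile char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                                    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            size_t i;
            char c = 0;

            if (p == MAP_FAILED)
                    return 1;
            printf("rss before touching: %ld\n", resident_pages());

            for (i = 0; i < len; i += 4096)
                    c += p[i];              /* read faults: ZERO_PAGE, no new memory */
            printf("rss after reads:     %ld\n", resident_pages());

            for (i = 0; i < len; i += 4096)
                    p[i] = 1;               /* write faults: real pages are allocated */
            printf("rss after writes:    %ld (read value %d)\n",
                   resident_pages(), c);
            return 0;
    }
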
32620
32621 /* do_no_page() tries to create a new page mapping. It
32622 * aggressively tries to share with existing pages, but
32623 * makes a separate copy if the "write_access" parameter
32624 * is true in order to avoid the next page fault.
32625 *
32626 * As this is called only for pages that do not currently
32627 * exist, we do not need to flush old virtual caches or
32628 * the TLB.
32629 *
32630 * This is called with the MM semaphore and the kernel
32631 * lock held. We need to release the kernel lock as soon
32632 * as possible.. */
32633 static int do_no_page(struct task_struct * tsk,
32634 struct vm_area_struct * vma, unsigned long address,
32635 int write_access, pte_t *page_table)
32636 {
32637 unsigned long page;
32638 pte_t entry;
32639
32640 if (!vma->vm_ops || !vma->vm_ops->nopage) {
32641 unlock_kernel();
32642 return do_anonymous_page(tsk, vma, page_table,
32643 write_access);
32644 }
32645
32646 /* The third argument is "no_share", which tells the
32647 * low-level code to copy, not share the page even if
32648 * sharing is possible. It's essentially an early COW
32649 * detection. */
32650 page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
32651 (vma->vm_flags & VM_SHARED)?0:write_access);
32652
32653 unlock_kernel();
32654 if (!page)
32655 return 0;
32656
32657 ++tsk->maj_flt;
32658 ++vma->vm_mm->rss;
32659 /* This silly early PAGE_DIRTY setting removes a race
32660 * due to the bad i386 page protection. But it's valid
32661 * for other architectures too.
32662 *
32663 * Note that if write_access is true, we either now
32664 * have an exclusive copy of the page, or this is a
32665 * shared mapping, so we can make it writable and dirty
32666 * to avoid having to handle that later. */
32667 flush_page_to_ram(page);
32668 entry = mk_pte(page, vma->vm_page_prot);
32669 if (write_access) {
32670 entry = pte_mkwrite(pte_mkdirty(entry));
32671 } else if (atomic_read(&mem_map[MAP_NR(page)].
32672 count) > 1 &&
32673 !(vma->vm_flags & VM_SHARED))
32674 entry = pte_wrprotect(entry);
32675 put_page(page_table, entry);
32676 /* no need to invalidate: a not-present page shouldn't
32677 * be cached */
32678 return 1;
32679 }
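
For ordinary file mappings the nopage method comes from mm/filemap.c (filemap_nopage); drivers can supply their own to hand out pages of a driver-owned buffer. The sketch below is hypothetical and unverified against a real 2.2 driver: mydrv_nopage and mydrv_buffer (assumed to be a single page obtained elsewhere with __get_free_page()) are made-up names, and the reference-count bump follows the convention the surrounding code expects, since free_pte() drops one reference per mapping when the area is unmapped.

    #include <linux/mm.h>

    /* Hypothetical 2.2-style nopage handler for a driver that exports one
     * page-aligned kernel page (mydrv_buffer) to user space.  Illustrative
     * only - names and error handling are assumptions, not kernel code. */
    static unsigned long mydrv_buffer;      /* one page from __get_free_page() */

    static unsigned long mydrv_nopage(struct vm_area_struct *vma,
                                      unsigned long address, int no_share)
    {
            unsigned long offset = address - vma->vm_start + vma->vm_offset;

            if (offset >= PAGE_SIZE)
                    return 0;               /* do_no_page() then fails the fault */

            /* One extra reference per mapping: free_pte() drops it again at
             * unmap time, so the driver's own reference from the original
             * allocation keeps the buffer alive. */
            atomic_inc(&mem_map[MAP_NR(mydrv_buffer)].count);
            return mydrv_buffer;
    }
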
32680
32681 /* These routines also need to handle stuff like marking
32682 * pages dirty and/or accessed for architectures that
32683 * don't do it in hardware (most RISC architectures).
32684 * The early dirtying is also good on the i386.
32685 *
32686 * There is also a hook called "update_mmu_cache()" that
32687 * architectures with external mmu caches can use to
32688 * update those (ie the Sparc or PowerPC hashed page
32689 * tables that act as extended TLBs). */
32690 static inline int handle_pte_fault(
32691 struct task_struct *tsk,
32692 struct vm_area_struct * vma, unsigned long address,
32693 int write_access, pte_t * pte)
32694 {
32695 pte_t entry;
32696
32697 lock_kernel();
32698 entry = *pte;
32699
32700 if (!pte_present(entry)) {
32701 if (pte_none(entry))
32702 return do_no_page(tsk, vma, address, write_access,
32703 pte);
32704 return do_swap_page(tsk, vma, address, pte, entry,
32705 write_access);
32706 }
32707
32708 entry = pte_mkyoung(entry);
32709 set_pte(pte, entry);
32710 flush_tlb_page(vma, address);
32711 if (write_access) {
32712 if (!pte_write(entry))
32713 return do_wp_page(tsk, vma, address, pte);
32714
32715 entry = pte_mkdirty(entry);
32716 set_pte(pte, entry);
32717 flush_tlb_page(vma, address);
32718 }
32719 unlock_kernel();
32720 return 1;
32721 }
32722
32723 /* By the time we get here, we already hold the mm
32724 * semaphore */
32725 int handle_mm_fault(struct task_struct *tsk,
32726 struct vm_area_struct * vma, unsigned long address,
32727 int write_access)
32728 {
32729 pgd_t *pgd;
32730 pmd_t *pmd;
32731
32732 pgd = pgd_offset(vma->vm_mm, address);
32733 pmd = pmd_alloc(pgd, address);
32734 if (pmd) {
32735 pte_t * pte = pte_alloc(pmd, address);
32736 if (pte) {
32737 if (handle_pte_fault(tsk, vma, address,
32738 write_access, pte)) {
32739 update_mmu_cache(vma, address, *pte);
32740 return 1;
32741 }
32742 }
32743 }
32744 return 0;
32745 }
32746
32747 /* Simplistic page force-in.. */
32748 void make_pages_present(unsigned long addr,
32749 unsigned long end)
32750 {
32751 int write;
32752 struct vm_area_struct * vma;
32753
32754 vma = find_vma(current->mm, addr);
32755 write = (vma->vm_flags & VM_WRITE) != 0;
32756 while (addr < end) {
32757 handle_mm_fault(current, vma, addr, write);
32758 addr += PAGE_SIZE;
32759 }
32760 }
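
make_pages_present() simply simulates a user access (a write, where the VMA allows writing) on every page in the range, so that later real accesses do not fault; in kernels of this era, mlock() is the sort of caller that relies on this pre-faulting (stated from memory, the caller is not shown in this listing). The user-space effect is easy to see: locking a freshly created mapping faults every page in up front. A runnable sketch, assuming 4 KB pages (mlock() may fail if RLIMIT_MEMLOCK is very small):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/resource.h>

    int main(void)
    {
            size_t len = 8 * 4096;          /* 8 pages, well under typical limits */
            char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
            struct rusage before, after;

            if (p == MAP_FAILED)
                    return 1;
            getrusage(RUSAGE_SELF, &before);
            if (mlock(p, len))              /* pre-faults the whole range */
                    perror("mlock");
            getrusage(RUSAGE_SELF, &after);

            /* The pages were never touched directly, yet they were faulted
             * in up front - the effect make_pages_present() provides. */
            printf("minor faults during mlock: %ld\n",
                   after.ru_minflt - before.ru_minflt);
            munlock(p, len);
            return 0;
    }
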