注:以下和体系结构相关的接口都以Mips为例!
创建进程的时候会把父进程的地址空间拷贝一份到子进程,大体的代码调用流程如下:
do_fork->copy_process->dup_mm->dup_mmap->copy_page_range->copy_hugetlb_page_range
从这个流程可以看出,大页进程fork出来的子进程的地址空间也是大页的;
这里有个问题:src_mm的大页vma是如何被创建的?
/*
 * copy_page_range - duplicate one VMA's page tables from the parent
 * (src_mm) into the child (dst_mm) at fork time.  Only the mappings
 * (page-table entries) are copied, never the physical pages themselves.
 * Returns 0 on success, -ENOMEM if a lower-level copy fails.
 */
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
		struct vm_area_struct *vma)
{
	pgd_t *src_pgd, *dst_pgd;
	unsigned long next;
	unsigned long addr = vma->vm_start;
	unsigned long end = vma->vm_end;
	int ret;

	/*
	 * Don't copy ptes where a page fault will fill them correctly.
	 * Fork becomes much lighter when there are big shared or private
	 * readonly mappings. The tradeoff is that copy_page_range is more
	 * efficient than faulting.
	 */
	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
		if (!vma->anon_vma)
			return 0;
	}

	/* Huge-page VMAs are copied on a dedicated path (pmd-level entries). */
	if (is_vm_hugetlb_page(vma))
		return copy_hugetlb_page_range(dst_mm, src_mm, vma);

	if (unlikely(is_pfn_mapping(vma))) {
		/*
		 * We do not free on error cases below as remove_vma
		 * gets called on error from higher level routine
		 */
		ret = track_pfn_vma_copy(vma);
		if (ret)
			return ret;
	}

	/*
	 * We need to invalidate the secondary MMU mappings only when
	 * there could be a permission downgrade on the ptes of the
	 * parent mm. And a permission downgrade will only happen if
	 * is_cow_mapping() returns true.
	 */
	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_start(src_mm, addr, end);

	ret = 0;
	dst_pgd = pgd_offset(dst_mm, addr);
	src_pgd = pgd_offset(src_mm, addr);
	do {
		/* Walk the VMA range one pgd entry at a time. */
		next = pgd_addr_end(addr, end);
		if (pgd_none_or_clear_bad(src_pgd))
			continue;	/* nothing mapped at this pgd slot */
		if (unlikely(copy_pud_range(dst_mm, src_mm, dst_pgd, src_pgd,
					    vma, addr, next))) {
			ret = -ENOMEM;
			break;
		}
	} while (dst_pgd++, src_pgd++, addr = next, addr != end);

	if (is_cow_mapping(vma->vm_flags))
		mmu_notifier_invalidate_range_end(src_mm, vma->vm_start, end);
	return ret;
}
下面重点分析copy_hugetlb_page_range的流程
/*
 * copy_hugetlb_page_range - duplicate the huge-page mappings of @vma
 * from parent @src into child @dst at fork time.  The physical huge
 * pages are shared between parent and child; only page-table entries
 * are copied and page refcounts taken.
 * Returns 0 on success, -ENOMEM if allocating a child entry fails.
 */
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
			    struct vm_area_struct *vma)
{
	pte_t *src_pte, *dst_pte, entry;
	struct page *ptepage;
	unsigned long addr;
	int cow;
	struct hstate *h = hstate_vma(vma);
	unsigned long sz = huge_page_size(h);

	/* Private writable mapping => copy-on-write semantics apply. */
	cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;

	for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
		src_pte = huge_pte_offset(src, addr);
		if (!src_pte)	/* parent never mapped this huge page */
			continue;
		dst_pte = huge_pte_alloc(dst, addr, sz);
		if (!dst_pte)
			goto nomem;

		/* If the pagetables are shared don't copy or take references */
		if (dst_pte == src_pte)
			continue;

		spin_lock(&dst->page_table_lock);
		spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
		if (!huge_pte_none(huge_ptep_get(src_pte))) {
			/* COW: write-protect the parent entry first, so a
			 * later write by either side faults and copies. */
			if (cow)
				huge_ptep_set_wrprotect(src, addr, src_pte);
			entry = huge_ptep_get(src_pte);
			ptepage = pte_page(entry);
			get_page(ptepage);	/* child references the same page */
			page_dup_rmap(ptepage);
			set_huge_pte_at(dst, addr, dst_pte, entry);
		}
		spin_unlock(&src->page_table_lock);
		spin_unlock(&dst->page_table_lock);
	}
	return 0;

nomem:
	return -ENOMEM;
}
整个操作都在for循环里面,分两部分来分析
/*
 * Excerpt from copy_hugetlb_page_range(): the per-huge-page lookup and
 * allocation steps (parts A and B discussed below).  Quoted as-is, so
 * the braces are intentionally not balanced here.
 */
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
	src_pte = huge_pte_offset(src, addr);
	if (!src_pte)	/* nothing mapped in the parent at addr */
		continue;
	dst_pte = huge_pte_alloc(dst, addr, sz);
	if (!dst_pte)
		goto nomem;

	/* If the pagetables are shared don't copy or take references */
	if (dst_pte == src_pte)
		continue;
A:
先用huge_pte_offset取出src中addr对应的pte表项的地址;这里可能会想不通,为什么下面要判断src_pte是否为NULL?
很简单,页表按 pgd->pud->pmd->pte 逐级展开,pte是最后一级页表,也就是最底层的一级页表;如果上层的pgd没有建立映射,那么肯定不会有pte了;
所以如果addr这个地址在pgd这一级都没有建立映射关系,那么根本就不可能存在pte这一级的页表项,huge_pte_offset就会返回NULL;
huge_pte_offset实现如下,Mips的页表映射会在以后详细说明。这里只需要看到的是huge_pte_offset把pmd当作pte返回给调用者,因为这是大页映射;
/*
 * huge_pte_offset - look up the page-table entry covering @addr in @mm.
 * For a huge-page mapping the walk stops at the pmd level: the pmd
 * entry itself maps the huge page, so it is returned cast to pte_t *.
 * Returns NULL when the pgd/pud level has no mapping yet.
 */
pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd = NULL;

	pgd = pgd_offset(mm, addr);
	if (pgd_present(*pgd)) {
		pud = pud_offset(pgd, addr);
		if (pud_present(*pud))
			pmd = pmd_offset(pud, addr);
	}
	/* the pmd doubles as the "pte" for a huge-page mapping */
	return (pte_t *) pmd;
}
B:调用huge_pte_alloc为dst分配pte页表项。这里为什么要判断dst_pte和src_pte是否相等?这两个都是页表项的内核虚拟地址。
答案是大页页表共享:在支持该特性的体系结构上(例如x86的huge_pmd_share),多个进程对同一个共享映射可以共用同一张pmd页表,此时huge_pte_alloc返回的dst_pte可能与src_pte指向同一个表项;既然页表本身已经共享,就不需要再拷贝表项或增加引用计数,直接continue跳过即可。(上面给出的MIPS版huge_pte_alloc没有实现共享,但通用代码必须考虑这种情况。)
/*
 * huge_pte_alloc - ensure the page-table levels down to the pmd exist
 * for @addr in @mm, allocating intermediate tables as needed, and
 * return the pmd slot cast to pte_t * (huge pages map at pmd level,
 * so no pte page is ever allocated here).
 * Returns NULL if an allocation fails.
 */
pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
		      unsigned long sz)
{
	pgd_t *pgd;
	pud_t *pud;
	pte_t *pte = NULL;

	pgd = pgd_offset(mm, addr);
	pud = pud_alloc(mm, pgd, addr);
	if (pud)
		pte = (pte_t *)pmd_alloc(mm, pud, addr);

	return pte;
}
如果src_pte存在,而且也成功分配了dst_pte,那么就开始做拷贝
/* Excerpt from copy_hugetlb_page_range(): copy one populated entry. */
if (!huge_pte_none(huge_ptep_get(src_pte))) {
	/* COW mapping: write-protect the parent entry so a later
	 * write by either process triggers a copy fault. */
	if (cow)
		huge_ptep_set_wrprotect(src, addr, src_pte);
	entry = huge_ptep_get(src_pte);
	ptepage = pte_page(entry);
	get_page(ptepage);	/* parent and child share this page */
	page_dup_rmap(ptepage);
	set_huge_pte_at(dst, addr, dst_pte, entry);
}
如果src_pte的表项中不为空,也就是有正常的映射关系,那么调用huge_ptep_get将src_pte表项中的内容拿出来存放在entry中;
这里需要说明一下,父进程fork出子进程的时候,子进程和父进程实际上还是共享一个物理页面
子进程和父进程的虚拟地址空间也是一样的,但是!子进程需要自己的页表来说明这种映射关系;copy_page_range实际上拷贝的不是物理页面,而是虚拟地址到物理页面的映射关系,也就是页表了;
明白了上面这段话,那几行代码就很容易了;取出src_pte表项中的内容entry,然后将entry写到dst_pte的表项中;
这样的话,父进程和子进程各自的虚拟地址空间实际上对应的是同一个物理页面,通俗点的讲,子进程和父进程共享代码段。。。。
除此之外,还需要增加所src_pte中映射的物理页面的引用计数,
arch/mips/include/asm/pgtable-32.h
#define pte_page(x)		pfn_to_page(pte_pfn(x))
宏pte_pfn将pte entry转化为页帧号,然后用pfn_to_page将页帧号转化为页面
include/asm-generic/memory_model.h
/*
 * supports 3 memory models.
 */
#if defined(CONFIG_FLATMEM)

#define __pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
#define __page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
				 ARCH_PFN_OFFSET)
#elif defined(CONFIG_DISCONTIGMEM)

#define __pfn_to_page(pfn)			\
({	unsigned long __pfn = (pfn);		\
	unsigned long __nid = arch_pfn_to_nid(__pfn);  \
	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
})

#define __page_to_pfn(pg)						\
({	struct page *__pg = (pg);					\
	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
	 __pgdat->node_start_pfn;					\
})

#elif defined(CONFIG_SPARSEMEM_VMEMMAP)
alloc_pages()返回的不是页面的虚拟地址,而是页面的page结构的指针。 内核维护了一个(struct page *)数组,mem_map就是这数组的头,随后的(struct page *)指针按照(物理)地址顺序进行排列,比如说内存中的第10个页面,它的(struct page *)指针在mem_map数组中的偏移量(page_nr)就是10,而这个页面的(物理)起始地址就是(10 << PAGE_SHIFT) + ARCH_PFN_OFFSET。
* * Conversion between a struct page and a physical address. * * Note: when converting an unknown physical address to a * struct page, the resulting pointer must be validated * using VALID_PAGE(). It must return an invalid struct page * for any physical address not corresponding to a system * RAM address. * * page_to_pfn(page) convert a struct page * to a PFN number * pfn_to_page(pfn) convert a _valid_ PFN number to struct page * * * virt_to_page(k) convert a _valid_ virtual address to struct page * * virt_addr_valid(k) indicates whether a virtual address is valid */ #define ARCH_PFN_OFFSET PHYS_PFN_OFFSET #define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT) #define virt_addr_valid(kaddr) ((unsigned long)(kaddr) >= PAGE_OFFSET && (unsigned long)(kaddr) < (unsigned long)high_memory)
总结:对于在ARM上实现hugetlb,需要解决的问题
1:
copy_hugetlb_page_range中从pte表项的entry值获得对应的page ptepage = pte_page(entry); arch/arm/include/asm/pgtable.h #define pte_pfn(pte) (pte_val(pte) >> PAGE_SHIFT) 4K页只需要将entry值右移PAGE_SHIFT位,但是对于1M的大页呢?显然,这里需要在entry值中插入一个标志位来做下判断,这个是4K页还是64K页/1M/16M页,目前至少需要3个bit位? 问题是entry值中有可用的这些位吗? #define pfn_pte(pfn,prot) (__pte(((pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define pte_none(pte) (!pte_val(pte)) #define pte_clear(mm,addr,ptep) set_pte_ext(ptep, __pte(0), 0) #define pte_page(pte) (pfn_to_page(pte_pfn(pte)))
2:为什么要判断src_pte和分配得到的dst_pte是否相等,这两个值相等有什么关系吗?(答:相等说明两个进程共享同一张大页页表——参见x86的huge_pmd_share——此时无需再拷贝表项或增加引用计数,直接跳过即可。)