回到start_kernel,下一个函数执行mm_init()。这个函数很重要了,来自同一个文件。
/*
 * Run the memory-management initialisation steps, one after another, in the
 * order listed in the body.
 */
static void __init mm_init(void)
{
/*
 * page_cgroup requires contiguous pages as memmap
 * and it's bigger than MAX_ORDER unless SPARSEMEM.
 */
page_cgroup_init_flatmem();
/* mem_init() puts all low memory onto the freelists (see its own comment). */
mem_init();
kmem_cache_init();
pgtable_cache_init();
vmalloc_init();
}
这五个函数,其中由于我们没有配置CONFIG_CGROUP_MEM_RES_CTLR,所以第一个函数page_cgroup_init_flatmem是个空函数。其余几个函数各个都是重点。
该函数执行完后不能再用像alloc_bootmem()、alloc_bootmem_low()、alloc_bootmem_pages()等申请低端内存的函数来申请内存,也就不能申请大块的连续物理内存了。
首先是mem_init,来自arch/x86/mm/init_32.c:
867void __init mem_init(void)
868{
869 int codesize, reservedpages, datasize, initsize;
870 int tmp;
871
872 pci_iommu_alloc();
873
874#ifdef CONFIG_FLATMEM
875 BUG_ON(!mem_map);
876#endif
877 /* this will put all low memory onto the freelists */
878 totalram_pages += free_all_bootmem();
879
880 reservedpages = 0;
881 for (tmp = 0; tmp < max_low_pfn; tmp++)
882 /*
883 * Only count reserved RAM pages:
884 */
885 if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))
886 reservedpages++;
887
888 set_highmem_pages_init();
889
890 codesize = (unsigned long) &_etext - (unsigned long) &_text;
891 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
892 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
893
894 printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "
895 "%dk reserved, %dk data, %dk init, %ldk highmem)\n",
896 nr_free_pages() << (PAGE_SHIFT-10),
897 num_physpages << (PAGE_SHIFT-10),
898 codesize >> 10,
899 reservedpages << (PAGE_SHIFT-10),
900 datasize >> 10,
901 initsize >> 10,
902 totalhigh_pages << (PAGE_SHIFT-10));
903
904 printk(KERN_INFO "virtual kernel memory layout:\n"
905 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
906#ifdef CONFIG_HIGHMEM
907 " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
908#endif
909 " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n"
910 " lowmem : 0x%08lx - 0x%08lx (%4ld MB)\n"
911 " .init : 0x%08lx - 0x%08lx (%4ld kB)\n"
912 " .data : 0x%08lx - 0x%08lx (%4ld kB)\n"
913 " .text : 0x%08lx - 0x%08lx (%4ld kB)\n",
914 FIXADDR_START, FIXADDR_TOP,
915 (FIXADDR_TOP - FIXADDR_START) >> 10,
916
917#ifdef CONFIG_HIGHMEM
918 PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,
919 (LAST_PKMAP*PAGE_SIZE) >> 10,
920#endif
921
922 VMALLOC_START, VMALLOC_END,
923 (VMALLOC_END - VMALLOC_START) >> 20,
924
925 (unsigned long)__va(0), (unsigned long)high_memory,
926 ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,
927
928 (unsigned long)&__init_begin, (unsigned long)&__init_end,
929 ((unsigned long)&__init_end -
930 (unsigned long)&__init_begin) >> 10,
931
932 (unsigned long)&_etext, (unsigned long)&_edata,
933 ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,
934
935 (unsigned long)&_text, (unsigned long)&_etext,
936 ((unsigned long)&_etext - (unsigned long)&_text) >> 10);
937
938 /*
939 * Check boundaries twice: Some fundamental inconsistencies can
940 * be detected at build time already.
941 */
942#define __FIXADDR_TOP (-PAGE_SIZE)
943#ifdef CONFIG_HIGHMEM
944 BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
945 BUILD_BUG_ON(VMALLOC_END > PKMAP_BASE);
946#endif
947#define high_memory (-128UL << 20)
948 BUILD_BUG_ON(VMALLOC_START >= VMALLOC_END);
949#undef high_memory
950#undef __FIXADDR_TOP
951
952#ifdef CONFIG_HIGHMEM
953 BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE > FIXADDR_START);
954 BUG_ON(VMALLOC_END > PKMAP_BASE);
955#endif
956 BUG_ON(VMALLOC_START >= VMALLOC_END);
957 BUG_ON((unsigned long)high_memory > VMALLOC_START);
958
959 if (boot_cpu_data.wp_works_ok < 0)
960 test_wp_bit();
961
962 save_pg_dir();
963 zap_low_mappings(true);
964}
872行,Intel IOMMU架构在Linux上的初始化函数pci_iommu_alloc。这个函数不是我们关注的重点,我们就不深入下去了,这里仅仅粗略地介绍一下。该函数首先通过读取 DMA Remapping table,来判断是否支持DMAR设备。随后调用pci_swiotlb_init函数对其进行初始化,解析DMAR table,并逐一打印每个dmar项。最后设置全局变量dma_ops,把初始化后的swiotlb_dma_ops传递给它,后者定义了IOMMU架构中所有的swiotlb方法。对IOMMU感兴趣的同学可以去查阅相关资料,这里就不详细介绍了。
878行,totalram_pages这个全局变量我们第一次遇见。它编译的时候初始化为0,现在它就等于free_all_bootmem函数的返回值,该函数在mm/bootmem.c中定义:
/*
 * Release all boot-time memory and report how much was released.
 *
 * Returns the value produced by free_all_memory_core_early() when
 * CONFIG_NO_BOOTMEM is set; otherwise the sum of free_all_bootmem_core()
 * over every entry on bdata_list.
 */
unsigned long __init free_all_bootmem(void)
{
#ifndef CONFIG_NO_BOOTMEM
	bootmem_data_t *node;
	unsigned long freed = 0;

	/* Accumulate the per-node result for each registered bootmem descriptor. */
	list_for_each_entry(node, &bdata_list, list)
		freed += free_all_bootmem_core(node);

	return freed;
#else
	/*
	 * Pass MAX_NUMNODES rather than NODE_DATA(0)->node_id: when Node0 has
	 * no RAM installed, low RAM lives on Node1.  MAX_NUMNODES makes sure
	 * every range in early_node_map[] is used, not only those of Node0.
	 */
	return free_all_memory_core_early(MAX_NUMNODES);
#endif
}
我们看到,由于CONFIG_NO_BOOTMEM起作用,并且MAX_NUMNODES为1,所以函数直接调用free_all_memory_core_early(1),怎么样,前面说得没错吧,终于碰到了这个函数:
200unsigned long __init free_all_memory_core_early(int nodeid)
201{
202 int i;
203 u64 start, end;
204 unsigned long count = 0;
205 struct range *range = NULL;
206 int nr_range;
207
208 nr_range = get_free_all_memory_range(&range, nodeid);
209
210 for (i = 0; i < nr_range; i++) {
211 start = range[i].start;
212 end = range[i].end;
213 count += end - start;
214 __free_pages_memory(start, end);
215 }
216
217 return count;
218}
205行的那个range结构很简单:
/* One start/end pair; add_range() stores one such pair per array slot. */
struct range {
u64 start; /* beginning of the span */
u64 end; /* end of the span; add_range() ignores pairs with start >= end */
};
所以首先208行调用get_free_all_memory_range函数:
393int __init get_free_all_memory_range(struct range **rangep, int nodeid)
394{
395 int i, count;
396 u64 start = 0, end;
397 u64 size;
398 u64 mem;
399 struct range *range;
400 int nr_range;
401
402 count = 0;
403 for (i = 0; i < max_early_res && early_res[i].end; i++)
404 count++;
405
406 count *= 2;
407
408 size = sizeof(struct range) * count;
409 end = get_max_mapped();
410#ifdef MAX_DMA32_PFN
411 if (end > (MAX_DMA32_PFN << PAGE_SHIFT))
412 start = MAX_DMA32_PFN << PAGE_SHIFT;
413#endif
414 mem = find_fw_memmap_area(start, end, size, sizeof(struct range));
415 if (mem == -1ULL)
416 panic("can not find more space for range free");
417
418 range = __va(mem);
419 /* use early_node_map[] and early_res to get range array at first */
420 memset(range, 0, size);
421 nr_range = 0;
422
423 /* need to go over early_node_map to find out good range for node */
424 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
425#ifdef CONFIG_X86_32
426 subtract_range(range, count, max_low_pfn, -1ULL);
427#endif
428 subtract_early_res(range, count);
429 nr_range = clean_sort_range(range, count);
430
431 /* need to clear it ? */
432 if (nodeid == MAX_NUMNODES) {
433 memset(&early_res[0], 0,
434 sizeof(struct early_res) * max_early_res);
435 early_res = NULL;
436 max_early_res = 0;
437 }
438
439 *rangep = range;
440 return nr_range;
441}
403行,全局变量max_early_res和early_res[]数组,老熟人了,一个循环得到目前已经分配了的early_res元素的个数count,406行把它乘以2,408行再乘以sizeof(struct range)得到所需的字节数size。409行,调用get_max_mapped函数:
/*
 * Convert max_pfn_mapped (a page-frame number) into a byte address by
 * shifting it left by PAGE_SHIFT, and return it as a 64-bit value.
 */
u64 __init get_max_mapped(void)
{
	/* Widen to u64 first so the shift is carried out in 64 bits. */
	return (u64)max_pfn_mapped << PAGE_SHIFT;
}
该函数返回我们的老熟人,最后一个页框max_pfn_mapped对应的物理地址,赋值给内部变量end(start在396行被赋值为0)。然后414行,调用find_fw_memmap_area函数,传给他的参数是start、end、size和range结构的大小:
/*
 * Thin wrapper: forwards its four arguments unchanged to find_e820_area()
 * and returns whatever that call returns.
 */
u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
{
	u64 addr = find_e820_area(start, end, size, align);

	return addr;
}
find_e820_area不用多说了吧,从e820.map[]数组中寻找到一块能够容纳size个字节的内存段,该内存段的首物理地址赋值给get_free_all_memory_range的内部变量mem。418~421行初始化这块区域。随后424行调用add_from_early_node_map函数:
/*
 * Append the start_pfn/end_pfn pair of every early_node_map[] entry visited
 * by for_each_active_range_index_in_nid() for @nid to @range.
 *
 * @range:    destination array
 * @az:       number of slots in @range
 * @nr_range: number of slots already in use
 * @nid:      node id handed to the iterator
 *
 * Returns the updated number of used slots.
 */
int __init add_from_early_node_map(struct range *range, int az,
				   int nr_range, int nid)
{
	int idx;

	/* need to go over early_node_map to find out good range for node */
	for_each_active_range_index_in_nid(idx, nid) {
		nr_range = add_range(range, az, nr_range,
				     early_node_map[idx].start_pfn,
				     early_node_map[idx].end_pfn);
	}

	return nr_range;
}
/*
 * Store (start, end) in the next unused slot of @range.
 *
 * @range:    array to append to
 * @az:       number of slots in @range
 * @nr_range: number of slots already in use
 * @start:    start of the span
 * @end:      end of the span
 *
 * Returns the new number of used slots.  The array is left untouched and
 * @nr_range is returned unchanged when start >= end or when no slot is free.
 */
int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)
{
	struct range *slot;

	/* Nothing to record, or out of slots. */
	if (end <= start || az <= nr_range)
		return nr_range;

	slot = &range[nr_range];
	slot->start = start;
	slot->end = end;

	return nr_range + 1;
}
执行完毕add_from_early_node_map函数之后,range指向的这块区域中,就形成了一个range[nr_range]数组,每个数组元素对应early_node_map[]的数组元素,表示nr_range块空闲内存空间的起始页框号和结束页框号。426行subtract_range函数把max_low_pfn及以上的页框范围(即高端内存部分)从range数组中去掉。428行,调用subtract_early_res对产生冲突的地址进行调整:
/*
 * Carve every recorded early reservation out of the candidate free ranges.
 *
 * @range: array of ranges to subtract the reservations from
 * @az:    number of slots in @range, passed through to subtract_range()
 *
 * Walks the populated part of early_res[], converts each entry with
 * PFN_DOWN(start)/PFN_UP(end) and hands the result to subtract_range().
 * Entries whose converted start is not below the converted end are skipped.
 */
static void __init subtract_early_res(struct range *range, int az)
{
	int i, count;
	u64 final_start, final_end;
	int idx = 0;

	/* The populated part of the table ends at the first entry whose end is 0. */
	count = 0;
	for (i = 0; i < max_early_res && early_res[i].end; i++)
		count++;

	/*
	 * need to skip first one ?
	 * Slot 0 is skipped whenever early_res no longer points at early_res_x.
	 */
	if (early_res != early_res_x)
		idx = 1;

#define DEBUG_PRINT_EARLY_RES 1

#if DEBUG_PRINT_EARLY_RES
	printk(KERN_INFO "Subtract (%d early reservations)\n", count);
#endif

	for (i = idx; i < count; i++) {
		struct early_res *r = &early_res[i];

#if DEBUG_PRINT_EARLY_RES
		printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i,
		       r->start, r->end, r->name);
#endif
		final_start = PFN_DOWN(r->start);
		final_end = PFN_UP(r->end);
		if (final_start >= final_end)
			continue;

		subtract_range(range, az, final_start, final_end);
	}
}
对early_res体系熟悉的同学对上述代码一定不会困惑,我们看到subtract_early_res对地址进行调整,去掉那些已经被占用了的地址空间。回到get_free_all_memory_range,最后两行,把range赋给结果参数rangep,并且返回最终的range数组的元素个数nr_range。
回到free_all_memory_core_early函数中,内部变量range有了,其元素个数nr_range也有了,那么210~215执行一个循环,将range数组的每一个元素调用__free_pages_memory进行释放:
174static void __init __free_pages_memory(unsigned long start, unsigned long end)
175{
176 int i;
177 unsigned long start_aligned, end_aligned;
178 int order = ilog2(BITS_PER_LONG);
179
180 start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
181 end_aligned = end & ~(BITS_PER_LONG - 1);
182
183 if (end_aligned <= start_aligned) {
184 for (i = start; i < end; i++)
185 __free_pages_bootmem(pfn_to_page(i), 0);
186
187 return;
188 }
189
190 for (i = start; i < start_aligned; i++)
191 __free_pages_bootmem(pfn_to_page(i), 0);
192
193 for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
194 __free_pages_bootmem(pfn_to_page(i), order);
195
196 for (i = end_aligned; i < end; i++)
197 __free_pages_bootmem(pfn_to_page(i), 0);
198}
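如果start到end之间不包含任何按BITS_PER_LONG对齐的完整块,函数只执行183~188行代码,通过__free_pages_bootmem函数以order为0逐个释放从start到end-1号的页框;否则190~197行分三段处理:对齐边界之前和之后的零散页框仍然以order为0逐个释放,中间对齐的部分则每BITS_PER_LONG个页框为一组,以order=ilog2(BITS_PER_LONG)成块释放。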
下面来看看__free_pages_bootmem:
637void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
638{
639 if (order == 0) {
640 __ClearPageReserved(page);
641 set_page_count(page, 0);
642 set_page_refcounted(page);
643 __free_page(page);
644 } else {
645 int loop;
646
647 prefetchw(page);
648 for (loop = 0; loop < BITS_PER_LONG; loop++) {
649 struct page *p = &page[loop];
650
651 if (loop + 1 < BITS_PER_LONG)
652 prefetchw(p + 1);
653 __ClearPageReserved(p);
654 set_page_count(p, 0);
655 }
656
657 set_page_refcounted(page);
658 __free_pages(page, order);
659 }
660}
以传递进来的参数order为0的情况为例,来到643行,针对这个页面page,著名的伙伴算法到来了,我们来看__free_page的定义:
/* Free a single page: shorthand for __free_pages() with an order of 0. */
#define __free_page(page) __free_pages((page), 0)
释放页框的所有内核宏和函数都依赖于__free_pages()函数。它接收的参数为将要释放的第一个页框的页描述符的地址(page)和将要释放的一组连续页框的数量的对数(order)。该函数执行如下步骤:
1. 检查第一个页框是否真正属于动态内存(它的PG_reserved 标志被清0);如果不是,则终止。
2. 减少page->_count 使用计数器的值;如果它仍然大于或等于0,则终止。
3. 如果order 等于0,那么该函数调用free_hot_page()来释放页框给适当内存管理区的每CPU 热高速缓存。
4. 如果order大于0,那么它将页框加入到本地链表中,并调用free_pages_bulk()函数把它们释放到适当内存管理区的伙伴系统中。
我们这里order为0,所以调用free_hot_page(),最终会调用__free_one_page。由于前面的pglist和zone的体系已经建立好,该函数对当前页面page对应的那个zone的free_area数组进行处理。由于这个地方是第一次触及该数组,那么这一次free_hot_page调用的__free_one_page将会找到全部伙伴,等于是初始化了整个伙伴算法系统。好了,怀疑我这句话的同志可以去看看博客“伙伴系统算法”
http://blog.csdn.net/yunsongice/archive/2010/01/22/5225155.aspx
回到mem_init函数中,伙伴系统建立起来以后,free_all_bootmem返回空闲页面的总数,累加到全局变量totalram_pages上。随后880~886行代码计算被保留的页面数,保存在内部变量reservedpages中。888行,set_highmem_pages_init函数,通过调用add_highpages_work_fn函数初始化896MB以上的高端页面,并把它们加入伙伴系统,最后计算出包含了这些高端页面的新的可用页面的数量totalram_pages。
继续走,890行,让内部变量codesize、datasize、initsize分别等于内核代码段、数据段和初始化段(.init,即__init_begin到__init_end之间)的大小。随后894~936行打印相关信息。942~957行是一组编译期(BUILD_BUG_ON)和运行期(BUG_ON)的地址边界检查,略去。962行save_pg_dir()函数,来自同一文件:
/* PAGE_SIZE-byte, PAGE_SIZE-aligned buffer that receives a copy of swapper_pg_dir. */
char swsusp_pg_dir[PAGE_SIZE]
__attribute__ ((aligned(PAGE_SIZE)));
/* Copy PAGE_SIZE bytes from swapper_pg_dir into swsusp_pg_dir. */
static inline void save_pg_dir(void)
{
memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
}
很简单,就是把页全局目录拷贝到全局变量swsusp_pg_dir数组中,做个备份。963行,执行zap_low_mappings(true)函数,这个函数也来自于同一个文件:
/*
 * Overwrite the first KERNEL_PGD_BOUNDARY entries of swapper_pg_dir and then
 * flush the TLB.
 *
 * @early: when true only __flush_tlb() is called; otherwise flush_tlb_all().
 */
void zap_low_mappings(bool early)
{
	int slot = 0;

	/*
	 * Wipe the initial low-memory mappings by hand: pgd_clear() cannot be
	 * used for this because it is a no-op on i386.
	 */
	while (slot < KERNEL_PGD_BOUNDARY) {
#ifdef CONFIG_X86_PAE
		set_pgd(swapper_pg_dir + slot, __pgd(1 + __pa(empty_zero_page)));
#else
		set_pgd(swapper_pg_dir + slot, __pgd(0));
#endif
		slot++;
	}

	if (!early)
		flush_tlb_all();
	else
		__flush_tlb();
}
这个函数很简单,就是把前面我们在arch/x86/kernel/head_32.S中设置的页全局目录的前若干项清零。这若干项到底是多少项呢?我们看看KERNEL_PGD_BOUNDARY是什么东西:
/* Page-global-directory index of PAGE_OFFSET; zap_low_mappings() rewrites every entry below it. */
#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
/* Page-global-directory index of a virtual address: shift, then mask to the table size. */
#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
/* Shift of 22 bits: each directory entry spans 2^22 bytes (4 MB) of virtual address space. */
#define PGDIR_SHIFT 22
/* Number of entries in the page global directory. */
#define PTRS_PER_PGD 1024
不错,(0xc0000000 >> 22) & 1023 = 768,这些页全局目录项代表虚拟地址前3G的页面,也就是所谓的用户区,我们在这里把它全清零了。