启用伙伴算法

    技术2024-07-17  91

    5.8 初始化内存管理

    回到start_kernel,下一个函数执行mm_init()。这个函数很重要了,来自同一个文件。

     

    /*
     * mm_init - run the memory-management initialisation steps, in this
     * fixed order: page_cgroup_init_flatmem(), mem_init(),
     * kmem_cache_init(), pgtable_cache_init(), vmalloc_init().
     * Takes no arguments and returns nothing.
     */
    static void __init mm_init(void)

    {

           /*

            * page_cgroup requires contiguous pages as memmap

            * and it's bigger than MAX_ORDER unless SPARSEMEM.

            */

           page_cgroup_init_flatmem();

           mem_init();

           kmem_cache_init();

           pgtable_cache_init();

           vmalloc_init();

    }

     

    这五个函数,其中由于我们没有配置CONFIG_CGROUP_MEM_RES_CTLR,所以第一个函数page_cgroup_init_flatmem是个空函数。其余几个函数各个都是重点。

     

    该函数执行完后不能再用像alloc_bootmem()、alloc_bootmem_low()、alloc_bootmem_pages()等申请低端内存的函数来申请内存,也就不能申请大块的连续物理内存了。

     

    5.8.1 启用伙伴算法

    首先是mem_init,来自arch/x86/mm/init_32.c

    867void __init mem_init(void)

     868{

     869        int codesize, reservedpages, datasize, initsize;

     870        int tmp;

     871

     872        pci_iommu_alloc();

     873

     874#ifdef CONFIG_FLATMEM

     875        BUG_ON(!mem_map);

     876#endif

     877        /* this will put all low memory onto the freelists */

     878        totalram_pages += free_all_bootmem();

     879

     880        reservedpages = 0;

     881        for (tmp = 0; tmp < max_low_pfn; tmp++)

     882                /*

     883                 * Only count reserved RAM pages:

     884                 */

     885                if (page_is_ram(tmp) && PageReserved(pfn_to_page(tmp)))

     886                        reservedpages++;

     887

     888        set_highmem_pages_init();

     889

     890        codesize =  (unsigned long) &_etext - (unsigned long) &_text;

     891        datasize =  (unsigned long) &_edata - (unsigned long) &_etext;

     892        initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

     893

     894        printk(KERN_INFO "Memory: %luk/%luk available (%dk kernel code, "

     895                        "%dk reserved, %dk data, %dk init, %ldk highmem)\n",

     896                nr_free_pages() << (PAGE_SHIFT-10),

     897                num_physpages << (PAGE_SHIFT-10),

     898                codesize >> 10,

     899                reservedpages << (PAGE_SHIFT-10),

     900                datasize >> 10,

     901                initsize >> 10,

     902                totalhigh_pages << (PAGE_SHIFT-10));

     903

     904        printk(KERN_INFO "virtual kernel memory layout:\n"

     905                "    fixmap  : 0x%08lx - 0x%08lx   (%4ld kB)\n"

     906#ifdef CONFIG_HIGHMEM

     907                "    pkmap   : 0x%08lx - 0x%08lx   (%4ld kB)\n"

     908#endif

     909                "    vmalloc : 0x%08lx - 0x%08lx   (%4ld MB)\n"

     910                "    lowmem  : 0x%08lx - 0x%08lx   (%4ld MB)\n"

     911                "      .init : 0x%08lx - 0x%08lx   (%4ld kB)\n"

     912                "      .data : 0x%08lx - 0x%08lx   (%4ld kB)\n"

     913                "      .text : 0x%08lx - 0x%08lx   (%4ld kB)\n",

     914                FIXADDR_START, FIXADDR_TOP,

     915                (FIXADDR_TOP - FIXADDR_START) >> 10,

     916

     917#ifdef CONFIG_HIGHMEM

     918                PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE,

     919                (LAST_PKMAP*PAGE_SIZE) >> 10,

     920#endif

     921

     922                VMALLOC_START, VMALLOC_END,

     923                (VMALLOC_END - VMALLOC_START) >> 20,

     924

     925                (unsigned long)__va(0), (unsigned long)high_memory,

     926                ((unsigned long)high_memory - (unsigned long)__va(0)) >> 20,

     927

     928                (unsigned long)&__init_begin, (unsigned long)&__init_end,

     929                ((unsigned long)&__init_end -

     930                 (unsigned long)&__init_begin) >> 10,

     931

     932                (unsigned long)&_etext, (unsigned long)&_edata,

     933                ((unsigned long)&_edata - (unsigned long)&_etext) >> 10,

     934

     935                (unsigned long)&_text, (unsigned long)&_etext,

     936                ((unsigned long)&_etext - (unsigned long)&_text) >> 10);

     937

     938        /*

     939         * Check boundaries twice: Some fundamental inconsistencies can

     940         * be detected at build time already.

     941         */

     942#define __FIXADDR_TOP (-PAGE_SIZE)

     943#ifdef CONFIG_HIGHMEM

     944        BUILD_BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE  > FIXADDR_START);

     945        BUILD_BUG_ON(VMALLOC_END                        > PKMAP_BASE);

     946#endif

     947#define high_memory (-128UL << 20)

     948        BUILD_BUG_ON(VMALLOC_START                      >= VMALLOC_END);

     949#undef high_memory

     950#undef __FIXADDR_TOP

     951

     952#ifdef CONFIG_HIGHMEM

     953        BUG_ON(PKMAP_BASE + LAST_PKMAP*PAGE_SIZE        > FIXADDR_START);

     954        BUG_ON(VMALLOC_END                              > PKMAP_BASE);

     955#endif

     956        BUG_ON(VMALLOC_START                            >= VMALLOC_END);

     957        BUG_ON((unsigned long)high_memory               > VMALLOC_START);

     958

     959        if (boot_cpu_data.wp_works_ok < 0)

     960                test_wp_bit();

     961

     962        save_pg_dir();

     963        zap_low_mappings(true);

     964}

     

    872行,Intel IOMMU架构在Linux上的初始化函数pci_iommu_alloc。这个函数不是我们关注的重点,我们就不深入下去了,这里仅仅粗略地介绍一下。该函数首先通过读取 DMA Remapping table,来判断是否支持DMAR设备。随后调用pci_swiotlb_init函数对其进行初始化,解析DMAR table,并逐一打印每个dmar项。最后设置全局变量dma_ops,把初始化后的swiotlb_dma_ops传递给它,后者定义了IOMMU架构中所有的swiotlb方法。对IOMMU感兴趣的同学可以去查阅相关资料,这里就不详细介绍了。

     

    878行,totalram_pages这个全局变量我们第一次遇见。它编译的时候初始化为0,现在它就等于free_all_bootmem函数的返回值,该函数在mm/bootmem.c中定义:

     

    /*
     * free_all_bootmem - release boot-time memory and report how much.
     *
     * With CONFIG_NO_BOOTMEM defined, returns whatever
     * free_all_memory_core_early(MAX_NUMNODES) returns.  Otherwise walks
     * every bootmem_data_t on bdata_list and returns the sum of the
     * free_all_bootmem_core() results.
     */
    unsigned long __init free_all_bootmem(void)

    {

    #ifdef CONFIG_NO_BOOTMEM

           /*

            * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id

            *  because in some case like Node0 doesnt have RAM installed

            *  low ram will be on Node1

            * Use MAX_NUMNODES will make sure all ranges in early_node_map[]

            *  will be used instead of only Node0 related

            */

           return free_all_memory_core_early(MAX_NUMNODES);

    #else

           unsigned long total_pages = 0;

           bootmem_data_t *bdata;

     

           /* accumulate the per-entry result for each element of bdata_list */
           list_for_each_entry(bdata, &bdata_list, list)

                  total_pages += free_all_bootmem_core(bdata);

     

           return total_pages;

    #endif

    }

     

    我们看到,由于CONFIG_NO_BOOTMEM起作用,并且MAX_NUMNODES为1,所以函数直接调用free_all_memory_core_early(1),怎么样,前面说得没错吧,终于碰到了这个函数:

     

    200unsigned long __init free_all_memory_core_early(int nodeid)

     201{

     202        int i;

     203        u64 start, end;

     204        unsigned long count = 0;

     205        struct range *range = NULL;

     206        int nr_range;

     207

     208        nr_range = get_free_all_memory_range(&range, nodeid);

     209

     210        for (i = 0; i < nr_range; i++) {

     211                start = range[i].start;

     212                end = range[i].end;

     213                count += end - start;

     214                __free_pages_memory(start, end);

     215        }

     216

     217        return count;

     218}

     

    205行的那个range结构很简单:

    /*
     * A start/end pair.  In the code quoted in this file the values stored
     * are page frame numbers (add_from_early_node_map() fills them from
     * early_node_map[i].start_pfn / .end_pfn).
     */
    struct range {

           u64   start;

           u64   end;

    };

     

    所以首先208行调用get_free_all_memory_range函数:

     

    393int __init get_free_all_memory_range(struct range **rangep, int nodeid)

     394{

     395        int i, count;

     396        u64 start = 0, end;

     397        u64 size;

     398        u64 mem;

     399        struct range *range;

     400        int nr_range;

     401

     402        count  = 0;

     403        for (i = 0; i < max_early_res && early_res[i].end; i++)

     404                count++;

     405

     406        count *= 2;

     407

     408        size = sizeof(struct range) * count;

     409        end = get_max_mapped();

     410#ifdef MAX_DMA32_PFN

     411        if (end > (MAX_DMA32_PFN << PAGE_SHIFT))

     412                start = MAX_DMA32_PFN << PAGE_SHIFT;

     413#endif

     414        mem = find_fw_memmap_area(start, end, size, sizeof(struct range));

     415        if (mem == -1ULL)

     416                panic("can not find more space for range free");

     417

     418        range = __va(mem);

     419        /* use early_node_map[] and early_res to get range array at first */

     420        memset(range, 0, size);

     421        nr_range = 0;

     422

     423        /* need to go over early_node_map to find out good range for node */

     424        nr_range = add_from_early_node_map(range, count, nr_range, nodeid);

     425#ifdef CONFIG_X86_32

     426        subtract_range(range, count, max_low_pfn, -1ULL);

     427#endif

     428        subtract_early_res(range, count);

     429        nr_range = clean_sort_range(range, count);

     430

     431        /* need to clear it ? */

     432        if (nodeid == MAX_NUMNODES) {

     433                memset(&early_res[0], 0,

     434                         sizeof(struct early_res) * max_early_res);

     435                early_res = NULL;

     436                max_early_res = 0;

     437        }

     438

     439        *rangep = range;

     440        return nr_range;

     441}

     

    403行,全局变量max_early_res和early_res[]数组,老熟人了,一个循环得到目前已经分配了的early_res元素的个数count;406行把count乘以2,408行再乘以range结构的大小,得到的字节数赋给size。409行,调用get_max_mapped函数:

    /*
     * get_max_mapped - return max_pfn_mapped shifted left by PAGE_SHIFT,
     * as a u64.
     */
    u64 __init get_max_mapped(void)

    {

           /* copy into a u64 first, so the shift below is done in 64 bits */
           u64 end = max_pfn_mapped;

           end <<= PAGE_SHIFT;

           return end;

    }

     

    该函数返回我们的老熟人——最后一个页框max_pfn_mapped对应的物理地址,赋值给内部变量end;start在396行被赋值为0。然后414行调用find_fw_memmap_area函数,传给它的参数是start、end、size和range结构的大小:

    /*
     * find_fw_memmap_area - thin wrapper: forwards start, end, size and
     * align unchanged to find_e820_area() and returns its result.
     * (get_free_all_memory_range() treats a result of -1ULL as failure.)
     */
    u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)

    {

           return find_e820_area(start, end, size, align);

    }

     

    find_e820_area不用多说了吧,从e820.map[]数组中寻找到一块能够容纳size个字节的内存段,该内存段的首物理地址赋值给get_free_all_memory_range的内部变量mem。418~421行初始化这块区域。随后424行调用add_from_early_node_map函数:

     

    /*
     * add_from_early_node_map - append early_node_map[] entries to range[].
     * @range:    destination array
     * @az:       capacity of @range, in elements (passed through to add_range())
     * @nr_range: number of elements of @range already in use
     * @nid:      node id handed to for_each_active_range_index_in_nid()
     *
     * For every index the iterator yields, the entry's start_pfn/end_pfn
     * pair is offered to add_range().  Returns the updated element count.
     */
    int __init add_from_early_node_map(struct range *range, int az,

                                   int nr_range, int nid)

    {

           int i;

           u64 start, end;

     

           /* need to go over early_node_map to find out good range for node */

           for_each_active_range_index_in_nid(i, nid) {

                  start = early_node_map[i].start_pfn;

                  end = early_node_map[i].end_pfn;

                  nr_range = add_range(range, az, nr_range, start, end);

           }

           return nr_range;

    }

     

    /*
     * add_range - store one start/end pair in the next free slot of range[].
     * @range:    array to append to
     * @az:       capacity of @range, in elements
     * @nr_range: number of slots already used
     * @start:    start value of the new entry
     * @end:      end value of the new entry
     *
     * Returns the new number of used slots.  The array is left untouched
     * and @nr_range is returned unchanged when @start >= @end or when the
     * array is already full.
     */
    int add_range(struct range *range, int az, int nr_range, u64 start, u64 end)

    {

           /* empty or inverted span: nothing to record */
           if (start >= end)

                  return nr_range;

     

           /* Out of slots: */

           if (nr_range >= az)

                  return nr_range;

     

           range[nr_range].start = start;

           range[nr_range].end = end;

     

           nr_range++;

     

           return nr_range;

    }

     

    执行完毕add_from_early_node_map函数之后,range指向的这块区域中,就形成了一个range[nr_range]数组,每个数组元素对应early_node_map[]的数组元素,表示nr_range块空闲内存空间的起始页框号和结束页框号。426行,subtract_range函数对这个range进行调整(在32位x86上去掉max_low_pfn及其以上的页框)。428行,调用subtract_early_res对产生冲突的地址进行调整:

     

    /*
     * subtract_early_res - remove the early reservations from range[].
     * @range: array of start/end pairs to carve the reservations out of
     * @az:    capacity of @range, passed through to subtract_range()
     *
     * Counts the entries of early_res[] up to the first one whose .end is
     * zero (never more than max_early_res).  Each counted reservation is
     * rounded outwards to page frame numbers (PFN_DOWN of the start,
     * PFN_UP of the end) and removed via subtract_range().
     *
     * The printk format strings end in "\n"; the listing previously had
     * the literal characters "/n", which would be printed verbatim and
     * leave the log lines unterminated.
     */
    static void __init subtract_early_res(struct range *range, int az)

    {

           int i, count;

           u64 final_start, final_end;

           int idx = 0;

     

           count  = 0;

           for (i = 0; i < max_early_res && early_res[i].end; i++)

                  count++;

     

           /*
            * need to skip first one ?
            * Entry 0 is skipped whenever early_res no longer points at
            * early_res_x.  NOTE(review): presumably entry 0 then describes
            * the relocated table itself - confirm against the code that
            * reassigns early_res.
            */

           if (early_res != early_res_x)

                  idx = 1;

     

    #define DEBUG_PRINT_EARLY_RES 1

     

    #if DEBUG_PRINT_EARLY_RES

           printk(KERN_INFO "Subtract (%d early reservations)\n", count);

    #endif

           for (i = idx; i < count; i++) {

                  struct early_res *r = &early_res[i];

    #if DEBUG_PRINT_EARLY_RES

                  printk(KERN_INFO "  #%d [%010llx - %010llx] %15s\n", i,

                         r->start, r->end, r->name);

    #endif

                  final_start = PFN_DOWN(r->start);

                  final_end = PFN_UP(r->end);

                  /* nothing to subtract for an empty span */
                  if (final_start >= final_end)

                         continue;

                  subtract_range(range, az, final_start, final_end);

           }

     

    }

     

    对early_res体系熟悉的同学对上述代码一定不会困惑,我们看到subtract_early_res对地址进行调整,去掉那些已经被占用了的地址空间。回到get_free_all_memory_range,最后两行,把range赋给结果参数rangep,并且返回最终的range数组的元素个数nr_range。

     

    回到free_all_memory_core_early函数中,内部变量range有了,其元素个数nr_range也有了,那么210~215行执行一个循环,对range数组的每一个元素调用__free_pages_memory进行释放:

     

    174static void __init __free_pages_memory(unsigned long start, unsigned long end)

     175{

     176        int i;

     177        unsigned long start_aligned, end_aligned;

     178        int order = ilog2(BITS_PER_LONG);

     179

     180        start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);

     181        end_aligned = end & ~(BITS_PER_LONG - 1);

     182

     183        if (end_aligned <= start_aligned) {

     184                for (i = start; i < end; i++)

     185                        __free_pages_bootmem(pfn_to_page(i), 0);

     186

     187                return;

     188        }

     189

     190        for (i = start; i < start_aligned; i++)

     191                __free_pages_bootmem(pfn_to_page(i), 0);

     192

     193        for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)

     194                __free_pages_bootmem(pfn_to_page(i), order);

     195

     196        for (i = end_aligned; i < end; i++)

     197                __free_pages_bootmem(pfn_to_page(i), 0);

     198}

     

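    函数主要执行183~188行代码,通过__free_pages_bootmem函数释放对应号码的页框,号码从start到end号。(当区间足够大、包含按BITS_PER_LONG对齐的整块时,中间部分会在193~194行以order等于ilog2(BITS_PER_LONG)成批释放,两端不对齐的页框则在190~191行和196~197行逐个释放。)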

     

    下面来看看__free_pages_bootmem:

     

    637void __meminit __free_pages_bootmem(struct page *page, unsigned int order)

     638{

     639        if (order == 0) {

     640                __ClearPageReserved(page);

     641                set_page_count(page, 0);

     642                set_page_refcounted(page);

     643                __free_page(page);

     644        } else {

     645                int loop;

     646

     647                prefetchw(page);

     648                for (loop = 0; loop < BITS_PER_LONG; loop++) {

     649                        struct page *p = &page[loop];

     650

     651                        if (loop + 1 < BITS_PER_LONG)

     652                                prefetchw(p + 1);

     653                        __ClearPageReserved(p);

     654                        set_page_count(p, 0);

     655                }

     656

     657                set_page_refcounted(page);

     658                __free_pages(page, order);

     659        }

     660}

     

    我们传递进来的参数order是0,所以来到643行,针对这个页面page,著名的伙伴算法到来了,我们来看它的定义:

    /* Single-page form: expands to __free_pages() with order 0. */
    #define __free_page(page) __free_pages((page), 0)

     

    释放页框的所有内核宏和函数都依赖于__free_pages()函数。它接收的参数为将要释放的第一个页框的页描述符的地址(page)和将要释放的一组连续页框的数量的对数(order)。该函数执行如下步骤:

    1.       检查第一个页框是否真正属于动态内存(它的PG_reserved 标志被清0);如果不是,则终止。

    2.       减少page->_count 使用计数器的值;如果它仍然大于或等于0,则终止。

    3.       如果order 等于0,那么该函数调用free_hot_page()来释放页框给适当内存管理区的每CPU 热高速缓存。

    4.       如果order大于0,那么它将页框加入到本地链表中,并调用free_pages_bulk()函数把它们释放到适当内存管理区的伙伴系统中。

     

    我们这里order是0,所以调用free_hot_page(),最终会调用__free_one_page。由于前面的pglist、zone的体系已经建立好,该函数对当前页面page对应的那个zone的free_area数组进行处理。由于这个地方是第一次触及该数组,那么这一次free_hot_page调用的__free_one_page将会找到全部伙伴,等于是初始化了整个伙伴算法系统。好了,怀疑我这句话的同志可以去看看博客“伙伴系统算法”:

    http://blog.csdn.net/yunsongice/archive/2010/01/22/5225155.aspx

     

    回到mem_init函数中,伙伴系统建立起来以后,free_all_bootmem返回空闲页面的总数给全局参数totalram_pages。随后880~886行代码计算被保留的页面数,保存在内部变量reservedpages中。888行,set_highmem_pages_init函数,通过调用add_highpages_work_fn函数初始化896MB以上的高端页面,并把他们加入伙伴系统,最后计算出包含了这些高端页面的新的可用页面的数量totalram_pages。

     

    继续走,890行,让内部变量codesize、datasize和initsize分别等于内核代码段、数据段和初始化相关函数指针空间段的大小。随后894~936行打印相关信息。942~957行是一群边界检查(BUILD_BUG_ON和BUG_ON),略去。962行,save_pg_dir()函数,来自同一文件:

    /* PAGE_SIZE-byte buffer, aligned to PAGE_SIZE; filled by save_pg_dir(). */
    char swsusp_pg_dir[PAGE_SIZE]

           __attribute__ ((aligned(PAGE_SIZE)));

    /* save_pg_dir - copy PAGE_SIZE bytes of swapper_pg_dir into swsusp_pg_dir. */
    static inline void save_pg_dir(void)

    {

           memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);

    }

     

    很简单,就是把页全局目录拷贝到全局变量swsusp_pg_dir数组中,做个备份。963行,执行zap_low_mappings(true)函数,这个函数也来自于同一个文件:

     

     

    /*
     * zap_low_mappings - overwrite the first KERNEL_PGD_BOUNDARY entries of
     * swapper_pg_dir, then flush the TLB.
     * @early: when true the flush is done with __flush_tlb(), otherwise
     *         with flush_tlb_all().
     *
     * Each entry is set to __pgd(0), or, under CONFIG_X86_PAE, to
     * __pgd(1 + __pa(empty_zero_page)).
     */
    void zap_low_mappings(bool early)

    {

           int i;

     

           /*

            * Zap initial low-memory mappings.

            *

            * Note that "pgd_clear()" doesn't do it for

            * us, because pgd_clear() is a no-op on i386.

            */

           for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {

    #ifdef CONFIG_X86_PAE

                  set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));

    #else

                  set_pgd(swapper_pg_dir+i, __pgd(0));

    #endif

           }

     

           /* the page-directory entries changed above, so flush the TLB */
           if (early)

                  __flush_tlb();

           else

                  flush_tlb_all();

    }

     

    这个函数很简单,就是把前面我们在arch/x86/kernel/head_32.S中设置的页全局目录的前若干项清零。这若干项到底是多少项呢?我们看看KERNEL_PGD_BOUNDARY是什么东西:

    /* Page-directory index of PAGE_OFFSET; loop bound in zap_low_mappings(). */
    #define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)

    /* Index for an address: shift right by PGDIR_SHIFT, mask with PTRS_PER_PGD - 1. */
    #define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

    /* Values as quoted for the configuration this article discusses. */
    #define PGDIR_SHIFT  22

    #define PTRS_PER_PGD     1024

     

    不错,0xc0000000>>22 & 1023 = 768,这些页全局目录项代表虚拟地址前3G的页面,也就是所谓的用户区,我们在这里把它全清零了。

    最新回复(0)