Back in start_kernel(), line 569 calls build_all_zonelists(), which lives in mm/page_alloc.c:
2815void build_all_zonelists(void)
2816{
2817 set_zonelist_order();
2818
2819 if (system_state == SYSTEM_BOOTING) {
2820 __build_all_zonelists(NULL);
2821 mminit_verify_zonelist();
2822 cpuset_init_current_mems_allowed();
2823 } else {
2824 /* we have to stop all cpus to guarantee there is no user
2825 of zonelist */
2826 stop_machine(__build_all_zonelists, NULL, NULL);
2827 /* cpuset refresh routine should be here */
2828 }
2829 vm_total_pages = nr_free_pagecache_pages();
2830 /* ... a big block of comments omitted ... */
2837 if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
2838 page_group_by_mobility_disabled = 1;
2839 else
2840 page_group_by_mobility_disabled = 0;
2841
2842 printk("Built %i zonelists in %s order, mobility grouping %s. "
2843 "Total pages: %ld/n",
2844 nr_online_nodes,
2845 zonelist_order_name[current_zonelist_order],
2846 page_group_by_mobility_disabled ? "off" : "on",
2847 vm_total_pages);
2848#ifdef CONFIG_NUMA
2849 printk("Policy zone: %s/n", zone_names[policy_zone]);
2850#endif
2851}
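Before going inside, it is worth pausing at lines 2837-2840: grouping pages by mobility is switched off when the system does not even have one pageblock's worth of pages per migrate type. As a back-of-the-envelope check, here is a standalone sketch of that arithmetic; the concrete numbers (4 KiB pages, pageblock_nr_pages = 1024, MIGRATE_TYPES = 5) are assumptions for a common x86 configuration, not values read from a real kernel build.

#include <stdio.h>

/* Standalone sketch of the threshold check at line 2837.
 * Every value here is assumed for illustration. */
int main(void)
{
        unsigned long page_size          = 4096;  /* bytes per page (assumed) */
        unsigned long pageblock_nr_pages = 1024;  /* pages per pageblock (assumed) */
        unsigned long migrate_types      = 5;     /* number of migrate types (assumed) */
        unsigned long threshold = pageblock_nr_pages * migrate_types;

        printf("mobility grouping disabled below %lu pages (~%lu MiB)\n",
               threshold, threshold * page_size >> 20);
        return 0;
}

Under those assumptions the cutoff is 5120 pages, roughly 20 MiB, so on any normally sized machine page_group_by_mobility_disabled stays 0 and grouping remains on.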
At boot time, with system_state == SYSTEM_BOOTING, this essentially boils down to the call to __build_all_zonelists(NULL):
2780/* return values int ....just for stop_machine() */
2781static int __build_all_zonelists(void *dummy)
2782{
2783 int nid;
2784 int cpu;
2785
2786#ifdef CONFIG_NUMA
2787 memset(node_load, 0, sizeof(node_load));
2788#endif
2789 for_each_online_node(nid) {
2790 pg_data_t *pgdat = NODE_DATA(nid);
2791
2792 build_zonelists(pgdat);
2793 build_zonelist_cache(pgdat);
2794 }
2795
2796 /* ... a big block of comments omitted ... */
2809 for_each_possible_cpu(cpu)
2810 setup_pageset(&per_cpu(boot_pageset, cpu), 0);
2811
2812 return 0;
2813}
Line 2789: for_each_online_node is by now familiar; on a single-node (UMA) system the loop body runs exactly once. Line 2790 fetches the well-known pg_data_t, i.e. the structure behind NODE_DATA(0). A minimal model of that single-iteration behaviour is sketched below, and then we step into build_zonelists().
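To make the "runs only once" point concrete, here is a tiny userspace model of iterating an online-node bitmap in which only node 0 is set. The macro name model_for_each_online_node and the fixed-size mask are illustrative assumptions; the kernel's real for_each_online_node walks a nodemask_t of online nodes.

#include <stdio.h>

#define MAX_NUMNODES 4

/* Only node 0 is online, as on a typical UMA machine (assumed). */
static unsigned long node_online_map = 1UL << 0;

/* Illustrative stand-in for the kernel's for_each_online_node. */
#define model_for_each_online_node(nid)                        \
        for ((nid) = 0; (nid) < MAX_NUMNODES; (nid)++)         \
                if (node_online_map & (1UL << (nid)))

int main(void)
{
        int nid;

        model_for_each_online_node(nid)
                printf("building zonelists for node %d\n", nid); /* prints once */
        return 0;
}

With that in mind, here is build_zonelists() itself: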
2637static void build_zonelists(pg_data_t *pgdat)
2638{
2639 int j, node, load;
2640 enum zone_type i;
2641 nodemask_t used_mask;
2642 int local_node, prev_node;
2643 struct zonelist *zonelist;
2644 int order = current_zonelist_order;
2645
2646 /* initialize zonelists */
2647 for (i = 0; i < MAX_ZONELISTS; i++) {
2648 zonelist = pgdat->node_zonelists + i;
2649 zonelist->_zonerefs[0].zone = NULL;
2650 zonelist->_zonerefs[0].zone_idx = 0;
2651 }
2652
2653 /* NUMA-aware ordering of nodes */
2654 local_node = pgdat->node_id;
2655 load = nr_online_nodes;
2656 prev_node = local_node;
2657 nodes_clear(used_mask);
2658
2659 memset(node_order, 0, sizeof(node_order));
2660 j = 0;
2661
2662 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
2663 int distance = node_distance(local_node, node);
2664
2665 /*
2666 * If another node is sufficiently far away then it is better
2667 * to reclaim pages in a zone before going off node.
2668 */
2669 if (distance > RECLAIM_DISTANCE)
2670 zone_reclaim_mode = 1;
2671
2672 /*
2673 * We don't want to pressure a particular node.
2674 * So adding penalty to the first node in same
2675 * distance group to make it round-robin.
2676 */
2677 if (distance != node_distance(local_node, prev_node))
2678 node_load[node] = load;
2679
2680 prev_node = node;
2681 load--;
2682 if (order == ZONELIST_ORDER_NODE)
2683 build_zonelists_in_node_order(pgdat, node);
2684 else
2685 node_order[j++] = node; /* remember order */
2686 }
2687
2688 if (order == ZONELIST_ORDER_ZONE) {
2689 /* calculate node order -- i.e., DMA last! */
2690 build_zonelists_in_zone_order(pgdat, j);
2691 }
2692
2693 build_thisnode_zonelists(pgdat);
2694}
Lines 2647-2651 of build_zonelists() initialize the node_zonelists field of NODE_DATA(0); the rest of the function works out the node fallback order and fills the zonelists accordingly. A simplified model of the structure it builds is sketched below.
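As a mental model of what build_zonelists() leaves behind, the sketch below shows a cut-down zonelist/zoneref layout and how an allocation would walk it as a fallback list, preferred zone first and DMA last. The struct definitions and the data in them are simplified assumptions for illustration, not the kernel's real ones.

#include <stdio.h>

struct zone {
        const char *name;       /* e.g. "DMA", "Normal" (toy data, assumed) */
};

struct zoneref {
        struct zone *zone;      /* NULL terminates the fallback list */
        int zone_idx;           /* index of the zone type */
};

struct zonelist {
        struct zoneref _zonerefs[4];
};

int main(void)
{
        struct zone dma = { "DMA" }, normal = { "Normal" };
        struct zoneref *z;

        /* Preferred zone first, DMA last -- echoing the "DMA last!"
         * comment at line 2689; the NULL terminator mirrors lines 2649-2650. */
        struct zonelist zl = {
                ._zonerefs = {
                        { &normal, 1 },
                        { &dma,    0 },
                        { NULL,    0 },
                },
        };

        /* An allocation walks the list front to back until it finds
         * a zone with enough free pages. */
        for (z = zl._zonerefs; z->zone; z++)
                printf("try zone %s (idx %d)\n", z->zone->name, z->zone_idx);
        return 0;
}

With that picture in place, let's keep going with build_zonelist_cache():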
2697static void build_zonelist_cache(pg_data_t *pgdat)
2698{
2699 struct zonelist *zonelist;
2700 struct zonelist_cache *zlc;
2701 struct zoneref *z;
2702
2703 zonelist = &pgdat->node_zonelists[0];
2704 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2705 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2706 for (z = zonelist->_zonerefs; z->zone; z++)
2707 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2708}
build_zonelist_cache() initializes the per-zonelist cache of zone state; I will not dig into it here, though a rough model of the idea is sketched below. Back in build_all_zonelists(), once we skip over the debug code and the few global variables that record zone policy, the function is done.
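Although we are not diving any deeper into build_zonelist_cache() here, the idea behind it is easy to model: keep, per zonelist, a bitmap of zones that recently looked full plus a small table mapping each zoneref index to its node, so the allocator can skip hopeless zones cheaply on later scans. The userspace sketch below captures only that idea; the names and sizes are simplified assumptions rather than the kernel's struct zonelist_cache.

#include <stdio.h>
#include <string.h>

#define MAX_ZONES_PER_ZONELIST 8

/* Simplified stand-in for the kernel's zonelist cache (layout assumed). */
struct model_zonelist_cache {
        int z_to_n[MAX_ZONES_PER_ZONELIST];  /* zoneref index -> node id */
        unsigned long fullzones;             /* bit i set => zoneref i looked full */
};

int main(void)
{
        struct model_zonelist_cache zlc;
        int i;

        memset(&zlc, 0, sizeof(zlc));   /* mirrors bitmap_zero() at line 2705 */
        zlc.z_to_n[0] = 0;              /* both zonerefs live on node 0 here */
        zlc.z_to_n[1] = 0;

        zlc.fullzones |= 1UL << 1;      /* pretend zoneref 1 was just found full */

        for (i = 0; i < 2; i++) {
                if (zlc.fullzones & (1UL << i))
                        printf("skip zoneref %d (cached as full, node %d)\n",
                               i, zlc.z_to_n[i]);
                else
                        printf("scan zoneref %d (node %d)\n", i, zlc.z_to_n[i]);
        }
        return 0;
}

Recording a node id per zoneref is what lets the allocator also check cheaply whether a zone's node is allowed at all, for example by the current cpuset.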