设置每CPU环境

    技术2024-07-30  64

    5.3 设置每CPU环境

    回到start_kernel,563行调用mm_init_owner函数,将init_mm的owner字段指回init_task。这个函数可以说是进入start_kernel以来最简单的函数了。继续走,setup_command_line也很简单:

     

    /*
     * Preserve the kernel command line in two bootmem-backed copies:
     * saved_command_line keeps the original boot_command_line, while
     * static_command_line keeps the (possibly arch-modified) copy
     * handed in by the caller.
     */
    static void __init setup_command_line(char *command_line)
    {
           /* Allocate and duplicate the pristine boot command line. */
           saved_command_line = alloc_bootmem(strlen(boot_command_line) + 1);
           strcpy(saved_command_line, boot_command_line);

           /* Allocate and duplicate the working command line. */
           static_command_line = alloc_bootmem(strlen(command_line) + 1);
           strcpy(static_command_line, command_line);
    }

     

    把刚才在setup_arch()中拷贝进来的command_line,拷贝到全局变量saved_command_line和static_command_line所指向的内存单元中。这个内存单元通过alloc_bootmem函数在刚刚建立好的内存管理环境中进行分配。

     

    继续走,565行,setup_nr_cpu_ids()函数,在多CPU情况下,调用同一文件中的:

    /*
     * Set nr_cpu_ids to one past the index of the highest possible CPU,
     * as reported by the last set bit of cpu_possible_mask.
     */
    static void __init setup_nr_cpu_ids(void)
    {
           unsigned int highest_cpu;

           highest_cpu = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
           nr_cpu_ids = highest_cpu + 1;
    }

     

    nr_cpu_ids是一个特殊的值,在单CPU情况下是1;而SMP情况下,又是一个全局变量,被find_last_bit函数设置,针对x86体系其本质上会调用bsr汇编指令。这里我大概介绍一下这个指令的概念,可能有不对的地方,请高手指教。386以上的CPU有一对指令BSF/BSR——正/反向位扫描。这个指令的使用方法是:BSF dest,src,影响标志位ZF。这个指令的意思是,扫描源操作数中的第一个被设置的位:如果发现某一位被设置了,则清除ZF,并将该位的索引装载到目的操作数中;如果源操作数为0(即没有被设置的位),则置位ZF,此时目的操作数内容未定义。BSF正向扫描各个位(从第0位到第N位),BSR相反(从第N位到第0位)。

     

    继续走,566行,setup_per_cpu_areas,来自arch/x86/kernel/setup_percpu.c。这个函数只是设置一下SMP的每CPU存储区,也就是说为系统中的每个cpu的per_cpu变量申请空间。函数比较复杂,我这里只把整个函数列出来,对SMP感兴趣的同学可以尝试深入分析一下:

     

    /*
     * setup_per_cpu_areas() - allocate the first percpu chunk for every
     * possible CPU, record each CPU's offset, and migrate early boot-time
     * per-cpu data from the static arrays into the new areas.
     *
     * Transcription fixes vs. the quoted text: "\n" had been garbled to
     * "/n" in both format strings, and the page-fallback PTE populator is
     * pcpu_populate_pte (was mistyped "pcpup_populate_pte").
     */
    void __init setup_per_cpu_areas(void)
    {
            unsigned int cpu;
            unsigned long delta;
            int rc;

            pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                   NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

            /*
             * Allocate percpu area.  Embedding allocator is our favorite;
             * however, on NUMA configurations, it can result in very
             * sparse unit mapping and vmalloc area isn't spacious enough
             * on 32bit.  Use page in that case.
             */
    #ifdef CONFIG_X86_32
            if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
                    pcpu_chosen_fc = PCPU_FC_PAGE;
    #endif
            rc = -EINVAL;
            if (pcpu_chosen_fc != PCPU_FC_PAGE) {
                    /* Huge-page-sized atoms when PSE is available. */
                    const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
                    const size_t dyn_size = PERCPU_MODULE_RESERVE +
                            PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

                    rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                                             dyn_size, atom_size,
                                             pcpu_cpu_distance,
                                             pcpu_fc_alloc, pcpu_fc_free);
                    if (rc < 0)
                            pr_warning("%s allocator failed (%d), falling back to page size\n",
                                      pcpu_fc_names[pcpu_chosen_fc], rc);
            }
            /* Fall back to the page-granular first-chunk allocator. */
            if (rc < 0)
                    rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                                            pcpu_fc_alloc, pcpu_fc_free,
                                            pcpu_populate_pte);
            if (rc < 0)
                    panic("cannot initialize percpu area (err=%d)", rc);

            /* alrighty, percpu areas up and running */
            delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
            for_each_possible_cpu(cpu) {
                    per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
                    per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                    per_cpu(cpu_number, cpu) = cpu;
                    setup_percpu_segment(cpu);
                    setup_stack_canary_segment(cpu);
                    /*
                     * Copy data used in early init routines from the
                     * initial arrays to the per cpu data areas.  These
                     * arrays then become expendable and the *_early_ptr's
                     * are zeroed indicating that the static arrays are
                     * gone.
                     */
    #ifdef CONFIG_X86_LOCAL_APIC
                    per_cpu(x86_cpu_to_apicid, cpu) =
                           early_per_cpu_map(x86_cpu_to_apicid, cpu);
                    per_cpu(x86_bios_cpu_apicid, cpu) =
                           early_per_cpu_map(x86_bios_cpu_apicid, cpu);
    #endif
    #ifdef CONFIG_X86_64
                    per_cpu(irq_stack_ptr, cpu) =
                           per_cpu(irq_stack_union.irq_stack, cpu) +
                           IRQ_STACK_SIZE - 64;
    #ifdef CONFIG_NUMA
                    per_cpu(x86_cpu_to_node_map, cpu) =
                           early_per_cpu_map(x86_cpu_to_node_map, cpu);
    #endif
    #endif
                    /*
                     * Up to this point, the boot CPU has been using .data.init
                     * area.  Reload any changed state for the boot CPU.
                     */
                    if (cpu == boot_cpu_id)
                           switch_to_new_gdt(cpu);
            }

            /* indicate the early static arrays will soon be gone */
    #ifdef CONFIG_X86_LOCAL_APIC
            early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
            early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
    #endif
    #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
            early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
    #endif

    #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
            /*
             * make sure boot cpu node_number is right, when boot cpu is on the
             * node that doesn't have mem installed
             */
            per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
    #endif

            /* Setup node to cpumask map */
            setup_node_to_cpumask_map();

            /* Setup cpu initialized, callin, callout masks */
            setup_cpu_local_masks();
    }

     

    在该函数中,为每个CPU分配一段专有数据区,并将.data.percpu中的数据拷贝到其中,每个CPU各有一份。由于数据从__per_cpu_start处转移到各CPU自己的专有数据区中了,因此存取其中的变量就不能再用原先的地址了,比如存取per_cpu__runqueues,就不能再用它在.data.percpu节中的原地址了,需要做一个偏移量的调整,即加上各CPU自己的专有数据区首地址相对于__per_cpu_start的偏移量。对CPU i而言,这个偏移量就是__per_cpu_offset[i]。

     

    这样,就可以方便地计算专有数据区中各变量的新地址,比如对于per_cpu__runqueues,其新地址即变成per_cpu__runqueues+__per_cpu_offset[i]。

     

    经过这样的处理,.data.percpu这个section在系统初始化后就可以释放了。为什么要释放它?OK,自己去看arch/x86/kernel/vmlinux.lds文件,整个.data.percpu这个section都在__init_begin和__init_end之间,也就是说,该section所占内存会在系统启动后释放(free)掉。

     

    继续走,start_kernel567smp_prepare_boot_cpu函数,来自arch/x86/include/asm/smp.h

    /*
     * Prepare the boot CPU for SMP operation by delegating to the
     * platform hook installed in the global smp_ops structure
     * (native_smp_prepare_boot_cpu in the initializer quoted below).
     */
    static inline void smp_prepare_boot_cpu(void)

    {

           smp_ops.smp_prepare_boot_cpu();

    }

     

    全局变量smp_ops也是一个smp_ops结构,在代码arch/x86/kernel/smp.c中被初始化成:

     

    /*
     * Native (bare-metal x86) SMP operations table.  Each member is a
     * hook that start_kernel and the CPU-hotplug paths call indirectly,
     * so alternative platforms can install their own implementations.
     */
    struct smp_ops smp_ops = {
           /* Boot-time bring-up hooks. */
           .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,

           .smp_prepare_cpus = native_smp_prepare_cpus,

           .smp_cpus_done            = native_smp_cpus_done,

     

           /* Cross-CPU signalling (IPI) hooks. */
           .smp_send_stop             = native_smp_send_stop,

           .smp_send_reschedule    = native_smp_send_reschedule,

     

           /* CPU hotplug lifecycle hooks. */
           .cpu_up                 = native_cpu_up,

           .cpu_die         = native_cpu_die,

           .cpu_disable           = native_cpu_disable,

           .play_dead             = native_play_dead,

     

           /* Function-call IPIs used by smp_call_function*(). */
           .send_call_func_ipi = native_send_call_func_ipi,

           .send_call_func_single_ipi = native_send_call_func_single_ipi,

    };

     

    所以,567smp_prepare_boot_cpu函数最终调用native_smp_prepare_boot_cpu函数。该函数最终会调用switch_to_new_gdt函数,传给他的参数是当前CPU的编号:

     

    void switch_to_new_gdt(int cpu)

    {

           struct desc_ptr gdt_descr;

     

           gdt_descr.address = (long)get_cpu_gdt_table(cpu);

           gdt_descr.size = GDT_SIZE - 1;

           load_gdt(&gdt_descr);

           /* Reload the per-cpu base */

     

           load_percpu_segment(cpu);

    }

     

    load_gdt很熟悉了,就是调用lgdt汇编指令加载GDT表。那么,自系统启动至此,已经有三次加载GDT了,为啥这里又来加载一次呢?这里面的关键是get_cpu_gdt_table函数,来自arch/x86/include/asm/desc.h

    /*
     * Return a pointer to @cpu's GDT, which lives in that CPU's
     * per-cpu copy of the gdt_page variable.
     */
    static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)

    {

           return per_cpu(gdt_page, cpu).gdt;

    }

     

    好了,SMP的重中之重来了,per_cpu宏,这里重点介绍它:

    /*
     * per_cpu(var, cpu) - lvalue access to `var` inside @cpu's per-cpu
     * area, by shifting &var with that CPU's per-cpu offset.
     * (Line continuation restored to '\' — it had been garbled to '/'.)
     */
    #define per_cpu(var, cpu) \
           (*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu)))

     

    我们看到传入这个宏的两个参数是gdt_page和cpu。gdt_page还记得吧,我们在“初始化GDT”http://blog.csdn.net/yunsongice/archive/2010/12/31/6110703.aspx中讲过,包含32个8字节的段描述符;cpu,是刚刚传进来的当前CPU的id。翻译过来就是:

    *SHIFT_PERCPU_PTR(&(gdt_page), per_cpu_offset(cpu))

     

    /*
     * SHIFT_PERCPU_PTR(__p, __offset) - relocate per-cpu pointer __p by
     * __offset.  __verify_pcpu_ptr() is a compile-time pointer check;
     * RELOC_HIDE does the actual offset arithmetic.
     * (Line continuations restored to '\' — they had been garbled to '/'.)
     */
    #define SHIFT_PERCPU_PTR(__p, __offset)     ({                         \
           __verify_pcpu_ptr((__p));                                 \
           RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
    })

     

    __verify_pcpu_ptrGCC的一个优化过程,我们忽略,所以继续翻译就是:

    RELOC_HIDE((typeof(*(&(gdt_page))) __kernel __force *)(&(gdt_page)), (per_cpu_offset(cpu))

     

    /*
     * RELOC_HIDE(ptr, off) - compute ptr + off while laundering ptr
     * through an empty asm so the compiler cannot apply aliasing or
     * constant-folding assumptions to the resulting address.
     * (Line continuations restored to '\' — they had been garbled to '/'.)
     */
    #define RELOC_HIDE(ptr, off)                                \
      ({ unsigned long __ptr;                                 \
        __asm__ ("" : "=r"(__ptr) : "0"(ptr));          \
        (typeof(ptr)) (__ptr + (off)); })

     

    对于per_cpu(gdt_page, cpu),将等效地扩展为:

    __per_cpu_offset[smp_processor_id()] + per_cpu__gdt_page

    并且是一个lvalue,也就是说可以进行赋值操作。这正好是上述per_cpu__runqueues变量在对应CPU的专有数据区中的新地址。

     

    由于不同的每cpu变量有不同的偏移量,并且不同的CPU其专有数据区首地址不同, 因此,通过per_cpu (var,cpu)便访问到了不同的变量。对这个概念还不是很清楚的同学请查阅一下博客“每CPU变量”http://blog.csdn.net/yunsongice/archive/2010/05/18/5605239.aspx

     

    最新回复(0)