设置每CPU环境

    技术2024-07-30  64

    5.3 设置每CPU环境

    回到start_kernel,563行调用mm_init_owner函数,将init_mm的owner字段指回init_task。这个函数可以说是进入start_kernel以来最简单的函数了。继续走,setup_command_line也很简单:

     

    /*
     * Preserve the kernel command line in two bootmem-backed copies:
     * saved_command_line keeps the original boot_command_line, while
     * static_command_line keeps the (possibly arch-modified) copy
     * handed in by the caller.
     */
    static void __init setup_command_line(char *command_line)
    {
           /* Allocate and duplicate the pristine boot command line. */
           saved_command_line = alloc_bootmem(strlen(boot_command_line) + 1);
           strcpy(saved_command_line, boot_command_line);

           /* Allocate and duplicate the working command line. */
           static_command_line = alloc_bootmem(strlen(command_line) + 1);
           strcpy(static_command_line, command_line);
    }

     

    把刚才在setup_arch()中拷贝进来的command_line,拷贝到全局变量saved_command_line和static_command_line所指向的内存单元中。这个内存单元通过alloc_bootmem函数在刚刚建立好的内存管理环境中进行分配。

     

    继续走,565行,setup_nr_cpu_ids()函数,在多CPU情况下,调用同一文件中的:

    /*
     * Set nr_cpu_ids to one past the index of the highest possible CPU,
     * as reported by the last set bit of cpu_possible_mask.
     */
    static void __init setup_nr_cpu_ids(void)
    {
           unsigned int highest_cpu;

           highest_cpu = find_last_bit(cpumask_bits(cpu_possible_mask), NR_CPUS);
           nr_cpu_ids = highest_cpu + 1;
    }

     

    nr_cpu_ids是一个特殊的值,在单CPU情况下是1;而SMP情况下,又是一个全局变量,被find_last_bit函数设置,针对x86体系其本质上会调用bsr汇编指令。这里我大概介绍一下这个指令的概念,可能有不对的地方,请高手指教。386以上的CPU有一对指令BSF/BSR——正/反向位扫描。这个指令的使用方法是:BSF dest,src,影响标志位ZF。这个指令的意思是,扫描源操作数中的第一个被设置的位:如果发现某一位被设置了,则清除ZF,并将该位的索引装载到目的操作数中;如果源操作数为0(即没有被设置的位),则置位ZF,此时目的操作数内容未定义。BSF正向扫描各个位(从第0位到第N位),BSR相反(从第N位到第0位)。

     

    继续走,566行,setup_per_cpu_areas,来自arch/x86/kernel/setup_percpu.c。这个函数只是设置一下SMP的每CPU存储区,也就是说为系统中的每个cpu的per_cpu变量申请空间。函数比较复杂,我这里只把整个函数列出来,对SMP感兴趣的同学可以尝试深入分析一下:

     

    /*
     * setup_per_cpu_areas() - allocate the first percpu chunk for every
     * possible CPU, record each CPU's offset, and migrate early boot-time
     * per-cpu data from the static arrays into the new areas.
     *
     * Transcription fixes vs. the quoted text: "\n" had been garbled to
     * "/n" in both format strings, and the page-fallback PTE populator is
     * pcpu_populate_pte (was mistyped "pcpup_populate_pte").
     */
    void __init setup_per_cpu_areas(void)
    {
            unsigned int cpu;
            unsigned long delta;
            int rc;

            pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
                   NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);

            /*
             * Allocate percpu area.  Embedding allocator is our favorite;
             * however, on NUMA configurations, it can result in very
             * sparse unit mapping and vmalloc area isn't spacious enough
             * on 32bit.  Use page in that case.
             */
    #ifdef CONFIG_X86_32
            if (pcpu_chosen_fc == PCPU_FC_AUTO && pcpu_need_numa())
                    pcpu_chosen_fc = PCPU_FC_PAGE;
    #endif
            rc = -EINVAL;
            if (pcpu_chosen_fc != PCPU_FC_PAGE) {
                    /* Huge-page-sized atoms when PSE is available. */
                    const size_t atom_size = cpu_has_pse ? PMD_SIZE : PAGE_SIZE;
                    const size_t dyn_size = PERCPU_MODULE_RESERVE +
                            PERCPU_DYNAMIC_RESERVE - PERCPU_FIRST_CHUNK_RESERVE;

                    rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                                             dyn_size, atom_size,
                                             pcpu_cpu_distance,
                                             pcpu_fc_alloc, pcpu_fc_free);
                    if (rc < 0)
                            pr_warning("%s allocator failed (%d), falling back to page size\n",
                                      pcpu_fc_names[pcpu_chosen_fc], rc);
            }
            /* Fall back to the page-granular first-chunk allocator. */
            if (rc < 0)
                    rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE,
                                            pcpu_fc_alloc, pcpu_fc_free,
                                            pcpu_populate_pte);
            if (rc < 0)
                    panic("cannot initialize percpu area (err=%d)", rc);

            /* alrighty, percpu areas up and running */
            delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
            for_each_possible_cpu(cpu) {
                    per_cpu_offset(cpu) = delta + pcpu_unit_offsets[cpu];
                    per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
                    per_cpu(cpu_number, cpu) = cpu;
                    setup_percpu_segment(cpu);
                    setup_stack_canary_segment(cpu);
                    /*
                     * Copy data used in early init routines from the
                     * initial arrays to the per cpu data areas.  These
                     * arrays then become expendable and the *_early_ptr's
                     * are zeroed indicating that the static arrays are
                     * gone.
                     */
    #ifdef CONFIG_X86_LOCAL_APIC
                    per_cpu(x86_cpu_to_apicid, cpu) =
                           early_per_cpu_map(x86_cpu_to_apicid, cpu);
                    per_cpu(x86_bios_cpu_apicid, cpu) =
                           early_per_cpu_map(x86_bios_cpu_apicid, cpu);
    #endif
    #ifdef CONFIG_X86_64
                    per_cpu(irq_stack_ptr, cpu) =
                           per_cpu(irq_stack_union.irq_stack, cpu) +
                           IRQ_STACK_SIZE - 64;
    #ifdef CONFIG_NUMA
                    per_cpu(x86_cpu_to_node_map, cpu) =
                           early_per_cpu_map(x86_cpu_to_node_map, cpu);
    #endif
    #endif
                    /*
                     * Up to this point, the boot CPU has been using .data.init
                     * area.  Reload any changed state for the boot CPU.
                     */
                    if (cpu == boot_cpu_id)
                           switch_to_new_gdt(cpu);
            }

            /* indicate the early static arrays will soon be gone */
    #ifdef CONFIG_X86_LOCAL_APIC
            early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
            early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
    #endif
    #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
            early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
    #endif

    #if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
            /*
             * make sure boot cpu node_number is right, when boot cpu is on the
             * node that doesn't have mem installed
             */
            per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
    #endif

            /* Setup node to cpumask map */
            setup_node_to_cpumask_map();

            /* Setup cpu initialized, callin, callout masks */
            setup_cpu_local_masks();
    }

     

    在该函数中,为每个CPU分配一段专有数据区,并将.data.percpu中的数据拷贝到其中,每个CPU各有一份。由于数据从__per_cpu_start处转移到各CPU自己的专有数据区中了,因此存取其中的变量就不能再用原先的地址了,比如存取per_cpu__runqueues,就不能再用它在.data.percpu节中的原地址了,需要做一个偏移量的调整,即加上各CPU自己的专有数据区首地址相对于__per_cpu_start的偏移量。对CPU i而言,这个偏移量就是__per_cpu_offset[i]。

     

    这样,就可以方便地计算专有数据区中各变量的新地址,比如对于per_cpu__runqueues,其新地址即变成per_cpu__runqueues+__per_cpu_offset[i]。

     

    经过这样的处理,.data.percpu这个section在系统初始化后就可以释放了。为什么要释放它?OK,自己去看arch/x86/kernel/vmlinux.lds文件,整个.data.percpu这个section都在__init_begin和__init_end之间,也就是说,该section所占内存会在系统启动后释放(free)掉。

     

    继续走,start_kernel567smp_prepare_boot_cpu函数,来自arch/x86/include/asm/smp.h

    /*
     * Prepare the boot CPU for SMP operation by delegating to the
     * platform hook installed in the global smp_ops structure
     * (native_smp_prepare_boot_cpu in the initializer quoted below).
     */
    static inline void smp_prepare_boot_cpu(void)

    {

           smp_ops.smp_prepare_boot_cpu();

    }

     

    全局变量smp_ops也是一个smp_ops结构,在代码arch/x86/kernel/smp.c中被初始化成:

     

    /*
     * Native (bare-metal x86) SMP operations table.  Each member is a
     * hook that start_kernel and the CPU-hotplug paths call indirectly,
     * so alternative platforms can install their own implementations.
     */
    struct smp_ops smp_ops = {
           /* Boot-time bring-up hooks. */
           .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,

           .smp_prepare_cpus = native_smp_prepare_cpus,

           .smp_cpus_done            = native_smp_cpus_done,

     

           /* Cross-CPU signalling (IPI) hooks. */
           .smp_send_stop             = native_smp_send_stop,

           .smp_send_reschedule    = native_smp_send_reschedule,

     

           /* CPU hotplug lifecycle hooks. */
           .cpu_up                 = native_cpu_up,

           .cpu_die         = native_cpu_die,

           .cpu_disable           = native_cpu_disable,

           .play_dead             = native_play_dead,

     

           /* Function-call IPIs used by smp_call_function*(). */
           .send_call_func_ipi = native_send_call_func_ipi,

           .send_call_func_single_ipi = native_send_call_func_single_ipi,

    };

     

    所以,567smp_prepare_boot_cpu函数最终调用native_smp_prepare_boot_cpu函数。该函数最终会调用switch_to_new_gdt函数,传给他的参数是当前CPU的编号:

     

    void switch_to_new_gdt(int cpu)

    {

           struct desc_ptr gdt_descr;

     

           gdt_descr.address = (long)get_cpu_gdt_table(cpu);

           gdt_descr.size = GDT_SIZE - 1;

           load_gdt(&gdt_descr);

           /* Reload the per-cpu base */

     

           load_percpu_segment(cpu);

    }

     

    load_gdt很熟悉了,就是调用lgdt汇编指令加载GDT表。那么,自系统启动至此,已经有三次加载GDT了,为啥这里又来加载一次呢?这里面的关键是get_cpu_gdt_table函数,来自arch/x86/include/asm/desc.h

    /*
     * Return a pointer to @cpu's GDT, which lives in that CPU's
     * per-cpu copy of the gdt_page variable.
     */
    static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)

    {

           return per_cpu(gdt_page, cpu).gdt;

    }

     

    好了,SMP的重中之重来了,per_cpu宏,这里重点介绍它:

    /*
     * per_cpu(var, cpu) - lvalue access to `var` inside @cpu's per-cpu
     * area, by shifting &var with that CPU's per-cpu offset.
     * (Line continuation restored to '\' — it had been garbled to '/'.)
     */
    #define per_cpu(var, cpu) \
           (*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu)))

     

    我们看到传入这个宏的两个参数是gdt_page和cpu。gdt_page还记得吧,我们在“初始化GDT”http://blog.csdn.net/yunsongice/archive/2010/12/31/6110703.aspx中讲过,包含32个8字节的段描述符;cpu,是刚刚传进来的当前CPU的id。翻译过来就是:

    *SHIFT_PERCPU_PTR(&(gdt_page), per_cpu_offset(cpu))

     

    /*
     * SHIFT_PERCPU_PTR(__p, __offset) - relocate per-cpu pointer __p by
     * __offset.  __verify_pcpu_ptr() is a compile-time pointer check;
     * RELOC_HIDE does the actual offset arithmetic.
     * (Line continuations restored to '\' — they had been garbled to '/'.)
     */
    #define SHIFT_PERCPU_PTR(__p, __offset)     ({                         \
           __verify_pcpu_ptr((__p));                                 \
           RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
    })

     

    __verify_pcpu_ptrGCC的一个优化过程,我们忽略,所以继续翻译就是:

    RELOC_HIDE((typeof(*(&(gdt_page))) __kernel __force *)(&(gdt_page)), (per_cpu_offset(cpu))

     

    /*
     * RELOC_HIDE(ptr, off) - compute ptr + off while laundering ptr
     * through an empty asm so the compiler cannot apply aliasing or
     * constant-folding assumptions to the resulting address.
     * (Line continuations restored to '\' — they had been garbled to '/'.)
     */
    #define RELOC_HIDE(ptr, off)                                \
      ({ unsigned long __ptr;                                 \
        __asm__ ("" : "=r"(__ptr) : "0"(ptr));          \
        (typeof(ptr)) (__ptr + (off)); })

     

    对于per_cpu(gdt_page, cpu),将等效地扩展为:

    __per_cpu_offset[smp_processor_id()] + per_cpu__gdt_page

    并且是一个lvalue,也就是说可以进行赋值操作。这正好是上述per_cpu__runqueues变量在对应CPU的专有数据区中的新地址。

     

    由于不同的每cpu变量有不同的偏移量,并且不同的CPU其专有数据区首地址不同, 因此,通过per_cpu (var,cpu)便访问到了不同的变量。对这个概念还不是很清楚的同学请查阅一下博客“每CPU变量”http://blog.csdn.net/yunsongice/archive/2010/05/18/5605239.aspx

     

    最新回复(0)