From the preceding discussion we know that ext2_readpage() is the entry point of this layer. It is passed the file's file object and the page that must be read into the page cache. That page is none other than the blank page just allocated by page_cache_alloc_cold(), which now sits in the file's radix tree at index index:
static int ext2_readpage(struct file *file, struct page *page)
{
	return mpage_readpage(page, ext2_get_block);
}
This function simply calls mpage_readpage(), passing it the descriptor of the page to be read into the page cache and the address of ext2_get_block() as arguments.
A wrapper is needed because mpage_readpage() expects two parameters: the descriptor page of the page to be filled, and the address get_block of a function that helps mpage_readpage() find the right blocks. The wrapper is filesystem-dependent and can therefore supply the appropriate function, which translates block numbers relative to the start of the file into block numbers relative to the start of the disk partition.
Naturally, the second parameter depends on the type of filesystem the regular file lives on; in our case it is the address of ext2_get_block(). The get_block function passed in always records the important results in a buffer head (buffer_head), such as the block device (b_bdev field), the position of the requested data on that device (b_blocknr field), and the block status (b_state field).
mpage_readpage() can choose between two strategies when reading a page from disk. If the blocks holding the requested data are contiguous on disk, it issues the read I/O to the generic block layer with a single bio descriptor; if they are not, it reads each block of the page with a separate bio. One important job of the filesystem-dependent get_block function is therefore to determine whether the next block of the file is also the next block on disk.
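To make the get_block contract concrete, here is a purely illustrative sketch. It is not the real ext2_get_block(): my_fs_block_lookup() is a hypothetical mapping helper, and the point is only to show which buffer_head fields a get_block-style callback is expected to fill, typically through the map_bh() helper:

#include <linux/fs.h>
#include <linux/buffer_head.h>

/* Illustrative only -- not ext2_get_block(). my_fs_block_lookup() is a
 * hypothetical helper that maps a file-relative block number to a block
 * number relative to the start of the partition. */
static int demo_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh_result, int create)
{
	sector_t phys = my_fs_block_lookup(inode, iblock);

	if (phys) {
		/* map_bh() fills b_bdev and b_blocknr and sets the mapped flag */
		map_bh(bh_result, inode->i_sb, phys);
		/* b_size may also be enlarged to report several contiguous blocks */
	}
	return 0;	/* 0 on success; an unmapped bh_result means a file hole */
}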
Let's look at the actual code of mpage_readpage, from fs/mpage.c:
427int mpage_readpage(struct page *page, get_block_t get_block)
428{
429 struct bio *bio = NULL;
430 sector_t last_block_in_bio = 0;
431 struct buffer_head map_bh;
432 unsigned long first_logical_block = 0;
433
434 clear_buffer_mapped(&map_bh);
435 bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
436 &map_bh, &first_logical_block, get_block);
437 if (bio)
438 mpage_bio_submit(READ, bio);
439 return 0;
440}
At line 434 the function first calls clear_buffer_mapped() on the local buffer_head variable map_bh; this clears the BH_Mapped bit in its b_state field, so map_bh starts out "not mapped". It then calls do_mpage_readpage(), which builds a bio request describing where on disk the requested data blocks are located, how many of them there are, and where the data should be copied, namely the page in the page cache. Finally, if do_mpage_readpage() returned a bio, mpage_bio_submit() is called to submit the request.
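For reference, mpage_bio_submit() in 2.6-era kernels is essentially the following (paraphrased from fs/mpage.c; mpage_end_io_read() and mpage_end_io_write() are the completion callbacks defined in the same file):

static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	bio->bi_end_io = mpage_end_io_read;	/* completion callback for reads */
	if (rw == WRITE)
		bio->bi_end_io = mpage_end_io_write;
	submit_bio(rw, bio);			/* hand the bio to the generic block layer */
	return NULL;				/* caller must start a fresh bio afterwards */
}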
Here is a quick primer on the generic block layer. Its core data structure is a descriptor called bio, which represents a block-device I/O operation. Each bio contains a disk storage area identifier (the starting sector number and the number of sectors in the area) and one or more segments describing the memory areas involved in the I/O operation. A bio is described by the bio data structure:
struct bio {
	sector_t bi_sector;		//first sector of the block I/O operation
	struct bio *bi_next;		//next bio in the request queue
	struct block_device *bi_bdev;	//pointer to the block device descriptor
	unsigned long bi_flags;		//bio status flags
	unsigned long bi_rw;		//I/O operation flags, i.e. whether this I/O is a read or a write
	unsigned short bi_vcnt;		/* number of segments in the bio's bio_vec array */
	unsigned short bi_idx;		/* current index into the bio's bio_vec array */
	unsigned short bi_phys_segments;//number of physical segments after merging
	unsigned short bi_hw_segments;	//number of hardware segments after merging
	unsigned int bi_size;		/* number of bytes to transfer */
	unsigned int bi_hw_front_size;	//used by the hardware segment merge algorithm
	unsigned int bi_hw_back_size;	//used by the hardware segment merge algorithm
	unsigned int bi_max_vecs;	/* maximum number of segments allowed in the bio's bio_vec array */
	struct bio_vec *bi_io_vec;	/* pointer to the segments in the bio's bio_vec array */
	bio_end_io_t *bi_end_io;	/* method invoked when the bio's I/O operation completes */
	atomic_t bi_cnt;		/* reference counter for the bio */
	void *bi_private;		//pointer used by the generic block layer and the driver's I/O completion methods
	bio_destructor_t *bi_destructor;//destructor method invoked when the bio is freed (usually bio_destructor())
};
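Before moving on, a minimal sketch of how such a bio is typically built and submitted with the 2.6-era API may help. This is not code from fs/mpage.c; bdev, page, disk_block, blkbits and my_end_io are assumed to be supplied by the caller (my_end_io would be a bio_end_io_t completion callback defined elsewhere):

#include <linux/bio.h>
#include <linux/fs.h>

/* Illustrative sketch only: allocate a bio for one segment, describe the
 * disk location and the destination page, and submit it for reading. */
static int demo_submit_read(struct block_device *bdev, struct page *page,
			    sector_t disk_block, unsigned blkbits,
			    bio_end_io_t *my_end_io)
{
	struct bio *bio = bio_alloc(GFP_KERNEL, 1);	/* room for one segment */

	if (!bio)
		return -ENOMEM;
	bio->bi_bdev = bdev;				/* target block device */
	bio->bi_sector = disk_block << (blkbits - 9);	/* start sector, in 512-byte units */
	bio->bi_end_io = my_end_io;			/* invoked when the I/O completes */
	bio_add_page(bio, page, PAGE_SIZE, 0);		/* the page that receives the data */
	submit_bio(READ, bio);				/* hand the bio to the generic block layer */
	return 0;
}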
With the bio data structure in hand, we can now enter the generic block layer. So what kind of bio must be initialized to start a disk access? Let's look at do_mpage_readpage:
175static struct bio *
176do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
177 sector_t *last_block_in_bio, struct buffer_head *map_bh,
178 unsigned long *first_logical_block, get_block_t get_block)
179{
180 struct inode *inode = page->mapping->host;
181 const unsigned blkbits = inode->i_blkbits;
182 const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
183 const unsigned blocksize = 1 << blkbits;
184 sector_t block_in_file;
185 sector_t last_block;
186 sector_t last_block_in_file;
187 sector_t blocks[MAX_BUF_PER_PAGE];
188 unsigned page_block;
189 unsigned first_hole = blocks_per_page;
190 struct block_device *bdev = NULL;
191 int length;
192 int fully_mapped = 1;
193 unsigned nblocks;
194 unsigned relative_block;
195
196 if (page_has_buffers(page))
197 goto confused;
198
199 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
200 last_block = block_in_file + nr_pages * blocks_per_page;
201 last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
202 if (last_block > last_block_in_file)
203 last_block = last_block_in_file;
204 page_block = 0;
205
206 /*
207 * Map blocks using the result from the previous get_blocks call first.
208 */
209 nblocks = map_bh->b_size >> blkbits;
210 if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
211 block_in_file < (*first_logical_block + nblocks)) {
212 unsigned map_offset = block_in_file - *first_logical_block;
213 unsigned last = nblocks - map_offset;
214
215 for (relative_block = 0; ; relative_block++) {
216 if (relative_block == last) {
217 clear_buffer_mapped(map_bh);
218 break;
219 }
220 if (page_block == blocks_per_page)
221 break;
222 blocks[page_block] = map_bh->b_blocknr + map_offset +
223 relative_block;
224 page_block++;
225 block_in_file++;
226 }
227 bdev = map_bh->b_bdev;
228 }
229
230 /*
231 * Then do more get_blocks calls until we are done with this page.
232 */
233 map_bh->b_page = page;
234 while (page_block < blocks_per_page) {
235 map_bh->b_state = 0;
236 map_bh->b_size = 0;
237
238 if (block_in_file < last_block) {
239 map_bh->b_size = (last_block-block_in_file) << blkbits;
240 if (get_block(inode, block_in_file, map_bh, 0))
241 goto confused;
242 *first_logical_block = block_in_file;
243 }
244
245 if (!buffer_mapped(map_bh)) {
246 fully_mapped = 0;
247 if (first_hole == blocks_per_page)
248 first_hole = page_block;
249 page_block++;
250 block_in_file++;
251 clear_buffer_mapped(map_bh);
252 continue;
253 }
254
255 /* some filesystems will copy data into the page during
256 * the get_block call, in which case we don't want to
257 * read it again. map_buffer_to_page copies the data
258 * we just collected from get_block into the page's buffers
259 * so readpage doesn't have to repeat the get_block call
260 */
261 if (buffer_uptodate(map_bh)) {
262 map_buffer_to_page(page, map_bh, page_block);
263 goto confused;
264 }
265
266 if (first_hole != blocks_per_page)
267 goto confused; /* hole -> non-hole */
268
269 /* Contiguous blocks? */
270 if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
271 goto confused;
272 nblocks = map_bh->b_size >> blkbits;
273 for (relative_block = 0; ; relative_block++) {
274 if (relative_block == nblocks) {
275 clear_buffer_mapped(map_bh);
276 break;
277 } else if (page_block == blocks_per_page)
278 break;
279 blocks[page_block] = map_bh->b_blocknr+relative_block;
280 page_block++;
281 block_in_file++;
282 }
283 bdev = map_bh->b_bdev;
284 }
285
286 if (first_hole != blocks_per_page) {
287 char *kaddr = kmap_atomic(page, KM_USER0);
288 memset(kaddr + (first_hole << blkbits), 0,
289 PAGE_CACHE_SIZE - (first_hole << blkbits));
290 flush_dcache_page(page);
291 kunmap_atomic(kaddr, KM_USER0);
292 if (first_hole == 0) {
293 SetPageUptodate(page);
294 unlock_page(page);
295 goto out;
296 }
297 } else if (fully_mapped) {
298 SetPageMappedToDisk(page);
299 }
300
301 /*
302 * This page will go to BIO. Do we need to send this BIO off first?
303 */
304 if (bio && (*last_block_in_bio != blocks[0] - 1))
305 bio = mpage_bio_submit(READ, bio);
306
307alloc_new:
308 if (bio == NULL) {
309 bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
310 min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
311 GFP_KERNEL);
312 if (bio == NULL)
313 goto confused;
314 }
315
316 length = first_hole << blkbits;
317 if (bio_add_page(bio, page, length, 0) < length) {
318 bio = mpage_bio_submit(READ, bio);
319 goto alloc_new;
320 }
321
322 if (buffer_boundary(map_bh) || (first_hole != blocks_per_page))
323 bio = mpage_bio_submit(READ, bio);
324 else
325 *last_block_in_bio = blocks[blocks_per_page - 1];
326out:
327 return bio;
328
329confused:
330 if (bio)
331 bio = mpage_bio_submit(READ, bio);
332 if (!PageUptodate(page))
333 block_read_full_page(page, get_block);
334 else
335 unlock_page(page);
336 goto out;
337}
First, lines 180~182 work out how many blocks one page-cache page can hold: from the inode field page->mapping->host->i_blkbits we get the block size and hence the number of blocks per page. With the 1024-byte block size we chose earlier, a 4 KB page holds four blocks. This is also why, as mentioned before, the block size must be chosen so that a page holds a whole number of blocks.
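With the values assumed here, the computation at lines 181~183 works out as follows (illustrative numbers only, assuming 4 KB pages, i.e. PAGE_CACHE_SIZE = 4096, and the 1 KB block size set earlier, i.e. inode->i_blkbits = 10):

	const unsigned blkbits = 10;			/* log2 of the block size */
	const unsigned blocks_per_page = 4096 >> blkbits;	/* 4096 >> 10 = 4 blocks per page */
	const unsigned blocksize = 1 << blkbits;	/* 1 << 10 = 1024 bytes */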
Next, line 196 checks the PG_private flag of the page descriptor:
#define page_has_buffers(page) PagePrivate(page)
#define page_private(page) ((page)->private)
If the flag is set, the page is a block device page-cache page, that is, it is associated with the list of buffer heads describing the blocks that make up the page. This means the page has already been read from disk in the past and that its blocks are not adjacent on disk, so the function jumps to the label confused and reads the page one block at a time.
Line 199 then computes the file block number of the first block in the page, i.e. the index of that block relative to the beginning of the file, and stores it in the local variable block_in_file. Lines 200~203 compute last_block, the block just past the last one this request needs, and clamp it to last_block_in_file, the number of blocks required to cover the file size returned by i_size_read().
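A small worked example may help; the numbers are hypothetical (4 KB pages so PAGE_CACHE_SHIFT = 12, 1 KB blocks so blkbits = 10, a 10000-byte file, page->index = 2, nr_pages = 1):

	sector_t block_in_file      = (sector_t)2 << (12 - 10);	/* = 8: first file block covered by this page */
	sector_t last_block         = 8 + 1 * 4;			/* = 12: nr_pages * blocks_per_page past it */
	sector_t last_block_in_file = (10000 + 1024 - 1) >> 10;	/* = 10: blocks that actually contain data */
	/* last_block is clamped to 10, so only file blocks 8 and 9 are read;
	 * page blocks 2 and 3 stay unmapped (a hole) and are zero-filled at lines 286~291. */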
Lines 210~228 reuse the result of a previous get_block call: if map_bh is already mapped (tested with the buffer_mapped() macro on its b_state field, one of the BUFFER_FNS macros in include/linux/buffer_head.h) and block_in_file lies between the first_logical_block passed in and the last block covered by that buffer head, the block numbers for the overlapping part are taken directly from the existing mapping, without calling get_block again. Since mpage_readpage() cleared the mapped flag of map_bh and first_logical_block was passed in as 0, buffer_mapped() is false here and this whole piece of code is skipped.