真实的IO调度层处理

    技术2024-07-25  16

    1.5.6 真实的I/O调度层处理

    现在我们块设备也有了,队列也有了,要提交请求也就可以开始提交了。那就让我们回到generic_make_request来研究一下如何提交请求如何处理请求吧。我们看到,函数最后调用q->make_request_fn(q, bio)。对 make_request_fn 函数的调用可以认为是 IO调度层的入口,该函数用于向请求队列中添加请求。该函数是在创建请求队列时指定的,代码如下(blk_init_queue 函数中):

    q->request_fn = rfn;

    blk_queue_make_request(q, __make_request);

     

    前面看到函数 blk_queue_make_request 将函数 __make_request 的地址赋予了请求队列 q 的 make_request_fn 成员:

     

    void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)

    {

           q->nr_requests = BLKDEV_MAX_RQ;

           q->make_request_fn = mfn;

    ……

     

    那么,__make_request函数才是IO调度层的真实入口,来自block/ll_rw_blk.c

     

    2846static int __make_request(request_queue_t *q, struct bio *bio)

    2847{

    2848        struct request *req;

    2849        int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;

    2850        unsigned short prio;

    2851        sector_t sector;

    2852

    2853        sector = bio->bi_sector;

    2854        nr_sectors = bio_sectors(bio);

    2855        cur_nr_sectors = bio_cur_sectors(bio);

    2856        prio = bio_prio(bio);

    2857

    2858        rw = bio_data_dir(bio);

    2859        sync = bio_sync(bio);

    2860

    2861        /*

    2862         * low level driver can indicate that it wants pages above a

    2863         * certain limit bounced to low memory (ie for highmem, or even

    2864         * ISA dma in theory)

    2865         */

    2866        blk_queue_bounce(q, &bio);

    2867

    2868        spin_lock_prefetch(q->queue_lock);

    2869

    2870        barrier = bio_barrier(bio);

    2871        if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {

    2872                err = -EOPNOTSUPP;

    2873                goto end_io;

    2874        }

    2875

    2876        spin_lock_irq(q->queue_lock);

    2877

    2878        if (unlikely(barrier) || elv_queue_empty(q))

    2879                goto get_rq;

    2880

    2881        el_ret = elv_merge(q, &req, bio);

    2882        switch (el_ret) {

    2883                case ELEVATOR_BACK_MERGE:

    2884                        BUG_ON(!rq_mergeable(req));

    2885

    2886                        if (!q->back_merge_fn(q, req, bio))

    2887                                break;

    2888

    2889                        blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);

    2890

    2891                        req->biotail->bi_next = bio;

    2892                        req->biotail = bio;

    2893                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;

    2894                        req->ioprio = ioprio_best(req->ioprio, prio);

    2895                        drive_stat_acct(req, nr_sectors, 0);

    2896                        if (!attempt_back_merge(q, req))

    2897                                elv_merged_request(q, req);

    2898                        goto out;

    2899

    2900                case ELEVATOR_FRONT_MERGE:

    2901                        BUG_ON(!rq_mergeable(req));

    2902

    2903                        if (!q->front_merge_fn(q, req, bio))

    2904                                break;

    2905

    2906                        blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);

    2907

    2908                        bio->bi_next = req->bio;

    2909                        req->bio = bio;

    2910

    2911                        /*

    2912                         * may not be valid. if the low level driver said

    2913                         * it didn't need a bounce buffer then it better

    2914                         * not touch req->buffer either...

    2915                         */

    2916                        req->buffer = bio_data(bio);

    2917                        req->current_nr_sectors = cur_nr_sectors;

    2918                        req->hard_cur_sectors = cur_nr_sectors;

    2919                        req->sector = req->hard_sector = sector;

    2920                        req->nr_sectors = req->hard_nr_sectors += nr_sectors;

    2921                        req->ioprio = ioprio_best(req->ioprio, prio);

    2922                        drive_stat_acct(req, nr_sectors, 0);

    2923                        if (!attempt_front_merge(q, req))

    2924                                elv_merged_request(q, req);

    2925                        goto out;

    2926

    2927                /* ELV_NO_MERGE: elevator says don't/can't merge. */

    2928                default:

    2929                        ;

    2930        }

    2931

    2932get_rq:

    2933        /*

    2934         * Grab a free request. This is might sleep but can not fail.

    2935         * Returns with the queue unlocked.

    2936         */

    2937        req = get_request_wait(q, rw, bio);

    2938

    2939        /*

    2940         * After dropping the lock and possibly sleeping here, our request

    2941         * may now be mergeable after it had proven unmergeable (above).

    2942         * We don't worry about that case for efficiency. It won't happen

    2943         * often, and the elevators are able to handle it.

    2944         */

    2945        init_request_from_bio(req, bio);

    2946

    2947        spin_lock_irq(q->queue_lock);

    2948        if (elv_queue_empty(q))

    2949                blk_plug_device(q);

    2950        add_request(q, req);

    2951out:

    2952        if (sync)

    2953                __generic_unplug_device(q);

    2954

    2955        spin_unlock_irq(q->queue_lock);

    2956        return 0;

    2957

    2958end_io:

    2959        bio_endio(bio, nr_sectors << 9, err);

    2960        return 0;

    2961}

     

    __make_request 函数比较复杂,它接收request_queue类型的描述符q和一个bio结构的描述符bio作为其参数,然后执行如下操作:

     

    2853行,内部变量sector被赋值为bio的bi_sector字段,即将传送的bio的第一个扇区。2854行通过bio_sectors(bio)宏得到需要传送多少个连续的扇区,并赋值给内部变量nr_sectors。bio_sectors宏来自include/linux/bio.h:

    #define bio_sectors(bio) ((bio)->bi_size >> 9)

     

    关于bio->bi_size我们知道,在前面如果一个页中的4个块内容连续,则do_mpage_readpage通过调用bio_add_page把4个块的大小4096赋值给bio的bi_size字段,那么nr_sectors就是4096>>9,等于8,表示这个bio一共有8个扇区待传输。

     

    2866行,如果需要,调用blk_queue_bounce()函数建立一个回弹缓冲区。如果回弹缓冲区被建立,__make_request()函数将对该缓冲区而不是原先的bio结构进行操作。关于回弹缓冲区的相关知识,请查阅相关资料,这里我们不做过多的介绍。

     

    2878行,调用I/O调度程序的elv_queue_empty()函数检查请求队列中是否存在待处理请求——注意,调度队列可能是空的,但是I/O调度程序的其他队列可能包含待处理请求。如果没有待处理请求(或者这是一个屏障请求),那么直接跳转到2932行的get_rq标号处;稍后在2948-2949行,如果此时队列仍为空,会调用blk_plug_device()函数插入(plug)请求队列。

     

    如果插入的请求队列包含待处理请求,则走到2881行调用I/O调度程序的elv_merge()函数检查新的bio结构是否可以并入已存在的请求中。

     

    该函数将返回三个可能值:

    1. ELEVATOR_NO_MERGE:已经存在的请求中不能包含bio结构;这种情况下,跳转到2932行的get_rq标号处。

    2. ELEVATOR_BACK_MERGEbio结构可作为末尾的bio而插入到某个请求req中;这种情形下,函数调用q->back_merge_fn方法检查是否可以扩展该请求。如果不行,则跳转到2932行的get_rq标号处。否则,将bio描述符插入req链表的末尾并更新req的相应字段值。然后,函数试图将该请求与其后面的请求合并(新的bio可能填充在两个请求之间)。

    3. ELEVATOR_FRONT_MERGEbio结构可作为某个请求req的第一个bio被插入;这种情形下,函数调用q->front_merge_fn方法检查是否可以扩展该请求。如果不行,则跳转到2932行的get_rq标号处。否则,将bio描述符插入req链表的首部并更新req的相应字段值。然后,试图将该请求与其前面的请求合并。

     

    不管是ELEVATOR_BACK_MERGE还是ELEVATOR_FRONT_MERGE,说明bio已经被并入存在的请求中,跳转到2951out标号处终止函数。

     

    下面来看2932行的get_rq标号处,bio必须被插入到一个新的请求中。那么通过get_request_wait给我们这个分区的请求队列q分配一个新的请求描述符request。如果暂时没有空闲的请求描述符,get_request_wait函数会挂起当前进程,直到有请求描述符被释放;但如果设置了bio->bi_rw中的BIO_RW_AHEAD标志(该标志表明这个I/O操作是一次预读),则不会睡眠等待,而是在无法立即获得空闲请求描述符时直接调用bio_endio()并终止:此时将不会执行数据传送。

     

    然后调用2945行的init_request_from_bio(req, bio)初始化请求描述符中的字段。主要有:

    a) 根据bio描述符的内容初始化各个字段,包括扇区数、当前bio以及当前段。

    b) 设置flags字段中的REQ_CMD标志(说明这次request是一个标准的读或写操作)。

    c) 如果第一个bio段的页框存放在低端内存,则将buffer字段设置为缓冲区的线性地址。

    d) rc_disk字段设置为bio->bi_bdev->bd_disk的地址。

    e) bio插入请求链表。

    f) start_time字段设置为jiffies的值。

     

    回到__make_request2948-2949行,再次调用elv_queue_empty检查一下请求队列中是否存在待处理请求。如果没有待处理请求,那么调用blk_plug_device()函数插入请求队列。不管怎样,都会执行2950行的add_request函数:

     

    static inline void add_request(request_queue_t * q, struct request * req)

    {

           drive_stat_acct(req, req->nr_sectors, 1);

     

           if (q->activity_fn)

                  q->activity_fn(q->activity_data, rq_data_dir(req));

     

           /*

            * elevator indicated where it wants this request to be

            * inserted at elevator_merge time

            */

           __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);

    }

     

    add_request函数本质上调用__elv_add_request函数通过电梯算法把这个新的request插入到对应request_queue合适的位置。在介绍__elv_add_request函数之前,我们先介绍几个宏,来自include/linux/elevator.h:

     

        155 /*

        156  * Insertion selection

        157  */

        158 #define ELEVATOR_INSERT_FRONT   1

        159 #define ELEVATOR_INSERT_BACK    2

        160 #define ELEVATOR_INSERT_SORT    3

        161 #define ELEVATOR_INSERT_REQUEUE 4

     

    很明显,在add_request函数中传递进来的是ELEVATOR_INSERT_SORT,表示由电梯算法对请求进行排序插入(而不是简单地插到队首或队尾)。那么带着这个where我们进入下一个函数,即__elv_add_request。来自block/elevator.c:

     

        646 void __elv_add_request(request_queue_t *q, struct request *rq, int where,

        647                        int plug)

        648 {

        649         if (q->ordcolor)

        650                 rq->cmd_flags |= REQ_ORDERED_COLOR;

        651

        652         if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {

        653                 /*

        654                  * toggle ordered color

        655                  */

        656                 if (blk_barrier_rq(rq))

        657                         q->ordcolor ^= 1;

        658

        659                 /*

        660                  * barriers implicitly indicate back insertion

        661                  */

        662                 if (where == ELEVATOR_INSERT_SORT)

        663                         where = ELEVATOR_INSERT_BACK;

        664

        665                 /*

        666                  * this request is scheduling boundary, update

        667                  * end_sector

        668                  */

        669                 if (blk_fs_request(rq)) {

        670                         q->end_sector = rq_end_sector(rq);

        671                         q->boundary_rq = rq;

        672                 }

        673   } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)

        674                 where = ELEVATOR_INSERT_BACK;

        675

        676         if (plug)

        677                 blk_plug_device(q);

        678

        679         elv_insert(q, rq, where);

        680 }

     

    传入的参数plug等于0,所以blk_plug_device()不会被执行。普通的读写请求没有设置REQ_SOFTBARRIER和REQ_HARDBARRIER标识,所以652行开始的屏障处理与我们无关;而通过get_request分配的请求设置了REQ_ELVPRIV标识,因此673行的判断也不成立,where保持ELEVATOR_INSERT_SORT不变,直接走到最后一行这个elv_insert()。

     

        548 void elv_insert(request_queue_t *q, struct request *rq, int where)

        549 {

        550         struct list_head *pos;

        551         unsigned ordseq;

        552         int unplug_it = 1;

        553

        554         blk_add_trace_rq(q, rq, BLK_TA_INSERT);

        555

        556         rq->q = q;

        557

        558         switch (where) {

        559         case ELEVATOR_INSERT_FRONT:

        560                 rq->cmd_flags |= REQ_SOFTBARRIER;

        561

        562                 list_add(&rq->queuelist, &q->queue_head);

        563                 break;

        564

        565         case ELEVATOR_INSERT_BACK:

        566                 rq->cmd_flags |= REQ_SOFTBARRIER;

        567                 elv_drain_elevator(q);

        568                 list_add_tail(&rq->queuelist, &q->queue_head);

        569                 /*

        570                  * We kick the queue here for the following reasons.

        571                  * - The elevator might have returned NULL previously

        572                  *   to delay requests and returned them now.  As the

        573                  *   queue wasn't empty before this request, ll_rw_blk

        574                  *   won't run the queue on return, resulting in hang.

        575                  * - Usually, back inserted requests won't be merged

        576                  *   with anything.  There's no point in delaying queue

        577                  *   processing.

        578                  */

        579                 blk_remove_plug(q);

        580                 q->request_fn(q);

        581                 break;

        582

        583         case ELEVATOR_INSERT_SORT:

        584                 BUG_ON(!blk_fs_request(rq));

        585                 rq->cmd_flags |= REQ_SORTED;

        586                 q->nr_sorted++;

        587                 if (rq_mergeable(rq)) {

        588                         elv_rqhash_add(q, rq);

        589                         if (!q->last_merge)

        590                                 q->last_merge = rq;

        591                 }

        592

        593                 /*

        594                  * Some ioscheds (cfq) run q->request_fn directly, so

        595                  * rq cannot be accessed after calling

        596                  * elevator_add_req_fn.

        597                  */

        598                 q->elevator->ops->elevator_add_req_fn(q, rq);

        599                 break;

        600

        601         case ELEVATOR_INSERT_REQUEUE:

        602                 /*

        603                  * If ordered flush isn't in progress, we do front

        604                  * insertion; otherwise, requests should be requeued

        605                  * in ordseq order.

        606                  */

        607                 rq->cmd_flags |= REQ_SOFTBARRIER;

        608

        609                 /*

        610                  * Most requeues happen because of a busy condition,

        611                  * don't force unplug of the queue for that case.

        612                  */

        613                 unplug_it = 0;

        614

        615                 if (q->ordseq == 0) {

        616                         list_add(&rq->queuelist, &q->queue_head);

        617                         break;

        618                 }

        619

        620                 ordseq = blk_ordered_req_seq(rq);

        621

        622                 list_for_each(pos, &q->queue_head) {

        623                         struct request *pos_rq = list_entry_rq(pos);

        624                         if (ordseq <= blk_ordered_req_seq(pos_rq))

        625                                 break;

        626                 }

        627

        628                 list_add_tail(&rq->queuelist, pos);

        629                 break;

        630

        631         default:

        632         printk(KERN_ERR "%s: bad insertion point %d\n",

        633                        __FUNCTION__, where);

        634                 BUG();

        635         }

        636

        637         if (unplug_it && blk_queue_plugged(q)) {

        638                 int nrq = q->rq.count[READ] + q->rq.count[WRITE]

        639                         - q->in_flight;

        640

        641                 if (nrq >= q->unplug_thresh)

        642                         __generic_unplug_device(q);

        643         }

        644 }

     

    由于where是ELEVATOR_INSERT_SORT,所以我们执行的是583行开始的分支:给请求打上REQ_SORTED标志,递增q->nr_sorted,如果请求可合并(rq_mergeable)还会通过elv_rqhash_add把它加入合并散列表;最后在598行调用具体I/O调度器(电梯算法)的elevator_add_req_fn方法,由它决定这个request在队列中的位置。struct request有一个成员struct list_head queuelist,而struct request_queue有一个成员struct list_head queue_head,请求最终就是通过前者挂入后者所代表的队列中。然后咱们就返回了。

     

    以上所有操作全部完成后,在终止之前,2952行检查是否设置了bio->bi_rw中的BIO_RW_SYNC标志。如果是,则对请求队列调用__generic_unplug_device()函数以拔掉(unplug)设备,它会直接调用q->request_fn(q)开始处理请求,这个函数是什么,马上会看到。

     

    如果在调用__make_request()函数之前请求队列不是空的,那么说明该请求队列要么已经被拔掉过,要么很快将被拔掉——因为每个拥有待处理请求的插入请求队列q都有一个正在运行的动态定时器q->unplug_timer。另一方面,如果请求队列是空的,则__make_request()函数插入请求队列。或迟(最坏的情况是当拔出的定时器到期了)或最早(从__make_request()中退出时,如果设置了bioBIO_RW_SYNC标志),该请求队列都会被拔掉。任何情形下,块设备驱动程序的策略例程最后都将处理调度队列中的请求。

     

    generic_make_request执行完scsi磁盘设备对应请求队列的q->make_request_fn方法,也就是刚才分析的__make_request以后,块设备的调度层就结束了。至于包含该biorequest放入到请求队列中后,何时被处理就由 IO 调度器的调度算法决定了。一旦该请求能够被处理,便调用请求队列中request_fn 字段所指向的函数处理。这个成员的初始化也是在创建请求队列时设置的:

     

       1590 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)

       1591 {

       1592         struct request_queue *q;

       1593

       1594         q = __scsi_alloc_queue(sdev->host, scsi_request_fn);

       1595         if (!q)

       1596                 return NULL;

       1597

       1598         blk_queue_prep_rq(q, scsi_prep_fn);

       1599         blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);

       1600         blk_queue_softirq_done(q, scsi_softirq_done);

       1601         return q;

       1602 }

     

    我们看到,给scsi设备创建request_queue的时候,是把scsi_request_fn作为它的request_fn字段所指向的函数地址,所以这个scsi_request_fn就是scsi底层驱动的入口。

    最新回复(0)