scsi块设备驱动层处理

    技术2024-07-15  63

    1.6.3 scsi块设备驱动层处理

    好了,了解完必要的scsi设备驱动知识以后,我们就可以安心分析scsi_request_fn函数了。大家回忆一下,这个函数指针通过几次传递并最终在blk_init_queue_node()中被赋予了q->request_fn。所以这一层的重点就是这个scsi_request_fn函数。

     

    在看scsi_request_fn之前,注意回忆一下scsi_alloc_queue函数的1598行至1600行还赋了三个函数指针:

     

       1590 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)

       1591 {

       1592         struct request_queue *q;

       1593

       1594         q = __scsi_alloc_queue(sdev->host, scsi_request_fn);

       1595         if (!q)

       1596                 return NULL;

       1597

       1598         blk_queue_prep_rq(q, scsi_prep_fn);

       1599         blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);

       1600         blk_queue_softirq_done(q, scsi_softirq_done);

       1601         return q;

       1602 }

        

        143 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)

        144 {

        145         q->prep_rq_fn = pfn;

        146 }

     

        313 void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)

        314 {

        315         q->issue_flush_fn = iff;

        316 }

        173 void blk_queue_softirq_done(request_queue_t *q, softirq_done_fn *fn)

        174 {

        175         q->softirq_done_fn = fn;

        176 }

     

    分别是把scsi_prep_fn赋给了q->prep_rq_fn,把scsi_issue_flush_fn赋给了q->issue_flush_fn,把scsi_softirq_done赋给了q->softirq_done_fn。尤其是scsi_prep_fn我们马上就会用到。

     

    好,让我们继续前面的话题,重点关注scsi_request_fn()

     

       1422 static void scsi_request_fn(struct request_queue *q)

       1423 {

       1424         struct scsi_device *sdev = q->queuedata;

       1425         struct Scsi_Host *shost;

       1426         struct scsi_cmnd *cmd;

       1427         struct request *req;

       1428

       1429         if (!sdev) {

       1430                 printk("scsi: killing requests for dead queue\n");

       1431                 while ((req = elv_next_request(q)) != NULL)

       1432                         scsi_kill_request(req, q);

       1433                 return;

       1434         }

       1435

       1436         if(!get_device(&sdev->sdev_gendev))

       1437                 /* We must be tearing the block queue down already */

       1438                 return;

       1439

       1440         /*

       1441          * To start with, we keep looping until the queue is empty, or until

       1442          * the host is no longer able to accept any more requests.

       1443          */

       1444         shost = sdev->host;

       1445         while (!blk_queue_plugged(q)) {

       1446                 int rtn;

       1447                 /*

       1448                  * get next queueable request.  We do this early to make sure

       1449                  * that the request is fully prepared even if we cannot

       1450                  * accept it.

       1451                  */

       1452                 req = elv_next_request(q);

       1453                 if (!req || !scsi_dev_queue_ready(q, sdev))

       1454                         break;

       1455

       1456                 if (unlikely(!scsi_device_online(sdev))) {

       1457                         sdev_printk(KERN_ERR, sdev,

       1458                                     "rejecting I/O to offline device\n");

       1459                         scsi_kill_request(req, q);

       1460                         continue;

       1461                 }

       1462

       1463

       1464                 /*

       1465                  * Remove the request from the request list.

       1466                  */

       1467                 if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))

       1468                         blkdev_dequeue_request(req);

       1469                 sdev->device_busy++; /* 说明命令正在执行中 */

       1470

       1471                 spin_unlock(q->queue_lock);

       1472                 cmd = req->special;

       1473                 if (unlikely(cmd == NULL)) {

       1474                         printk(KERN_CRIT "impossible request in %s.\n"

       1475                                          "please mail a stack trace to "

       1476                                          "linux-scsi@vger.kernel.org\n",

       1477                                          __FUNCTION__);

       1478                         blk_dump_rq_flags(req, "foo");

       1479                         BUG();

       1480                 }

       1481                 spin_lock(shost->host_lock);

       1482

       1483                 if (!scsi_host_queue_ready(q, shost, sdev))

       1484                         goto not_ready;

       1485                 if (sdev->single_lun) {

       1486                         if (scsi_target(sdev)->starget_sdev_user &&

       1487                             scsi_target(sdev)->starget_sdev_user != sdev)

       1488                                 goto not_ready;

       1489                         scsi_target(sdev)->starget_sdev_user = sdev;

       1490                 }

       1491                 shost->host_busy++;

       1492

       1493                 /*

       1494                  * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will

       1495                  *              take the lock again.

       1496                  */

       1497                 spin_unlock_irq(shost->host_lock);

       1498

       1499                 /*

       1500                  * Finally, initialize any error handling parameters, and set up

       1501                  * the timers for timeouts.

       1502                  */

       1503                 scsi_init_cmd_errh(cmd);

       1504

       1505                 /*

       1506                  * Dispatch the command to the low-level driver.

       1507                  */

       1508                 rtn = scsi_dispatch_cmd(cmd);

       1509                 spin_lock_irq(q->queue_lock);

       1510                 if(rtn) {

       1511                         /* we're refusing the command; because of

       1512                          * the way locks get dropped, we need to

       1513                          * check here if plugging is required */

       1514                         if(sdev->device_busy == 0)

       1515                                 blk_plug_device(q);

       1516

       1517                         break;

       1518                 }

       1519         }

       1520

       1521         goto out;

       1522

       1523  not_ready:

       1524         spin_unlock_irq(shost->host_lock);

       1525

       1526         /*

       1527          * lock q, handle tag, requeue req, and decrement device_busy. We

       1528          * must return with queue_lock held.

       1529          *

       1530          * Decrementing device_busy without checking it is OK, as all such

       1531          * cases (host limits or settings) should run the queue at some

       1532          * later time.

       1533          */

       1534         spin_lock_irq(q->queue_lock);

       1535         blk_requeue_request(q, req);

       1536         sdev->device_busy--;

       1537         if(sdev->device_busy == 0)

       1538                 blk_plug_device(q);

       1539  out:

       1540         /* must be careful here...if we trigger the ->remove() function

       1541          * we cannot be holding the q lock */

       1542         spin_unlock_irq(q->queue_lock);

       1543         put_device(&sdev->sdev_gendev);

       1544         spin_lock_irq(q->queue_lock);

       1545 }

     

    scsi_request_fn函数为scsi设备请求队列处理函数,前面看到该函数被注册到了request_queue->request_fn上。块设备请求的bio最终会merge到request queue中,然后通过unplug_fn函数调用request_queue->request_fn,实现scsi_request_fn函数的调用。

     

    scsi_request_fn函数实现了请求队列的处理,首先1452-1468行按照电梯算法从请求队列中摘取一个request,所以我们首先关注1452行的elv_next_request(),来自block/elevator.c:

     

        712 struct request *elv_next_request(request_queue_t *q)

        713 {

        714         struct request *rq;

        715         int ret;

        716

        717         while ((rq = __elv_next_request(q)) != NULL) {

        718                 if (!(rq->cmd_flags & REQ_STARTED)) {

        719                         /*

        720                          * This is the first time the device driver

        721                          * sees this request (possibly after

        722                          * requeueing).  Notify IO scheduler.

        723                          */

        724                         if (blk_sorted_rq(rq))

        725                                 elv_activate_rq(q, rq);

        726

        727                         /*

        728                          * just mark as started even if we don't start

        729                          * it, a request that has been delayed should

        730                          * not be passed by new incoming requests

        731                          */

        732                         rq->cmd_flags |= REQ_STARTED;

        733                         blk_add_trace_rq(q, rq, BLK_TA_ISSUE);

        734                 }

        735

        736                 if (!q->boundary_rq || q->boundary_rq == rq) {

        737                         q->end_sector = rq_end_sector(rq);

        738                         q->boundary_rq = NULL;

        739                 }

        740

        741                 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn)

        742                         break;

        743

        744                 ret = q->prep_rq_fn(q, rq);

        745                 if (ret == BLKPREP_OK) {

        746                         break;

        747                 } else if (ret == BLKPREP_DEFER) {

        748                         /*

        749                          * the request may have been (partially) prepped.

        750                          * we need to keep this request in the front to

        751                          * avoid resource deadlock.  REQ_STARTED will

        752                          * prevent other fs requests from passing this one.

        753                          */

        754                         rq = NULL;

        755                         break;

        756                 } else if (ret == BLKPREP_KILL) {

        757                         int nr_bytes = rq->hard_nr_sectors << 9;

        758

        759                         if (!nr_bytes)

        760                                 nr_bytes = rq->data_len;

        761

        762                         blkdev_dequeue_request(rq);

        763                         rq->cmd_flags |= REQ_QUIET;

        764                         end_that_request_chunk(rq, 0, nr_bytes);

        765                         end_that_request_last(rq, 0);

        766                 } else {

        767                         printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,

        768                                                                 ret);

        769                         break;

        770                 }

        771         }

        772

        773         return rq;

        774 }

     

    它调用的__elv_next_request()仍然来自block/elevator.c:

     

        696 static inline struct request *__elv_next_request(request_queue_t *q)

        697 {

        698         struct request *rq;

        699

        700         while (1) {

        701                 while (!list_empty(&q->queue_head)) {

        702                         rq = list_entry_rq(q->queue_head.next);

        703                         if (blk_do_ordered(q, &rq))

        704                                 return rq;

        705                 }

        706

        707                 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))

        708                         return NULL;

        709         }

        710 }

     

    由于我们在I/O调度层中插入了一个request,所以这里q->queue_head不可能为空。所以702行从中取出一个request来。然后是blk_do_ordered(),来自block/ll_rw_blk.c

     

        478 int blk_do_ordered(request_queue_t *q, struct request **rqp)

        479 {

        480         struct request *rq = *rqp;

        481         int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);

        482

        483         if (!q->ordseq) {

        484                 if (!is_barrier)

        485                         return 1;

        486

        487                 if (q->next_ordered != QUEUE_ORDERED_NONE) {

        488                         *rqp = start_ordered(q, rq);

        489                         return 1;

        490                 } else {

        491                         /*

        492                          * This can happen when the queue switches to

        493                          * ORDERED_NONE while this request is on it.

        494                          */

        495                         blkdev_dequeue_request(rq);

        496                         end_that_request_first(rq, -EOPNOTSUPP,

        497                                                rq->hard_nr_sectors);

        498                         end_that_request_last(rq, -EOPNOTSUPP);

        499                         *rqp = NULL;

        500                         return 0;

        501                 }

        502         }

        503

        504         /*

        505          * Ordered sequence in progress

        506          */

        507

        508         /* Special requests are not subject to ordering rules. */

        509         if (!blk_fs_request(rq) &&

        510             rq != &q->pre_flush_rq && rq != &q->post_flush_rq)

        511                 return 1;

        512

        513         if (q->ordered & QUEUE_ORDERED_TAG) {

        514                 /* Ordered by tag.  Blocking the next barrier is enough. */

        515                 if (is_barrier && rq != &q->bar_rq)

        516                         *rqp = NULL;

        517         } else {

        518                 /* Ordered by draining.  Wait for turn. */

        519                 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));

        520                 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))

        521                         *rqp = NULL;

        522         }

        523

        524         return 1;

        525 }

     

    首先看一下blk_fs_request

    528 #define blk_fs_request(rq)      ((rq)->cmd_type == REQ_TYPE_FS)

     

    很显然,咱们从来没有设置这个标识,所以不去管它。

     

    所以在咱们这个上下文里,is_barrier一定是0。所以,blk_do_ordered二话不说,直接返回1。那么回到__elv_next_request以后,703行这个if条件是满足的,所以也就是返回rq,下面的那个elevator_dispatch_fn根本不会执行的。另一方面,我们从__elv_next_request返回,回到elv_next_request()的时候,只要request queue不是空的,那么返回值就是队列中最前边的那个request

     

    继续在elv_next_request中往下走,request得到了。其实整个故事中为cmd_flags设置REQ_STARTED的也就是这里的732行。所以在我们执行732行之前,这个flag是没有设置的。因此,718行的if条件是满足的。

     

    blk_sorted_rq又是一个宏,来自include/linux/blkdev.h

    543 #define blk_sorted_rq(rq)       ((rq)->cmd_flags & REQ_SORTED)

     

    很显然,咱们也从来没有设置过这个flag,所以这里不关我们的事。

     

    当然了,对于noop,即便执行下一个函数也没有意义,因为这个elv_activate_rq()来自block/elevator.c

     

        272 static void elv_activate_rq(request_queue_t *q, struct request *rq)

        273 {

        274         elevator_t *e = q->elevator;

        275

        276         if (e->ops->elevator_activate_req_fn)

        277                 e->ops->elevator_activate_req_fn(q, rq);

        278 }

     

    我们假设使用最简单的noop电梯算法,即根本就没有这个指针,所以不去管他。

     

    这时候,我们设置REQ_STARTED这个flag,最开始我们在elevator_init()中,有这么一句:

    230         q->boundary_rq = NULL;

     

    于是rq_end_sector会被执行,这其实也只是一个很简单的宏:

    172 #define rq_end_sector(rq)       ((rq)->sector + (rq)->nr_sectors)

     

    同时,boundary_rq还是被置为NULL

     

    回到elv_next_request中,接下来744行,由于我们把prep_rq_fn赋上了scsi_prep_fn,所以我们要看一下这个scsi_prep_fn(),这个来自drivers/scsi/scsi_lib.c的函数:

     

    1093static int scsi_prep_fn(struct request_queue *q, struct request *req)

    1094{

    1095        struct scsi_device *sdev = q->queuedata;

    1096        struct scsi_cmnd *cmd;

    1097        int specials_only = 0;

    1098

    1099        /*

    1100         * Just check to see if the device is online.  If it isn't, we

    1101         * refuse to process any commands.  The device must be brought

    1102         * online before trying any recovery commands

    1103         */

    1104        if (unlikely(!scsi_device_online(sdev))) {

    1105                sdev_printk(KERN_ERR, sdev,

    1106                            "rejecting I/O to offline device\n");

    1107                goto kill;

    1108        }

    1109        if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {

    1110                /* OK, we're not in a running state don't prep

    1111                 * user commands */

    1112                if (sdev->sdev_state == SDEV_DEL) {

    1113                        /* Device is fully deleted, no commands

    1114                         * at all allowed down */

    1115                        sdev_printk(KERN_ERR, sdev,

    1116                                    "rejecting I/O to dead device\n");

    1117                        goto kill;

    1118                }

    1119                /* OK, we only allow special commands (i.e. not

    1120                 * user initiated ones */

    1121                specials_only = sdev->sdev_state;

    1122        }

    1123

    1124        /*

    1125         * Find the actual device driver associated with this command.

    1126         * The SPECIAL requests are things like character device or

    1127         * ioctls, which did not originate from ll_rw_blk.  Note that

    1128         * the special field is also used to indicate the cmd for

    1129         * the remainder of a partially fulfilled request that can

    1130         * come up when there is a medium error.  We have to treat

    1131         * these two cases differently.  We differentiate by looking

    1132         * at request->cmd, as this tells us the real story.

    1133         */

    1134        if (req->flags & REQ_SPECIAL && req->special) {

    1135                cmd = req->special;

    1136        } else if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) {

    1137

    1138                if(unlikely(specials_only) && !(req->flags & REQ_SPECIAL)) {

    1139                        if(specials_only == SDEV_QUIESCE ||

    1140                                        specials_only == SDEV_BLOCK)

    1141                                goto defer;

    1142                       

    1143                        sdev_printk(KERN_ERR, sdev,

    1144                                    "rejecting I/O to device being removed\n");

    1145                        goto kill;

    1146                }

    1147                       

    1148                       

    1149                /*

    1150                 * Now try and find a command block that we can use.

    1151                 */

    1152                if (!req->special) {

    1153                        cmd = scsi_get_command(sdev, GFP_ATOMIC);

    1154                        if (unlikely(!cmd))

    1155                                goto defer;

    1156                } else

    1157                        cmd = req->special;

    1158               

    1159                /* pull a tag out of the request if we have one */

    1160                cmd->tag = req->tag;

    1161        } else {

    1162                blk_dump_rq_flags(req, "SCSI bad req");

    1163                goto kill;

    1164        }

    1165       

    1166        /* note the overloading of req->special.  When the tag

    1167         * is active it always means cmd.  If the tag goes

    1168         * back for re-queueing, it may be reset */

    1169        req->special = cmd;

    1170        cmd->request = req;

    1171       

    1172        /*

    1173         * FIXME: drop the lock here because the functions below

    1174         * expect to be called without the queue lock held.  Also,

    1175         * previously, we dequeued the request before dropping the

    1176         * lock.  We hope REQ_STARTED prevents anything untoward from

    1177         * happening now.

    1178         */

    1179        if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) {

    1180                int ret;

    1181

    1182                /*

    1183                 * This will do a couple of things:

    1184                 *  1) Fill in the actual SCSI command.

    1185                 *  2) Fill in any other upper-level specific fields

    1186                 * (timeout).

    1187                 *

    1188                 * If this returns 0, it means that the request failed

    1189                 * (reading past end of disk, reading offline device,

    1190                 * etc).   This won't actually talk to the device, but

    1191                 * some kinds of consistency checking may cause the    

    1192                 * request to be rejected immediately.

    1193                 */

    1194

    1195                /*

    1196                 * This sets up the scatter-gather table (allocating if

    1197                 * required).

    1198                 */

    1199                ret = scsi_init_io(cmd);

    1200                switch(ret) {

    1201                        /* For BLKPREP_KILL/DEFER the cmd was released */

    1202                case BLKPREP_KILL:

    1203                        goto kill;

    1204                case BLKPREP_DEFER:

    1205                        goto defer;

    1206                }

    1207               

    1208                /*

    1209                 * Initialize the actual SCSI command for this request.

    1210                 */

    1211                if (req->flags & REQ_BLOCK_PC) {

    1212                        scsi_setup_blk_pc_cmnd(cmd);

    1213                } else if (req->rq_disk) {

    1214                        struct scsi_driver *drv;

    1215

    1216                        drv = *(struct scsi_driver **)req->rq_disk->private_data;

    1217                        if (unlikely(!drv->init_command(cmd))) {

    1218                                scsi_release_buffers(cmd);

    1219                                scsi_put_command(cmd);

    1220                                goto kill;

    1221                        }

    1222                }

    1223        }

    1224

    1225        /*

    1226         * The request is now prepped, no need to come back here

    1227         */

    1228        req->flags |= REQ_DONTPREP;

    1229        return BLKPREP_OK;

    1230

    1231 defer:

    1232        /* If we defer, the elv_next_request() returns NULL, but the

    1233         * queue must be restarted, so we plug here if no returning

    1234         * command will automatically do that. */

    1235        if (sdev->device_busy == 0)

    1236                blk_plug_device(q);

    1237        return BLKPREP_DEFER;

    1238 kill:

    1239        req->errors = DID_NO_CONNECT << 16;

    1240        return BLKPREP_KILL;

    1241}

     

    大家还记得我们前面使用__make_request函数创建一个request的时候,曾经通过init_request_from_bio(req, bio)初始化请求描述符中的字段。其中设置了request的flags字段中的REQ_CMD标识,说明这次request是一个标准的读或写操作。注意,前面我们并没有设置REQ_BLOCK_PC标识。

     

    所以scsi_prep_fn函数首先会进入1136那个条件分支。1138-1146的代码是对该块设备状态的一个检查,一般不会出什么问题。随后1153行调用scsi_get_command函数给我们这个request对应的scsi_device分配一个scsi_cmnd结构,其地址赋给函数内部变量cmd指针:

     

    struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, gfp_t gfp_mask)

    {

           struct scsi_cmnd *cmd;

     

           /* Bail if we can't get a reference to the device */

           if (!get_device(&dev->sdev_gendev))

                  return NULL;

     

           cmd = __scsi_get_command(dev->host, gfp_mask);

     

           if (likely(cmd != NULL)) {

                  unsigned long flags;

     

                  memset(cmd, 0, sizeof(*cmd));

                  cmd->device = dev;

                  init_timer(&cmd->eh_timeout);

                  INIT_LIST_HEAD(&cmd->list);

                  spin_lock_irqsave(&dev->list_lock, flags);

                  list_add_tail(&cmd->list, &dev->cmd_list);

                  spin_unlock_irqrestore(&dev->list_lock, flags);

                  cmd->jiffies_at_alloc = jiffies;

           } else

                  put_device(&dev->sdev_gendev);

     

           return cmd;

    }

     

    static struct scsi_cmnd *__scsi_get_command(struct Scsi_Host *shost,

                                           gfp_t gfp_mask)

    {

           struct scsi_cmnd *cmd;

     

           cmd = kmem_cache_alloc(shost->cmd_pool->slab,

                         gfp_mask | shost->cmd_pool->gfp_mask);

     

           if (unlikely(!cmd)) {

                  unsigned long flags;

     

                  spin_lock_irqsave(&shost->free_list_lock, flags);

                  if (likely(!list_empty(&shost->free_list))) {

                         cmd = list_entry(shost->free_list.next,

                                        struct scsi_cmnd, list);

                         list_del_init(&cmd->list);

                  }

                  spin_unlock_irqrestore(&shost->free_list_lock, flags);

           }

     

           return cmd;

    }

     

    看不懂这个分配函数的回去好好看一下“scsi设备驱动体系架构”最后那个图,我就不多费口舌了。回到scsi_prep_fn中,1160行把request的tag赋给这个全新的scsi_cmnd结构;然后1169和1170行把这个request和scsi_cmnd联系起来。随后又进入1179行条件判断,1199行,调用scsi_init_io函数初始化这个scsi_cmnd结构:

     

    static int scsi_init_io(struct scsi_cmnd *cmd)

    {

           struct request     *req = cmd->request;

           struct scatterlist *sgpnt;

           int             count;

     

           /*

            * if this is a rq->data based REQ_BLOCK_PC, setup for a non-sg xfer

            */

           if ((req->flags & REQ_BLOCK_PC) && !req->bio) {

                  cmd->request_bufflen = req->data_len;

                  cmd->request_buffer = req->data;

                  req->buffer = req->data;

                  cmd->use_sg = 0;

                  return 0;

           }

     

           /*

            * we used to not use scatter-gather for single segment request,

            * but now we do (it makes highmem I/O easier to support without

            * kmapping pages)

            */

           cmd->use_sg = req->nr_phys_segments;

     

           /*

            * if sg table allocation fails, requeue request later.

            */

           sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);

           if (unlikely(!sgpnt)) {

                  scsi_unprep_request(req);

                  return BLKPREP_DEFER;

           }

     

           cmd->request_buffer = (char *) sgpnt;

           cmd->request_bufflen = req->nr_sectors << 9;

           if (blk_pc_request(req))

                  cmd->request_bufflen = req->data_len;

           req->buffer = NULL;

     

           /*

            * Next, walk the list, and fill in the addresses and sizes of

            * each segment.

            */

           count = blk_rq_map_sg(req->q, req, cmd->request_buffer);

     

           /*

            * mapped well, send it off

            */

           if (likely(count <= cmd->use_sg)) {

                  cmd->use_sg = count;

                  return 0;

           }

     

           printk(KERN_ERR "Incorrect number of segments after building list\n");

           printk(KERN_ERR "counted %d, received %d\n", count, cmd->use_sg);

           printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u\n", req->nr_sectors,

                         req->current_nr_sectors);

     

           /* release the command and kill it */

           scsi_release_buffers(cmd);

           scsi_put_command(cmd);

           return BLKPREP_KILL;

    }

     

    一般情况下,scsi_init_io返回0,否则致命错误,导致scsi_prep_fn退出。继续走,由于我们并没有设置REQ_BLOCK_PC标识,而且req的rq_disk是存在的,gendisk,忘了?那你完了。所以scsi_prep_fn函数来到1217行,执行本函数中最重要的过程,drv->init_command。这个drv是啥?来自gendisk的private_data字段。还记得sd_probe吗?我们在其中把它赋值给了对应scsi_disk结构的driver字段,就是前面那个sd_template常量,别告诉我你又忘了。如果真忘了,那就好好从头开始,从scsi磁盘驱动的初始化函数init_sd开始。

     

    我们知道sd_template常量的init_command指针指向sd_init_command函数地址,所以下面就来看看sd_init_command这个函数,十分重要,来自drivers/scsi/sd.c

     

    366static int sd_init_command(struct scsi_cmnd * SCpnt)

     367{

     368        struct scsi_device *sdp = SCpnt->device;

     369        struct request *rq = SCpnt->request;

     370        struct gendisk *disk = rq->rq_disk;

     371        sector_t block = rq->sector;

     372        unsigned int this_count = SCpnt->request_bufflen >> 9;

     373        unsigned int timeout = sdp->timeout;

     374

     375        SCSI_LOG_HLQUEUE(1, printk("sd_init_command: disk=%s, block=%llu, "

     376                            "count=%d\n", disk->disk_name,

     377                         (unsigned long long)block, this_count));

     378

     379        if (!sdp || !scsi_device_online(sdp) ||

     380            block + rq->nr_sectors > get_capacity(disk)) {

     381                SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n",

     382                                 rq->nr_sectors));

     383                SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt));

     384                return 0;

     385        }

     386

     387        if (sdp->changed) {

     388                /*

     389                 * quietly refuse to do anything to a changed disc until

     390                 * the changed bit has been reset

     391                 */

     392                /* printk("SCSI disk has been changed. Prohibiting further I/O.\n"); */

     393                return 0;

     394        }

     395        SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu\n",

     396                                   disk->disk_name, (unsigned long long)block));

     397

     398        /*

     399         * If we have a 1K hardware sectorsize, prevent access to single

     400         * 512 byte sectors.  In theory we could handle this - in fact

     401         * the scsi cdrom driver must be able to handle this because

     402         * we typically use 1K blocksizes, and cdroms typically have

     403         * 2K hardware sectorsizes.  Of course, things are simpler

     404         * with the cdrom, since it is read-only.  For performance

     405         * reasons, the filesystems should be able to handle this

     406         * and not force the scsi disk driver to use bounce buffers

     407         * for this.

     408         */

     409        if (sdp->sector_size == 1024) {

     410                if ((block & 1) || (rq->nr_sectors & 1)) {

     411                        printk(KERN_ERR "sd: Bad block number requested");

     412                        return 0;

     413                } else {

     414                        block = block >> 1;

     415                        this_count = this_count >> 1;

     416                }

     417        }

     418        if (sdp->sector_size == 2048) {

     419                if ((block & 3) || (rq->nr_sectors & 3)) {

     420                        printk(KERN_ERR "sd: Bad block number requested");

     421                        return 0;

     422                } else {

     423                        block = block >> 2;

     424                        this_count = this_count >> 2;

     425                }

     426        }

     427        if (sdp->sector_size == 4096) {

     428                if ((block & 7) || (rq->nr_sectors & 7)) {

     429                        printk(KERN_ERR "sd: Bad block number requested");

     430                        return 0;

     431                } else {

     432                        block = block >> 3;

     433                        this_count = this_count >> 3;

     434                }

     435        }

     436        if (rq_data_dir(rq) == WRITE) {

     437                if (!sdp->writeable) {

     438                        return 0;

     439                }

     440                SCpnt->cmnd[0] = WRITE_6;

     441                SCpnt->sc_data_direction = DMA_TO_DEVICE;

     442        } else if (rq_data_dir(rq) == READ) {

     443                SCpnt->cmnd[0] = READ_6;

     444                SCpnt->sc_data_direction = DMA_FROM_DEVICE;

     445        } else {

     446                printk(KERN_ERR "sd: Unknown command %lx\n", rq->flags);

     447/* overkill     panic("Unknown sd command %lx\n", rq->flags); */

     448                return 0;

     449        }

     450

     451        SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",

     452                disk->disk_name, (rq_data_dir(rq) == WRITE) ?

     453                "writing" : "reading", this_count, rq->nr_sectors));

     454

     455        SCpnt->cmnd[1] = 0;

     456       

     457        if (block > 0xffffffff) {

     458                SCpnt->cmnd[0] += READ_16 - READ_6;

     459                SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;

     460                SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;

     461                SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;

     462                SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;

     463                SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0;

     464                SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff;

     465                SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff;

     466                SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff;

     467                SCpnt->cmnd[9] = (unsigned char) block & 0xff;

     468                SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff;

     469                SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff;

     470                SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff;

     471                SCpnt->cmnd[13] = (unsigned char) this_count & 0xff;

     472                SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0;

     473        } else if ((this_count > 0xff) || (block > 0x1fffff) ||

     474                   SCpnt->device->use_10_for_rw) {

     475                if (this_count > 0xffff)

     476                        this_count = 0xffff;

     477

     478                SCpnt->cmnd[0] += READ_10 - READ_6;

     479                SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;

     480                SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;

     481                SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;

     482                SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;

     483                SCpnt->cmnd[5] = (unsigned char) block & 0xff;

     484                SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0;

     485                SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;

     486                SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;

     487        } else {

     488                if (unlikely(blk_fua_rq(rq))) {

     489                        /*

     490                         * This happens only if this drive failed

     491                         * 10byte rw command with ILLEGAL_REQUEST

     492                         * during operation and thus turned off

     493                         * use_10_for_rw.

     494                         */

     495                        printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive\n");

     496                        return 0;

     497                }

     498

     499                SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);

     500                SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);

     501                SCpnt->cmnd[3] = (unsigned char) block & 0xff;

     502                SCpnt->cmnd[4] = (unsigned char) this_count;

     503                SCpnt->cmnd[5] = 0;

     504        }

     505        SCpnt->request_bufflen = this_count * sdp->sector_size;

     506

     507        /*

     508         * We shouldn't disconnect in the middle of a sector, so with a dumb

     509         * host adapter, it's safe to assume that we can at least transfer

     510         * this many bytes between each connect / disconnect.

     511         */

     512        SCpnt->transfersize = sdp->sector_size;

     513        SCpnt->underflow = this_count << 9;

     514        SCpnt->allowed = SD_MAX_RETRIES;

     515        SCpnt->timeout_per_command = timeout;

     516

     517        /*

     518         * This is the completion routine we use.  This is matched in terms

     519         * of capability to this function.

     520         */

     521        SCpnt->done = sd_rw_intr;

     522

     523        /*

     524         * This indicates that the command is ready from our end to be

     525         * queued.

     526         */

     527        return 1;

     528}

     

    这个函数很重要,看似也很长,但是对照着前面scsi块设备驱动体系架构仔细看看,就会发现其实代码虽多,但很好理解。379~394行检查一下磁盘状态,正常的话就不进入相应的条件分支。409~435行,根据扇区大小对内部变量block和this_count进行调整,其中block表示将要对磁盘读写的起始扇区号,this_count表示将要读入scsi_cmnd对应的那个缓冲区的字节数。这个缓冲区是通过前面scsi_init_io函数调用scsi_alloc_sgtable获得的,感兴趣的同学可以深入研究一下。

     

    继续走,436行,通过rq_data_dir宏获得request的传输方向:

    #define rq_data_dir(rq)         ((rq)->flags & 1)

     

    如果是WRITE就把scsi命令设置成WRITE_6,否则设置成READ_6。457~478行是针对有些磁盘的大扇区的处理,我们略过,然后499~503行初始化CDB的其他字段,大家可以对照“scsi设备驱动体系架构”中CDB的格式来分析这些代码的意思。最后,sd_init_command函数初始化scsi_cmnd的其他字段,并返回到scsi_prep_fn函数中。由于sd_init_command返回的是1,最终,正常的话,scsi_prep_fn函数返回BLKPREP_OK。prep表示prepare的意思,用我们的母语说就是准备的意思,最后BLKPREP_OK就说明准备好了,或者说准备就绪。而scsi_prep_fn()也将返回这个值,返回之前还设置了cmd_flags中的REQ_DONTPREP(注意elv_next_request()函数741行判断的就是设这个flag)。

     

    回到elv_next_request()中,由于返回值是BLKPREP_OK,所以746行我们就break了。换言之,我们取到了一个request,并为之准备好了scsi命令,下一步就该是执行这个命令了。所以我们不需要再在elv_next_request()中滞留,终于回到了scsi_request_fn()。elv_next_request结束之后,接下来要看的不只是一个,而是两个:1467行的一个宏加一个函数。宏是blk_queue_tagged,来自include/linux/blkdev.h:

    #define blk_queue_tagged(q)     test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)

     

    而函数是blk_queue_start_tag,来自block/ll_rw_blk.c:

     

       1122 int blk_queue_start_tag(request_queue_t *q, struct request *rq)

       1123 {

       1124         struct blk_queue_tag *bqt = q->queue_tags;

       1125         int tag;

       1126

       1127         if (unlikely((rq->cmd_flags & REQ_QUEUED))) {

       1128                 printk(KERN_ERR

       1129                        "%s: request %p for device [%s] already tagged %d",

       1130                        __FUNCTION__, rq,

       1131                        rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);

       1132                 BUG();

       1133         }

       1134

       1135         /*

       1136          * Protect against shared tag maps, as we may not have exclusive

       1137          * access to the tag map.

       1138          */

       1139         do {

       1140                 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);

       1141                 if (tag >= bqt->max_depth)

       1142                         return 1;

       1143

       1144         } while (test_and_set_bit(tag, bqt->tag_map));

       1145

       1146         rq->cmd_flags |= REQ_QUEUED;

       1147         rq->tag = tag;

       1148         bqt->tag_index[tag] = rq;

       1149         blkdev_dequeue_request(rq);

       1150         list_add(&rq->queuelist, &bqt->busy_list);

       1151         bqt->busy++;

       1152         return 0;

       1153 }

     

    对于我们大多数人来说,这两个函数的返回值都是0

    也因此,下一个函数blkdev_dequeue_request()就会被执行。来自include/linux/blkdev.h

     

        725 static inline void blkdev_dequeue_request(struct request *req)

        726 {

        727         elv_dequeue_request(req->q, req);

        728 }

     

    elv_dequeue_request来自block/elevator.c

     

        778 void elv_dequeue_request(request_queue_t *q, struct request *rq)

        779 {

        780         BUG_ON(list_empty(&rq->queuelist));

        781         BUG_ON(ELV_ON_HASH(rq));

        782

        783         list_del_init(&rq->queuelist);

        784

        785         /*

        786          * the time frame between a request being removed from the lists

        787          * and to it is freed is accounted as io that is in progress at

        788          * the driver side.

        789          */

        790         if (blk_account_rq(rq))

        791                 q->in_flight++;

        792 }

     

    现在这个社会就是利用与被利用的关系,既然这个request已经没有了利用价值,我们已经从它身上得到了我们想要的scsi命令,那么我们完全可以过河拆桥、卸磨杀驴了。list_del_init把这个request从request queue队列里删除掉。

     

    而下面这个blk_account_rq也是一个来自include/linux/blkdev.h的宏:

    536 #define blk_account_rq(rq)      (blk_rq_started(rq) && blk_fs_request(rq))

     

    很显然,至少第二个条件我们是不满足的。所以不用多说,结束这个elv_dequeue_request

     

    现在是时候去执行scsi命令了。回到scsi_request_fn函数中,elv_next_request执行完毕之后,req的special字段就存放着对scsi硬件设备发出“特殊”命令的请求所使用的数据的指针,1472行,把它赋给内部scsi_cmnd型变量cmd。然后1508行调用scsi_dispatch_cmd函数执行这个cmd。

     

    整个块设备驱动层的处理就结束了,我还是在网上找到一个图,正好可以总结上面的过程:

    从前面分析可以看出,请求队列queue是top level和middle level之间的纽带。上层请求会在请求队列中维护,处理函数的方法由上下各层提供。在请求队列的处理过程中,将普通的块设备请求转换成标准的scsi命令,然后再通过middle level和low level之间的接口将请求递交给scsi host。

    最新回复(0)