好了,了解完必要的scsi设备驱动知识以后,我们就可以安心分析scsi_request_fn函数了。大家回忆一下对,这个函数指针通过几次传递并最终在blk_init_queue_node()中被赋予了q->request_fn。所以这一层的重点就是这个scsi_request_fn函数。
在看scsi_request_fn之前,注意回忆一下scsi_alloc_queue函数的1598行至1560行还赋了三个函数指针:
1590 struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
1591 {
1592 struct request_queue *q;
1593
1594 q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
1595 if (!q)
1596 return NULL;
1597
1598 blk_queue_prep_rq(q, scsi_prep_fn);
1599 blk_queue_issue_flush_fn(q, scsi_issue_flush_fn);
1600 blk_queue_softirq_done(q, scsi_softirq_done);
1601 return q;
1602 }
143 void blk_queue_prep_rq(request_queue_t *q, prep_rq_fn *pfn)
144 {
145 q->prep_rq_fn = pfn;
146 }
313 void blk_queue_issue_flush_fn(request_queue_t *q, issue_flush_fn *iff)
314 {
315 q->issue_flush_fn = iff;
316 }
173 void blk_queue_softirq_done(request_queue_t *q, softirq_done_fn *fn)
174 {
175 q->softirq_done_fn = fn;
176 }
分别是把scsi_prep_fn赋给了q->prep_rq_fn,把scsi_issue_flush_fn赋给了q->issue_flush_fn,把scsi_softirq_done赋给了q->softirq_done_fn。尤其是scsi_prep_fn我们马上就会用到。
好,让我们继续前面的话题,重点关注scsi_request_fn():
1422 static void scsi_request_fn(struct request_queue *q)
1423 {
1424 struct scsi_device *sdev = q->queuedata;
1425 struct scsi_Host *shost;
1426 struct scsi_cmnd *cmd;
1427 struct request *req;
1428
1429 if (!sdev) {
1430 printk("scsi: killing requests for dead queue/n");
1431 while ((req = elv_next_request(q)) != NULL)
1432 scsi_kill_request(req, q);
1433 return;
1434 }
1435
1436 if(!get_device(&sdev->sdev_gendev))
1437 /* We must be tearing the block queue down already */
1438 return;
1439
1440 /*
1441 * To start with, we keep looping until the queue is empty, or until
1442 * the host is no longer able to accept any more requests.
1443 */
1444 shost = sdev->host;
1445 while (!blk_queue_plugged(q)) {
1446 int rtn;
1447 /*
1448 * get next queueable request. We do this early to make sure
1449 * that the request is fully prepared even if we cannot
1450 * accept it.
1451 */
1452 req = elv_next_request(q);
1453 if (!req || !scsi_dev_queue_ready(q, sdev))
1454 break;
1455
1456 if (unlikely(!scsi_device_online(sdev))) {
1457 sdev_printk(KERN_ERR, sdev,
1458 "rejecting I/O to offline device/n");
1459 scsi_kill_request(req, q);
1460 continue;
1461 }
1462
1463
1464 /*
1465 * Remove the request from the request list.
1466 */
1467 if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
1468 blkdev_dequeue_request(req);
1469 sdev->device_busy++; /* 说明命令正在执行中 */
1470
1471 spin_unlock(q->queue_lock);
1472 cmd = req->special;
1473 if (unlikely(cmd == NULL)) {
1474 printk(KERN_CRIT "impossible request in %s./n"
1475 "please mail a stack trace to "
1476 "linux-scsi@vger.kernel.org/n",
1477 __FUNCTION__);
1478 blk_dump_rq_flags(req, "foo");
1479 BUG();
1480 }
1481 spin_lock(shost->host_lock);
1482
1483 if (!scsi_host_queue_ready(q, shost, sdev))
1484 goto not_ready;
1485 if (sdev->single_lun) {
1486 if (scsi_target(sdev)->starget_sdev_user &&
1487 scsi_target(sdev)->starget_sdev_user != sdev)
1488 goto not_ready;
1489 scsi_target(sdev)->starget_sdev_user = sdev;
1490 }
1491 shost->host_busy++;
1492
1493 /*
1494 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
1495 * take the lock again.
1496 */
1497 spin_unlock_irq(shost->host_lock);
1498
1499 /*
1500 * Finally, initialize any error handling parameters, and set up
1501 * the timers for timeouts.
1502 */
1503 scsi_init_cmd_errh(cmd);
1504
1505 /*
1506 * Dispatch the command to the low-level driver.
1507 */
1508 rtn = scsi_dispatch_cmd(cmd);
1509 spin_lock_irq(q->queue_lock);
1510 if(rtn) {
1511 /* we're refusing the command; because of
1512 * the way locks get dropped, we need to
1513 * check here if plugging is required */
1514 if(sdev->device_busy == 0)
1515 blk_plug_device(q);
1516
1517 break;
1518 }
1519 }
1520
1521 goto out;
1522
1523 not_ready:
1524 spin_unlock_irq(shost->host_lock);
1525
1526 /*
1527 * lock q, handle tag, requeue req, and decrement device_busy. We
1528 * must return with queue_lock held.
1529 *
1530 * Decrementing device_busy without checking it is OK, as all such
1531 * cases (host limits or settings) should run the queue at some
1532 * later time.
1533 */
1534 spin_lock_irq(q->queue_lock);
1535 blk_requeue_request(q, req);
1536 sdev->device_busy--;
1537 if(sdev->device_busy == 0)
1538 blk_plug_device(q);
1539 out:
1540 /* must be careful here...if we trigger the ->remove() function
1541 * we cannot be holding the q lock */
1542 spin_unlock_irq(q->queue_lock);
1543 put_device(&sdev->sdev_gendev);
1544 spin_lock_irq(q->queue_lock);
1545 }
scsi_request_fn函数为scsi设备请求队列处理函数,前面看到该函数被注册到了request_queue->request_fn上。块设备请求的bio最终会merge到request queue中,然后通过unplug_fn函数调用request_queue->request_fn,实现scsi_reuqest_fn函数的调用。
scsi_request_fn函数实现了请求队列的处理,首先1452-1468行按照电梯算法从请求队列中摘取一个request,所以我们首先关注1452行的elv_next_request(),来自block/elevator.c:
712 struct request *elv_next_request(request_queue_t *q)
713 {
714 struct request *rq;
715 int ret;
716
717 while ((rq = __elv_next_request(q)) != NULL) {
718 if (!(rq->cmd_flags & REQ_STARTED)) {
719 /*
720 * This is the first time the device driver
721 * sees this request (possibly after
722 * requeueing). Notify IO scheduler.
723 */
724 if (blk_sorted_rq(rq))
725 elv_activate_rq(q, rq);
726
727 /*
728 * just mark as started even if we don't start
729 * it, a request that has been delayed should
730 * not be passed by new incoming requests
731 */
732 rq->cmd_flags |= REQ_STARTED;
733 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
734 }
735
736 if (!q->boundary_rq || q->boundary_rq == rq) {
737 q->end_sector = rq_end_sector(rq);
738 q->boundary_rq = NULL;
739 }
740
741 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn)
742 break;
743
744 ret = q->prep_rq_fn(q, rq);
745 if (ret == BLKPREP_OK) {
746 break;
747 } else if (ret == BLKPREP_DEFER) {
748 /*
749 * the request may have been (partially) prepped.
750 * we need to keep this request in the front to
751 * avoid resource deadlock. REQ_STARTED will
752 * prevent other fs requests from passing this one.
753 */
754 rq = NULL;
755 break;
756 } else if (ret == BLKPREP_KILL) {
757 int nr_bytes = rq->hard_nr_sectors << 9;
758
759 if (!nr_bytes)
760 nr_bytes = rq->data_len;
761
762 blkdev_dequeue_request(rq);
763 rq->cmd_flags |= REQ_QUIET;
764 end_that_request_chunk(rq, 0, nr_bytes);
765 end_that_request_last(rq, 0);
766 } else {
767 printk(KERN_ERR "%s: bad return=%d/n", __FUNCTION__,
768 ret);
769 break;
770 }
771 }
772
773 return rq;
774 }
它调用的__elv_next_request()仍然来自block/elevator.c:
696 static inline struct request *__elv_next_request(request_queue_t *q)
697 {
698 struct request *rq;
699
700 while (1) {
701 while (!list_empty(&q->queue_head)) {
702 rq = list_entry_rq(q->queue_head.next);
703 if (blk_do_ordered(q, &rq))
704 return rq;
705 }
706
707 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
708 return NULL;
709 }
710 }
由于我们在I/O调度层中插入了一个request,所以这里q->queue_head不可能为空。所以702行从中取出一个request来。然后是blk_do_ordered(),来自block/ll_rw_blk.c:
478 int blk_do_ordered(request_queue_t *q, struct request **rqp)
479 {
480 struct request *rq = *rqp;
481 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
482
483 if (!q->ordseq) {
484 if (!is_barrier)
485 return 1;
486
487 if (q->next_ordered != QUEUE_ORDERED_NONE) {
488 *rqp = start_ordered(q, rq);
489 return 1;
490 } else {
491 /*
492 * This can happen when the queue switches to
493 * ORDERED_NONE while this request is on it.
494 */
495 blkdev_dequeue_request(rq);
496 end_that_request_first(rq, -EOPNOTSUPP,
497 rq->hard_nr_sectors);
498 end_that_request_last(rq, -EOPNOTSUPP);
499 *rqp = NULL;
500 return 0;
501 }
502 }
503
504 /*
505 * Ordered sequence in progress
506 */
507
508 /* Special requests are not subject to ordering rules. */
509 if (!blk_fs_request(rq) &&
510 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
511 return 1;
512
513 if (q->ordered & QUEUE_ORDERED_TAG) {
514 /* Ordered by tag. Blocking the next barrier is enough. */
515 if (is_barrier && rq != &q->bar_rq)
516 *rqp = NULL;
517 } else {
518 /* Ordered by draining. Wait for turn. */
519 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
520 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
521 *rqp = NULL
522 }
523
524 return 1;
525 }
首先看一下blk_fs_request,
528 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
很显然,咱们从来没有设置这个标识,所以不去管它。
所以在咱们这个上下文里,is_barrier一定是0。所以,blk_do_ordered二话不说,直接返回1。那么回到__elv_next_request以后,703行这个if条件是满足的,所以也就是返回rq,下面的那个elevator_dispatch_fn根本不会执行的。另一方面,我们从__elv_next_request返回,回到elv_next_request()的时候,只要request queue不是空的,那么返回值就是队列中最前边的那个request。
继续在elv_next_request中往下走,request得到了,cmd_flags其实整个故事中设置REQ_STARTED的也就是这里,732行。所以在我们执行732行之前,这个flag是没有设置的。因此,if条件是满足的。
而blk_sorted_rq又是一个宏,来自include/linux/blkdev.h:
543 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
很显然,咱们也从来没有设置过这个flag,所以这里不关我们的事。
当然了,对于noop,即便执行下一个函数也没有意义,因为这个elv_activate_rq()来自block/elevator.c:
272 static void elv_activate_rq(request_queue_t *q, struct request *rq)
273 {
274 elevator_t *e = q->elevator;
275
276 if (e->ops->elevator_activate_req_fn)
277 e->ops->elevator_activate_req_fn(q, rq);
278 }
我们假设使用最简单的noop电梯算法,即根本就没有这个指针,所以不去管他。
这时候,我们设置REQ_STARTED这个flag,最开始我们在elevator_init()中,有这么一句:
230 q->boundary_rq = NULL;
于是rq_end_sector会被执行,这其实也只是一个很简单的宏:
172 #define rq_end_sector(rq) ((rq)->sector + (rq)->nr_sectors)
同时,boundary_rq还是被置为NULL。
回到elv_next_request中,接下来744行,由于我们把prep_rq_fn赋上了scsi_prep_fn,所以我们要看一下这个scsi_prep_fn(),这个来自drivers/scsi/scsi_lib.c的函数:
1093static int scsi_prep_fn(struct request_queue *q, struct request *req)
1094{
1095 struct scsi_device *sdev = q->queuedata;
1096 struct scsi_cmnd *cmd;
1097 int specials_only = 0;
1098
1099 /*
1100 * Just check to see if the device is online. If it isn't, we
1101 * refuse to process any commands. The device must be brought
1102 * online before trying any recovery commands
1103 */
1104 if (unlikely(!scsi_device_online(sdev))) {
1105 sdev_printk(KERN_ERR, sdev,
1106 "rejecting I/O to offline device/n");
1107 goto kill;
1108 }
1109 if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
1110 /* OK, we're not in a running state don't prep
1111 * user commands */
1112 if (sdev->sdev_state == SDEV_DEL) {
1113 /* Device is fully deleted, no commands
1114 * at all allowed down */
1115 sdev_printk(KERN_ERR, sdev,
1116 "rejecting I/O to dead device/n");
1117 goto kill;
1118 }
1119 /* OK, we only allow special commands (i.e. not
1120 * user initiated ones */
1121 specials_only = sdev->sdev_state;
1122 }
1123
1124 /*
1125 * Find the actual device driver associated with this command.
1126 * The SPECIAL requests are things like character device or
1127 * ioctls, which did not originate from ll_rw_blk. Note that
1128 * the special field is also used to indicate the cmd for
1129 * the remainder of a partially fulfilled request that can
1130 * come up when there is a medium error. We have to treat
1131 * these two cases differently. We differentiate by looking
1132 * at request->cmd, as this tells us the real story.
1133 */
1134 if (req->flags & REQ_SPECIAL && req->special) {
1135 cmd = req->special;
1136 } else if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) {
1137
1138 if(unlikely(specials_only) && !(req->flags & REQ_SPECIAL)) {
1139 if(specials_only == SDEV_QUIESCE ||
1140 specials_only == SDEV_BLOCK)
1141 goto defer;
1142
1143 sdev_printk(KERN_ERR, sdev,
1144 "rejecting I/O to device being removed/n");
1145 goto kill;
1146 }
1147
1148
1149 /*
1150 * Now try and find a command block that we can use.
1151 */
1152 if (!req->special) {
1153 cmd = scsi_get_command(sdev, GFP_ATOMIC);
1154 if (unlikely(!cmd))
1155 goto defer;
1156 } else
1157 cmd = req->special;
1158
1159 /* pull a tag out of the request if we have one */
1160 cmd->tag = req->tag;
1161 } else {
1162 blk_dump_rq_flags(req, "SCSI bad req");
1163 goto kill;
1164 }
1165
1166 /* note the overloading of req->special. When the tag
1167 * is active it always means cmd. If the tag goes
1168 * back for re-queueing, it may be reset */
1169 req->special = cmd;
1170 cmd->request = req;
1171
1172 /*
1173 * FIXME: drop the lock here because the functions below
1174 * expect to be called without the queue lock held. Also,
1175 * previously, we dequeued the request before dropping the
1176 * lock. We hope REQ_STARTED prevents anything untoward from
1177 * happening now.
1178 */
1179 if (req->flags & (REQ_CMD | REQ_BLOCK_PC)) {
1180 int ret;
1181
1182 /*
1183 * This will do a couple of things:
1184 * 1) Fill in the actual SCSI command.
1185 * 2) Fill in any other upper-level specific fields
1186 * (timeout).
1187 *
1188 * If this returns 0, it means that the request failed
1189 * (reading past end of disk, reading offline device,
1190 * etc). This won't actually talk to the device, but
1191 * some kinds of consistency checking may cause the
1192 * request to be rejected immediately.
1193 */
1194
1195 /*
1196 * This sets up the scatter-gather table (allocating if
1197 * required).
1198 */
1199 ret = scsi_init_io(cmd);
1200 switch(ret) {
1201 /* For BLKPREP_KILL/DEFER the cmd was released */
1202 case BLKPREP_KILL:
1203 goto kill;
1204 case BLKPREP_DEFER:
1205 goto defer;
1206 }
1207
1208 /*
1209 * Initialize the actual SCSI command for this request.
1210 */
1211 if (req->flags & REQ_BLOCK_PC) {
1212 scsi_setup_blk_pc_cmnd(cmd);
1213 } else if (req->rq_disk) {
1214 struct scsi_driver *drv;
1215
1216 drv = *(struct scsi_driver **)req->rq_disk->private_data;
1217 if (unlikely(!drv->init_command(cmd))) {
1218 scsi_release_buffers(cmd);
1219 scsi_put_command(cmd);
1220 goto kill;
1221 }
1222 }
1223 }
1224
1225 /*
1226 * The request is now prepped, no need to come back here
1227 */
1228 req->flags |= REQ_DONTPREP;
1229 return BLKPREP_OK;
1230
1231 defer:
1232 /* If we defer, the elv_next_request() returns NULL, but the
1233 * queue must be restarted, so we plug here if no returning
1234 * command will automatically do that. */
1235 if (sdev->device_busy == 0)
1236 blk_plug_device(q);
1237 return BLKPREP_DEFER;
1238 kill:
1239 req->errors = DID_NO_CONNECT << 16;
1240 return BLKPREP_KILL;
1241}
大家还记得我们前面使用__make_request函数创建一个request的时候,曾经通过init_request_from_bio(req, bio)初始化请求描述符中的字段。其中把request的设置flags字段中的REQ_CMD标识,说明这次request是一个标准的读或写操作。注意,前面我们并没有设置REQ_BLOCK_PC标识。
所以scsi_prep_fn函数首先会进入1136那个条件分支。1138-1146的代码是对该块设备状态的一个检查,一般不会出什么问题。随后1153行调用scsi_get_command函数给我们这个request对应的scsi_device分配一个scsi_cmnd结构,其地址赋给函数内部变量cmd指针:
struct scsi_cmnd *scsi_get_command(struct scsi_device *dev, gfp_t gfp_mask)
{
struct scsi_cmnd *cmd;
/* Bail if we can't get a reference to the device */
if (!get_device(&dev->sdev_gendev))
return NULL;
cmd = __scsi_get_command(dev->host, gfp_mask);
if (likely(cmd != NULL)) {
unsigned long flags;
memset(cmd, 0, sizeof(*cmd));
cmd->device = dev;
init_timer(&cmd->eh_timeout);
INIT_LIST_HEAD(&cmd->list);
spin_lock_irqsave(&dev->list_lock, flags);
list_add_tail(&cmd->list, &dev->cmd_list);
spin_unlock_irqrestore(&dev->list_lock, flags);
cmd->jiffies_at_alloc = jiffies;
} else
put_device(&dev->sdev_gendev);
return cmd;
}
static struct scsi_cmnd *__scsi_get_command(struct Scsi_Host *shost,
gfp_t gfp_mask)
{
struct scsi_cmnd *cmd;
cmd = kmem_cache_alloc(shost->cmd_pool->slab,
gfp_mask | shost->cmd_pool->gfp_mask);
if (unlikely(!cmd)) {
unsigned long flags;
spin_lock_irqsave(&shost->free_list_lock, flags);
if (likely(!list_empty(&shost->free_list))) {
cmd = list_entry(shost->free_list.next,
struct scsi_cmnd, list);
list_del_init(&cmd->list);
}
spin_unlock_irqrestore(&shost->free_list_lock, flags);
}
return cmd;
}
看不懂这个分配函数的回去好好看一下“scsi设备驱动体系架构”最后那个图,我就不多费口舌了。回到scsi_prep_fn中,1160行把reqest的tag赋给这个全新的scsi_cmnd结构;然后1169、1170行把这个reqest和scsi_cmnd联系起来。随后又进入1179行条件判断,1199行,调用scsi_init_io函数初始化这个scsi_cmnd结构:
static int scsi_init_io(struct scsi_cmnd *cmd)
{
struct request *req = cmd->request;
struct scatterlist *sgpnt;
int count;
/*
* if this is a rq->data based REQ_BLOCK_PC, setup for a non-sg xfer
*/
if ((req->flags & REQ_BLOCK_PC) && !req->bio) {
cmd->request_bufflen = req->data_len;
cmd->request_buffer = req->data;
req->buffer = req->data;
cmd->use_sg = 0;
return 0;
}
/*
* we used to not use scatter-gather for single segment request,
* but now we do (it makes highmem I/O easier to support without
* kmapping pages)
*/
cmd->use_sg = req->nr_phys_segments;
/*
* if sg table allocation fails, requeue request later.
*/
sgpnt = scsi_alloc_sgtable(cmd, GFP_ATOMIC);
if (unlikely(!sgpnt)) {
scsi_unprep_request(req);
return BLKPREP_DEFER;
}
cmd->request_buffer = (char *) sgpnt;
cmd->request_bufflen = req->nr_sectors << 9;
if (blk_pc_request(req))
cmd->request_bufflen = req->data_len;
req->buffer = NULL;
/*
* Next, walk the list, and fill in the addresses and sizes of
* each segment.
*/
count = blk_rq_map_sg(req->q, req, cmd->request_buffer);
/*
* mapped well, send it off
*/
if (likely(count <= cmd->use_sg)) {
cmd->use_sg = count;
return 0;
}
printk(KERN_ERR "Incorrect number of segments after building list/n");
printk(KERN_ERR "counted %d, received %d/n", count, cmd->use_sg);
printk(KERN_ERR "req nr_sec %lu, cur_nr_sec %u/n", req->nr_sectors,
req->current_nr_sectors);
/* release the command and kill it */
scsi_release_buffers(cmd);
scsi_put_command(cmd);
return BLKPREP_KILL;
}
一般情况下,scsi_init_io返回0,否则致命错误,导致scsi_prep_fn退出。继续走,由于我们并没有设置REQ_BLOCK_PC标识,而且req的rq_disk是存在的,gendisk,忘了?那你完了。所以scsi_prep_fn函数来到1217行,执行本函数中最重要的过程,drv->init_command。这个drv是啥?来自gendisk的private_data字段。还记得sd_probe吗?我们在其中把它赋值给了对应scsi_disk结构的driver字段,就是前面那个sd_template常量,别告诉我你又忘了。如果真忘了,那就好好从头开始,从scsi磁盘驱动的初始化函数init_sd开始。
我们知道sd_template常量的init_command指针指向sd_init_command函数地址,所以下面就来看看sd_init_command这个函数,十分重要,来自drivers/scsi/sd.c:
366static int sd_init_command(struct scsi_cmnd * SCpnt)
367{
368 struct scsi_device *sdp = SCpnt->device;
369 struct request *rq = SCpnt->request;
370 struct gendisk *disk = rq->rq_disk;
371 sector_t block = rq->sector;
372 unsigned int this_count = SCpnt->request_bufflen >> 9;
373 unsigned int timeout = sdp->timeout;
374
375 SCSI_LOG_HLQUEUE(1, printk("sd_init_command: disk=%s, block=%llu, "
376 "count=%d/n", disk->disk_name,
377 (unsigned long long)block, this_count));
378
379 if (!sdp || !scsi_device_online(sdp) ||
380 block + rq->nr_sectors > get_capacity(disk)) {
381 SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors/n",
382 rq->nr_sectors));
383 SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p/n", SCpnt));
384 return 0;
385 }
386
387 if (sdp->changed) {
388 /*
389 * quietly refuse to do anything to a changed disc until
390 * the changed bit has been reset
391 */
392 /* printk("SCSI disk has been changed. Prohibiting further I/O./n"); */
393 return 0;
394 }
395 SCSI_LOG_HLQUEUE(2, printk("%s : block=%llu/n",
396 disk->disk_name, (unsigned long long)block));
397
398 /*
399 * If we have a 1K hardware sectorsize, prevent access to single
400 * 512 byte sectors. In theory we could handle this - in fact
401 * the scsi cdrom driver must be able to handle this because
402 * we typically use 1K blocksizes, and cdroms typically have
403 * 2K hardware sectorsizes. Of course, things are simpler
404 * with the cdrom, since it is read-only. For performance
405 * reasons, the filesystems should be able to handle this
406 * and not force the scsi disk driver to use bounce buffers
407 * for this.
408 */
409 if (sdp->sector_size == 1024) {
410 if ((block & 1) || (rq->nr_sectors & 1)) {
411 printk(KERN_ERR "sd: Bad block number requested");
412 return 0;
413 } else {
414 block = block >> 1;
415 this_count = this_count >> 1;
416 }
417 }
418 if (sdp->sector_size == 2048) {
419 if ((block & 3) || (rq->nr_sectors & 3)) {
420 printk(KERN_ERR "sd: Bad block number requested");
421 return 0;
422 } else {
423 block = block >> 2;
424 this_count = this_count >> 2;
425 }
426 }
427 if (sdp->sector_size == 4096) {
428 if ((block & 7) || (rq->nr_sectors & 7)) {
429 printk(KERN_ERR "sd: Bad block number requested");
430 return 0;
431 } else {
432 block = block >> 3;
433 this_count = this_count >> 3;
434 }
435 }
436 if (rq_data_dir(rq) == WRITE) {
437 if (!sdp->writeable) {
438 return 0;
439 }
440 SCpnt->cmnd[0] = WRITE_6;
441 SCpnt->sc_data_direction = DMA_TO_DEVICE;
442 } else if (rq_data_dir(rq) == READ) {
443 SCpnt->cmnd[0] = READ_6;
444 SCpnt->sc_data_direction = DMA_FROM_DEVICE;
445 } else {
446 printk(KERN_ERR "sd: Unknown command %lx/n", rq->flags);
447/* overkill panic("Unknown sd command %lx/n", rq->flags); */
448 return 0;
449 }
450
451 SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks./n",
452 disk->disk_name, (rq_data_dir(rq) == WRITE) ?
453 "writing" : "reading", this_count, rq->nr_sectors));
454
455 SCpnt->cmnd[1] = 0;
456
457 if (block > 0xffffffff) {
458 SCpnt->cmnd[0] += READ_16 - READ_6;
459 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
460 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
461 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
462 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
463 SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0;
464 SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff;
465 SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff;
466 SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff;
467 SCpnt->cmnd[9] = (unsigned char) block & 0xff;
468 SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff;
469 SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff;
470 SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff;
471 SCpnt->cmnd[13] = (unsigned char) this_count & 0xff;
472 SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0;
473 } else if ((this_count > 0xff) || (block > 0x1fffff) ||
474 SCpnt->device->use_10_for_rw) {
475 if (this_count > 0xffff)
476 this_count = 0xffff;
477
478 SCpnt->cmnd[0] += READ_10 - READ_6;
479 SCpnt->cmnd[1] |= blk_fua_rq(rq) ? 0x8 : 0;
480 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
481 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
482 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
483 SCpnt->cmnd[5] = (unsigned char) block & 0xff;
484 SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0;
485 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
486 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
487 } else {
488 if (unlikely(blk_fua_rq(rq))) {
489 /*
490 * This happens only if this drive failed
491 * 10byte rw command with ILLEGAL_REQUEST
492 * during operation and thus turned off
493 * use_10_for_rw.
494 */
495 printk(KERN_ERR "sd: FUA write on READ/WRITE(6) drive/n");
496 return 0;
497 }
498
499 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
500 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);
501 SCpnt->cmnd[3] = (unsigned char) block & 0xff;
502 SCpnt->cmnd[4] = (unsigned char) this_count;
503 SCpnt->cmnd[5] = 0;
504 }
505 SCpnt->request_bufflen = this_count * sdp->sector_size;
506
507 /*
508 * We shouldn't disconnect in the middle of a sector, so with a dumb
509 * host adapter, it's safe to assume that we can at least transfer
510 * this many bytes between each connect / disconnect.
511 */
512 SCpnt->transfersize = sdp->sector_size;
513 SCpnt->underflow = this_count << 9;
514 SCpnt->allowed = SD_MAX_RETRIES;
515 SCpnt->timeout_per_command = timeout;
516
517 /*
518 * This is the completion routine we use. This is matched in terms
519 * of capability to this function.
520 */
521 SCpnt->done = sd_rw_intr;
522
523 /*
524 * This indicates that the command is ready from our end to be
525 * queued.
526 */
527 return 1;
528}
这个函数很重要,看似也很长,但是对照着前面scsi块设备驱动体系架构仔细看看,就会发现其实代码虽多,但很好理解。379~394检查一下磁盘状态,正常的话就不进入相应的条件分支。409~435行,根据扇区大小对内部变量block和this_count进行调整,其中block表示将要对磁盘读写的起始扇区号,this_count表示将要读入scsi_cmnd对应的那个缓冲区的字节数。这个缓冲区是通过前面scsi_init_io函数调用scsi_alloc_sgtable获得的,感兴趣的同学可以深入研究一下。
继续走,436行,通过rq_data_dir宏获得request的传输方向:
#define rq_data_dir(rq) ((rq)->flags & 1)
如果是WRITE就把scsi命令设置成WRITE_6,否则设置成READ_6。457-478是针对有些磁盘的大扇区的处理,我们略过,然后499-503初始化CDB的其他字段,大家可以对照“scsi设备驱动体系架构”中CDB的格式来分析这些代码的意思。最后,sd_init_command函数初始化scsi_cmnd的其他字段,并返回到scsi_prep_fn函数中。由于sd_init_command返回的是1,最终,正常的话,scsi_prep_fn函数返回BLKPREP_OK。prep表示prepare的意思,用我们的母语说就是准备的意思,最后BLKPREP_OK就说明准备好了,或者说准备就绪。而scsi_prep_fn()也将返回这个值,返回之前还设置了cmd_flags中的REQ_DONTPREP。(注意elv_next_request()函数741行判断的就是设这个flag。)
回到elv_next_request()中,由于返回值是BLKPREP_OK,所以746行我们就break了。换言之,我们取到了一个request,我们为之准备好了scsi命令,我们下一步就该是执行这个命令了。所以我们不需要再在elv_next_request()中滞留。我们终于回到了scsi_request_fn(),结束了elv_next_request,又要看下一个,不只是一个,而是两个,1467行,一个宏加一个函数,宏是blk_queue_tagged,来自include/linux/blkdev.h:
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
而函数是blk_queue_start_tag,来自block/ll_rw_blk.c:
1122 int blk_queue_start_tag(request_queue_t *q, struct request *rq)
1123 {
1124 struct blk_queue_tag *bqt = q->queue_tags;
1125 int tag;
1126
1127 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1128 printk(KERN_ERR
1129 "%s: request %p for device [%s] already tagged %d",
1130 __FUNCTION__, rq,
1131 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1132 BUG();
1133 }
1134
1135 /*
1136 * Protect against shared tag maps, as we may not have exclusive
1137 * access to the tag map.
1138 */
1139 do {
1140 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1141 if (tag >= bqt->max_depth)
1142 return 1;
1143
1144 } while (test_and_set_bit(tag, bqt->tag_map));
1145
1146 rq->cmd_flags |= REQ_QUEUED;
1147 rq->tag = tag;
1148 bqt->tag_index[tag] = rq;
1149 blkdev_dequeue_request(rq);
1150 list_add(&rq->queuelist, &bqt->busy_list);
1151 bqt->busy++;
1152 return 0;
1153 }
对于我们大多数人来说,这两个函数的返回值都是0。
也因此,下一个函数blkdev_dequeue_request()就会被执行。来自include/linux/blkdev.h:
725 static inline void blkdev_dequeue_request(struct request *req)
726 {
727 elv_dequeue_request(req->q, req);
728 }
而elv_dequeue_request来自block/elevator.c:
778 void elv_dequeue_request(request_queue_t *q, struct request *rq)
779 {
780 BUG_ON(list_empty(&rq->queuelist));
781 BUG_ON(ELV_ON_HASH(rq));
782
783 list_del_init(&rq->queuelist);
784
785 /*
786 * the time frame between a request being removed from the lists
787 * and to it is freed is accounted as io that is in progress at
788 * the driver side.
789 */
790 if (blk_account_rq(rq))
791 q->in_flight++;
792 }
现在这个社会就是利用与被利用的关系,既然这个request已经没有了利用价值,我们已经从它身上得到了我们想要的scsi命令,那么我们完全可以过河拆桥卸磨杀驴了。list_del_init把这个request从request queue队列里删除掉。
而下面这个blk_account_rq也是一个来自include/linux/blkdev.h的宏:
536 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
很显然,至少第二个条件我们是不满足的。所以不用多说,结束这个elv_dequeue_request。
现在是时候去执行scsi命令了,回到scsi_request_fn函数中elv_next_request执行完毕之后,req的special就存放对scsi硬件设备发出“特殊”命令的请求所使用的数据的指针,1472行,把它赋给内部 scsi_cmnd型变量cmd。然后1508行调用scsi_dispatch_cmd函数执行这个cmd。
整个块设备驱动层的处理就结束了,我还是在网上找到一个图,正好可以总结上面的过程:
从前面分析可以看出,请求队列queue是top level与middle level之间的纽带。上层请求会在请求队列中维护,处理函数的方法由上下各层提供。在请求队列的处理过程中,将普通的块设备请求转换成标准的scsi命令,然后再通过middle level与low level之间的接口将请求递交给scsi host。