很长时间以来,Linux块设备使用了一种称为“蓄流/泄流”(plugging/unplugging)的技术来提高吞吐率。简单而言,这种工作方式类似浴盆排水系统的塞子:当IO被提交时,它先被储存在一个队列中,稍后的某个时间,我们才允许IO从队列中派发出去。之所以这么做,是为了尽可能对IO进行合并和排序。
static void scsi_request_fn(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
struct scsi_cmnd *cmd;
struct request *req;
if(!get_device(&sdev->sdev_gendev))
/* We must be tearing the block queue down already */
return;
/*
* To start with, we keep looping until the queue is empty, or until
* the host is no longer able to accept any more requests.
*/
shost = sdev->host;
for (;;) {
int rtn;
/*
* get next queueable request. We do this early to make sure
* that the request is fully prepared even if we cannot
* accept it.
*/
req = blk_peek_request(q); // 获得下一个可排队的请求,如果没有请求或者现在还不能想SCSI设备发送请求,则退出循环
if (!req || !scsi_dev_queue_ready(q, sdev))
break;
/* 如果设备已经离线,则输出错误消息, 调用scsi_kill_request函数释放请求,并以此方式处理后面所有的请求 */
if (unlikely(!scsi_device_online(sdev))) {
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to offline device\n");
scsi_kill_request(req, q);
continue;
}
/*
* Remove the request from the request list.
* 如果队列不是使用generic tag queueing,并且没有为请求启动tagged操作,调用blk_start_request开始由驱动处理请求,这个函数将请求从队列中取出,为它启动超时定时器
*/
if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) //
blk_start_request(req);
sdev->device_busy++;
spin_unlock(q->queue_lock);
/* 从块设备驱动层请求描述符的special域获得SCSI命令描述符,这是在之前的blk_peek_request函数中调用请求队列的prep_rq_fn回调函数准备的 */
cmd = req->special;
if (unlikely(cmd == NULL)) {
printk(KERN_CRIT "impossible request in %s.\n"
"please mail a stack trace to "
"[email protected]\n",
__func__);
blk_dump_rq_flags(req, "foo");
BUG();
}
spin_lock(shost->host_lock);
/*
* We hit this when the driver is using a host wide
* tag map. For device level tag maps the queue_depth check
* in the device ready fn would prevent us from trying
* to allocate a tag. Since the map is a shared host resource
* we add the dev to the starved list so it eventually gets
* a run when a tag is freed.
*/
if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
if (list_empty(&sdev->starved_entry))
list_add_tail(&sdev->starved_entry,
&shost->starved_list);
goto not_ready;
}
if (!scsi_target_queue_ready(shost, sdev))
goto not_ready;
if (!scsi_host_queue_ready(q, shost, sdev))
goto not_ready;
scsi_target(sdev)->target_busy++;
shost->host_busy++;
/*
* XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
* take the lock again.
*/
spin_unlock_irq(shost->host_lock);
/*
* Finally, initialize any error handling parameters, and set up the timers for timeouts.
* 初始化错误处理参数, 设置超时定时器
*/
scsi_init_cmd_errh(cmd);
/*
* Dispatch the command to the low-level driver.
* 将命令派发到底层驱动
*/
rtn = scsi_dispatch_cmd(cmd);
spin_lock_irq(q->queue_lock);
if (rtn)
goto out_delay;
}
goto out; not_ready:
spin_unlock_irq(shost->host_lock);
/*
* lock q, handle tag, requeue req, and decrement device_busy. We
* must return with queue_lock held.
*
* Decrementing device_busy without checking it is OK, as all such
* cases (host limits or settings) should run the queue at some
* later time.
*/
spin_lock_irq(q->queue_lock);
blk_requeue_request(q, req);
sdev->device_busy--;
out_delay:
if (sdev->device_busy == )
blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
/* must be careful here...if we trigger the ->remove() function
* we cannot be holding the q lock */
spin_unlock_irq(q->queue_lock);
put_device(&sdev->sdev_gendev);
spin_lock_irq(q->queue_lock);
}
blk_peek_request从请求队列“顶部”取得下一个请求。函数的实现是一个大循环,每次迭代调用__elv_next_request从电梯队列中取出一个请求进行处理。
/**
 * blk_peek_request - look at the request at the head of a queue
 * @q: request queue to inspect
 *
 * Description:
 *     Fetches requests from __elv_next_request() one at a time, marks each
 *     as started, and runs the queue's prep_rq_fn callback (if any) on it.
 *     A request the callback rejects with BLKPREP_KILL is completed with
 *     -EIO and the next one is tried.  The caller should start the returned
 *     request with blk_start_request() before the LLD processes it.
 *
 * Return:
 *     The request at the top of @q if one is available; NULL when the queue
 *     yields nothing or the prep callback answers BLKPREP_DEFER.
 *
 * Context:
 *     queue_lock must be held.
 */
struct request *blk_peek_request(struct request_queue *q)
{
	for (;;) {
		struct request *rq = __elv_next_request(q);
		int prep_status;

		if (rq == NULL)
			return NULL;

		rq = blk_pm_peek_request(q, rq);
		if (rq == NULL)
			return NULL;

		/*
		 * A request is either brand new or was put back because it
		 * could not be handled earlier; the latter always carries
		 * REQ_STARTED.  Without the flag this is the first time the
		 * driver sees the request, so notify the IO scheduler when
		 * the request went through sorting.
		 */
		if ((rq->cmd_flags & REQ_STARTED) == 0) {
			if (rq->cmd_flags & REQ_SORTED)
				elv_activate_rq(q, rq);

			/*
			 * Flag it as started even though it may not be
			 * started now: a delayed request must not be
			 * overtaken by newly arriving ones.
			 */
			rq->cmd_flags |= REQ_STARTED;
			trace_block_rq_issue(q, rq);
		}

		/* Bookkeeping shared with the IO scheduler. */
		if (q->boundary_rq == NULL || q->boundary_rq == rq) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = NULL;
		}

		/* REQ_DONTPREP: no preparation wanted, hand it back as is. */
		if (rq->cmd_flags & REQ_DONTPREP)
			return rq;

		/*
		 * A non-zero dma_drain_size means a drain buffer will later be
		 * appended to the scatter-gather list, so reserve one extra
		 * segment for it.  max_hw_segments has been adjusted to be one
		 * fewer than the device can handle, which makes this safe.
		 */
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments++;

		/* No prepare callback registered: the request is ready. */
		if (!q->prep_rq_fn)
			return rq;

		/*
		 * Let the callback prepare the request.  Handled results:
		 *   BLKPREP_OK    - preparation succeeded
		 *   BLKPREP_DEFER - cannot proceed yet, leave it queued
		 *   BLKPREP_KILL  - cannot be processed, fail it with -EIO
		 *                   and move on to the next request
		 */
		prep_status = q->prep_rq_fn(q, rq);

		if (prep_status == BLKPREP_OK)
			return rq;

		if (prep_status == BLKPREP_DEFER) {
			/*
			 * The request may have been (partially) prepped and
			 * has to stay at the front to avoid a resource
			 * deadlock; REQ_STARTED keeps other fs requests from
			 * passing it.  Give back the drain segment reserved
			 * above so it is not counted twice next time.
			 */
			if (q->dma_drain_size && blk_rq_bytes(rq) &&
			    !(rq->cmd_flags & REQ_DONTPREP))
				--rq->nr_phys_segments;

			return NULL;
		}

		if (prep_status != BLKPREP_KILL) {
			printk(KERN_ERR "%s: bad return=%d\n", __func__, prep_status);
			return rq;
		}

		rq->cmd_flags |= REQ_QUIET;
		/*
		 * Mark the request as started so the end I/O path does not
		 * trip any debug logic, then complete it with an error.
		 */
		blk_start_request(rq);
		__blk_end_request_all(rq, -EIO);
	}
}
请求队列中的prep_rq_fn回调函数实现了从请求构造SCSI命令的方法,它主要有两个任务:
- 构造命令描述块
- 如果需要的话为数据传输准备聚散列表
命令描述块和聚散列表都被封装到SCSI命令描述符中,我们知道,请求至少有两个来源
- 来自上层bio
- 来自SCSI公共服务层
在刚找到SCSI设备为其初始化请求队列时,这个回调函数被设置为scsi_prep_fn
/*
 * scsi_alloc_queue - create the request queue for a SCSI device.
 * @sdev: device the queue is created for; its host is passed to the
 *        allocator.
 *
 * Allocates the queue with scsi_request_fn as its request function and
 * installs the SCSI prepare, softirq-done, timeout and lld-busy callbacks.
 *
 * Returns the new queue, or NULL if __scsi_alloc_queue() failed.
 */
struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
	struct request_queue *queue;

	queue = __scsi_alloc_queue(sdev->host, scsi_request_fn);
	if (queue == NULL)
		return NULL;

	/* Until something replaces it, requests are prepared by scsi_prep_fn. */
	blk_queue_prep_rq(queue, scsi_prep_fn);
	blk_queue_softirq_done(queue, scsi_softirq_done);
	blk_queue_rq_timed_out(queue, scsi_times_out);
	blk_queue_lld_busy(queue, scsi_lld_busy);

	return queue;
}

/**
 * blk_queue_prep_rq - install a prepare_request callback on a queue
 * @q:   queue to configure
 * @pfn: prepare_request function to store in @q->prep_rq_fn
 *
 * A queue may register a prepare_request callback that is invoked before
 * a request is handed to the request_fn.  Its purpose is to get the request
 * ready for I/O; for instance it can build a cdb from the request data.
 */
void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
{
	q->prep_rq_fn = pfn;
}
初始化回调
如果SCSI设备被高层驱动绑定,这个回调函数会被修改,例如,在sd_probe中被设置成sd_prep_fn
/*
 * sd_probe_async - asynchronous part of attaching a SCSI disk.
 * @data:   the struct scsi_disk being set up, passed as an opaque pointer.
 * @cookie: async cookie; not used in this function body.
 *
 * Fills in the gendisk belonging to the scsi_disk, applies default device
 * parameters, installs sd_prep_fn/sd_unprep_fn on the device's request
 * queue, registers the disk with add_disk(), and finally drops the device
 * references with scsi_autopm_put_device() and put_device().
 */
static void sd_probe_async(void *data, async_cookie_t cookie)
{
	struct scsi_disk *sdkp = data;
	struct scsi_device *sdp;
	struct gendisk *gd;
	u32 index;
	struct device *dev;

	sdp = sdkp->device;
	gd = sdkp->disk;
	index = sdkp->index;
	dev = &sdp->sdev_gendev;

	/*
	 * Bits 4-7 of the disk index select the major number.  The low four
	 * bits of the index are moved to bits 4-7 of first_minor, and index
	 * bits 8-19 are carried over unchanged.
	 */
	gd->major = sd_major((index & 0xf0) >> 4);
	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
	gd->minors = SD_MINORS;

	gd->fops = &sd_fops;
	gd->private_data = &sdkp->driver;
	gd->queue = sdkp->device->request_queue;

	/* defaults, until the device tells us otherwise */
	sdp->sector_size = 512;
	sdkp->capacity = 0;
	sdkp->media_present = 1;
	sdkp->write_prot = 0;
	sdkp->cache_override = 0;
	sdkp->WCE = 0;
	sdkp->RCD = 0;
	sdkp->ATO = 0;
	sdkp->first_scan = 1;
	sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

	sd_revalidate_disk(gd);

	/* From here on the queue's requests are prepared by sd_prep_fn. */
	blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
	blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);

	gd->driverfs_dev = &sdp->sdev_gendev;
	gd->flags = GENHD_FL_EXT_DEVT;
	if (sdp->removable) {
		gd->flags |= GENHD_FL_REMOVABLE;
		gd->events |= DISK_EVENT_MEDIA_CHANGE;
	}

	add_disk(gd);
	if (sdkp->capacity)
		sd_dif_config_host(sdkp);

	sd_revalidate_disk(gd);

	sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
		  sdp->removable ? "removable " : "");
	blk_pm_runtime_init(sdp->request_queue, dev);
	scsi_autopm_put_device(sdp);
	put_device(&sdkp->dev);
}
初始化回调
在前一种情况下,SCSI设备只能处理来自SCSI公共服务层的请求;在后一种情况下,SCSI设备不仅能处理来自SCSI公共服务层的请求,还能够处理来自上层的bio请求,分析见下一节。