For NVMe SSDs we sometimes use the ioctl system call. What does the flow of such a call look like?

First, when the nvme device is registered, its file operations are set up:

static const struct file_operations nvme_dev_fops = {
	.owner		= THIS_MODULE,
	.open		= nvme_dev_open,
	.release	= nvme_dev_release,
	.unlocked_ioctl	= nvme_dev_ioctl,
	.compat_ioctl	= nvme_dev_ioctl,
};
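For context, these fops back the controller character device (/dev/nvme0, /dev/nvme1, ...). The following is only an illustrative sketch of how such a char device gets wired to nvme_dev_fops in kernels of this era; the helper name and the minor count are made up here, the real nvme core uses nvme_char_major, NVME_MINORS and a device class:

/*
 * Illustrative sketch only (not the exact nvme core code): how a char
 * device ends up backed by nvme_dev_fops.
 */
static int nvme_chrdev_sketch_init(void)	/* hypothetical helper */
{
	int major;

	/* major 0 = dynamically allocate; all minors served by nvme_dev_fops.
	 * 256 minors here is illustrative; the driver uses NVME_MINORS. */
	major = __register_chrdev(0, 0, 256, "nvme", &nvme_dev_fops);
	if (major < 0)
		return major;

	/* a struct class plus device_create() then produce /dev/nvme0 etc. */
	return 0;
}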

nvme_dev_ioctl is a switch over the supported ioctl cmds; the two we mainly care about are NVME_IOCTL_ADMIN_CMD and NVME_IOCTL_IO_CMD.

static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct nvme_ctrl *ctrl = file->private_data;
	void __user *argp = (void __user *)arg;

	switch (cmd) {
	case NVME_IOCTL_ADMIN_CMD:
		return nvme_user_cmd(ctrl, NULL, argp);
	case NVME_IOCTL_IO_CMD:
		return nvme_dev_user_cmd(ctrl, argp);
	case NVME_IOCTL_RESET:
		dev_warn(ctrl->device, "resetting controller\n");
		return ctrl->ops->reset_ctrl(ctrl);
	case NVME_IOCTL_SUBSYS_RESET:
		return nvme_reset_subsystem(ctrl);
	case NVME_IOCTL_RESCAN:
		nvme_queue_scan(ctrl);
		return 0;
	default:
		return -ENOTTY;
	}
}
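To make the entry point concrete, here is a minimal user-space sketch that exercises the NVME_IOCTL_ADMIN_CMD path by sending an Identify Controller command (opcode 0x06, CNS=1 in cdw10, 4096-byte data buffer per the NVMe spec). It needs CAP_SYS_ADMIN; the device path and the (trimmed) error handling are illustrative:

/* Minimal user-space sketch: issue Identify Controller via NVME_IOCTL_ADMIN_CMD. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

int main(void)
{
	struct nvme_admin_cmd cmd;
	void *data;
	int fd, ret;

	fd = open("/dev/nvme0", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (posix_memalign(&data, 4096, 4096))
		return 1;
	memset(data, 0, 4096);

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode   = 0x06;			/* Identify */
	cmd.nsid     = 0;
	cmd.addr     = (__u64)(uintptr_t)data;	/* user buffer the kernel maps */
	cmd.data_len = 4096;
	cmd.cdw10    = 1;			/* CNS=1: Identify Controller */

	ret = ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
	/* model number (MN) lives at bytes 24-63 of the Identify data */
	printf("ioctl returned %d, model: %.40s\n", ret, (char *)data + 24);

	free(data);
	close(fd);
	return 0;
}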

For SSD read/write commands issued through this char device, the NVME_IOCTL_IO_CMD branch is taken. Its handler, nvme_dev_user_cmd, resolves the namespace behind the controller and then calls nvme_user_cmd (the same helper the admin branch uses with ns == NULL), whose main job is to fill in the nvme_command c from the user-supplied passthrough command.
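As a rough reconstruction for kernels of this vintage (this ioctl is only supported when the controller has a single namespace; check your kernel tree for the exact code), nvme_dev_user_cmd looks approximately like this:

/* Approximate sketch of nvme_dev_user_cmd, not verbatim kernel code. */
static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
{
	struct nvme_ns *ns;
	int ret;

	mutex_lock(&ctrl->namespaces_mutex);
	if (list_empty(&ctrl->namespaces)) {
		ret = -ENOTTY;
		goto out_unlock;
	}

	/* only meaningful when the controller has exactly one namespace */
	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
		ret = -EINVAL;
		goto out_unlock;
	}

	kref_get(&ns->kref);
	mutex_unlock(&ctrl->namespaces_mutex);

	ret = nvme_user_cmd(ctrl, ns, argp);
	nvme_put_ns(ns);
	return ret;

out_unlock:
	mutex_unlock(&ctrl->namespaces_mutex);
	return ret;
}

nvme_user_cmd then does the real work of building the command: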

static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
		struct nvme_passthru_cmd __user *ucmd)
{
	struct nvme_passthru_cmd cmd;
	struct nvme_command c;
	unsigned timeout = 0;
	int status;

	if (!capable(CAP_SYS_ADMIN))
		return -EACCES;
	if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
		return -EFAULT;
	if (cmd.flags)
		return -EINVAL;

	memset(&c, 0, sizeof(c));
	c.common.opcode = cmd.opcode;
	c.common.flags = cmd.flags;
	c.common.nsid = cpu_to_le32(cmd.nsid);
	c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
	c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
	c.common.cdw10[0] = cpu_to_le32(cmd.cdw10);
	c.common.cdw10[1] = cpu_to_le32(cmd.cdw11);
	c.common.cdw10[2] = cpu_to_le32(cmd.cdw12);
	c.common.cdw10[3] = cpu_to_le32(cmd.cdw13);
	c.common.cdw10[4] = cpu_to_le32(cmd.cdw14);
	c.common.cdw10[5] = cpu_to_le32(cmd.cdw15);

	if (cmd.timeout_ms)
		timeout = msecs_to_jiffies(cmd.timeout_ms);

	status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
			(void __user *)(uintptr_t)cmd.addr, cmd.data_len,
			&cmd.result, timeout);
	if (status >= 0) {
		if (put_user(cmd.result, &ucmd->result))
			return -EFAULT;
	}

	return status;
}
int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen, u32 *result,
		unsigned timeout)
{
	return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0,
			result, timeout);
}

int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd,
		void __user *ubuffer, unsigned bufflen,
		void __user *meta_buffer, unsigned meta_len, u32 meta_seed,
		u32 *result, unsigned timeout)
{
	bool write = nvme_is_write(cmd);
	struct nvme_ns *ns = q->queuedata;
	struct gendisk *disk = ns ? ns->disk : NULL;
	struct request *req;
	struct bio *bio = NULL;
	void *meta = NULL;
	int ret;

	req = nvme_alloc_request(q, cmd, 0, NVME_QID_ANY);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->timeout = timeout ? timeout : ADMIN_TIMEOUT;

	if (ubuffer && bufflen) {
		ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen,
				GFP_KERNEL);
		if (ret)
			goto out;
		bio = req->bio;

		if (!disk)
			goto submit;
		bio->bi_bdev = bdget_disk(disk, 0);
		if (!bio->bi_bdev) {
			ret = -ENODEV;
			goto out_unmap;
		}

		if (meta_buffer && meta_len) {
			struct bio_integrity_payload *bip;

			meta = kmalloc(meta_len, GFP_KERNEL);
			if (!meta) {
				ret = -ENOMEM;
				goto out_unmap;
			}

			if (write) {
				if (copy_from_user(meta, meta_buffer,
						meta_len)) {
					ret = -EFAULT;
					goto out_free_meta;
				}
			}

			bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
			if (IS_ERR(bip)) {
				ret = PTR_ERR(bip);
				goto out_free_meta;
			}

			bip->bip_iter.bi_size = meta_len;
			bip->bip_iter.bi_sector = meta_seed;

			ret = bio_integrity_add_page(bio, virt_to_page(meta),
					meta_len, offset_in_page(meta));
			if (ret != meta_len) {
				ret = -ENOMEM;
				goto out_free_meta;
			}
		}
	}
submit:
	blk_execute_rq(req->q, disk, req, 0);
	if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
		ret = -EINTR;
	else
		ret = nvme_req(req)->status;
	if (result)
		*result = le32_to_cpu(nvme_req(req)->result.u32);
	if (meta && !ret && !write) {
		if (copy_to_user(meta_buffer, meta, meta_len))
			ret = -EFAULT;
	}
out_free_meta:
	kfree(meta);
out_unmap:
	if (bio) {
		if (disk && bio->bi_bdev)
			bdput(bio->bi_bdev);
		blk_rq_unmap_user(bio);
	}
out:
	blk_mq_free_request(req);
	return ret;
}

__nvme_submit_user_cmd mainly allocates a request via nvme_alloc_request; for commands that carry data it also maps the user buffer into a bio attached to the request (plus an integrity payload if metadata is passed). Finally it submits the request by calling blk_execute_rq.
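For reference, nvme_alloc_request in kernels of this era looks roughly like the sketch below; the key point is that the prepared nvme_command is attached to the request through nvme_req(req)->cmd so the driver can find it again at queue_rq time. Treat this as an approximation, not verbatim kernel code:

/* Rough sketch of nvme_alloc_request for kernels of this era. */
struct request *nvme_alloc_request(struct request_queue *q,
		struct nvme_command *cmd, unsigned int flags, int qid)
{
	unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN;
	struct request *req;

	if (qid == NVME_QID_ANY) {
		req = blk_mq_alloc_request(q, op, flags);
	} else {
		req = blk_mq_alloc_request_hctx(q, op, flags,
				qid ? qid - 1 : 0);
	}
	if (IS_ERR(req))
		return req;

	req->cmd_flags |= REQ_FAILFAST_DRIVER;
	nvme_req(req)->cmd = cmd;	/* read back later by nvme_setup_cmd() */

	return req;
}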

With that call, execution moves from the nvme driver into the block layer.

blk_execute_rq is found in blk-exec.c:

void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
		struct request *rq, int at_head)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	unsigned long hang_check;

	rq->end_io_data = &wait;
	blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);

	/* Prevent hang_check timer from firing at us during very long I/O */
	hang_check = sysctl_hung_task_timeout_secs;

	if (hang_check)
		while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
	else
		wait_for_completion_io(&wait);
}

The function has two parts: first it calls blk_execute_rq_nowait to send the request down, and then it calls wait_for_completion_io to wait for the request to complete (how that completion is signalled is covered further below).
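The synchronous wait is built on the generic kernel completion API. Stripped of the block-layer details, the pattern is simply the following illustrative sketch (start_async_work is a hypothetical stand-in for queuing the request):

#include <linux/completion.h>

/* hypothetical: kicks off work that will later call complete() on @done */
void start_async_work(struct completion *done);

/* submit side: what blk_execute_rq() does, reduced to the pattern */
static void submit_and_wait(void)
{
	DECLARE_COMPLETION_ONSTACK(wait);

	start_async_work(&wait);	/* e.g. stash &wait in rq->end_io_data and queue the request */

	wait_for_completion_io(&wait);	/* sleep (accounted as iowait) until complete() */
}

/* completion side: what blk_end_sync_rq() does when the request finishes */
static void finish_async_work(struct completion *done)
{
	complete(done);			/* wakes up submit_and_wait() */
}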

blk_execute_rq_nowait's job is to insert the request into the queueing machinery and return. NVMe is a blk-mq (multi-queue) driver, so q->mq_ops is set, the if (q->mq_ops) branch is taken, and blk_mq_sched_insert_request is called.

void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
		struct request *rq, int at_head,
		rq_end_io_fn *done)
{
	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;

	WARN_ON(irqs_disabled());
	WARN_ON(!blk_rq_is_passthrough(rq));

	rq->rq_disk = bd_disk;
	rq->end_io = done;

	/*
	 * don't check dying flag for MQ because the request won't
	 * be reused after dying flag is set
	 */
	if (q->mq_ops) {
		blk_mq_sched_insert_request(rq, at_head, true, false, false);
		return;
	}

	spin_lock_irq(q->queue_lock);

	if (unlikely(blk_queue_dying(q))) {
		rq->rq_flags |= RQF_QUIET;
		__blk_end_request_all(rq, -ENXIO);
		spin_unlock_irq(q->queue_lock);
		return;
	}

	__elv_add_request(q, rq, where);
	__blk_run_queue(q);
	spin_unlock_irq(q->queue_lock);
}

That lands in blk_mq_sched_insert_request. The request is not a flush and does not bypass the scheduler, and blk-mq is using the mq-deadline scheduler here, so execution enters if (e && e->type->ops.mq.insert_requests):

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
		bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
		blk_mq_sched_insert_flush(hctx, rq, can_block);
		return;
	}

	if (e && blk_mq_sched_bypass_insert(hctx, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

The deadline scheduler variant written for blk-mq lives in mq-deadline.c; its elevator_type defines the scheduler operations, including insert_requests:

static struct elevator_type mq_deadline = {
	.ops.mq = {
		.insert_requests	= dd_insert_requests,
		.dispatch_request	= dd_dispatch_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.bio_merge		= dd_bio_merge,
		.request_merge		= dd_request_merge,
		.requests_merged	= dd_merged_requests,
		.request_merged		= dd_request_merged,
		.has_work		= dd_has_work,
		.init_sched		= dd_init_queue,
		.exit_sched		= dd_exit_queue,
	},
	.uses_mq	= true,
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
	.elevator_attrs = deadline_attrs,
	.elevator_name = "mq-deadline",
	.elevator_owner = THIS_MODULE,
};

So the insert hook called is dd_insert_requests, which moves the requests onto the scheduler's own lists in its per-queue deadline_data, holding dd->lock across the insertions:

static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
		struct list_head *list, bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;

	spin_lock(&dd->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		dd_insert_request(hctx, rq, at_head);
	}
	spin_unlock(&dd->lock);
}

Each request on the list is then inserted in turn; in this call there is only one. dd_insert_request itself is simple here: because the request is a passthrough request and at_head is false, it is appended to the tail of the dispatch list with list_add_tail.

static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
		bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	const int data_dir = rq_data_dir(rq);

	if (blk_mq_sched_try_insert_merge(q, rq))
		return;

	blk_mq_sched_request_inserted(rq);

	if (at_head || blk_rq_is_passthrough(rq)) {
		if (at_head)
			list_add(&rq->queuelist, &dd->dispatch);
		else
			list_add_tail(&rq->queuelist, &dd->dispatch);
	} else {
		deadline_add_rq_rb(dd, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * set expire time and add to fifo list
		 */
		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
		list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
	}
}

After the request has been inserted we return to blk_mq_sched_insert_request, which calls blk_mq_run_hw_queue. That function and the ones it calls eventually reach the dispatch path, blk_mq_sched_dispatch_requests:

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	__blk_mq_delay_run_hw_queue(hctx, async, 0);
}

static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
		unsigned long msecs)
{
	if (unlikely(blk_mq_hctx_stopped(hctx) ||
		     !blk_mq_hw_queue_mapped(hctx)))
		return;

	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
		int cpu = get_cpu();

		if (cpumask_test_cpu(cpu, hctx->cpumask)) {
			__blk_mq_run_hw_queue(hctx);
			put_cpu();
			return;
		}

		put_cpu();
	}

	kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
			&hctx->run_work,
			msecs_to_jiffies(msecs));
}

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	int srcu_idx;

	WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
		cpu_online(hctx->next_cpu));

	if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
		rcu_read_lock();
		blk_mq_sched_dispatch_requests(hctx);
		rcu_read_unlock();
	} else {
		might_sleep();

		srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
		blk_mq_sched_dispatch_requests(hctx);
		srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
	}
}

blk_mq_sched_dispatch_requests first checks whether there are leftover entries on the hctx dispatch list, and then asks the mq-deadline scheduler's dispatch hook for requests to dispatch:

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	bool did_work = false;
	LIST_HEAD(rq_list);

	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests, if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		did_work = blk_mq_dispatch_rq_list(q, &rq_list);
	} else if (!has_sched_dispatch) {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
	}

	/*
	 * We want to dispatch from the scheduler if we had no work left
	 * on the dispatch list, OR if we did have work but weren't able
	 * to make progress.
	 */
	if (!did_work && has_sched_dispatch) {
		do {
			struct request *rq;

			rq = e->type->ops.mq.dispatch_request(hctx);
			if (!rq)
				break;
			list_add(&rq->queuelist, &rq_list);
		} while (blk_mq_dispatch_rq_list(q, &rq_list));
	}
}

The dispatch hook defined by mq-deadline is dd_dispatch_request, which picks a request according to the deadline scheduling rules and returns it to the caller:

static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	struct request *rq;
	bool reads, writes;
	int data_dir;

	if (!list_empty(&dd->dispatch)) {
		rq = list_first_entry(&dd->dispatch, struct request, queuelist);
		list_del_init(&rq->queuelist);
		goto done;
	}

	reads = !list_empty(&dd->fifo_list[READ]);
	writes = !list_empty(&dd->fifo_list[WRITE]);

	/*
	 * batches are currently reads XOR writes
	 */
	if (dd->next_rq[WRITE])
		rq = dd->next_rq[WRITE];
	else
		rq = dd->next_rq[READ];

	if (rq && dd->batching < dd->fifo_batch)
		/* we have a next request are still entitled to batch */
		goto dispatch_request;

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */
	if (reads) {
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));

		if (writes && (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;

		data_dir = READ;
		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */
	if (writes) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));

		dd->starved = 0;
		data_dir = WRITE;
		goto dispatch_find_request;
	}

	return NULL;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	if (deadline_check_fifo(dd, data_dir) || !dd->next_rq[data_dir]) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = dd->next_rq[data_dir];
	}

	dd->batching = 0;

dispatch_request:
	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;
	deadline_move_request(dd, rq);
done:
	rq->rq_flags |= RQF_STARTED;
	return rq;
}

static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	struct request *rq;

	spin_lock(&dd->lock);
	rq = __dd_dispatch_request(hctx);
	spin_unlock(&dd->lock);

	return rq;
}

The block layer then calls blk_mq_dispatch_rq_list, which is where control passes back to the driver: for each request it calls q->mq_ops->queue_rq, one of the operations the nvme driver registers.

bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list)
{
	struct blk_mq_hw_ctx *hctx;
	struct request *rq;
	int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;

	if (list_empty(list))
		return false;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	errors = queued = 0;
	do {
		struct blk_mq_queue_data bd;

		rq = list_first_entry(list, struct request, queuelist);
		if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
			if (!queued && reorder_tags_to_front(list))
				continue;

			/*
			 * The initial allocation attempt failed, so we need to
			 * rerun the hardware queue when a tag is freed.
			 */
			if (!blk_mq_dispatch_wait_add(hctx))
				break;

			/*
			 * It's possible that a tag was freed in the window
			 * between the allocation failure and adding the
			 * hardware queue to the wait queue.
			 */
			if (!blk_mq_get_driver_tag(rq, &hctx, false))
				break;
		}

		list_del_init(&rq->queuelist);

		bd.rq = rq;

		/*
		 * Flag last if we have no more requests, or if we have more
		 * but can't assign a driver tag to it.
		 */
		if (list_empty(list))
			bd.last = true;
		else {
			struct request *nxt;

			nxt = list_first_entry(list, struct request, queuelist);
			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
		}

		ret = q->mq_ops->queue_rq(hctx, &bd);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			break;
		case BLK_MQ_RQ_QUEUE_BUSY:
			blk_mq_put_driver_tag_hctx(hctx, rq);
			list_add(&rq->queuelist, list);
			__blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			errors++;
			blk_mq_end_request(rq, -EIO);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	} while (!list_empty(list));

	hctx->dispatched[queued_to_index(queued)]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(list)) {
		/*
		 * If an I/O scheduler has been configured and we got a driver
		 * tag for the next request already, free it again.
		 */
		rq = list_first_entry(list, struct request, queuelist);
		blk_mq_put_driver_tag(rq);

		spin_lock(&hctx->lock);
		list_splice_init(list, &hctx->dispatch);
		spin_unlock(&hctx->lock);

		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If TAG_WAITING is set that means that an I/O scheduler has
		 * been configured and another thread is waiting for a driver
		 * tag. To guarantee fairness, do not rerun this hardware queue
		 * but let the other thread grab the driver tag.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation. Notes:
		 * - blk_mq_run_hw_queue() checks whether or not a queue has
		 *   been stopped before rerunning a queue.
		 * - Some but not all block drivers stop a queue before
		 *   returning BLK_MQ_RQ_QUEUE_BUSY. Two exceptions are scsi-mq
		 *   and dm-rq.
		 */
		if (!blk_mq_sched_needs_restart(hctx) &&
		    !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state))
			blk_mq_run_hw_queue(hctx, true);
	}

	return (queued + errors) != 0;
}

In the nvme driver, queue_rq is nvme_queue_rq. It sets up the command (nvme_setup_cmd), initializes the iod (nvme_init_iod), maps the data, and submits the command; at the end it also calls nvme_process_cq, the same function that runs when the SSD controller raises a completion interrupt after the command finishes.

static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_queue *nvmeq = hctx->driver_data;
	struct nvme_dev *dev = nvmeq->dev;
	struct request *req = bd->rq;
	struct nvme_command cmnd;
	int ret = BLK_MQ_RQ_QUEUE_OK;

	/*
	 * If formated with metadata, require the block layer provide a buffer
	 * unless this namespace is formated such that the metadata can be
	 * stripped/generated by the controller with PRACT=1.
	 */
	if (ns && ns->ms && !blk_integrity_rq(req)) {
		if (!(ns->pi_type && ns->ms == 8) &&
		    !blk_rq_is_passthrough(req)) {
			blk_mq_end_request(req, -EFAULT);
			return BLK_MQ_RQ_QUEUE_OK;
		}
	}

	ret = nvme_setup_cmd(ns, req, &cmnd);
	if (ret != BLK_MQ_RQ_QUEUE_OK)
		return ret;

	ret = nvme_init_iod(req, dev);
	if (ret != BLK_MQ_RQ_QUEUE_OK)
		goto out_free_cmd;

	if (blk_rq_nr_phys_segments(req))
		ret = nvme_map_data(dev, req, &cmnd);

	if (ret != BLK_MQ_RQ_QUEUE_OK)
		goto out_cleanup_iod;

	blk_mq_start_request(req);

	spin_lock_irq(&nvmeq->q_lock);
	if (unlikely(nvmeq->cq_vector < 0)) {
		ret = BLK_MQ_RQ_QUEUE_ERROR;
		spin_unlock_irq(&nvmeq->q_lock);
		goto out_cleanup_iod;
	}
	__nvme_submit_cmd(nvmeq, &cmnd);
	nvme_process_cq(nvmeq);
	spin_unlock_irq(&nvmeq->q_lock);
	return BLK_MQ_RQ_QUEUE_OK;
out_cleanup_iod:
	nvme_free_iod(dev, req);
out_free_cmd:
	nvme_cleanup_cmd(req);
	return ret;
}
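Of the helpers above, nvme_setup_cmd is worth a quick look for this path: for passthrough (REQ_OP_DRV_IN/OUT) requests it essentially just copies the nvme_command that nvme_user_cmd built and nvme_alloc_request stashed on the request. A rough sketch of that branch, with the other request types elided (not verbatim kernel code):

/* Rough sketch of the passthrough branch of nvme_setup_cmd. */
int nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
		struct nvme_command *cmd)
{
	int ret = BLK_MQ_RQ_QUEUE_OK;

	switch (req_op(req)) {
	case REQ_OP_DRV_IN:
	case REQ_OP_DRV_OUT:
		/* the command built in nvme_user_cmd(), stashed by nvme_alloc_request() */
		memcpy(cmd, nvme_req(req)->cmd, sizeof(*cmd));
		break;
	/* REQ_OP_READ/WRITE/FLUSH/DISCARD are handled by dedicated helpers */
	default:
		return BLK_MQ_RQ_QUEUE_ERROR;
	}

	cmd->common.command_id = req->tag;
	return ret;
}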

Next, __nvme_submit_cmd copies the command into the submission queue (SQ) and writes the new tail to the SQ doorbell, telling the SSD controller that a new command has arrived.

static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
		struct nvme_command *cmd)
{
	u16 tail = nvmeq->sq_tail;

	if (nvmeq->sq_cmds_io)
		memcpy_toio(&nvmeq->sq_cmds_io[tail], cmd, sizeof(*cmd));
	else
		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));

	if (++tail == nvmeq->q_depth)
		tail = 0;
	if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
					      nvmeq->dbbuf_sq_ei))
		writel(tail, nvmeq->q_db);
	nvmeq->sq_tail = tail;
}
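nvmeq->q_db points at this queue's submission doorbell inside the controller's BAR. Per the NVMe spec, the doorbell registers start at BAR offset 0x1000 and are spaced by the doorbell stride (CAP.DSTRD). The helpers below are hypothetical and only sketch how those addresses are laid out (the driver precomputes q_db once per queue):

/*
 * Sketch of the NVMe doorbell layout (per the NVMe spec, not verbatim
 * driver code). "bar" is the mapped controller BAR0, "dstrd" is the
 * CAP.DSTRD field, "qid" the queue id (0 = admin queue).
 */
static void __iomem *nvme_sq_doorbell(void __iomem *bar, u32 dstrd, u16 qid)
{
	/* SQ y Tail Doorbell: 0x1000 + (2y)     * (4 << CAP.DSTRD) */
	return bar + 0x1000 + (2 * qid) * (4 << dstrd);
}

static void __iomem *nvme_cq_doorbell(void __iomem *bar, u32 dstrd, u16 qid)
{
	/* CQ y Head Doorbell: 0x1000 + (2y + 1) * (4 << CAP.DSTRD) */
	return bar + 0x1000 + (2 * qid + 1) * (4 << dstrd);
}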

At this point the path from the ioctl down to the SQ write is complete; the rest is up to the SSD controller, which fetches and executes the command.

When the controller has executed the command, it writes a completion entry into the CQ and raises an interrupt towards the host; the interrupt handler ends up calling nvme_process_cq.

The main flow there is: walk the CQ starting from cq_head, check each entry's validity via the phase (P) bit in the CQ entry (see the NVMe specification), process each valid completion, and call nvme_end_request to finish the corresponding I/O request.
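The phase check itself is a one-liner; in kernels of this era it looks roughly like the sketch below, where the low bit of the status field is the P bit:

/* Rough sketch: a CQ entry is new if its phase (P) bit matches the phase
 * the driver expects for this pass over the queue. */
static inline bool nvme_cqe_valid(struct nvme_queue *nvmeq, u16 head,
		u16 phase)
{
	return (le16_to_cpu(nvmeq->cqes[head].status) & 1) == phase;
}

With that helper in mind, __nvme_process_cq is: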

static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
	u16 head, phase;

	head = nvmeq->cq_head;
	phase = nvmeq->cq_phase;

	while (nvme_cqe_valid(nvmeq, head, phase)) {
		struct nvme_completion cqe = nvmeq->cqes[head];
		struct request *req;

		if (++head == nvmeq->q_depth) {
			head = 0;
			phase = !phase;
		}

		if (tag && *tag == cqe.command_id)
			*tag = -1;

		if (unlikely(cqe.command_id >= nvmeq->q_depth)) {
			dev_warn(nvmeq->dev->ctrl.device,
				"invalid id %d completed on queue %d\n",
				cqe.command_id, le16_to_cpu(cqe.sq_id));
			continue;
		}

		/*
		 * AEN requests are special as they don't time out and can
		 * survive any kind of queue freeze and often don't respond to
		 * aborts. We don't even bother to allocate a struct request
		 * for them but rather special case them here.
		 */
		if (unlikely(nvmeq->qid == 0 &&
				cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) {
			nvme_complete_async_event(&nvmeq->dev->ctrl,
					cqe.status, &cqe.result);
			continue;
		}

		req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id);
		nvme_end_request(req, cqe.status, cqe.result);
	}

	if (head == nvmeq->cq_head && phase == nvmeq->cq_phase)
		return;

	if (likely(nvmeq->cq_vector >= 0))
		if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
						      nvmeq->dbbuf_cq_ei))
			writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
	nvmeq->cq_head = head;
	nvmeq->cq_phase = phase;

	nvmeq->cqe_seen = 1;
}

static void nvme_process_cq(struct nvme_queue *nvmeq)
{
	__nvme_process_cq(nvmeq, NULL);
}

Next, nvme_end_request: it calls blk_mq_complete_request and then __blk_mq_complete_request, which finally invokes the softirq_done_fn registered by the nvme driver.
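nvme_end_request itself is a small inline in the driver headers; roughly, for kernels of this era, it records the completion status and result in the nvme_request and then kicks the blk-mq completion machinery (sketch, not verbatim):

/* Rough sketch of nvme_end_request for kernels of this era. */
static inline void nvme_end_request(struct request *req, __le16 status,
		union nvme_result result)
{
	struct nvme_request *rq = nvme_req(req);

	rq->status = le16_to_cpu(status) >> 1;	/* drop the phase bit */
	rq->result = result;
	blk_mq_complete_request(req);
}

__blk_mq_complete_request then decides on which CPU to run the driver's softirq_done_fn: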

static void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	bool shared = false;
	int cpu;

	if (rq->internal_tag != -1)
		blk_mq_sched_completed_request(rq);
	if (rq->rq_flags & RQF_STATS) {
		blk_mq_poll_stats_start(rq->q);
		blk_stat_add(rq);
	}

	if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
		shared = cpus_share_cache(cpu, ctx->cpu);

	if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

That softirq_done_fn (its registration is omitted here; it is set when the nvme device's blk-mq queues are initialized) is nvme_pci_complete_rq:

static void nvme_pci_complete_rq(struct request *req)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	nvme_unmap_data(iod->nvmeq->dev, req);
	nvme_complete_rq(req);
}

void nvme_complete_rq(struct request *req)
{
	if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
		nvme_req(req)->retries++;
		blk_mq_requeue_request(req, !blk_mq_queue_stopped(req->q));
		return;
	}

	blk_mq_end_request(req, nvme_error_status(req));
}

void blk_mq_end_request(struct request *rq, int error)
{
	if (blk_update_request(rq, error, blk_rq_bytes(rq)))
		BUG();
	__blk_mq_end_request(rq, error);
}

inline void __blk_mq_end_request(struct request *rq, int error)
{
	blk_account_io_done(rq);

	if (rq->end_io) {
		wbt_done(rq->q->rq_wb, &rq->issue_stat);
		rq->end_io(rq, error);
	} else {
		if (unlikely(blk_bidi_rq(rq)))
			blk_mq_free_request(rq->next_rq);
		blk_mq_free_request(rq);
	}
}

As you can see, __blk_mq_end_request invokes the request's end_io callback. Where was that set? Looking back at blk_execute_rq and blk_execute_rq_nowait, the request's end_io is blk_end_sync_rq.

Its job is to wake up the wait_for_completion_io(&wait) in blk_execute_rq above:

static void blk_end_sync_rq(struct request *rq, int error)
{
	struct completion *waiting = rq->end_io_data;

	rq->end_io_data = NULL;

	/*
	 * complete last, if this is a stack request the process (and thus
	 * the rq pointer) could be invalid right after this complete()
	 */
	complete(waiting);
}

With that, the whole ioctl flow is essentially complete; all that remains is to return the result back up to user space.
