最近梳理了一下rdma用户态到内核态传参的流程,会基于ibv_create_cq接口介绍一下ioctl版本的流程,代码基于mlnx-ofa_kernel-5.4。
用户态
用户态和内核态传的参数包含两部分,用户执行create_cq会传一些标准的参数,比如队列长度cqe,ibv_comp_channel channel等,还有另外厂商自己的参数,比如mlx5会传cq buffer的地址等。
用户态中首先是通过cmd记录厂商特有的参数,但是用户态和内核态实际进行传参是通过ibv_command_buffer,会通过attr记录每个参数,因此cmd之后会被转成ibv_command_buffer传给内核。
ibv_command_buffer
create_cq首先初始化了一个ibv_command_buffer driver_attrs
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
const struct ibv_cq_init_attr_ex *cq_attr,
int cq_alloc_flags,
struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
DECLARE_COMMAND_BUFFER_LINK(driver_attrs, UVERBS_OBJECT_CQ,
UVERBS_METHOD_CQ_CREATE, 1,
NULL);
...
}
ibv_command_buffer用于存放各种属性,会被组织成为一个链表,next指向下一个ibv_command_buffer,ib_uverbs_ioctl_hdr中记录了所有的attr
/*
 * One link in a chain of command buffers. Attributes are appended into
 * hdr.attrs; before the ioctl is issued, the attributes of every linked
 * buffer are copied into the first buffer of the chain.
 */
struct ibv_command_buffer {
/* Next buffer in the chain, NULL for the last one */
struct ibv_command_buffer *next;
/* First unused slot in hdr.attrs (next_attr - hdr.attrs == attrs in use) */
struct ib_uverbs_attr *next_attr;
/* End of the reserved attr storage: hdr.attrs + num_attrs given at init */
struct ib_uverbs_attr *last_attr;
/*
* Used by the legacy write interface to keep track of where the UHW
* buffer is located and the 'headroom' space that the common code
* uses to construct the command header and common command struct
* directly before the drivers' UHW.
*/
/* Index into hdr.attrs of the UHW_IN attr, or _UHW_NO_INDEX if absent */
uint8_t uhw_in_idx;
uint8_t uhw_out_idx;
/* Size of the core request preceding the UHW payload, in 4-byte units */
uint8_t uhw_in_headroom_dwords;
uint8_t uhw_out_headroom_dwords;
/* Set when an attribute could not be encoded (e.g. length > UINT16_MAX) */
uint8_t buffer_error:1;
/*
* These flags control what execute_ioctl_fallback does if the kernel
* does not support ioctl
*/
uint8_t fallback_require_ex:1;
uint8_t fallback_ioctl_only:1;
/* Header handed to the kernel; must stay last because attrs[] is flexible */
struct ib_uverbs_ioctl_hdr hdr;
};
/*
 * Header passed to the RDMA_VERBS_IOCTL ioctl, immediately followed by the
 * attribute array.
 */
struct ib_uverbs_ioctl_hdr {
/* sizeof(header) + num_attrs * sizeof(struct ib_uverbs_attr) */
__u16 length;
/* object_id and method_id together select the kernel method to run */
__u16 object_id;
__u16 method_id;
/* Number of valid entries in attrs[] */
__u16 num_attrs;
/* Zeroed by user space before the ioctl is issued */
__aligned_u64 reserved1;
/* Must match the driver_id of the kernel uverbs API, else -EINVAL */
__u32 driver_id;
/* Zeroed by user space before the ioctl is issued */
__u32 reserved2;
struct ib_uverbs_attr attrs[];
};
/*
 * A single attribute (argument) of an ioctl method. Depending on the
 * attribute type the payload is either stored inline in `data` or `data`
 * holds a user-space pointer to it.
 */
struct ib_uverbs_attr {
__u16 attr_id; /* command specific type attribute */
__u16 len; /* only for pointers and IDRs array */
__u16 flags; /* combination of UVERBS_ATTR_F_XXXX */
union {
struct {
__u8 elem_id;
__u8 reserved;
} enum_data;
__u16 reserved;
} attr_data;
union {
/*
* ptr to command, inline data, idr/fd or
* ptr to __u32 array of IDRs
*/
__aligned_u64 data;
/* Used by FD_IN and FD_OUT */
__s64 data_s64;
};
};
然后看下driver_attrs是怎么创建的
/*
 * Declare a local command buffer called _name and link it in front of _link.
 *
 * Expands to three declarations:
 *  - a const holding the total attr count (own _num_attrs plus whatever the
 *    _link chain already contains),
 *  - an array of ibv_command_buffer large enough to hold that many attrs in
 *    the flexible hdr.attrs[] of its first element,
 *  - an unused dummy int whose initializer runs _ioctl_init_cmdb(), so the
 *    buffer is initialized while still inside the declaration section.
 */
#define DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs, \
_link) \
const unsigned int __##_name##total = \
_ioctl_final_num_attrs(_num_attrs, _link); \
struct ibv_command_buffer _name[_IOCTL_NUM_CMDB(__##_name##total)]; \
int __attribute__((unused)) __##_name##dummy = _ioctl_init_cmdb( \
_name, _object_id, _method_id, __##_name##total, _link)
_ioctl_final_num_attrs就是遍历ibv_command_buffer链表,累加链表上每个buffer已经使用的attr个数,再加上自己的_num_attrs;不过此时传入的_link为NULL,因此结果就是_num_attrs,即1。
/*
 * Total number of attrs a new buffer must be able to hold.
 *
 * If _link is known at compile time to be NULL the result is simply
 * _num_attrs; otherwise the link chain is walked at runtime by
 * __ioctl_final_num_attrs().
 */
#define _ioctl_final_num_attrs(_num_attrs, _link) \
((__builtin_constant_p(!(_link)) && !(_link)) \
? (_num_attrs) \
: __ioctl_final_num_attrs(_num_attrs, _link))
/*
 * Return num_attrs plus the number of attributes already filled in every
 * buffer reachable from link (each buffer contributes
 * next_attr - hdr.attrs entries).
 */
unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,
				     struct ibv_command_buffer *link)
{
	unsigned int total = num_attrs;

	while (link) {
		total += link->next_attr - link->hdr.attrs;
		link = link->next;
	}

	return total;
}
hdr里的attrs为变长的柔性数组,所以_IOCTL_NUM_CMDB会计算有几个ibv_command_buffer才能足够存的下_num_attrs个attr。
/*
 * Number of struct ibv_command_buffer array elements needed so that the
 * first element plus _num_attrs trailing ib_uverbs_attr entries fit:
 * one element for the buffer itself, plus the attr storage rounded up to
 * whole elements.
 */
#define _IOCTL_NUM_CMDB(_num_attrs)                                            \
	(1 + (sizeof(struct ib_uverbs_attr) * (_num_attrs) +                   \
	      sizeof(struct ibv_command_buffer) - 1) /                         \
		     sizeof(struct ibv_command_buffer))
然后申请数组并初始化,设置hdr的object_id和method_id,将当前ibv_command_buffer通过next链接link,next_attr指向attrs的第一个,last_attr指向attrs的第num_attrs个
/*
 * Build an initialized struct ibv_command_buffer value.
 *
 * _hdr        the hdr member of the destination buffer; its attrs[] array
 *             provides the storage that next_attr/last_attr point into
 * _object_id  object the method belongs to
 * _method_id  method to invoke
 * _num_attrs  number of attr slots reserved behind _hdr
 * _link       buffer to chain after this one, or NULL
 *
 * No UHW attribute is registered initially (both indexes are
 * _UHW_NO_INDEX). Every macro argument is parenthesized so that compound
 * expressions (e.g. a conditional passed as _num_attrs) are evaluated as a
 * unit instead of being split by operator precedence.
 */
#define _COMMAND_BUFFER_INIT(_hdr, _object_id, _method_id, _num_attrs, _link)  \
	((struct ibv_command_buffer){                                          \
		.hdr =                                                         \
			{                                                      \
				.object_id = (_object_id),                     \
				.method_id = (_method_id),                     \
			},                                                     \
		.next = (_link),                                               \
		.uhw_in_idx = _UHW_NO_INDEX,                                   \
		.uhw_out_idx = _UHW_NO_INDEX,                                  \
		.next_attr = (_hdr).attrs,                                     \
		.last_attr = (_hdr).attrs + (_num_attrs)})
static inline int _ioctl_init_cmdb(struct ibv_command_buffer *cmd,
uint16_t object_id, uint16_t method_id,
size_t num_attrs,
struct ibv_command_buffer *link)
{
*cmd = _COMMAND_BUFFER_INIT(cmd->hdr, object_id, method_id, num_attrs,
link);
return 0;
}
cmd
创建一个mlx5_create_cq_ex cmd_ex,然后开始设置cmd_ex的mlx5_ib_create_cq部分,记录厂商特有的参数,比如cq buffer地址等。
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
const struct ibv_cq_init_attr_ex *cq_attr,
int cq_alloc_flags,
struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
...
struct mlx5_create_cq_ex cmd_ex = {};
struct mlx5_create_cq_ex_resp resp_ex = {};
struct mlx5_ib_create_cq *cmd_drv;
struct mlx5_ib_create_cq_resp *resp_drv;
...
cmd_drv = &cmd_ex.drv_payload;
resp_drv = &resp_ex.drv_payload;
...
cmd_drv->buf_addr = (uintptr_t) cq->buf_a.buf;
cmd_drv->db_addr = (uintptr_t) cq->dbrec;
cmd_drv->cqe_size = cqe_sz;
...
{
struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr;
cq_attr_ex.cqe = ncqe - 1;
ret = ibv_cmd_create_cq_ex2(context, &cq_attr_ex, &cq->verbs_cq,
&cmd_ex.ibv_cmd, sizeof(cmd_ex),
&resp_ex.ibv_resp, sizeof(resp_ex),
CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX,
driver_attrs);
}
...
}
然后看下mlx5_create_cq_ex和mlx5_create_cq_ex_resp是怎么来的,以mlx5_create_cq_ex为例,drv_payload为mlx5_ib_create_cq
/*
 * Defines struct mlx5_create_cq_ex: the core extended create-CQ command
 * followed by the mlx5 driver payload (struct mlx5_ib_create_cq).
 */
DECLARE_DRV_CMD(mlx5_create_cq_ex, IB_USER_VERBS_EX_CMD_CREATE_CQ,
mlx5_ib_create_cq, mlx5_ib_create_cq_resp);
#define DECLARE_DRV_CMD(_name, _enum, _kabi_req, _kabi_resp) \
struct _name { \
IBV_ABI_REQ(_enum) ibv_cmd; \
union { \
_STRUCT_##_kabi_req; \
struct _kabi_req drv_payload; \
}; \
}; \
/*
 * mlx5-specific (driver) part of the create-CQ request; this is the
 * drv_payload of struct mlx5_create_cq_ex.
 */
struct mlx5_ib_create_cq {
/* User-space address of the CQ buffer */
__aligned_u64 buf_addr;
/* User-space address of the CQ doorbell record */
__aligned_u64 db_addr;
/* Size of one CQE */
__u32 cqe_size;
__u8 cqe_comp_en;
__u8 cqe_comp_res_format;
__u16 flags;
__u16 uar_page_index;
__u16 reserved0;
__u32 reserved1;
};
接着看下ibv_cmd是怎么来的,定义如下,包含ex_hdr和_kabi,_kabi就是ib_uverbs_ex_create_cq。
然后对ibv_create_cq_ex进行typedef,因此ibv_cmd就是ibv_create_cq_ex 。
/*
 * Defines struct ibv_create_cq_ex, whose core payload is
 * struct ib_uverbs_ex_create_cq.
 */
DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_CQ, ibv_create_cq_ex, ib_uverbs_ex_create_cq);
/*
 * Convenience wrapper around DECLARE_CMD_EXX that derives the response type
 * name by appending _resp to the request kABI struct name.
 */
#define DECLARE_CMD_EX(_enum, _name, _kabi) \
DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi##_resp)
#define DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi_resp) \
struct _name { \
struct ex_hdr hdr; \
union { \
_STRUCT_##_kabi; \
struct _kabi core_payload; \
}; \
}; \
typedef struct _name IBV_ABI_REQ(_enum); \
typedef struct _kabi IBV_KABI_REQ(_enum); \
typedef struct _kabi_resp IBV_KABI_RESP(_enum); \
/*
 * Core (vendor-independent) payload of the extended create-CQ command;
 * passed as the _kabi argument of DECLARE_CMD_EX for ibv_create_cq_ex.
 */
struct ib_uverbs_ex_create_cq {
__aligned_u64 user_handle;
__u32 cqe;
__u32 comp_vector;
__s32 comp_channel;
__u32 comp_mask;
__u32 flags; /* bitmask of ib_uverbs_ex_create_cq_flags */
__u32 reserved;
};
设置完drv_payload后执行ibv_cmd_create_cq_ex2
/*
 * Create a CQ on behalf of a provider driver.
 *
 * A command buffer is declared and linked in front of the driver's own
 * attribute buffer (driver); the part of cmd/resp that extends past the core
 * structs is registered on it as UHW data. The request is then handed to
 * ibv_icmd_create_cq_ex().
 *
 * cmd_size and resp_size are the sizes of the complete (core + driver)
 * request and response structures.
 *
 * Returns the value of ibv_icmd_create_cq_ex().
 */
int ibv_cmd_create_cq_ex2(struct ibv_context *context,
			  const struct ibv_cq_init_attr_ex *cq_attr,
			  struct verbs_cq *cq,
			  struct ibv_create_cq_ex *cmd,
			  size_t cmd_size,
			  struct ib_uverbs_ex_create_cq_resp *resp,
			  size_t resp_size,
			  uint32_t cmd_flags,
			  struct ibv_command_buffer *driver)
{
	int rc;

	DECLARE_CMD_BUFFER_LINK_COMPAT(cmdb, UVERBS_OBJECT_CQ,
				       UVERBS_METHOD_CQ_CREATE,
				       driver, cmd, cmd_size, resp, resp_size);

	rc = ibv_icmd_create_cq_ex(context, cq_attr, cq, cmdb, cmd_flags);
	return rc;
}
初始化一个ibv_command_buffer cmdb,链接到driver_attrs的前边,然后执行_write_set_uhw
/*
 * Declare a command buffer _name with two attr slots, linked in front of
 * _link, and register the driver-specific tails of cmd and resp on it via
 * _write_set_uhw(). sizeof(*cmd) / sizeof(*resp) are the sizes of the core
 * structs, cmd_size / resp_size the sizes of the complete structs.
 */
#define DECLARE_CMD_BUFFER_LINK_COMPAT(_name, _object_id, _method_id, \
_link, cmd, cmd_size, \
resp, resp_size) \
DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, 2, _link); \
_write_set_uhw(_name, cmd, sizeof(*cmd), cmd_size, resp, \
sizeof(*resp), resp_size)
前边说到需要将cmd转成ibv_command_buffer,就是通过_write_set_uhw做的,core_req_size为ibv_create_cq_ex的大小,req_size为mlx5_create_cq_ex的大小,因此这里的fill_attr_in就是将mlx5_create_cq_ex中drv_payload的地址记录到cmdb的attrs中,uhw_in_idx表示记录到attrs的第几个,这样就将cmd的地址作为一个attr记录到了ibv_command_buffer里。
void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
size_t core_req_size, size_t req_size, void *resp,
size_t core_resp_size, size_t resp_size)
{
if (req && core_req_size < req_size) {
if (VERBS_IOCTL_ONLY)
cmdb->uhw_in_idx =
fill_attr_in(cmdb, UVERBS_ATTR_UHW_IN,
(uint8_t *)req + core_req_size,
req_size - core_req_size) -
cmdb->hdr.attrs;
else
cmdb->uhw_in_idx =
_fill_attr_in_uhw(cmdb, UVERBS_ATTR_UHW_IN,
(uint8_t *)req +
core_req_size,
req_size - core_req_size) -
cmdb->hdr.attrs;
cmdb->uhw_in_headroom_dwords = __check_divide(core_req_size, 4);
}
...
}
/*
 * Append an input attribute that refers to the driver (UHW) request data.
 *
 * The next free attr slot of cmd is taken for attr_id and filled with the
 * pointer and length of the data. Returns the slot that was filled.
 */
static inline struct ib_uverbs_attr *
_fill_attr_in_uhw(struct ibv_command_buffer *cmd, uint16_t attr_id,
		  const void *data, size_t len)
{
	struct ib_uverbs_attr *slot = _ioctl_next_attr(cmd, attr_id);

	/*
	 * The len field is only 16 bits wide: record the overflow on the
	 * buffer. The (truncated) length is still stored below.
	 */
	if (unlikely(len > UINT16_MAX))
		cmd->buffer_error = 1;

	slot->data = ioctl_ptr_to_u64(data);
	slot->len = len;

	return slot;
}
ibv_icmd_create_cq_ex最终会调用到ibv_icmd_create_cq。这里又创建了一个新的cmdb,链接到之前的cmdb前,设置各种参数到cmdb的attrs中,然后执行execute_ioctl_fallback
static int ibv_icmd_create_cq(struct ibv_context *context, int cqe,
struct ibv_comp_channel *channel, int comp_vector,
uint32_t flags, struct ibv_cq *cq,
struct ibv_command_buffer *link,
uint32_t cmd_flags)
{
DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 8, link);
struct verbs_ex_private *priv = get_priv(context);
struct ib_uverbs_attr *handle;
struct ib_uverbs_attr *async_fd_attr;
uint32_t resp_cqe;
int ret;
cq->context = context;
handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_CQ_HANDLE);
fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &resp_cqe);
fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_CQE, cqe);
fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_CQ_USER_HANDLE, (uintptr_t)cq);
if (channel)
fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, channel->fd);
fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, comp_vector);
async_fd_attr = fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_EVENT_FD, context->async_fd);
if (priv->imported)
fallback_require_ioctl(cmdb);
else
/* Prevent fallback to the 'write' mode if kernel doesn't support it */
attr_optional(async_fd_attr);
if (flags) {
if ((flags & ~IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) ||
(!(cmd_flags & CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX)))
fallback_require_ex(cmdb);
fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_FLAGS, flags);
}
switch (execute_ioctl_fallback(cq->context, create_cq, cmdb, &ret)) {
...
}
int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd)
{
struct verbs_context *vctx = verbs_get_ctx(context);
prepare_attrs(cmd);
cmd->hdr.length = sizeof(cmd->hdr) +
sizeof(cmd->hdr.attrs[0]) * cmd->hdr.num_attrs;
cmd->hdr.reserved1 = 0;
cmd->hdr.reserved2 = 0;
cmd->hdr.driver_id = vctx->priv->driver_id;
if (ioctl(context->cmd_fd, RDMA_VERBS_IOCTL, &cmd->hdr))
return errno;
finalize_attrs(cmd);
return 0;
}
prepare_attrs遍历所有的ibv_command_buffer,将所有的attrs打平到第一个buffer里,然后设置hdr中的长度,最后执行ioctl,用户态的逻辑就完成了。
static void prepare_attrs(struct ibv_command_buffer *cmd)
{
struct ib_uverbs_attr *end = cmd->next_attr;
struct ibv_command_buffer *link;
for (link = cmd->next; link; link = link->next) {
struct ib_uverbs_attr *cur;
assert(cmd->hdr.object_id == link->hdr.object_id);
assert(cmd->hdr.method_id == link->hdr.method_id);
/*
* Keep track of where the uhw_in lands in the final array if
* we copy it from a link
*/
if (!VERBS_IOCTL_ONLY && link->uhw_in_idx != _UHW_NO_INDEX) {
assert(cmd->uhw_in_idx == _UHW_NO_INDEX);
cmd->uhw_in_idx =
link->uhw_in_idx + (end - cmd->hdr.attrs);
}
for (cur = link->hdr.attrs; cur != link->next_attr; cur++)
*end++ = *cur;
assert(end <= cmd->last_attr);
}
cmd->hdr.num_attrs = end - cmd->hdr.attrs;
if (!VERBS_IOCTL_ONLY && cmd->uhw_in_idx != _UHW_NO_INDEX) {
struct ib_uverbs_attr *uhw = &cmd->hdr.attrs[cmd->uhw_in_idx];
assert(uhw->attr_id == UVERBS_ATTR_UHW_IN);
if (uhw->len <= sizeof(uhw->data))
memcpy(&uhw->data, (void *)(uintptr_t)uhw->data,
uhw->len);
}
}
内核态
初始化
内核有object,method,attr三个概念,cq就对应一个object,其他比如qp,mr都对应不同的object,通过object_id区分;cq这个object有多个method,比如create_cq,destroy_cq,通过method_id区分;create_cq需要多个参数,比如cqe,这里每个参数就是一个attr,通过attr_id区分。
首先定义create_cq的method,method_id为UVERBS_METHOD_CQ_CREATE,这个method的handler为UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE),即实际会执行的函数。
DECLARE_UVERBS_NAMED_METHOD(
UVERBS_METHOD_CQ_CREATE,
UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE,
UVERBS_OBJECT_CQ,
UVERBS_ACCESS_NEW,
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE,
UVERBS_ATTR_TYPE(u32),
UA_MANDATORY),
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE,
UVERBS_ATTR_TYPE(u64),
UA_MANDATORY),
......
UVERBS_ATTR_UHW());
/* Static description of one method of an object: its id, attrs and handler. */
struct uverbs_method_def {
/* Method id, combined with the object key to form the radix tree key */
u16 id;
/* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */
u32 flags;
/* Number of entries in *attrs */
size_t num_attrs;
/* Pointer to an array of attribute definitions; entries may be NULL */
const struct uverbs_attr_def * const (*attrs)[];
/* Function run for this method with the parsed attribute bundle */
int (*handler)(struct uverbs_attr_bundle *attrs);
};
/*
 * Define a method: a static array holding the attribute definitions given as
 * variadic arguments, and a static struct uverbs_method_def that references
 * the array and uses UVERBS_HANDLER(_method_id) as its handler.
 */
#define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...) \
static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS( \
_method_id)[] = { __VA_ARGS__ }; \
static const struct uverbs_method_def UVERBS_METHOD(_method_id) = { \
.id = _method_id, \
.handler = UVERBS_HANDLER(_method_id), \
.num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)), \
.attrs = &UVERBS_METHOD_ATTRS(_method_id), \
}
然后看下method的attr,对于用户态传进来的一个attr,内核应该如何去解析这块内存,就是通过uverbs_attr_spec,指示内核去解析用户attr的什么字段。
/* Definition of one method attribute: its id and how to parse it. */
struct uverbs_attr_def {
/* Attribute id, matched against ib_uverbs_attr.attr_id from user space */
u16 id;
/* Parsing rules; copied into the radix tree slot of the attribute */
struct uverbs_attr_spec attr;
};
/*
 * Tells the kernel how to interpret an attribute supplied by user space.
 * `type` selects which members of the unions u and u2 are meaningful.
 */
struct uverbs_attr_spec {
/* One of the UVERBS_ATTR_TYPE_* values */
u8 type;
/*
* Support extending attributes by length. Allow the user to provide
* more bytes than ptr.len, but check that everything after is zero'd
* by the user.
*/
u8 zero_trailing:1;
/*
* Valid only for PTR_IN. Allocate and copy the data inside
* the parser
*/
u8 alloc_and_copy:1;
u8 mandatory:1;
/* True if this is from UVERBS_ATTR_UHW */
u8 is_udata:1;
union {
struct {
/* Current known size to kernel */
u16 len;
/* User isn't allowed to provide something < min_len */
u16 min_len;
} ptr;
struct {
/*
* higher bits mean the namespace and lower bits mean
* the type id within the namespace.
*/
u16 obj_type;
u8 access;
} obj;
struct {
u8 num_elems;
} enum_def;
} u;
/* This weird split lets us remove some padding */
union {
struct {
/*
* The enum attribute can select one of the attributes
* contained in the ids array. Currently only PTR_IN
* attributes are supported in the ids array.
*/
const struct uverbs_attr_spec *ids;
} enum_def;
struct {
/*
* higher bits mean the namespace and lower bits mean
* the type id within the namespace.
*/
u16 obj_type;
u16 min_len;
u16 max_len;
u8 access;
} objs_arr;
} u2;
};
以上述定义method时uhw attr为例,就是定义了一个uverbs_attr_def,其中spec的type为UVERBS_ATTR_TYPE_PTR_IN,is_udata为1。
/*
 * Attribute definition for the driver-private (UHW) input data: an optional
 * PTR_IN attribute with a minimum size of 0, flagged as udata.
 *
 * NOTE(review): the expansion ends in a trailing comma, which suggests that
 * this excerpt was trimmed and further definitions follow in the full
 * macro - confirm against the complete source.
 */
#define UVERBS_ATTR_UHW() \
UVERBS_ATTR_PTR_IN(UVERBS_ATTR_UHW_IN, \
UVERBS_ATTR_MIN_SIZE(0), \
UA_OPTIONAL, \
.is_udata = 1),
/*
 * Build a pointer to a constant struct uverbs_attr_def for an input pointer
 * attribute. _type and the variadic arguments are spliced in as additional
 * initializers of the embedded uverbs_attr_spec.
 */
#define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...) \
(&(const struct uverbs_attr_def){ \
.id = _attr_id, \
.attr = { .type = UVERBS_ATTR_TYPE_PTR_IN, \
_type, \
__VA_ARGS__ } })
然后开始定义cq的object,创建uverbs_object_def,其中methods指向传入的数组,即create_cq和destroy_cq。
/* The CQ object definition: its type attrs plus the create and destroy methods. */
DECLARE_UVERBS_NAMED_OBJECT(
UVERBS_OBJECT_CQ,
UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
&UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
&UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
);
/* Static description of an object: its id, type attributes and methods. */
struct uverbs_object_def {
/* Object id, shifted into the radix tree key by uapi_key_obj() */
u16 id;
/* May be NULL, in which case no type information is merged */
const struct uverbs_obj_type *type_attrs;
/* Number of entries in *methods */
size_t num_methods;
/* Pointer to an array of method definitions; entries may be NULL */
const struct uverbs_method_def * const (*methods)[];
};
/*
 * Define an object: a static array holding the method definitions given as
 * variadic arguments, and a static struct uverbs_object_def that references
 * the array and the address of _type_attrs.
 */
#define DECLARE_UVERBS_NAMED_OBJECT(_object_id, _type_attrs, ...) \
static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS( \
_object_id)[] = { __VA_ARGS__ }; \
static const struct uverbs_object_def UVERBS_OBJECT(_object_id) = { \
.id = _object_id, \
.type_attrs = &_type_attrs, \
.num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)), \
.methods = &UVERBS_OBJECT_METHODS(_object_id) \
}
然后定义cq相关的uapi_definition,就是创建了一个uapi_definition,其中chain_obj_tree指向前边创建的cq object。
/*
 * uapi definition list for the CQ object: one entry chaining to the CQ
 * object tree, plus whatever UAPI_DEF_OBJ_NEEDS_FN(destroy_cq) expands to.
 * The empty trailing entry presumably serves as the end-of-list marker
 * (UAPI_DEF_END) - verify against the enum definition.
 */
const struct uapi_definition uverbs_def_obj_cq[] = {
UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
{}
};
/*
 * Like UAPI_DEF_CHAIN_OBJ_TREE, but takes the object pointer from the
 * object declared under the same enum via UVERBS_OBJECT(_object_enum).
 */
#define UAPI_DEF_CHAIN_OBJ_TREE_NAMED(_object_enum, ...) \
UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, &UVERBS_OBJECT(_object_enum), \
##__VA_ARGS__)
/*
 * Expand to a uapi_definition initializer of kind UAPI_DEF_CHAIN_OBJ_TREE
 * that points at the object definition _object_ptr, followed by any extra
 * initializers passed as variadic arguments.
 */
#define UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, _object_ptr, ...) \
{ \
.kind = UAPI_DEF_CHAIN_OBJ_TREE, \
.object_start = { .object_id = _object_enum }, \
.chain_obj_tree = _object_ptr, \
}, \
##__VA_ARGS__
类似的,将其他的object相关的uapi_definition添加到数组uverbs_core_api中,UAPI_DEF_CHAIN就是新建了一个uapi_definition,然后将chain指向下一个uapi_definition。
/*
 * Top-level list of the core uverbs API: each entry chains to the definition
 * list of one object (or interface). This is the list merged into the radix
 * tree by uverbs_alloc_api(). The empty trailing entry presumably serves as
 * the end-of-list marker - verify against the enum definition.
 */
static const struct uapi_definition uverbs_core_api[] = {
UAPI_DEF_CHAIN(uverbs_def_obj_async_fd),
UAPI_DEF_CHAIN(uverbs_def_obj_counters),
UAPI_DEF_CHAIN(uverbs_def_obj_cq),
UAPI_DEF_CHAIN(uverbs_def_obj_device),
UAPI_DEF_CHAIN(uverbs_def_obj_dm),
UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
UAPI_DEF_CHAIN(uverbs_def_obj_intf),
UAPI_DEF_CHAIN(uverbs_def_obj_mr),
UAPI_DEF_CHAIN(uverbs_def_obj_qp),
UAPI_DEF_CHAIN(uverbs_def_obj_srq),
UAPI_DEF_CHAIN(uverbs_def_obj_wq),
UAPI_DEF_CHAIN(uverbs_def_write_intf),
{},
};
/*
 * Expand to a uapi_definition initializer of kind UAPI_DEF_CHAIN whose chain
 * member points at another definition list (_def_var).
 */
#define UAPI_DEF_CHAIN(_def_var) \
{ \
.kind = UAPI_DEF_CHAIN, .chain = _def_var, \
}
到这里就完成了cq相关object,method等的创建,然后开始添加到radix tree。
创建uverbs_api *uapi,uverbs_api用于保存所有的api,内部的radix为radix tree,所有的api会被加入到radix tree中。
然后通过uapi_merge_def添加uverbs_core_api。
struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
{
struct uverbs_api *uapi;
int rc;
uapi = kzalloc(sizeof(*uapi), GFP_KERNEL);
if (!uapi)
return ERR_PTR(-ENOMEM);
INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
uapi->driver_id = ibdev->ops.driver_id;
rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
...
}
/* Per-device container of the merged uverbs API (objects, methods, attrs). */
struct uverbs_api {
/* radix tree contains struct uverbs_api_* pointers */
struct radix_tree_root radix;
/* Taken from ibdev->ops.driver_id; ioctl headers must carry the same id */
enum rdma_driver_id driver_id;
unsigned int num_write;
unsigned int num_write_ex;
struct uverbs_api_write_method notsupp_method;
const struct uverbs_api_write_method **write_methods;
const struct uverbs_api_write_method **write_ex_methods;
};
uapi_merge_def中会遍历数组uverbs_core_api,假设遍历到uverbs_def_obj_cq对应的元素,由于此时的kind为CHAIN,因此递归对chain执行uapi_merge_def。
chain指向uverbs_def_obj_cq,由于kind为UAPI_DEF_CHAIN_OBJ_TREE,于是对chain_obj_tree执行uapi_merge_obj_tree,chain_obj_tree指向的就是cq对应的object。
static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
const struct uapi_definition *def_list,
bool is_driver)
{
const struct uapi_definition *def = def_list;
u32 cur_obj_key = UVERBS_API_KEY_ERR;
u32 cur_method_key = UVERBS_API_KEY_ERR;
bool exists;
int rc;
if (!def_list)
return 0;
for (;; def++) {
switch ((enum uapi_definition_kind)def->kind) {
case UAPI_DEF_CHAIN:
rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
if (rc)
return rc;
continue;
case UAPI_DEF_CHAIN_OBJ_TREE:
if (WARN_ON(def->object_start.object_id !=
def->chain_obj_tree->id))
return -EINVAL;
cur_obj_key = uapi_key_obj(def->object_start.object_id);
rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
is_driver);
if (rc)
return rc;
continue;
case UAPI_DEF_END:
return 0;
...
WARN_ON(true);
return -EINVAL;
}
}
uapi_merge_obj_tree会将object,method,attr分别插入到radix tree。
这里说下插入radix tree的key如何计算,key一共为16位,key的最低6位为attr_id,中间5位为method_id,高5位为object_id。
然后通过uapi_key_obj获取object的key obj_key,就是将object_id左移到高5位,然后通过uapi_add_get_elm插入到radix tree中,slot为obj_elm,然后对obj中的每一个method循环执行uapi_merge_method。
static int uapi_merge_obj_tree(struct uverbs_api *uapi,
const struct uverbs_object_def *obj,
bool is_driver)
{
struct uverbs_api_object *obj_elm;
unsigned int i;
u32 obj_key;
bool exists;
int rc;
obj_key = uapi_key_obj(obj->id);
obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
if (IS_ERR(obj_elm))
return PTR_ERR(obj_elm);
if (obj->type_attrs) {
if (WARN_ON(obj_elm->type_attrs))
return -EINVAL;
obj_elm->id = obj->id;
obj_elm->type_attrs = obj->type_attrs;
obj_elm->type_class = obj->type_attrs->type_class;
if (WARN_ON(is_driver &&
obj->type_attrs->type_class != &uverbs_idr_class &&
obj->type_attrs->type_class != &uverbs_fd_class))
return -EINVAL;
}
if (!obj->methods)
return 0;
for (i = 0; i != obj->num_methods; i++) {
const struct uverbs_method_def *method = (*obj->methods)[i];
if (!method)
continue;
rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
is_driver);
if (rc)
return rc;
}
return 0;
}
然后看下uapi_merge_method,先通过obj_key和method->id拼出来method_key,然后继续通过uapi_add_get_elm将method_key插入radix tree,slot为method_elm,然后将handler设置到method_elm中。
然后对于method的所有attr,将index设置为method_key | attr_key,然后也插入radix tree。
/*
 * Merge one method definition, and all of its attributes, into the uapi
 * radix tree.
 *
 * The method element lives under obj_key | method key. A newly created
 * element takes over the handler of the definition; merging into an already
 * existing element is only permitted for definitions without a handler.
 * Each non-NULL attribute is then added under method_key | attr key with a
 * copy of its spec.
 *
 * Returns 0 on success or a negative errno.
 */
static int uapi_merge_method(struct uverbs_api *uapi,
			     struct uverbs_api_object *obj_elm, u32 obj_key,
			     const struct uverbs_method_def *method,
			     bool is_driver)
{
	const u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
	struct uverbs_api_ioctl_method *method_elm;
	unsigned int idx;
	bool exists;

	if (!method->attrs)
		return 0;

	method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
				      &exists);
	if (IS_ERR(method_elm))
		return PTR_ERR(method_elm);

	if (!exists) {
		WARN_ON(!method->handler);
		rcu_assign_pointer(method_elm->handler, method->handler);
		if (method->handler != uverbs_destroy_def_handler)
			method_elm->driver_method = is_driver;
	} else if (WARN_ON(method->handler)) {
		/*
		 * An existing element is hit when a driver uses
		 * ADD_UVERBS_ATTRIBUTES_SIMPLE; such a definition must not
		 * bring its own handler.
		 */
		return -EINVAL;
	}

	for (idx = 0; idx < method->num_attrs; idx++) {
		const struct uverbs_attr_def *attr = (*method->attrs)[idx];
		struct uverbs_api_attr *attr_slot;

		if (!attr)
			continue;

		switch (attr->attr.type) {
		case UVERBS_ATTR_TYPE_ENUM_IN:
			/*
			 * ENUM_IN contains the 'ids' pointer to the driver's
			 * .rodata, so if it is specified by a driver then it
			 * always makes this into a driver method.
			 */
			method_elm->driver_method |= is_driver;
			break;
		case UVERBS_ATTR_TYPE_IDRS_ARRAY: {
			/*
			 * Like other uobject based things we only support a
			 * single uobject being NEW'd or DESTROY'd
			 */
			u8 access = attr->attr.u2.objs_arr.access;

			if (WARN_ON(access == UVERBS_ACCESS_NEW ||
				    access == UVERBS_ACCESS_DESTROY))
				return -EINVAL;
			break;
		}
		default:
			break;
		}

		attr_slot =
			uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
				     sizeof(*attr_slot));
		/* Attributes are not allowed to be modified by drivers */
		if (IS_ERR(attr_slot))
			return PTR_ERR(attr_slot);

		attr_slot->spec = attr->attr;
	}

	return 0;
}
运行
前边用户态已经看到执行了ioctl,将cmd和参数传到了内核态,现在看下内核态如何执行。
通过copy_from_user将参数拷贝到内核态的hdr,注意这时候attr还没拷贝进来,然后执行ib_uverbs_cmd_verbs
long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
struct ib_uverbs_file *file = filp->private_data;
struct ib_uverbs_ioctl_hdr __user *user_hdr =
(struct ib_uverbs_ioctl_hdr __user *)arg;
struct ib_uverbs_ioctl_hdr hdr;
int srcu_key;
int err;
err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
...
srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
return err;
}
通过object id和method id查找radix tree获得到method_elm。
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
struct ib_uverbs_ioctl_hdr *hdr,
struct ib_uverbs_attr __user *user_attrs)
{
const struct uverbs_api_ioctl_method *method_elm;
struct uverbs_api *uapi = ufile->device->uapi;
struct radix_tree_iter attrs_iter;
struct bundle_priv *pbundle;
struct bundle_priv onstack;
void __rcu **slot;
int ret;
if (unlikely(hdr->driver_id != uapi->driver_id))
return -EINVAL;
#ifdef HAVE_RADIX_TREE_ITER_LOOKUP
slot = radix_tree_iter_lookup(
&uapi->radix, &attrs_iter,
uapi_key_obj(hdr->object_id) |
uapi_key_ioctl_method(hdr->method_id));
#else
radix_tree_iter_init(&attrs_iter, uapi_key_obj(hdr->object_id) |
uapi_key_ioctl_method(hdr->method_id));
slot = radix_tree_next_chunk(&uapi->radix, &attrs_iter, RADIX_TREE_ITER_CONTIG);
#endif
if (unlikely(!slot))
return -EPROTONOSUPPORT;
method_elm = rcu_dereference_protected(*slot, true);
...
}
bundle_priv用于存储所有用户传进来的参数,由于不同method的attrs个数不一样,因此需要动态分配内存存储attr,为了优化小的分配,bundle_priv内部预留了栈上的internal_buffer,后续假设internal_buffer是足够的,即use_stack为1。
然后设置pbundle的各个参数,比如method_elm,method_key,其中user_attrs指向了用户传入的attr。
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
struct ib_uverbs_ioctl_hdr *hdr,
struct ib_uverbs_attr __user *user_attrs)
{
struct bundle_priv *pbundle;
struct bundle_priv onstack;
...
if (!method_elm->use_stack) {
pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
if (!pbundle)
return -ENOMEM;
pbundle->internal_avail =
method_elm->bundle_size -
offsetof(struct bundle_priv, internal_buffer);
pbundle->alloc_head.next = NULL;
pbundle->allocated_mem = &pbundle->alloc_head;
} else {
pbundle = &onstack;
pbundle->internal_avail = sizeof(pbundle->internal_buffer);
pbundle->allocated_mem = NULL;
}
/* Space for the pbundle->bundle.attrs flex array */
pbundle->method_elm = method_elm;
pbundle->method_key = attrs_iter.index;
pbundle->bundle.ufile = ufile;
pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
pbundle->radix = &uapi->radix;
pbundle->radix_slots = slot;
pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
pbundle->user_attrs = user_attrs;
pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
sizeof(*pbundle->bundle.attrs),
sizeof(*pbundle->internal_buffer));
...
ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
bundle_destroy(pbundle, ret == 0);
return ret;
}
然后执行ib_uverbs_run_method,将method的handler保存到handler,通过用户的num_attrs可以知道需要的内存大小uattrs_size,然后执行uverbs_alloc分配内存到uattrs,最后通过copy_from_user将用户的attr拷贝到uattrs。
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
unsigned int num_attrs)
{
int (*handler)(struct uverbs_attr_bundle *attrs);
size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
unsigned int i;
int ret;
/* See uverbs_disassociate_api() */
handler = srcu_dereference(
pbundle->method_elm->handler,
&pbundle->bundle.ufile->device->disassociate_srcu);
if (!handler)
return -EIO;
pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
if (IS_ERR(pbundle->uattrs))
return PTR_ERR(pbundle->uattrs);
if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
return -EFAULT;
...
}
然后通过uverbs_set_attr解析用户的attr。
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
unsigned int num_attrs)
{
...
for (i = 0; i != num_attrs; i++) {
ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
if (unlikely(ret))
return ret;
}
...
}
首先通过uapi_get_attr_for_method查找radix tree中的attr,然后执行uverbs_process_attr
/*
 * Parse a single user attribute into the bundle.
 *
 * The attribute spec is looked up for the current method. Unknown
 * attributes are ignored unless user space flagged them as mandatory;
 * attributes that were already seen are rejected. On success the attribute
 * is marked as present in the bundle.
 *
 * Returns 0 on success, -EPROTONOSUPPORT for an unsupported mandatory
 * attribute, -EINVAL for a duplicate, or the error returned by
 * uverbs_process_attr().
 */
static int uverbs_set_attr(struct bundle_priv *pbundle,
			   struct ib_uverbs_attr *uattr)
{
	const u32 key = uapi_key_attr(uattr->attr_id);
	const u32 bkey = uapi_bkey_attr(key);
	const struct uverbs_api_attr *spec_elm;
	void __rcu **slot;
	int rc;

	slot = uapi_get_attr_for_method(pbundle, key);
	if (!slot) {
		/*
		 * The kernel does not know this attribute; that is only an
		 * error when user space insists on it.
		 */
		return (uattr->flags & UVERBS_ATTR_F_MANDATORY) ?
			       -EPROTONOSUPPORT :
			       0;
	}
	spec_elm = rcu_dereference_protected(*slot, true);

	/* The same attribute must not be supplied twice */
	if (test_bit(bkey, pbundle->bundle.attr_present))
		return -EINVAL;

	rc = uverbs_process_attr(pbundle, spec_elm, uattr, bkey);
	if (!rc)
		__set_bit(bkey, pbundle->bundle.attr_present);

	return rc;
}
uverbs_process_attr就是根据spec中的type解析对应的字段到pbundle的attrs,下边展示了type为UVERBS_ATTR_TYPE_PTR_OUT的场景。
static int uverbs_process_attr(struct bundle_priv *pbundle,
const struct uverbs_api_attr *attr_uapi,
struct ib_uverbs_attr *uattr, u32 attr_bkey)
{
const struct uverbs_attr_spec *spec = &attr_uapi->spec;
struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
const struct uverbs_attr_spec *val_spec = spec;
struct uverbs_obj_attr *o_attr;
switch (spec->type) {
...
case UVERBS_ATTR_TYPE_PTR_OUT:
if (uattr->len < val_spec->u.ptr.min_len ||
(!val_spec->zero_trailing &&
uattr->len > val_spec->u.ptr.len))
return -EINVAL;
if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
uattr->attr_data.reserved)
return -EINVAL;
e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
e->ptr_attr.len = uattr->len;
...
e->ptr_attr.data = uattr->data;
...
break;
...
}
}
然后回到ib_uverbs_run_method,前边说到uhw数据会作为一个attr传进来,这里会通过uverbs_fill_udata将uhw的指针记录到driver_udata,最后执行handler就到了真正create_cq的逻辑。
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
unsigned int num_attrs)
{
...
if (pbundle->method_elm->has_udata)
uverbs_fill_udata(&pbundle->bundle,
&pbundle->bundle.driver_udata,
UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
else
pbundle->bundle.driver_udata = (struct ib_udata){};
if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
struct uverbs_obj_attr *destroy_attr =
&pbundle->bundle.attrs[destroy_bkey].obj_attr;
ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
if (ret)
return ret;
__clear_bit(destroy_bkey, pbundle->uobj_finalize);
ret = handler(&pbundle->bundle);
uobj_put_destroy(destroy_attr->uobject);
} else {
ret = handler(&pbundle->bundle);
}
...
}
最后感谢一下学习rdma过程中几位大佬的答疑(字典序)
Santiago0826,zhigang124以及一位不想透露姓名的大佬