最近梳理了一下rdma用户态到内核态传参的流程,会基于ibv_create_cq接口介绍一下ioctl版本的流程,代码基于mlnx-ofa_kernel-5.4。

用户态

用户态和内核态传的参数包含两部分,用户执行create_cq会传一些标准的参数,比如队列长度cqe,ibv_comp_channel channel等,还有另外厂商自己的参数,比如mlx5会传cq buffer的地址等。
用户态中首先是通过cmd记录厂商特有的参数,但是用户态和内核态实际进行传参是通过ibv_command_buffer,会通过attr记录每个参数,因此cmd之后会被转成ibv_command_buffer传给内核。

ibv_command_buffer

create_cq首先初始化了一个ibv_command_buffer driver_attrs

/*
 * Excerpt from mlx5's create_cq(); everything except the declaration of the
 * driver-level command buffer is elided ("...").
 */
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
                   const struct ibv_cq_init_attr_ex *cq_attr,
                   int cq_alloc_flags,
                   struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
    /*
     * Declare driver_attrs: a command buffer for UVERBS_OBJECT_CQ /
     * UVERBS_METHOD_CQ_CREATE sized for 1 attribute, with no other
     * buffer linked behind it (NULL).
     */
    DECLARE_COMMAND_BUFFER_LINK(driver_attrs, UVERBS_OBJECT_CQ,
                    UVERBS_METHOD_CQ_CREATE, 1,
                    NULL);
    ...
}

ibv_command_buffer用于存放各种属性,会被组织成为一个链表,next指向下一个ibv_command_buffer,ib_uverbs_ioctl_hdr中记录了所有的attr

/*
 * User-space container for the attributes of one ioctl command. Buffers are
 * chained through 'next'; the attributes themselves live in hdr.attrs.
 */
struct ibv_command_buffer {
    /* Next buffer in the chain, or NULL */
    struct ibv_command_buffer *next;
    /* Next free slot in hdr.attrs (initialised to hdr.attrs) */
    struct ib_uverbs_attr *next_attr;
    /* End of the usable slots (initialised to hdr.attrs + num_attrs) */
    struct ib_uverbs_attr *last_attr;
    /*
     * Used by the legacy write interface to keep track of where the UHW
     * buffer is located and the 'headroom' space that the common code
     * uses to construct the command header and common command struct
     * directly before the drivers' UHW.
     */
    uint8_t uhw_in_idx;
    uint8_t uhw_out_idx;
    uint8_t uhw_in_headroom_dwords;
    uint8_t uhw_out_headroom_dwords;

    /* Set when a fill helper is given input it cannot encode (e.g. len > UINT16_MAX) */
    uint8_t buffer_error:1;
    /*
     * These flags control what execute_ioctl_fallback does if the kernel
     * does not support ioctl
     */
    uint8_t fallback_require_ex:1;
    uint8_t fallback_ioctl_only:1;
    /* Header handed to the kernel; its flexible attrs[] array follows */
    struct ib_uverbs_ioctl_hdr hdr;
};

/*
 * Header passed to the RDMA_VERBS_IOCTL ioctl, followed directly by the
 * attribute array.
 */
struct ib_uverbs_ioctl_hdr {
    /* sizeof(hdr) + num_attrs * sizeof(attrs[0]), set by execute_ioctl() */
    __u16 length;
    /* Selects the object (e.g. UVERBS_OBJECT_CQ) */
    __u16 object_id;
    /* Selects the method of that object (e.g. UVERBS_METHOD_CQ_CREATE) */
    __u16 method_id;
    /* Number of valid entries in attrs[], set by prepare_attrs() */
    __u16 num_attrs;
    /* Zeroed by execute_ioctl() */
    __aligned_u64 reserved1;
    /* Must equal the kernel's uapi->driver_id or the call gets -EINVAL */
    __u32 driver_id;
    /* Zeroed by execute_ioctl() */
    __u32 reserved2;
    struct ib_uverbs_attr  attrs[];
};

/*
 * One attribute of an ioctl command: an id, a length, flags, and either an
 * inline value or a pointer / handle / fd stored in the trailing union.
 */
struct ib_uverbs_attr {
    __u16 attr_id;      /* command specific type attribute */
    __u16 len;      /* only for pointers and IDRs array */
    __u16 flags;        /* combination of UVERBS_ATTR_F_XXXX */
    union { 
        struct {
            __u8 elem_id;
            __u8 reserved;
        } enum_data;
        __u16 reserved;
    } attr_data;
    union {
        /*
         * ptr to command, inline data, idr/fd or
         * ptr to __u32 array of IDRs
         */
        __aligned_u64 data;
        /* Used by FD_IN and FD_OUT */
        __s64 data_s64;
    };
};

然后看下driver_attrs是怎么创建的

/*
 * Declare a command buffer called _name in the current scope.
 *
 * The macro first computes the total attribute count (_num_attrs plus the
 * attributes already used in the buffers reachable from _link), then declares
 * an array of struct ibv_command_buffer large enough to hold one buffer plus
 * that many attributes, and finally initialises element 0 through
 * _ioctl_init_cmdb(). The dummy int exists only so the initialiser call can
 * appear in a declaration.
 */
#define DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, _num_attrs, \
                    _link)                                     \
    const unsigned int __##_name##total =                                  \
        _ioctl_final_num_attrs(_num_attrs, _link);                     \
    struct ibv_command_buffer _name[_IOCTL_NUM_CMDB(__##_name##total)];    \
    int __attribute__((unused)) __##_name##dummy = _ioctl_init_cmdb(       \
        _name, _object_id, _method_id, __##_name##total, _link)

_ioctl_final_num_attrs就是遍历_link指向的ibv_command_buffer链表,把每个buffer中已经使用的attr个数(next_attr - hdr.attrs)累加到_num_attrs上;不过此时传入的_link为NULL,链表为空,因此结果就是_num_attrs本身,即1。

/*
 * Total attribute count for a new command buffer. When the compiler can
 * prove at build time that _link is NULL the result is just _num_attrs;
 * otherwise the chain is walked at run time by __ioctl_final_num_attrs().
 */
#define _ioctl_final_num_attrs(_num_attrs, _link)                              \
    ((__builtin_constant_p(!(_link)) && !(_link))                          \
         ? (_num_attrs)                                                \
         : __ioctl_final_num_attrs(_num_attrs, _link))

/*
 * Return num_attrs plus the number of attributes already filled in every
 * command buffer reachable from link (link may be NULL, in which case
 * num_attrs is returned unchanged).
 */
unsigned int __ioctl_final_num_attrs(unsigned int num_attrs,
                     struct ibv_command_buffer *link)
{
    struct ibv_command_buffer *node = link;

    while (node) {
        /* Attributes used so far in this buffer */
        num_attrs += node->next_attr - node->hdr.attrs;
        node = node->next;
    }

    return num_attrs;
}

hdr里的attrs为变长的柔性数组,所以_IOCTL_NUM_CMDB会计算有几个ibv_command_buffer才能足够存的下_num_attrs个attr。

/*
 * Number of struct ibv_command_buffer array elements needed to cover one
 * buffer plus _num_attrs trailing struct ib_uverbs_attr entries: the byte
 * total is divided by sizeof(struct ibv_command_buffer), rounding up.
 */
#define _IOCTL_NUM_CMDB(_num_attrs)                                            \
    ((sizeof(struct ibv_command_buffer) +                                  \
      sizeof(struct ib_uverbs_attr) * (_num_attrs) +                       \
      sizeof(struct ibv_command_buffer) - 1) /                             \
     sizeof(struct ibv_command_buffer))

然后申请数组并初始化,设置hdr的object_id和method_id,将当前ibv_command_buffer通过next链接到link,next_attr指向attrs的第一个元素,last_attr指向attrs + num_attrs,即可用attr空间的末尾(最后一个可用attr之后的位置)

/*
 * Compound literal giving the initial state of a command buffer: object and
 * method ids are stored in the header, the buffer is chained to _link, both
 * UHW indexes are marked unused, and next_attr / last_attr bracket the
 * _num_attrs attribute slots that start at (_hdr).attrs. Members not listed
 * are zero-initialised by the compound literal.
 */
#define _COMMAND_BUFFER_INIT(_hdr, _object_id, _method_id, _num_attrs, _link)  \
    ((struct ibv_command_buffer){                                          \
        .hdr =                                                         \
            {                                                      \
                .object_id = (_object_id),                     \
                .method_id = (_method_id),                     \
            },                                                     \
        .next = _link,                                                 \
        .uhw_in_idx = _UHW_NO_INDEX,                                   \
        .uhw_out_idx = _UHW_NO_INDEX,                                  \
        .next_attr = (_hdr).attrs,                                     \
        .last_attr = (_hdr).attrs + _num_attrs})

/*
 * Initialise *cmd from _COMMAND_BUFFER_INIT. Always returns 0; the return
 * value exists so the call can be used as a variable initialiser inside
 * DECLARE_COMMAND_BUFFER_LINK.
 */
static inline int _ioctl_init_cmdb(struct ibv_command_buffer *cmd,
                   uint16_t object_id, uint16_t method_id,
                   size_t num_attrs,
                   struct ibv_command_buffer *link)
{                   
    *cmd = _COMMAND_BUFFER_INIT(cmd->hdr, object_id, method_id, num_attrs,
                    link);
    return 0;
}
cmd

创建一个mlx5_create_cq_ex cmd_ex,然后开始设置cmd_ex的mlx5_ib_create_cq部分,记录厂商特有的参数,比如cq buffer地址等。

/*
 * Second excerpt from mlx5's create_cq(): fills the driver-specific part of
 * the create-CQ command and hands it to ibv_cmd_create_cq_ex2(). Unrelated
 * parts of the body are elided ("...").
 */
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
                   const struct ibv_cq_init_attr_ex *cq_attr,
                   int cq_alloc_flags,
                   struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
	...
    struct mlx5_create_cq_ex    cmd_ex = {};
    struct mlx5_create_cq_ex_resp   resp_ex = {};
    struct mlx5_ib_create_cq       *cmd_drv;
    struct mlx5_ib_create_cq_resp  *resp_drv;
	...
	/* Point at the driver payloads embedded in the command/response */
	cmd_drv = &cmd_ex.drv_payload;
    resp_drv = &resp_ex.drv_payload;
    ...
    /* Vendor-specific inputs: CQ buffer, doorbell record, CQE size */
    cmd_drv->buf_addr = (uintptr_t) cq->buf_a.buf;
    cmd_drv->db_addr  = (uintptr_t) cq->dbrec;
    cmd_drv->cqe_size = cqe_sz;
	...
	{
        /* Work on a copy so the caller's attributes are not modified */
        struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr;

        cq_attr_ex.cqe = ncqe - 1;
        /* sizeof(cmd_ex)/sizeof(resp_ex) cover core part + driver payload */
        ret = ibv_cmd_create_cq_ex2(context, &cq_attr_ex, &cq->verbs_cq,
                        &cmd_ex.ibv_cmd, sizeof(cmd_ex),
                        &resp_ex.ibv_resp, sizeof(resp_ex),
                        CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX,
                        driver_attrs);
    }
	...
}

然后看下mlx5_create_cq_ex和mlx5_create_cq_ex_resp是怎么来的,以mlx5_create_cq_ex为例,drv_payload为mlx5_ib_create_cq

/*
 * Defines struct mlx5_create_cq_ex: the core create-CQ request followed by
 * struct mlx5_ib_create_cq as the driver payload.
 */
DECLARE_DRV_CMD(mlx5_create_cq_ex, IB_USER_VERBS_EX_CMD_CREATE_CQ,
        mlx5_ib_create_cq, mlx5_ib_create_cq_resp);

/*
 * Define struct _name as the core request type for _enum (ibv_cmd) followed
 * by the driver request _kabi_req, reachable as drv_payload through the
 * anonymous union. _kabi_resp is not used in the part of the macro shown
 * here.
 */
#define DECLARE_DRV_CMD(_name, _enum, _kabi_req, _kabi_resp)                   \
    struct _name {                                                         \
        IBV_ABI_REQ(_enum) ibv_cmd;                                    \
        union {                                                        \
            _STRUCT_##_kabi_req;                                   \
            struct _kabi_req drv_payload;                          \
        };                                                             \
    };                                                                     \

/* mlx5 driver-specific input for CQ creation (the "drv_payload"). */
struct mlx5_ib_create_cq {
    /* User address of the CQ buffer (create_cq stores cq->buf_a.buf) */
    __aligned_u64 buf_addr;
    /* User address of the doorbell record (create_cq stores cq->dbrec) */
    __aligned_u64 db_addr;
    /* CQE size (create_cq stores cqe_sz) */
    __u32   cqe_size;
    __u8    cqe_comp_en;
    __u8    cqe_comp_res_format;
    __u16   flags;
    __u16   uar_page_index;
    __u16   reserved0;
    __u32   reserved1;
};

接着看下ibv_cmd是怎么来的,定义如下,包含ex_hdr和_kabi,_kabi就是ib_uverbs_ex_create_cq。
然后通过typedef把IBV_ABI_REQ(_enum)定义为struct ibv_create_cq_ex的别名,因此mlx5_create_cq_ex中ibv_cmd的类型就是struct ibv_create_cq_ex。

/*
 * Defines struct ibv_create_cq_ex, whose core payload is
 * struct ib_uverbs_ex_create_cq.
 */
DECLARE_CMD_EX(IB_USER_VERBS_EX_CMD_CREATE_CQ, ibv_create_cq_ex, ib_uverbs_ex_create_cq);

/* Same as DECLARE_CMD_EXX with the response type derived as <_kabi>_resp. */
#define DECLARE_CMD_EX(_enum, _name, _kabi)                                    \
    DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi##_resp)

/*
 * Define struct _name as an ex_hdr followed by the kernel ABI request _kabi
 * (reachable as core_payload through the anonymous union), and register
 * three typedefs keyed by _enum: the full request (IBV_ABI_REQ), the bare
 * kernel request (IBV_KABI_REQ) and the kernel response (IBV_KABI_RESP).
 */
#define DECLARE_CMD_EXX(_enum, _name, _kabi, _kabi_resp)                       \
    struct _name {                                                         \
        struct ex_hdr hdr;                                             \
        union {                                                        \
            _STRUCT_##_kabi;                                       \
            struct _kabi core_payload;                             \
        };                                                             \
    };                                                                     \
    typedef struct _name IBV_ABI_REQ(_enum);                               \
    typedef struct _kabi IBV_KABI_REQ(_enum);                              \
    typedef struct _kabi_resp IBV_KABI_RESP(_enum);                        \

/* Core (vendor-independent) part of the extended create-CQ request. */
struct ib_uverbs_ex_create_cq {
    __aligned_u64 user_handle;
    __u32 cqe; 
    __u32 comp_vector;
    __s32 comp_channel;
    __u32 comp_mask;
    __u32 flags;  /* bitmask of ib_uverbs_ex_create_cq_flags */
    __u32 reserved;
};              

设置完drv_payload后执行ibv_cmd_create_cq_ex2

/*
 * Create a CQ through the ioctl path.
 *
 * cmd/resp point at the core request/response; cmd_size/resp_size are the
 * sizes of the complete (core + driver) structures. 'driver' is the driver's
 * own command buffer and is chained behind the buffer declared here.
 * Returns whatever ibv_icmd_create_cq_ex() returns.
 */
int ibv_cmd_create_cq_ex2(struct ibv_context *context,
              const struct ibv_cq_init_attr_ex *cq_attr,
              struct verbs_cq *cq,
              struct ibv_create_cq_ex *cmd,
              size_t cmd_size,
              struct ib_uverbs_ex_create_cq_resp *resp,
              size_t resp_size,
              uint32_t cmd_flags,
              struct ibv_command_buffer *driver)
{
    /* Declares cmdb and records the driver payload as UHW attributes */
    DECLARE_CMD_BUFFER_LINK_COMPAT(cmdb, UVERBS_OBJECT_CQ,
                       UVERBS_METHOD_CQ_CREATE,
                       driver, cmd, cmd_size, resp, resp_size);

    return ibv_icmd_create_cq_ex(context, cq_attr, cq, cmdb, cmd_flags);
}

初始化一个ibv_command_buffer cmdb,链接到driver_attrs的前边,然后执行_write_set_uhw

/*
 * Declare command buffer _name with room for 2 attributes, chained to _link,
 * then call _write_set_uhw() with the core sizes (sizeof(*cmd),
 * sizeof(*resp)) and the full sizes (cmd_size, resp_size) so that the
 * driver-specific tail of cmd/resp can be registered as attributes.
 */
#define DECLARE_CMD_BUFFER_LINK_COMPAT(_name, _object_id, _method_id,          \
                      _link, cmd, cmd_size,               \
                      resp, resp_size)                \
   DECLARE_COMMAND_BUFFER_LINK(_name, _object_id, _method_id, 2, _link);  \
   _write_set_uhw(_name, cmd, sizeof(*cmd), cmd_size, resp,               \
              sizeof(*resp), resp_size)

前边说到需要将cmd转成ibv_command_buffer,就是通过_write_set_uhw做的,core_req_size为ibv_create_cq_ex的大小,req_size为mlx5_create_cq_ex的大小,因此这里的fill_attr_in就是将mlx5_create_cq_ex中drv_payload的地址记录到cmdb的attrs中,uhw_in_idx表示记录到attrs的第几个,这样就将cmd的地址作为一个attr记录到了ibv_command_buffer里。

/*
 * Register the driver-specific part of a request as a UVERBS_ATTR_UHW_IN
 * attribute of cmdb.
 *
 * req/req_size describe the whole request and core_req_size its common
 * prefix; the bytes after that prefix are the driver data. Nothing is added
 * when req is NULL or there is no driver data. The response-side handling is
 * elided ("...") in this excerpt.
 */
void _write_set_uhw(struct ibv_command_buffer *cmdb, const void *req,
            size_t core_req_size, size_t req_size, void *resp,
            size_t core_resp_size, size_t resp_size)
{
    if (req && core_req_size < req_size) {
        /*
         * Both branches add an attribute pointing at req + core_req_size
         * and remember its index within cmdb->hdr.attrs.
         */
        if (VERBS_IOCTL_ONLY)
            cmdb->uhw_in_idx =
                fill_attr_in(cmdb, UVERBS_ATTR_UHW_IN,
                         (uint8_t *)req + core_req_size,
                         req_size - core_req_size) -
                cmdb->hdr.attrs;
        else           
            cmdb->uhw_in_idx =
                _fill_attr_in_uhw(cmdb, UVERBS_ATTR_UHW_IN,
                          (uint8_t *)req +
                              core_req_size,
                          req_size - core_req_size) -
                cmdb->hdr.attrs;
        /* Core prefix size in 4-byte units (presumably checked for divisibility) */
        cmdb->uhw_in_headroom_dwords = __check_divide(core_req_size, 4);
    }
	...
}

/*
 * Take the next attribute slot of cmd for attr_id and make it reference the
 * len bytes at data. The pointer is always stored as a pointer, never copied
 * inline. A length that does not fit the 16-bit len field flags the buffer
 * as erroneous. Returns the attribute that was filled.
 */
static inline struct ib_uverbs_attr *
_fill_attr_in_uhw(struct ibv_command_buffer *cmd, uint16_t attr_id,
         const void *data, size_t len)
{
    struct ib_uverbs_attr *slot = _ioctl_next_attr(cmd, attr_id);

    if (unlikely(len > UINT16_MAX)) {
        /* len cannot be represented in the attribute */
        cmd->buffer_error = 1;
    }

    slot->len = len;
    slot->data = ioctl_ptr_to_u64(data);
    return slot;
}

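ibv_cmd_create_cq_ex2最后调用的ibv_icmd_create_cq_ex会进一步调用ibv_icmd_create_cq。这里又创建了一个新的cmdb,链接到之前的cmdb前,设置各种参数到cmdb的attrs中,然后执行execute_ioctl_fallback,在内核支持ioctl的情况下最终会调用到下面的execute_ioctl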

/*
 * Build the common create-CQ attributes in a new command buffer chained in
 * front of 'link' and execute the command. The handling of the result of
 * execute_ioctl_fallback() is elided ("...") in this excerpt.
 */
static int ibv_icmd_create_cq(struct ibv_context *context, int cqe,
                  struct ibv_comp_channel *channel, int comp_vector,
                  uint32_t flags, struct ibv_cq *cq,
                  struct ibv_command_buffer *link,
                  uint32_t cmd_flags)
{
    /* Room for up to 8 attributes of its own, chained to link */
    DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 8, link);
    struct verbs_ex_private *priv = get_priv(context);
    struct ib_uverbs_attr *handle;
    struct ib_uverbs_attr *async_fd_attr;
    uint32_t resp_cqe;
    int ret;

    cq->context = context;

    /* Output attributes: the new CQ handle and the resulting CQE count */
    handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_CQ_HANDLE);
    fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_CQ_RESP_CQE, &resp_cqe);

    /* Input attributes; the completion channel is added only when given */
    fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_CQE, cqe);
    fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_CQ_USER_HANDLE, (uintptr_t)cq);
    if (channel)
        fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, channel->fd);
    fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, comp_vector);
    async_fd_attr = fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_EVENT_FD, context->async_fd);
    if (priv->imported)
        fallback_require_ioctl(cmdb);
    else
        /* Prevent fallback to the 'write' mode if kernel doesn't support it */
        attr_optional(async_fd_attr);

    if (flags) {
        /*
         * Require the extended command on fallback when a flag other than
         * TIMESTAMP_COMPLETION is set, or the caller did not pass
         * CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX.
         */
        if ((flags & ~IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION) ||
            (!(cmd_flags & CREATE_CQ_CMD_FLAGS_TS_IGNORED_EX)))
            fallback_require_ex(cmdb);
        fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_FLAGS, flags);
    }

    switch (execute_ioctl_fallback(cq->context, create_cq, cmdb, &ret)) {
    ...
}
/*
 * Issue the RDMA_VERBS_IOCTL ioctl for a chain of command buffers.
 *
 * The attributes of the chain are first gathered into cmd by
 * prepare_attrs(); then the header is completed and passed to the kernel.
 * Returns 0 on success or errno when the ioctl fails.
 */
int execute_ioctl(struct ibv_context *context, struct ibv_command_buffer *cmd)
{
    struct verbs_context *vctx = verbs_get_ctx(context);
    prepare_attrs(cmd);
    /* Total bytes handed to the kernel: header plus num_attrs attributes */
    cmd->hdr.length = sizeof(cmd->hdr) +
        sizeof(cmd->hdr.attrs[0]) * cmd->hdr.num_attrs;
    cmd->hdr.reserved1 = 0;
    cmd->hdr.reserved2 = 0;
    cmd->hdr.driver_id = vctx->priv->driver_id;

    if (ioctl(context->cmd_fd, RDMA_VERBS_IOCTL, &cmd->hdr))
        return errno;

    finalize_attrs(cmd);

    return 0;
}

prepare_attrs遍历所有的ibv_command_buffer,将所有的attrs打平到第一个buffer里,然后设置hdr中的长度,最后执行ioctl,用户态的逻辑就完成了。

/*
 * Flatten a chain of command buffers: append the used attributes of every
 * linked buffer to the attribute array of cmd (the head of the chain) and
 * store the resulting count in cmd->hdr.num_attrs.
 */
static void prepare_attrs(struct ibv_command_buffer *cmd)
{
    /* Write cursor: first free slot after cmd's own attributes */
    struct ib_uverbs_attr *end = cmd->next_attr;
    struct ibv_command_buffer *link;

    for (link = cmd->next; link; link = link->next) {
        struct ib_uverbs_attr *cur;

        /* Every buffer in the chain must describe the same command */
        assert(cmd->hdr.object_id == link->hdr.object_id);
        assert(cmd->hdr.method_id == link->hdr.method_id);

        /*
         * Keep track of where the uhw_in lands in the final array if
         * we copy it from a link
         */
        if (!VERBS_IOCTL_ONLY && link->uhw_in_idx != _UHW_NO_INDEX) {
            assert(cmd->uhw_in_idx == _UHW_NO_INDEX);
            cmd->uhw_in_idx =
                link->uhw_in_idx + (end - cmd->hdr.attrs);
        }

        for (cur = link->hdr.attrs; cur != link->next_attr; cur++)
            *end++ = *cur;

        /* The head buffer was sized for the total, so this must fit */
        assert(end <= cmd->last_attr);
    }

    cmd->hdr.num_attrs = end - cmd->hdr.attrs;

    if (!VERBS_IOCTL_ONLY && cmd->uhw_in_idx != _UHW_NO_INDEX) {
        struct ib_uverbs_attr *uhw = &cmd->hdr.attrs[cmd->uhw_in_idx];

        assert(uhw->attr_id == UVERBS_ATTR_UHW_IN);

        /*
         * A UHW payload small enough to fit in the data field is copied
         * into the attribute itself, replacing the pointer stored there.
         */
        if (uhw->len <= sizeof(uhw->data))
            memcpy(&uhw->data, (void *)(uintptr_t)uhw->data,
                   uhw->len);
    }
}

内核态

初始化

内核有object,method,attr三个概念,cq就对应一个object,其他比如qp,mr都对应不同的object,通过object_id区分;cq这个object有多个method,比如create_cq,destroy_cq,通过method_id区分;create_cq需要多个参数,比如cqe,这里每个参数就是一个attr,通过attr_id区分。

首先定义create_cq的method,method_id为UVERBS_METHOD_CQ_CREATE,这个method的handler为UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE),即实际会执行的函数。

/*
 * Kernel-side description of the CQ create method: its id followed by the
 * attributes it accepts. Some attributes are elided ("......") in this
 * excerpt.
 */
DECLARE_UVERBS_NAMED_METHOD(
    UVERBS_METHOD_CQ_CREATE,
    /* Handle of the CQ object created by this call (UVERBS_ACCESS_NEW) */
    UVERBS_ATTR_IDR(UVERBS_ATTR_CREATE_CQ_HANDLE,
            UVERBS_OBJECT_CQ,
            UVERBS_ACCESS_NEW,
            UA_MANDATORY),
    UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_CQE,
               UVERBS_ATTR_TYPE(u32),
               UA_MANDATORY),
    UVERBS_ATTR_PTR_IN(UVERBS_ATTR_CREATE_CQ_USER_HANDLE,
               UVERBS_ATTR_TYPE(u64),
               UA_MANDATORY),
    ......
    /* Driver-specific (UHW) data */
    UVERBS_ATTR_UHW());

/* Static definition of one method of an object. */
struct uverbs_method_def {
    /* Method id, e.g. UVERBS_METHOD_CQ_CREATE */
    u16                                  id;
    /* Combination of bits from enum UVERBS_ACTION_FLAG_XXXX */
    u32                  flags;
    /* Number of entries in *attrs */
    size_t                   num_attrs;
    /* Pointer to an array of pointers to the attribute definitions */
    const struct uverbs_attr_def * const (*attrs)[];
    /* Function run for this method */
    int (*handler)(struct uverbs_attr_bundle *attrs);
};

/*
 * Define a method: the variadic arguments become a static array of attribute
 * definitions, and a static struct uverbs_method_def is created that refers
 * to that array and uses UVERBS_HANDLER(_method_id) as its handler.
 */
#define DECLARE_UVERBS_NAMED_METHOD(_method_id, ...)                           \
    static const struct uverbs_attr_def *const UVERBS_METHOD_ATTRS(        \
        _method_id)[] = { __VA_ARGS__ };                               \
    static const struct uverbs_method_def UVERBS_METHOD(_method_id) = {    \
        .id = _method_id,                                              \
        .handler = UVERBS_HANDLER(_method_id),                         \
        .num_attrs = ARRAY_SIZE(UVERBS_METHOD_ATTRS(_method_id)),      \
        .attrs = &UVERBS_METHOD_ATTRS(_method_id),                     \
    } 

然后看下method的attr,对于用户态传进来的一个attr,内核应该如何去解析这块内存,就是通过uverbs_attr_spec,指示内核去解析用户attr的什么字段。

/* Static definition of one attribute: its id plus how to parse it. */
struct uverbs_attr_def {
    u16                           id;
    struct uverbs_attr_spec       attr;
};

/*
 * Parsing specification of an attribute: tells the kernel how to interpret
 * the struct ib_uverbs_attr supplied by user space. Which members of the
 * unions 'u' and 'u2' apply depends on 'type'.
 */
struct uverbs_attr_spec {
    /* Attribute type, e.g. UVERBS_ATTR_TYPE_PTR_IN */
    u8 type;

    /*
     * Support extending attributes by length. Allow the user to provide
     * more bytes than ptr.len, but check that everything after is zero'd
     * by the user.
     */
    u8 zero_trailing:1;
    /*
     * Valid only for PTR_IN. Allocate and copy the data inside
     * the parser
     */
    u8 alloc_and_copy:1;
    /* Set for attributes declared mandatory */
    u8 mandatory:1;
    /* True if this is from UVERBS_ATTR_UHW */
    u8 is_udata:1;

    union {
        struct {
            /* Current known size to kernel */
            u16 len;
            /* User isn't allowed to provide something < min_len */
            u16 min_len;
        } ptr;

        struct {
            /*
             * higher bits mean the namespace and lower bits mean
             * the type id within the namespace.
             */
            u16 obj_type;
            u8 access;
        } obj;

        struct {
            u8 num_elems;
        } enum_def;
    } u;

    /* This weird split lets us remove some padding */
    union {
        struct {
            /*
             * The enum attribute can select one of the attributes
             * contained in the ids array. Currently only PTR_IN
             * attributes are supported in the ids array.
             */
            const struct uverbs_attr_spec *ids;
        } enum_def;

        struct {
            /*
             * higher bits mean the namespace and lower bits mean
             * the type id within the namespace.
             */
            u16             obj_type;
            u16             min_len;
            u16             max_len;
            u8              access;
        } objs_arr;
    } u2;
};

以上述定义method时uhw attr为例,就是定义了一个uverbs_attr_def,其中spec的type为UVERBS_ATTR_TYPE_PTR_IN,is_udata为1。

/*
 * Attribute definition for the driver-specific (UHW) input data: an optional
 * PTR_IN attribute with id UVERBS_ATTR_UHW_IN, minimum size 0, flagged
 * is_udata.
 * NOTE(review): this excerpt ends with a trailing comma, so the remainder of
 * the macro (presumably a matching UHW output attribute) appears to be cut
 * off - confirm against the kernel source.
 */
#define UVERBS_ATTR_UHW()                                                      \
    UVERBS_ATTR_PTR_IN(UVERBS_ATTR_UHW_IN,                                 \
               UVERBS_ATTR_MIN_SIZE(0),                \
               UA_OPTIONAL,                                        \
               .is_udata = 1), 
               
/*
 * Expands to a pointer to a compound-literal struct uverbs_attr_def whose
 * spec has type UVERBS_ATTR_TYPE_PTR_IN. _type and the variadic arguments
 * are pasted in as further designated initialisers of the spec.
 */
#define UVERBS_ATTR_PTR_IN(_attr_id, _type, ...)                               \
    (&(const struct uverbs_attr_def){                                      \
        .id = _attr_id,                                                \
        .attr = { .type = UVERBS_ATTR_TYPE_PTR_IN,                     \
              _type,                                               \
              __VA_ARGS__ } })
               

然后开始定义cq的object,创建uverbs_object_def,其中methods指向传入的数组,即create_cq和destroy_cq。

/*
 * Kernel-side description of the CQ object: its type attributes (an IDR
 * object of size struct ib_ucq_object, freed by uverbs_free_cq) and its two
 * methods, create and destroy.
 */
DECLARE_UVERBS_NAMED_OBJECT(
    UVERBS_OBJECT_CQ,
    UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_ucq_object), uverbs_free_cq),
    &UVERBS_METHOD(UVERBS_METHOD_CQ_CREATE),
    &UVERBS_METHOD(UVERBS_METHOD_CQ_DESTROY)
);

/* Static definition of one object and the methods it supports. */
struct uverbs_object_def {
    /* Object id, e.g. UVERBS_OBJECT_CQ */
    u16                  id;
    const struct uverbs_obj_type            *type_attrs;
    /* Number of entries in *methods */
    size_t                       num_methods;
    /* Pointer to an array of pointers to the method definitions */
    const struct uverbs_method_def * const (*methods)[];
};

/*
 * Define an object: the variadic arguments become a static array of method
 * pointers, and a static struct uverbs_object_def is created that refers to
 * that array and to _type_attrs.
 */
#define DECLARE_UVERBS_NAMED_OBJECT(_object_id, _type_attrs, ...)              \
    static const struct uverbs_method_def *const UVERBS_OBJECT_METHODS(    \
        _object_id)[] = { __VA_ARGS__ };                               \
    static const struct uverbs_object_def UVERBS_OBJECT(_object_id) = {    \
        .id = _object_id,                                              \
        .type_attrs = &_type_attrs,                                    \
        .num_methods = ARRAY_SIZE(UVERBS_OBJECT_METHODS(_object_id)),  \
        .methods = &UVERBS_OBJECT_METHODS(_object_id)                  \
    }

然后定义cq相关的uapi_definition,就是创建了一个uapi_definition,其中chain_obj_tree指向前边创建的cq object。

/*
 * uapi definition list for the CQ object: one entry chaining to the CQ
 * object tree, followed by a destroy_cq requirement entry, terminated by an
 * empty entry.
 */
const struct uapi_definition uverbs_def_obj_cq[] = { 
    UAPI_DEF_CHAIN_OBJ_TREE_NAMED(UVERBS_OBJECT_CQ,
                      UAPI_DEF_OBJ_NEEDS_FN(destroy_cq)),
    {}  
};

/* Same as UAPI_DEF_CHAIN_OBJ_TREE, using the object declared for _object_enum. */
#define UAPI_DEF_CHAIN_OBJ_TREE_NAMED(_object_enum, ...)                       \
    UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, &UVERBS_OBJECT(_object_enum),    \
                ##__VA_ARGS__)
/*
 * Expands to one uapi_definition entry of kind UAPI_DEF_CHAIN_OBJ_TREE that
 * records the object id and points chain_obj_tree at _object_ptr; any extra
 * arguments are emitted as further array entries after it.
 */
#define UAPI_DEF_CHAIN_OBJ_TREE(_object_enum, _object_ptr, ...)                \
    {                                                                      \
        .kind = UAPI_DEF_CHAIN_OBJ_TREE,                               \
        .object_start = { .object_id = _object_enum },                 \
        .chain_obj_tree = _object_ptr,                                 \
    },                                     \
        ##__VA_ARGS__

类似的,将其他的object相关的uapi_definition添加到数组uverbs_core_api中,UAPI_DEF_CHAIN就是新建了一个uapi_definition,然后将chain指向下一个uapi_definition。

/*
 * Top-level list of core uapi definitions: each entry chains to the
 * definition list of one object or interface; the empty entry terminates
 * the list.
 */
static const struct uapi_definition uverbs_core_api[] = { 
    UAPI_DEF_CHAIN(uverbs_def_obj_async_fd),
    UAPI_DEF_CHAIN(uverbs_def_obj_counters),
    UAPI_DEF_CHAIN(uverbs_def_obj_cq),
    UAPI_DEF_CHAIN(uverbs_def_obj_device),
    UAPI_DEF_CHAIN(uverbs_def_obj_dm),
    UAPI_DEF_CHAIN(uverbs_def_obj_flow_action),
    UAPI_DEF_CHAIN(uverbs_def_obj_intf),
    UAPI_DEF_CHAIN(uverbs_def_obj_mr),
    UAPI_DEF_CHAIN(uverbs_def_obj_qp),
    UAPI_DEF_CHAIN(uverbs_def_obj_srq),
    UAPI_DEF_CHAIN(uverbs_def_obj_wq),
    UAPI_DEF_CHAIN(uverbs_def_write_intf),
    {}, 
};
/* Entry of kind UAPI_DEF_CHAIN whose chain points at another definition list. */
#define UAPI_DEF_CHAIN(_def_var)                                               \
    {                                                                      \
        .kind = UAPI_DEF_CHAIN, .chain = _def_var,                     \
    }

到这里就完成了cq相关object,method等的创建,然后开始添加到radix tree。

创建uverbs_api *uapi,uverbs_api用于保存所有的api,内部的radix为radix tree,所有的api会被加入到radix tree中。
然后通过uapi_merge_def添加uverbs_core_api。

/*
 * Allocate the uverbs_api for a device and merge the core definitions into
 * its radix tree. Returns ERR_PTR(-ENOMEM) when the allocation fails. The
 * rest of the function is elided ("...") in this excerpt.
 */
struct uverbs_api *uverbs_alloc_api(struct ib_device *ibdev)
{
    struct uverbs_api *uapi;
    int rc;
    
    uapi = kzalloc(sizeof(*uapi), GFP_KERNEL);
    if (!uapi)
        return ERR_PTR(-ENOMEM);
    
    INIT_RADIX_TREE(&uapi->radix, GFP_KERNEL);
    uapi->driver_id = ibdev->ops.driver_id;
    
    /* false: these are core definitions, not driver ones */
    rc = uapi_merge_def(uapi, ibdev, uverbs_core_api, false);
    ...
}

/* Per-device collection of everything reachable through the uverbs API. */
struct uverbs_api {
    /* radix tree contains struct uverbs_api_* pointers */
    struct radix_tree_root radix;
    /* Compared against hdr->driver_id of every ioctl command */
    enum rdma_driver_id driver_id;
    
    unsigned int num_write;
    unsigned int num_write_ex;
    struct uverbs_api_write_method notsupp_method;
    const struct uverbs_api_write_method **write_methods;
    const struct uverbs_api_write_method **write_ex_methods;
};  

uapi_merge_def中会遍历数组uverbs_core_api,假设遍历到uverbs_def_obj_cq对应的元素,由于此时的kind为CHAIN,因此递归对chain执行uapi_merge_def。
chain指向uverbs_def_obj_cq,由于kind为UAPI_DEF_CHAIN_OBJ_TREE,于是对chain_obj_tree执行uapi_merge_obj_tree,chain_obj_tree指向的就是cq对应的object。

/*
 * Walk a uapi_definition list and merge every entry into uapi.
 *
 * A NULL list is accepted and treated as empty. Returns 0 when the
 * terminating UAPI_DEF_END entry is reached, or a negative errno. The
 * remaining switch cases are elided ("...") in this excerpt.
 */
static int uapi_merge_def(struct uverbs_api *uapi, struct ib_device *ibdev,
              const struct uapi_definition *def_list,
              bool is_driver)
{
    const struct uapi_definition *def = def_list;
    u32 cur_obj_key = UVERBS_API_KEY_ERR;
    u32 cur_method_key = UVERBS_API_KEY_ERR;
    bool exists;
    int rc;

    if (!def_list)
        return 0;

    for (;; def++) {
        switch ((enum uapi_definition_kind)def->kind) {
        case UAPI_DEF_CHAIN:
            /* Nested list: merge it recursively */
            rc = uapi_merge_def(uapi, ibdev, def->chain, is_driver);
            if (rc)
                return rc;
            continue;

        case UAPI_DEF_CHAIN_OBJ_TREE:
            /* The id in the entry must match the id of the object itself */
            if (WARN_ON(def->object_start.object_id !=
                    def->chain_obj_tree->id))
                return -EINVAL;

            cur_obj_key = uapi_key_obj(def->object_start.object_id);
            rc = uapi_merge_obj_tree(uapi, def->chain_obj_tree,
                         is_driver);
            if (rc)
                return rc;
            continue;

        case UAPI_DEF_END:
            return 0;
        ...
        WARN_ON(true);
        return -EINVAL;
    }
}

uapi_merge_obj_tree会将object,method,attr分别插入到radix tree。
这里说下插入radix tree的key如何计算,key一共为16位,key的最低6位为attr_id,中间5位为method_id,高5位为object_id。
然后通过uapi_key_obj获取object的key obj_key,就是将object_id左移到高5位,然后通过uapi_add_get_elm插入到radix tree中,slot为obj_elm,然后对obj中的每一个method循环执行uapi_merge_method。

/*
 * Insert an object definition and all of its methods into the radix tree.
 *
 * Returns 0 on success or a negative errno (from the element allocation, a
 * failed consistency check, or uapi_merge_method()).
 */
static int uapi_merge_obj_tree(struct uverbs_api *uapi,
                   const struct uverbs_object_def *obj,
                   bool is_driver)
{
    struct uverbs_api_object *obj_elm;
    unsigned int i;
    u32 obj_key;
    bool exists;
    int rc; 

    /* Add the element for this object's key, or get the existing one */
    obj_key = uapi_key_obj(obj->id);
    obj_elm = uapi_add_get_elm(uapi, obj_key, sizeof(*obj_elm), &exists);
    if (IS_ERR(obj_elm))
        return PTR_ERR(obj_elm);

    if (obj->type_attrs) {
        /* type_attrs may be set only once per object element */
        if (WARN_ON(obj_elm->type_attrs))
            return -EINVAL;

        obj_elm->id = obj->id;
        obj_elm->type_attrs = obj->type_attrs;
        obj_elm->type_class = obj->type_attrs->type_class;
        /* Driver objects must use the idr or fd type class */
        if (WARN_ON(is_driver &&
                obj->type_attrs->type_class != &uverbs_idr_class &&
                obj->type_attrs->type_class != &uverbs_fd_class))
            return -EINVAL;
    }   

    if (!obj->methods)
        return 0;

    for (i = 0; i != obj->num_methods; i++) {
        const struct uverbs_method_def *method = (*obj->methods)[i];

        /* NULL entries in the method array are skipped */
        if (!method)
            continue;

        rc = uapi_merge_method(uapi, obj_elm, obj_key, method,
                       is_driver);
        if (rc)
            return rc; 
    }   

    return 0;
}

然后看下uapi_merge_method,先通过obj_key和method->id拼出来method_key,然后继续通过uapi_add_get_elm将method_key插入radix tree,slot为method_elm,然后将handler设置到method_elm中。
然后对于method的所有attr,将index设置为method_key | attr_key,然后也插入radix tree。

/*
 * Insert one method and each of its attributes into the radix tree.
 *
 * The method is stored under obj_key | method key, and every attribute under
 * method_key | attribute key. A method without an attribute array is
 * ignored. Returns 0 on success or a negative errno.
 */
static int uapi_merge_method(struct uverbs_api *uapi,
                 struct uverbs_api_object *obj_elm, u32 obj_key,
                 const struct uverbs_method_def *method,
                 bool is_driver)
{
    u32 method_key = obj_key | uapi_key_ioctl_method(method->id);
    struct uverbs_api_ioctl_method *method_elm;
    unsigned int i;
    bool exists;

    if (!method->attrs)
        return 0;

    method_elm = uapi_add_get_elm(uapi, method_key, sizeof(*method_elm),
                      &exists);
    if (IS_ERR(method_elm))
        return PTR_ERR(method_elm);
    if (exists) {
        /*
         * This occurs when a driver uses ADD_UVERBS_ATTRIBUTES_SIMPLE
         */
        if (WARN_ON(method->handler))
            return -EINVAL;
    } else {
        /* New method element: it is expected to come with a handler */
        WARN_ON(!method->handler);
        rcu_assign_pointer(method_elm->handler, method->handler);
        if (method->handler != uverbs_destroy_def_handler)
            method_elm->driver_method = is_driver;
    }

    for (i = 0; i != method->num_attrs; i++) {
        const struct uverbs_attr_def *attr = (*method->attrs)[i];
        struct uverbs_api_attr *attr_slot;

        /* NULL entries in the attribute array are skipped */
        if (!attr)
            continue;

        /*
         * ENUM_IN contains the 'ids' pointer to the driver's .rodata,
         * so if it is specified by a driver then it always makes this
         * into a driver method.
         */
        if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN)
            method_elm->driver_method |= is_driver;

        /*
         * Like other uobject based things we only support a single
         * uobject being NEW'd or DESTROY'd
         */
        if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
            u8 access = attr->attr.u2.objs_arr.access;

            if (WARN_ON(access == UVERBS_ACCESS_NEW ||
                    access == UVERBS_ACCESS_DESTROY))
                return -EINVAL;
        }

        attr_slot =
            uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id),
                     sizeof(*attr_slot));
        /* Attributes are not allowed to be modified by drivers */
        if (IS_ERR(attr_slot))
            return PTR_ERR(attr_slot);

        /* Keep a copy of the parsing spec in the tree element */
        attr_slot->spec = attr->attr;
    }

    return 0;
}

运行

前边用户态已经看到执行了ioctl,将cmd和参数传到了内核态,现在看下内核态如何执行。
通过copy_from_user将参数拷贝到内核态的hdr,注意这时候attr还没拷贝进来,然后执行ib_uverbs_cmd_verbs

/*
 * ioctl entry point of the uverbs device file.
 *
 * Only the fixed-size header is copied from user space here; the attribute
 * array is passed on as a user pointer (user_hdr->attrs). The checks that
 * follow the copy are elided ("...") in this excerpt.
 */
long ib_uverbs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
    struct ib_uverbs_file *file = filp->private_data;
    struct ib_uverbs_ioctl_hdr __user *user_hdr =
        (struct ib_uverbs_ioctl_hdr __user *)arg;
    struct ib_uverbs_ioctl_hdr hdr;
    int srcu_key;
    int err;
    err = copy_from_user(&hdr, user_hdr, sizeof(hdr));
    ...
    /* Run the command inside the device's disassociate SRCU read section */
    srcu_key = srcu_read_lock(&file->device->disassociate_srcu);
    err = ib_uverbs_cmd_verbs(file, &hdr, user_hdr->attrs);
    srcu_read_unlock(&file->device->disassociate_srcu, srcu_key);
    return err;
}

通过object id和method id查找radix tree获得到method_elm。

/*
 * First excerpt of ib_uverbs_cmd_verbs(): validate the driver id and look up
 * the method element for (object_id, method_id) in the radix tree. Returns
 * -EINVAL on a driver id mismatch and -EPROTONOSUPPORT when the method is
 * not found. The rest of the body is elided ("...").
 */
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
                   struct ib_uverbs_ioctl_hdr *hdr,
                   struct ib_uverbs_attr __user *user_attrs)
{
    const struct uverbs_api_ioctl_method *method_elm;
    struct uverbs_api *uapi = ufile->device->uapi;
    struct radix_tree_iter attrs_iter;
    struct bundle_priv *pbundle;
    struct bundle_priv onstack;
    void __rcu **slot;
    int ret;

    if (unlikely(hdr->driver_id != uapi->driver_id))
        return -EINVAL;
    /* Two build variants of the same lookup, selected by a compat macro */
#ifdef HAVE_RADIX_TREE_ITER_LOOKUP
    slot = radix_tree_iter_lookup(
        &uapi->radix, &attrs_iter,
        uapi_key_obj(hdr->object_id) |
            uapi_key_ioctl_method(hdr->method_id));
#else
    radix_tree_iter_init(&attrs_iter,  uapi_key_obj(hdr->object_id) |
                    uapi_key_ioctl_method(hdr->method_id));
    slot = radix_tree_next_chunk(&uapi->radix, &attrs_iter, RADIX_TREE_ITER_CONTIG);
#endif
    if (unlikely(!slot))
        return -EPROTONOSUPPORT;
    method_elm = rcu_dereference_protected(*slot, true);
	...
}

bundle_priv用于存储所有用户传进来的参数,由于不同method的attrs个数不一样,因此需要动态分配内存存储attr,为了优化小的分配,bundle_priv内部预留了栈上的internal_buffer,后续假设internal_buffer是足够的,即use_stack为1。
然后设置pbundle的各个参数,比如method_elm,method_key,其中user_attrs指向了用户传入的attr。

/*
 * Second excerpt of ib_uverbs_cmd_verbs(): set up the bundle that holds the
 * parsed attributes, run the method and destroy the bundle. Parts of the
 * body are elided ("...").
 */
static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile,
                   struct ib_uverbs_ioctl_hdr *hdr,
                   struct ib_uverbs_attr __user *user_attrs)
{
    struct bundle_priv *pbundle;
    struct bundle_priv onstack;

    ...

    if (!method_elm->use_stack) {
        /* The method needs more room than the on-stack bundle offers */
        pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);
        if (!pbundle)
            return -ENOMEM;
        pbundle->internal_avail =
            method_elm->bundle_size -
            offsetof(struct bundle_priv, internal_buffer);
        pbundle->alloc_head.next = NULL;
        pbundle->allocated_mem = &pbundle->alloc_head;
    } else {
        /* Use the bundle on this function's stack */
        pbundle = &onstack;
        pbundle->internal_avail = sizeof(pbundle->internal_buffer);
        pbundle->allocated_mem = NULL;
    }

    /* Space for the pbundle->bundle.attrs flex array */
    pbundle->method_elm = method_elm;
    pbundle->method_key = attrs_iter.index;
    pbundle->bundle.ufile = ufile;
    pbundle->bundle.context = NULL; /* only valid if bundle has uobject */
    pbundle->radix = &uapi->radix;
    pbundle->radix_slots = slot;
    pbundle->radix_slots_len = radix_tree_chunk_size(&attrs_iter);
    /* Still a user-space pointer; copied in by ib_uverbs_run_method() */
    pbundle->user_attrs = user_attrs;

    pbundle->internal_used = ALIGN(pbundle->method_elm->key_bitmap_len *
                           sizeof(*pbundle->bundle.attrs),
                       sizeof(*pbundle->internal_buffer));
   ...
    ret = ib_uverbs_run_method(pbundle, hdr->num_attrs);
    /* The second argument reports whether the method succeeded */
    bundle_destroy(pbundle, ret == 0);
    return ret;
}

然后执行ib_uverbs_run_method,将method的handler保存到handler,通过用户的num_attrs可以知道需要的内存大小uattrs_size,然后执行uverbs_alloc分配内存到uattrs,最后通过copy_from_user将用户的attr拷贝到uattrs。

/*
 * Excerpt, first part of ib_uverbs_run_method(): fetch the method handler
 * and copy the user-space attribute array into the bundle.
 *
 * Visible error returns: -EIO when the handler pointer is NULL, the error
 * encoded in the uverbs_alloc() result when that allocation fails, and
 * -EFAULT when copy_from_user() fails. The rest of the function is elided
 * ("...").
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
                unsigned int num_attrs)
{
    int (*handler)(struct uverbs_attr_bundle *attrs);
    /* Byte size of num_attrs entries of the uattrs array */
    size_t uattrs_size = array_size(sizeof(*pbundle->uattrs), num_attrs);
    unsigned int destroy_bkey = pbundle->method_elm->destroy_bkey;
    unsigned int i;
    int ret;

    /* See uverbs_disassociate_api() */
    handler = srcu_dereference(
        pbundle->method_elm->handler,
        &pbundle->bundle.ufile->device->disassociate_srcu);
    if (!handler)
        return -EIO;

    /* Kernel-side copy of the attribute array supplied by user space */
    pbundle->uattrs = uverbs_alloc(&pbundle->bundle, uattrs_size);
    if (IS_ERR(pbundle->uattrs))
        return PTR_ERR(pbundle->uattrs);
    if (copy_from_user(pbundle->uattrs, pbundle->user_attrs, uattrs_size))
        return -EFAULT;
	...
}

然后通过uverbs_set_attr解析用户的attr。

/*
 * Excerpt, middle part of ib_uverbs_run_method(): pass every copied user
 * attribute to uverbs_set_attr(), returning the first non-zero result.
 * The code before and after the loop is elided ("..."); i and ret are
 * locals declared in the elided part.
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
                unsigned int num_attrs)
{
	...
	/* pbundle->uattrs holds num_attrs entries */
	for (i = 0; i != num_attrs; i++) {
        ret = uverbs_set_attr(pbundle, &pbundle->uattrs[i]);
        if (unlikely(ret)) 
            return ret;
    }
	...
}

首先通过uapi_get_attr_for_method查找radix tree中的attr,然后执行uverbs_process_attr

/*
 * Handle one attribute supplied by user space.
 *
 * Looks up the attribute's slot for the current method, rejects duplicates,
 * hands the attribute to uverbs_process_attr() and, on success, marks it as
 * present in the bundle.
 *
 * Returns 0 on success, and also when the attribute has no slot and is not
 * flagged mandatory (it is silently skipped). Returns -EPROTONOSUPPORT when
 * it has no slot but is flagged UVERBS_ATTR_F_MANDATORY, -EINVAL when the
 * same attribute was already seen, or the error from uverbs_process_attr().
 */
static int uverbs_set_attr(struct bundle_priv *pbundle,
               struct ib_uverbs_attr *uattr)
{   
    /* attr_bkey indexes both the attr_present bitmap and bundle.attrs */
    u32 attr_key = uapi_key_attr(uattr->attr_id);
    u32 attr_bkey = uapi_bkey_attr(attr_key);
    const struct uverbs_api_attr *attr;
    void __rcu **slot;
    int ret;
    
    slot = uapi_get_attr_for_method(pbundle, attr_key);
    if (!slot) {
        /*
         * Kernel does not support the attribute but user-space says it
         * is mandatory
         */
        if (uattr->flags & UVERBS_ATTR_F_MANDATORY)
            return -EPROTONOSUPPORT;
        return 0;
    }
    attr = rcu_dereference_protected(*slot, true);
    
    /* Reject duplicate attributes from user-space */
    if (test_bit(attr_bkey, pbundle->bundle.attr_present))
        return -EINVAL;
    
    ret = uverbs_process_attr(pbundle, attr, uattr, attr_bkey);
    if (ret)
        return ret;
    
    /* Only mark the attribute present once it has been processed */
    __set_bit(attr_bkey, pbundle->bundle.attr_present);
    
    return 0;
}

uverbs_process_attr就是根据spec中的type解析对应的字段到pbundle的attrs,下边展示了type为UVERBS_ATTR_TYPE_PTR_OUT的场景。

/*
 * Excerpt of uverbs_process_attr(): validate one user attribute against its
 * spec and record it in pbundle->bundle.attrs[attr_bkey].
 *
 * Only the UVERBS_ATTR_TYPE_PTR_OUT case is shown; the other cases and the
 * end of the function are elided ("..."). o_attr is not used in the visible
 * part.
 *
 * In the visible case, -EINVAL is returned when the user-supplied length is
 * outside the range allowed by the spec or when the reserved field is set.
 */
static int uverbs_process_attr(struct bundle_priv *pbundle,
                   const struct uverbs_api_attr *attr_uapi,
                   struct ib_uverbs_attr *uattr, u32 attr_bkey)
{
    const struct uverbs_attr_spec *spec = &attr_uapi->spec;
    struct uverbs_attr *e = &pbundle->bundle.attrs[attr_bkey];
    const struct uverbs_attr_spec *val_spec = spec;
    struct uverbs_obj_attr *o_attr;

    switch (spec->type) {
	...
    case UVERBS_ATTR_TYPE_PTR_OUT:
        /*
         * Too short is always rejected; too long is rejected unless the
         * spec sets zero_trailing.
         */
        if (uattr->len < val_spec->u.ptr.min_len ||
            (!val_spec->zero_trailing &&
             uattr->len > val_spec->u.ptr.len))
            return -EINVAL;

        /* Except for ENUM_IN, the reserved field must be zero */
        if (spec->type != UVERBS_ATTR_TYPE_ENUM_IN &&
            uattr->attr_data.reserved)
            return -EINVAL;

        /* Index of this attribute within the copied uattrs array */
        e->ptr_attr.uattr_idx = uattr - pbundle->uattrs;
        e->ptr_attr.len = uattr->len;

        ...
        /* Keep the data value supplied by user space as-is */
        e->ptr_attr.data = uattr->data;
        ...
        break;
		...
	}
}

然后回到ib_uverbs_run_method,前边说到uhw数据会作为一个attr传进来,这里会通过uverbs_fill_udata将uhw的指针记录到driver_udata,最后执行handler就到了真正create_cq的逻辑。

/*
 * Excerpt, final part of ib_uverbs_run_method(): set up driver_udata and
 * invoke the method handler. The code before and after is elided ("...");
 * handler, destroy_bkey and ret are locals set up in the elided part.
 */
static int ib_uverbs_run_method(struct bundle_priv *pbundle,
                unsigned int num_attrs)
{
	...
	/*
	 * Methods flagged has_udata get driver_udata filled in from the
	 * UVERBS_ATTR_UHW_IN / UVERBS_ATTR_UHW_OUT attributes; all other
	 * methods get an empty ib_udata.
	 */
	if (pbundle->method_elm->has_udata)
        uverbs_fill_udata(&pbundle->bundle,
                  &pbundle->bundle.driver_udata,
                  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
    else
        pbundle->bundle.driver_udata = (struct ib_udata){};

    /*
     * NOTE(review): UVERBS_API_ATTR_BKEY_LEN presumably means "this method
     * has no destroy attribute" - confirm against its definition.
     */
    if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
        struct uverbs_obj_attr *destroy_attr =
            &pbundle->bundle.attrs[destroy_bkey].obj_attr;

        /* The object is destroyed before the handler is called */
        ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
        if (ret)
            return ret;
        __clear_bit(destroy_bkey, pbundle->uobj_finalize);

        ret = handler(&pbundle->bundle);
        uobj_put_destroy(destroy_attr->uobject);
    } else {
        ret = handler(&pbundle->bundle);
    }
	...
}

最后感谢一下学习rdma过程中几位大佬的答疑(字典序)
Santiago0826、zhigang124以及一位不想透露姓名的大佬

08-29 06:32