随着IT行业的发展,Linux容器一直是比较火的话题,这种轻量级的虚拟机逐渐替代vmware或者hypervisor成为新一代武林盟主。既然是虚拟机,那么它势必满足虚拟机的特点:虚拟机(container)和虚拟机之间相互独立,可以理解为两个不同的设备,一个大概的框图如下:

Linux 容器运行时之crun分析-LMLPHP

从上图可见,Linux下的容器是运行在OS之上的虚拟机(OS共享),因此这和传统的vmware和hypervisor并不相同。最初在Linux下运行这个容器(虚拟机)采用的是LXC,但是后来dotcloud改成了runc(基于go语言),当然github上面也有c语言版的运行时crun:https://github.com/containers/crun 。它们二者主要都是基于Linux内核提供的机制cgroup和namespace来实现的。下图为docker的一个模型,

Linux 容器运行时之crun分析-LMLPHP
蓝色是Linux下的可执行指令,绿色为守护进程,紫色为虚拟机, 下图为google的k8s的模型框图(google引出了CRI接口),
Linux 容器运行时之crun分析-LMLPHP

因此,不管从docker或者是k8s(k8s兼容了docker),我们都可以看到一个runc的结点,这个结点就是本章分析的容器运行时。那么什么叫容器运行时(runtime)?这是一种和操作系统强相关的运行程序(最初并没有和containerd分开),也就是主要在对应的操作系统上动态建立一个虚拟机,然后用该虚拟机来运行容器镜像里面的app。因此,我们可以知道,容器镜像里面一般只有app和app依赖的库文件,并不包含操作系统镜像,这也就是容器更轻量级的原因。

容器运行时所需要的运行环境称为bundle包,bundle里面必须包含一个满足runtime规范的配置文件config.json和一个目录结构(通常是rootfs)。这个config.json可以通过runc spec生成,如下图,

Linux 容器运行时之crun分析-LMLPHP

注:运行时的配置文件config.json必须满足运行时规范:https://github.com/opencontainers/runtime-spec; 容器镜像必须满足镜像规范:https://github.com/opencontainers/image-spec, 其中runc/crun只需满足运行时规范。

crun的main函数在crun.c中,实现如下:

点击(此处)折叠或打开

  /* Entry point of crun: parses the global options with argp, looks up the
     requested sub-command by name and runs its handler.  Returns the value
     produced by the handler. */
  1. int
  2. main (int argc, char **argv)
  3. {
  4.   libcrun_error_t err = NULL;
  5.   int ret, first_argument;

  6.   argp_program_version_hook = print_version;
  7.  
  8.   // C library (argp) option parsing; first_argument receives the index of the first argument argp did not parse itself
  9.   argp_parse (&argp, argc, argv, ARGP_IN_ORDER, &first_argument, &arguments);
  10.  
  11.   // Look up the command object by name (per the article: create, ps, start, run, exec, ...)
  12.   command = get_command (argv[first_argument]);
  13.   if (command == NULL)
  14.     libcrun_fail_with_error (0, "unknown command %s", argv[first_argument]);
  15.  
  16.   // Run the handler of the selected command on the remaining arguments
  17.   ret = command->handler (&arguments, argc - first_argument, argv + first_argument, &err);
  18.   if (ret && err)
  19.     libcrun_fail_with_error (err->status, "%s", err->msg);
  20.   return ret;
  21. }

这里我们只讲create函数,实现如下:

点击(此处)折叠或打开

  /* Handler of the "create" sub-command.
     Parses the sub-command options, optionally resolves the bundle path and
     chdir()s into it, initialises the libcrun context, loads the container
     configuration and hands everything to libcrun_container_create.
     Returns a negative value when init_libcrun_context fails, otherwise the
     result of libcrun_container_create. */
  1. int crun_command_create (struct crun_global_arguments *global_args, int argc, char **argv, libcrun_error_t *err)
  2. {
  3.   int first_arg, ret;
  4.   libcrun_container_t *container;
  5.   cleanup_free char *bundle_cleanup = NULL;

  6.   crun_context.preserve_fds = 0;
  7.   // Likewise, use the C library argp parser for the options of the create sub-command
  8.   argp_parse (&run_argp, argc, argv, ARGP_IN_ORDER, &first_arg, &crun_context);

  9.   crun_assert_n_args (argc - first_arg, 1, 1);

  10.   /* Resolve the bundle path. */
  11.   if (bundle)
  12.     {
  13.       if (bundle[0] != '/')
  14.         {
  15.           bundle_cleanup = realpath (bundle, NULL);
  16.           if (bundle_cleanup == NULL)
  17.             libcrun_fail_with_error (errno, "realpath `%s` failed", bundle);
  18.           bundle = bundle_cleanup;
  19.         }

  20.       if (chdir (bundle) < 0)
  21.         libcrun_fail_with_error (errno, "chdir `%s` failed", bundle);
  22.     }
  23.  
  24.   // Initialise the libcrun context
  25.   ret = init_libcrun_context (&crun_context, argv[first_arg], global_args, err);
  26.   if (UNLIKELY (ret < 0))
  27.     return ret;
  28.  
  29.   // Load config_file (config.json, per the error message below) into a container object
  30.   container = libcrun_container_load_from_file (config_file, err);
  31.   if (container == NULL)
  32.     libcrun_fail_with_error (0, "error loading config.json");

  33.   crun_context.bundle = bundle ? bundle : ".";
  34.   if (getenv ("LISTEN_FDS"))
  35.     crun_context.preserve_fds += strtoll (getenv ("LISTEN_FDS"), NULL, 10);
  36.  
  37.   // Create the container from the loaded configuration and the context
  38.   return libcrun_container_create (&crun_context, container, 0, err);
  39. }

具体的创建函数,libcrun_container_create函数实现如下:

点击(此处)折叠或打开

  /* Creates a container: validates the configuration, checks the state
     directories, creates the exec fifo and, unless
     LIBCRUN_RUN_OPTIONS_PREFORK is set in OPTIONS, copies the config file and
     calls libcrun_container_run_internal.  The PREFORK branch is elided by
     the article author.  Returns a negative value on failure. */
  1. int
  2. libcrun_container_create (libcrun_context_t *context, libcrun_container_t *container, unsigned int options, libcrun_error_t *err)
  3. {
  4.   runtime_spec_schema_config_schema *def = container->container_def;
  5.   int ret;
  6.   int container_ready_pipe[2];
  7.   cleanup_close int pipefd0 = -1;
  8.   cleanup_close int pipefd1 = -1;
  9.   cleanup_close int exec_fifo_fd = -1;
  10.   context->detach = 1; // detach = 1: per the article, `crun create` does not block and returns directly (only `crun run` uses detach = 0)
  11.   container->context = context;
  12.   // Check the OCI version: reject it when set and it does not contain "1.0"
  13.   if (def->oci_version && strstr (def->oci_version, "1.0") == NULL)
  14.     return crun_make_error (err, 0, "unknown version specified");
  15.   // Validate the configuration file
  16.   ret = check_config_file (def, err);
  17.   if (UNLIKELY (ret < 0))
  18.     return ret;

  19.   if (def->process && def->process->terminal && context->console_socket == NULL)
  20.     return crun_make_error (err, 0, "use --console-socket with create when a terminal is used");
  21.   // Check the runtime state directories (per the article usually /var/run/crun/*)
  22.   ret = libcrun_status_check_directories (context->state_root, context->id, err);
  23.   if (UNLIKELY (ret < 0))
  24.     return ret;
  25.  // Create the exec-wait fifo used by the create/start pair; per the article `start` later triggers the fifo so the container runs, `run` does not use it
  26.   exec_fifo_fd = libcrun_status_create_exec_fifo (context->state_root, context->id, err);
  27.   if (UNLIKELY (exec_fifo_fd < 0))
  28.     return exec_fifo_fd;

  29.   context->fifo_exec_wait_fd = exec_fifo_fd;
  30.   exec_fifo_fd = -1;

  31.   if ((options & LIBCRUN_RUN_OPTIONS_PREFORK) == 0)
  32.     {
  33.       // Copy config.json from the bundle into the container's state directory (per the article)
  34.       ret = libcrun_copy_config_file (context->id, context->state_root, context->bundle, err);
  35.       if (UNLIKELY (ret < 0))
  36.         return ret;
  37.       // Set up the container environment and run the container
  38.       ret = libcrun_container_run_internal (container, context, -1, err);
  39.       if (UNLIKELY (ret < 0))
  40.         force_delete_container_status (context, def);
  41.       return ret;
  42.     }
  43. ....此处省略,这里不分析的代码....
  44. }

libcrun_container_run_internal实现如下:

点击(此处)折叠或打开

  /* Host-side part of container start-up.
     Opens the hook, seccomp and console fds as needed, starts the container
     process through libcrun_run_linux_container with container_init as its
     entrypoint, enters and configures the cgroup, walks through sync steps
     1-4 with the container process over sync_socket, runs the prestart,
     createRuntime and poststart hooks, writes the container status and
     finally calls wait_for_process.
     On the error paths shown it returns the failing call's negative result or
     the result of cleanup_watch; otherwise the result of wait_for_process. */
  1. static int
  2. libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_t *context, int container_ready_fd, libcrun_error_t *err)
  3. {
  4.   runtime_spec_schema_config_schema *def = container->container_def;
  5.   int ret;
  6.   pid_t pid;
  7.   int detach = context->detach;
  8.   cleanup_free char *cgroup_path = NULL;
  9.   cleanup_free char *scope = NULL;
  10.   cleanup_close int terminal_fd = -1;
  11.   cleanup_terminal void *orig_terminal = NULL;
  12.   cleanup_close int sync_socket = -1;
  13.   cleanup_close int notify_socket = -1;
  14.   cleanup_close int socket_pair_0 = -1;
  15.   cleanup_close int socket_pair_1 = -1;
  16.   cleanup_close int seccomp_fd = -1;
  17.   cleanup_close int console_socket_fd = -1;
  18.   cleanup_close int hooks_out_fd = -1;
  19.   cleanup_close int hooks_err_fd = -1;
  20.   int cgroup_mode, cgroup_manager;
  21.   char created[35];
  22.   uid_t root_uid = -1;
  23.   gid_t root_gid = -1;
  24.   struct container_entrypoint_s container_args =
  25.     {
  26.       .container = container,
  27.       .context = context,
  28.       .terminal_socketpair = {-1, -1},
  29.       .console_socket_fd = -1,
  30.       .hooks_out_fd = -1,
  31.       .hooks_err_fd = -1,
  32.     };

  33.   if (def->hooks && (def->hooks->prestart_len
  34.                      || def->hooks->poststart_len
  35.                      || def->hooks->create_runtime_len
  36.                      || def->hooks->create_container_len
  37.                      || def->hooks->start_container_len))
  38.     {
  39.       // Open the output fds (out/err) used by the hooks
  40.       ret = open_hooks_output (container, &hooks_out_fd, &hooks_err_fd, err);
  41.       if (UNLIKELY (ret < 0))
  42.         return ret;
  43.       container_args.hooks_out_fd = hooks_out_fd;
  44.       container_args.hooks_err_fd = hooks_err_fd;
  45.     }

  46.   container->context = context;

  47.   if (!detach || context->notify_socket)
  48.     {
  49.       ret = prctl (PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
  50.       if (UNLIKELY (ret < 0))
  51.         return crun_make_error (err, errno, "set child subreaper");
  52.     }

  53.   if (!context->no_new_keyring)
  54.     { // create the keyring
  55.       ret = libcrun_create_keyring (container->context->id, err);
  56.       if (UNLIKELY (ret < 0))
  57.         return ret;
  58.     }

  59.   if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
  60.     {
  61.       container_args.has_terminal_socket_pair = 1;
  62.       ret = create_socket_pair (container_args.terminal_socketpair, err);
  63.       if (UNLIKELY (ret < 0))
  64.         return crun_error_wrap (err, "create terminal socket");

  65.       socket_pair_0 = container_args.terminal_socketpair[0];
  66.       socket_pair_1 = container_args.terminal_socketpair[1];
  67.     }
  68.  // Block signals
  69.   ret = block_signals (err);
  70.   if (UNLIKELY (ret < 0))
  71.     return ret;

  72.   if (def->linux && def->linux->seccomp)
  73.     { // open the seccomp output (per the article a seccomp.bpf file used for syscall filtering)
  74.       ret = open_seccomp_output (context->id, &seccomp_fd, false, context->state_root, err);
  75.       if (UNLIKELY (ret < 0))
  76.         return ret;
  77.     }
  78.   container_args.seccomp_fd = seccomp_fd;

  79.   if (context->console_socket)
  80.     {// a console socket was given: connect to that unix socket (per the article it backs the container's standard input/output)
  81.       console_socket_fd = open_unix_domain_client_socket (context->console_socket, 0, err);
  82.       if (UNLIKELY (console_socket_fd < 0))
  83.         return crun_error_wrap (err, "open console socket");
  84.       container_args.console_socket_fd = console_socket_fd;
  85.     }
  86.   // Query the cgroup mode
  87.   cgroup_mode = libcrun_get_cgroup_mode (err);
  88.   if (cgroup_mode < 0)
  89.     return cgroup_mode;
  90.   // Start the Linux container process; it runs container_init as its entrypoint
  91.   pid = libcrun_run_linux_container (container, container_init, &container_args,
  92.                                      &sync_socket, err);
  93.   if (UNLIKELY (pid < 0))
  94.     return pid;

  95.   if (context->fifo_exec_wait_fd < 0 && context->notify_socket)
  96.     {
  97.       /* Do not open the notify socket here on "create". "start" will take care of it. */
  98.       ret = get_notify_fd (context, container, &notify_socket, err);
  99.       if (UNLIKELY (ret < 0))
  100.         return ret;
  101.     }

  102.   if (container_args.terminal_socketpair[1] >= 0)
  103.     close_and_reset (&socket_pair_1);

  104.   cgroup_manager = CGROUP_MANAGER_CGROUPFS;
  105.   if (context->systemd_cgroup)
  106.     cgroup_manager = CGROUP_MANAGER_SYSTEMD;
  107.   else if (context->force_no_cgroup)
  108.     cgroup_manager = CGROUP_MANAGER_DISABLED;

  109.   get_root_in_the_userns_for_cgroups (def, container->host_uid, container->host_gid, &root_uid, &root_gid);

  110.   {
  111.     struct libcrun_cgroup_args cg =
  112.       {
  113.        .resources = def->linux ? def->linux->resources : NULL,
  114.        .annotations = def->annotations,
  115.        .cgroup_mode = cgroup_mode,
  116.        .path = &cgroup_path,
  117.        .scope = &scope,
  118.        .cgroup_path = def->linux ? def->linux->cgroups_path : "",
  119.        .manager = cgroup_manager,
  120.        .pid = pid,
  121.        .root_uid = root_uid,
  122.        .root_gid = root_gid,
  123.        .id = context->id,
  124.        .systemd_subgroup = find_systemd_subgroup (container, cgroup_mode),
  125.       };
  126.     // Enter the cgroup
  127.     ret = libcrun_cgroup_enter (&cg, err);
  128.     if (UNLIKELY (ret < 0))
  129.       return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  130.     // Apply the cgroup resource settings
  131.     if (def->linux && def->linux->resources)
  132.       {
  133.         ret = libcrun_update_cgroup_resources (cgroup_mode,
  134.                                                def->linux->resources,
  135.                                                cgroup_path, err);
  136.         if (UNLIKELY (ret < 0))
  137.           return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  138.       }
  139.   }

  140.   /* sync 1: tell the container it may run initialisation step 1. */
  141.   ret = sync_socket_send_sync (sync_socket, true, err);
  142.   if (UNLIKELY (ret < 0))
  143.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  144.   /* sync 2: wait for the container's notification before step 2. */
  145.   ret = sync_socket_wait_sync (context, sync_socket, false, err);
  146.   if (UNLIKELY (ret < 0))
  147.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  148.   // Run the prestart and createRuntime hooks
  149.   if (def->hooks && def->hooks->prestart_len)
  150.     {
  151.       ret = do_hooks (def, pid, context->id, false, NULL, "created",
  152.                       (hook **) def->hooks->prestart,
  153.                       def->hooks->prestart_len, hooks_out_fd, hooks_err_fd, err);
  154.       if (UNLIKELY (ret != 0))
  155.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  156.     }
  157.   if (def->hooks && def->hooks->create_runtime_len)
  158.     {
  159.       ret = do_hooks (def, pid, context->id, false, NULL, "created",
  160.                       (hook **) def->hooks->create_runtime,
  161.                       def->hooks->create_runtime_len, hooks_out_fd, hooks_err_fd, err);
  162.       if (UNLIKELY (ret != 0))
  163.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  164.     }
  165.   // If a seccomp output was opened, generate the seccomp (bpf) data into it
  166.   if (seccomp_fd >= 0)
  167.     {
  168.       unsigned int seccomp_gen_options = 0;
  169.       const char *annotation;

  170.       annotation = find_annotation (container, "run.oci.seccomp_fail_unknown_syscall");
  171.       if (annotation && strcmp (annotation, "0") != 0)
  172.         seccomp_gen_options = LIBCRUN_SECCOMP_FAIL_UNKNOWN_SYSCALL;

  173.       ret = libcrun_generate_seccomp (container, seccomp_fd, seccomp_gen_options, err);
  174.       if (UNLIKELY (ret < 0))
  175.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  176.       close_and_reset (&seccomp_fd);
  177.     }

  178.   /* sync 3: tell the container process it may run initialisation step 3. */
  179.   ret = sync_socket_send_sync (sync_socket, true, err);
  180.   if (UNLIKELY (ret < 0))
  181.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  182.   if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
  183.     {  // no console socket was given: receive the terminal fd over the socket pair
  184.       terminal_fd = receive_fd_from_socket (socket_pair_0, err);
  185.       if (UNLIKELY (terminal_fd < 0))
  186.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  187.       close_and_reset (&socket_pair_0);
  188.       // Set up the terminal master on the received fd
  189.       ret = libcrun_setup_terminal_master (terminal_fd, &orig_terminal, err);
  190.       if (UNLIKELY (ret < 0))
  191.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  192.     }

  193.   /* sync 4: wait for the container's notification before step 4. */
  194.   ret = sync_socket_wait_sync (context, sync_socket, false, err);
  195.   if (UNLIKELY (ret < 0))
  196.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  197.   ret = close_and_reset (&sync_socket);
  198.   if (UNLIKELY (ret < 0))
  199.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  200.   get_current_timestamp (created);
  201.   // Write the container status; pid is the process id of the container
  202.   ret = write_container_status (container, context, pid, cgroup_path, scope, created, err);
  203.   if (UNLIKELY (ret < 0))
  204.     return cleanup_watch (context, pid, sync_socket, terminal_fd, err);

  205.   // Run the poststart hooks (state "running"); skipped while an exec fifo is pending
  206.   if (context->fifo_exec_wait_fd < 0 && def->hooks && def->hooks->poststart_len)
  207.     {
  208.       ret = do_hooks (def, pid, context->id, true, NULL, "running",
  209.                       (hook **) def->hooks->poststart,
  210.                       def->hooks->poststart_len, hooks_out_fd, hooks_err_fd, err);
  211.       if (UNLIKELY (ret < 0))
  212.         return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
  213.     }
  214.   // Per the article: returns right away when detach is set; the wait matters for `crun run`, for `create` it is effectively a no-op
  215.   ret = wait_for_process (pid, context, terminal_fd, notify_socket, container_ready_fd, err);
  216.   if (!context->detach)
  217.     {
  218.       cleanup_watch (context, 0, sync_socket, terminal_fd, err);
  219.       crun_error_release (err);
  220.     }

  221.   return ret;
  222. }

创建并运行容器的libcrun_run_linux_container函数实现如下:

点击(此处)折叠或打开

  /* Creates the container process.
     Prepares the namespace state, creates the sync socket pair, applies
     rlimits and OOM settings, then clones.  In the parent it performs the
     user-namespace / extra-fork handshakes over the sync socket, stores the
     host end of the socket in *sync_socket_out and returns the pid of the
     container process (the grandchild pid when an extra fork was needed).
     In the child it runs init_container and then jumps into ENTRYPOINT,
     which is expected to return only on error.
     Returns a negative value on failure in the parent. */
  1. pid_t
  2. libcrun_run_linux_container (libcrun_container_t *container,
  3.                              container_entrypoint_t entrypoint,
  4.                              void *args,
  5.                              int *sync_socket_out,
  6.                              libcrun_error_t *err)
  7. {
  8.    __attribute__((cleanup (cleanup_free_init_statusp))) struct init_status_s init_status;
  9.   runtime_spec_schema_config_schema *def = container->container_def;
  10.   cleanup_close int sync_socket_container = -1;
  11.   char *notify_socket_env = NULL;
  12.   cleanup_close int sync_socket_host = -1;
  13.   bool clone_can_create_userns;
  14.   int sync_socket[2];
  15.   pid_t pid;
  16.   size_t i;
  17.   int ret;
  18.   // Initialise the namespace state (per the article this opens the namespace fds)
  19.   ret = configure_init_status (&init_status, container, err);
  20.   if (UNLIKELY (ret < 0))
  21.     return ret;

  22.   get_private_data (container)->unshare_flags = init_status.all_namespaces;
  23. #ifdef CLONE_NEWCGROUP
  24.   /* cgroup will be unshared later. Once the process is in the correct cgroup. */
  25.   init_status.all_namespaces &= ~CLONE_NEWCGROUP;
  26. #endif
  27.   // Create the socket pair used between the container and the main process
  28.   ret = socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sync_socket);
  29.   if (UNLIKELY (ret < 0))
  30.     return crun_make_error (err, errno, "socketpair");

  31.   sync_socket_host = sync_socket[0];
  32.   sync_socket_container = sync_socket[1];

  33.   get_uid_gid_from_def (container->container_def,
  34.                         &container->container_uid,
  35.                         &container->container_gid);

  36.   /* Apply the rlimit values. */
  37.   if (def->process)
  38.     {
  39.       ret = libcrun_set_rlimits (def->process->rlimits, def->process->rlimits_len, err);
  40.       if (UNLIKELY (ret < 0))
  41.         return ret;
  42.     }
  43.   // Apply the OOM settings
  44.   ret = libcrun_set_oom (container, err);
  45.   if (UNLIKELY (ret < 0))
  46.     return ret;
  47.   // Namespace handling: record whether existing IPC/PID namespaces (entries in init_status.value[]) are to be joined
  48.   if ((init_status.all_namespaces & CLONE_NEWIPC) && (init_status.all_namespaces & CLONE_NEWUSER))
  49.     {
  50.       for (i = 0; i < init_status.fd_len; i++)
  51.         if (init_status.value[i] == CLONE_NEWIPC)
  52.           init_status.join_ipcns = true;
  53.     }

  54.   if (init_status.all_namespaces & CLONE_NEWPID)
  55.     {
  56.       init_status.must_fork = true;
  57.       for (i = 0; i < init_status.fd_len; i++)
  58.         {
  59.           if (init_status.value[i] == CLONE_NEWPID)
  60.             {
  61.               init_status.join_pidns = true;
  62.               if (setns (init_status.fd[i], CLONE_NEWPID) == 0)
  63.                 {
  64.                   init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
  65.                   init_status.must_fork = false;
  66.                   close_and_reset (&init_status.fd[i]);
  67.                 }
  68.               break;
  69.             }
  70.         }
  71.       /* It creates a new PID namespace, without a user namespace, we can try to
  72.          join it immediately without another fork. */
  73.       if (i == init_status.fd_len && (init_status.all_namespaces & CLONE_NEWUSER) == 0)
  74.         {
  75.           if (unshare (CLONE_NEWPID) == 0)
  76.             {
  77.               init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
  78.               init_status.must_fork = false;
  79.             }
  80.         }
  81.     }
  82. #ifdef CLONE_NEWTIME
  83.   if (init_status.all_namespaces & CLONE_NEWTIME)
  84.     init_status.must_fork = true;
  85. #endif

  86.   clone_can_create_userns = init_status.fd_len == 0;

  87.   if ((init_status.all_namespaces & CLONE_NEWUSER) && init_status.userns_index < 0)
  88.       init_status.delayed_userns_create = !clone_can_create_userns || init_status.fd_len > 0;

  89.   // Create the container process; CLONE_NEWUSER is only passed when clone_can_create_userns is true
  90.   pid = syscall_clone ((init_status.namespaces_to_unshare & (clone_can_create_userns ? CLONE_NEWUSER : 0)) | SIGCHLD, NULL);
  91.   if (UNLIKELY (pid < 0))
  92.     return crun_make_error (err, errno, "clone");

  93.   if (clone_can_create_userns)
  94.     init_status.namespaces_to_unshare &= ~CLONE_NEWUSER;

  95.   if (pid)
  96.     { // main (parent) process: save the container's external descriptors
  97.       ret = save_external_descriptors (container, pid, err);
  98.       if (UNLIKELY (ret < 0))
  99.         return ret;
  100.       // Close the socket end the main process does not need, i.e. the container's end
  101.       ret = close_and_reset (&sync_socket_container);
  102.       if (UNLIKELY (ret < 0))
  103.         return crun_make_error (err, errno, "close");
  104.       // With CLONE_NEWUSER the parent has to wait for the container to be ready
  105.       if (init_status.all_namespaces & CLONE_NEWUSER)
  106.         {
  107.           if (init_status.delayed_userns_create)
  108.             {
  109.               ret = expect_success_from_sync_socket (sync_socket_host, err);
  110.               if (UNLIKELY (ret < 0))
  111.                 return ret;
  112.             }

  113.           if (init_status.userns_index < 0)
  114.             {
  115.               ret = libcrun_set_usernamespace (container, pid, err);
  116.               if (UNLIKELY (ret < 0))
  117.                 return ret;

  118.               ret = TEMP_FAILURE_RETRY (write (sync_socket_host, "1", 1));
  119.               if (UNLIKELY (ret < 0))
  120.                 return crun_make_error (err, errno, "write to sync socket");
  121.             }
  122.         }
  123.       // If the container forks once more, wait for the pid of the process it forked
  124.       if (init_status.must_fork)
  125.         {
  126.           pid_t grandchild = 0;

  127.           ret = expect_success_from_sync_socket (sync_socket_host, err);
  128.           if (UNLIKELY (ret < 0))
  129.             return ret;

  130.           ret = TEMP_FAILURE_RETRY (read (sync_socket_host, &grandchild, sizeof (grandchild)));
  131.           if (UNLIKELY (ret < 0))
  132.             return crun_make_error (err, errno, "read pid from sync socket");

  133.           /* Cleanup the first process. */
  134.           waitpid (pid, NULL, 0);

  135.           pid = grandchild;
  136.         }

  137.       ret = expect_success_from_sync_socket (sync_socket_host, err);
  138.       if (UNLIKELY (ret < 0))
  139.         return ret;

  140.       *sync_socket_out = get_and_reset (&sync_socket_host);
  141.       // Return the pid of the final container process
  142.       return pid;
  143.     }

  144.   /* Inside the container process. */
  145.   // The container process closes the host end of the socket pair, which it does not use
  146.   ret = close_and_reset (&sync_socket_host);
  147.   if (UNLIKELY (ret < 0))
  148.     return crun_make_error (err, errno, "close");

  149.   // Namespace-related initialisation of the container (per the article: joins/adds the namespaces)
  150.   ret = init_container (container, sync_socket_container, &init_status, err);
  151.   if (UNLIKELY (ret < 0))
  152.     {
  153.       char failure = 1;

  154.       ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &failure, 1));
  155.       if (UNLIKELY (ret < 0))
  156.         goto localfail;

  157.       send_error_to_sync_socket_and_die (sync_socket_container, false, err);

  158. localfail:
  159.       libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
  160.       _exit (EXIT_FAILURE);
  161.     }
  162.   else
  163.     {
  164.       char success = 0;

  165.       ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &success, 1));
  166.       if (UNLIKELY (ret < 0))
  167.         return ret;
  168.     }

  169.   /* Jump into the specified entrypoint. */
  170.   if (container->context->notify_socket)
  171.     xasprintf (&notify_socket_env, "NOTIFY_SOCKET=%s/notify", container->context->notify_socket);
  172.  // Run the real container initialisation (container_init, per the article)
  173.   entrypoint (args, notify_socket_env, sync_socket_container, err);

  174.   /* ENTRYPOINT returns only on an error, fallback here: */
  175.   if (*err)
  176.     libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
  177.   _exit (EXIT_FAILURE);
  178. }

容器进程初始化,container_init实现如下:

点击(此处)折叠或打开

  /* Entrypoint executed inside the container process.
     Runs container_init_setup, unblocks signals, sends sync 4 to the parent,
     optionally blocks on the exec fifo (create/start flow), applies seccomp
     when no_new_privileges is set, runs the startContainer hooks and finally
     execv()s the configured program.
     Returns only on failure: a negative value / the error built by
     crun_make_error. */
  1. static int container_init (void *args, char *notify_socket, int sync_socket,
  2.                 libcrun_error_t *err)
  3. {
  4.   struct container_entrypoint_s *entrypoint_args = args;
  5.   int ret;
  6.   runtime_spec_schema_config_schema *def = entrypoint_args->container->container_def;
  7.   cleanup_free const char *exec_path = NULL;
  8.   cleanup_free char *notify_socket_cleanup = notify_socket;

  9.   entrypoint_args->sync_socket = sync_socket;
  10.   //Redirect log output to the sync socket
  11.   crun_set_output_handler (log_write_to_sync_socket, args, false);
  12.   // Initialise the container and set up its components
  13.   ret = container_init_setup (args, notify_socket, sync_socket, &exec_path, err);
  14.   if (UNLIKELY (ret < 0))
  15.     {
  16.       /* If it fails to write the error using the sync socket, then fallback
  17.          to stderr. */
  18.       if (sync_socket_write_error (sync_socket, err) < 0)
  19.         return ret;

  20.       crun_error_release (err);
  21.       return ret;
  22.     }

  23.   entrypoint_args->sync_socket = -1;
  24.   //Unblock signals
  25.   ret = unblock_signals (err);
  26.   if (UNLIKELY (ret < 0))
  27.     return ret;

  28.   /* sync 4: tell the main process that step 4 may proceed. */
  29.   ret = sync_socket_send_sync (sync_socket, false, err);
  30.   if (UNLIKELY (ret < 0))
  31.     return ret;

  32.   close_and_reset (&sync_socket);
  33.   // fifo_exec_wait_fd >= 0 means (per the article) this is a `create` call: block here until `crun start` is invoked on the command line
  34.   if (entrypoint_args->context->fifo_exec_wait_fd >= 0)
  35.     {
  36.       char buffer[1];
  37.       fd_set read_set;
  38.       cleanup_close int fd = entrypoint_args->context->fifo_exec_wait_fd;
  39.       entrypoint_args->context->fifo_exec_wait_fd = -1;

  40.       FD_ZERO (&read_set);
  41.       FD_SET (fd, &read_set);
  42.       do
  43.         {
  44.           ret = select (fd + 1, &read_set, NULL, NULL, NULL);
  45.           if (UNLIKELY (ret < 0))
  46.             return crun_make_error (err, errno, "select");

  47.           ret = TEMP_FAILURE_RETRY (read (fd, buffer, sizeof (buffer)));
  48.           if (UNLIKELY (ret < 0))
  49.             return crun_make_error (err, errno, "read from the exec fifo");
  50.         }
  51.       while (ret == 0);

  52.       close_and_reset (&entrypoint_args->context->fifo_exec_wait_fd);
  53.     }
  54.   // Send log output to standard error from here on
  55.   crun_set_output_handler (log_write_to_stderr, NULL, false);

  56.   if (def->process && def->process->no_new_privileges)
  57.     {
  58.       char **seccomp_flags = NULL;
  59.       size_t seccomp_flags_len = 0;

  60.       if (def->linux && def->linux->seccomp)
  61.         {
  62.           seccomp_flags = def->linux->seccomp->flags;
  63.           seccomp_flags_len = def->linux->seccomp->flags_len;
  64.         }
  65.       // Apply the seccomp (syscall filtering) configuration
  66.       ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
  67.       if (UNLIKELY (ret < 0))
  68.         return ret;
  69.       close_and_reset (&entrypoint_args->seccomp_fd);
  70.     }

  71.   if (UNLIKELY (def->process == NULL))
  72.     return crun_make_error (err, 0, "block 'process' not found");

  73.   if (UNLIKELY (exec_path == NULL))
  74.     return crun_make_error (err, 0, "executable path not specified");
  75.   // Run the startContainer hooks (state "starting")
  76.   if (def->hooks && def->hooks->start_container_len)
  77.     {
  78.       libcrun_container_t *container = entrypoint_args->container;

  79.       ret = do_hooks (def, 0, container->context->id, false, NULL, "starting",
  80.                       (hook **) def->hooks->start_container,
  81.                       def->hooks->start_container_len,
  82.                       entrypoint_args->hooks_out_fd,
  83.                       entrypoint_args->hooks_err_fd,
  84.                       err);
  85.       if (UNLIKELY (ret != 0))
  86.         return ret;

  87.       /* Seek stdout/stderr to the end. If the hooks were using the same files,
  88.          the container process overwrites what was previously written. */
  89.       (void) lseek (1, 0, SEEK_END);
  90.       (void) lseek (2, 0, SEEK_END);
  91.     }
  92.   // Exec the application inside the container; from here on the user program runs
  93.   execv (exec_path, def->process->args);

  94.   if (errno == ENOENT)
  95.     return crun_make_error (err, errno, "exec container process (missing dynamic library?) `%s`", exec_path);

  96.   return crun_make_error (err, errno, "exec container process `%s`", exec_path);
  97. }

container_init_setup函数实现如下:

点击(此处)折叠或打开

  1. static int
  2. container_init_setup (void *args, char *notify_socket,
  3.                       int sync_socket, const char **exec_path,
  4.                       libcrun_error_t *err)
  5. {
  6.   struct container_entrypoint_s *entrypoint_args = args;
  7.   libcrun_container_t *container = entrypoint_args->container;
  8.   int ret;
  9.   int has_terminal;
  10.   cleanup_close int console_socket = -1;
  11.   cleanup_close int console_socketpair = -1;
  12.   runtime_spec_schema_config_schema *def = container->container_def;
  13.   runtime_spec_schema_config_schema_process_capabilities *capabilities;
  14.   cleanup_free char *rootfs = NULL;
  15.   int no_new_privs;
  16.   // 初始化selinux和apparmor 安全相关的
  17.   ret = initialize_security (def->process, err);
  18.   if (UNLIKELY (ret < 0))
  19.     return ret;
  20.  // UP配置回环网络  
  21.   ret = libcrun_configure_network (container, err);
  22.   if (UNLIKELY (ret < 0))
  23.     return ret;
  24.   // 得到roofs的实际路径
  25.   rootfs = realpath (def->root->path, NULL);
  26.   if (UNLIKELY (rootfs == NULL))
  27.     {
  28.       /* If realpath failed for any reason, try the relative directory. */
  29.       rootfs = xstrdup (def->root->path);
  30.     }
  31.   // 得到终端输入输出fd
  32.   if (entrypoint_args->terminal_socketpair[0] >= 0)
  33.     {
  34.       close_and_reset (&entrypoint_args->terminal_socketpair[0]);
  35.       console_socketpair = entrypoint_args->terminal_socketpair[1];
  36.     }

  37.   /* sync 1. */ 等待主进程通知可以开始第1步初始化
  38.   ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
  39.   if (UNLIKELY (ret < 0))
  40.     return ret;

  41.   has_terminal = container->container_def->process && container->container_def->process->terminal;
  42.   if (has_terminal && entrypoint_args->context->console_socket)
  43.     console_socket = entrypoint_args->console_socket_fd;
  44.  // 配置sysctl参数   
  45.   ret = libcrun_set_sysctl (container, err);
  46.   if (UNLIKELY (ret < 0))
  47.     return ret;
  48.   // 挂载需要的文件系统(包括新的rootfs)
  49.   ret = libcrun_set_mounts (container, rootfs, err);
  50.   if (UNLIKELY (ret < 0))
  51.     return ret;

  52.   /* sync 2. */ 通知主进程,可以开始2阶段初始化
  53.   ret = sync_socket_send_sync (sync_socket, false, err);
  54.   if (UNLIKELY (ret < 0))
  55.     return ret;
  56.  
  57.   /* sync 3. */ 等待主进程通知,可以进行第3阶段初始化
  58.   ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
  59.   if (UNLIKELY (ret < 0))
  60.     return ret;
  61.   // 执行created hook
  62.   if (def->hooks && def->hooks->create_container_len)
  63.     {
  64.       ret = do_hooks (def, 0, container->context->id, false, NULL, "created",
  65.                       (hook **) def->hooks->create_container,
  66.                       def->hooks->create_container_len,
  67.                       entrypoint_args->hooks_out_fd,
  68.                       entrypoint_args->hooks_err_fd,
  69.                       err);
  70.       if (UNLIKELY (ret != 0))
  71.         return ret;
  72.     }
  73.   // 设置selinux相关的label
  74.   if (def->process)
  75.     {
  76.       ret = libcrun_set_selinux_exec_label (def->process, err);
  77.       if (UNLIKELY (ret < 0))
  78.         return ret;

  79.       ret = libcrun_set_apparmor_profile (def->process, err);
  80.       if (UNLIKELY (ret < 0))
  81.         return ret;
  82.     }
  83.   // 关闭多余的fd套件字
  84.   ret = close_fds_ge_than (entrypoint_args->context->preserve_fds + 3, err);
  85.   if (UNLIKELY (ret < 0))
  86.     crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err);
  87.   // 切换到新的rootfs去工作
  88.   ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err);
  89.   if (UNLIKELY (ret < 0))
  90.     return ret;
  91.  // 重新使用/dev/null 
  92.   ret = libcrun_reopen_dev_null (err);
  93.   if (UNLIKELY (ret < 0))
  94.     return ret;

  95.   if (clearenv ())
  96.     return crun_make_error (err, errno, "clearenv");
  97.   // 上面清除了环境变量, 这里重新设置新的环境变量
  98.   if (def->process)
  99.     {
  100.       size_t i;

  101.       for (i = 0; i < def->process->env_len; i++)
  102.         if (putenv (def->process->env[i]) < 0)
  103.           return crun_make_error (err, errno, "putenv `%s`", def->process->env[i]);
  104.     }

  105.   if (getenv ("HOME") == NULL)
  106.     {
  107.       ret = set_home_env (container->container_uid);
  108.       if (UNLIKELY (ret < 0 && errno != ENOTSUP))
  109.         {
  110.           setenv("HOME", "/", 1);
  111.           libcrun_warning ("cannot detect HOME environment variable, setting default");
  112.         }
  113.     }

  114.   if (def->process && def->process->cwd)
  115.     if (UNLIKELY (chdir (def->process->cwd) < 0))
  116.       return crun_make_error (err, errno, "chdir");
  117.   // 查找到用户程序路径
  118.   if (def->process && def->process->args)
  119.     {
  120.       *exec_path = find_executable (def->process->args[0], def->process->cwd);
  121.       if (UNLIKELY (*exec_path == NULL))
  122.         {
  123.           if (errno == ENOENT)
  124.             return crun_make_error (err, errno, "executable file not found in $PATH");

  125.           return crun_make_error (err, errno, "open executable");
  126.         }
  127.     }

  128.   ret = setsid ();
  129.   if (UNLIKELY (ret < 0))
  130.     return crun_make_error (err, errno, "setsid");
  131.  // 如果有终端,将终端fd通知主进程
  132.   if (has_terminal)
  133.     {
  134.       cleanup_close int terminal_fd = -1;

  135.       fflush (stderr);

  136.       terminal_fd = libcrun_set_terminal (container, err);
  137.       if (UNLIKELY (terminal_fd < 0))
  138.         return terminal_fd;

  139.       if (console_socket >= 0)
  140.         {
  141.           ret = send_fd_to_socket (console_socket, terminal_fd, err);
  142.           if (UNLIKELY (ret < 0))
  143.             return ret;
  144.           close_and_reset (&console_socket);
  145.         }
  146.       else if (entrypoint_args->has_terminal_socket_pair && console_socketpair >= 0)
  147.         {
  148.           ret = send_fd_to_socket (console_socketpair, terminal_fd, err);
  149.           if (UNLIKELY (ret < 0))
  150.             return ret;

  151.           close_and_reset (&console_socketpair);
  152.         }
  153.     }
  154.   // 设置容器主机名字
  155.   ret = libcrun_set_hostname (container, err);
  156.   if (UNLIKELY (ret < 0))
  157.     return ret;

  158.   if (container->container_def->linux && container->container_def->linux->personality)
  159.     {
  160.       ret = libcrun_set_personality (container->container_def->linux->personality, err);
  161.       if (UNLIKELY (ret < 0))
  162.         return ret;
  163.     }

  164.   if (def->process->user)
  165.     umask (def->process->user->umask_present ? def->process->user->umask : 0022);
  166.  // 设置准备好的读取seccomp.bpf规则到系统调用安全检查
  167.   if (def->process && !def->process->no_new_privileges)
  168.     {
  169.       char **seccomp_flags = NULL;
  170.       size_t seccomp_flags_len = 0;

  171.       if (def->linux && def->linux->seccomp)
  172.         {
  173.           seccomp_flags = def->linux->seccomp->flags;
  174.           seccomp_flags_len = def->linux->seccomp->flags_len;
  175.         }

  176.       ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
  177.       if (UNLIKELY (ret < 0))
  178.         return ret;

  179.       close_and_reset (&entrypoint_args->seccomp_fd);
  180.     }

  181.   capabilities = def->process ? def->process->capabilities : NULL;
  182.   no_new_privs = def->process ? def->process->no_new_privileges : 1;
  183.   ret = libcrun_set_caps (capabilities, container->container_uid, container->container_gid, no_new_privs, err);
  184.   if (UNLIKELY (ret < 0))
  185.     return ret;

  186.   if (notify_socket)
  187.     {
  188.       if (putenv (notify_socket) < 0)
  189.         return crun_make_error (err, errno, "putenv `%s`", notify_socket);
  190.     }

  191.   return 0;
  192. }

wait_for_process等待函数实现如下:

点击(此处)折叠或打开

  /* Supervise the container process PID from the calling (parent) process.
     Optionally writes PID to context->pid_file, returns early when detached
     without a notify socket, and otherwise blocks all signals and loops on
     epoll, handling signalfd, notify-socket and terminal/stdin events until
     reap_subprocesses reports the last process.
     Returns the container exit code once the last process is reaped, 0 on
     the detached paths, or the failing helper's error value otherwise.  */
  1. static int wait_for_process (pid_t pid, libcrun_context_t *context, int terminal_fd, int notify_socket, int container_ready_fd, libcrun_error_t *err)
  2. {
  3.   cleanup_close int epollfd = -1;
  4.   cleanup_close int signalfd = -1;
  5.   int ret, container_exit_code = 0, last_process;
  6.   sigset_t mask;
  7.   int fds[10];
  8.   int levelfds[10];
  9.   int levelfds_len = 0;
  10.   int fds_len = 0;

  11.   container_exit_code = 0;

  12.   if (context->pid_file)
  13.     {
  14.       char buf[12]; /* presumably sized for a 32-bit pid in decimal plus NUL -- confirm pid_t width */
  15.       size_t buf_len = sprintf (buf, "%d", pid);
  16.       ret = write_file (context->pid_file, buf, buf_len, err);
  17.       if (UNLIKELY (ret < 0))
  18.         return ret;
  19.     }

  20.   /* Per the original note: `crun create` returns here, while `crun run` keeps going. */
  21.   if (context->detach && notify_socket < 0)
  22.     return 0;

  23.   if (container_ready_fd >= 0)
  24.     {
  25.       ret = 0;
  26.       TEMP_FAILURE_RETRY (write (container_ready_fd, &ret, sizeof (ret))); /* sends the int 0; the write result is not checked */
  27.       close_and_reset (&container_ready_fd);
  28.     }

  29.   sigfillset (&mask);
  30.   ret = sigprocmask (SIG_BLOCK, &mask, NULL);
  31.   if (UNLIKELY (ret < 0))
  32.     return crun_make_error (err, errno, "sigprocmask");
  33.  // Route every signal sent to this (main) process through a signalfd; all signals were blocked just above.
  34.   signalfd = create_signalfd (&mask, err);
  35.   if (UNLIKELY (signalfd < 0))
  36.     return signalfd;
  37.   // Reap child processes; if the helper reports the last process is gone, return its exit code right away.
  38.   ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
  39.   if (UNLIKELY (ret < 0))
  40.     return ret;

  41.   if (last_process)
  42.     return container_exit_code;
  43.   // Collect the fds for epoll: signalfd, the notify socket (if any) and, with a terminal, stdin (fd 0) plus terminal_fd in levelfds; both lists end with -1.
  44.   fds[fds_len++] = signalfd;
  45.   if (notify_socket >= 0)
  46.     fds[fds_len++] = notify_socket;
  47.   if (terminal_fd >= 0)
  48.     {
  49.       fds[fds_len++] = 0;
  50.       levelfds[levelfds_len++] = terminal_fd;
  51.     }
  52.   fds[fds_len++] = -1;
  53.   levelfds[levelfds_len++] = -1;
  54.   // Create the epoll fd from both lists.
  55.   epollfd = epoll_helper (fds, levelfds, err);
  56.   if (UNLIKELY (epollfd < 0))
  57.     return epollfd;

  58.   while (1)
  59.     {
  60.       struct signalfd_siginfo si;
  61.       ssize_t res;
  62.       struct epoll_event events[10];
  63.       int i, nr_events;

  64.       nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, 10, -1));
  65.       if (UNLIKELY (nr_events < 0))
  66.         return crun_make_error (err, errno, "epoll_wait");

  67.       for (i = 0; i < nr_events; i++)
  68.         { // Forwarding: per the original note, this lets `crun run` show the container's input/output live.
  69.           if (events[i].data.fd == 0)
  70.             {
  71.               ret = copy_from_fd_to_fd (0, terminal_fd, 0, err);
  72.               if (UNLIKELY (ret < 0))
  73.                 return crun_error_wrap (err, "copy to terminal fd");
  74.             }
  75.           else if (events[i].data.fd == terminal_fd)
  76.             {
  77.               ret = set_blocking_fd (terminal_fd, 0, err);
  78.               if (UNLIKELY (ret < 0))
  79.                 return crun_error_wrap (err, "set terminal fd not blocking");

  80.               ret = copy_from_fd_to_fd (terminal_fd, 1, 1, err);
  81.               if (UNLIKELY (ret < 0))
  82.                 return crun_error_wrap (err, "copy from terminal fd");

  83.               ret = set_blocking_fd (terminal_fd, 1, err);
  84.               if (UNLIKELY (ret < 0))
  85.                 return crun_error_wrap (err, "set terminal fd blocking");
  86.             }// Event on the notify socket (a systemd notification, per the original note).
  87.           else if (events[i].data.fd == notify_socket)
  88.             {
  89.               ret = handle_notify_socket (notify_socket, err);
  90.               if (UNLIKELY (ret < 0))
  91.                 return ret;
  92.               if (ret && context->detach)
  93.                 return 0;
  94.             } // A signal was received.
  95.           else if (events[i].data.fd == signalfd)
  96.             {
  97.               res = TEMP_FAILURE_RETRY (read (signalfd, &si, sizeof (si)));
  98.               if (UNLIKELY (res < 0))
  99.                 return crun_make_error (err, errno, "read from signalfd");
  100.               if (si.ssi_signo == SIGCHLD)
  101.                 {// SIGCHLD: reap children; once the last process is gone the container has exited and `crun run` returns.
  102.                   ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
  103.                   if (UNLIKELY (ret < 0))
  104.                     return ret;
  105.                   if (last_process)
  106.                     return container_exit_code;
  107.                 }
  108.               else
  109.                 {
  110.                   /* Send any other signal to the child process. */
  111.                   ret = kill (pid, si.ssi_signo); /* NOTE(review): the result is stored in ret but never checked */
  112.                 }
  113.             }
  114.           else
  115.             {
  116.               return crun_make_error (err, 0, "unknown fd from epoll_wait");
  117.             }
  118.         }
  119.     }

  120.   return 0; /* not reached: the loop above has no break and only leaves via return */
  121. }


下面贴出用C语言编写的容器运行时crun的函数调用栈:

Linux 容器运行时之crun分析-LMLPHP


另外,crun大致涉及systemd、seccomp、sysctl、oom、keyring、rlimit、apparmor、selinux、cgroup以及namespace(UTS、IPC、PID、NET、MOUNT、USER)等与Linux强相关的组件。


创建容器: crun create container_id (当前目录下必须有config.json和rootfs)
运行容器: crun start  container_id
删除容器: crun delete container_id
创建+运行容器: crun run container_id
查看进程id:crun ps container_id



12-17 09:48