随着IT行业的发展,Linux容器一直是比较火的话题,这种轻量级的虚拟机逐渐替代vmware或者hypervisor成为新一代武林盟主。既然是虚拟机,那么它势必满足虚拟机的特点:虚拟机(container)和虚拟机之间相互独立,可以理解为两个不同的设备,一个大概的框图如下:
从上图可见,Linux下的容器是运行在OS之上的虚拟机(OS共享),因此这和传统的vmware和hypervisor并不相同。最初运行这个容器(虚拟机)在Linux下是采用的LXC来运行的,但是后来dotcloud改成了runc(基于go语言),当然github上面也有c语言版的运行时crun:https://github.com/containers/crun。他们二者主要都是基于Linux内核提供的机制cgroup和namespace来实现的。下图为docker的一个模型,
因此,不管从docker或者是k8s(k8s兼容了docker),我们都可以看到一个runc的结点,这个结点就是本章分析的容器运行时。那么什么叫容器运行时(runtime)?这是一种和操作系统强相关的运行程序(最初并没有和containerd分开),也就是主要在对应的操作系统上动态建立一个虚拟机,然后用该虚拟机来运行容器镜像里面的app。因此,我们可以知道,容器镜像里面一般只有app和app依赖的库文件,并不包含操作系统镜像,这也就是容器更轻量级的原因。
一个容器运行时所需要的运行环境称为bundle,bundle里面必须包含一个满足runtime规范的配置文件config.json和一个根文件系统目录(通常是rootfs),其中config.json可以通过runc spec生成,如下图,
注:运行时的配置文件config.json必须满足运行时规范:https://github.com/opencontainers/runtime-spec; 容器镜像必须满足镜像规范:https://github.com/opencontainers/image-spec, 其中runc/crun只需满足运行时规范。
crun的main函数在crun.c中,实现如下:
点击(此处)折叠或打开
- /* Entry point of the crun command-line tool.
-  * Parses the global options with argp, looks up the sub-command named by the
-  * first argument that argp left unparsed, and runs that sub-command's handler.
-  * Returns the handler's return value. An unknown command, or a failing handler
-  * that set an error, is reported through libcrun_fail_with_error.
-  * NOTE: `argp`, `arguments`, `command` and `print_version` are not declared in
-  * this excerpt; they are defined elsewhere in crun.c.
-  */ int
- main (int argc, char **argv)
- {
- libcrun_error_t err = NULL;
- int ret, first_argument;
- argp_program_version_hook = print_version;
- // C library option parser (argp): first_argument receives the index of the first argument that crun must parse itself
- argp_parse (&argp, argc, argv, ARGP_IN_ORDER, &first_argument, &arguments);
- // Look up the command descriptor by name; the commands include create, ps, start, run, exec, ...
- command = get_command (argv[first_argument]);
- if (command == NULL)
- libcrun_fail_with_error (0, "unknown command %s", argv[first_argument]);
- // Run the handler of the selected command, passing it only the remaining arguments
- ret = command->handler (&arguments, argc - first_argument, argv + first_argument, &err);
- if (ret && err)
- libcrun_fail_with_error (err->status, "%s", err->msg);
- return ret;
- }
这里我们只讲create函数,实现如下:
点击(此处)折叠或打开
- /* Handler of the `create` sub-command.
-  * global_args: global options parsed by main.
-  * argc/argv:   arguments starting at the sub-command name.
-  * err:         receives the error description on failure.
-  * Parses the create-specific options, changes into the bundle directory when
-  * one was given, initializes the libcrun context, loads the container
-  * configuration and finally calls libcrun_container_create.
-  * Returns a negative value if the context initialization fails, otherwise the
-  * result of libcrun_container_create.
-  * NOTE: `bundle`, `config_file`, `crun_context` and `run_argp` are not declared
-  * in this excerpt; presumably file-scope variables filled in by the argp parser.
-  */ int crun_command_create (struct crun_global_arguments *global_args, int argc, char **argv, libcrun_error_t *err)
- {
- int first_arg, ret;
- libcrun_container_t *container;
- cleanup_free char *bundle_cleanup = NULL;
- crun_context.preserve_fds = 0;
- // As in main, use the C library argp parser, this time for the options of `create`
- argp_parse (&run_argp, argc, argv, ARGP_IN_ORDER, &first_arg, &crun_context);
- crun_assert_n_args (argc - first_arg, 1, 1);
- /* Resolve the bundle path. */
- if (bundle)
- {
- if (bundle[0] != '/')
- {
- bundle_cleanup = realpath (bundle, NULL);
- if (bundle_cleanup == NULL)
- libcrun_fail_with_error (errno, "realpath `%s` failed", bundle);
- bundle = bundle_cleanup;
- }
- if (chdir (bundle) < 0)
- libcrun_fail_with_error (errno, "chdir `%s` failed", bundle);
- }
- // Initialize the libcrun context; argv[first_arg] is the container id argument
- ret = init_libcrun_context (&crun_context, argv[first_arg], global_args, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Load and parse the configuration file (config.json) into a container object
- container = libcrun_container_load_from_file (config_file, err);
- if (container == NULL)
- libcrun_fail_with_error (0, "error loading config.json");
- crun_context.bundle = bundle ? bundle : ".";
- if (getenv ("LISTEN_FDS"))
- crun_context.preserve_fds += strtoll (getenv ("LISTEN_FDS"), NULL, 10);
- // Create the container from the loaded configuration and the context
- return libcrun_container_create (&crun_context, container, 0, err);
- }
具体的创建函数,libcrun_container_create函数实现如下:
点击(此处)折叠或打开
- /* Create a container without starting its user process.
-  * context:   runtime context (id, state_root, bundle, console_socket, ...).
-  * container: container whose container_def holds the parsed configuration.
-  * options:   LIBCRUN_RUN_OPTIONS_* flags; only the non-PREFORK path is shown.
-  * err:       receives the error description on failure.
-  * Validates the configuration, creates the exec FIFO, copies the config file
-  * and hands over to libcrun_container_run_internal.
-  * Returns a negative value on failure, otherwise the result of
-  * libcrun_container_run_internal. The PREFORK branch is elided in this excerpt.
-  */ int
- libcrun_container_create (libcrun_context_t *context, libcrun_container_t *container, unsigned int options, libcrun_error_t *err)
- {
- runtime_spec_schema_config_schema *def = container->container_def;
- int ret;
- int container_ready_pipe[2];
- cleanup_close int pipefd0 = -1;
- cleanup_close int pipefd1 = -1;
- cleanup_close int exec_fifo_fd = -1;
- context->detach = 1; // 1 means `crun create` does not block and returns right away (per the article, only `crun run` leaves detach at 0)
- container->context = context;
- // Check the OCI version: only versions containing "1.0" are accepted
- if (def->oci_version && strstr (def->oci_version, "1.0") == NULL)
- return crun_make_error (err, 0, "unknown version specified");
- // Validate the configuration file
- ret = check_config_file (def, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (def->process && def->process->terminal && context->console_socket == NULL)
- return crun_make_error (err, 0, "use --console-socket with create when a terminal is used");
- // Check the runtime state directory (per the article, usually /var/run/crun/*)
- ret = libcrun_status_check_directories (context->state_root, context->id, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Create the exec FIFO used by the create/start pair: `start` triggers it to let the container run (per the article, `run` does not use it)
- exec_fifo_fd = libcrun_status_create_exec_fifo (context->state_root, context->id, err);
- if (UNLIKELY (exec_fifo_fd < 0))
- return exec_fifo_fd;
- context->fifo_exec_wait_fd = exec_fifo_fd;
- exec_fifo_fd = -1;
- if ((options & LIBCRUN_RUN_OPTIONS_PREFORK) == 0)
- {
- // Copy config.json from the bundle into the state directory
- ret = libcrun_copy_config_file (context->id, context->state_root, context->bundle, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Set up the container environment and launch the container process
- ret = libcrun_container_run_internal (container, context, -1, err);
- if (UNLIKELY (ret < 0))
- force_delete_container_status (context, def);
- return ret;
- }
- ....此处省略,这里不分析的代码....
- }
libcrun_container_run_internal实现如下:
点击(此处)折叠或打开
- /* Create the container process and drive the host side of the four-step
-  * sync handshake (cgroup setup, hooks, seccomp generation, terminal hand-off),
-  * then record the container status and wait for the process.
-  *
-  * container:          container whose container_def holds the parsed config.
-  * context:            runtime context (id, state_root, detach, sockets, ...).
-  * container_ready_fd: forwarded unchanged to wait_for_process.
-  * err:                receives the error description on failure.
-  *
-  * On a setup failure returns the error of the failing step or the result of
-  * cleanup_watch; otherwise returns the result of wait_for_process.
-  */
- static int
- libcrun_container_run_internal (libcrun_container_t *container, libcrun_context_t *context, int container_ready_fd, libcrun_error_t *err)
- {
- runtime_spec_schema_config_schema *def = container->container_def;
- int ret;
- pid_t pid;
- int detach = context->detach;
- cleanup_free char *cgroup_path = NULL;
- cleanup_free char *scope = NULL;
- cleanup_close int terminal_fd = -1;
- cleanup_terminal void *orig_terminal = NULL;
- cleanup_close int sync_socket = -1;
- cleanup_close int notify_socket = -1;
- cleanup_close int socket_pair_0 = -1;
- cleanup_close int socket_pair_1 = -1;
- cleanup_close int seccomp_fd = -1;
- cleanup_close int console_socket_fd = -1;
- cleanup_close int hooks_out_fd = -1;
- cleanup_close int hooks_err_fd = -1;
- int cgroup_mode, cgroup_manager;
- char created[35];
- uid_t root_uid = -1;
- gid_t root_gid = -1;
- struct container_entrypoint_s container_args =
- {
- .container = container,
- .context = context,
- .terminal_socketpair = {-1, -1},
- .console_socket_fd = -1,
- .hooks_out_fd = -1,
- .hooks_err_fd = -1,
- };
- if (def->hooks && (def->hooks->prestart_len
- || def->hooks->poststart_len
- || def->hooks->create_runtime_len
- || def->hooks->create_container_len
- || def->hooks->start_container_len))
- {
- // Open the output fds (stdout/stderr) used by the hooks
- ret = open_hooks_output (container, &hooks_out_fd, &hooks_err_fd, err);
- if (UNLIKELY (ret < 0))
- return ret;
- container_args.hooks_out_fd = hooks_out_fd;
- container_args.hooks_err_fd = hooks_err_fd;
- }
- container->context = context;
- if (!detach || context->notify_socket)
- {
- ret = prctl (PR_SET_CHILD_SUBREAPER, 1, 0, 0, 0);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "set child subreaper");
- }
- if (!context->no_new_keyring)
- { // Create a keyring for this container id
- ret = libcrun_create_keyring (container->context->id, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
- {
- container_args.has_terminal_socket_pair = 1;
- ret = create_socket_pair (container_args.terminal_socketpair, err);
- if (UNLIKELY (ret < 0))
- return crun_error_wrap (err, "create terminal socket");
- socket_pair_0 = container_args.terminal_socketpair[0];
- socket_pair_1 = container_args.terminal_socketpair[1];
- }
- // Block signals
- ret = block_signals (err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (def->linux && def->linux->seccomp)
- { // Open the seccomp output (seccomp.bpf per the article), used for syscall filtering
- ret = open_seccomp_output (context->id, &seccomp_fd, false, context->state_root, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- container_args.seccomp_fd = seccomp_fd;
- if (context->console_socket)
- { // A console socket was given: connect to that UNIX domain socket; it is used for the container terminal
- console_socket_fd = open_unix_domain_client_socket (context->console_socket, 0, err);
- if (UNLIKELY (console_socket_fd < 0))
- return crun_error_wrap (err, "open console socket");
- container_args.console_socket_fd = console_socket_fd;
- }
- // Detect the cgroup mode
- cgroup_mode = libcrun_get_cgroup_mode (err);
- if (cgroup_mode < 0)
- return cgroup_mode;
- // Spawn the container process; once running it calls container_init as its entrypoint
- pid = libcrun_run_linux_container (container, container_init, &container_args,
- &sync_socket, err);
- if (UNLIKELY (pid < 0))
- return pid;
- if (context->fifo_exec_wait_fd < 0 && context->notify_socket)
- {
- /* Do not open the notify socket here on "create". "start" will take care of it. */
- ret = get_notify_fd (context, container, &notify_socket, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- if (container_args.terminal_socketpair[1] >= 0)
- close_and_reset (&socket_pair_1);
- cgroup_manager = CGROUP_MANAGER_CGROUPFS;
- if (context->systemd_cgroup)
- cgroup_manager = CGROUP_MANAGER_SYSTEMD;
- else if (context->force_no_cgroup)
- cgroup_manager = CGROUP_MANAGER_DISABLED;
- get_root_in_the_userns_for_cgroups (def, container->host_uid, container->host_gid, &root_uid, &root_gid);
- {
- struct libcrun_cgroup_args cg =
- {
- .resources = def->linux ? def->linux->resources : NULL,
- .annotations = def->annotations,
- .cgroup_mode = cgroup_mode,
- .path = &cgroup_path,
- .scope = &scope,
- .cgroup_path = def->linux ? def->linux->cgroups_path : "",
- .manager = cgroup_manager,
- .pid = pid,
- .root_uid = root_uid,
- .root_gid = root_gid,
- .id = context->id,
- .systemd_subgroup = find_systemd_subgroup (container, cgroup_mode),
- };
- // Enter the cgroup with the container process
- ret = libcrun_cgroup_enter (&cg, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- // Apply the cgroup resource configuration
- if (def->linux && def->linux->resources)
- {
- ret = libcrun_update_cgroup_resources (cgroup_mode,
- def->linux->resources,
- cgroup_path, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- }
- }
- /* sync 1: tell the container process it may start init step 1. */
- ret = sync_socket_send_sync (sync_socket, true, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- /* sync 2: wait for the container process before doing step 2. */
- ret = sync_socket_wait_sync (context, sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- // Run the prestart and create_runtime hooks
- if (def->hooks && def->hooks->prestart_len)
- {
- ret = do_hooks (def, pid, context->id, false, NULL, "created",
- (hook **) def->hooks->prestart,
- def->hooks->prestart_len, hooks_out_fd, hooks_err_fd, err);
- if (UNLIKELY (ret != 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- }
- if (def->hooks && def->hooks->create_runtime_len)
- {
- ret = do_hooks (def, pid, context->id, false, NULL, "created",
- (hook **) def->hooks->create_runtime,
- def->hooks->create_runtime_len, hooks_out_fd, hooks_err_fd, err);
- if (UNLIKELY (ret != 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- }
- // If seccomp is configured, generate the BPF program into seccomp_fd now
- if (seccomp_fd >= 0)
- {
- unsigned int seccomp_gen_options = 0;
- const char *annotation;
- annotation = find_annotation (container, "run.oci.seccomp_fail_unknown_syscall");
- if (annotation && strcmp (annotation, "0") != 0)
- seccomp_gen_options = LIBCRUN_SECCOMP_FAIL_UNKNOWN_SYSCALL;
- ret = libcrun_generate_seccomp (container, seccomp_fd, seccomp_gen_options, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- close_and_reset (&seccomp_fd);
- }
- /* sync 3: tell the container process it may start init step 3. */
- ret = sync_socket_send_sync (sync_socket, true, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- if (def->process && def->process->terminal && !detach && context->console_socket == NULL)
- { // No console socket was given: receive the terminal fd from the container over the socket pair
- terminal_fd = receive_fd_from_socket (socket_pair_0, err);
- if (UNLIKELY (terminal_fd < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- close_and_reset (&socket_pair_0);
- // Set up the terminal master fd
- ret = libcrun_setup_terminal_master (terminal_fd, &orig_terminal, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- }
- /* sync 4: wait for the container process before doing step 4. */
- ret = sync_socket_wait_sync (context, sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- ret = close_and_reset (&sync_socket);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- get_current_timestamp (created);
- // Persist the container status; pid is the process id of the container
- ret = write_container_status (container, context, pid, cgroup_path, scope, created, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- // Run the poststart ("running") hooks, only when no exec FIFO is pending
- if (context->fifo_exec_wait_fd < 0 && def->hooks && def->hooks->poststart_len)
- {
- ret = do_hooks (def, pid, context->id, true, NULL, "running",
- (hook **) def->hooks->poststart,
- def->hooks->poststart_len, hooks_out_fd, hooks_err_fd, err);
- if (UNLIKELY (ret < 0))
- return cleanup_watch (context, pid, sync_socket, terminal_fd, err);
- }
- // Returns early when detached: the wait is mainly for `crun run`, for `crun create` it is effectively a no-op
- ret = wait_for_process (pid, context, terminal_fd, notify_socket, container_ready_fd, err);
- if (!context->detach)
- {
- cleanup_watch (context, 0, sync_socket, terminal_fd, err);
- crun_error_release (err);
- }
- return ret;
- }
创建一个容器,并运行libcrun_run_linux_container实现如下:
点击(此处)折叠或打开
- /* Fork the container process, placing it in the configured namespaces.
-  *
-  * container:       container whose container_def holds the parsed config.
-  * entrypoint:      function executed inside the container process
-  *                  (container_init when called from libcrun_container_run_internal).
-  * args:            opaque argument forwarded to entrypoint.
-  * sync_socket_out: on success receives the host end of the sync socket pair.
-  * err:             receives the error description on failure.
-  *
-  * In the parent: returns the pid of the container process (the grandchild
-  * when an extra fork was needed), or a negative value on failure.
-  * In the child: does not return normally; it runs entrypoint and exits.
-  */
- pid_t
- libcrun_run_linux_container (libcrun_container_t *container,
- container_entrypoint_t entrypoint,
- void *args,
- int *sync_socket_out,
- libcrun_error_t *err)
- {
- __attribute__((cleanup (cleanup_free_init_statusp))) struct init_status_s init_status;
- runtime_spec_schema_config_schema *def = container->container_def;
- cleanup_close int sync_socket_container = -1;
- char *notify_socket_env = NULL;
- cleanup_close int sync_socket_host = -1;
- bool clone_can_create_userns;
- int sync_socket[2];
- pid_t pid;
- size_t i;
- int ret;
- // Initialize the namespace status and open the namespace fds
- ret = configure_init_status (&init_status, container, err);
- if (UNLIKELY (ret < 0))
- return ret;
- get_private_data (container)->unshare_flags = init_status.all_namespaces;
- #ifdef CLONE_NEWCGROUP
- /* cgroup will be unshared later. Once the process is in the correct cgroup. */
- init_status.all_namespaces &= ~CLONE_NEWCGROUP;
- #endif
- // Create the socket pair used by the container and the host process to talk to each other
- ret = socketpair (AF_UNIX, SOCK_SEQPACKET | SOCK_CLOEXEC, 0, sync_socket);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "socketpair");
- sync_socket_host = sync_socket[0];
- sync_socket_container = sync_socket[1];
- get_uid_gid_from_def (container->container_def,
- &container->container_uid,
- &container->container_gid);
- /* Apply the rlimits. */
- if (def->process)
- {
- ret = libcrun_set_rlimits (def->process->rlimits, def->process->rlimits_len, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- // Apply the OOM settings
- ret = libcrun_set_oom (container, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Work out, per namespace, whether an existing one is joined or a new one must be created
- if ((init_status.all_namespaces & CLONE_NEWIPC) && (init_status.all_namespaces & CLONE_NEWUSER))
- {
- for (i = 0; i < init_status.fd_len; i++)
- if (init_status.value[i] == CLONE_NEWIPC)
- init_status.join_ipcns = true;
- }
- if (init_status.all_namespaces & CLONE_NEWPID)
- {
- init_status.must_fork = true;
- for (i = 0; i < init_status.fd_len; i++)
- {
- if (init_status.value[i] == CLONE_NEWPID)
- {
- init_status.join_pidns = true;
- if (setns (init_status.fd[i], CLONE_NEWPID) == 0)
- {
- init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
- init_status.must_fork = false;
- close_and_reset (&init_status.fd[i]);
- }
- break;
- }
- }
- /* It creates a new PID namespace, without a user namespace, we can try to
- join it immediately without another fork. */
- if (i == init_status.fd_len && (init_status.all_namespaces & CLONE_NEWUSER) == 0)
- {
- if (unshare (CLONE_NEWPID) == 0)
- {
- init_status.namespaces_to_unshare &= ~CLONE_NEWPID;
- init_status.must_fork = false;
- }
- }
- }
- #ifdef CLONE_NEWTIME
- if (init_status.all_namespaces & CLONE_NEWTIME)
- init_status.must_fork = true;
- #endif
- clone_can_create_userns = init_status.fd_len == 0;
- if ((init_status.all_namespaces & CLONE_NEWUSER) && init_status.userns_index < 0)
- init_status.delayed_userns_create = !clone_can_create_userns || init_status.fd_len > 0;
- // Create the container process
- pid = syscall_clone ((init_status.namespaces_to_unshare & (clone_can_create_userns ? CLONE_NEWUSER : 0)) | SIGCHLD, NULL);
- if (UNLIKELY (pid < 0))
- return crun_make_error (err, errno, "clone");
- if (clone_can_create_userns)
- init_status.namespaces_to_unshare &= ~CLONE_NEWUSER;
- if (pid)
- { // Host (parent) process: save the external descriptors of the container
- ret = save_external_descriptors (container, pid, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Close the socket end the host does not need, i.e. the one used by the container
- ret = close_and_reset (&sync_socket_container);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "close");
- // With a user namespace, the host has to wait for the container to be ready
- if (init_status.all_namespaces & CLONE_NEWUSER)
- {
- if (init_status.delayed_userns_create)
- {
- ret = expect_success_from_sync_socket (sync_socket_host, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- if (init_status.userns_index < 0)
- {
- ret = libcrun_set_usernamespace (container, pid, err);
- if (UNLIKELY (ret < 0))
- return ret;
- ret = TEMP_FAILURE_RETRY (write (sync_socket_host, "1", 1));
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "write to sync socket");
- }
- }
- // If the container forks once more, wait for the pid of the process it forked
- if (init_status.must_fork)
- {
- pid_t grandchild = 0;
- ret = expect_success_from_sync_socket (sync_socket_host, err);
- if (UNLIKELY (ret < 0))
- return ret;
- ret = TEMP_FAILURE_RETRY (read (sync_socket_host, &grandchild, sizeof (grandchild)));
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "read pid from sync socket");
- /* Cleanup the first process. */
- waitpid (pid, NULL, 0);
- pid = grandchild;
- }
- ret = expect_success_from_sync_socket (sync_socket_host, err);
- if (UNLIKELY (ret < 0))
- return ret;
- *sync_socket_out = get_and_reset (&sync_socket_host);
- // Return the pid of the final container process
- return pid;
- }
- /* Inside the container process. */
- // The container process closes the host end of the socket pair, which it does not use
- ret = close_and_reset (&sync_socket_host);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "close");
- // Namespace-related initialization (join/create the namespaces)
- ret = init_container (container, sync_socket_container, &init_status, err);
- if (UNLIKELY (ret < 0))
- {
- char failure = 1;
- ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &failure, 1));
- if (UNLIKELY (ret < 0))
- goto localfail;
- send_error_to_sync_socket_and_die (sync_socket_container, false, err);
- localfail:
- libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
- _exit (EXIT_FAILURE);
- }
- else
- {
- char success = 0;
- ret = TEMP_FAILURE_RETRY (write (sync_socket_container, &success, 1));
- if (UNLIKELY (ret < 0))
- return ret;
- }
- /* Jump into the specified entrypoint. */
- if (container->context->notify_socket)
- xasprintf (&notify_socket_env, "NOTIFY_SOCKET=%s/notify", container->context->notify_socket);
- // Run the real container initialization, i.e. container_init
- entrypoint (args, notify_socket_env, sync_socket_container, err);
- /* ENTRYPOINT returns only on an error, fallback here: */
- if (*err)
- libcrun_fail_with_error ((*err)->status, "%s", (*err)->msg);
- _exit (EXIT_FAILURE);
- }
容器进程初始化,container_init实现如下:
点击(此处)折叠或打开
- /* Entrypoint executed inside the container process.
-  *
-  * args:          struct container_entrypoint_s prepared by the host process.
-  * notify_socket: "NOTIFY_SOCKET=..." environment string or NULL; freed on return.
-  * sync_socket:   container end of the sync socket pair.
-  * err:           receives the error description on failure.
-  *
-  * Runs container_init_setup, completes the sync handshake, blocks on the exec
-  * FIFO when one is set (the `create` case), applies seccomp, runs the
-  * start_container hooks and finally execv()s the user program.
-  * Returns only on failure (negative value or the non-zero hook result).
-  */
- static int container_init (void *args, char *notify_socket, int sync_socket,
- libcrun_error_t *err)
- {
- struct container_entrypoint_s *entrypoint_args = args;
- int ret;
- runtime_spec_schema_config_schema *def = entrypoint_args->container->container_def;
- cleanup_free const char *exec_path = NULL;
- cleanup_free char *notify_socket_cleanup = notify_socket;
- entrypoint_args->sync_socket = sync_socket;
- // Redirect log output to the sync socket
- crun_set_output_handler (log_write_to_sync_socket, args, false);
- // Initialize the container: set up its individual components
- ret = container_init_setup (args, notify_socket, sync_socket, &exec_path, err);
- if (UNLIKELY (ret < 0))
- {
- /* If it fails to write the error using the sync socket, then fallback
- to stderr. */
- if (sync_socket_write_error (sync_socket, err) < 0)
- return ret;
- crun_error_release (err);
- return ret;
- }
- entrypoint_args->sync_socket = -1;
- // Unblock the signals again
- ret = unblock_signals (err);
- if (UNLIKELY (ret < 0))
- return ret;
- /* sync 4: tell the host process it may proceed with step 4. */
- ret = sync_socket_send_sync (sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return ret;
- close_and_reset (&sync_socket);
- // fifo_exec_wait_fd >= 0 means this is a `create` call: block here until `crun start` is run on the command line
- if (entrypoint_args->context->fifo_exec_wait_fd >= 0)
- {
- char buffer[1];
- fd_set read_set;
- cleanup_close int fd = entrypoint_args->context->fifo_exec_wait_fd;
- entrypoint_args->context->fifo_exec_wait_fd = -1;
- FD_ZERO (&read_set);
- FD_SET (fd, &read_set);
- do
- {
- ret = select (fd + 1, &read_set, NULL, NULL, NULL);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "select");
- ret = TEMP_FAILURE_RETRY (read (fd, buffer, sizeof (buffer)));
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "read from the exec fifo");
- }
- while (ret == 0);
- close_and_reset (&entrypoint_args->context->fifo_exec_wait_fd);
- }
- // From here on, log output goes to stderr
- crun_set_output_handler (log_write_to_stderr, NULL, false);
- if (def->process && def->process->no_new_privileges)
- {
- char **seccomp_flags = NULL;
- size_t seccomp_flags_len = 0;
- if (def->linux && def->linux->seccomp)
- {
- seccomp_flags = def->linux->seccomp->flags;
- seccomp_flags_len = def->linux->seccomp->flags_len;
- }
- // Apply the seccomp (syscall filtering) configuration
- ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
- if (UNLIKELY (ret < 0))
- return ret;
- close_and_reset (&entrypoint_args->seccomp_fd);
- }
- if (UNLIKELY (def->process == NULL))
- return crun_make_error (err, 0, "block 'process' not found");
- if (UNLIKELY (exec_path == NULL))
- return crun_make_error (err, 0, "executable path not specified");
- // Run the start_container ("starting") hooks
- if (def->hooks && def->hooks->start_container_len)
- {
- libcrun_container_t *container = entrypoint_args->container;
- ret = do_hooks (def, 0, container->context->id, false, NULL, "starting",
- (hook **) def->hooks->start_container,
- def->hooks->start_container_len,
- entrypoint_args->hooks_out_fd,
- entrypoint_args->hooks_err_fd,
- err);
- if (UNLIKELY (ret != 0))
- return ret;
- /* Seek stdout/stderr to the end. If the hooks were using the same files,
- the container process overwrites what was previously written. */
- (void) lseek (1, 0, SEEK_END);
- (void) lseek (2, 0, SEEK_END);
- }
- // Execute the application of the container: from here on the user program runs
- execv (exec_path, def->process->args);
- if (errno == ENOENT)
- return crun_make_error (err, errno, "exec container process (missing dynamic library?) `%s`", exec_path);
- return crun_make_error (err, errno, "exec container process `%s`", exec_path);
- }
container_init_setup函数实现如下:
点击(此处)折叠或打开
- /* Container-side setup performed before the user program is executed.
-  *
-  * args:          struct container_entrypoint_s prepared by the host process.
-  * notify_socket: "NOTIFY_SOCKET=..." environment string or NULL; put into the
-  *                environment at the end.
-  * sync_socket:   container end of the sync socket pair (steps 1 to 3).
-  * exec_path:     out parameter, receives the resolved path of the program to
-  *                execute (left untouched when no process args are configured).
-  * err:           receives the error description on failure.
-  *
-  * Returns 0 on success; on failure a negative value, or the non-zero result
-  * of the create_container hooks.
-  */
- static int
- container_init_setup (void *args, char *notify_socket,
- int sync_socket, const char **exec_path,
- libcrun_error_t *err)
- {
- struct container_entrypoint_s *entrypoint_args = args;
- libcrun_container_t *container = entrypoint_args->container;
- int ret;
- int has_terminal;
- cleanup_close int console_socket = -1;
- cleanup_close int console_socketpair = -1;
- runtime_spec_schema_config_schema *def = container->container_def;
- runtime_spec_schema_config_schema_process_capabilities *capabilities;
- cleanup_free char *rootfs = NULL;
- int no_new_privs;
- // Initialize the security modules (SELinux and AppArmor, per the article)
- ret = initialize_security (def->process, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Configure the network (per the article: bring up the loopback interface)
- ret = libcrun_configure_network (container, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Resolve the real path of the rootfs
- rootfs = realpath (def->root->path, NULL);
- if (UNLIKELY (rootfs == NULL))
- {
- /* If realpath failed for any reason, try the relative directory. */
- rootfs = xstrdup (def->root->path);
- }
- // Keep only the container end of the terminal socket pair
- if (entrypoint_args->terminal_socketpair[0] >= 0)
- {
- close_and_reset (&entrypoint_args->terminal_socketpair[0]);
- console_socketpair = entrypoint_args->terminal_socketpair[1];
- }
- /* sync 1: wait until the host process allows init step 1 to start. */
- ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return ret;
- has_terminal = container->container_def->process && container->container_def->process->terminal;
- if (has_terminal && entrypoint_args->context->console_socket)
- console_socket = entrypoint_args->console_socket_fd;
- // Apply the sysctl settings
- ret = libcrun_set_sysctl (container, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Mount the required file systems (including the new rootfs)
- ret = libcrun_set_mounts (container, rootfs, err);
- if (UNLIKELY (ret < 0))
- return ret;
- /* sync 2: tell the host process it may start step 2. */
- ret = sync_socket_send_sync (sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return ret;
- /* sync 3: wait until the host process allows step 3 to start. */
- ret = sync_socket_wait_sync (NULL, sync_socket, false, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Run the create_container ("created") hooks
- if (def->hooks && def->hooks->create_container_len)
- {
- ret = do_hooks (def, 0, container->context->id, false, NULL, "created",
- (hook **) def->hooks->create_container,
- def->hooks->create_container_len,
- entrypoint_args->hooks_out_fd,
- entrypoint_args->hooks_err_fd,
- err);
- if (UNLIKELY (ret != 0))
- return ret;
- }
- // Set the SELinux exec label and the AppArmor profile
- if (def->process)
- {
- ret = libcrun_set_selinux_exec_label (def->process, err);
- if (UNLIKELY (ret < 0))
- return ret;
- ret = libcrun_set_apparmor_profile (def->process, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- // Close the fds that are not needed; a failure here only produces a warning
- ret = close_fds_ge_than (entrypoint_args->context->preserve_fds + 3, err);
- if (UNLIKELY (ret < 0))
- crun_error_write_warning_and_release (entrypoint_args->context->output_handler_arg, &err);
- // Switch to the new rootfs
- ret = libcrun_do_pivot_root (container, entrypoint_args->context->no_pivot, rootfs, err);
- if (UNLIKELY (ret < 0))
- return ret;
- // Reopen /dev/null
- ret = libcrun_reopen_dev_null (err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (clearenv ())
- return crun_make_error (err, errno, "clearenv");
- // The environment was cleared above; now install the environment from the configuration
- if (def->process)
- {
- size_t i;
- for (i = 0; i < def->process->env_len; i++)
- if (putenv (def->process->env[i]) < 0)
- return crun_make_error (err, errno, "putenv `%s`", def->process->env[i]);
- }
- if (getenv ("HOME") == NULL)
- {
- ret = set_home_env (container->container_uid);
- if (UNLIKELY (ret < 0 && errno != ENOTSUP))
- {
- setenv("HOME", "/", 1);
- libcrun_warning ("cannot detect HOME environment variable, setting default");
- }
- }
- if (def->process && def->process->cwd)
- if (UNLIKELY (chdir (def->process->cwd) < 0))
- return crun_make_error (err, errno, "chdir");
- // Locate the user program
- if (def->process && def->process->args)
- {
- *exec_path = find_executable (def->process->args[0], def->process->cwd);
- if (UNLIKELY (*exec_path == NULL))
- {
- if (errno == ENOENT)
- return crun_make_error (err, errno, "executable file not found in $PATH");
- return crun_make_error (err, errno, "open executable");
- }
- }
- ret = setsid ();
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "setsid");
- // With a terminal, send the terminal fd to whoever listens on the console socket or the socket pair
- if (has_terminal)
- {
- cleanup_close int terminal_fd = -1;
- fflush (stderr);
- terminal_fd = libcrun_set_terminal (container, err);
- if (UNLIKELY (terminal_fd < 0))
- return terminal_fd;
- if (console_socket >= 0)
- {
- ret = send_fd_to_socket (console_socket, terminal_fd, err);
- if (UNLIKELY (ret < 0))
- return ret;
- close_and_reset (&console_socket);
- }
- else if (entrypoint_args->has_terminal_socket_pair && console_socketpair >= 0)
- {
- ret = send_fd_to_socket (console_socketpair, terminal_fd, err);
- if (UNLIKELY (ret < 0))
- return ret;
- close_and_reset (&console_socketpair);
- }
- }
- // Set the host name of the container
- ret = libcrun_set_hostname (container, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (container->container_def->linux && container->container_def->linux->personality)
- {
- ret = libcrun_set_personality (container->container_def->linux->personality, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- // def->process may be NULL (every other access in this function guards it), so check it before reading ->user
- if (def->process && def->process->user)
- umask (def->process->user->umask_present ? def->process->user->umask : 0022);
- // Without no_new_privileges the prepared seccomp rules are applied here; otherwise container_init applies them later
- if (def->process && !def->process->no_new_privileges)
- {
- char **seccomp_flags = NULL;
- size_t seccomp_flags_len = 0;
- if (def->linux && def->linux->seccomp)
- {
- seccomp_flags = def->linux->seccomp->flags;
- seccomp_flags_len = def->linux->seccomp->flags_len;
- }
- ret = libcrun_apply_seccomp (entrypoint_args->seccomp_fd, seccomp_flags, seccomp_flags_len, err);
- if (UNLIKELY (ret < 0))
- return ret;
- close_and_reset (&entrypoint_args->seccomp_fd);
- }
- capabilities = def->process ? def->process->capabilities : NULL;
- no_new_privs = def->process ? def->process->no_new_privileges : 1;
- ret = libcrun_set_caps (capabilities, container->container_uid, container->container_gid, no_new_privs, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (notify_socket)
- {
- if (putenv (notify_socket) < 0)
- return crun_make_error (err, errno, "putenv `%s`", notify_socket);
- }
- return 0;
- }
wait_for_process等待函数实现如下:
点击(此处)折叠或打开
- /* Wait for the container process on behalf of the host process.
-  * pid:                pid of the container process.
-  * context:            runtime context; pid_file and detach are read here.
-  * terminal_fd:        terminal master fd, or negative when there is none.
-  * notify_socket:      notify socket fd, or negative when there is none.
-  * container_ready_fd: when >= 0, an int 0 is written to it and it is closed.
-  * err:                receives the error description on failure.
-  * Writes the pid file if requested. Returns 0 immediately when detached and
-  * no notify socket is in use; otherwise forwards terminal I/O and signals
-  * until the last process is reaped and returns the container exit code.
-  * Returns a negative value on failure.
-  */ static int wait_for_process (pid_t pid, libcrun_context_t *context, int terminal_fd, int notify_socket, int container_ready_fd, libcrun_error_t *err)
- {
- cleanup_close int epollfd = -1;
- cleanup_close int signalfd = -1;
- int ret, container_exit_code = 0, last_process;
- sigset_t mask;
- int fds[10];
- int levelfds[10];
- int levelfds_len = 0;
- int fds_len = 0;
- container_exit_code = 0;
- if (context->pid_file)
- {
- char buf[12];
- size_t buf_len = sprintf (buf, "%d", pid);
- ret = write_file (context->pid_file, buf, buf_len, err);
- if (UNLIKELY (ret < 0))
- return ret;
- }
- /* `crun create` returns here, while `crun run` keeps going. */
- if (context->detach && notify_socket < 0)
- return 0;
- if (container_ready_fd >= 0)
- {
- ret = 0;
- TEMP_FAILURE_RETRY (write (container_ready_fd, &ret, sizeof (ret)));
- close_and_reset (&container_ready_fd);
- }
- sigfillset (&mask);
- ret = sigprocmask (SIG_BLOCK, &mask, NULL);
- if (UNLIKELY (ret < 0))
- return crun_make_error (err, errno, "sigprocmask");
- // Turn all signals sent to this (host) process into events on a file descriptor
- signalfd = create_signalfd (&mask, err);
- if (UNLIKELY (signalfd < 0))
- return signalfd;
- // Reap child processes that have already exited
- ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (last_process)
- return container_exit_code;
- // Collect the fds to poll: the signalfd, the notify (systemd) socket, stdin and the terminal; both lists end with -1
- fds[fds_len++] = signalfd;
- if (notify_socket >= 0)
- fds[fds_len++] = notify_socket;
- if (terminal_fd >= 0)
- {
- fds[fds_len++] = 0;
- levelfds[levelfds_len++] = terminal_fd;
- }
- fds[fds_len++] = -1;
- levelfds[levelfds_len++] = -1;
- // Create the epoll instance
- epollfd = epoll_helper (fds, levelfds, err);
- if (UNLIKELY (epollfd < 0))
- return epollfd;
- while (1)
- {
- struct signalfd_siginfo si;
- ssize_t res;
- struct epoll_event events[10];
- int i, nr_events;
- nr_events = TEMP_FAILURE_RETRY (epoll_wait (epollfd, events, 10, -1));
- if (UNLIKELY (nr_events < 0))
- return crun_make_error (err, errno, "epoll_wait");
- for (i = 0; i < nr_events; i++)
- { // Forwarding step: this is what lets `crun run` show the container's input/output live
- if (events[i].data.fd == 0)
- {
- ret = copy_from_fd_to_fd (0, terminal_fd, 0, err);
- if (UNLIKELY (ret < 0))
- return crun_error_wrap (err, "copy to terminal fd");
- }
- else if (events[i].data.fd == terminal_fd)
- {
- ret = set_blocking_fd (terminal_fd, 0, err);
- if (UNLIKELY (ret < 0))
- return crun_error_wrap (err, "set terminal fd not blocking");
- ret = copy_from_fd_to_fd (terminal_fd, 1, 1, err);
- if (UNLIKELY (ret < 0))
- return crun_error_wrap (err, "copy from terminal fd");
- ret = set_blocking_fd (terminal_fd, 1, err);
- if (UNLIKELY (ret < 0))
- return crun_error_wrap (err, "set terminal fd blocking");
- }// Event on the notify socket (the one meant for systemd)
- else if (events[i].data.fd == notify_socket)
- {
- ret = handle_notify_socket (notify_socket, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (ret && context->detach)
- return 0;
- } // A signal was received
- else if (events[i].data.fd == signalfd)
- {
- res = TEMP_FAILURE_RETRY (read (signalfd, &si, sizeof (si)));
- if (UNLIKELY (res < 0))
- return crun_make_error (err, errno, "read from signalfd");
- if (si.ssi_signo == SIGCHLD)
- {// A child exited; once the last process is reaped, `crun run` returns its exit code
- ret = reap_subprocesses (pid, &container_exit_code, &last_process, err);
- if (UNLIKELY (ret < 0))
- return ret;
- if (last_process)
- return container_exit_code;
- }
- else
- {
- /* Send any other signal to the child process. */
- ret = kill (pid, si.ssi_signo);
- }
- }
- else
- {
- return crun_make_error (err, 0, "unknown fd from epoll_wait");
- }
- }
- }
- return 0;
- }
这里贴出C代码编写的容器运行时crun的程序调用栈。
另外crun, 大概涉及到systemd, seccomp, sysctl, oom, keyring, rlimit, apparmor,selinux, cgroup, namespace(UTS, IPC, PID, NET,MOUNT,USER)等与Linux强相关的组件。
创建容器: crun create container_id (当前目录下必须有config.json和rootfs)
运行容器: crun start container_id
删除容器: crun delete container_id
创建+运行容器: crun run container_id
查看进程id:crun ps container_id