From a8f577c51d851b0a15907ebbc2fbec4b9f2208ab Mon Sep 17 00:00:00 2001 From: Daniel J Walsh Date: Thu, 23 Feb 2017 15:40:45 -0500 Subject: [PATCH 1/2] Remove docker path from systemdhook --- src/systemdhook.c | 175 +++++++++++++++------------------------------- 1 file changed, 58 insertions(+), 117 deletions(-) diff --git a/src/systemdhook.c b/src/systemdhook.c index 74cfdd6..7f0f326 100644 --- a/src/systemdhook.c +++ b/src/systemdhook.c @@ -363,17 +363,6 @@ static int move_mounts(const char *rootfs, return -1; } - /* Special case for /run/secrets which will not be in the - config_mounts */ - if (strcmp("/run", path) == 0) { - if (move_mount_to_tmp(rootfs, tmp_dir, "/run/secrets", strlen(path)) < 0) { - if (errno != EINVAL && errno != ENOENT) { - pr_perror("Failed to move secrets dir"); - return -1; - } - } - } - /* Move other user specified mounts under PATH to temporary directory */ for (unsigned i = 0; i < config_mounts_len; i++) { /* Match destinations that begin with PATH */ @@ -631,8 +620,6 @@ static int poststop(const char *rootfs, return ret; } -#define DOCKER_CONTAINER "docker" - int main(int argc, char *argv[]) { size_t rd; @@ -642,7 +629,6 @@ int main(int argc, char *argv[]) char stateData[CONFIGSZ]; char configData[CONFIGSZ]; _cleanup_fclose_ FILE *fp = NULL; - bool docker = false; stateData[0] = 0; errbuf[0] = 0; @@ -694,28 +680,14 @@ int main(int argc, char *argv[]) } char *id = YAJL_GET_STRING(v_id); - const char *ctr = getenv("container"); - if (ctr && !strncmp(ctr, DOCKER_CONTAINER, strlen(DOCKER_CONTAINER))) { - docker = true; - } - - if (docker) { - if (argc < 3) { - pr_perror("cannot find config file to use"); - return EXIT_FAILURE; - } - - fp = fopen(argv[2], "r"); - } else { - /* bundle_path must be specified for the OCI hooks, and from there we read the configuration file. - If it is not specified, then check that it is specified on the command line. */ - const char *bundle_path[] = { "bundlePath", (const char *)0 }; - yajl_val v_bundle_path = yajl_tree_get(node, bundle_path, yajl_t_string); - if (v_bundle_path) { - char config_file_name[PATH_MAX]; - sprintf(config_file_name, "%s/config.json", YAJL_GET_STRING(v_bundle_path)); - fp = fopen(config_file_name, "r"); - } + /* bundle_path must be specified for the OCI hooks, and from there we read the configuration file. + If it is not specified, then check that it is specified on the command line. */ + const char *bundle_path[] = { "bundlePath", (const char *)0 }; + yajl_val v_bundle_path = yajl_tree_get(node, bundle_path, yajl_t_string); + if (v_bundle_path) { + char config_file_name[PATH_MAX]; + sprintf(config_file_name, "%s/config.json", YAJL_GET_STRING(v_bundle_path)); + fp = fopen(config_file_name, "r"); } @@ -744,105 +716,74 @@ int main(int argc, char *argv[]) return EXIT_FAILURE; } - char *cmd = NULL; char *mount_label = NULL; const char **config_mounts = NULL; unsigned config_mounts_len = 0; - if (!docker) { - /* Extract values from the config json */ - const char *mount_label_path[] = { "linux", "mountLabel", (const char *)0 }; - yajl_val v_mount = yajl_tree_get(config_node, mount_label_path, yajl_t_string); - mount_label = v_mount ? YAJL_GET_STRING(v_mount) : ""; - - const char *mount_points_path[] = {"mounts", (const char *)0 }; - yajl_val v_mounts = yajl_tree_get(config_node, mount_points_path, yajl_t_array); - if (!v_mounts) { - pr_perror("mounts not found in config"); - return EXIT_FAILURE; - } + /* Extract values from the config json */ + const char *mount_label_path[] = { "linux", "mountLabel", (const char *)0 }; + yajl_val v_mount = yajl_tree_get(config_node, mount_label_path, yajl_t_string); + mount_label = v_mount ? YAJL_GET_STRING(v_mount) : ""; - config_mounts_len = YAJL_GET_ARRAY(v_mounts)->len; - config_mounts = malloc (sizeof(char *) * (config_mounts_len + 1)); - if (! config_mounts) { - pr_perror("error malloc'ing"); - return EXIT_FAILURE; - } + const char *mount_points_path[] = {"mounts", (const char *)0 }; + yajl_val v_mounts = yajl_tree_get(config_node, mount_points_path, yajl_t_array); + if (!v_mounts) { + pr_perror("mounts not found in config"); + return EXIT_FAILURE; + } - for (unsigned int i = 0; i < config_mounts_len; i++) { - yajl_val v_mounts_values = YAJL_GET_ARRAY(v_mounts)->values[i]; + config_mounts_len = YAJL_GET_ARRAY(v_mounts)->len; + config_mounts = malloc (sizeof(char *) * (config_mounts_len + 1)); + if (! config_mounts) { + pr_perror("error malloc'ing"); + return EXIT_FAILURE; + } - const char *destination_path[] = {"destination", (const char *)0 }; - yajl_val v_destination = yajl_tree_get(v_mounts_values, destination_path, yajl_t_string); - if (!v_destination) { - pr_perror("Cannot find mount destination"); - return EXIT_FAILURE; - } - config_mounts[i] = YAJL_GET_STRING(v_destination); - } + for (unsigned int i = 0; i < config_mounts_len; i++) { + yajl_val v_mounts_values = YAJL_GET_ARRAY(v_mounts)->values[i]; - const char *args_path[] = {"process", "args", (const char *)0 }; - yajl_val v_args = yajl_tree_get(config_node, args_path, yajl_t_array); - if (!v_args) { - pr_perror("args not found in config"); + const char *destination_path[] = {"destination", (const char *)0 }; + yajl_val v_destination = yajl_tree_get(v_mounts_values, destination_path, yajl_t_string); + if (!v_destination) { + pr_perror("Cannot find mount destination"); return EXIT_FAILURE; } - yajl_val v_arg0_value = YAJL_GET_ARRAY(v_args)->values[0]; - cmd = YAJL_GET_STRING(v_arg0_value); - - const char *envs[] = {"process", "env", (const char *)0 }; - yajl_val v_envs = yajl_tree_get(config_node, envs, yajl_t_array); - if (v_envs) { - for (unsigned int i = 0; i < YAJL_GET_ARRAY(v_envs)->len; i++) { - yajl_val v_env = YAJL_GET_ARRAY(v_envs)->values[i]; - char *str = YAJL_GET_STRING(v_env); - if (strncmp (str, "container_uuid=", strlen ("container_uuid=")) == 0) { - id = strdup (str + strlen ("container_uuid=")); - /* systemd expects $container_uuid= to be an UUID but then treat it as - not containing any '-'. Do the same here. */ - char *to = id; - for (char *from = to; *from; from++) { + config_mounts[i] = YAJL_GET_STRING(v_destination); + } + + const char *args_path[] = {"process", "args", (const char *)0 }; + yajl_val v_args = yajl_tree_get(config_node, args_path, yajl_t_array); + if (!v_args) { + pr_perror("args not found in config"); + return EXIT_FAILURE; + } + + const char *envs[] = {"process", "env", (const char *)0 }; + yajl_val v_envs = yajl_tree_get(config_node, envs, yajl_t_array); + if (v_envs) { + for (unsigned int i = 0; i < YAJL_GET_ARRAY(v_envs)->len; i++) { + yajl_val v_env = YAJL_GET_ARRAY(v_envs)->values[i]; + char *str = YAJL_GET_STRING(v_env); + if (strncmp (str, "container_uuid=", strlen ("container_uuid=")) == 0) { + id = strdup (str + strlen ("container_uuid=")); + /* systemd expects $container_uuid= to be an UUID but then treat it as + not containing any '-'. Do the same here. */ + char *to = id; + for (char *from = to; *from; from++) { if (*from != '-') *to++ = *from; - } - *to = '\0'; } + *to = '\0'; } } - } else { - /* Handle the Docker case here. */ - - /* Extract values from the config json */ - const char *mount_label_path[] = { "MountLabel", (const char *)0 }; - yajl_val v_mount = yajl_tree_get(config_node, mount_label_path, yajl_t_string); - if (!v_mount) { - pr_perror("MountLabel not found in config"); - return EXIT_FAILURE; - } - mount_label = YAJL_GET_STRING(v_mount); - - /* Extract values from the config json */ - const char *mount_points_path[] = { "MountPoints", (const char *)0 }; - yajl_val v_mps = yajl_tree_get(config_node, mount_points_path, yajl_t_object); - if (!v_mps) { - pr_perror("MountPoints not found in config"); - return EXIT_FAILURE; - } - - config_mounts = YAJL_GET_OBJECT(v_mps)->keys; - config_mounts_len = YAJL_GET_OBJECT(v_mps)->len; - - const char *cmd_path[] = { "Path", (const char *)0 }; - yajl_val v_cmd = yajl_tree_get(config_node, cmd_path, yajl_t_string); - if (!v_cmd) { - pr_perror("Path not found in config"); - return EXIT_FAILURE; - } - cmd = YAJL_GET_STRING(v_cmd); } #if ARGS_CHECK - /* Don't do anything if init is actually docker bind mounted /dev/init */ + char *cmd = NULL; + yajl_val v_arg0_value = YAJL_GET_ARRAY(v_args)->values[0]; + cmd = YAJL_GET_STRING(v_arg0_value); + + /* Don't do anything if init is actually container runtime bind mounted /dev/init */ if (!strcmp(cmd, "/dev/init")) { pr_pdebug("Skipping as container command is /dev/init, not systemd init\n"); return EXIT_SUCCESS; From 8d97dfc5d51601677a903b17c9146886addf4006 Mon Sep 17 00:00:00 2001 From: Daniel J Walsh Date: Tue, 28 Feb 2017 14:05:33 -0500 Subject: [PATCH 2/2] Reimplement systemd handling. Following suggested changes from Lennart. This patch set will mount a tmpfs on /sys/fs/systemd inside of the container if it is not already mounted. If it is already mounted it will mount the scope directory from the host for the running container into the container in the same location grep systemd /proc/self/cgroup 1:name=systemd:/system.slice/docker-325a07e48b91822d5ad837ef4117355dcd2b9389daa148585f5083de7580570a.scope The code would take this to generate the path /sys/fs/cgroup/systemd/system.slice/docker-325a07e48b91822d5ad837ef4117355dcd2b9389daa148585f5083de7580570a.scope And then mount this on $ROOTFS/sys/fs/cgroup/systemd/system.slice/docker-325a07e48b91822d5ad837ef4117355dcd2b9389daa148585f5083de7580570a.scope Finally it will change the labels on this directory to match the mount label of the container. This will allow systemd inside of the container to write to only the appropriate directory on the host. --- src/systemdhook.c | 197 +++++++++++++++++++++++++--------------------- 1 file changed, 107 insertions(+), 90 deletions(-) diff --git a/src/systemdhook.c b/src/systemdhook.c index 7f0f326..2fc2367 100644 --- a/src/systemdhook.c +++ b/src/systemdhook.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -83,6 +84,7 @@ DEFINE_CLEANUP_FUNC(yajl_val, yajl_tree_free) #define CONFIGSZ 65536 #define CGROUP_ROOT "/sys/fs/cgroup" +#define CGROUP_SYSTEMD CGROUP_ROOT"/systemd" static int makepath(char *dir, mode_t mode) { @@ -114,83 +116,28 @@ static int bind_mount(const char *src, const char *dest, int readonly) { return 0; } -/* error callback */ -static int parser_errcb(struct libmnt_table *tb __attribute__ ((__unused__)), - const char *filename, int line) -{ - pr_perror("%s: parse error at line %d", filename, line); - return 0; -} - -static struct libmnt_table *parse_tabfile(const char *path) -{ - int rc; - struct libmnt_table *tb = mnt_new_table(); - - if (!tb) { - pr_perror("failed to initialize libmount table"); - return NULL; - } - - mnt_table_set_parser_errcb(tb, parser_errcb); - - rc = mnt_table_parse_file(tb, path); - - if (rc) { - mnt_free_table(tb); - pr_perror("can't read %s", path); - return NULL; - } - return tb; -} - -/* reads filesystems from @tb (libmount) looking for cgroup file systems - then bind mounts these file systems over rootfs - */ -static int mount_cgroup(struct libmnt_table *tb, - struct libmnt_fs *fs, - const char *rootfs) -{ - _cleanup_mnt_fs_ struct libmnt_fs *chld = NULL; - _cleanup_mnt_iter_ struct libmnt_iter *itr = NULL; - - if (!fs) { - /* first call, get root FS */ - if (mnt_table_get_root_fs(tb, &fs)) - return -1; - - } - - itr = mnt_new_iter(MNT_ITER_FORWARD); - if (!itr) - return -1; +static int chcon(const char *path, const char *label) { + DIR *dir; + struct dirent *ent; + if ((dir = opendir (path)) != NULL) { + /* print all the files and directories within directory */ + while ((ent = readdir (dir)) != NULL) { + _cleanup_free_ char *full_path = NULL; + if (asprintf(&full_path, "%s/%s", path, ent->d_name) < 0) { + pr_perror("Failed to create path for chcon"); + closedir(dir); + return -1; - /* - * add all children to the output table - */ - while (mnt_table_next_child_fs(tb, itr, fs, &chld) == 0) { - const char *src = mnt_fs_get_target(chld); - if (strncmp(src, CGROUP_ROOT, strlen(CGROUP_ROOT)) == 0) { - char dest[PATH_MAX]; - snprintf(dest, PATH_MAX, "%s%s", rootfs, src); - - if (makepath(dest, 0755) == -1) { - if (errno != EEXIST) { - pr_perror("Failed to mkdir container cgroup dir: %s", dest); - return -1; - } } - /* Running systemd in a container requires you to - mount all cgroup file systems readonly except - /sys/fs/cgroup/systemd - */ - int readonly = (strcmp(src,"/sys/fs/cgroup/systemd") != 0); - if (bind_mount(src, dest, readonly) < 0) { - return -1; + if (setfilecon (full_path, label) < 0) { + pr_perror("Failed to set context %s on %s", label, path); } } - if (mount_cgroup(tb, chld, rootfs)) - return -1; + closedir (dir); + } else { + /* could not open directory */ + pr_perror("Failed to set labels on %s", path); + return -1; } return 0; } @@ -259,7 +206,14 @@ static char *get_process_cgroup_subsystem_path(int pid, const char *subsystem) { return NULL; } pr_pdebug("PATH: %s", path); - rc = asprintf(&subsystem_path, "%s/%s%s", CGROUP_ROOT, subsystem, path); + const char *subpath = strchr(subsystem, '='); + if (subpath == NULL) { + subpath = subsystem; + } else { + subpath++; + } + + rc = asprintf(&subsystem_path, "%s/%s%s", CGROUP_ROOT, subpath, path); if (rc < 0) { pr_perror("Failed to allocate memory for subsystemd path"); return NULL; @@ -273,6 +227,39 @@ static char *get_process_cgroup_subsystem_path(int pid, const char *subsystem) { return NULL; } +/* + Mount a tmpfs on the /sys/fs/systemd directory inside of container. + Create a systemd subdir + Remount the tmpfs read/only + */ +static int mount_cgroup(const char *rootfs, const char *options, char *systemd_path) +{ + _cleanup_free_ char *cgroup_path = NULL; + + if (asprintf(&cgroup_path, "%s/%s", rootfs, CGROUP_ROOT) < 0) { + pr_perror("Failed to create path for %s", CGROUP_ROOT); + return -1; + } + if ((makepath(cgroup_path, 0755) == -1) && (errno != EEXIST)) { + pr_perror("Failed to mkdir new dest: %s", cgroup_path); + return -1; + } + /* Mount tmpfs at new cgroup directory */ + if (mount("tmpfs", cgroup_path, "tmpfs", MS_NODEV|MS_NOSUID, options) == -1) { + pr_perror("Failed to mount tmpfs at %s", cgroup_path); + return -1; + } + if ((makepath(systemd_path, 0755) == -1) && (errno != EEXIST)) { + pr_perror("Failed to mkdir new dest: %s", systemd_path); + return -1; + } + if (mount(cgroup_path, cgroup_path, "bind", MS_REMOUNT|MS_BIND|MS_RDONLY, "") == -1) { + pr_perror("Failed to remount %s readonly", cgroup_path); + return -1; + } + return 0; +} + static bool contains_mount(const char **config_mounts, unsigned len, const char *mount) { for (unsigned i = 0; i < len; i++) { if (!strcmp(mount, config_mounts[i])) { @@ -480,6 +467,10 @@ static int prestart(const char *rootfs, char tmp_dir[PATH_MAX]; snprintf(tmp_dir, PATH_MAX, "%s/tmp", rootfs); + /* + Create a /var/log/journal directory on the host and mount it into + the container. + */ if (!contains_mount(config_mounts, config_mounts_len, "/var/log/journal")) { char journal_dir[PATH_MAX]; snprintf(journal_dir, PATH_MAX, "/var/log/journal/%.32s", id); @@ -559,28 +550,55 @@ static int prestart(const char *rootfs, } } - if (!contains_mount(config_mounts, config_mounts_len, "/sys/fs/cgroup")) { - /* libmount */ - struct libmnt_table *tb = NULL; - int rc = -1; + /* + * initialize libmount + */ - /* - * initialize libmount - */ - mnt_init_debug(0); + /* + if CGROUP_ROOT is not bind mounted, we need to create a tmpfs on + it, and then create the systemd directory underneath it + */ - tb = parse_tabfile("/proc/self/mountinfo"); - if (!tb) { + _cleanup_free_ char *systemd_path = NULL; + if (asprintf(&systemd_path, "%s/%s", rootfs, CGROUP_SYSTEMD) < 0) { + pr_perror("Failed to create path for %s", CGROUP_ROOT); + return -1; + } + if (!contains_mount(config_mounts, config_mounts_len, CGROUP_ROOT)) { + rc = mount_cgroup(rootfs, options, systemd_path); + } else { + if ((makepath(systemd_path, 0755) == -1) && (errno != EEXIST)) { + pr_perror("Failed to mkdir new dest: %s", systemd_path); return -1; } + } - rc = mount_cgroup(tb, NULL, rootfs); - mnt_free_table(tb); - if (rc == -1) { - return -1; - } + if (bind_mount(CGROUP_SYSTEMD, systemd_path, true)) { + pr_perror("Failed to bind mount %s on %s", CGROUP_SYSTEMD, systemd_path); + return -1; + } + + /* + Mount the writable systemd hierarchy into the container + */ + _cleanup_free_ char *named_path = NULL; + named_path = get_process_cgroup_subsystem_path(pid, "name=systemd"); + _cleanup_free_ char *systemd_named_path = NULL; + if (asprintf(&systemd_named_path, "%s/%s", rootfs, named_path) < 0) { + pr_perror("Failed to create path for %s/%s", rootfs, systemd_named_path); + return -1; + } + if (bind_mount(named_path, systemd_named_path, false)) { + pr_perror("Failed to bind mount %s on %s", CGROUP_SYSTEMD, systemd_named_path); + return -1; + } + if (chcon(systemd_named_path, mount_label) < 0) { + return -1; } + /* + Create /etc/machine-id if it does not exist + */ if (!contains_mount(config_mounts, config_mounts_len, "/etc/machine-id")) { char mid_path[PATH_MAX]; snprintf(mid_path, PATH_MAX, "%s/etc/machine-id", rootfs); @@ -782,7 +800,6 @@ int main(int argc, char *argv[]) char *cmd = NULL; yajl_val v_arg0_value = YAJL_GET_ARRAY(v_args)->values[0]; cmd = YAJL_GET_STRING(v_arg0_value); - /* Don't do anything if init is actually container runtime bind mounted /dev/init */ if (!strcmp(cmd, "/dev/init")) { pr_pdebug("Skipping as container command is /dev/init, not systemd init\n");