/* * contain -- Simple container engine * * v4 "networking": run containerized command with networking capabilities. * * Run a command on an isolated filesystem, with an isolated network stack, * under namespaces and cgroups. * * Requires privileged rights (specifically, CAP_SYS_ADMIN to mount the * container's image and to create network objects). * Requires some setup (run `make setup`). * * Requires libcontain (companion helper library). * Requires libnl (netlink library). */ #define _GNU_SOURCE #include "libcontain.h" #include #include #include #include #include #include #include #include #include #include #include #include #include // The container process waits on this variable before running. static int cont_started = 0; /* * Print the command to run in the container, and its arguments. * * Parameters: * * cont_args: the command and its arguments * * cont_args is a NULL-terminated array of strings, i.e., the last element of * cont_args is NULL, and is not part of the command to run in the container. */ void print_container_cmd(char* const cont_args[]) { /*** STUDENT CODE BELOW (q1) ***/ // Print each argument in cont_args. if (*cont_args) printf("\"%s\"", *cont_args++); while (*cont_args) printf(" \"%s\"", *cont_args++); printf("\n"); /*** STUDENT CODE ABOVE (q1) ***/ } // Flags for the syscall clone3. /*** STUDENT CODE BELOW (q5, q6, q7, q8, q9) ***/ // Add the flags for each namespace kind, one by one. // clone(2) #define CLONE_NAMESPACE_FLAGS (CLONE_NEWUSER | CLONE_NEWUTS | CLONE_NEWPID |\ CLONE_NEWNET | CLONE_NEWNS) /*** STUDENT CODE ABOVE (q5, q6, q7, q8, q9) ***/ /* * Create a child process that runs inside the container. * * Clone to a child process that runs under namespaces, and in a cgroup. * * Parameter: * * cont: the container * * Return: * * the PID of the child process * * -1 on error (and print an error message) */ int clone_to_container(struct container *cont) { pid_t cont_pid = -1; struct clone_args clone_args = {0}; /*** STUDENT CODE BELOW (q3, q5, q13) ***/ // Initialize the clone_args structure: flags, exit_signal, cgroup. // clone3(2) // CLONE_NAMESPACE_FLAGS clone_args.exit_signal = SIGCHLD; clone_args.flags = CLONE_NAMESPACE_FLAGS | CLONE_INTO_CGROUP; clone_args.cgroup = cont->cgroup.fd; // Call the clone3 syscall, store the container process PID in cont->pid // both in the parent and in the child. // syscall(2), clone3(2), getpid(2) if ((cont_pid = syscall(SYS_clone3, &clone_args, sizeof(clone_args))) < 0) raise_err("failed cloning to create process"); else if (cont_pid == 0) cont->pid = getpid(); else cont->pid = cont_pid; /*** STUDENT CODE ABOVE (q3, q5, q13) ***/ return cont_pid; } /* * Signal the container process to start executing. * * Parameter: * * cont_pid: the PID of the container process */ void cont_start(pid_t cont_pid) { /*** STUDENT CODE BELOW (q17) ***/ // Signal the container process to start executing (use SIGUSR1). // kill(2) kill(cont_pid, SIGUSR1); /*** STUDENT CODE ABOVE (q17) ***/ } static void container_startexec() { printf("start execution of container\n"); cont_started = 1; } /* * Set up the container to wait on a signal from the host. */ void set_cont_wait() { // Signal action structure to wait for the signal from container manager. struct sigaction sigact = {0}; /*** STUDENT CODE BELOW (q17) ***/ // Initialize the sigact structure: sa_handler, sa_mask // sigemptyset(3) sigact.sa_handler = container_startexec; sigemptyset(&sigact.sa_mask); // Set handler for signal from container manager to start executing. // sigaction(2) sigaction(SIGUSR1, &sigact, NULL); /*** STUDENT CODE ABOVE (q17) ***/ } /* * Make the container wait for the host signal to start executing. */ void cont_wait() { /*** STUDENT CODE BELOW (q17) ***/ // Make the container wait on a condition variable. while (cont_started == 0); /*** STUDENT CODE ABOVE (q17) ***/ } /* * Finalize the container, from the host. * * 1. map the root user in the container, to the current user * 2. set up the network on the host side * * Parameter: * * cont: the container * * Return: * * 0 on success * * -1 on error (and print an error message) */ int finalize_host(const struct container *cont) { /*** STUDENT CODE BELOW (q5) ***/ // Map the root user to the user ID in the container. // libcontain: cgroup_map_root_user (and implement it!) if (cgroup_map_root_user(cont->pid) < 0) raise_msg("failed mapping root user in container"); /*** STUDENT CODE ABOVE (q5) ***/ /*** STUDENT CODE BELOW (q18) ***/ // Make network from the host for the container. // libcontain: contnet_make_host (and implement it!) if (contnet_make_host(cont->pid) < 0) raise_msg("failed making host-side network"); /*** STUDENT CODE ABOVE (q18) ***/ return 0; } /* * Finalize the container, from the inside of the container. * * 1. set the hostname * 2. mount the pseudo-filesystems: proc, sys and tmp * 3. changing the working and root directories * 4. set up the network on the container side * * Parameter: * * cont: the container * * Return: * * 0 on success * * -1 on error (and print an error message) */ int finalize_cont(const struct container *cont) { /*** STUDENT CODE BELOW (q6) ***/ // Set the hostname of the container. // sethostname(2) if (sethostname(cont->hostname, strlen(cont->hostname)) < 0) raise_err("failed setting hostname"); /*** STUDENT CODE ABOVE (q6) ***/ /*** STUDENT CODE BELOW (q11) ***/ // Mount the pseudo FSs required by the containerized process // libcontain: contfs_mount_pseudo_fs if (contfs_mount_pseudo_fs(&cont->fs) < 0) raise_msg("failed mounting pseudo-filesystems"); // Change current working directory to the FS of the container // chdir(2) if (chdir(cont->fs.root) < 0) raise_err("failed changing working directory to \"%s\"", cont->fs.root); // Change root of the container to the current working directory // chroot(2) if (chroot(".") < 0) raise_err("failed changing root of container to \".\" (\"%s\")", cont->fs.root); /*** STUDENT CODE ABOVE (q11) ***/ /*** STUDENT CODE BELOW (q18) ***/ // Make network from inside the container // libcontain: contnet_make_cont (and implement it!) if (contnet_make_cont(cont->ip_addr) < 0) raise_msg("failed making network inside container"); /*** STUDENT CODE ABOVE (q18) ***/ return 0; } static void host_handle_sigint() { printf("container interrupted by keyboard\n"); } /* * Run a command in a container. * * Main function. * * Parameters: * * cont: the container * * cont_args: the command to run in the container, and its arguments * * cont_args is a NULL-terminated array of strings, i.e., the last element of * cont_args is NULL, and is not part of the command to run in the container. * * Return: * * EXIT_SUCCESS on success * * does not otherwise return (exists the program with EXIT_FAILURE) */ int contain(struct container cont, char* cont_args[]) { // Stored in struct container, but required to check the return value of the // clone syscall. pid_t cont_pid = -1; printf("running container \"%s\" from image \"%s\"\n", cont.hostname, cont.image); printf("container command: "); print_container_cmd(cont_args); // Make the filesystem for the container. if (contfs_make(&cont.fs, cont.image) < 0) errx(EXIT_FAILURE, "failed making filesystem for container %s", cont.hostname); // Make the cgroup for the container. if (cgroup_make(&cont.cgroup, cont.memory_MB, cont.cpu_perc) < 0) { if (contfs_demake(&cont.fs) < 0) warn("failed demaking filesystem for container %s", cont.hostname); errx(EXIT_FAILURE, "failed making cgroup for container %s", cont.hostname); } // Set a signal handler before cloning, to make it active as soon as the // container subprocess runs. set_cont_wait(); // Start the process for the container. if ((cont_pid = clone_to_container(&cont)) > 0) { // In the parent process (container manager). // Wait status structure used to wait for the container process. int wstatus; // Graciously catch SIGINT. struct sigaction sigact; memset(&sigact, 0, sizeof(sigact)); sigact.sa_handler = host_handle_sigint; sigemptyset(&sigact.sa_mask); sigact.sa_flags = 0; sigaction(SIGINT, &sigact, NULL); // Finalize the environment of the container on the host side. if (finalize_host(&cont) < 0) { kill(cont.pid, SIGKILL); wait(NULL); if (cgroup_demake(&cont.cgroup) < 0) warn("failed demaking cgroup for container %s", cont.hostname); if (contfs_demake(&cont.fs) < 0) warn("failed demaking filesystem for container %s", cont.hostname); errx(EXIT_FAILURE, "failed setting up container environment from host"); } cont_start(cont_pid); printf("container started as process %d\n\n", cont_pid); /*** STUDENT CODE BELOW (q4) ***/ // Wait for the container process to terminate. // wait(2) do {} while (wait(&wstatus) < 0); /*** STUDENT CODE ABOVE (q4) ***/ if (WIFEXITED(wstatus)) printf("\ncontainer process exited normally with code %d\n", WEXITSTATUS(wstatus)); else if (WIFSIGNALED(wstatus)) printf("\ncontainer process exited because of signal %s\n", strsignal(WTERMSIG(wstatus))); } else if (cont_pid == 0) { // In the child process (container). cont_wait(); // Esthetics. printf("\n"); if (finalize_cont(&cont) < 0) errx(EXIT_FAILURE, "failed setting up container environment from container"); /*** STUDENT CODE BELOW (q2) ***/ // Execute the command inside the container, never to return unless it // failed executing it (e.g., command not found). // Note that cont_args is already formatted for use by execvp. // execvp(2) execvp(cont_args[0], cont_args); /*** STUDENT CODE ABOVE (q2) ***/ err(EXIT_FAILURE, "failed running container child process for container \"%s\"", cont.hostname); } else { // Cloning to container failed. if (cgroup_demake(&cont.cgroup) < 0) warn("failed demakin cgroup for container %s", cont.hostname); if (contfs_demake(&cont.fs) < 0) warn("failed demaking filesystem for container %s", cont.hostname); errx(EXIT_FAILURE, "failed starting the containerized child process of container " "\"%s\"", cont.hostname); } // The forked child calls execvp so it never reaches this point. // The parent waits for its child, so when this code is reached, the // container is terminated. if (cont.cgroup.fd > 0) { if (cgroup_demake(&cont.cgroup) < 0) warn("failed demaking cgroup for container %s", cont.hostname); } if (contfs_demake(&cont.fs) < 0) warn("failed demaking filesystem for container %s", cont.hostname); printf("container \"%s\" terminated\n", cont.hostname); return EXIT_SUCCESS; } // Help in parsing arguments: the first argument of the container (i.e., the // command to run inside) is at this ID in argv. #define FIRST_CONTAINER_ARG 6 /* * Entrypoint of the command line program. * * Parse the arguments to build a struct container, and treat the container's * arguments. * Eventually calls contain(). */ int main(int argc, char* argv[]) { int ret; char* hostname = argv[1]; char* image = argv[2]; unsigned int memory_MB = atoi(argv[3]); double cpu_perc = atoi(argv[4]); char* ip_addr = argv[5]; int nb_cont_args = argc - FIRST_CONTAINER_ARG; if (argc < FIRST_CONTAINER_ARG) { fprintf(stderr, "Usage: %s HOSTNAME IMAGE MEMORY_MB CPU_PERC IP_ADDRESS " "[ARGS...]\n", argv[0]); return -2; } if (memory_MB <= 0 || cpu_perc <= 0) errx(-2, "bad resource limit value for memory or CPU"); struct container cont = {0}; cont.hostname = hostname; cont.image = image; cont.memory_MB = memory_MB; cont.cpu_perc = cpu_perc; cont.ip_addr = ip_addr; // Compute paths of the container's FS. contfs_set(&cont.fs, cont.hostname, cont.image); // Compute path of the container's cgroup. cgroup_set(&cont.cgroup, cont.hostname); char* *cont_args = malloc((nb_cont_args + 1) * sizeof(argv[0])); // Set the array of arguments for execvp (with a NULL element at the end). memcpy(cont_args, &argv[FIRST_CONTAINER_ARG], nb_cont_args * sizeof(argv[0])); cont_args[nb_cont_args] = NULL; ret = contain(cont, cont_args); free(cont_args); return ret; }