[Devel] [PATCH RH7] mount: fix mounting of detached mounts onto targets that reside on shared mounts

Pavel Tikhomirov ptikhomirov at virtuozzo.com
Wed Jun 9 14:29:27 MSK 2021


Reviewed-by: Pavel Tikhomirov <ptikhomirov at virtuozzo.com>

On 08.06.2021 13:53, Kirill Tkhai wrote:
> From: Christian Brauner <christian.brauner at ubuntu.com>
> 
> ms commit ee2e3f50629f
> 
> Creating a series of detached mounts, attaching them to the filesystem,
> and unmounting them can be used to trigger an integer overflow in
> ns->mounts causing the kernel to block any new mounts in count_mounts()
> and returning ENOSPC because it falsely assumes that the maximum number
> of mounts in the mount namespace has been reached, i.e. it thinks it
> can't fit the new mounts into the mount namespace anymore.
> 
> Depending on the number of mounts in your system, this can be reproduced
> on any kernel that supportes open_tree() and move_mount() by compiling
> and running the following program:
> 
> /* SPDX-License-Identifier: LGPL-2.1+ */
> 
> #define _GNU_SOURCE
> #include <errno.h>
> #include <fcntl.h>
> #include <getopt.h>
> #include <limits.h>
> #include <stdbool.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <sys/mount.h>
> #include <sys/stat.h>
> #include <sys/syscall.h>
> #include <sys/types.h>
> #include <unistd.h>
> 
> /* open_tree() */
> #ifndef OPEN_TREE_CLONE
> #define OPEN_TREE_CLONE 1
> #endif
> 
> #ifndef OPEN_TREE_CLOEXEC
> #define OPEN_TREE_CLOEXEC O_CLOEXEC
> #endif
> 
> #ifndef __NR_open_tree
>          #if defined __alpha__
>                  #define __NR_open_tree 538
>          #elif defined _MIPS_SIM
>                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
>                          #define __NR_open_tree 4428
>                  #endif
>                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
>                          #define __NR_open_tree 6428
>                  #endif
>                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
>                          #define __NR_open_tree 5428
>                  #endif
>          #elif defined __ia64__
>                  #define __NR_open_tree (428 + 1024)
>          #else
>                  #define __NR_open_tree 428
>          #endif
> #endif
> 
> /* move_mount() */
> #ifndef MOVE_MOUNT_F_EMPTY_PATH
> #define MOVE_MOUNT_F_EMPTY_PATH 0x00000004 /* Empty from path permitted */
> #endif
> 
> #ifndef __NR_move_mount
>          #if defined __alpha__
>                  #define __NR_move_mount 539
>          #elif defined _MIPS_SIM
>                  #if _MIPS_SIM == _MIPS_SIM_ABI32        /* o32 */
>                          #define __NR_move_mount 4429
>                  #endif
>                  #if _MIPS_SIM == _MIPS_SIM_NABI32       /* n32 */
>                          #define __NR_move_mount 6429
>                  #endif
>                  #if _MIPS_SIM == _MIPS_SIM_ABI64        /* n64 */
>                          #define __NR_move_mount 5429
>                  #endif
>          #elif defined __ia64__
>                  #define __NR_move_mount (428 + 1024)
>          #else
>                  #define __NR_move_mount 429
>          #endif
> #endif
> 
> static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
> {
>          return syscall(__NR_open_tree, dfd, filename, flags);
> }
> 
> static inline int sys_move_mount(int from_dfd, const char *from_pathname, int to_dfd,
>                                   const char *to_pathname, unsigned int flags)
> {
>          return syscall(__NR_move_mount, from_dfd, from_pathname, to_dfd, to_pathname, flags);
> }
> 
> static bool is_shared_mountpoint(const char *path)
> {
>          bool shared = false;
>          FILE *f = NULL;
>          char *line = NULL;
>          int i;
>          size_t len = 0;
> 
>          f = fopen("/proc/self/mountinfo", "re");
>          if (!f)
>                  return 0;
> 
>          while (getline(&line, &len, f) > 0) {
>                  char *slider1, *slider2;
> 
>                  for (slider1 = line, i = 0; slider1 && i < 4; i++)
>                          slider1 = strchr(slider1 + 1, ' ');
> 
>                  if (!slider1)
>                          continue;
> 
>                  slider2 = strchr(slider1 + 1, ' ');
>                  if (!slider2)
>                          continue;
> 
>                  *slider2 = '\0';
>                  if (strcmp(slider1 + 1, path) == 0) {
>                          /* This is the path. Is it shared? */
>                          slider1 = strchr(slider2 + 1, ' ');
>                          if (slider1 && strstr(slider1, "shared:")) {
>                                  shared = true;
>                                  break;
>                          }
>                  }
>          }
>          fclose(f);
>          free(line);
> 
>          return shared;
> }
> 
> static void usage(void)
> {
>          const char *text = "mount-new [--recursive] <base-dir>\n";
>          fprintf(stderr, "%s", text);
>          _exit(EXIT_SUCCESS);
> }
> 
> #define exit_usage(format, ...)                              \
>          ({                                                   \
>                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
>                  usage();                                     \
>          })
> 
> #define exit_log(format, ...)                                \
>          ({                                                   \
>                  fprintf(stderr, format "\n", ##__VA_ARGS__); \
>                  exit(EXIT_FAILURE);                          \
>          })
> 
> static const struct option longopts[] = {
>          {"help",        no_argument,            0,      'a'},
>          { NULL,         no_argument,            0,       0 },
> };
> 
> int main(int argc, char *argv[])
> {
>          int exit_code = EXIT_SUCCESS, index = 0;
>          int dfd, fd_tree, new_argc, ret;
>          char *base_dir;
>          char *const *new_argv;
>          char target[PATH_MAX];
> 
>          while ((ret = getopt_long_only(argc, argv, "", longopts, &index)) != -1) {
>                  switch (ret) {
>                  case 'a':
>                          /* fallthrough */
>                  default:
>                          usage();
>                  }
>          }
> 
>          new_argv = &argv[optind];
>          new_argc = argc - optind;
>          if (new_argc < 1)
>                  exit_usage("Missing base directory\n");
>          base_dir = new_argv[0];
> 
>          if (*base_dir != '/')
>                  exit_log("Please specify an absolute path");
> 
>          /* Ensure that target is a shared mountpoint. */
>          if (!is_shared_mountpoint(base_dir))
>                  exit_log("Please ensure that \"%s\" is a shared mountpoint", base_dir);
> 
>          dfd = open(base_dir, O_RDONLY | O_DIRECTORY | O_CLOEXEC);
>          if (dfd < 0)
>                  exit_log("%m - Failed to open base directory \"%s\"", base_dir);
> 
>          ret = mkdirat(dfd, "detached-move-mount", 0755);
>          if (ret < 0)
>                  exit_log("%m - Failed to create required temporary directories");
> 
>          ret = snprintf(target, sizeof(target), "%s/detached-move-mount", base_dir);
>          if (ret < 0 || (size_t)ret >= sizeof(target))
>                  exit_log("%m - Failed to assemble target path");
> 
>          /*
>           * Having a mount table with 10000 mounts is already quite excessive
>           * and shoult account even for weird test systems.
>           */
>          for (size_t i = 0; i < 10000; i++) {
>                  fd_tree = sys_open_tree(dfd, "detached-move-mount",
>                                          OPEN_TREE_CLONE |
>                                          OPEN_TREE_CLOEXEC |
>                                          AT_EMPTY_PATH);
>                  if (fd_tree < 0) {
>                          fprintf(stderr, "%m - Failed to open %d(detached-move-mount)", dfd);
>                          exit_code = EXIT_FAILURE;
>                          break;
>                  }
> 
>                  ret = sys_move_mount(fd_tree, "", dfd, "detached-move-mount", MOVE_MOUNT_F_EMPTY_PATH);
>                  if (ret < 0) {
>                          if (errno == ENOSPC)
>                                  fprintf(stderr, "%m - Buggy mount counting");
>                          else
>                                  fprintf(stderr, "%m - Failed to attach mount to %d(detached-move-mount)", dfd);
>                          exit_code = EXIT_FAILURE;
>                          break;
>                  }
>                  close(fd_tree);
> 
>                  ret = umount2(target, MNT_DETACH);
>                  if (ret < 0) {
>                          fprintf(stderr, "%m - Failed to unmount %s", target);
>                          exit_code = EXIT_FAILURE;
>                          break;
>                  }
>          }
> 
>          (void)unlinkat(dfd, "detached-move-mount", AT_REMOVEDIR);
>          close(dfd);
> 
>          exit(exit_code);
> }
> 
> and wait for the kernel to refuse any new mounts by returning ENOSPC.
> How many iterations are needed depends on the number of mounts in your
> system. Assuming you have something like 50 mounts on a standard system
> it should be almost instantaneous.
> 
> The root cause of this is that detached mounts aren't handled correctly
> when source and target mount are identical and reside on a shared mount
> causing a broken mount tree where the detached source itself is
> propagated which propagation prevents for regular bind-mounts and new
> mounts. This ultimately leads to a miscalculation of the number of
> mounts in the mount namespace.
> 
> Detached mounts created via
> open_tree(fd, path, OPEN_TREE_CLONE)
> are essentially like an unattached new mount, or an unattached
> bind-mount. They can then later on be attached to the filesystem via
> move_mount() which calls into attach_recursive_mount(). Part of
> attaching it to the filesystem is making sure that mounts get correctly
> propagated in case the destination mountpoint is MS_SHARED, i.e. is a
> shared mountpoint. This is done by calling into propagate_mnt() which
> walks the list of peers calling propagate_one() on each mount in this
> list making sure it receives the propagation event.
> The propagate_one() functions thereby skips both new mounts and bind
> mounts to not propagate them "into themselves". Both are identified by
> checking whether the mount is already attached to any mount namespace in
> mnt->mnt_ns. The is what the IS_MNT_NEW() helper is responsible for.
> 
> However, detached mounts have an anonymous mount namespace attached to
> them stashed in mnt->mnt_ns which means that IS_MNT_NEW() doesn't
> realize they need to be skipped causing the mount to propagate "into
> itself" breaking the mount table and causing a disconnect between the
> number of mounts recorded as being beneath or reachable from the target
> mountpoint and the number of mounts actually recorded/counted in
> ns->mounts ultimately causing an overflow which in turn prevents any new
> mounts via the ENOSPC issue.
> 
> So teach propagation to handle detached mounts by making it aware of
> them. I've been tracking this issue down for the last couple of days and
> then verifying that the fix is correct by
> unmounting everything in my current mount table leaving only /proc and
> /sys mounted and running the reproducer above overnight verifying the
> number of mounts counted in ns->mounts. With this fix the counts are
> correct and the ENOSPC issue can't be reproduced.
> 
> This change will only have an effect on mounts created with the new
> mount API since detached mounts cannot be created with the old mount API
> so regressions are extremely unlikely.
> 
> Link: https://lore.kernel.org/r/20210306101010.243666-1-christian.brauner@ubuntu.com
> Fixes: 2db154b3ea8e ("vfs: syscall: Add move_mount(2) to move mounts around")
> Cc: David Howells <dhowells at redhat.com>
> Cc: Al Viro <viro at zeniv.linux.org.uk>
> Cc: linux-fsdevel at vger.kernel.org
> Cc: <stable at vger.kernel.org>
> Reviewed-by: Christoph Hellwig <hch at lst.de>
> Signed-off-by: Christian Brauner <christian.brauner at ubuntu.com>
> ---
>   fs/pnode.h |    2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/pnode.h b/fs/pnode.h
> index dc87e65becd2..6317777d7d32 100644
> --- a/fs/pnode.h
> +++ b/fs/pnode.h
> @@ -13,7 +13,7 @@
>   
>   #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED)
>   #define IS_MNT_SLAVE(m) ((m)->mnt_master)
> -#define IS_MNT_NEW(m)  (!(m)->mnt_ns)
> +#define IS_MNT_NEW(m)  (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns))
>   #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED)
>   #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE)
>   #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED)
> 
> 

-- 
Best regards, Tikhomirov Pavel
Software Developer, Virtuozzo.


More information about the Devel mailing list