[CRIU] [PATCH v2] Add docker phaul driver

Nikita Spiridonov nspiridonov at odin.com
Tue Oct 20 05:05:23 PDT 2015


IMHO, we don't need get_driver_name() at all. It will be extremely
hard to change and maintain the code with a bunch of "get_driver_name"
checks at the top level of the migration logic.

Actually, the current patch breaks all the other modules, since I removed
the name variable from the vz, pid and lxc modules in previous commits.
Before your patch, that name was used exclusively for importing phaul
modules (e.g. vz, lxc or pid), and I changed that logic to avoid
importing unused modules.

get_driver_name() can be removed by using some fake (stub) object,
additional abstraction layers, and the like.

On Fri, 2015-10-16 at 00:41 -0400, Hui Kang wrote:
> See the instruction at test/docker/HOWTO
> 
> TODO (suggestions from xemul and avagin):
>     (1) Send criu image iteratively
>     (2) Validate cpu before migration by enabling the criu_conn for
>         the first time. This will also remove a lot of check for
>         "docker" driver
>     (3) Remove sleep, call wait for docker daemon
>     (4) Wait for docker-py to integrate the C/R APIs
> 
> Signed-off-by: Hui Kang <hkang.sunysb at gmail.com>
> ---
>  p.haul                   |   2 +-
>  phaul/fs_haul_subtree.py |   3 +
>  phaul/images.py          |   5 ++
>  phaul/p_haul_docker.py   | 148 +++++++++++++++++++++++++++++++++++++++++++++++
>  phaul/p_haul_iters.py    |  44 ++++++++++++--
>  phaul/p_haul_pid.py      |   3 +
>  phaul/p_haul_service.py  |  20 ++++---
>  phaul/p_haul_type.py     |   1 +
>  test/docker/HOWTO        |  82 ++++++++++++++++++++++++++
>  9 files changed, 295 insertions(+), 13 deletions(-)
>  create mode 100644 phaul/p_haul_docker.py
>  create mode 100644 test/docker/HOWTO
> 
> diff --git a/p.haul b/p.haul
> index 5a629bc..0e01424 100755
> --- a/p.haul
> +++ b/p.haul
> @@ -25,7 +25,7 @@ import phaul.p_haul_type
>  
>  parser = argparse.ArgumentParser("Process HAULer")
>  parser.add_argument("type", choices=phaul.p_haul_type.get_haul_names(),
> -	help="Type of hat to haul, e.g. vz or lxc")
> +	help="Type of hat to haul, e.g. vz, lxc, or, docker")
>  parser.add_argument("id", help="ID of what to haul")
>  parser.add_argument("--to", help="IP where to haul")
>  parser.add_argument("--fdrpc", help="File descriptor of rpc socket", type=int, required=True)
> diff --git a/phaul/fs_haul_subtree.py b/phaul/fs_haul_subtree.py
> index a9bd559..7400c25 100644
> --- a/phaul/fs_haul_subtree.py
> +++ b/phaul/fs_haul_subtree.py
> @@ -19,6 +19,9 @@ class p_haul_fs:
>  	def set_options(self, opts):
>  		self.__thost = opts["to"]
>  
> +	def set_target_host(self, thost):
> +		self.__thost = thost
> +
>  	def set_work_dir(self, wdir):
>  		self.__wdir = wdir
>  
> diff --git a/phaul/images.py b/phaul/images.py
> index 11b3dbb..fd2baa3 100644
> --- a/phaul/images.py
> +++ b/phaul/images.py
> @@ -140,6 +140,11 @@ class phaul_images:
>  		logging.info("Sending images to target")
>  
>  		start = time.time()
> +
> +		if htype.get_driver_name() == "docker" :
> +			htype.send_criu_images()
> +			return
> +
>  		cdir = self.image_dir()
>  
>  		target_host.start_accept_images(phaul_images.IMGDIR)
> diff --git a/phaul/p_haul_docker.py b/phaul/p_haul_docker.py
> new file mode 100644
> index 0000000..b4b0795
> --- /dev/null
> +++ b/phaul/p_haul_docker.py
> @@ -0,0 +1,148 @@
> +#
> +# Docker container hauler
> +#
> +
> +import os
> +import logging
> +import shutil
> +import time
> +import p_haul_cgroup
> +import p_haul_module
> +import util
> +import fs_haul_shared
> +import fs_haul_subtree
> +import pycriu.rpc
> +
> +import subprocess as sp
> +
> +# TODO use docker-py
> +# import docker
> +
> +name = "docker"
> +
> +docker_exec = "/usr/bin/docker-1.9.0-dev"
> +docker_dir = "/var/lib/docker/"
> +criu_image_dir = "/var/run/docker/execdriver/native"
> +
> +class p_haul_type:
> +	def __init__(self, ctid):
> +
> +		# TODO ctid must > 3 digit; with docker-py, we can also resolve
> +		#	  container name
> +		if len(ctid) < 3:
> +			raise Exception("Docker container ID must be > 3 digits")
> +
> +		self._ctid = ctid
> +		self._ct_rootfs = ""
> +
> +	def get_driver_name(self):
> +		return name
> +
> +	def init_src(self):
> +		self.full_ctid = self.get_full_ctid()
> +		self.__load_ct_config(docker_dir)
> +
> +
> +	def init_dst(self):
> +		pass
> +
> +	def adjust_criu_req(self, req):
> +		"""Add module-specific options to criu request"""
> +		pass
> +
> +	def root_task_pid(self):
> +		pass
> +
> +	def __load_ct_config(self, path):
> +		# Find the aufs filesystem dirname for the container
> +		docker_aufs_dir = os.path.join(docker_dir, "aufs/mnt")
> +		self._ct_rootfs = os.path.join(docker_aufs_dir, self.full_ctid)
> +		logging.info("Container rootfs: %s", self._ct_rootfs)
> +
> +	def set_options(self, opts):
> +		pass
> +
> +	# Remove any specific FS setup
> +	def umount(self):
> +		pass
> +
> +	def get_fs(self, fs_sk=None):
> +		return fs_haul_subtree.p_haul_fs(self._ct_rootfs)
> +
> +	def get_fs_receiver(self, fs_sk=None):
> +		pass
> +
> +	def get_full_ctid(self):
> +		dir_name_list = os.listdir(os.path.join(docker_dir, "containers"))
> +
> +		full_id = ""
> +		for name in dir_name_list:
> +			name = name.rsplit("/")
> +			if (name[0].find(self._ctid) == 0):
> +				full_id = name[0]
> +				break
> +
> +		if full_id != "":
> +			return full_id
> +		else:
> +			raise Exception("Can not find container fs")
> +
> +	def dump(self):
> +		logging.info("Dump docker container")
> +
> +		# TODO: docker API does not have checkpoint right now
> +		# cli.checkpoint() so we have to use the command line
> +		# cli = docker.Client(base_url='unix://var/run/docker.sock')
> +		# output = cli.info()
> +		# call docker API
> +
> +		logf = open("/tmp/docker_checkpoint.log", "w+")
> +		ret = sp.call([docker_exec, "checkpoint", self._ctid],
> +						stdout = logf, stderr = logf)
> +		if ret != 0:
> +			raise Exception("docker checkpoint failed")
> +
> +	def send_criu_images(self, thost):
> +		# Sync checkpointed container images
> +		ct_criu_img_dir = os.path.join(criu_image_dir, self.full_ctid)
> +		dst_img_fs = fs_haul_subtree.p_haul_fs(ct_criu_img_dir)
> +		dst_img_fs.set_target_host(thost)
> +		dst_img_fs.set_work_dir(ct_criu_img_dir)
> +		dst_img_fs.start_migration()
> +
> +		# Sync container status
> +		ct_state_dir = os.path.join(docker_dir, "containers", self.full_ctid)
> +		dst_img_fs_exec = fs_haul_subtree.p_haul_fs(ct_state_dir)
> +		dst_img_fs_exec.set_target_host(thost)
> +		dst_img_fs_exec.set_work_dir(ct_state_dir)
> +		dst_img_fs_exec.start_migration()
> +
> +	def put_meta_images(self, dir):
> +		pass
> +
> +	def kill_last_docker_daemon(self):
> +		p = sp.Popen(['pgrep', '-l' , docker_exec], stdout=sp.PIPE)
> +		out, err = p.communicate()
> +
> +		for line in out.splitlines():
> +			line = bytes.decode(line)
> +			pid = int(line.split(None, 1)[0])
> +			os.kill(pid, signal.SIGKILL)
> +
> +	def final_restore(self, img, criu):
> +		logf = open("/tmp/docker_restore.log", "w+")
> +
> +		# Kill any previous docker daemon in order to reload the
> +		# status of the migrated container
> +		self.kill_last_docker_daemon()
> +
> +		# start docker daemon in background
> +		daemon = sp.Popen([docker_exec, "daemon", "-s", "aufs"],
> +				 stdout = logf, stderr = logf)
> +		# daemon.wait() TODO: docker daemon not return
> +		time.sleep(2)
> +
> +		ret = sp.call([docker_exec, "restore", self._ctid],
> +						stdout = logf, stderr = logf)
> +		if ret != 0:
> +			raise Exception("docker restore failed")
> diff --git a/phaul/p_haul_iters.py b/phaul/p_haul_iters.py
> index b2c76e3..8f8fb73 100644
> --- a/phaul/p_haul_iters.py
> +++ b/phaul/p_haul_iters.py
> @@ -36,6 +36,13 @@ class phaul_iter_worker:
>  		self.img = images.phaul_images("dmp")
>  
>  		self.htype = p_haul_type.get_src(p_type)
> +
> +		if self.htype.get_driver_name() != "docker" :
> +			# docker will talk to swrk in runc
> +			self.criu_connection = criu_api.criu_conn(self.data_socket)
> +		else:
> +			self.criu_connection = ""
> +
>  		if not self.htype:
>  			raise Exception("No htype driver found")
>  
> @@ -55,13 +62,16 @@ class phaul_iter_worker:
>  
>  	def set_options(self, opts):
>  		self.target_host.set_options(opts)
> -		self.criu_connection.verbose(opts["verbose"])
> -		self.criu_connection.shell_job(opts["shell_job"])
> +		if self.htype.get_driver_name() != "docker" :
> +			self.criu_connection.verbose(opts["verbose"])
> +			self.criu_connection.shell_job(opts["shell_job"])
> +
>  		self.img.set_options(opts)
>  		self.htype.set_options(opts)
>  		self.fs.set_options(opts)
>  		self.__force = opts["force"]
>  		self.pre_dump = opts["pre_dump"]
> +		self.target_host_ip = opts["to"]
>  
>  	def validate_cpu(self):
>  		logging.info("Checking CPU compatibility")
> @@ -103,13 +113,39 @@ class phaul_iter_worker:
>  
>  		migration_stats.start()
>  
> -		if not self.__force:
> -			self.validate_cpu()
> +		# TODO fix it
> +		if self.htype.get_driver_name() != "docker" :
> +			if not self.__force:
> +				self.validate_cpu()
>  
>  		logging.info("Preliminary FS migration")
>  		self.fs.set_work_dir(self.img.work_dir())
>  		self.fs.start_migration()
>  
> +		logging.info("Starting iterations")
> +
> +		# For Docker, we take a different path
> +		if self.htype.get_driver_name() == "docker" :
> +			logging.info("Take a special path for Docker")
> +
> +			self.htype.dump()
> +			logging.info("\tDocker dump succeeded")
> +			logging.info("FS and images sync")
> +			# sync the aufs filesystem again
> +			self.fs.stop_migration()
> +
> +			# send the docker criu image to host
> +			self.htype.send_criu_images(self.target_host_ip)
> +
> +			logging.info("Asking target host to restore")
> +			self.target_host.restore_from_images()
> +
> +			return
> +
> +		# TODO: Do not do predump for docker right now. Add page-server
> +		#	to docker C/R API, then we can enable
> +		#	 the pre-dump
> +
>  		logging.info("Checking for Dirty Tracking")
>  		if self.pre_dump == PRE_DUMP_AUTO_DETECT:
>  			# pre-dump auto-detection
> diff --git a/phaul/p_haul_pid.py b/phaul/p_haul_pid.py
> index e0f9d2f..27f8d0a 100644
> --- a/phaul/p_haul_pid.py
> +++ b/phaul/p_haul_pid.py
> @@ -11,6 +11,9 @@ class p_haul_type:
>  		self.pid = int(id)
>  		self._pidfile = None
>  
> +	def get_driver_name(self):
> +		return name
> +
>  	#
>  	# Initialize itself for source node or destination one
>  	#
> diff --git a/phaul/p_haul_service.py b/phaul/p_haul_service.py
> index 11883a6..f0667fc 100644
> --- a/phaul/p_haul_service.py
> +++ b/phaul/p_haul_service.py
> @@ -45,17 +45,19 @@ class phaul_service:
>  		logging.info("Setting up service side %s", htype_id)
>  		self.img = images.phaul_images("rst")
>  
> -		self.criu_connection = criu_api.criu_conn(self._mem_sk)
>  		self.htype = p_haul_type.get_dst(htype_id)
>  
> -		# Create and start fs receiver if current p.haul module provide it
> -		self.__fs_receiver = self.htype.get_fs_receiver(self._fs_sk)
> -		if self.__fs_receiver:
> -			self.__fs_receiver.start()
> +		if self.htype.get_driver_name() != "docker" :
> +			self.criu_connection = criu_api.criu_conn(self._mem_sk)
> +			# Create and start fs receiver if current p.haul module provide it
> +			self.__fs_receiver = self.htype.get_fs_receiver(self._fs_sk)
> +			if self.__fs_receiver:
> +				self.__fs_receiver.start()
>  
>  	def rpc_set_options(self, opts):
> -		self.criu_connection.verbose(opts["verbose"])
> -		self.criu_connection.shell_job(opts["shell_job"])
> +		if self.htype.get_driver_name() != "docker" :
> +			self.criu_connection.verbose(opts["verbose"])
> +			self.criu_connection.shell_job(opts["shell_job"])
>  		self.img.set_options(opts)
>  		self.htype.set_options(opts)
>  
> @@ -94,7 +96,9 @@ class phaul_service:
>  
>  	def rpc_restore_from_images(self):
>  		logging.info("Restoring from images")
> -		self.htype.put_meta_images(self.img.image_dir())
> +		if self.htype.get_driver_name() != "docker" :
> +			self.htype.put_meta_images(self.img.image_dir())
> +
>  		self.htype.final_restore(self.img, self.criu_connection)
>  		logging.info("Restore succeeded")
>  		self.restored = True
> diff --git a/phaul/p_haul_type.py b/phaul/p_haul_type.py
> index 7e05a7b..d4dac83 100644
> --- a/phaul/p_haul_type.py
> +++ b/phaul/p_haul_type.py
> @@ -11,6 +11,7 @@ __haul_modules = {
>  	"vz": "p_haul_vz",
>  	"pid": "p_haul_pid",
>  	"lxc": "p_haul_lxc",
> +	"docker": "p_haul_docker",
>  }
>  
>  def __get(id):
> diff --git a/test/docker/HOWTO b/test/docker/HOWTO
> new file mode 100644
> index 0000000..e154fcd
> --- /dev/null
> +++ b/test/docker/HOWTO
> @@ -0,0 +1,82 @@
> +This HOWTO describes how to _non_ live-migrate a docker container from one
> +docker host to another.
> +
> +** This is an experimental implementation of docker migration, which may affect
> +your running containers.
> +
> +0. Install CRIU, p.haul, docker on both nodes
> +
> +   Besides the packages that are needed to compile and run CRIU and p.haul,
> +   the specific docker binary that supports checkpoint/restore should be used.
> +
> +   Refer to step 0 in test/mtouch/HOWTO about the packages for CRIU and p.haul.
> +
> +   The docker version that supports checkpoint and restore can be obtained by
> +
> +   # git clone https://github.com/boucher/docker.git
> +   # cd docker.git
> +   # git checkout cr-combined
> +
> +   On both nodes, compile and store the docker binary as
> +
> +   /usr/bin/docker-1.9.0-dev
> +
> +   Note that the path above is for now hard-coded in p_haul_docker.py
> +
> +1. Prepare criu and p.haul on both nodes (adapted from test/mtouch/HOWTO)
> +
> +   a) CRIU
> +
> +   * Clone CRIU repository from git://github.com/xemul/criu
> +     and compile it with 'make'
> +
> +   * Make _local_ directory for service ($csdir)
> +   * Start CRIU service by root user
> +     Note that this step is optional because runC will start criu in swrk
> +     mode.
> +
> +   # criu service -o $csdir/log -v4 --daemon
> +
> +   b) On destination node start the p.haul service
> +
> +   [dst]# ./p.haul-wrap service
> +   Starting p.haul rpyc service
> +
> +   It will go daemon and will print logs on the terminals
> +
> +3. Run the test container on source node
> +
> +   a) Start the docker daemon
> +
> +   # /usr/bin/docker-1.9.0-dev daemon -s aufs
> +
> +   b) Start the container
> +
> +   # /usr/bin/docker-1.9.0-dev run -d busybox:latest /bin/sh -c 'i=0; while true; do echo $i >> /foo; i=$(expr $i + 1); sleep 1; done'
> +
> +   This command will return the container's ID, e.g.,  d78.
> +   (borrowed from https://criu.org/Docker)
> +
> +4. Migrate container from source node
> +
> +   [src]# ./p.haul-wrap client to docker [container ID, e.g., d78]
> +
> +   to is the ip address of the dst node
> +
> +   For example:
> +
> +   [src]# ./p.haul-wrap client 192.168.11.106 docker d78
> +   192.168.11.106 is the destination node IP and d78 is the container ID
> +
> +
> +   When the command returns, on the destination node run
> +
> +   [dst]# /usr/bin/docker-1.9.0-dev ps
> +   [dst]# /usr/bin/docker-1.9.0-dev exec d78 cat /foo
> +
> +   to verify the counter is continuously being incremented.
> +
> +Known limitations.
> +
> +1. No support from docker python binding
> +2. Docker daemon has to be restarted on the destination node




More information about the CRIU mailing list