[CRIU] [PATCH] Add docker phaul driver
Andrew Vagin
avagin at odin.com
Wed Oct 7 00:12:35 PDT 2015
Good start. Thank you.
It looks like a draft. Could you explain what you do here and what works now?
A few comments are inline.
On Fri, Oct 02, 2015 at 05:27:26PM +0000, Hui Kang wrote:
> See the instruction at test/docker/HOWTO
>
> Signed-off-by: Hui Kang <hkang.sunysb at gmail.com>
> ---
> .gitignore | 1 +
> p.haul | 2 +-
> phaul/images.py | 5 ++
> phaul/p_haul_docker.py | 141 ++++++++++++++++++++++++++++++++++++++++++++++++
> phaul/p_haul_iters.py | 86 +++++++++++++++++++----------
> phaul/p_haul_pid.py | 3 ++
> phaul/p_haul_service.py | 10 ++--
> phaul/p_haul_type.py | 2 +
> phaul/xem_rpc.py | 1 -
> test/docker/HOWTO | 77 ++++++++++++++++++++++++++
> 10 files changed, 296 insertions(+), 32 deletions(-)
> create mode 100644 phaul/p_haul_docker.py
> create mode 100644 test/docker/HOWTO
>
> diff --git a/.gitignore b/.gitignore
> index 739cb45..1ab6601 100644
> --- a/.gitignore
> +++ b/.gitignore
> @@ -2,3 +2,4 @@ build/
> *.pyc
> rpc_pb2.py
> stats_pb2.py
> +*~
> \ No newline at end of file
> diff --git a/p.haul b/p.haul
> index aea7971..739ea95 100755
> --- a/p.haul
> +++ b/p.haul
> @@ -21,7 +21,7 @@ import phaul.xem_rpc as ph_xem_rpc
> #
>
> parser = argparse.ArgumentParser("Process HAULer")
> -parser.add_argument("type", help="Type of hat to haul, e.g. vz or lxc")
> +parser.add_argument("type", help="Type of hat to haul, e.g. vz, lxc, or docker")
> parser.add_argument("id", help="ID of what to haul")
> parser.add_argument("to", help="IP where to haul")
> parser.add_argument("-v", help="Verbosity level", default=ph_criu_api.def_verb, type=int, dest="verbose")
> diff --git a/phaul/images.py b/phaul/images.py
> index b9326ce..223bc6b 100644
> --- a/phaul/images.py
> +++ b/phaul/images.py
> @@ -137,6 +137,11 @@ class phaul_images:
> logging.info("Sending images to target")
>
> start = time.time()
> +
> + if htype.get_driver_name() == "docker" :
> + htype.send_criu_images()
> + return
Use tabs for indentation
> +
> cdir = self.image_dir()
>
> th.start_accept_images(phaul_images.IMGDIR)
> diff --git a/phaul/p_haul_docker.py b/phaul/p_haul_docker.py
> new file mode 100644
> index 0000000..9434ca2
> --- /dev/null
> +++ b/phaul/p_haul_docker.py
> @@ -0,0 +1,141 @@
> +import os
> +import shutil
> +import time
> +import p_haul_cgroup
> +import p_haul_module
> +import util
> +import fs_haul_shared
> +import fs_haul_subtree
> +import pycriu.rpc
> +
> +import subprocess as sp
> +
> +# TODO use docker-py
> +# import docker
> +
> +name = "docker"
> +
> +docker_exec = "/usr/bin/docker-1.9.0-dev"
> +docker_dir = "/var/lib/docker/"
> +criu_image_dir = "/var/run/docker/execdriver/native"
> +
> +class p_haul_type:
> + def __init__(self, ctid):
> +
> + # TODO ctid must > 3 digit; with docker-py, we can also resolve
> + # container name
> + if len(ctid) < 3:
> + raise Exception("Docker container ID must be > 3 digits")
> +
> + self._ctid = ctid
> + self._ct_rootfs = ""
> +
> + def get_driver_name(self):
> + return name
> +
> + def init_src(self):
> + self.full_ctid = self.get_full_ctid()
> + print "----> Full id: " + self.full_ctid
> + self.__load_ct_config(docker_dir)
> +
> +
> + def init_dst(self):
> + pass
> +
> + def adjust_criu_req(self, req):
> + """Add module-specific options to criu request"""
> + pass
> +
> + def root_task_pid(self):
> + pass
> +
> + def __load_ct_config(self, path):
> + print "---> Load docker config for ct: " + self._ctid
> + print "---> Loading config file from %s" % path
> + # Find the aufs filesystem dirname for the container
> + docker_aufs_dir = os.path.join(docker_dir, "aufs/mnt")
> + self._ct_rootfs = os.path.join(docker_aufs_dir, self.full_ctid)
> +
> + def set_options(self, opts):
> + pass
> +
> + # Remove any specific FS setup
> + def umount(self):
> + pass
> +
> + def get_fs(self):
> + print "---> get fs for docker cotnainer: " + self._ct_rootfs
> + return fs_haul_subtree.p_haul_fs(self._ct_rootfs)
> +
> + def get_full_ctid(self):
> + dir_name_list = os.listdir(os.path.join(docker_dir, "containers"))
> +
> + full_id = ""
> + for name in dir_name_list:
> + name = name.rsplit("/")
> + if (name[0].find(self._ctid) == 0):
> + full_id = name[0]
> + break
> +
> + if full_id != "":
> + return full_id
> + else:
> + raise Exception("Can not find container fs")
> +
> + def dump(self):
> + # TODO: docker API does not have checkpoint right now
> + # cli.checkpoint() so we have to use the command line
> + # cli = docker.Client(base_url='unix://var/run/docker.sock')
> + # output = cli.info()
> + # call docker API
> +
> + logf = open("/tmp/docker_checkpoint.log", "w+")
> + ret = sp.call([docker_exec, "checkpoint", self._ctid],
> + stdout = logf, stderr = logf)
> + if ret != 0:
> + raise Exception("docker checkpoint failed")
> +
> + def send_criu_images(self, thost):
> + # Sync checkpointed container images
> + ct_criu_img_dir = os.path.join(criu_image_dir, self.full_ctid)
> + dst_img_fs = fs_haul_subtree.p_haul_fs(ct_criu_img_dir)
> + dst_img_fs.set_target_host(thost)
> + dst_img_fs.set_work_dir(ct_criu_img_dir)
> + dst_img_fs.start_migration()
> +
> + # Container status
> + ct_state_dir = os.path.join(docker_dir, "containers", self.full_ctid)
> + dst_img_fs_exec = fs_haul_subtree.p_haul_fs(ct_state_dir)
> + dst_img_fs_exec.set_target_host(thost)
> + dst_img_fs_exec.set_work_dir(ct_state_dir)
> + dst_img_fs_exec.start_migration()
> +
> + def put_meta_images(self, dir):
> + pass
> +
> + def kill_last_docker_daemon(self):
> + p = sp.Popen(['pgrep', '-l' , docker_exec], stdout=sp.PIPE)
> + out, err = p.communicate()
> +
> + for line in out.splitlines():
> + line = bytes.decode(line)
> + pid = int(line.split(None, 1)[0])
> + os.kill(pid, signal.SIGKILL)
> +
> + def final_restore(self, img, criu):
> + logf = open("/tmp/docker_restore.log", "w+")
> +
> + # Kill any previous docker daemon in order to reload the
> + # status of the migrated container
> + self.kill_last_docker_daemon()
> +
> + # start docker daemon in background
> + sp.Popen([docker_exec, "daemon", "-s", "aufs"],
> + stdout = logf, stderr = logf)
You need to call wait() on Popen objects.
> + time.sleep(2)
TODO: It's a bad idea to use sleep for synchronisation.
> +
> +
> + ret = sp.call([docker_exec, "restore", self._ctid],
> + stdout = logf, stderr = logf)
> + if ret != 0:
> + raise Exception("docker restore failed")
> diff --git a/phaul/p_haul_iters.py b/phaul/p_haul_iters.py
> index f23e799..3061c47 100644
> --- a/phaul/p_haul_iters.py
> +++ b/phaul/p_haul_iters.py
> @@ -34,8 +34,14 @@ class phaul_iter_worker:
>
> logging.info("Setting up local")
> self.img = images.phaul_images("dmp")
> - self.criu = criu_api.criu_conn(self.data_sk)
> self.htype = p_haul_type.get_src(p_type)
> +
> + if self.htype.get_driver_name() != "docker" :
> + # docker will talk to swrk
> + self.criu = criu_api.criu_conn(self.data_sk)
> + else:
> + self.criu = ""
> +
> if not self.htype:
> raise Exception("No htype driver found")
>
> @@ -45,13 +51,15 @@ class phaul_iter_worker:
>
> self.pid = self.htype.root_task_pid()
> self.fs.set_target_host(host[0])
> + self.target_host = host[0]
>
> logging.info("Setting up remote")
> self.th.setup(p_type)
>
> def set_options(self, opts):
> self.th.set_options(opts)
> - self.criu.verbose(opts["verbose"])
> + if self.htype.get_driver_name() != "docker" :
> + self.criu.verbose(opts["verbose"])
> self.img.set_options(opts)
> self.htype.set_options(opts)
> self.__force = opts["force"]
> @@ -75,8 +83,10 @@ class phaul_iter_worker:
> def start_migration(self):
> self._mstat.start()
>
> - if not self.__force:
> - self.validate_cpu()
> + # TODO fix it
> + if self.htype.get_driver_name() != "docker" :
> + if not self.__force:
> + self.validate_cpu()
>
> logging.info("Preliminary FS migration")
> self.fs.set_work_dir(self.img.work_dir())
> @@ -84,7 +94,30 @@ class phaul_iter_worker:
>
> logging.info("Starting iterations")
>
> - while True:
> + # For Docker, we take a different path
> + if self.htype.get_driver_name() == "docker" :
> + logging.info("Take a special path for Docker")
> +
> + self.htype.dump()
> + logging.info("\tDocker dump succeeded")
> +
> + logging.info("FS and images sync")
> + # sync the aufs filesystem again
> + self.fs.stop_migration()
> +
> + # send the docker criu image to host
> + self.htype.send_criu_images(self.target_host)
> +
> + logging.info("Asking target host to restore")
> + self.th.restore_from_images()
> +
> + return
> +
> + # TODO: Do not do predump for docker right now. Add page-server
> + # to docker C/R API, then we can enable
> + # the pre-dump
> +
> + while True :
> logging.info("* Iteration %d", self.iteration)
>
> self.th.start_iter()
> @@ -141,31 +174,30 @@ class phaul_iter_worker:
>
> logging.info("Final dump and restore")
>
> - self.th.start_iter()
> - self.img.new_image_dir()
> + self.th.start_iter()
> + self.img.new_image_dir()
>
> logging.info("\tIssuing dump command to service")
>
> - req = criu_req.make_dump_req(
> - self.pid, self.htype, self.img, self.criu, self.fs)
> - resp = self.criu.send_req(req)
> - while True:
> - if resp.type != cr_rpc.NOTIFY:
> - raise Exception("Dump failed")
> -
> - if resp.notify.script == "post-dump":
> - #
> - # Dump is effectively over. Now CRIU
> - # waits for us to do whatever we want
> - # and keeps the tasks frozen.
> - #
> - break
> -
> - elif resp.notify.script == "network-lock":
> - self.htype.net_lock()
> - elif resp.notify.script == "network-unlock":
> - self.htype.net_unlock()
> -
> + req = criu_req.make_dump_req(
> + self.pid, self.htype, self.img, self.criu, self.fs)
> + resp = self.criu.send_req(req)
> + while True:
> + if resp.type != cr_rpc.NOTIFY:
> + raise Exception("Dump failed")
> +
> + if resp.notify.script == "post-dump":
> + #
> + # Dump is effectively over. Now CRIU
> + # waits for us to do whatever we want
> + # and keeps the tasks frozen.
> + #
> + break
> +
> + elif resp.notify.script == "network-lock":
> + self.htype.net_lock()
> + elif resp.notify.script == "network-unlock":
> + self.htype.net_unlock()
> logging.info("\t\tNotify (%s)", resp.notify.script)
> resp = self.criu.ack_notify()
>
> diff --git a/phaul/p_haul_pid.py b/phaul/p_haul_pid.py
> index 47cf651..20433a1 100644
> --- a/phaul/p_haul_pid.py
> +++ b/phaul/p_haul_pid.py
> @@ -13,6 +13,9 @@ class p_haul_type:
> self.pid = int(id)
> self._pidfile = None
>
> + def get_driver_name(self):
> + return name
> +
> #
> # Initialize itself for source node or destination one
> #
> diff --git a/phaul/p_haul_service.py b/phaul/p_haul_service.py
> index f2cadb9..8963c2c 100644
> --- a/phaul/p_haul_service.py
> +++ b/phaul/p_haul_service.py
> @@ -44,11 +44,13 @@ class phaul_service:
> def rpc_setup(self, htype_id):
> logging.info("Setting up service side %s", htype_id)
> self.img = images.phaul_images("rst")
> - self.criu = criu_api.criu_conn(self.data_sk)
> self.htype = p_haul_type.get_dst(htype_id)
> + if self.htype.get_driver_name() != "docker" :
> + self.criu = criu_api.criu_conn(self.data_sk)
>
> def rpc_set_options(self, opts):
> - self.criu.verbose(opts["verbose"])
> + if self.htype.get_driver_name() != "docker" :
> + self.criu.verbose(opts["verbose"])
> self.img.set_options(opts)
> self.htype.set_options(opts)
>
> @@ -86,7 +88,9 @@ class phaul_service:
>
> def rpc_restore_from_images(self):
> logging.info("Restoring from images")
> - self.htype.put_meta_images(self.img.image_dir())
> + if self.htype.get_driver_name() != "docker" :
> + self.htype.put_meta_images(self.img.image_dir())
> +
> self.htype.final_restore(self.img, self.criu)
> logging.info("Restore succeeded")
> self.restored = True
> diff --git a/phaul/p_haul_type.py b/phaul/p_haul_type.py
> index 4fa2833..61e6810 100644
> --- a/phaul/p_haul_type.py
> +++ b/phaul/p_haul_type.py
> @@ -8,11 +8,13 @@ import logging
> import p_haul_vz
> import p_haul_pid
> import p_haul_lxc
> +import p_haul_docker
>
> haul_types = {
> p_haul_vz.name: p_haul_vz,
> p_haul_pid.name: p_haul_pid,
> p_haul_lxc.name: p_haul_lxc,
> + p_haul_docker.name: p_haul_docker,
> }
>
> def __get(id):
> diff --git a/phaul/xem_rpc.py b/phaul/xem_rpc.py
> index d320013..1662967 100644
> --- a/phaul/xem_rpc.py
> +++ b/phaul/xem_rpc.py
> @@ -95,7 +95,6 @@ class _rpc_server_sk:
> if data[0] == RPC_CALL:
> if not self._master:
> raise Exception("Proto seq error")
> -
^^^^
> res = getattr(self._master, "rpc_" + data[1])(*data[2])
> elif data[0] == RPC_CMD:
> res = getattr(self, data[1])(mgr, *data[2])
> diff --git a/test/docker/HOWTO b/test/docker/HOWTO
> new file mode 100644
> index 0000000..0b777c2
> --- /dev/null
> +++ b/test/docker/HOWTO
> @@ -0,0 +1,77 @@
> +This HOWTO describes how to _non_ live-migrate a docker container from one
> +docker host to another.
> +
> +** This is an experimental implementation of docker migration, which may affect
> +your running containers.
> +
> +0. Install CRIU, p.haul, docker on both nodes
> +
> + Besides the packages that are needed to compile and run CRIU and p.haul,
> + the specific docker binary that supports checkpoint/restore should be used.
> +
> + Refer to step 0 in test/mtouch/HOWTO about the packages for CRIU and p.haul.
> +
> + The docker version that supports checkpoint and restore can be obtained by
> +
> + # git clone https://github.com/boucher/docker.git
> + # cd docker.git
> + # git checkout cr-combined
> +
> + On both nodes, compile and store the docker binary as
> +
> + /usr/bin/docker-1.9.0-dev
> +
> + Note that the path above is for now hard-coded in p_haul_docker.py
> +
> +1. Prepare criu and p.haul on both nodes (adapted from test/mtouch/HOWTO)
> +
> + a) CRIU
> +
> + * Clone CRIU repository from git://github.com/xemul/criu
> + and compile it with 'make'
> +
> + * Make _local_ directory for service ($csdir)
> + * Start CRIU service by root user
> +
> + # criu service -o $csdir/log -v4 --daemon
> +
> + b) On destination node start the p.haul service
> +
> + [dst]# ./p.haul-service
> + Starting p.haul rpyc service
> +
> + It will not go daemon and will print logs on the terminals
> +
> +3. Run the test container on source node
> +
> + a) Start the docker daemon
> +
> + # /usr/bin/docker-1.9.0-dev daemon -s aufs
> +
> + b) Start the container
> +
> + # /usr/bin/docker-1.9.0-dev run -d busybox:latest /bin/sh -c 'i=0; while true; do echo $i >> /foo; i=$(expr $i + 1); sleep 1; done'
> +
> + This command will return the container's ID, e.g., d78.
> + (borrowed from https://criu.org/Docker)
> +
> +4. Migrate container from source node
> +
> + [src]# ./p.haul docker [container ID, e.g., d78] [ip address of the dst]
> +
> + for example
> +
> + [src]# ./p.haul docker d78 192.168.11.106 # 192.168.11.106 is the destination node IP
> +
> +
> + On the destination node, run
> +
> + [dst]# /usr/bin/docker-1.9.0-dev ps
> + [dst]# /usr/bin/docker-1.9.0-dev exec d78 cat /foo
> +
> + to verify the counter is continuously being incremented.
> +
> +Known limitations.
> +
> +1. No support from docker python binding
> +2. Docker daemon has to be restarted on the destination node
Why do we need to restart the docker daemon?
> --
> 1.9.1
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU
mailing list