[CRIU] lxc-checkpoint restore failed
Jason Lee
ldm5235 at gmail.com
Mon Oct 19 02:12:22 PDT 2015
Thank you very much!
I have found the culprit that causes the restore to fail in the container!
Here it is:
root at dslab:/home/dslab/tools/criu# ./crit show
/home/checkpoint/bad_cr/ids-68.img
{
"magic": "IDS",
"entries": [
{
"vm_id": 4,
"files_id": 4,
"fs_id": 4,
"sighand_id": 4,
"pid_ns_id": 7,
"net_ns_id": 8,
"ipc_ns_id": 9,
"uts_ns_id": 10,
"mnt_ns_id": 11,
"user_ns_id": 6
}
]
}
root at dslab:/home/dslab/tools/criu# ./crit show
/home/checkpoint/bad_cr/core-68.img
{
"magic": "CORE",
"entries": [
{
"mtype": "X86_64",
...
"tc": {
"task_state": 1,
"exit_code": 0,
"personality": 0,
"flags": 1077944384,
"blk_sigset": "0x0",
"comm": "dhclient",
"timers": {
"real": {
"isec": 0,
"iusec": 0,
"vsec": 0,
"vusec": 0
},
"virt": {
"isec": 0,
"iusec": 0,
"vsec": 0,
"vusec": 0
},
"prof": {
"isec": 0,
"iusec": 0,
"vsec": 0,
"vusec": 0
}
},
"rlimits": {
"rlimits": [
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 8388608,
"max": 18446744073709551615
},
{
"cur": 0,
"max": 18446744073709551615
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 31333,
"max": 31333
},
{
"cur": 65536,
"max": 65536
},
{
"cur": 65536,
"max": 65536
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
},
{
"cur": 31333,
"max": 31333
},
{
"cur": 819200,
"max": 819200
},
{
"cur": 0,
"max": 0
},
{
"cur": 0,
"max": 0
},
{
"cur": 18446744073709551615,
"max": 18446744073709551615
}
]
},
"cg_set": 4,
"signals_s": {}
},
"thread_core": {
"futex_rla": 0,
"futex_rla_len": 24,
"sched_nice": 0,
"sched_policy": 0,
"sas": {
"ss_sp": 0,
"ss_size": 0,
"ss_flags": 2
},
"signals_p": {}
}
}
]
}
dhclient probably uses SOCK_PACKET to acquire an IP address. When I kill this
process, CRIU works well!
Best regards
From Jason Lee
2015-10-19 16:51 GMT+08:00 Pavel Emelyanov <xemul at parallels.com>:
> On 10/16/2015 06:05 AM, Jason Lee wrote:
> > Actually, I have done as you said, but there is no pid info in
> fdinfo-img, just id, flags, type and fd.
>
> Yes, fdinfo is a table, without pids. Pid is ... implicit here ;) E.g.
> fdinfo-4.img
> means that this is table number 4. Now you need to look at ids-*.img and
> check which
> one of them has files_id being 4. The respective image file name would
> contain the
> pid of the task owning one.
>
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/fdinfo-4.img
> > {
> > "magic": "FDINFO",
> > "entries": [
> > {
> > "id": 56,
> > "flags": 0,
> > "type": "REG",
> > "fd": 0
> > },
> > {
> > "id": 57,
> > "flags": 0,
> > "type": "REG",
> > "fd": 1
> > },
> > {
> > "id": 58,
> > "flags": 0,
> > "type": "REG",
> > "fd": 2
> > },
> > {
> > "id": 59,
> > "flags": 1,
> > "type": "UNIXSK",
> > "fd": 3
> > },
> > {
> > "id": 60,
> > "flags": 0,
> > "type": "REG",
> > "fd": 4
> > },
> > {
> > "id": 61,
> > "flags": 1,
> > "type": "PACKETSK",
> > "fd": 5
> > },
> > {
> > "id": 62,
> > "flags": 1,
> > "type": "INETSK",
> > "fd": 6
> > },
> > {
> > "id": 63,
> > "flags": 0,
> > "type": "INETSK",
> > "fd": 20
> > },
> > {
> > "id": 64,
> > "flags": 0,
> > "type": "INETSK",
> > "fd": 21
> > }
> > ]
> > }
> >
> > In packetsk.img, there is a pid under "fown". Is it the pid that I am
> looking for?
> >
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/packetsk.img
> > {
> > "magic": "PACKETSK",
> > "entries": [
> > {
> > "id": 61,
> > "type": 10,
> > "protocol": 768,
> > "flags": "0x80002",
> > "ifindex": 73,
> > "fown": {
> > "uid": 0,
> > "euid": 0,
> > "signum": 0,
> > "pid_type": 0,
> > "pid": 0
> > },
> > ...
> > }
> >
> > In the checkpoint dir, there are several core-$pid.img files:
> > root at dslab:/home/checkpoint# ls c2/core-*
> > c2/core-1.img c2/core-20.img c2/core-68.img c2/core-89.img
> c2/core-90.img c2/core-92.img
> >
> > So I grepped each core image:
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-1.img | grep comm
> > "comm": "systemd",
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-20.img | grep comm
> > "comm": "systemd-journal",
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-68.img | grep comm
> > "comm": "dhclient",
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-89.img | grep comm
> > "comm": "sshd",
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-90.img | grep comm
> > "comm": "rc.local",
> > root at dslab:/home/dslab/tools/criu# ./crit show
> /home/checkpoint/c2/core-92.img | grep comm
> > "comm": "a.out",
> >
> > a.out is just the test app without network usage. Maybe something is wrong
> in one of the other processes?
> >
> > 2015-10-16 2:10 GMT+08:00 Tycho Andersen <tycho.andersen at canonical.com
> <mailto:tycho.andersen at canonical.com>>:
> >
> > On Thu, Oct 15, 2015 at 08:37:02PM +0800, Jason Lee wrote:
> > > On this server, I only run the latest versions of LXC and CRIU. There
> are none of
> > > my own applications
> > > using sockets in LXC (maybe sshd or dhclient?). My Linux distribution
> is
> > > Debian 8,
> > > so I don't know which applications use SOCK_PACKET ...
> >
> > If you look to see which pid (crit show fdinfo-$pid.img) has an fd
> > info with the id 61 (your SOCK_PACKET socket), you can get the
> command
> > name from core.img (crit show core-$pid.img | grep comm). I'd be
> > curious to know what application this is, because if it is something
> > like systemd, it will come downstream to ubuntu and we'll need to fix
> > this soon :)
> >
> > Tycho
> >
> > > To avoid this situation, I think using two network cards is better
> than
> > > a network bridge, isn't it?
> > >
> > > I hope CRIU can solve this issue in the next version!
> > >
> > >
> > > - Jason
> > >
> > > 2015-10-15 20:08 GMT+08:00 Jason Lee <ldm5235 at gmail.com <mailto:
> ldm5235 at gmail.com>>:
> > >
> > > >
> > > > ---------- Forwarded message ----------
> > > > From: Pavel Emelyanov <xemul at parallels.com <mailto:
> xemul at parallels.com>>
> > > > Date: 2015-10-15 20:06 GMT+08:00
> > > > Subject: Re: [CRIU] lxc-checkpoint restore failed
> > > > To: Jason Lee <ldm5235 at gmail.com <mailto:ldm5235 at gmail.com>>
> > > > Cc: Tycho Andersen <tycho.andersen at canonical.com <mailto:
> tycho.andersen at canonical.com>>, criu at openvz.org <mailto:criu at openvz.org>
> > > >
> > > >
> > > > On 10/15/2015 03:04 PM, Pavel Emelyanov wrote:
> > > > > On 10/15/2015 02:58 PM, Jason Lee wrote:
> > > > >> OK!
> > > > >> Here it is:
> > > > >>
> > > > >> root at dslab:/home/dslab/tools/criu# ./crit show
> > > > /home/checkpoint/c2/packetsk.img
> > > > >> {
> > > > >> "magic": "PACKETSK",
> > > > >> "entries": [
> > > > >> {
> > > > >> "id": 61,
> > > > >> "type": 10,
> > > > >
> > > > > Here it is. This is SOCK_PACKET which we didn't support (and
> didn't put a
> > > > check
> > > > > for it on dump). Which software uses this thing? AF_PACKET
> sockets are
> > > > typically
> > > > > SOCK_RAW or SOCK_DGRAM, SOCK_PACKET is, frankly speaking, new
> to me :)
> > > >
> > > > Just FYI, I've created an issue for this feature:
> > > > https://github.com/xemul/criu/issues/73
> > > >
> > > > -- Pavel
> > > >
> > > >
> >
> >
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.openvz.org/pipermail/criu/attachments/20151019/c5cc2f8a/attachment-0001.html>
More information about the CRIU
mailing list