[CRIU] [PATCH] [RFC] criu: test different situations when parasite must cure itself

Andrew Vagin avagin at parallels.com
Sun Oct 19 22:43:05 PDT 2014


On Fri, Oct 17, 2014 at 05:21:54PM +0400, Pavel Emelyanov wrote:
> On 10/15/2014 03:24 PM, Andrey Vagin wrote:
> > Here is a simple fault-injection engine. Each fault has a unique code.
> > One of these codes can be set in the CRIU_FAULT environment variable.
> > On the next run the fault with this code will be triggered.
> > For each fault we need to have code which emulates the specified behaviour.
> > 
> > This patch checks the following cases:
> > * a parasite socket was closed unexpectedly
> > * a parasite receives an unsupported command
> > * something failed while a parasite daemon is running
> > * criu dies unexpectedly
> > 
> > Fault-injection code is compiled only if make is executed with DEBUG=1.
> > 
> > The following command can be used to check all existing fault cases:
> > make -C test fault-injection
> 
> We have a systemtap-based fault injection. Why is this version better?

It's much simpler.
It tests more cases.
Systemtap requires kernel-debug and loading kernel modules.
The systemtap version sometimes failed. I don't remember the reason, but
it wasn't related to criu.

> 
> > Signed-off-by: Andrey Vagin <avagin at openvz.org>
> > ---
> >  Makefile.crtools          |  3 +++
> >  cr-dump.c                 | 30 ++++++++++++++++++++++++++++++
> >  crtools.c                 |  6 ++++++
> >  fault-injection.c         | 28 ++++++++++++++++++++++++++++
> >  include/fault-injection.h | 25 +++++++++++++++++++++++++
> >  test/Makefile             | 12 +++++++++---
> >  test/fault-max.c          |  8 ++++++++
> >  test/zdtm.sh              | 18 +++++++++++++++++-
> >  8 files changed, 126 insertions(+), 4 deletions(-)
> >  create mode 100644 fault-injection.c
> >  create mode 100644 include/fault-injection.h
> >  create mode 100644 test/fault-max.c
> > 
> > diff --git a/Makefile.crtools b/Makefile.crtools
> > index 7f21d25..f4d34ca 100644
> > --- a/Makefile.crtools
> > +++ b/Makefile.crtools
> > @@ -69,6 +69,9 @@ endif
> >  obj-y	+= cr-service.o
> >  obj-y	+= sd-daemon.o
> >  obj-y	+= plugin.o
> > +ifeq ($(DEBUG),1)
> > +obj-y	+= fault-injection.o
> > +endif
> >  
> >  ifneq ($(MAKECMDGOALS),clean)
> >  incdeps := y
> > diff --git a/cr-dump.c b/cr-dump.c
> > index 7d33e3d..ad8a415 100644
> > --- a/cr-dump.c
> > +++ b/cr-dump.c
> > @@ -72,6 +72,7 @@
> >  #include "irmap.h"
> >  #include "sysfs_parse.h"
> >  #include "action-scripts.h"
> > +#include "fault-injection.h"
> >  
> >  #include "asm/dump.h"
> >  
> > @@ -1465,6 +1466,32 @@ err_cure:
> >  	goto err_free;
> >  }
> >  
> > +static int fault_injected_daemon(struct parasite_ctl *ctl)
> > +{
> > +
> > +	if (fault_injected(FI_PARASITE_DAEMON))
> > +		return -1;
> > +
> > +	if (fault_injected(FI_PARASITE_DAEMON_SOCK)) {
> > +		close(ctl->tsock);
> > +		return 0;
> > +	}
> > +
> > +	if (fault_injected(FI_PARASITE_DAEMON_KILL)) {
> > +		exit(1);
> > +		BUG();
> > +	}
> > +
> > +	if (fault_injected(FI_PARASITE_DAEMON_BAD_CMD)) {
> > +		if (parasite_execute_daemon(PARASITE_CMD_MAX, ctl) < 0)
> > +			return -1;
> > +		return 0;
> > +	}
> > +
> > +	return 0;
> > +
> > +}
> > +
> >  static int dump_one_task(struct pstree_item *item)
> >  {
> >  	pid_t pid = item->pid.real;
> > @@ -1540,6 +1567,9 @@ static int dump_one_task(struct pstree_item *item)
> >  		goto err;
> >  	}
> >  
> > +	if (fault_injected_daemon(parasite_ctl))
> > +		goto err;
> > +
> >  	if (root_ns_mask & CLONE_NEWPID && root_item == item) {
> >  		int pfd;
> >  
> > diff --git a/crtools.c b/crtools.c
> > index a245bbb..d6f3054 100644
> > --- a/crtools.c
> > +++ b/crtools.c
> > @@ -38,6 +38,7 @@
> >  #include "cgroup.h"
> >  #include "cpu.h"
> >  #include "action-scripts.h"
> > +#include "fault-injection.h"
> >  
> >  #include "setproctitle.h"
> >  
> > @@ -192,6 +193,11 @@ int main(int argc, char *argv[], char *envp[])
> >  
> >  	BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE);
> >  
> > +	if (fault_injection_init()) {
> > +		/* criu must be killed, because non-zero status is expected in this case */
> > +		BUG();
> > +	}
> > +
> >  	cr_pb_init();
> >  	if (restrict_uid(getuid(), getgid()))
> >  		return 1;
> > diff --git a/fault-injection.c b/fault-injection.c
> > new file mode 100644
> > index 0000000..a26d361
> > --- /dev/null
> > +++ b/fault-injection.c
> > @@ -0,0 +1,28 @@
> > +#include <stdlib.h>
> > +
> > +#include "fault-injection.h"
> > +
> > +static unsigned int strategy;
> > +
> > +int fault_injection_init()
> > +{
> > +	char *val;
> > +
> > +	val = getenv("CRIU_FAULT");
> > +	if (val == NULL)
> > +		return 0;
> > +
> > +	strategy = atoi(val);
> > +
> > +	if (strategy ==0 || strategy >= FI_MAX)
> > +		return -1;
> > +
> > +	return 0;
> > +}
> > +
> > +int fault_injected(enum faults code)
> > +{
> > +	if (strategy == code)
> > +		return 1;
> > +	return 0;
> > +}
> > diff --git a/include/fault-injection.h b/include/fault-injection.h
> > new file mode 100644
> > index 0000000..d147eb5
> > --- /dev/null
> > +++ b/include/fault-injection.h
> > @@ -0,0 +1,25 @@
> > +#ifndef __CR_FAULT_INJECTION_H__
> > +#define __CR_FAULT_INJECTION_H__
> > +
> > +enum faults {
> > +	FI_NONE = 0,
> > +	FI_PARASITE_DAEMON,
> > +	FI_PARASITE_DAEMON_KILL,
> > +	FI_PARASITE_DAEMON_SOCK,
> > +	FI_PARASITE_DAEMON_BAD_CMD,
> > +	FI_MAX,
> > +};
> > +
> > +#ifdef CR_DEBUG
> > +
> > +extern int fault_injection_init(void);
> > +extern int fault_injected(enum faults code);
> > +
> > +#else
> > +
> > +static inline int fault_injection_init(void) { return 0; }
> > +static inline int fault_injected(enum faults code) { return 0; }
> > +
> > +#endif
> > +
> > +#endif /* __CR_FAULT_INJECTION_H__ */
> > diff --git a/test/Makefile b/test/Makefile
> > index 9ab8f3d..b03c662 100644
> > --- a/test/Makefile
> > +++ b/test/Makefile
> > @@ -35,16 +35,22 @@ zdtm-iter:
> >  
> >  .PHONY: zdtm
> >  
> > -fault-injection: .FORCE
> > -	$(MAKE) -C fault-injection
> > +fault-injection: fault-max .FORCE
> > +	$(MAKE) -C .. clean
> > +	$(MAKE) -C .. DEBUG=1
> > +	for i in $(shell seq 1 `./fault-max`); do						\
> > +		ZDTM_ARGS="--fault $$i -C" $(MAKE) static/sigpending static/socket-tcpbuf;	\
> > +	done
> >  .PHONY: fault-injection
> >  
> >  zdtm_ns:   $(shell echo "$(TST)" | tr ' ' '\n' | awk '/^ns\// && !/tty|pty/ {print}')
> >  zdtm_nons: $(shell echo "$(TST)" | tr ' ' '\n' | awk '!/^ns\// || /tty|pty/ {print}')
> >  
> > -override CFLAGS += -D_GNU_SOURCE
> > +override CFLAGS += -D_GNU_SOURCE -I../include
> >  zdtm_ct: zdtm_ct.c
> >  
> > +fault-max: fault-max.c ../include/fault-injection.h
> > +
> >  clean:
> >  	$(RM) zdtm_ct
> >  	$(Q) $(RM) -r ./lib/
> > diff --git a/test/fault-max.c b/test/fault-max.c
> > new file mode 100644
> > index 0000000..6a1a22c
> > --- /dev/null
> > +++ b/test/fault-max.c
> > @@ -0,0 +1,8 @@
> > +#include "fault-injection.h"
> > +#include <stdio.h>
> > +
> > +int main()
> > +{
> > +	printf("%d\n", FI_MAX - 1);
> > +	return 0;
> > +}
> > diff --git a/test/zdtm.sh b/test/zdtm.sh
> > index cbd098c..a593c6c 100755
> > --- a/test/zdtm.sh
> > +++ b/test/zdtm.sh
> > @@ -256,6 +256,7 @@ START_ONLY=0
> >  BATCH_TEST=0
> >  SPECIFIED_NAME_USED=0
> >  PERF=""
> > +CRIU_FAULT=""
> >  
> >  zdtm_sep()
> >  { (
> > @@ -598,7 +599,8 @@ EOF
> >  		# Here we may have two cases: either checkpoint is failed
> >  		# with some error code, or checkpoint is complete but return
> >  		# code is non-zero because of post dump action.
> > -		if [ "$retcode" -ne 0 ] && [[ "$retcode" -ne 32 || -z "$dump_only" ]]; then
> > +
> > +		if [ "$retcode" -ne 0 ] && [[ "$retcode" -ne 32 || -z "$dump_only" ]] && [ -z "$CRIU_FAULT" ]; then
> >  			if [ $BATCH_TEST -eq 0 ]; then
> >  				echo WARNING: $tname returned $retcode and left running for debug needs
> >  			else
> > @@ -606,6 +608,15 @@ EOF
> >  			fi
> >  			return 1
> >  		fi
> > +
> > +		if [ -n "$CRIU_FAULT" ]; then
> > +			dump_only=1
> > +			if [[ $retcode -eq 0 || $retcode -gt 128 ]]; then
> > +				echo "ERROR: criu returned $retcode"
> > +				return 1
> > +			fi
> > +		fi
> > +
> >  		cat $ddump/dump.log* | grep Error
> >  
> >  		if [ -n "$SNAPSHOT" ]; then
> > @@ -937,6 +948,11 @@ while :; do
> >  		echo $$ > /sys/fs/cgroup/perf_event/zdtm/tasks
> >  		shift
> >  		;;
> > +	  --fault)
> > +		shift
> > +		export CRIU_FAULT=$1
> > +		shift
> > +		;;
> >  	  -*)
> >  		echo "Unrecognized option $1, aborting!" 1>&2
> >  		usage
> > 
> 


More information about the CRIU mailing list