[CRIU] [PATCH 2/2] rpc: Report back first error message on failure

Adrian Reber adrian at lisas.de
Mon Sep 5 08:19:36 PDT 2016


This patch breaks p.haul:

15:18:07.038: 30677: 	Issuing pre-dump command to service
Error (criu/log.c:117): BUG at criu/log.c:117
15:18:08.686: 30677: CRIU RPC error (0/4)
Traceback (most recent call last):
  File "./p.haul", line 47, in <module>
    worker.start_migration()
  File "/share/p.haul/phaul/iters.py", line 159, in start_migration
    self.__start_live_migration()
  File "/share/p.haul/phaul/iters.py", line 197, in __start_live_migration
    self.criu_connection, self.fs)
  File "/share/p.haul/phaul/criu_cr.py", line 14, in criu_predump
    resp = criu_connection.send_req(req)
  File "/share/p.haul/phaul/criu_api.py", line 70, in send_req
    return self._recv_resp()
  File "/share/p.haul/phaul/criu_api.py", line 57, in _recv_resp
    raise Exception("CRIU RPC error (%d/%d)" % (resp.type, self._last_req))
Exception: CRIU RPC error (0/4)

Running p.haul with '--no-pre-dump' does not show that error. Reverting
commit 2157799c also fixes the error.

		Adrian

On Wed, Aug 24, 2016 at 09:12:03PM +0300, Pavel Emelyanov wrote:
> When running criu in swrk mode the client typically wants to know
> the reason of failure. Right now criu reports back NOTHING but the
> fact that dump/restore/etc fails. We've tried to address this by
> introducing the cr-errno engine, but it doesn't seem to be informative
> enough and is hard to maintain -- adding new errno-s is boring :(
> 
> I propose to report back the first message with ERROR level upon
> failrure as __typically__ the very first error message indicates
> that proceeding is impossible and criu rolls back (generating more
> error messages, so it's crucial to know the very first one).
> 
> If we ever meet the situation that the first pr_err/pr_perror doesn't
> cause criu to exit, this printing should be fixed to be pr_warn.
> 
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
>  criu/cr-service.c  |  7 +++++++
>  criu/include/log.h |  2 ++
>  criu/log.c         | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  images/rpc.proto   |  1 +
>  4 files changed, 59 insertions(+)
> 
> diff --git a/criu/cr-service.c b/criu/cr-service.c
> index a2b4027..c29e6ef 100644
> --- a/criu/cr-service.c
> +++ b/criu/cr-service.c
> @@ -119,6 +119,8 @@ static void set_resp_err(CriuResp *resp)
>  		resp->has_cr_errno = true;
>  		resp->cr_errno = errn;
>  	}
> +
> +	resp->cr_errmsg = log_first_err();
>  }
>  
>  static void send_criu_err(int sk, char *msg)
> @@ -289,6 +291,11 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
>  		goto err;
>  	}
>  
> +	if (log_keep_err()) {
> +		pr_perror("Can't tune log");
> +		goto err;
> +	}
> +
>  	/* checking flags from client */
>  	if (req->has_leave_running && req->leave_running)
>  		opts.final_state = TASK_ALIVE;
> diff --git a/criu/include/log.h b/criu/include/log.h
> index fe53a7c..d2b25ec 100644
> --- a/criu/include/log.h
> +++ b/criu/include/log.h
> @@ -9,6 +9,8 @@ extern int log_init(const char *output);
>  extern void log_fini(void);
>  extern int log_init_by_pid(void);
>  extern void log_closedir(void);
> +extern int log_keep_err(void);
> +extern char *log_first_err(void);
>  
>  extern void log_set_fd(int fd);
>  extern int log_get_fd(void);
> diff --git a/criu/log.c b/criu/log.c
> index 6c5d807..64ce8c5 100644
> --- a/criu/log.c
> +++ b/criu/log.c
> @@ -17,6 +17,9 @@
>  #include "util.h"
>  #include "cr_options.h"
>  #include "servicefd.h"
> +#include "rst-malloc.h"
> +#include "lock.h"
> +#include "string.h"
>  
>  #define DEFAULT_LOGFD		STDERR_FILENO
>  /* Enable timestamps if verbosity is increased from default */
> @@ -72,6 +75,49 @@ static void reset_buf_off(void)
>  		buf_off = 0;
>  }
>  
> +/*
> + * Keeping the very first error messsage for RPC to report back.
> + */
> +struct str_and_lock {
> +	mutex_t l;
> +	char s[1024];
> +};
> +
> +static struct str_and_lock *first_err;
> +
> +int log_keep_err(void)
> +{
> +	first_err = shmalloc(sizeof(struct str_and_lock));
> +	if (first_err == NULL)
> +		return -1;
> +
> +	mutex_init(&first_err->l);
> +	first_err->s[0] = '\0';
> +	return 0;
> +}
> +
> +static void log_note_err(char *msg)
> +{
> +	if (first_err && first_err->s[0] == '\0') {
> +		/*
> +		 * In any action other than restore this locking is
> +		 * actually not required, but ... it's error path
> +		 * anyway, so it doesn't make much sence to try hard
> +		 * and optimize this out.
> +		 */
> +		mutex_lock(&first_err->l);
> +		if (first_err->s[0] == '\0')
> +			strlcpy(first_err->s, msg, sizeof(first_err->s));
> +		mutex_unlock(&first_err->l);
> +	}
> +}
> +
> +char *log_first_err(void)
> +{
> +	BUG_ON(!first_err);
> +	return first_err->s[0] == '\0' ? NULL : first_err->s;
> +}
> +
>  int log_init(const char *output)
>  {
>  	int new_logfd, fd;
> @@ -178,6 +224,9 @@ static void __print_on_level(unsigned int loglevel, const char *format, va_list
>  		off += ret;
>  	}
>  	errno =  __errno;
> +
> +	if (loglevel == LOG_ERROR)
> +		log_note_err(buffer + buf_off);
>  }
>  
>  void print_on_level(unsigned int loglevel, const char *format, ...)
> diff --git a/images/rpc.proto b/images/rpc.proto
> index 11d169b..9b36178 100644
> --- a/images/rpc.proto
> +++ b/images/rpc.proto
> @@ -187,4 +187,5 @@ message criu_resp {
>  
>  	optional int32			cr_errno	= 7;
>  	optional criu_features		features	= 8;
> +	optional string			cr_errmsg	= 9;
>  }
> -- 
> 2.5.0
> 
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu


More information about the CRIU mailing list