[CRIU] [PATCH 2/2] rpc: Report back first error message on failure
Adrian Reber
adrian at lisas.de
Mon Sep 5 08:19:36 PDT 2016
This patch breaks p.haul:
15:18:07.038: 30677: Issuing pre-dump command to service
Error (criu/log.c:117): BUG at criu/log.c:117
15:18:08.686: 30677: CRIU RPC error (0/4)
Traceback (most recent call last):
File "./p.haul", line 47, in <module>
worker.start_migration()
File "/share/p.haul/phaul/iters.py", line 159, in start_migration
self.__start_live_migration()
File "/share/p.haul/phaul/iters.py", line 197, in __start_live_migration
self.criu_connection, self.fs)
File "/share/p.haul/phaul/criu_cr.py", line 14, in criu_predump
resp = criu_connection.send_req(req)
File "/share/p.haul/phaul/criu_api.py", line 70, in send_req
return self._recv_resp()
File "/share/p.haul/phaul/criu_api.py", line 57, in _recv_resp
raise Exception("CRIU RPC error (%d/%d)" % (resp.type, self._last_req))
Exception: CRIU RPC error (0/4)
Running p.haul with '--no-pre-dump' does not show that error. Reverting
commit 2157799c also fixes the error.
Adrian
On Wed, Aug 24, 2016 at 09:12:03PM +0300, Pavel Emelyanov wrote:
> When running criu in swrk mode the client typically wants to know
> the reason of failure. Right now criu reports back NOTHING but the
> fact that dump/restore/etc fails. We've tried to address this by
> introducing the cr-errno engine, but it doesn't seem to be informative
> enough and is hard to maintain -- adding new errno-s is boring :(
>
> I propose to report back the first message with ERROR level upon
> failrure as __typically__ the very first error message indicates
> that proceeding is impossible and criu rolls back (generating more
> error messages, so it's crucial to know the very first one).
>
> If we ever meet the situation that the first pr_err/pr_perror doesn't
> cause criu to exit, this printing should be fixed to be pr_warn.
>
> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
> ---
> criu/cr-service.c | 7 +++++++
> criu/include/log.h | 2 ++
> criu/log.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
> images/rpc.proto | 1 +
> 4 files changed, 59 insertions(+)
>
> diff --git a/criu/cr-service.c b/criu/cr-service.c
> index a2b4027..c29e6ef 100644
> --- a/criu/cr-service.c
> +++ b/criu/cr-service.c
> @@ -119,6 +119,8 @@ static void set_resp_err(CriuResp *resp)
> resp->has_cr_errno = true;
> resp->cr_errno = errn;
> }
> +
> + resp->cr_errmsg = log_first_err();
> }
>
> static void send_criu_err(int sk, char *msg)
> @@ -289,6 +291,11 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
> goto err;
> }
>
> + if (log_keep_err()) {
> + pr_perror("Can't tune log");
> + goto err;
> + }
> +
> /* checking flags from client */
> if (req->has_leave_running && req->leave_running)
> opts.final_state = TASK_ALIVE;
> diff --git a/criu/include/log.h b/criu/include/log.h
> index fe53a7c..d2b25ec 100644
> --- a/criu/include/log.h
> +++ b/criu/include/log.h
> @@ -9,6 +9,8 @@ extern int log_init(const char *output);
> extern void log_fini(void);
> extern int log_init_by_pid(void);
> extern void log_closedir(void);
> +extern int log_keep_err(void);
> +extern char *log_first_err(void);
>
> extern void log_set_fd(int fd);
> extern int log_get_fd(void);
> diff --git a/criu/log.c b/criu/log.c
> index 6c5d807..64ce8c5 100644
> --- a/criu/log.c
> +++ b/criu/log.c
> @@ -17,6 +17,9 @@
> #include "util.h"
> #include "cr_options.h"
> #include "servicefd.h"
> +#include "rst-malloc.h"
> +#include "lock.h"
> +#include "string.h"
>
> #define DEFAULT_LOGFD STDERR_FILENO
> /* Enable timestamps if verbosity is increased from default */
> @@ -72,6 +75,49 @@ static void reset_buf_off(void)
> buf_off = 0;
> }
>
> +/*
> + * Keeping the very first error messsage for RPC to report back.
> + */
> +struct str_and_lock {
> + mutex_t l;
> + char s[1024];
> +};
> +
> +static struct str_and_lock *first_err;
> +
> +int log_keep_err(void)
> +{
> + first_err = shmalloc(sizeof(struct str_and_lock));
> + if (first_err == NULL)
> + return -1;
> +
> + mutex_init(&first_err->l);
> + first_err->s[0] = '\0';
> + return 0;
> +}
> +
> +static void log_note_err(char *msg)
> +{
> + if (first_err && first_err->s[0] == '\0') {
> + /*
> + * In any action other than restore this locking is
> + * actually not required, but ... it's error path
> + * anyway, so it doesn't make much sence to try hard
> + * and optimize this out.
> + */
> + mutex_lock(&first_err->l);
> + if (first_err->s[0] == '\0')
> + strlcpy(first_err->s, msg, sizeof(first_err->s));
> + mutex_unlock(&first_err->l);
> + }
> +}
> +
> +char *log_first_err(void)
> +{
> + BUG_ON(!first_err);
> + return first_err->s[0] == '\0' ? NULL : first_err->s;
> +}
> +
> int log_init(const char *output)
> {
> int new_logfd, fd;
> @@ -178,6 +224,9 @@ static void __print_on_level(unsigned int loglevel, const char *format, va_list
> off += ret;
> }
> errno = __errno;
> +
> + if (loglevel == LOG_ERROR)
> + log_note_err(buffer + buf_off);
> }
>
> void print_on_level(unsigned int loglevel, const char *format, ...)
> diff --git a/images/rpc.proto b/images/rpc.proto
> index 11d169b..9b36178 100644
> --- a/images/rpc.proto
> +++ b/images/rpc.proto
> @@ -187,4 +187,5 @@ message criu_resp {
>
> optional int32 cr_errno = 7;
> optional criu_features features = 8;
> + optional string cr_errmsg = 9;
> }
> --
> 2.5.0
>
> _______________________________________________
> CRIU mailing list
> CRIU at openvz.org
> https://lists.openvz.org/mailman/listinfo/criu
More information about the CRIU
mailing list