[CRIU] [PATCH 2/2] rpc: Report back first error message on failure
Pavel Emelyanov
xemul at virtuozzo.com
Tue Sep 6 14:40:00 PDT 2016
On 09/05/2016 06:19 PM, Adrian Reber wrote:
> This patch breaks p.haul:
>
> 15:18:07.038: 30677: Issuing pre-dump command to service
> Error (criu/log.c:117): BUG at criu/log.c:117
:( That's because of service calls fork() on pre-dump, child initializes
log to keep the message, but the responce is sent in parent process...
Hm...
Thanks for catching this, will think how to fix :)
-- Pavel
> 15:18:08.686: 30677: CRIU RPC error (0/4)
> Traceback (most recent call last):
> File "./p.haul", line 47, in <module>
> worker.start_migration()
> File "/share/p.haul/phaul/iters.py", line 159, in start_migration
> self.__start_live_migration()
> File "/share/p.haul/phaul/iters.py", line 197, in __start_live_migration
> self.criu_connection, self.fs)
> File "/share/p.haul/phaul/criu_cr.py", line 14, in criu_predump
> resp = criu_connection.send_req(req)
> File "/share/p.haul/phaul/criu_api.py", line 70, in send_req
> return self._recv_resp()
> File "/share/p.haul/phaul/criu_api.py", line 57, in _recv_resp
> raise Exception("CRIU RPC error (%d/%d)" % (resp.type, self._last_req))
> Exception: CRIU RPC error (0/4)
>
> Running p.haul with '--no-pre-dump' does not show that error. Reverting
> commit 2157799c also fixes the error.
>
> Adrian
>
> On Wed, Aug 24, 2016 at 09:12:03PM +0300, Pavel Emelyanov wrote:
>> When running criu in swrk mode the client typically wants to know
>> the reason of failure. Right now criu reports back NOTHING but the
>> fact that dump/restore/etc fails. We've tried to address this by
>> introducing the cr-errno engine, but it doesn't seem to be informative
>> enough and is hard to maintain -- adding new errno-s is boring :(
>>
>> I propose to report back the first message with ERROR level upon
>> failrure as __typically__ the very first error message indicates
>> that proceeding is impossible and criu rolls back (generating more
>> error messages, so it's crucial to know the very first one).
>>
>> If we ever meet the situation that the first pr_err/pr_perror doesn't
>> cause criu to exit, this printing should be fixed to be pr_warn.
>>
>> Signed-off-by: Pavel Emelyanov <xemul at virtuozzo.com>
>> ---
>> criu/cr-service.c | 7 +++++++
>> criu/include/log.h | 2 ++
>> criu/log.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
>> images/rpc.proto | 1 +
>> 4 files changed, 59 insertions(+)
>>
>> diff --git a/criu/cr-service.c b/criu/cr-service.c
>> index a2b4027..c29e6ef 100644
>> --- a/criu/cr-service.c
>> +++ b/criu/cr-service.c
>> @@ -119,6 +119,8 @@ static void set_resp_err(CriuResp *resp)
>> resp->has_cr_errno = true;
>> resp->cr_errno = errn;
>> }
>> +
>> + resp->cr_errmsg = log_first_err();
>> }
>>
>> static void send_criu_err(int sk, char *msg)
>> @@ -289,6 +291,11 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
>> goto err;
>> }
>>
>> + if (log_keep_err()) {
>> + pr_perror("Can't tune log");
>> + goto err;
>> + }
>> +
>> /* checking flags from client */
>> if (req->has_leave_running && req->leave_running)
>> opts.final_state = TASK_ALIVE;
>> diff --git a/criu/include/log.h b/criu/include/log.h
>> index fe53a7c..d2b25ec 100644
>> --- a/criu/include/log.h
>> +++ b/criu/include/log.h
>> @@ -9,6 +9,8 @@ extern int log_init(const char *output);
>> extern void log_fini(void);
>> extern int log_init_by_pid(void);
>> extern void log_closedir(void);
>> +extern int log_keep_err(void);
>> +extern char *log_first_err(void);
>>
>> extern void log_set_fd(int fd);
>> extern int log_get_fd(void);
>> diff --git a/criu/log.c b/criu/log.c
>> index 6c5d807..64ce8c5 100644
>> --- a/criu/log.c
>> +++ b/criu/log.c
>> @@ -17,6 +17,9 @@
>> #include "util.h"
>> #include "cr_options.h"
>> #include "servicefd.h"
>> +#include "rst-malloc.h"
>> +#include "lock.h"
>> +#include "string.h"
>>
>> #define DEFAULT_LOGFD STDERR_FILENO
>> /* Enable timestamps if verbosity is increased from default */
>> @@ -72,6 +75,49 @@ static void reset_buf_off(void)
>> buf_off = 0;
>> }
>>
>> +/*
>> + * Keeping the very first error messsage for RPC to report back.
>> + */
>> +struct str_and_lock {
>> + mutex_t l;
>> + char s[1024];
>> +};
>> +
>> +static struct str_and_lock *first_err;
>> +
>> +int log_keep_err(void)
>> +{
>> + first_err = shmalloc(sizeof(struct str_and_lock));
>> + if (first_err == NULL)
>> + return -1;
>> +
>> + mutex_init(&first_err->l);
>> + first_err->s[0] = '\0';
>> + return 0;
>> +}
>> +
>> +static void log_note_err(char *msg)
>> +{
>> + if (first_err && first_err->s[0] == '\0') {
>> + /*
>> + * In any action other than restore this locking is
>> + * actually not required, but ... it's error path
>> + * anyway, so it doesn't make much sence to try hard
>> + * and optimize this out.
>> + */
>> + mutex_lock(&first_err->l);
>> + if (first_err->s[0] == '\0')
>> + strlcpy(first_err->s, msg, sizeof(first_err->s));
>> + mutex_unlock(&first_err->l);
>> + }
>> +}
>> +
>> +char *log_first_err(void)
>> +{
>> + BUG_ON(!first_err);
>> + return first_err->s[0] == '\0' ? NULL : first_err->s;
>> +}
>> +
>> int log_init(const char *output)
>> {
>> int new_logfd, fd;
>> @@ -178,6 +224,9 @@ static void __print_on_level(unsigned int loglevel, const char *format, va_list
>> off += ret;
>> }
>> errno = __errno;
>> +
>> + if (loglevel == LOG_ERROR)
>> + log_note_err(buffer + buf_off);
>> }
>>
>> void print_on_level(unsigned int loglevel, const char *format, ...)
>> diff --git a/images/rpc.proto b/images/rpc.proto
>> index 11d169b..9b36178 100644
>> --- a/images/rpc.proto
>> +++ b/images/rpc.proto
>> @@ -187,4 +187,5 @@ message criu_resp {
>>
>> optional int32 cr_errno = 7;
>> optional criu_features features = 8;
>> + optional string cr_errmsg = 9;
>> }
>> --
>> 2.5.0
>>
>> _______________________________________________
>> CRIU mailing list
>> CRIU at openvz.org
>> https://lists.openvz.org/mailman/listinfo/criu
> .
>
More information about the CRIU
mailing list