[Devel] [RFC][PATCH 3/3][cr][v2]: fileleases: C/R of an in-progress lease.
Sukadev Bhattiprolu
sukadev at linux.vnet.ibm.com
Tue May 25 18:07:43 PDT 2010
If process P1 has a F_WRLCK lease on file F1 and process P2 opens the
file, P2's open() blocks for lease_break_time (45 seconds) and P1 gets
a SIGIO to cleanup it lease in preparation for P2's open. If the two
processes are checkpointed/restarted in this window, we should address
following two issues:
- P1 should get a SIGIO only once for the lease (i.e if P1 got the
SIGIO before checkpoint, it should not get the SIGIO after restart).
- If R seconds remain in the lease, P2's open should be blocked for
at least the R seconds, so P1 has the time to clean up its lease.
The previous patch gives P1 the entire lease_break_time but that
can leave P2 stalled for 2*lease_break_time.
To address first, we add a field ->fl_break_notified to "remember" if we
notified the lease-holder already. We save this field in the checkpoint
image and when restarting, we notify the lease-holder only if this field
is not set.
To address the second issue, we also checkpoint the ->fl_break_time for
an in-progress lease. When restarting the process, we ensure that the
lease-holder sleeps only for the remaining-lease rather than the entire
lease.
These two fixes sound like an approximation (see comments in do_setlease()
and __break_lease() below) and are also a bit kludgy (hence a separate patch
for now).
Appreciate comments on how we can do this better. Specifically:
- do we even need to try and address the second issue above or
just let P1 have the entire lease_break_time again ?
- theoretically, the R seconds should start counting after *all*
processes in the application-process tree have been restarted,
since P1 waits inside the kernel for a portion of the remaining
lease - should we then add a delta to R ?
Signed-off-by: Sukadev Bhattiprolu <sukadev at linux.vnet.ibm.com>
---
fs/checkpoint.c | 13 ++++++---
fs/locks.c | 56 ++++++++++++++++++++++++++++++++++-----
include/linux/checkpoint_hdr.h | 1 +
include/linux/fs.h | 4 ++-
4 files changed, 61 insertions(+), 13 deletions(-)
diff --git a/fs/checkpoint.c b/fs/checkpoint.c
index 2baeb32..f2adb38 100644
--- a/fs/checkpoint.c
+++ b/fs/checkpoint.c
@@ -284,9 +284,13 @@ static int checkpoint_one_file_lock(struct ckpt_ctx *ctx, struct file *file,
h->fl_type = lock->fl_type;
h->fl_type_prev = lock->fl_type_prev;
h->fl_flags = lock->fl_flags;
- if (h->fl_type & F_INPROGRESS &&
- (lock->fl_break_time > jiffies))
- h->fl_rem_lease = (lock->fl_break_time - jiffies) / HZ;
+ if (IS_LEASE(lock)) {
+ unsigned long bt = lock->fl_break_time;
+
+ h->fl_break_notified = lock->fl_break_notified;
+ if ((h->fl_type & F_INPROGRESS) && (bt > jiffies))
+ h->fl_rem_lease = (bt - jiffies) / HZ;
+ }
} else {
/* Checkpoint a dummy lock as a marker */
@@ -1084,7 +1088,8 @@ static int restore_file_locks(struct ckpt_ctx *ctx, struct file *file, int fd)
type = h->fl_type;
if (h->fl_type & F_INPROGRESS)
type = h->fl_type_prev;
- ret = do_setlease(fd, file, type, h->fl_rem_lease);
+ ret = do_setlease(fd, file, type, h->fl_rem_lease,
+ h->fl_break_notified);
if (ret)
ckpt_err(ctx, ret, "do_setlease(): %d\n", type);
}
diff --git a/fs/locks.c b/fs/locks.c
index 7a80278..c6ef829 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -185,6 +185,7 @@ void locks_init_lock(struct file_lock *fl)
fl->fl_flags = 0;
fl->fl_type = 0;
fl->fl_type_prev = 0;
+ fl->fl_break_notified = 0;
fl->fl_break_time = 0UL;
fl->fl_start = fl->fl_end = 0;
fl->fl_ops = NULL;
@@ -229,6 +230,8 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl)
new->fl_flags = fl->fl_flags;
new->fl_type = fl->fl_type;
new->fl_start = fl->fl_start;
+ new->fl_break_time = fl->fl_break_time;
+ new->fl_break_notified = fl->fl_break_notified;
new->fl_end = fl->fl_end;
new->fl_ops = NULL;
new->fl_lmops = NULL;
@@ -460,6 +463,8 @@ static int lease_init(struct file *filp, int type, struct file_lock *fl)
fl->fl_end = OFFSET_MAX;
fl->fl_ops = NULL;
fl->fl_lmops = &lease_manager_ops;
+ fl->fl_break_time = 0UL;
+ fl->fl_break_notified = 0;
return 0;
}
@@ -1139,6 +1144,7 @@ int lease_modify(struct file_lock **before, int arg)
struct file_lock *fl = *before;
int error = assign_type(fl, arg);
+ fl->fl_break_notified = 0;
if (error)
return error;
locks_wake_up_blocks(fl);
@@ -1254,16 +1260,25 @@ int __break_lease(struct inode *inode, unsigned int mode)
if (!fl->fl_type_prev)
fl->fl_type_prev = fl->fl_type;
fl->fl_type = future;
- fl->fl_break_time = break_time;
/*
- * TODO: ->fl_break() sends the SIGIO to lease-holder.
- * If lease-holder was checkpointed/restarted and
- * this is a restarted lease, we should not
- * re-send the SIGIO ?
+ * CHECK:
+ *
+ * Similarly, if ->fl_break_time or ->fl_break_notified
+ * are set, we were in the middle of a lease-break
+ * when we were checkpointed. So we don't need to
+ * notify of the lease holder again or wait for the
+ * entire lease_break_time. Note that this time
+ * could be more than lease_break_time since we use
+ * the value that was checkpointed.
*/
+ if (!fl->fl_break_time)
+ fl->fl_break_time = break_time;
+
/* lease must have lmops break callback */
- fl->fl_lmops->fl_break(fl);
+ if (!fl->fl_break_notified)
+ fl->fl_lmops->fl_break(fl);
+ fl->fl_break_notified = 1;
}
}
@@ -1511,7 +1526,8 @@ int vfs_setlease(struct file *filp, long arg, struct file_lock **lease)
}
EXPORT_SYMBOL_GPL(vfs_setlease);
-int do_setlease(unsigned int fd, struct file *filp, long arg, int rem_lease)
+int do_setlease(unsigned int fd, struct file *filp, long arg, int rem_lease,
+ int notified)
{
struct file_lock fl, *flp = &fl;
struct inode *inode = filp->f_path.dentry->d_inode;
@@ -1521,6 +1537,30 @@ int do_setlease(unsigned int fd, struct file *filp, long arg, int rem_lease)
error = lease_init(filp, arg, &fl);
if (error)
return error;
+ fl.fl_break_notified = notified;
+
+ /*
+ * Checkpoint/restart:
+ *
+ * If this lease is from a checkpoint, use the 'remaining-lease' that
+ * was checkpointed.
+ *
+ * NOTE: There are couple of observations:
+ *
+ * - We look at jiffies now and decide the absolute end time for
+ * the lease, but the lease-holder is still frozen and will not
+ * actually have the entire time to clean up. When the lease-
+ * holder gets to run, depends on how many other processes are
+ * in the checkpointed application's process-tree.
+ *
+ * - We assume that the lease-breaker was also checkpointed and
+ * set up the lease/file_lock object in anticipation of the
+ * lease-breaker retrying their open(). If the lease-breaker
+ * was not checkpointed, the lease-holder continues to have the
+ * lease until another lease-breaker comes along.
+ */
+ if (rem_lease)
+ fl.fl_break_time = jiffies + rem_lease * HZ;
lock_kernel();
@@ -1556,7 +1596,7 @@ out_unlock:
*/
int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
{
- return do_setlease(fd, filp, arg, 0);
+ return do_setlease(fd, filp, arg, 0, 0);
}
/**
diff --git a/include/linux/checkpoint_hdr.h b/include/linux/checkpoint_hdr.h
index ddad73f..c741e6a 100644
--- a/include/linux/checkpoint_hdr.h
+++ b/include/linux/checkpoint_hdr.h
@@ -590,6 +590,7 @@ struct ckpt_hdr_file_lock {
__u8 fl_type;
__u8 fl_type_prev;
__u8 fl_flags;
+ __u8 fl_break_notified;
unsigned long fl_rem_lease;
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 54b2a7b..74da7bb 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1067,6 +1067,7 @@ struct file_lock {
unsigned char fl_flags;
unsigned char fl_type;
unsigned char fl_type_prev;
+ unsigned char fl_break_notified;
unsigned int fl_pid;
struct pid *fl_nspid;
wait_queue_head_t fl_wait;
@@ -1123,7 +1124,8 @@ extern int flock64_set(unsigned int, struct file *, unsigned int,
struct flock64 *);
#endif
-extern int do_setlease(unsigned int fd, struct file *filp, long arg, int rem_lease);
+extern int do_setlease(unsigned int fd, struct file *filp, long arg,
+ int rem_lease, int notified);
extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg);
extern int fcntl_getlease(struct file *filp);
--
1.6.0.4
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers
More information about the Devel
mailing list