[Devel] [PATCH 2/3] epoll: Add support for checkpointing large numbers of epoll items

Matt Helsley matthltc at us.ibm.com
Mon Oct 19 10:04:32 PDT 2009


Currently we allocate memory to output all of the epoll items in one
big chunk. At 20 bytes per item, and since epoll was designed to
support on the order of 10,000 items, we may find ourselves kmalloc'ing
200,000 bytes. That's an order 7 allocation whereas the heuristic for
difficult allocations, PAGE_ALLOC_COSTLY_ORDER, is 3.

Instead, output the epoll header and items separately. Chunk the output
much like the pid array gets chunked. This ensures that large epoll sets
can still be checkpointed even when only sub-order-0 allocations succeed.
A subsequent patch will do something similar for the restore path.

Signed-off-by: Matt Helsley <matthltc at us.ibm.com>
---
 fs/eventpoll.c |   71 ++++++++++++++++++++++++++++++++++++-------------------
 1 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4706ec5..2506b40 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1480,7 +1480,7 @@ static int ep_items_checkpoint(void *data)
 	struct rb_node *rbp;
 	struct eventpoll *ep;
 	__s32 epfile_objref;
-	int i, num_items, ret;
+	int num_items = 0, nchunk, ret;
 
 	ctx = dq_entry->ctx;
 
@@ -1489,9 +1489,8 @@ static int ep_items_checkpoint(void *data)
 
 	ep = dq_entry->epfile->private_data;
 	mutex_lock(&ep->mtx);
-	for (i = 0, rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), i++) {}
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp), num_items++) {}
 	mutex_unlock(&ep->mtx);
-	num_items = i;
 
 	h = ckpt_hdr_get_type(ctx, sizeof(*h), CKPT_HDR_EPOLL_ITEMS);
 	if (!h)
@@ -1503,36 +1502,58 @@ static int ep_items_checkpoint(void *data)
 	if (ret || !num_items)
 		return ret;
 
-	items = kzalloc(sizeof(*items)*num_items, GFP_KERNEL);
+	ret = ckpt_write_obj_type(ctx, NULL, sizeof(*items)*num_items,
+				  CKPT_HDR_BUFFER);
+	if (ret < 0)
+		return ret;
+
+	nchunk = num_items;
+	do {
+		items = kzalloc(sizeof(*items)*nchunk, GFP_KERNEL);
+		if (items)
+			break;
+		nchunk = nchunk >> 1;
+	} while (nchunk > 0);
 	if (!items)
 		return -ENOMEM;
+
+	/*
+	 * Walk the rbtree copying items into the chunk of memory and then
+	 * writing them to the checkpoint image
+	 */
 	ret = 0;
-	i = 0;
 	mutex_lock(&ep->mtx);
-	for (rbp = rb_first(&ep->rbr); i < num_items && rbp; rbp = rb_next(rbp),
-	     i++) {
-		struct epitem *epi;
-		int objref;
-
-		epi = rb_entry(rbp, struct epitem, rbn);
-		items[i].fd = epi->ffd.fd;
-		items[i].events = epi->event.events;
-		items[i].data = epi->event.data;
-		objref = ckpt_obj_lookup(ctx, epi->ffd.file, CKPT_OBJ_FILE);
-		if (objref <= 0) {
-			ret = -EBUSY; /* missing item -- checkpoint obj leak */
-			break;
+	rbp = rb_first(&ep->rbr);
+	while ((num_items > 0) && rbp) {
+		int n = min(num_items, nchunk);
+		int j;
+
+		for (j = 0; rbp && j < n; j++, rbp = rb_next(rbp)) {
+			struct epitem *epi;
+			int objref;
+
+			epi = rb_entry(rbp, struct epitem, rbn);
+			items[j].fd = epi->ffd.fd;
+			items[j].events = epi->event.events;
+			items[j].data = epi->event.data;
+			objref = ckpt_obj_lookup(ctx, epi->ffd.file,
+						 CKPT_OBJ_FILE);
+			if (objref <= 0)
+				goto unlock;
+			items[j].file_objref = objref;
 		}
-		items[i].file_objref = objref;
+		ret = ckpt_kwrite(ctx, items, n*sizeof(*items));
+		if (ret < 0)
+			break;
+		num_items -= n;
 	}
+unlock:
 	mutex_unlock(&ep->mtx);
-	if (i == num_items && rbp)
-		ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */
-	if (!ret)
-		ret = ckpt_write_buffer(ctx, items, sizeof(*items)*num_items);
-	else
-		ckpt_write_err(ctx, "E", "checkpoint leak detected.\n", ret);
 	kfree(items);
+	if (num_items != 0 || (num_items == 0 && rbp))
+		ret = -EBUSY; /* extra item(s) -- checkpoint obj leak */
+	if (ret)
+		ckpt_write_err(ctx, "E", " checkpointing epoll items.\n", ret);
 	return ret;
 }
 
-- 
1.5.6.3


_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list