[Devel] [PATCH 2/2] Support Non-consecutive and dup pipe fds

sukadev at us.ibm.com sukadev at us.ibm.com
Mon Jun 23 20:26:20 PDT 2008


>From a80c5215763f757840214465277e911e46e01219 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev at linux.vnet.ibm.com>
Date: Mon, 23 Jun 2008 20:13:57 -0700
Subject: [PATCH] Support Non-consecutive and dup pipe fds

PATCH 1/1 provides basic infrastructure to save/restore state of
pipes. This patch removes assumptions about order of the pipe-fds
and also supports existence of 'dups' of pipe-fds.

This logic has been separated from PATCH 1/1 for easier review and
the two patches could be combined into a single one.

Thanks to Matt Helsley for the optimized logic/code in match_pipe_ends().

TODO:
	There are few TODO's marked out in the patch. Hopefully these
	can be addressed without significant impact to the central-logic
	of saving/restoring pipes.

	- Temporarily using a regular-file's fd as 'trampoline-fd' when
	  all fds are in use

	- Maybe read all fdinfo into memory during restart, so we can
	  reduce the information we save into the checkpoint-file
	  (see comments near 'struct fdinfo').

	- Check logic of detecting 'dup's of pipe fds (any hidden
	  gotchas ?) See pair_pipe_fds()

	- Alloc ppi_list[] dynamically (see getfdinfo()).

	- Use getrlimit() to compute max-open-fds (see near caller of
	  pair_pipe_fds()).

	- [Oleg Nesterov]: SIGIO/inotify() issues associated with writing-back
	  to pipes (fixing this would require some assitance from kernel ?)

Ran several unit-test cases (see test-patches). Additional cases to be
developed/executed.

Signed-off-by: Sukadev Bhattiprolu <sukadev at us.ibm.com>
---
 cr.c |  262 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 files changed, 240 insertions(+), 22 deletions(-)

diff --git a/cr.c b/cr.c
index 716cc86..f40a4fb 100644
--- a/cr.c
+++ b/cr.c
@@ -79,8 +79,25 @@ typedef struct isockinfo_t {
 	char tcpstate[TPI_LEN];
 } isockinfo_t;
 
+/*
+ * TODO: restore_fd() processes each fd as it reads it of the checkpoint
+ *	file. To avoid making a second-pass at the file, we store following
+ *	fields during checkpoint (for now).
+ *
+ *		peer_fdnum, dup_fdnum, create_pipe, tramp_fd' fields can be
+ *
+ * 	We could eliminate this fields by reading all fdinfo into memory
+ * 	and then 'computing' the above fields before processing the fds.
+ * 	But this would require a non-trivial rewrite of the restore_fd()
+ * 	logic. Hopefully that can be done without significant impact to
+ * 	rest of the logic associated with saving/restoring pipes.
+ */
 typedef struct fdinfo_t {
 	int fdnum;		/* file descriptor number */
+	int peer_fdnum;		/* peer fd for pipes */
+	int dup_fdnum;		/* fd, if fd is dup of another pipe fd */
+	int create_pipe;	/* TRUE if this is the create-end of the pipe */
+	int tramp_fd;		/* trampoline-fd for use in restoring pipes */
 	mode_t mode;		/* mode as per stat(2) */
 	off_t offset;		/* read/write pointer position for regular files */
 	int flag;		/* open(2) flag */
@@ -117,6 +134,7 @@ typedef struct pinfo_t {
 	int nt;			/* number of thread child (0 if no thread lib) */
 	pid_t *tpid;		/* array of thread info */
 	struct pinfo_t *pmt;	/* multithread: pointer to main thread info */
+	int tramp_fd;		/* trampoline-fd for use in restoring pipes */
 } pinfo_t;
 
 /*
@@ -263,6 +281,89 @@ int getsockinfo(pid_t pid, pinfo_t *pi, int num)
 	return ret;
 }
 
+typedef struct pipe_peer_info {
+	fdinfo_t *pipe_fdi;
+	//fdinfo_t *peer_fdi;
+	__ino_t pipe_ino;
+} pipe_peer_info_t;
+
+__ino_t get_fd_ino(char *fname)
+{
+	struct stat sbuf;
+
+	if (stat(fname, &sbuf) < 0)
+		ERROR("stat() on fd %s failed, errno %d\n", fname, errno);
+
+	return sbuf.st_ino;
+}
+
+static void pair_pipe_fds(pipe_peer_info_t *ppi_list, int npipe_fds)
+{
+	int i, j;
+	pipe_peer_info_t *xppi, *yppi;
+	fdinfo_t *xfdi, *yfdi;
+
+	/*
+	 * TODO: This currently assumes pipefds have not been dup'd.
+	 * 	 Of course, need to kill this assumption soon.
+	 */
+	for (i = 0; i < npipe_fds; i++) {
+		xppi = &ppi_list[i];
+		xfdi = xppi->pipe_fdi;
+
+		j = i + 1;
+		for (j = i+1; j < npipe_fds; j++) {
+			yppi = &ppi_list[j];
+			yfdi = yppi->pipe_fdi;
+
+			if (yppi->pipe_ino != xppi->pipe_ino)
+				continue;
+
+			DEBUG("Checking flag i %d, j %d\n", i, j);
+			/*
+			 * i and j refer to same pipe. Check if they are
+			 * peers or aliases (dup'd fds). dup'd fds share
+			 * file-status flags. Peer fds of unnamed pipes
+			 * differ in O_WRONLY bit.
+			 *
+			 * TODO:
+			 * 	CHECK ABOVE ASSUMPTION
+			 */
+			if (xfdi->flag == yfdi->flag) {
+				yfdi->dup_fdnum = xfdi->fdnum;
+				DEBUG("Pipe fds %d and %d are dups\n",
+						xfdi->fdnum, yfdi->fdnum);
+			} else {
+				DEBUG("Pipe fds %d and %d are peers\n", 
+						xfdi->fdnum, yfdi->fdnum);
+
+				/*
+				 * If we have already paired it or determined
+				 * it is a dup, ignore
+				 */
+				if (xfdi->peer_fdnum != -1 ||
+						xfdi->dup_fdnum != -1)
+					continue;
+
+				DEBUG("Pipe fd %d not paired yet\n",
+						xfdi->fdnum);
+				/*
+				 * Create pipe on first end of the pipe we
+				 * come across.
+				 */
+				if (xfdi->fdnum < yfdi->fdnum)
+					xfdi->create_pipe = 1;
+
+				xfdi->peer_fdnum = yfdi->fdnum;
+				yfdi->peer_fdnum = xfdi->fdnum;
+			}
+			DEBUG("Clearing ino i %d, j %d\n", i, j);
+		}
+	}
+
+	DEBUG("Done building pipe list i %d, j %d\n", i, j);
+}
+
 /*
  * getfds() parse the process open file descriptors as found in /proc.
  */
@@ -275,6 +376,10 @@ int getfdinfo(pinfo_t *pi)
 	int len, n = 0;
 	pid_t syscallpid = pi->syscallpid ? pi->syscallpid : pi->pid;
 
+	int npipe_fds = 0;
+	pipe_peer_info_t ppi_list[256];// TODO: alloc dynamically
+	pipe_peer_info_t *ppi;
+
 	snprintf(dname, sizeof(dname), "/proc/%u/fd", pi->pid);
 	if (! (dir = opendir(dname))) return 0;
 	while ((dent = readdir(dir))) {
@@ -288,14 +393,54 @@ int getfdinfo(pinfo_t *pi)
 		stat(dname, &st);
 		pi->fi[n].mode = st.st_mode;
 		pi->fi[n].flag = PT_FCNTL(syscallpid, pi->fi[n].fdnum, F_GETFL, 0);
+		pi->fi[n].create_pipe = 0;
+		pi->fi[n].tramp_fd = -1;
+		pi->fi[n].dup_fdnum = -1;
+		pi->fi[n].peer_fdnum = -1;
 		if (S_ISREG(st.st_mode))
 			pi->fi[n].offset = (off_t)PT_LSEEK(syscallpid, pi->fi[n].fdnum, 0, SEEK_CUR);
-		else if (S_ISFIFO(st.st_mode))
+		else if (S_ISFIFO(st.st_mode)) {
 			t_s("fifo");
+			ppi = &ppi_list[npipe_fds];
+			ppi->pipe_fdi = &pi->fi[n];
+			//ppi->peer_fdi = NULL;
+			ppi->pipe_ino = get_fd_ino(dname);
+
+			DEBUG("Found a pipe: fd %d, flag 0x%x ino %d\n",
+					ppi->pipe_fdi->fdnum,
+					ppi->pipe_fdi->flag, ppi->pipe_ino);
+			npipe_fds++;
+
+		}
 		else if (S_ISSOCK(st.st_mode)) 
 			getsockinfo(syscallpid, pi, pi->fi[n].fdnum);
 		n++;
 	}
+
+	if (n) {
+		pi->tramp_fd = pi->fi[n-1].fdnum + 1;
+		DEBUG("Using %d as trampoline-fd\n", pi->tramp_fd);
+	}
+
+	/*
+	 * TODO: replace 1024 with rlim.rlim_cur (but need to execute
+	 * 	 getrlimit() in checkpointed-process)
+	 */
+	if (npipe_fds && pi->tramp_fd >= 1024) {
+		/*
+		 * TODO:
+		 * 	This restriction can be relaxed a bit. We only
+		 * 	need to give-up here if all fds are in use as
+		 * 	pipe-fds. 
+		 */
+		ERROR("Cannot allocate a 'trampoline_fd' for pipes");
+	}
+
+	/*
+	 * Now that we found all fds,  pair up any pipe_fds
+	 */
+	pair_pipe_fds(ppi_list, npipe_fds);
+
 end:
 	closedir(dir);
 	return n;
@@ -550,7 +695,11 @@ static int save_process_fifo_info(pinfo_t *pi, int fd)
 		DEBUG("FIFO fd %d (%s), flag 0x%x\n", fi[i].fdnum, fi[i].name,
 				fi[i].flag);
 
-		if (!(fi[i].flag & O_WRONLY))
+		/*
+		 * If its read-side fd or a dup of another write-side-fd,
+		 * don't need the data.
+		 */
+		if (!(fi[i].flag & O_WRONLY) || fi[i].dup_fdnum != -1)
 			continue;
 
 		pbuf_size = estimate_fifo_unread_bytes(pi, fd);
@@ -742,6 +891,12 @@ static int save_process_data(pid_t pid, int fd, lh_list_t *ptree)
 	write_item(fd, "FD", NULL, 0);
 	t_d(pi->nf);
 	for (i = 0; i < pi->nf; i++) {
+		/*
+		 * trampoline-fd is common to all fds, so we could write it
+		 * once, as a separate item by itself. Stick it in each
+		 * fdinfo for now.
+		 */
+		pi->fi[i].tramp_fd = pi->tramp_fd;
 		write_item(fd, "fdinfo", &pi->fi[i], sizeof(fdinfo_t));
 	}
 	write_item(fd, "END FD", NULL, 0);
@@ -949,6 +1104,74 @@ pid_t restart_thread(pid_t ppid, int exitsig, int addr)
 	return pid;
 }
 
+static void match_pipe_ends(int pid, int tramp_fd, int expected_fds[],
+		int actual_fds[])
+{
+	int ret;
+	int expected_read_fd = expected_fds[0];
+	int expected_write_fd = expected_fds[1];
+
+	DEBUG("tramp_fd %d\n", tramp_fd);
+        /*
+         * pipe() may have returned one (or both) of the restarted fds
+         * at the wrong end of the pipe. This could cause dup2() to
+         * accidentaly close the pipe. Avoid that with an extra dup().
+         */
+        if (actual_fds[1] == expected_read_fd) {
+                t_d(ret = PT_DUP2(pid, actual_fds[1], tramp_fd + 1));
+                actual_fds[1] = tramp_fd + 1;
+        }
+
+        if (actual_fds[0] != expected_read_fd) {
+                t_d(ret = PT_DUP2(pid, actual_fds[0], expected_read_fd));
+                t_d(PT_CLOSE(pid, actual_fds[0]));
+        }
+
+        if (actual_fds[1] != expected_write_fd) {
+                t_d(ret = PT_DUP2(pid, actual_fds[1], expected_write_fd));
+                t_d(PT_CLOSE(pid, actual_fds[1]));
+        }
+}
+
+static void recreate_pipe(int pid, int tramp_fd, fdinfo_t *fdinfo)
+{
+	int actual_fds[2] = { 0, 0 };
+	int expected_fds[2];
+
+	DEBUG("Creating pipe for fd %d\n", fdinfo->fdnum);
+
+	t_d(PT_PIPE(pid, actual_fds));
+	t_d(actual_fds[0]);
+	t_d(actual_fds[1]);
+
+	/*
+	 * Find read-end and write-end of the checkpointed pipe
+	 * (i.e don't assume that read-side fd is smaller than
+	 * write-side fd)
+	 */
+	if (fdinfo->flag & O_WRONLY) {
+		expected_fds[0] = fdinfo->peer_fdnum;
+		expected_fds[1] = fdinfo->fdnum;
+	} else {
+		expected_fds[0] = fdinfo->fdnum;
+		expected_fds[1] = fdinfo->peer_fdnum;
+	}
+
+	/*
+	 * Match the ends of newly created pipe with the ends of the
+	 * checkpointed pipe.
+	 */
+	match_pipe_ends(pid, tramp_fd, expected_fds, actual_fds);
+
+	/*
+	 * for debug, use fcntl() on fdinfo->fdnum and fdinfo->peer_fdnum
+	 * to ensure ends match
+	 */
+
+	DEBUG("Done creating pipe '{%d, %d}'\n", fdinfo->fdnum,
+			fdinfo->peer_fdnum);
+}
+
 int restore_fd(int fd, pid_t pid)
 {
 	char item[64];
@@ -1001,34 +1224,29 @@ int restore_fd(int fd, pid_t pid)
 				if (pfd != fdinfo->fdnum) t_d(PT_CLOSE(pid, pfd));
 			}
 		} else if (S_ISFIFO(fdinfo->mode)) {
-			int pipefds[2] = { 0, 0 };
-
+			if (fdinfo->dup_fdnum != -1) {
+				t_d(ret = PT_DUP2(pid, fdinfo->dup_fdnum,
+							fdinfo->fdnum));
+			}
 			/*
-			 * We create the pipe when we see the pipe's read-fd.
-			 * Just ignore the pipe's write-fd.
+			 * When checkpointing, we arbitrarily mark one end
+			 * of the pipe as the 'create-end'.  Create a pipe
+			 * if this fd is the 'create-end' and then restore
+			 * the fcntl-flags. For the other end of the pipe,
+			 * just restore its fcntl-flags.
 			 */
-			if (fdinfo->flag == O_WRONLY)
-				continue;
-
-			DEBUG("Creating pipe for fd %d\n", fdinfo->fdnum);
-
-			t_d(PT_PIPE(pid, pipefds));
-			t_d(pipefds[0]);
-			t_d(pipefds[1]);
-
-			if (pipefds[0] != fdinfo->fdnum) {
-				DEBUG("Hmm, new pipe has fds %d, %d "
-					"Old pipe had fd %d\n", pipefds[0],
-					pipefds[1], fdinfo->fdnum); getchar();
-				exit(1);
-			}
-			DEBUG("Done creating pipefds[0] %d\n", pipefds[0]);
+			else if (fdinfo->create_pipe)
+				recreate_pipe(pid, fdinfo->tramp_fd, fdinfo);
 		}
 
 		/*
 		 * Restore any special flags this fd had
 		 */
 		ret = PT_FCNTL(pid, fdinfo->fdnum, F_SETFL, fdinfo->flag);
+		if (ret < 0) {
+			ERROR("restore_fd() fd %d setfl flag 0x%x, ret %d\n",
+					fdinfo->fdnum, fdinfo->flag, ret);
+		}
 		DEBUG("---- restore_fd() fd %d setfl flag 0x%x, ret %d\n",
 				fdinfo->fdnum, fdinfo->flag, ret);
 		free(fdinfo);
-- 
1.5.2.5

_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list