[Devel] Re: [RFC][PATCH 2/2] memory checkpoint with swapfiles

Serge E. Hallyn serue at us.ibm.com
Thu Jun 14 09:16:05 PDT 2007


Quoting Dave Hansen (hansendc at us.ibm.com):
> 
> We have a lot of options with how to do actual checkpointing
> of a process's memory.  We have existing interfaces like
> ptrace and /proc/$pid/mem.  But, I'm sure everybody wants to
> be able to checkpoint things with the smallest amount of
> downtime possible, and being able to do it incrementally
> is important.
> 
> So, I've hacked up the swap code a bit to take requests
> via a syscall (very temporarily) and shoot down pages that
> were previously mapped and put them in swap.  If you want
> to checkpoint such a process, all you have to do is figure
> out which virtual address got placed where in swap, and
> you have all of the data that you need to recreate all of
> the anonymous memory that the process had.
> 
> This needs quite a few more bits to be actually useful,
> like making sure that only a single container's data gets
> put into the target swapfile, but it does appear to work.

Another thing this will need is a way to create a very quick
copy-on-write copy of the swapfile after the checkpoint.  Is
that simple enough to do?

> Is anybody revolted by this approach?
> 
> ---
> 
>  lxc-dave/include/linux/mm.h      |    1 +
>  lxc-dave/include/linux/ptrace.h  |    1 +
>  lxc-dave/include/linux/swapops.h |    5 +++++
>  lxc-dave/kernel/ptrace.c         |   23 +++++++++++++++++++++++
>  lxc-dave/mm/memory.c             |   35 ++++++++++++++++++++++++++++++++++-
>  lxc-dave/mm/migrate.c            |    5 -----
>  lxc-dave/mm/rmap.c               |    2 +-
>  lxc-dave/mm/swap_state.c         |    4 ++++
>  lxc-dave/mm/vmscan.c             |   35 ++++++++++++++++++++++++++++-------
>  9 files changed, 97 insertions(+), 14 deletions(-)
> 
> diff -puN include/linux/mm.h~add-ptrace-extension include/linux/mm.h
> --- lxc/include/linux/mm.h~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/include/linux/mm.h	2007-06-13 15:24:40.000000000 -0700
> @@ -1129,6 +1129,7 @@ struct page *follow_page(struct vm_area_
>  #define FOLL_TOUCH	0x02	/* mark page accessed */
>  #define FOLL_GET	0x04	/* do get_page on page */
>  #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
> +#define FOLL_SWAP	0x10	/* give ZERO_PAGE if no pgtable */
> 
>  #ifdef CONFIG_PROC_FS
>  void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
> diff -puN include/linux/ptrace.h~add-ptrace-extension include/linux/ptrace.h
> --- lxc/include/linux/ptrace.h~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/include/linux/ptrace.h	2007-06-13 15:24:40.000000000 -0700
> @@ -26,6 +26,7 @@
>  #define PTRACE_GETEVENTMSG	0x4201
>  #define PTRACE_GETSIGINFO	0x4202
>  #define PTRACE_SETSIGINFO	0x4203
> +#define PTRACE_POKEPTE		0x4204

Hmm, something about poking the PTEs, I suppose?  But how come this
isn't used anywhere?

>  /* options set using PTRACE_SETOPTIONS */
>  #define PTRACE_O_TRACESYSGOOD	0x00000001
> diff -puN include/linux/swapops.h~add-ptrace-extension include/linux/swapops.h
> --- lxc/include/linux/swapops.h~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/include/linux/swapops.h	2007-06-13 15:24:40.000000000 -0700
> @@ -12,6 +12,11 @@
>  #define SWP_TYPE_SHIFT(e)	(sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT)
>  #define SWP_OFFSET_MASK(e)	((1UL << SWP_TYPE_SHIFT(e)) - 1)
> 
> +static inline int is_swap_pte(pte_t pte)
> +{
> +	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
> +}
> +
>  /*
>   * Store a type+offset into a swp_entry_t in an arch-independent format
>   */
> diff -puN kernel/ptrace.c~add-ptrace-extension kernel/ptrace.c
> --- lxc/kernel/ptrace.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/kernel/ptrace.c	2007-06-13 15:24:40.000000000 -0700
> @@ -448,6 +448,29 @@ struct task_struct *ptrace_get_task_stru
>  }
> 
>  #ifndef __ARCH_SYS_PTRACE
> +asmlinkage long sys_hackery(long data, long pid, long addr)
> +{
> +	int ret = 0;
> +	int poke_process_pte(struct task_struct *tsk, unsigned long addr,
> +					pte_t *pte_state);

Odd place to put a prototype  :)  Were you planning on passing this in
as a fn argument at some point?

> +	pte_t pte_state;
> +	struct task_struct *child;
> +
> +	child = find_task_by_pid(pid);
> +	if (child)
> +		get_task_struct(child);

Does this count on the process having been placed in the freezer first
through some other mechanism?  Or is it safe on its own?

> +	ret = poke_process_pte(child, addr, &pte_state);
> +	if (ret)
> +		goto out;
> +	ret = copy_to_user((void *)data,
> +			&pte_state,
> +			sizeof(pte_state));
> +out:
> +	if (child)
> +		put_task_struct(child);
> +	return ret;
> +}
> +
>  asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
>  {
>  	struct task_struct *child;
> diff -puN mm/memory.c~add-ptrace-extension mm/memory.c
> --- lxc/mm/memory.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/mm/memory.c	2007-06-13 15:25:17.000000000 -0700
> @@ -941,8 +941,19 @@ struct page *follow_page(struct vm_area_
>  		goto out;
> 
>  	pte = *ptep;
> -	if (!pte_present(pte))
> +	if (!pte_present(pte)) {
> +		/*
> +		 * We should probably clean the actual entry up
> +		 * a bit, but this will do for now
> +		 */
> +		if (is_swap_pte(pte) && (flags & FOLL_SWAP))
> +			page = (struct page *)ptep;
> +		goto unlock;
> +	}
> +	if (flags & FOLL_SWAP) {
> +		page = NULL;
>  		goto unlock;
> +	}
>  	if ((flags & FOLL_WRITE) && !pte_write(pte))
>  		goto unlock;
>  	page = vm_normal_page(vma, address, pte);
> @@ -2684,6 +2695,28 @@ int in_gate_area_no_task(unsigned long a
> 
>  #endif	/* __HAVE_ARCH_GATE_AREA */
> 
> +int try_to_put_page_in_swap(struct page *page);
> +
> +int poke_process_pte(struct task_struct *tsk, unsigned long addr,
> +	       		pte_t *pte_state)
> +{
> +	struct page *page;
> +	struct vm_area_struct *vma;
> +
> +	vma = find_vma(tsk->mm, addr);
> +	if (!vma)
> +		return -EINVAL;
> +	page = follow_page(vma, addr, FOLL_GET);
> +	if (!page)
> +		return -EINVAL;
> +	try_to_put_page_in_swap(page);
> +	put_page(page);
> +	page = follow_page(vma, addr, FOLL_SWAP);
> +	if (page)
> +		*pte_state = *(pte_t *)page;
> +	return 0;
> +}
> +
>  /*
>   * Access another process' address space.
>   * Source/target buffer must be kernel space,
> diff -puN mm/migrate.c~add-ptrace-extension mm/migrate.c
> --- lxc/mm/migrate.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/mm/migrate.c	2007-06-13 15:24:40.000000000 -0700
> @@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *
>  	return count;
>  }
> 
> -static inline int is_swap_pte(pte_t pte)
> -{
> -	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
> -}
> -
>  /*
>   * Restore a potential migration pte to a working pte entry
>   */
> diff -puN mm/rmap.c~add-ptrace-extension mm/rmap.c
> --- lxc/mm/rmap.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/mm/rmap.c	2007-06-13 15:24:40.000000000 -0700
> @@ -795,7 +795,7 @@ static void try_to_unmap_cluster(unsigne
>  	pte_unmap_unlock(pte - 1, ptl);
>  }
> 
> -static int try_to_unmap_anon(struct page *page, int migration)
> +int try_to_unmap_anon(struct page *page, int migration)
>  {
>  	struct anon_vma *anon_vma;
>  	struct vm_area_struct *vma;
> diff -puN mm/swap_state.c~add-ptrace-extension mm/swap_state.c
> --- lxc/mm/swap_state.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/mm/swap_state.c	2007-06-13 15:24:40.000000000 -0700
> @@ -128,6 +128,10 @@ void __delete_from_swap_cache(struct pag
>  	BUG_ON(PageWriteback(page));
>  	BUG_ON(PagePrivate(page));
> 
> +	if (printk_ratelimit()) {
> +		printk("%s(%p)\n", __func__, page);
> +		dump_stack();
> +	}
>  	radix_tree_delete(&swapper_space.page_tree, page_private(page));
>  	set_page_private(page, 0);
>  	ClearPageSwapCache(page);
> diff -puN mm/vmscan.c~add-ptrace-extension mm/vmscan.c
> --- lxc/mm/vmscan.c~add-ptrace-extension	2007-06-13 15:24:40.000000000 -0700
> +++ lxc-dave/mm/vmscan.c	2007-06-13 15:24:40.000000000 -0700
> @@ -611,19 +611,40 @@ static unsigned long shrink_page_list(st
> 
>  int try_to_put_page_in_swap(struct page *page)
>  {
> -
> -	get_page(page);

Ok, so this is called with the page's refcount already incremented by
follow_page(), and the caller also puts the page?  Should the
requirement that the caller hold a reference to the page, and drop it
afterwards, be documented in a comment above the function, or is that
pretty obvious to anyone who would mess with this file?

thanks,
-serge

> +	int ret = 0;
> +	struct writeback_control wbc = {
> +		.sync_mode = WB_SYNC_NONE,
> +	};
>  	if (page_count(page) == 1)
>                  /* page was freed from under us. So we are done. */
> -                return -EAGAON;
> +                return -EAGAIN;
>  	lock_page(page);
>  	if (PageWriteback(page))
>  		wait_on_page_writeback(page);
> -	try_to_unmap(page, 0);
> -	printk("page mapped: %d\n", page_mapped(page));
> +	if (!PageAnon(page))
> +	       goto unlock;
> +	if (!PageSwapCache(page))
> +		if (!add_to_swap(page, GFP_ATOMIC))
> +			goto unlock;
> +
> +	{
> +	/*
> +	 * This used to be a plain try_to_unmap(), but some
> +	 * pages were getting into the _file() function with
> +	 * what I think were null ->mapping pointer and oopsing
> +	 * on the mapping->mapping_lock.
> +	 */
> +	int try_to_unmap_anon(struct page *page, int migration);
> +	ret = try_to_unmap_anon(page, 0);
> +	}
> +	if (!page_mapped(page)) {
> +		swap_writepage(page, &wbc);
> +		lock_page(page);
> +		wait_on_page_writeback(page);
> +	}
> +unlock:
>  	unlock_page(page);
> -	put_page(page);
> -	return 0;
> +	return ret;
>  }
> 
>  /*
> _
> _______________________________________________
> Containers mailing list
> Containers at lists.linux-foundation.org
> https://lists.linux-foundation.org/mailman/listinfo/containers
_______________________________________________
Containers mailing list
Containers at lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/containers




More information about the Devel mailing list