[Devel] OpenVZ patch for VE disk i/o accounting :: 2.6.18

Kirill Korotaev dev at sw.ru
Wed Nov 29 05:13:55 PST 2006


Rick,

It looks like Pavel raised somewhat different points, but I want to stress this one:
writes can happen in an arbitrary context (e.g. when memory is being reclaimed,
the data is written out from the context of whatever process happens to be looking
for memory), so get_exec_ub() imho is not always a valid context to charge.

Thanks,
Kirill

> I fixed a small bug and attached a revised patch.
> 
> I tested this patch by running 10 VEs with all-night bonnie++ sessions and 
> 10 VEs doing nothing. In the morning, the HN and the 10 non-bonnie VEs were 
> charged for very little disk access (only maintaining syslog). The 10 
> bonnie VEs were charged for the right amount of disk I/O.
> 
> Rick Blundell
> 
> 
> ------------------------------------------------------------------------
> 
> diff -ur linux-2.6.18-FRESH-OPENVZ/block/ll_rw_blk.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/block/ll_rw_blk.c
> --- linux-2.6.18-FRESH-OPENVZ/block/ll_rw_blk.c	2006-11-26 04:07:27.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/block/ll_rw_blk.c	2006-11-26 04:36:44.000000000 -0500
> @@ -28,6 +28,7 @@
>  #include <linux/interrupt.h>
>  #include <linux/cpu.h>
>  #include <linux/blktrace_api.h>
> +#include <ub/ub_misc.h>
>  
>  /*
>   * for max sense size
> @@ -3131,10 +3132,16 @@
>  	BIO_BUG_ON(!bio->bi_size);
>  	BIO_BUG_ON(!bio->bi_io_vec);
>  	bio->bi_rw |= rw;
> -	if (rw & WRITE)
> -		count_vm_events(PGPGOUT, count);
> -	else
> -		count_vm_events(PGPGIN, count);
> +
> +       if (rw & WRITE) {
> +                count_vm_events(PGPGOUT, count);
> +      } else {
> +		ub_bytesread_charge(bio->bi_size);
> +                count_vm_events(PGPGIN, count);
> +       }
> +
> +
> +
>  
>  	if (unlikely(block_dump)) {
>  		char b[BDEVNAME_SIZE];
> diff -ur linux-2.6.18-FRESH-OPENVZ/fs/buffer.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/buffer.c
> --- linux-2.6.18-FRESH-OPENVZ/fs/buffer.c	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/buffer.c	2006-11-26 05:52:40.000000000 -0500
> @@ -41,7 +41,7 @@
>  #include <linux/bitops.h>
>  #include <linux/mpage.h>
>  #include <linux/bit_spinlock.h>
> -
> +#include <ub/ub_misc.h>
>  static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
>  static void invalidate_bh_lrus(void);
>  
> @@ -858,8 +858,10 @@
>  	if (!TestSetPageDirty(page)) {
>  		write_lock_irq(&mapping->tree_lock);
>  		if (page->mapping) {	/* Race with truncate? */
> -			if (mapping_cap_account_dirty(mapping))
> -				__inc_zone_page_state(page, NR_FILE_DIRTY);
> +               if (mapping_cap_account_dirty(mapping)) {
> +			ub_byteswritten_charge(PAGE_CACHE_SIZE);
> +			__inc_zone_page_state(page, NR_FILE_DIRTY);
> +		}
>  			radix_tree_tag_set(&mapping->page_tree,
>  						page_index(page),
>  						PAGECACHE_TAG_DIRTY);
> @@ -2862,8 +2864,11 @@
>  void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
>  {
>  	int i;
> +       if (likely(nr) && !(rw & WRITE))
> +		ub_bytesread_charge(nr * bhs[0]->b_size);
>  
>  	for (i = 0; i < nr; i++) {
> +
>  		struct buffer_head *bh = bhs[i];
>  
>  		if (rw == SWRITE)
> @@ -2999,7 +3004,9 @@
>  		 * This only applies in the rare case where try_to_free_buffers
>  		 * succeeds but the page is not freed.
>  		 */
> -		clear_page_dirty(page);
> +               if (test_clear_page_dirty(page))
> +                       ub_byteswritten_uncharge(PAGE_CACHE_SIZE);
> +
>  	}
>  	spin_unlock(&mapping->private_lock);
>  out:
> diff -ur linux-2.6.18-FRESH-OPENVZ/fs/cifs/file.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/cifs/file.c
> --- linux-2.6.18-FRESH-OPENVZ/fs/cifs/file.c	2006-09-19 23:42:06.000000000 -0400
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/cifs/file.c	2006-11-26 04:36:24.000000000 -0500
> @@ -39,6 +39,7 @@
>  #include "cifs_unicode.h"
>  #include "cifs_debug.h"
>  #include "cifs_fs_sb.h"
> +#include <ub/ub_misc.h>
>  
>  static inline struct cifsFileInfo *cifs_init_private(
>  	struct cifsFileInfo *private_data, struct inode *inode,
> @@ -1815,6 +1816,7 @@
>  			}
>  			break;
>  		} else if (bytes_read > 0) {
> +			ub_bytesread_charge(bytes_read);
>  			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
>  			cifs_copy_cache_pages(mapping, page_list, bytes_read,
>  				smb_read_data + 4 /* RFC1001 hdr */ +
> diff -ur linux-2.6.18-FRESH-OPENVZ/fs/direct-io.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/direct-io.c
> --- linux-2.6.18-FRESH-OPENVZ/fs/direct-io.c	2006-09-19 23:42:06.000000000 -0400
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/fs/direct-io.c	2006-11-26 04:36:01.000000000 -0500
> @@ -35,6 +35,7 @@
>  #include <linux/rwsem.h>
>  #include <linux/uio.h>
>  #include <asm/atomic.h>
> +#include <ub/ub_misc.h>
>  
>  /*
>   * How many user pages to map in one call to get_user_pages().  This determines
> @@ -675,6 +676,11 @@
>  {
>  	int ret = 0;
>  
> +       if (dio->rw & WRITE) {
> +		ub_byteswritten_charge(len);
> +       }
> +
> +
>  	/*
>  	 * Can we just grow the current page's presence in the dio?
>  	 */
> diff -ur linux-2.6.18-FRESH-OPENVZ/include/ub/beancounter.h kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/include/ub/beancounter.h
> --- linux-2.6.18-FRESH-OPENVZ/include/ub/beancounter.h	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/include/ub/beancounter.h	2006-11-26 06:57:14.000000000 -0500
> @@ -71,13 +71,21 @@
>  #define UB_NUMOTHERSOCK	17	/* Number of other sockets. */
>  #define UB_DCACHESIZE	18	/* Size of busy dentry/inode cache. */
>  #define UB_NUMFILE	19	/* Number of open files. */
> +#define UB_NUMREADS     20
> +#define UB_NUMWRITES     21
> +#define UB_KBYTESREAD 23
> +#define UB_KBYTESWRITTEN 24
> +
> +
> +#define UB_RESOURCES   25
>  
> -#define UB_RESOURCES	24
>  
>  #define UB_UNUSEDPRIVVM	(UB_RESOURCES + 0)
>  #define UB_TMPFSPAGES	(UB_RESOURCES + 1)
>  #define UB_SWAPPAGES	(UB_RESOURCES + 2)
>  #define UB_HELDPAGES	(UB_RESOURCES + 3)
> +#define UB_BYTESREAD (UB_RESOURCES + 4)
> +#define UB_BYTESWRITTEN (UB_RESOURCES + 5)
>  
>  struct ubparm {
>  	/* 
> diff -ur linux-2.6.18-FRESH-OPENVZ/include/ub/ub_misc.h kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/include/ub/ub_misc.h
> --- linux-2.6.18-FRESH-OPENVZ/include/ub/ub_misc.h	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/include/ub/ub_misc.h	2006-11-26 06:25:02.000000000 -0500
> @@ -18,6 +18,12 @@
>  struct file_lock;
>  struct sigqueue;
>  
> +UB_DECLARE_FUNC(int, ub_numreads_charge())
> +UB_DECLARE_FUNC(int, ub_numwrites_charge())
> +UB_DECLARE_FUNC(int, ub_numwrites_uncharge())
> +UB_DECLARE_FUNC(int, ub_byteswritten_charge(int bytes))
> +UB_DECLARE_FUNC(int, ub_byteswritten_uncharge(int bytes))
> +UB_DECLARE_FUNC(int, ub_bytesread_charge(int bytes))
>  UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
>  UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
>  UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
> diff -ur linux-2.6.18-FRESH-OPENVZ/kernel/ub/beancounter.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/kernel/ub/beancounter.c
> --- linux-2.6.18-FRESH-OPENVZ/kernel/ub/beancounter.c	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/kernel/ub/beancounter.c	2006-11-26 08:22:41.000000000 -0500
> @@ -60,14 +60,18 @@
>  	"numothersock",
>  	"dcachesize",
>  	"numfile",
> -	"dummy",	/* 20 */
> -	"dummy",
> -	"dummy",
> -	"numiptent",
> -	"unused_privvmpages",	/* UB_RESOURCES */
> +       "numreads",     /* 20 */
> +       "numwrites",
> +        "numiptent",
> +"kbytesread",
> +"kbyteswritten",
> +        "unused_privvmpages",   /* UB_RESOURCES */
>  	"tmpfs_respages",
>  	"swap_pages",
>  	"held_pages",
> +       "bytesread",
> +       "byteswritten",
> +
>  };
>  
>  static void init_beancounter_struct(struct user_beancounter *ub);
> @@ -623,6 +627,13 @@
>  	ub->ub_parms[UB_NUMSIGINFO].limit = 1024;
>  	ub->ub_parms[UB_DCACHESIZE].limit = 1024*1024;
>  	ub->ub_parms[UB_NUMFILE].limit = 1024;
> +        ub->ub_parms[UB_NUMREADS].limit = UB_MAXVALUE;
> +        ub->ub_parms[UB_NUMWRITES].limit = UB_MAXVALUE;
> +        ub->ub_parms[UB_BYTESREAD].limit = 1024;
> +        ub->ub_parms[UB_BYTESWRITTEN].limit = 1024;
> +        ub->ub_parms[UB_KBYTESREAD].limit = UB_MAXVALUE;
> +        ub->ub_parms[UB_KBYTESWRITTEN].limit = UB_MAXVALUE;
> +
>  
>  	for (k = 0; k < UB_RESOURCES; k++)
>  		ub->ub_parms[k].barrier = ub->ub_parms[k].limit;
> diff -ur linux-2.6.18-FRESH-OPENVZ/kernel/ub/ub_misc.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/kernel/ub/ub_misc.c
> --- linux-2.6.18-FRESH-OPENVZ/kernel/ub/ub_misc.c	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/kernel/ub/ub_misc.c	2006-11-28 15:53:18.000000000 -0500
> @@ -53,6 +53,86 @@
>  	new_bc->pgfault_handle = 0;
>  	new_bc->pgfault_allot = 0;
>  }
> +int ub_numreads_charge(){
> +       struct user_beancounter *ub;
> +      int retval;
> +       ub=get_exec_ub();
> +       retval=charge_beancounter(ub, UB_NUMREADS, 1, UB_FORCE);
> +       return retval;
> +}
> +int ub_numwrites_charge(){
> +        struct user_beancounter *ub;
> +        int retval;
> +        ub=get_exec_ub();
> +        retval=charge_beancounter(ub, UB_NUMWRITES, 1, UB_FORCE);
> +        return retval;
> +}
> +ub_numwrites_uncharge(){
> +        struct user_beancounter *ub;
> +        ub=get_exec_ub();
> +	if(ub->ub_parms[UB_NUMWRITES].held < 1) return;
> +        uncharge_beancounter(ub, UB_NUMWRITES, 1);
> +	return;
> +}
> +int ub_bytesread_charge(int bytes){
> +	ub_numreads_charge();
> +        struct user_beancounter *ub;
> +        int retval;
> +        ub=get_exec_ub();
> +	int kbytes_charge=0, uncharge_val=0;
> +	if((bytes + ub->ub_parms[UB_BYTESREAD].held)>1024){
> +		kbytes_charge=(int)((bytes + ub->ub_parms[UB_BYTESREAD].held)/1024);
> +		bytes=(bytes + ub->ub_parms[UB_BYTESREAD].held)-kbytes_charge*1024;
> +		uncharge_beancounter(ub,UB_BYTESREAD,ub->ub_parms[UB_BYTESREAD].held);
> +		charge_beancounter(ub,UB_BYTESREAD, bytes, UB_FORCE);
> +		retval=charge_beancounter(ub,UB_KBYTESREAD,kbytes_charge,UB_FORCE);
> +	}else{
> +		retval=charge_beancounter(ub,UB_BYTESREAD, bytes, UB_FORCE);
> +	}
> +        return retval;
> +}
> +int ub_byteswritten_uncharge(int bytes){
> +	struct user_beancounter *ub;
> +        int retval;
> +        ub=get_exec_ub();
> +        int kbytes_uncharge=0;
> +	int held=ub->ub_parms[UB_BYTESWRITTEN].held;
> +        if((held - bytes)<0 || held==0){
> +		kbytes_uncharge=(int)((bytes+held)/1024);
> +                bytes=(kbytes_uncharge*1024-(bytes - held));
> +	
> +                uncharge_beancounter(ub,UB_BYTESWRITTEN,held);
> +		charge_beancounter(ub,UB_BYTESWRITTEN, bytes, UB_FORCE);		
> +		if(ub->ub_parms[UB_KBYTESWRITTEN].held > kbytes_uncharge){
> +	                uncharge_beancounter(ub,UB_KBYTESWRITTEN,kbytes_uncharge);
> +		}
> +	}else{
> +		uncharge_beancounter(ub,UB_BYTESWRITTEN, bytes);
> +
> +	}
> +}
> + int ub_byteswritten_charge(int bytes){
> +	ub_numwrites_charge();
> +        struct user_beancounter *ub;
> +        int retval;
> +        ub=get_exec_ub();
> +        int kbytes_charge=0;
> +        if((bytes + ub->ub_parms[UB_BYTESWRITTEN].held)>1024){
> +
> +                kbytes_charge=((bytes + ub->ub_parms[UB_BYTESWRITTEN].held)/1024);
> +                bytes=(bytes + ub->ub_parms[UB_BYTESWRITTEN].held)-kbytes_charge*1024;
> +		if( ub->ub_parms[UB_BYTESWRITTEN].held < 1024 )
> +			ub_byteswritten_uncharge(ub->ub_parms[UB_BYTESWRITTEN].held);
> +                charge_beancounter(ub,UB_BYTESWRITTEN, bytes, UB_FORCE);
> +                retval=charge_beancounter(ub,UB_KBYTESWRITTEN,kbytes_charge,UB_FORCE);
> +        }else{
> +                retval=charge_beancounter(ub,UB_BYTESWRITTEN, bytes, UB_FORCE);
> +       }
> +       return retval;
> +	
> +}
> +
> +
>  
>  void ub_init_task_bc(struct task_beancounter *tbc)
>  {
> diff -ur linux-2.6.18-FRESH-OPENVZ/mm/page-writeback.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/page-writeback.c
> --- linux-2.6.18-FRESH-OPENVZ/mm/page-writeback.c	2006-09-19 23:42:06.000000000 -0400
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/page-writeback.c	2006-11-26 05:49:54.000000000 -0500
> @@ -29,6 +29,7 @@
>  #include <linux/sysctl.h>
>  #include <linux/cpu.h>
>  #include <linux/syscalls.h>
> +#include <ub/ub_misc.h>
>  
>  /*
>   * The maximum number of pages to writeout in a single bdflush/kupdate
> @@ -623,9 +624,11 @@
>  			mapping2 = page_mapping(page);
>  			if (mapping2) { /* Race with truncate? */
>  				BUG_ON(mapping2 != mapping);
> -				if (mapping_cap_account_dirty(mapping))
> +				if (mapping_cap_account_dirty(mapping)) {
>  					__inc_zone_page_state(page,
>  								NR_FILE_DIRTY);
> +					ub_byteswritten_charge(PAGE_CACHE_SIZE);
> +				}
>  				radix_tree_tag_set(&mapping->page_tree,
>  					page_index(page), PAGECACHE_TAG_DIRTY);
>  			}
> diff -ur linux-2.6.18-FRESH-OPENVZ/mm/readahead.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/readahead.c
> --- linux-2.6.18-FRESH-OPENVZ/mm/readahead.c	2006-09-19 23:42:06.000000000 -0400
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/readahead.c	2006-11-26 04:36:12.000000000 -0500
> @@ -14,6 +14,7 @@
>  #include <linux/blkdev.h>
>  #include <linux/backing-dev.h>
>  #include <linux/pagevec.h>
> +#include <ub/ub_misc.h>
>  
>  void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
>  {
> @@ -143,6 +144,7 @@
>  			page_cache_release(page);
>  			continue;
>  		}
> +		ub_bytesread_charge(PAGE_CACHE_SIZE);
>  		ret = filler(data, page);
>  		if (!pagevec_add(&lru_pvec, page))
>  			__pagevec_lru_add(&lru_pvec);
> diff -ur linux-2.6.18-FRESH-OPENVZ/mm/truncate.c kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/truncate.c
> --- linux-2.6.18-FRESH-OPENVZ/mm/truncate.c	2006-11-26 04:07:28.000000000 -0500
> +++ kernel-2.6.18-rjb-004-rev-028-005.1/linux-2.6.18/mm/truncate.c	2006-11-26 10:22:52.000000000 -0500
> @@ -41,8 +41,9 @@
>  
>  	if (PagePrivate(page))
>  		do_invalidatepage(page, 0);
> +       if (test_clear_page_dirty(page))
> +               ub_byteswritten_uncharge(PAGE_CACHE_SIZE);
>  
> -	clear_page_dirty(page);
>  	ClearPageUptodate(page);
>  	ClearPageMappedToDisk(page);
>  	remove_from_page_cache(page);
> 
> 
> ------------------------------------------------------------------------
> 
> _______________________________________________
> Devel mailing list
> Devel at openvz.org
> https://openvz.org/mailman/listinfo/devel




More information about the Devel mailing list