[Devel] [PATCH rh7 v2 1/3] fs/cleancache: fix data invalidation in the cleancache during direct_io

Konstantin Khorenko khorenko at virtuozzo.com
Thu Apr 13 00:39:36 PDT 2017



--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 04/13/2017 09:51 AM, Dmitry Monakhov wrote:
> Andrey Ryabinin <aryabinin at virtuozzo.com> writes:
>
>> Currently some direct_io fs hooks call invalidate_inode_pages2_range()
>> conditionally iff mapping->nrpages is not zero. So if nrpages is zero,
>> data in cleancache wouldn't be invalidated. So the next buffered read
>> may get stale data from the cleancache.
>
>>
>> Fix this by calling invalidate_inode_pages2_range() regardless of nrpages
>> value. And if nrpages is zero, bail out from invalidate_inode_pages2_range()
>> only after cleancache_invalidate_inode(), so that we invalidate cleancache
>> but still avoid pointless page cache lookups.
> BTW, can we please make tcache plugable. So one who do not want fancy
> caching features can simply disable it. As we do with pfcache.

tcache/tswap disable/enable tweak (vz7: patch "Subject: [PATCH rh7] tcache/tswap: enable by default")
tcache/swap can be disabled/enabled using following commands:
echo 'N' > /sys/module/{tcache,tswap}/parameters/active
echo 'Y' > /sys/module/{tcache,tswap}/parameters/active

per-cgroup on the fly: echo 1 > @memory.disable_cleancache

To disable tcache at boot time: "tcache.enabled=0" kernel option

>
>>
>> https://jira.sw.ru/browse/PSBM-63908
>> Signed-off-by: Andrey Ryabinin <aryabinin at virtuozzo.com>
>> ---
>>  fs/9p/vfs_file.c  |  4 ++--
>>  fs/nfs/direct.c   | 16 ++++++----------
>>  fs/nfs/inode.c    |  7 ++++---
>>  fs/xfs/xfs_file.c | 30 ++++++++++++++----------------
>>  mm/filemap.c      | 28 ++++++++++++----------------
>>  mm/truncate.c     |  4 ++++
>>  6 files changed, 42 insertions(+), 47 deletions(-)
>>
>> diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
>> index 7da03f8..afe0036 100644
>> --- a/fs/9p/vfs_file.c
>> +++ b/fs/9p/vfs_file.c
>> @@ -482,7 +482,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
>>  	if (invalidate && (total > 0)) {
>>  		pg_start = origin >> PAGE_CACHE_SHIFT;
>>  		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
>> -		if (inode->i_mapping && inode->i_mapping->nrpages)
>> +		if (inode->i_mapping)
>>  			invalidate_inode_pages2_range(inode->i_mapping,
>>  						      pg_start, pg_end);
>>  		*offset += total;
>> @@ -688,7 +688,7 @@ v9fs_direct_write(struct file *filp, const char __user * data,
>>  	 * about to write.  We do this *before* the write so that if we fail
>>  	 * here we fall back to buffered write
>>  	 */
>> -	if (mapping->nrpages) {
>> +	{
>>  		pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
>>  		pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
>>
>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>> index ab96f01..9bbbb63 100644
>> --- a/fs/nfs/direct.c
>> +++ b/fs/nfs/direct.c
>> @@ -1132,12 +1132,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>>  	if (result)
>>  		goto out_unlock;
>>
>> -	if (mapping->nrpages) {
>> -		result = invalidate_inode_pages2_range(mapping,
>> -					pos >> PAGE_CACHE_SHIFT, end);
>> -		if (result)
>> -			goto out_unlock;
>> -	}
>> +	result = invalidate_inode_pages2_range(mapping,
>> +				pos >> PAGE_CACHE_SHIFT, end);
>> +	if (result)
>> +		goto out_unlock;
>>
>>  	task_io_account_write(count);
>>
>> @@ -1161,10 +1159,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
>>
>>  	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
>>
>> -	if (mapping->nrpages) {
>> -		invalidate_inode_pages2_range(mapping,
>> -					      pos >> PAGE_CACHE_SHIFT, end);
>> -	}
>> +	invalidate_inode_pages2_range(mapping,
>> +				pos >> PAGE_CACHE_SHIFT, end);
>>
>>  	mutex_unlock(&inode->i_mutex);
>>
>> diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
>> index 8c06aed..779b05c 100644
>> --- a/fs/nfs/inode.c
>> +++ b/fs/nfs/inode.c
>> @@ -1065,10 +1065,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
>>  			if (ret < 0)
>>  				return ret;
>>  		}
>> -		ret = invalidate_inode_pages2(mapping);
>> -		if (ret < 0)
>> -			return ret;
>>  	}
>> +	ret = invalidate_inode_pages2(mapping);
>> +	if (ret < 0)
>> +		return ret;
>> +
>>  	if (S_ISDIR(inode->i_mode)) {
>>  		spin_lock(&inode->i_lock);
>>  		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
>> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
>> index 9a2193b..0b7a35b 100644
>> --- a/fs/xfs/xfs_file.c
>> +++ b/fs/xfs/xfs_file.c
>> @@ -346,7 +346,7 @@ xfs_file_aio_read(
>>  	 * serialisation.
>>  	 */
>>  	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
>> -	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
>> +	if ((ioflags & XFS_IO_ISDIRECT)) {
>>  		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
>>  		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
>>
>> @@ -361,22 +361,20 @@ xfs_file_aio_read(
>>  		 * flush and reduce the chances of repeated iolock cycles going
>>  		 * forward.
>>  		 */
>> -		if (inode->i_mapping->nrpages) {
>> -			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
>> -			if (ret) {
>> -				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
>> -				return ret;
>> -			}
>> -
>> -			/*
>> -			 * Invalidate whole pages. This can return an error if
>> -			 * we fail to invalidate a page, but this should never
>> -			 * happen on XFS. Warn if it does fail.
>> -			 */
>> -			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
>> -			WARN_ON_ONCE(ret);
>> -			ret = 0;
>> +		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
>> +		if (ret) {
>> +			xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
>> +			return ret;
>>  		}
>> +
>> +		/*
>> +		 * Invalidate whole pages. This can return an error if
>> +		 * we fail to invalidate a page, but this should never
>> +		 * happen on XFS. Warn if it does fail.
>> +		 */
>> +		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
>> +		WARN_ON_ONCE(ret);
>> +		ret = 0;
>>  		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
>>  	}
>>
>> diff --git a/mm/filemap.c b/mm/filemap.c
>> index 16517c6..8e608fc 100644
>> --- a/mm/filemap.c
>> +++ b/mm/filemap.c
>> @@ -2598,18 +2598,16 @@ generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
>>  	 * about to write.  We do this *before* the write so that we can return
>>  	 * without clobbering -EIOCBQUEUED from ->direct_IO().
>>  	 */
>> -	if (mapping->nrpages) {
>> -		written = invalidate_inode_pages2_range(mapping,
>> -					pos >> PAGE_CACHE_SHIFT, end);
>> -		/*
>> -		 * If a page can not be invalidated, return 0 to fall back
>> -		 * to buffered write.
>> -		 */
>> -		if (written) {
>> -			if (written == -EBUSY)
>> -				return 0;
>> -			goto out;
>> -		}
>> +	written = invalidate_inode_pages2_range(mapping,
>> +						pos >> PAGE_CACHE_SHIFT, end);
>> +	/*
>> +	 * If a page can not be invalidated, return 0 to fall back
>> +	 * to buffered write.
>> +	 */
>> +	if (written) {
>> +		if (written == -EBUSY)
>> +			return 0;
>> +		goto out;
>>  	}
>>
>>  	written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
>> @@ -2622,10 +2620,8 @@ generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
>>  	 * so we don't support it 100%.  If this invalidation
>>  	 * fails, tough, the write still worked...
>>  	 */
>> -	if (mapping->nrpages) {
>> -		invalidate_inode_pages2_range(mapping,
>> -					      pos >> PAGE_CACHE_SHIFT, end);
>> -	}
>> +	invalidate_inode_pages2_range(mapping,
>> +				pos >> PAGE_CACHE_SHIFT, end);
>>
>>  	if (written > 0) {
>>  		pos += written;
>> diff --git a/mm/truncate.c b/mm/truncate.c
>> index f460e67..ce4b1d8 100644
>> --- a/mm/truncate.c
>> +++ b/mm/truncate.c
>> @@ -628,6 +628,10 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
>>  	int did_range_unmap = 0;
>>
>>  	cleancache_invalidate_inode(mapping);
>> +
>> +	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
>> +		return 0;
>> +
>>  	pagevec_init(&pvec, 0);
>>  	index = start;
>>  	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
>> --
>> 2.10.2
> .
>


More information about the Devel mailing list