[CRIU] [PATCH 3/3] Issue #360: Anonymize image files

Mon Jun 24 18:15:52 MSK 2019

On 24/06/2019 13:00, Pavel Emelianov wrote:
> On 6/22/19 12:37 PM, Harshavardhan Unnibhavi wrote:
>> This commit adds the file anonymizer function which anonymizes file names present in images.
>>
>> The anonymized file names are just the shuffled names along the path from root.
>>
>> Signed-off-by: Harshavardhan Unnibhavi <hvubfoss at gmail.com>
>> ---
>>   lib/py/cli.py   |  9 ++++++-
>>   lib/py/strip.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
>>   2 files changed, 74 insertions(+), 1 deletion(-)
>>   create mode 100644 lib/py/strip.py
>>
>> diff --git a/lib/py/cli.py b/lib/py/cli.py
>> index 17622fd2..4a8efeff 100755
>> --- a/lib/py/cli.py
>> +++ b/lib/py/cli.py
>> @@ -5,6 +5,7 @@ import json
>>   import os
>>   
>>   import pycriu
>> +import strip
the keyword 'strip' is very popular (e.g. Python has string.strip() 
method) and using it as a module name may cause confusion, perhaps using 
'anonymize' would be better?
>>   
>>   def inf(opts):
>>   	if opts['in']:
>> @@ -281,15 +282,21 @@ def anonymize(opts):
>>   	img_files = os.listdir(opts['in'])
>>   
>>   	for i in img_files:
>> -		temp = {'in':os.path.join(opts['in'], i)}
>> +		temp = {'in':os.path.join(opts['in'], i), 'out':os.path.join(opts['out'], i)}
>>   
>>   		try:
>>   			m, img = pycriu.images.load(inf(temp), anon_info = True)
>> +			print("Processing File name:{} with magic:{}".format(i, m))
>>   		except pycriu.images.MagicException as exc:
>>   			print("Unknown magic %#x.\n"\
>>   					"Found a raw image, continuing ..."% exc.magic, file=sys.stderr)
>>   			continue
>>   		
>> +		anon_dict = strip.anon_handler(img, m)
>> +		if anon_dict != -1:
>> +			pycriu.images.dump(anon_dict, outf(temp))
> A message about skipping the file is needed.
>
>> +		
>> +		
>>   
>>   explorers = { 'ps': explore_ps, 'fds': explore_fds, 'mems': explore_mems, 'rss': explore_rss }
>>   
>> diff --git a/lib/py/strip.py b/lib/py/strip.py
>> new file mode 100644
>> index 00000000..4069275c
>> --- /dev/null
>> +++ b/lib/py/strip.py
The indentation of this file is using spaces, (IMHO we should be using 
space for python code) however, the rest of the code base is using tabs. 
For consistency it might be better to use tabs in this file as well?
>> @@ -0,0 +1,66 @@
>> +# This file contains methods to deal with anonymising images.
>> +#
>> +# Contents being anonymised can be found at: https://github.com/checkpoint-restore/criu/issues/360
Could you please add the content that is being anonymised instead of 
providing an external link to the github issue? This will be helpful 
when reading the source code offline.
>> +#
>> +# Inorder to anonymise the image files three steps are followed:
s/Inorder/In order/g
>> +#    - decode the binary image to json
>> +#    - strip the necessary information from the json dict
>> +#    - encode the json dict back to a binary image, which is now anonymised
>> +
>> +import sys
>> +import json
>> +import random
>> +
>> +def files_anon(image):
>> +    levels = {}
>> +
>> +    for e in image['entries']:
>> +        f_path = e['reg']['name']
we should handle KeyError: 'reg' or check if the reg key exists.
>> +        f_path = f_path.split('/')
>> +
>> +        lev_num = 0
>> +        for p in f_path:
>> +            if p == '':
>> +                continue
>> +            if lev_num in levels.keys():
>> +                if p not in levels[lev_num].keys():
is .keys() necessary here?
>> +                    temp = list(p)
>> +                    random.shuffle(temp)
> Erm, I'm not 100% it's OK to anonymize file paths like that.
Computing a hash could be another option?
>
>> +                    levels[lev_num][p] = ''.join(temp)
>> +            else:
>> +                levels[lev_num] = {}
>> +                temp = list(p)
>> +                random.shuffle(temp)
>> +                levels[lev_num][p] = ''.join(temp)
> Can we factor out these two branches a bit? Smth like
>
>              if lev_num not in levels.keys():
>                  levels[lev_num] = {}
>              if p not in levels[lev_num].keys():
>                  temp = list(p)
>                  random.shuffle(temp)
>                  levels[lev_num][p] = ''.join(temp)
>
>> +            lev_num += 1
>> +
>> +    for i, e in enumerate(image['entries']):
>> +        f_path = e['reg']['name']
>> +        if f_path == '/':
>> +            continue
>> +        f_path = f_path.split('/')
>> +
>> +        lev_num = 0
>> +        for j, p in enumerate(f_path):
>> +            if p == '':
>> +                continue
>> +            f_path[j] = levels[lev_num][p]
>> +            lev_num += 1
>> +        f_path = '/'.join(f_path)
>> +        image['entries'][i]['reg']['name'] = f_path
>> +
>> +    return image
>> +
>> +
>> +
>> +
>> +anonymizers = {
>> +    'FILES': files_anon,
>> +    }
>> +
> Please, run lint tool on this file, I'm afraid this coding style is not correct. The config
> file for lint is in scripts/flake8.cfg.

To run the lint tool you will need to install flake8:
$ pip install flake8
$ make lint

>> +def anon_handler(image, magic):
>> +    if magic != 'FILES':
>> +        return -1
>> +    handler = anonymizers[magic]
>> +    anon_image = handler(image)
>> +    return anon_image
please add an empty line before the return statement?
>> \ No newline at end of file
please add a new line at end of file?