1919import io
2020
2121
def assess_replication(image_file1, image_file2, version=None):
    '''assess_replication compares two images at every level of
    reproducibility and reports, per level, whether their hashes agree.
    :param image_file1: path of the first image (passed to get_image_hash)
    :param image_file2: path of the second image (passed to get_image_hash)
    :param version: forwarded to get_levels to choose the set of levels
    :returns: dictionary of level names (keys) and True/False (values),
    True meaning both images produced the same hash for that level
    '''
    levels = get_levels(version=version)
    outcome = {}
    for name, _ in levels.items():
        # Hash the first image, then the second, for this level only
        first, second = [get_image_hash(image_path=path, level=name)
                         for path in (image_file1, image_file2)]
        outcome[name] = bool(first == second)
    return outcome
38+
39+
def assess_differences(image_file1, image_file2, version=None):
    '''assess_differences compares the per-file hashes of two images, level
    by level, and sorts the file names of the first image into three groups.
    :param image_file1: path of the reference image; only its files are walked
    :param image_file2: path of the image to compare against
    :param version: forwarded to get_levels to choose the set of levels
    :returns: dictionary with keys 'missing' (in image 1 but not in image 2),
    'same' (equal hashes) and 'different' (hashes differ). Results from all
    levels are appended to the same lists, so a file name is listed once for
    every level whose content hashes include it.
    '''
    levels = get_levels(version=version)
    report = {'missing': [],
              'same': [],
              'different': []}

    for level_name, _ in levels.items():
        first = get_content_hashes(image_path=image_file1,
                                   level=level_name)
        second = get_content_hashes(image_path=image_file2,
                                    level=level_name)
        for file_name in first:
            if file_name not in second:
                bucket = 'missing'
            elif second[file_name] == first[file_name]:
                bucket = 'same'
            else:
                bucket = 'different'
            report[bucket].append(file_name)

    return report
66+
2267
2368def get_custom_level (regexp ,name = None ,description = None ):
2469 '''get_custom_level will generate a custom level for the user,
@@ -40,11 +85,18 @@ def get_level(level):
4085 return get_levels (level )
4186
4287
43- def get_levels (level = None ):
88+ def get_levels (level = None , version = None ):
4489 '''get_levels returns a dictionary of levels (key) and values (dictionaries with
4590 descriptions and regular expressions for files) for the user.
91+ :param version: the version of singularity to use (default is 2.2)
4692 '''
47-
93+ if version is None :
94+ version = "2-2"
95+ version = str (version ).replace ('.' ,'-' )
96+ valid_versions = ['2-2' ,'2-3' ]
97+ if version not in valid_versions :
98+ bot .logger .error ("Unsupported version %s, valid versions are %s" ,version ,"," .join (valid_versions ))
99+
48100 levels_file = os .path .abspath (os .path .join (get_installdir (),
49101 'hub' ,
50102 'data' ,
@@ -63,23 +115,39 @@ def get_levels(level=None):
63115 return valid_levels
64116
65117
def include_file(member_path, file_filter):
    '''include_file will look at a path and determine if it should be
    included, based on a filter (dictionary) from a level.
    :param member_path: the name of the archive member to check. A single
    leading "." is removed before matching, so "./etc/hosts" is compared
    as "/etc/hosts".
    :param file_filter: dictionary that may define any of "skip_files"
    (paths to always exclude), "include_files" (paths to always include)
    and "regexp" (pattern searched in the path). A key that is missing or
    set to None is ignored.
    :returns: True if the path should be included, False otherwise. The
    checks run in the order skip_files, include_files, regexp, and the
    first one that matches decides.
    '''
    # Drop only a leading dot. Replacing the first dot found anywhere would
    # corrupt names that do not start with one ("etc/app.conf").
    if member_path.startswith('.'):
        member_path = member_path[1:]

    # Does the filter skip it explicitly? (None is treated as an empty list)
    if member_path in (file_filter.get('skip_files') or ()):
        return False

    # Include explicitly?
    if member_path in (file_filter.get('include_files') or ()):
        return True

    # Regular expression?
    regexp = file_filter.get('regexp')
    if regexp is not None and re.search(regexp, member_path):
        return True
    return False
74139
75140
76- def get_image_hash (image_path ,level = None ,regexp = None ):
141+
142+ def get_image_hash (image_path ,level = None ,regexp = None ,include_files = None ,skip_files = None ):
77143 '''get_image_hash will generate a sha1 hash of an image, depending on a level
78144 of reproducibility specified by the user. (see function get_levels for descriptions)
79145 :param level: the level of reproducibility to use, which maps to a set regular
80146 expression to match particular files/folders in the image. Choices are in notes.
81147 :param regexp: if defined, the level is ignored and the regular expression used
82148 instead.
149+ :param skip_files: an optional list of files to skip
150+ :param include_files: an optional list of files to keep (only if level not defined)
83151
84152 ::notes
85153
@@ -92,51 +160,49 @@ def get_image_hash(image_path,level=None,regexp=None):
92160 '''
93161
94162 # First get a level dictionary, with description and regexp
95- if regexp is not None :
96- file_filter = get_custom_level (regexp )
163+ if level is None :
164+ if regexp is not None or include_files is not None or skip_files is not None :
165+ file_filter = get_custom_level (regexp = regexp ,
166+ include_files = include_files ,
167+ skip_files = skip_files )
168+ else :
169+ file_filter = get_level ("REPLICATE" )
97170
98- elif level is not None :
99- if level is "IDENTICAL" :
100- return get_image_file_hash (image_path )
171+ else :
101172 file_filter = get_level (level )
102173
103- else :
104- file_filter = get_level ("REPLICATE" )
105174
106175 cli = Singularity ()
107176 byte_array = cli .export (image_path ,pipe = True )
108177 file_object = io .BytesIO (byte_array )
109178
110179 # Now create a tarfile from the file object
111180 tar = tarfile .open (mode = "r|*" , fileobj = file_object )
112- chunk_size = 100 * 1024
113181 hasher = hashlib .sha1 ()
114182 for member in tar :
115183 if member .isfile ():
116- if include_file (member .name ,file_filter ['regexp' ]):
117- filey = tar .extractfile (member )
118- buf = filey .read (chunk_size )
119- while buf :
120- hasher .update (buf )
121- buf = filey .read (chunk_size )
184+ if include_file (member .name ,file_filter ):
185+ buf = member .tobuf ()
186+ hasher .update (buf )
122187 return hasher .hexdigest ()
123188
124189
125- def get_content_hashes (image_path ,level = None ,regexp = None ):
190+ def get_content_hashes (image_path ,level = None ,regexp = None , include_files = None , skip_files = None ):
126191 '''get_content_hashes is like get_image_hash, but it returns a complete dictionary
127192 of file names (keys) and their respective hashes (values). This function is intended
128193 for more research purposes and was used to generate the levels in the first place
129194 '''
130-
131195 # First get a level dictionary, with description and regexp
132- if regexp is not None :
133- file_filter = get_custom_level (regexp )
196+ if level is None :
197+ if regexp is not None or include_files is not None or skip_files is not None :
198+ file_filter = get_custom_level (regexp = regexp ,
199+ include_files = include_files ,
200+ skip_files = skip_files )
201+ else :
202+ file_filter = get_level ("REPLICATE" )
134203
135- elif level is not None :
136- file_filter = get_level (level )
137-
138204 else :
139- file_filter = get_level ("REPLICATE" )
205+ file_filter = get_level (level )
140206
141207 cli = Singularity ()
142208 byte_array = cli .export (image_path ,pipe = True )
@@ -148,7 +214,7 @@ def get_content_hashes(image_path,level=None,regexp=None):
148214 digest = dict ()
149215 for member in tar :
150216 if member .isfile ():
151- if include_file (member .name ,file_filter [ 'regexp' ] ):
217+ if include_file (member .name ,file_filters ):
152218 buf = member .tobuf ()
153219 hasher = hashlib .sha1 ()
154220 hasher .update (buf )
0 commit comments