Skip to content

Commit 16444af

Browse files
committed
adding reproduce example!
1 parent 5840398 commit 16444af

File tree

2 files changed

+120
-29
lines changed

2 files changed

+120
-29
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
from glob import glob
2+
3+
image_files=glob('*.img')
4+
from singularity.reproduce import *
5+
Environment message level found to be DEBUG
6+
7+
assess_replication(image_files[0],image_files[1])
8+
9+
{'BASE': False,
10+
'ENVIRONMENT': False,
11+
'IDENTICAL': False,
12+
'LABELS': True,
13+
'RECIPE': False,
14+
'REPLICATE': False,
15+
'RUNSCRIPT': False}
16+
17+
assess_replication(image_files[0],image_files[0])
18+
19+
{'BASE': True,
20+
'ENVIRONMENT': True,
21+
'IDENTICAL': True,
22+
'LABELS': True,
23+
'RECIPE': True,
24+
'REPLICATE': True,
25+
'RUNSCRIPT': True}

singularity/reproduce.py

Lines changed: 95 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,51 @@
1919
import io
2020

2121

22+
def assess_replication(image_file1,image_file2,version=None):
    '''assess_replication will compare two images on each level of
    reproducibility, and report for every level if the two image
    hashes are equal.
    :param image_file1: path of the first image to hash
    :param image_file2: path of the second image to hash
    :param version: handed to get_levels to select the level definitions
    :returns: dictionary with level names (keys) and True/False (values)
    '''
    report = dict()
    for level_name in get_levels(version=version):
        first = get_image_hash(image_path=image_file1, level=level_name)
        second = get_image_hash(image_path=image_file2, level=level_name)
        # True when both images produce the same hash at this level
        report[level_name] = True if first == second else False
    return report
38+
39+
40+
def assess_differences(image_file1,image_file2,version=None):
    '''assess_differences will compare the files of two images on each level
    of reproducibility, using the per-file hashes from get_content_hashes.
    :param image_file1: path of the first image
    :param image_file2: path of the second image
    :param version: handed to get_levels to select the level definitions
    :returns: dictionary with three lists of file names: "same" (in both
    images, equal hashes), "different" (in both images, unequal hashes) and
    "missing" (present in only one of the two images). The lists are
    accumulated over all levels.
    '''
    different = []
    same = []
    setdiff = []
    for level_name in get_levels(version=version):
        hashes1 = get_content_hashes(image_path=image_file1,
                                     level=level_name)
        hashes2 = get_content_hashes(image_path=image_file2,
                                     level=level_name)

        for file_name, hash_value in hashes1.items():
            if file_name not in hashes2:
                setdiff.append(file_name)
            elif hashes2[file_name] == hash_value:
                same.append(file_name)
            else:
                different.append(file_name)

        # Files that exist only in the second image are differences as well;
        # walking only hashes1 would silently ignore them.
        setdiff.extend(file_name for file_name in hashes2
                       if file_name not in hashes1)

    report = {'missing':setdiff,
              'same':same,
              'different':different}
    return report
66+
2267

2368
def get_custom_level(regexp,name=None,description=None):
2469
'''get_custom_level will generate a custom level for the user,
@@ -40,11 +85,18 @@ def get_level(level):
4085
return get_levels(level)
4186

4287

43-
def get_levels(level=None):
88+
def get_levels(level=None,version=None):
4489
'''get_levels returns a dictionary of levels (key) and values (dictionaries with
4590
descriptions and regular expressions for files) for the user.
91+
:param version: the version of singularity to use (default is 2.2)
4692
'''
47-
93+
if version is None:
94+
version = "2-2"
95+
version = str(version).replace('.','-')
96+
valid_versions = ['2-2','2-3']
97+
if version not in valid_versions:
98+
bot.logger.error("Unsupported version %s, valid versions are %s",version,",".join(valid_versions))
99+
48100
levels_file = os.path.abspath(os.path.join(get_installdir(),
49101
'hub',
50102
'data',
@@ -63,23 +115,39 @@ def get_levels(level=None):
63115
return valid_levels
64116

65117

66-
def include_file(member_path,regexp):
118+
def include_file(member_path,file_filter):
    '''include_file will look at a path and determine if it should be
    included, based on a filter dictionary (eg, a level).
    :param member_path: the path of the archive member to check
    :param file_filter: dictionary that can define any of "skip_files"
    (paths to always leave out), "include_files" (paths to always keep) and
    "regexp" (regular expression searched in the path). Keys that are
    missing or set to None are ignored.
    :returns: True if the file should be included, False otherwise.
    skip_files is checked first, then include_files, then regexp.
    '''
    # Only a leading "." is removed (presumably member names look like
    # "./etc/passwd"); a dot further inside the path must stay untouched.
    if member_path.startswith('.'):
        member_path = member_path[1:]

    # Does the filter skip it explicitly?
    if member_path in (file_filter.get('skip_files') or ()):
        return False

    # Include explicitly?
    if member_path in (file_filter.get('include_files') or ()):
        return True

    # Regular expression?
    regexp = file_filter.get('regexp')
    if regexp is not None and re.search(regexp,member_path):
        return True
    return False
74139

75140

76-
def get_image_hash(image_path,level=None,regexp=None):
141+
142+
def get_image_hash(image_path,level=None,regexp=None,include_files=None,skip_files=None):
77143
'''get_image_hash will generate a sha1 hash of an image, depending on a level
78144
of reproducibility specified by the user. (see function get_levels for descriptions)
79145
:param level: the level of reproducibility to use, which maps to a set regular
80146
expression to match particular files/folders in the image. Choices are in notes.
81147
:param regexp: if defined, the level is ignored and the regular expression used
82148
instead.
149+
:param skip_files: an optional list of files to skip
150+
:param include_files: an optional list of files to keep (only if level not defined)
83151
84152
::notes
85153
@@ -92,51 +160,49 @@ def get_image_hash(image_path,level=None,regexp=None):
92160
'''
93161

94162
# First get a level dictionary, with description and regexp
95-
if regexp is not None:
96-
file_filter = get_custom_level(regexp)
163+
if level is None:
164+
if regexp is not None or include_files is not None or skip_files is not None:
165+
file_filter = get_custom_level(regexp=regexp,
166+
include_files=include_files,
167+
skip_files=skip_files)
168+
else:
169+
file_filter = get_level("REPLICATE")
97170

98-
elif level is not None:
99-
if level is "IDENTICAL":
100-
return get_image_file_hash(image_path)
171+
else:
101172
file_filter = get_level(level)
102173

103-
else:
104-
file_filter = get_level("REPLICATE")
105174

106175
cli = Singularity()
107176
byte_array = cli.export(image_path,pipe=True)
108177
file_object = io.BytesIO(byte_array)
109178

110179
# Now create a tarfile from the file object
111180
tar = tarfile.open(mode="r|*", fileobj=file_object)
112-
chunk_size = 100*1024
113181
hasher = hashlib.sha1()
114182
for member in tar:
115183
if member.isfile():
116-
if include_file(member.name,file_filter['regexp']):
117-
filey = tar.extractfile(member)
118-
buf = filey.read(chunk_size)
119-
while buf:
120-
hasher.update(buf)
121-
buf = filey.read(chunk_size)
184+
if include_file(member.name,file_filter):
185+
buf = member.tobuf()
186+
hasher.update(buf)
122187
return hasher.hexdigest()
123188

124189

125-
def get_content_hashes(image_path,level=None,regexp=None):
190+
def get_content_hashes(image_path,level=None,regexp=None,include_files=None,skip_files=None):
126191
'''get_content_hashes is like get_image_hash, but it returns a complete dictionary
127192
of file names (keys) and their respective hashes (values). This function is intended
128193
for more research purposes and was used to generate the levels in the first place
129194
'''
130-
131195
# First get a level dictionary, with description and regexp
132-
if regexp is not None:
133-
file_filter = get_custom_level(regexp)
196+
if level is None:
197+
if regexp is not None or include_files is not None or skip_files is not None:
198+
file_filter = get_custom_level(regexp=regexp,
199+
include_files=include_files,
200+
skip_files=skip_files)
201+
else:
202+
file_filter = get_level("REPLICATE")
134203

135-
elif level is not None:
136-
file_filter = get_level(level)
137-
138204
else:
139-
file_filter = get_level("REPLICATE")
205+
file_filter = get_level(level)
140206

141207
cli = Singularity()
142208
byte_array = cli.export(image_path,pipe=True)
@@ -148,7 +214,7 @@ def get_content_hashes(image_path,level=None,regexp=None):
148214
digest = dict()
149215
for member in tar:
150216
if member.isfile():
151-
if include_file(member.name,file_filter['regexp']):
217+
if include_file(member.name,file_filters):
152218
buf = member.tobuf()
153219
hasher = hashlib.sha1()
154220
hasher.update(buf)

0 commit comments

Comments
 (0)