Commit 28a9f7a

adding client
1 parent e50804f commit 28a9f7a

File tree

6 files changed: +315 -187 lines changed

examples/singularity_hub/compare_builds.py

Lines changed: 130 additions & 79 deletions
@@ -19,9 +19,9 @@
 base = "/home/vanessa/Documents/Work/singularity/hub"
 storage = "%s/containers" %base
 clones = "%s/clones" %storage # same image downloaded multiple times
-replicates = "%s/replicates" %storage # these are quasi replicates
+replicates = "%s/replicates" %storage # these are replicates from singularity hub
 # had to change runscripts to commit
-replication = "%s/quasi_replicates" %storage # these are exact replicates, from same
+replication = "%s/quasi_replicates" %storage # these are replicates produced on same host
 hub = "%s/collections" %storage

 # Create all folders for images
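The context line above cuts off right at the folder-creation step, which the diff does not show. For orientation, a minimal sketch of what that step presumably does, using only the paths defined in this hunk (the actual implementation may differ):

    import os

    # create each image set folder if it does not already exist
    for folder in [storage, clones, replicates, replication, hub]:
        if not os.path.exists(folder):
            os.makedirs(folder)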
@@ -97,7 +97,7 @@ def get_top_os(x):
 exec "Hello World!"
 '''

-os.chdir(replication)
+os.chdir(replicates)
 with open('Singularity','w') as filey:
     filey.writelines(runscript)
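Only the tail of the runscript string is visible in the context lines. Since it is written out as the file Singularity and then bootstrapped below, it is plausibly a full Singularity build spec ending in a %runscript section; a hypothetical reconstruction, with the Bootstrap/From header being a guess:

    # hypothetical - only the last two lines appear in the diff
    runscript = '''Bootstrap: docker
    From: ubuntu:latest

    %runscript
    exec "Hello World!"
    '''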

@@ -109,8 +109,6 @@ def get_top_os(x):
 cli.create(container_name)
 cli.bootstrap(container_name,'Singularity')

-container_uri = '%s-%s' %(container_name,manifest['version'])
-containers[container_uri] = manifest


 # ALL SINGULARITY HUB
@@ -119,9 +117,14 @@ def get_top_os(x):
 for container_name,container in containers.items():
     for branch, manifest in container.items():
         name = manifest['name'].replace('/','-')
-        image = shub.pull_container(manifest,
-                                    download_folder=hub,
-                                    name="%s-%s.img.gz" %(name,branch))
+        uncompressed = "%s-%s.img" %(name,branch)
+        if not os.path.exists(uncompressed):
+            try:
+                image = shub.pull_container(manifest,
+                                            download_folder=hub,
+                                            name="%s-%s.img.gz" %(name,branch))
+            except:
+                print("error downloading %s" %name)

 pickle.dump(containers,open('%s/container_manifests.pkl' %(hub),'wb'))
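One step is implied but not shown in this hunk: the new guard checks for the uncompressed name-branch.img, while pull_container saves a .img.gz. Presumably the archive is decompressed elsewhere; a sketch of that step with the standard library, assuming plain gzip compression and running inside the loop above:

    import gzip
    import shutil

    # stream-decompress name-branch.img.gz to the .img path the guard checks for
    compressed = "%s-%s.img.gz" %(name,branch)
    with gzip.open(compressed, 'rb') as gz, open(uncompressed, 'wb') as img:
        shutil.copyfileobj(gz, img)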

@@ -130,56 +133,110 @@ def get_top_os(x):
 # Task 2: Develop levels of reproducibility
 #############################################################################

+from singularity.utils import write_json, write_file
 from singularity.reproduce import (
     assess_differences,
-    get_content_hashes,
-    get_image_hash,
     get_levels
 )

 levels = get_levels()
-result_file = '%s/results-%s.pkl' %(base,container_name.replace('/','-'))
-results = pickle.load(open(result_file,'rb'))


 # Let's assess what files are identical across pairs of images in different sets

-# Quasi Replicate: meaning same base os, different build host, slightly different runscript
+# Singularity Hub (replicate): meaning same base os, different build host
+# These should be equal for base, environment, runscript, replicate, but
+# not identical.
 os.chdir(replication)
 image_files = glob('*.img')
 diffs = assess_differences(image_files[0],image_files[1],levels=levels)
-pickle.dump(diffs,open('%s/diff_quasi_replicate_pair.pkl' %base,'wb'))
+print("SINGULARITY HUB REPLICATES")
+print(diffs)
+write_json(diffs,'%s/diff_hub_replicates_pair.json' %base)
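For orientation: judging from how the result is used later in this commit (report['scores']['REPLICATE']), assess_differences appears to return a dict with at least a 'scores' key mapping level names to similarity scores. A usage sketch, with illustrative values only:

    report = assess_differences(image_files[0], image_files[1], levels=levels)
    print(report['scores'])   # e.g. {'REPLICATE': 0.98, ...} -- illustrative, not real output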

-# Replicate: if we produce an equivalent image at a different time, we might have
-# variance in package directories (anything involving variable with mirrors, etc)
+# Local Replicate: if we produce an equivalent image at a different time, we might have
+# variance in package directories (anything involving variable with mirrors, etc)
+# these images should also be assessed as equivalent on the level of runscript,
+# environment, base, replicate, labels, but not identical. They should be MORE
+# identical than the Singularity Hub replicate by way of being produced on the same host.

 os.chdir(replicates)
 image_files = glob('*.img')
 diffs = assess_differences(image_files[0],image_files[1],levels=levels)
-pickle.dump(diffs,open('%s/diff_replicate_pair.pkl' %base,'wb'))
+print("LOCAL REPLICATES")
+print(diffs)
+write_json(diffs,'%s/diff_local_replicates_pair.json' %base)

 # Identical: all files are the same

 os.chdir(clones)
 image_files = glob('*.img')
 diffs = assess_differences(image_files[0],image_files[1],levels=levels)
-pickle.dump(diffs,open('%s/diff_clone_pair.pkl' %base,'wb'))
+print("CLONES")
+print(diffs)
+write_json(diffs,'%s/diff_clone_pair.json' %base)

-# Different images, same OS
+
+# Singularity Hub
+# This is the real world use case, because these are real images on Singularity Hub
+# Let's compare each image on the level of REPLICATE
+os.chdir(hub)
+image_files = glob('*.img')
+
+# len(image_files)
+# 79
+
+total = len(image_files)*len(image_files)
+counter = 1
+diffs = pandas.DataFrame(0,columns=image_files,index=image_files)
+diff_files = dict()
+replicate_level = {'REPLICATE':levels['REPLICATE']}
+
+for image_file1 in image_files:
+    for image_file2 in image_files:
+        print("%s of %s" %(counter,total))
+        diff_id = [image_file1,image_file2]
+        diff_id.sort()
+        diff_id = '-'.join(diff_id)
+        if diff_id not in diff_files:
+            report = assess_differences(image_file1,image_file2,levels=replicate_level)
+            diffs.loc[image_file1,image_file2] = report['scores']['REPLICATE']
+            diffs.loc[image_file2,image_file1] = report['scores']['REPLICATE']
+            print(diff_id)
+            print(report['scores'])
+            diff_files[diff_id] = report
+        counter+=1
+
+pickle.dump(diffs,open('%s/replicate_hubdiffs_dfs.pkl' %base,'wb'))
+
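A possible simplification, not what this commit does: the comparison is symmetric and diff_id already deduplicates sorted pairs, so each unordered pair could be visited once with itertools.combinations, roughly halving the assess_differences calls. A sketch using the same names as above:

    from itertools import combinations

    # each distinct pair once; mirror the score into both cells
    for image_file1, image_file2 in combinations(image_files, 2):
        report = assess_differences(image_file1, image_file2, levels=replicate_level)
        score = report['scores']['REPLICATE']
        diffs.loc[image_file1, image_file2] = score
        diffs.loc[image_file2, image_file1] = score

    # diagonal: each image compared against itself
    for image_file in image_files:
        report = assess_differences(image_file, image_file, levels=replicate_level)
        diffs.loc[image_file, image_file] = report['scores']['REPLICATE']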
+from singularity.views.trees import make_package_tree, make_interactive_tree  # make_interactive_tree assumed importable here; it is used below but was not imported
+#labels = ['-'.join(x.replace('.img','').replace('-','/',1).split('-')[:-1]) for x in diffs.index.tolist()]
+labels = ['-'.join(x.split('-')[1:-1]) for x in diffs.index.tolist()]
+fig = make_package_tree(matrix=diffs,labels=labels,title="Singularity Hub Replication Scores")
+fig.savefig('%s/replicate_hubdiffs_dfs.png' %base)
+
+# Interactive tree
+tree = make_interactive_tree(matrix=diffs,labels=labels)

 #############################################################################
 # Task 3: Assess levels of reproducibility
 #############################################################################

 # The first thing we want to do is evaluate our metrics for reproducibility.
 dfs = dict()
+levels = get_levels()
+
+from singularity.reproduce import (
+    get_content_hashes,
+    get_image_hashes,
+    get_image_hash
+)
+

 # ASSESS IDENTICAL IMAGES ACROSS ALL LEVELS

 os.chdir(clones)
 image_files = glob("*.img")
-levels = get_levels(version=2.2)
-
 hashes = pandas.DataFrame(columns=list(levels.keys()))

 for image_file in image_files:
@@ -188,22 +245,27 @@ def get_top_os(x):
     hashes.loc[image_file,:] = hashy


+# HERE
 dfs['IDENTICAL'] = hashes
 for col in hashes.columns.tolist():
     print("%s: %s" %(col,hashes[col].unique().tolist()))

-# IDENTICAL: ['364715054c17c29338787bd231e58d90caff154b']
-# RUNSCRIPT: ['da39a3ee5e6b4b0d3255bfef95601890afd80709']
-# ENVIRONMENT: ['22ff3c5c5fa63d3f08a48669d90fcb1459e6e74b']
-# RECIPE: ['0e0efcb05fb4727f77b999d135c8a58a8ce468d5']

+# REPLICATE: ['2776174919187e7007619ac74f082b90']
+# ENVIRONMENT: ['2060c7583adf2545494bf76113f5d594']
+# BASE: ['345d1d687fd0bed73528969d82dd5aa4']
+# RUNSCRIPT: ['272844f479bfd9f83e7caf27e40146ea']
+# IDENTICAL: ['8a2f03e6d846a1979b694b28c125a852']
+# LABELS: ['d41d8cd98f00b204e9800998ecf8427e']
+# RECIPE: ['89b5f94c70b261b463c914a4fbe628c5']
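Two things stand out in these hashes, though neither is stated in the commit: the removed values are 40-hex sha1 digests while the new ones are 32-hex md5 digests, so the hashing scheme evidently changed here; and the new LABELS value is the md5 of an empty string, which is what you would expect for images that define no labels. A quick check:

    import hashlib

    print(hashlib.md5(b"").hexdigest())    # d41d8cd98f00b204e9800998ecf8427e (the LABELS value)
    print(hashlib.sha1(b"").hexdigest())   # da39a3ee5e6b4b0d3255bfef95601890afd80709 (the removed RUNSCRIPT value)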

-# Question 2: What files are consistent across the same image, different builds?
-# An image that is a replicate should be assessed as identical using the "REPLICATE"
-# criteria, but not identical

-# RECIPES
+# SINGULARITY HUB "REPLICATES"

+# These images, if compared pairwise, would be assessed as equivalent on all
+# levels except for identical. This example will show differences on level
+# of replicate and base, and this shows that these levels should not
+# be calculated in advance.
 os.chdir(replication)
 image_files = glob('*.img')
 hashes = pandas.DataFrame(columns=list(levels.keys()))
@@ -213,79 +275,68 @@ def get_top_os(x):
     hashy = get_image_hashes(image_file,levels=levels)
     hashes.loc[image_file,:] = hashy

+# REPLICATE: 101
+# ENVIRONMENT: 1
+# BASE: 101
+# RUNSCRIPT: 2
+# IDENTICAL: 101
+# LABELS: 1
+# RECIPE: 85

-dfs['RECIPES'] = hashes
+# The above confirms our prediction - the levels (hashes alone) should not be used
+# to assess an image beyond environment, labels, and runscript. Since these images were
+# produced by trivially changing the runscript, we also see that reflected in this result.
+
+dfs['QUASI_REPLICATE'] = hashes
 for col in hashes.columns.tolist():
     print("%s: %s" %(col,len(hashes[col].unique().tolist())))



-# QUASI REPLICATES
-# These have the same base, but different metadata folders.
+# REPLICATES
+# These were built from the same spec file, same host, but different times
+# Again, we will see differences on most levels.

 os.chdir(replicates)
 image_files = glob("*.img")
-levels = get_levels(version=2.2)
-
 hashes = pandas.DataFrame(columns=list(levels.keys()))

 for image_file in image_files:
     print('Processing %s' %(image_file))
     hashy = get_image_hashes(image_file,levels=levels)
     hashes.loc[image_file,:] = hashy

-dfs['QUASI_REPLICATE'] = hashes
-for col in hashes.columns.tolist():
-    print("%s: %s" %(col,len(hashes[col].unique().tolist())))
-
-
-
-pickle.dump(dfs,open('reproducibility_dfs.pkl','wb'))
-
-
-# Let's assess what files are identical across the images. We can use this to develop
-# our subsequent levels.
-# Here we will use the 100 files in the folder, and find files/folders consistent across
-# we will not include the runscript, since we know this was changed.
+# REPLICATE: 100
+# ENVIRONMENT: 100
+# BASE: 100
+# RUNSCRIPT: 1
+# IDENTICAL: 100
+# LABELS: 1
+# RECIPE: 100


-
-def generate_replication_df(level_name,image_files,version,skip_files=None):
-
-    print("CALCULATING COMPARISONS FOR LEVEL %s" %level_name)
-    df = pandas.DataFrame(0,index=image_files,columns=image_files)
-    for image_file1 in image_files:
-        for image_file2 in image_files:
-            hash1 = get_image_hash(image_file1,level=level_name,version=version)
-            hash2 = get_image_hash(image_file2,level=level_name,version=version)
-            if hash1 == hash2:
-                df.loc[image_file1,image_file2] = 1
-                df.loc[image_file2,image_file1] = 1
-    return df
+dfs['REPLICATES'] = hashes
+for col in hashes.columns.tolist():
+    print("%s: %s" %(col,len(hashes[col].unique().tolist())))


-dfs['IDENTICAL'] = generate_replication_df('IDENTICAL',image_files,version=2.2)
-dfs['REPLICATE'] = generate_replication_df('REPLICATE',image_files,version=2.2, skip_files=['/singularity'])

+# Singularity Hub
+# Are there any files that are identical across all images?
+# Can we assess the level of reproducibility of each path?

-# Outputs:
-# A function that exports, reads tarfile into memory (or disk?) and generates a list of
-# key (file) and value (sha1 sum)
-
-0) I'll first experiment with different patterns of files/folders and figure out which are consistent across images. I'll probably do this by doing a content hash of all individual files, and then finding the set that is consistent across 1) the same exact image, and 2) different images but same builds, and 3) different images different builds. We could even give each some kind of score to determine the right set it belongs in.
-1) at the highest level of reproduciblity (eg same file) we get equivalent hashes - to do this I'll just download exactly the same image
-2) at a "working" (aka, reasonable to use) level of reproducibility, we should get equivalent hashes given the same build, but different files (eg, I built my thing twice from the same spec)
-3) at the lowest level of reproducibility (eg, base operating system) we should see some identicalness if the operating systems base are largely the same.
-
-We can then allow the user to use our functions, and go a bit deeper into image comparison and asses, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all singularity hub public images and assess the total level of redundancy!
+os.chdir(hub)
+image_files = glob("*.img")
+hashes = pandas.DataFrame(columns=list(levels.keys()))

+for image_file in image_files:
+    print('Processing %s' %(image_file))
+    hashy = get_image_hashes(image_file,levels=levels)
+    hashes.loc[image_file,:] = hashy

+dfs['HUB_COLLECTIONS'] = hashes
+for col in hashes.columns.tolist():
+    print("%s: %s" %(col,len(hashes[col].unique().tolist())))

-from glob import glob
-image_files=glob('*.img')
-sums = []
-for image_file in image_files:
-    os.system('sudo singularity export %s > tmp.tar' %(image_file))
-    summy = tarsum('tmp.tar')
-    print(summy)
-    sums.append(summy)

+pickle.dump(dfs,open('%s/reproducibility_dfs.pkl' %base,'wb'))
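A short usage sketch for the artifact this script leaves behind (assuming the same base path as above):

    import pickle

    dfs = pickle.load(open('%s/reproducibility_dfs.pkl' %base, 'rb'))
    for set_name, frame in dfs.items():
        print(set_name, frame.shape)   # one hashes DataFrame per image set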
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+namechildren

0 commit comments