@@ -19,9 +19,9 @@
base = "/home/vanessa/Documents/Work/singularity/hub"
storage = "%s/containers" % base
clones = "%s/clones" % storage # same image downloaded multiple times
- replicates = "%s/replicates" % storage # these are quasi replicates
+ replicates = "%s/replicates" % storage # replicates built locally on the same host
 # had to change runscripts to commit
- replication = "%s/quasi_replicates" % storage # these are exact replicates, from same
+ replication = "%s/quasi_replicates" % storage # replicates built on singularity hub
hub = "%s/collections" % storage

# Create all folders for images
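The folder-creation code itself is unchanged and elided by this diff; given the paths defined above, it presumably amounts to something like the sketch below (the actual elided code may differ).

# Sketch only: ensure each storage folder exists before any images are written
for folder in (storage, clones, replicates, replication, hub):
    if not os.path.exists(folder):
        os.makedirs(folder)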
@@ -97,7 +97,7 @@ def get_top_os(x):
exec "Hello World!"
'''

- os.chdir(replication)
+ os.chdir(replicates)
with open('Singularity', 'w') as filey:
    filey.writelines(runscript)

@@ -109,8 +109,6 @@ def get_top_os(x):
    cli.create(container_name)
    cli.bootstrap(container_name, 'Singularity')

-     container_uri = '%s-%s' % (container_name, manifest['version'])
-     containers[container_uri] = manifest


# ALL SINGULARITY HUB
@@ -119,9 +117,14 @@ def get_top_os(x):
for container_name, container in containers.items():
    for branch, manifest in container.items():
        name = manifest['name'].replace('/', '-')
-         image = shub.pull_container(manifest,
-                                     download_folder=hub,
-                                     name="%s-%s.img.gz" % (name, branch))
+         uncompressed = "%s-%s.img" % (name, branch)
+         if not os.path.exists(uncompressed):
+             try:
+                 image = shub.pull_container(manifest,
+                                             download_folder=hub,
+                                             name="%s-%s.img.gz" % (name, branch))
+             except Exception:
+                 print("error downloading %s" % name)

pickle.dump(containers, open('%s/container_manifests.pkl' % hub, 'wb'))

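The existence check on the uncompressed name implies that the pulled `.img.gz` files get decompressed somewhere after download. A minimal sketch of that step, with a helper name of my own (not a shub function):

import gzip
import shutil

def decompress_image(gz_path):
    # Write <name>.img next to <name>.img.gz so the os.path.exists()
    # check in the loop above can skip the pull on the next run.
    img_path = gz_path.replace('.img.gz', '.img')
    with gzip.open(gz_path, 'rb') as fin, open(img_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    return img_path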
@@ -130,56 +133,110 @@ def get_top_os(x):
# Task 2: Develop levels of reproducibility
#############################################################################

+ from singularity.utils import write_json, write_file
from singularity.reproduce import (
    assess_differences,
-     get_content_hashes,
-     get_image_hash,
    get_levels
)

levels = get_levels()
- result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
- results = pickle.load(open(result_file, 'rb'))


# Let's assess what files are identical across pairs of images in different sets

- # Quasi Replicate: meaning same base os, different build host, slightly different runscript
+ # Singularity Hub (replicate): same base OS, different build host.
+ # These should be assessed as equivalent for base, environment, runscript,
+ # and replicate, but not identical.
os.chdir(replication)
image_files = glob('*.img')
diffs = assess_differences(image_files[0], image_files[1], levels=levels)
- pickle.dump(diffs, open('%s/diff_quasi_replicate_pair.pkl' % base, 'wb'))
+ print("SINGULARITY HUB REPLICATES")
+ print(diffs)
+ write_json(diffs, '%s/diff_hub_replicates_pair.json' % base)

- # Replicate: if we produce an equivalent image at a different time, we might have
- # variance in package directories (anything involving variable with mirrors, etc)
+ # Local Replicate: if we produce an equivalent image at a different time, we might see
+ # variance in package directories (anything that varies with mirrors, timestamps, etc.).
+ # These images should also be assessed as equivalent on the level of runscript,
+ # environment, base, replicate, and labels, but not identical. They should be closer
+ # to identical than the Singularity Hub replicates, having been produced on the same host.

os.chdir(replicates)
image_files = glob('*.img')
diffs = assess_differences(image_files[0], image_files[1], levels=levels)
- pickle.dump(diffs, open('%s/diff_replicate_pair.pkl' % base, 'wb'))
+ print("LOCAL REPLICATES")
+ print(diffs)
+ write_json(diffs, '%s/diff_local_replicates_pair.json' % base)

# Identical: all files are the same

os.chdir(clones)
image_files = glob('*.img')
diffs = assess_differences(image_files[0], image_files[1], levels=levels)
- pickle.dump(diffs, open('%s/diff_clone_pair.pkl' % base, 'wb'))
+ print("CLONES")
+ print(diffs)
+ write_json(diffs, '%s/diff_clone_pair.json' % base)

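For orientation before the pairwise comparisons below: later code reads `report['scores']['REPLICATE']`, so each report from `assess_differences` evidently carries a per-level similarity score alongside the per-level file differences. The dict below is a hypothetical illustration of that shape, not actual output:

# Hypothetical shape of an assess_differences report (inferred from the
# report['scores']['REPLICATE'] lookups below; values are made up):
example_report = {
    'scores': {'REPLICATE': 0.97},  # one similarity score per requested level
    # ... plus per-level details of which files differ (not shown)
}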
- # Different images, same OS
+ 
+ # Singularity Hub
+ # This is the real-world use case: these are real images on Singularity Hub.
+ # Let's compare each pair of images at the level of REPLICATE.
+ os.chdir(hub)
+ image_files = glob('*.img')
+ 
+ # len(image_files)
+ # 79
+ 
+ total = len(image_files) * len(image_files)
+ counter = 1
+ diffs = pandas.DataFrame(0, columns=image_files, index=image_files)
+ diff_files = dict()
+ replicate_level = {'REPLICATE': levels['REPLICATE']}
+ 
+ for image_file1 in image_files:
+     for image_file2 in image_files:
+         print("%s of %s" % (counter, total))
+         diff_id = [image_file1, image_file2]
+         diff_id.sort()
+         diff_id = '-'.join(diff_id)
+         if diff_id not in diff_files:
+             report = assess_differences(image_file1, image_file2, levels=replicate_level)
+             diffs.loc[image_file1, image_file2] = report['scores']['REPLICATE']
+             diffs.loc[image_file2, image_file1] = report['scores']['REPLICATE']
+             print(diff_id)
+             print(report['scores'])
+             diff_files[diff_id] = report
+         counter += 1
+ 
+ pickle.dump(diffs, open('%s/replicate_hubdiffs_dfs.pkl' % base, 'wb'))
+ 
+ from singularity.views.trees import make_package_tree, make_interactive_tree
+ # labels = ['-'.join(x.replace('.img','').replace('-','/',1).split('-')[:-1]) for x in diffs.index.tolist()]
+ labels = ['-'.join(x.split('-')[1:-1]) for x in diffs.index.tolist()]
+ fig = make_package_tree(matrix=diffs, labels=labels, title="Singularity Hub Replication Scores")
+ fig.savefig('%s/replicate_hubdiffs_dfs.png' % base)
+ 
+ # Interactive tree
+ tree = make_interactive_tree(matrix=diffs, labels=labels)
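For readers without singularity-python at hand, a rough sketch of the hierarchical clustering a package tree implies, assuming the scores in `diffs` sit in [0,1] with 1 meaning identical at the REPLICATE level; this is illustrative, not the library's implementation:

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

distance = 1 - diffs.values                     # similarity -> distance
condensed = squareform(distance, checks=False)  # condensed vector for linkage
Z = linkage(condensed, method='average')
plt.figure(figsize=(12, 6))
dendrogram(Z, labels=labels)
plt.title('Hierarchical clustering of REPLICATE scores (sketch)')
plt.savefig('%s/replicate_hubdiffs_dendrogram.png' % base)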

#############################################################################
# Task 3: Assess levels of reproducibility
#############################################################################

# The first thing we want to do is evaluate our metrics for reproducibility.
dfs = dict()
+ levels = get_levels()
+ 
+ from singularity.reproduce import (
+     get_content_hashes,
+     get_image_hashes,
+     get_image_hash
+ )
+ 

# ASSESS IDENTICAL IMAGES ACROSS ALL LEVELS

os.chdir(clones)
image_files = glob("*.img")
- levels = get_levels(version=2.2)
- 
hashes = pandas.DataFrame(columns=list(levels.keys()))

for image_file in image_files:
@@ -188,22 +245,27 @@ def get_top_os(x):
    hashes.loc[image_file, :] = hashy


dfs['IDENTICAL'] = hashes
for col in hashes.columns.tolist():
    print("%s: %s" % (col, hashes[col].unique().tolist()))

- # IDENTICAL: ['364715054c17c29338787bd231e58d90caff154b']
- # RUNSCRIPT: ['da39a3ee5e6b4b0d3255bfef95601890afd80709']
- # ENVIRONMENT: ['22ff3c5c5fa63d3f08a48669d90fcb1459e6e74b']
- # RECIPE: ['0e0efcb05fb4727f77b999d135c8a58a8ce468d5']

+ # REPLICATE: ['2776174919187e7007619ac74f082b90']
+ # ENVIRONMENT: ['2060c7583adf2545494bf76113f5d594']
+ # BASE: ['345d1d687fd0bed73528969d82dd5aa4']
+ # RUNSCRIPT: ['272844f479bfd9f83e7caf27e40146ea']
+ # IDENTICAL: ['8a2f03e6d846a1979b694b28c125a852']
+ # LABELS: ['d41d8cd98f00b204e9800998ecf8427e']
+ # RECIPE: ['89b5f94c70b261b463c914a4fbe628c5']

- # Question 2: What files are consistent across the same image, different builds?
- # An image that is a replicate should be assessed as identical using the "REPLICATE"
- # criteria, but not identical

- # RECIPES
+ # SINGULARITY HUB "REPLICATES"

+ # These images, if compared pairwise, would be assessed as equivalent on all
+ # levels except IDENTICAL. This example also shows differences at the REPLICATE
+ # and BASE levels, which means those hashes should not be calculated in advance.
os.chdir(replication)
image_files = glob('*.img')
hashes = pandas.DataFrame(columns=list(levels.keys()))
@@ -213,79 +275,68 @@ def get_top_os(x):
    hashy = get_image_hashes(image_file, levels=levels)
    hashes.loc[image_file, :] = hashy

+ # REPLICATE: 101
+ # ENVIRONMENT: 1
+ # BASE: 101
+ # RUNSCRIPT: 2
+ # IDENTICAL: 101
+ # LABELS: 1
+ # RECIPE: 85

- dfs['RECIPES'] = hashes
+ # The above confirms our prediction: the level hashes alone should not be used to
+ # assess an image beyond the environment, labels, and runscript levels. Since these
+ # images were produced by trivially changing the runscript, we also see that
+ # reflected in the RUNSCRIPT count.
+ 
+ dfs['QUASI_REPLICATE'] = hashes
for col in hashes.columns.tolist():
    print("%s: %s" % (col, len(hashes[col].unique().tolist())))



- # QUASI REPLICATES
- # These have the same base, but different metadata folders.
+ # REPLICATES
+ # These were built from the same spec file on the same host, but at different times.
+ # Again, we will see differences on most levels.

os.chdir(replicates)
image_files = glob("*.img")
- levels = get_levels(version=2.2)
- 
hashes = pandas.DataFrame(columns=list(levels.keys()))

for image_file in image_files:
    print('Processing %s' % (image_file))
    hashy = get_image_hashes(image_file, levels=levels)
    hashes.loc[image_file, :] = hashy

- dfs['QUASI_REPLICATE'] = hashes
- for col in hashes.columns.tolist():
-     print("%s: %s" % (col, len(hashes[col].unique().tolist())))
- 
- 
- 
- pickle.dump(dfs, open('reproducibility_dfs.pkl', 'wb'))
- 
- 
- # Let's assess what files are identical across the images. We can use this to develop
- # our subsequent levels.
- # Here we will use the 100 files in the folder, and find files/folders consistent across
- # we will not include the runscript, since we know this was changed.
+ # REPLICATE: 100
+ # ENVIRONMENT: 100
+ # BASE: 100
+ # RUNSCRIPT: 1
+ # IDENTICAL: 100
+ # LABELS: 1
+ # RECIPE: 100


- 
- def generate_replication_df(level_name, image_files, version, skip_files=None):
- 
-     print("CALCULATING COMPARISONS FOR LEVEL %s" % level_name)
-     df = pandas.DataFrame(0, index=image_files, columns=image_files)
-     for image_file1 in image_files:
-         for image_file2 in image_files:
-             hash1 = get_image_hash(image_file1, level=level_name, version=version)
-             hash2 = get_image_hash(image_file2, level=level_name, version=version)
-             if hash1 == hash2:
-                 df.loc[image_file1, image_file2] = 1
-                 df.loc[image_file2, image_file1] = 1
-     return df
+ dfs['REPLICATES'] = hashes
+ for col in hashes.columns.tolist():
+     print("%s: %s" % (col, len(hashes[col].unique().tolist())))


- dfs['IDENTICAL'] = generate_replication_df('IDENTICAL', image_files, version=2.2)
- dfs['REPLICATE'] = generate_replication_df('REPLICATE', image_files, version=2.2, skip_files=['/singularity'])

+ # Singularity Hub
+ # Are there any files that are identical across all images?
+ # Can we assess the level of reproducibility of each path?

- # Outputs:
- # A function that exports, reads tarfile into memory (or disk?) and generates a list of
- # key (file) and value (sha1 sum)
- 0) I'll first experiment with different patterns of files/folders and figure out which are consistent across images. I'll probably do this by doing a content hash of all individual files, and then finding the set that is consistent across 1) the same exact image, 2) different images but same builds, and 3) different images, different builds. We could even give each some kind of score to determine the right set it belongs in.
- 1) at the highest level of reproducibility (eg, same file) we get equivalent hashes - to do this I'll just download exactly the same image
- 2) at a "working" (aka, reasonable to use) level of reproducibility, we should get equivalent hashes given the same build, but different files (eg, I built my thing twice from the same spec)
- 3) at the lowest level of reproducibility (eg, base operating system) we should see some identicalness if the operating system bases are largely the same.
- 
- We can then allow the user to use our functions, and go a bit deeper into image comparison and assess, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all singularity hub public images and assess the total level of redundancy!
+ os.chdir(hub)
+ image_files = glob("*.img")
+ hashes = pandas.DataFrame(columns=list(levels.keys()))

+ for image_file in image_files:
+     print('Processing %s' % (image_file))
+     hashy = get_image_hashes(image_file, levels=levels)
+     hashes.loc[image_file, :] = hashy

+ dfs['HUB_COLLECTIONS'] = hashes
+ for col in hashes.columns.tolist():
+     print("%s: %s" % (col, len(hashes[col].unique().tolist())))

- from glob import glob
- image_files = glob('*.img')
- sums = []
- for image_file in image_files:
-     os.system('sudo singularity export %s > tmp.tar' % (image_file))
-     summy = tarsum('tmp.tar')
-     print(summy)
-     sums.append(summy)

+ pickle.dump(dfs, open('%s/reproducibility_dfs.pkl' % base, 'wb'))
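The removed draft above piped `singularity export` through a `tarsum` helper. To answer the per-path question raised earlier (which files are identical across all images?), here is a self-contained sketch of that idea; `tar_content_hashes` is my name for it, not a singularity-python function:

import hashlib
import tarfile

def tar_content_hashes(tar_path):
    """Map each regular file in an exported image tarball to a sha256 digest."""
    hashes = {}
    with tarfile.open(tar_path, 'r') as tar:
        for member in tar.getmembers():
            if member.isfile():
                content = tar.extractfile(member).read()
                hashes[member.name] = hashlib.sha256(content).hexdigest()
    return hashes

# Files whose content matches across two exported images (sketch):
# a = tar_content_hashes('image1.tar')
# b = tar_content_hashes('image2.tar')
# common = {path for path, digest in a.items() if b.get(path) == digest}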