# Let's keep images in a temporary folder
base = "/home/vanessa/Documents/Work/singularity/hub"
storage = "%s/containers" % base
-if not os.path.exists(storage):
-    os.mkdir(storage)
-os.chdir(storage)
+clones = "%s/clones" % storage          # same image downloaded multiple times
+replicates = "%s/replicates" % storage  # these are quasi replicates
+                                        # (had to change runscripts to commit)
+replication = "%s/quasi_replicates" % storage  # these are exact replicates, from the same recipe
+hub = "%s/collections" % storage
+
+# Create all folders for images
+paths = [storage, replicates, clones, replication, hub]
+for pathy in paths:
+    if not os.path.exists(pathy):
+        os.mkdir(pathy)

# We will keep a table of information
columns = ['name', 'build_time_seconds', 'size', 'commit', 'estimated_os']
@@ -32,10 +40,13 @@ def get_top_os(x):
    return sorted(x.items(), key=lambda x: (x[1], x[0]), reverse=True)[0][0]

#############################################################################
-# Task 1: Download the containers and metadata! (different images)
+# Task 1: Get Containers
#############################################################################

+# SINGULARITY HUB HAS QUASI REPLICATES, complete metadata
+
# Retrieve the container based on the name
+os.chdir(replicates)
collection = shub.get_collection(container_name)
results['repo_name'] = container_name
results['collection'] = collection
@@ -65,91 +76,179 @@ def get_top_os(x):
pickle.dump(results, open(result_file, 'wb'))


+# IDENTICAL
+
+os.chdir(clones)
+chosen_one = results['df'].index[10]
+manifest = results['containers'][chosen_one]
+for num in range(0, 100):
+    clone_name = "%s-%s" % (manifest['name'].replace('/', '-'), num)
+    image = shub.pull_container(manifest,
+                                download_folder=clones,
+                                name="%s.img.gz" % clone_name)
+
+
+# EXACT REPLICATES
+
+runscript = '''Bootstrap: docker
+From: ubuntu:latest
+
+%runscript
+exec echo "Hello World!"
+'''
+
+os.chdir(replication)
+with open('Singularity', 'w') as filey:
+    filey.write(runscript)
+
+from singularity.cli import Singularity
+cli = Singularity()
+
+containers = dict()
+for num in range(0, 100):
+    container_name = 'ubuntu-hello-world-%s.img' % num
+    cli.create(container_name)
+    cli.bootstrap(container_name, 'Singularity')
+
+    # keep a manifest entry per replicate (manifest is the clone's, from above)
+    container_uri = '%s-%s' % (container_name, manifest['version'])
+    containers[container_uri] = manifest
+
+
+# ALL SINGULARITY HUB
+containers = shub.get_containers()
+os.chdir(hub)
+for container_name, container in containers.items():
+    for branch, manifest in container.items():
+        name = manifest['name'].replace('/', '-')
+        image = shub.pull_container(manifest,
+                                    download_folder=hub,
+                                    name="%s-%s.img.gz" % (name, branch))
+
+pickle.dump(containers, open('%s/container_manifests.pkl' % hub, 'wb'))
+
+
#############################################################################
# Task 2: Develop levels of reproducibility
#############################################################################

from singularity.reproduce import (
+    assess_differences,
    get_content_hashes,
    get_image_hash,
+    get_image_hashes,
    get_levels
)

-levels = get_levels(version=2.2)
+levels = get_levels()
result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
results = pickle.load(open(result_file, 'rb'))

-os.chdir(storage)
-image_files = glob("*.img")

+# Let's assess what files are identical across pairs of images in different sets

-# Let's assess what files are identical across the images. We can use this to develop
-# our subsequent levels.
-# Here we will use the 100 files in the folder, and find files/folders consistent across them;
-# we will not include the runscript, since we know this was changed.
-identical_across = get_content_hashes(image_files[0], level='IDENTICAL', version=2.2)
-image_files.pop(0)
-not_identical = []
+# Quasi replicate: same base OS, different build host, slightly different runscript
+os.chdir(replication)
+image_files = glob('*.img')
+diffs = assess_differences(image_files[0], image_files[1], levels=levels)
+pickle.dump(diffs, open('%s/diff_quasi_replicate_pair.pkl' % base, 'wb'))

-for image_file in image_files:
-    hashes = get_content_hashes(image_file, level='IDENTICAL', version=2.2)
-    for hash_path, hash_val in hashes.items():
-        if hash_path in identical_across:
-            if not identical_across[hash_path] == hashes[hash_path]:
-                del identical_across[hash_path]
-                not_identical.append(hash_path)
-
-# From the above we learn that all files are identical except for those in:
-
-# ['./.run',
-#  './etc/hosts',
-#  './singularity',
-#  './etc/mtab',
-#  './.exec',
-#  './etc/resolv.conf',
-#  './.shell',
-#  './environment']
-
-# Since we know that the images were produced by way of changing the runscript,
-# and this influences the singularity metadata folders, we can conclude that we would
-# see differences for REPLICATE in /etc/hosts, /etc/mtab, and /etc/resolv.conf.
-
-# Identical: logically, if we compare an image to itself, all files are the same
# Replicate: if we produce an equivalent image at a different time, we might have
# variance in package directories (anything involving mirrors or other variable sources)
-# Environment/Runscript/Labels: these are logical to compare; we compare the hash of
-# just a few specific files in the image

+os.chdir(replicates)
+image_files = glob('*.img')
+diffs = assess_differences(image_files[0], image_files[1], levels=levels)
+pickle.dump(diffs, open('%s/diff_replicate_pair.pkl' % base, 'wb'))
+
+# Identical: all files are the same
+
+os.chdir(clones)
+image_files = glob('*.img')
+diffs = assess_differences(image_files[0], image_files[1], levels=levels)
+pickle.dump(diffs, open('%s/diff_clone_pair.pkl' % base, 'wb'))
+
+# Different images, same OS
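+
+# Aside: a "level" boils down to a rule for which file paths count toward an
+# image's hash. The helper below is an illustrative sketch of that idea only
+# (a hypothetical function, not the singularity.reproduce implementation):
+# hash the sorted (path, sha1) pairs, skipping paths that match a skip pattern.
+import re
+import hashlib
+
+def sketch_level_hash(content_hashes, skip_pattern=None):
+    '''content_hashes: dict of {file_path: sha1}, e.g. from get_content_hashes'''
+    digest = hashlib.sha1()
+    for path in sorted(content_hashes):
+        if skip_pattern is not None and re.search(skip_pattern, path):
+            continue  # e.g. skip '/etc/mtab|/etc/hosts' for a REPLICATE level
+        digest.update(content_hashes[path].encode('utf-8'))
+    return digest.hexdigest()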

#############################################################################
# Task 3: Assess levels of reproducibility
#############################################################################

# The first thing we want to do is evaluate our metrics for reproducibility.
+dfs = dict()

-# Question 1: What files are consistent across the same image?
-# LEVEL IDENTICAL
-# Here we will download the same image 10 times, create a sha1 sum of the files,
-# and determine which sets of files should be consistent for the same image file
+# ASSESS IDENTICAL IMAGES ACROSS ALL LEVELS
+
+os.chdir(clones)
+image_files = glob("*.img")
+levels = get_levels(version=2.2)

+hashes = pandas.DataFrame(columns=list(levels.keys()))

-# Question 2: What files are consistent across the same image, different downloads?
-# LEVEL REPLICATE
+for image_file in image_files:
+    print('Processing %s' % image_file)
+    hashy = get_image_hashes(image_file, levels=levels)
+    hashes.loc[image_file, :] = hashy
+
+
+dfs['IDENTICAL'] = hashes
+for col in hashes.columns.tolist():
+    print("%s: %s" % (col, hashes[col].unique().tolist()))
+
+# IDENTICAL:   ['364715054c17c29338787bd231e58d90caff154b']
+# RUNSCRIPT:   ['da39a3ee5e6b4b0d3255bfef95601890afd80709']
+# ENVIRONMENT: ['22ff3c5c5fa63d3f08a48669d90fcb1459e6e74b']
+# RECIPE:      ['0e0efcb05fb4727f77b999d135c8a58a8ce468d5']
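+
+# Quick sanity check of the claim above (an illustrative one-liner using
+# pandas' nunique): every level should collapse to exactly one hash here.
+assert all(hashes[col].nunique() == 1 for col in hashes.columns)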
+
+
+# Question 2: What files are consistent across the same image, different builds?
# An image that is a replicate should be assessed as identical using the "REPLICATE"
-# criteria.
+# criteria, but not using the "IDENTICAL" criteria
+
+# RECIPES
+
+os.chdir(replication)
+image_files = glob('*.img')
+hashes = pandas.DataFrame(columns=list(levels.keys()))
+
+for image_file in image_files:
+    print('Processing %s' % image_file)
+    hashy = get_image_hashes(image_file, levels=levels)
+    hashes.loc[image_file, :] = hashy
+
+
+dfs['RECIPES'] = hashes
+for col in hashes.columns.tolist():
+    print("%s: %s" % (col, len(hashes[col].unique().tolist())))
+
+
+# QUASI REPLICATES
+# These have the same base, but different metadata folders.

+os.chdir(replicates)
image_files = glob("*.img")
+levels = get_levels(version=2.2)
+
+hashes = pandas.DataFrame(columns=list(levels.keys()))
+
+for image_file in image_files:
+    print('Processing %s' % image_file)
+    hashy = get_image_hashes(image_file, levels=levels)
+    hashes.loc[image_file, :] = hashy
+
+dfs['QUASI_REPLICATE'] = hashes
+for col in hashes.columns.tolist():
+    print("%s: %s" % (col, len(hashes[col].unique().tolist())))
+
+
+pickle.dump(dfs, open('reproducibility_dfs.pkl', 'wb'))
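+
+# A compact way to compare the three sets side by side (illustrative sketch):
+# one row per set, one column per level, counting distinct hashes per level.
+summary = pandas.DataFrame({name: df.nunique() for name, df in dfs.items()}).T
+print(summary)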
+

# Let's assess what files are identical across the images. We can use this to develop
# our subsequent levels.
# Here we will use the 100 files in the folder, and find files/folders consistent across them;
# we will not include the runscript, since we know this was changed.
-level_names = ['IDENTICAL',
-               'REPLICATE',
-               'RUNSCRIPT']

-dfs = dict()
+

def generate_replication_df(level_name, image_files, version, skip_files=None):

@@ -168,20 +267,6 @@ def generate_replication_df(level_name,image_files,version,skip_files=None):
dfs['IDENTICAL'] = generate_replication_df('IDENTICAL', image_files, version=2.2)
dfs['REPLICATE'] = generate_replication_df('REPLICATE', image_files, version=2.2, skip_files=['/singularity'])

-# Finally, if we compare runscripts only, we should see two container versions
-hashes = []
-
-for image_file in image_files:
-    hashy = get_image_hash(image_file, level="RUNSCRIPT", version=2.2)
-    hashes.append(hashy)
-
-uniques = dict()
-for hashy in hashes:
-    if hashy in uniques:
-        uniques[hashy] += 1
-    else:
-        uniques[hashy] = 1
-

# Outputs:
# A function that exports, reads tarfile into memory (or disk?) and generates a list of
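
# One possible shape of that function (an illustrative sketch, assuming the
# Singularity 2.x "singularity export" command, which writes a tar to stdout):

import subprocess
import tarfile

def sketch_export_hashes(image_path, tar_path='/tmp/image.tar'):
    '''Export image_path to a tarfile and return a list of (member, sha1).'''
    with open(tar_path, 'wb') as tar_file:
        subprocess.check_call(['singularity', 'export', image_path],
                              stdout=tar_file)
    member_hashes = []
    with tarfile.open(tar_path) as tar:
        for member in tar.getmembers():
            if member.isfile():
                content = tar.extractfile(member).read()
                member_hashes.append((member.name,
                                      hashlib.sha1(content).hexdigest()))
    return member_hashes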