
from singularity.hub.client import Client

- import pickle
+ from glob import glob
import os
import pandas
+ import pickle
import shutil

shub = Client()  # Singularity Hub Client
@@ -60,7 +61,8 @@ def get_top_os(x):

results['containers'] = containers
results['df'] = df
- pickle.dump(results, open('%s/results.pkl' % storage, 'wb'))
+ result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
+ pickle.dump(results, open(result_file, 'wb'))


#############################################################################
@@ -69,45 +71,116 @@ def get_top_os(x):

from singularity.reproduce import (
    get_content_hashes,
+     get_image_hash,
    get_levels
)

- levels = get_levels()
- results = pickle.load(open('%s/results.pkl' % storage, 'rb'))
+ levels = get_levels(version=2.2)
+ result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
+ results = pickle.load(open(result_file, 'rb'))

os.chdir(storage)
image_files = glob("*.img")

- # Question 1: What files are consistent across the same image?
- # LEVEL IDENTICAL
- # Here we will download the same image 10 times, create a sha1 sum of the files,
- # and determine which sets of files should be consistent for the same image file
-

- # Question 2: What files are consistent across the same image, different downloads?
- # LEVEL REPLICATE
+ # Let's assess what files are identical across the images. We can use this to develop
+ # our subsequent levels.
# Here we will use the 100 files in the folder, and find files/folders consistent
# across them. We will not include the runscript, since we know this was changed.
- identical_across = get_content_hashes(image_files[0], level='IDENTICAL')
+ identical_across = get_content_hashes(image_files[0], level='IDENTICAL', version=2.2)
image_files.pop(0)
not_identical = []

for image_file in image_files:
-     hashes = get_content_hashes(image_file, level='IDENTICAL')
+     hashes = get_content_hashes(image_file, level='IDENTICAL', version=2.2)
    for hash_path, hash_val in hashes.items():
        if hash_path in identical_across:
            if identical_across[hash_path] != hashes[hash_path]:
                del identical_across[hash_path]
                not_identical.append(hash_path)

+ # From the above we learn that all files are identical except for those
+ # in:

- start = time.time()
- hashy = get_image_hash(image_file)
- end = time.time()
+ # ['./.run',
+ #  './etc/hosts',
+ #  './singularity',
+ #  './etc/mtab',
+ #  './.exec',
+ #  './etc/resolv.conf',
+ #  './.shell',
+ #  './environment']
+
+ # Since we know that the images were produced by changing the runscript,
+ # and this influences the singularity metadata files, we can conclude that we would
+ # see differences for REPLICATE in /etc/hosts, /etc/mtab, and /etc/resolv.conf.
+
+ # Identical: logically, if we compare an image to itself, all files are the same.
+ # Replicate: if we produce an equivalent image at a different time, we might have
+ # variance in package directories (anything that varies with mirrors, etc.).
+ # Environment/Runscript/Labels: these are logical to compare; we compare the hash of
+ # just a few specific files in the image.
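
To make these levels concrete, here is a minimal sketch of the underlying idea. It assumes the image filesystem has been exported to a tarball, and uses a hypothetical level_hash helper rather than the actual singularity.reproduce API: a level amounts to a set of path filters plus a content hash over the files that survive them.

import hashlib
import re
import tarfile

def level_hash(tar_path, skip_patterns):
    """Hash file contents in an exported image tarball, skipping any
    path that matches one of the given regular expressions."""
    skip = [re.compile(p) for p in skip_patterns]
    digest = hashlib.sha1()
    with tarfile.open(tar_path) as tar:
        # Sort members so the hash does not depend on archive order
        for member in sorted(tar.getmembers(), key=lambda m: m.name):
            if not member.isfile():
                continue
            if any(p.search(member.name) for p in skip):
                continue
            digest.update(tar.extractfile(member).read())
    return digest.hexdigest()

# A REPLICATE-like level would then skip the paths we just found to vary:
replicate_skip = [r'/etc/hosts$', r'/etc/mtab$', r'/etc/resolv\.conf$',
                  r'/\.run$', r'/\.exec$', r'/\.shell$',
                  r'/singularity$', r'/environment$']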
+
+
+ #############################################################################
+ # Task 3: Assess levels of reproducibility
+ #############################################################################
+
+ # The first thing we want to do is evaluate our metrics for reproducibility.
+
+ # Question 1: What files are consistent across the same image?
+ # LEVEL IDENTICAL
+ # Here we will download the same image 10 times, create a sha1 sum of the files,
+ # and determine which sets of files should be consistent for the same image file.
+
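
For reference, the sha1 sum of a single file with nothing but the standard library (this is the generic recipe, not the package's internal implementation):

import hashlib

def file_sha1(path):
    # Stream the file in chunks so large image members do not load at once
    digest = hashlib.sha1()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()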
+
+ # Question 2: What files are consistent across the same image, different downloads?
+ # LEVEL REPLICATE
+ # An image that is a replicate should be assessed as identical using the "REPLICATE"
+ # criteria.
+
+ image_files = glob("*.img")
+
+ # For each level, we compare every pair of images and record which pairs hash
+ # the same; the runscript is the one file we know was changed.
+ level_names = ['IDENTICAL',
+                'REPLICATE',
+                'RUNSCRIPT']
+
+ dfs = dict()
+
+ def generate_replication_df(level_name, image_files, version, skip_files=None):
+     # Build a symmetric matrix: cell (i, j) is 1 when image i and image j
+     # hash the same at this level. Each hash is computed once up front
+     # rather than once per pair. Note that skip_files is accepted but not
+     # yet applied in this version.
+     print("CALCULATING COMPARISONS FOR LEVEL %s" % level_name)
+     hashes = dict()
+     for image_file in image_files:
+         hashes[image_file] = get_image_hash(image_file, level=level_name, version=version)
+     df = pandas.DataFrame(0, index=image_files, columns=image_files)
+     for image_file1 in image_files:
+         for image_file2 in image_files:
+             if hashes[image_file1] == hashes[image_file2]:
+                 df.loc[image_file1, image_file2] = 1
+                 df.loc[image_file2, image_file1] = 1
+     return df
+
+
+ dfs['IDENTICAL'] = generate_replication_df('IDENTICAL', image_files, version=2.2)
+ dfs['REPLICATE'] = generate_replication_df('REPLICATE', image_files, version=2.2, skip_files=['/singularity'])
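
A quick way to read these matrices (a usage note, not part of the commit): the cells are 0/1, so row sums count how many images each image matches at a given level, itself included.

print(dfs['REPLICATE'].sum(axis=1))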
+
+ # Finally, if we compare runscripts only, we should see two container versions
+ hashes = []
+
+ for image_file in image_files:
+     hashy = get_image_hash(image_file, level="RUNSCRIPT", version=2.2)
+     hashes.append(hashy)

- # Question 3: What files are consistent between the same operating systems?
- # LEVEL BASE
- # A base similarity means the base of the images (the OS) is likely the same
+ uniques = dict()
+ for hashy in hashes:
+     if hashy in uniques:
+         uniques[hashy] += 1
+     else:
+         uniques[hashy] = 1
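
The counting loop above is equivalent to the standard library's Counter, if a shorter form is preferred:

from collections import Counter
uniques = Counter(hashes)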


# Outputs:
@@ -120,7 +193,6 @@ def get_top_os(x):

We can then allow the user to use our functions, and go a bit deeper into image comparison to assess, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all Singularity Hub public images and assess the total level of redundancy!
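
As a rough sketch of what that saved definition could look like (reusing the hypothetical level_hash helper sketched earlier, with made-up file names), it might be a named list of regular expressions plus the hash they produce:

import json

# Hypothetical: a user-defined reproducibility level and its resulting hash
definition = {
    'name': 'my-replicate-level',
    'skip_patterns': [r'/etc/hosts$', r'/etc/mtab$', r'/etc/resolv\.conf$'],
}
definition['hash'] = level_hash('container.img.tar', definition['skip_patterns'])

with open('reproducibility-definition.json', 'w') as handle:
    json.dump(definition, handle, indent=2)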

- Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.


from glob import glob