
Commit ba04516

adding pickle with results to repo, along with ipython notebook

1 parent 55cf3b7 commit ba04516

File tree

9 files changed: +643 −89 lines changed


examples/singularity_hub/compare_builds.ipynb

Lines changed: 337 additions & 0 deletions
Large diffs are not rendered by default.

examples/singularity_hub/compare_builds.py

Lines changed: 92 additions & 20 deletions
@@ -5,9 +5,10 @@
 
 from singularity.hub.client import Client
 
-import pickle
+from glob import glob
 import os
 import pandas
+import pickle
 import shutil
 
 shub = Client() # Singularity Hub Client
@@ -60,7 +61,8 @@ def get_top_os(x):
 
 results['containers'] = containers
 results['df'] = df
-pickle.dump(results,open('%s/results.pkl' %storage,'wb'))
+result_file = '%s/results-%s.pkl' %(base,container_name.replace('/','-'))
+pickle.dump(results,open(result_file,'wb'))
 
 
 #############################################################################
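As a side note, the dump above is easiest to get right with a context manager, which closes the file handle even on error. A minimal sketch, assuming `base`, `container_name`, and `results` are defined as in the script (`save_results` is a hypothetical helper, not part of the repo):

import pickle

def save_results(results, base, container_name):
    # one pickle per container collection, e.g. results-vsoch-singularity-hello-world.pkl
    result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
    with open(result_file, 'wb') as handle:  # the context manager closes the file for us
        pickle.dump(results, handle)
    return result_file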
@@ -69,45 +71,116 @@ def get_top_os(x):
 
 from singularity.reproduce import (
     get_content_hashes,
+    get_image_hash,
     get_levels
 )
 
-levels = get_levels()
-results = pickle.load(open('%s/results.pkl' %storage,'rb'))
+levels = get_levels(version=2.2)
+result_file = '%s/results-%s.pkl' %(base,container_name.replace('/','-'))
+results = pickle.load(open(result_file,'rb'))
 
 os.chdir(storage)
 image_files = glob("*.img")
 
-# Question 1: What files are consistent across the same image?
-# LEVEL IDENTICAL
-# Here we will download the same image 10 times, create a sha1 sum of the files,
-# and determine which sets of files should be consistent for the same image file
-
 
-# Question 2: What files are consistent across the same image, different downloads?
-# LEVEL REPLICATE
+# Let's assess what files are identical across the images. We can use this to develop
+# our subsequent levels.
 # Here we will use the 100 files in the folder, and find files/folders consistent across
 # we will not include the runscript, since we know this was changed.
-identical_across = get_content_hashes(image_files[0],level='IDENTICAL')
+identical_across = get_content_hashes(image_files[0],level='IDENTICAL',version=2.2)
 image_files.pop(0)
 not_identical = []
 
 for image_file in image_files:
-    hashes = get_content_hashes(image_file,level='IDENTICAL')
+    hashes = get_content_hashes(image_file,level='IDENTICAL',version=2.2)
     for hash_path,hash_val in hashes.items():
         if hash_path in identical_across:
             if not identical_across[hash_path] == hashes[hash_path]:
                 del identical_across[hash_path]
                 not_identical.append(hash_path)
 
+# From the above we learn that all files are identical except for those
+# in:
 
-start = time.time()
-hashy=get_image_hash(image_file)
-end = time.time()
+#['./.run',
+# './etc/hosts',
+# './singularity',
+# './etc/mtab',
+# './.exec',
+# './etc/resolv.conf',
+# './.shell',
+# './environment']
+
+# Since we know that the images were produced by way of changing the runscript,
+# and this influences the singularity metadata folders, we can conclude that we would
+# see differences for REPLICATE in /etc/hosts and /etc/mtab and /etc/resolv.conf
+
+# Identical: logically, if we compare an image to itself, all files are the same
+# Replicate: if we produce an equivalent image at a different time, we might have
+# variance in package directories (anything involving variable with mirrors, etc)
+# Environment/Runscript/Labels: these are logical to compare, we compare the hash of
+# just a few specific files in the image
+
+
+#############################################################################
+# Task 3: Assess levels of reproducibility
+#############################################################################
+
+# The first thing we want to do is evaluate our metrics for reproducibility.
+
+# Question 1: What files are consistent across the same image?
+# LEVEL IDENTICAL
+# Here we will download the same image 10 times, create a sha1 sum of the files,
+# and determine which sets of files should be consistent for the same image file
+
+
+# Question 2: What files are consistent across the same image, different downloads?
+# LEVEL REPLICATE
+# An image that is a replicate should be assessed as identical using the "REPLICATE"
+# criteria.
+
+image_files = glob("*.img")
+
+# Let's assess what files are identical across the images. We can use this to develop
+# our subsequent levels.
+# Here we will use the 100 files in the folder, and find files/folders consistent across
+# we will not include the runscript, since we know this was changed.
+level_names = ['IDENTICAL',
+               'REPLICATE',
+               'RUNSCRIPT']
+
+dfs = dict()
+
+def generate_replication_df(level_name,image_files,version,skip_files=None):
+
+    print("CALCULATING COMPARISONS FOR LEVEL %s" %level_name)
+    df = pandas.DataFrame(0,index=image_files,columns=image_files)
+    for image_file1 in image_files:
+        for image_file2 in image_files:
+            hash1 = get_image_hash(image_file1,level=level_name,version=version)
+            hash2 = get_image_hash(image_file2,level=level_name,version=version)
+            if hash1 == hash2:
+                df.loc[image_file1,image_file2] = 1
+                df.loc[image_file2,image_file1] = 1
+    return df
+
+
+dfs['IDENTICAL'] = generate_replication_df('IDENTICAL',image_files,version=2.2)
+dfs['REPLICATE'] = generate_replication_df('REPLICATE',image_files,version=2.2, skip_files=['/singularity'])
+
+# Finally, if we compare runscripts only, we should see two container versions
+hashes = []
+
+for image_file in image_files:
+    hashy = get_image_hash(image_file,level="RUNSCRIPT",version=2.2)
+    hashes.append(hashy)
 
-# Question 3: What files are consistent between the same operating systems?
-# LEVEL BASE
-# A base similarity means the base of the images (the OS) are likely the same
+uniques = dict()
+for hashy in hashes:
+    if hashy in uniques:
+        uniques[hashy] +=1
+    else:
+        uniques[hashy] = 1
 
 
 # Outputs:
@@ -120,7 +193,6 @@ def get_top_os(x):
 
 We can then allow the user to use our functions, and go a bit deeper into image comparison and assess, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all singularity hub public images and assess the total level of redundancy!
 
-Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.
 
 
 from glob import glob
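The manual `uniques` dict at the end of this diff can also be written with `collections.Counter`; a minimal sketch, assuming `hashes` is the list of RUNSCRIPT-level hashes computed above:

from collections import Counter

# two distinct runscript hashes are expected, one per container version
uniques = Counter(hashes)
print(uniques.most_common())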
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+# Compare Singularity Hub containers
+
+# This is a simple script to use the singularity command line tool to obtain manifests
+# and compare build specs (using Singularity Hub API)
+
+from singularity.hub.client import Client
+
+from glob import glob
+import os
+import pandas
+import pickle
+import shutil
+
+shub = Client() # Singularity Hub Client
+
+container_names = ['vsoch/singularity-hello-world',
+                   'researchapps/quantum_state_diffusion',
+                   'vsoch/pe-predictive']
+
+# Let's keep images in a temporary folder
+base = "/home/vanessa/Documents/Work/singularity/hub"
+storage = "%s/containers" %base
+if not os.path.exists(storage):
+    os.mkdir(storage)
+os.chdir(storage)
+
+# We will keep a table of information
+columns = ['name','build_time_seconds','size','commit','estimated_os']
+df = pandas.DataFrame(columns=columns)
+results = dict()
+
+def get_top_os(x):
+    return sorted(x.items(), key=lambda x: (x[1],x[0]), reverse=True)[0][0]
+
+#############################################################################
+# Task 1: Download the containers and metadata! (different images)
+#############################################################################
+
+# Retrieve the container based on the name
+for container_name in container_names:
+    result = dict()
+    collection = shub.get_collection(container_name)
+    containers = dict()
+    result['collection'] = collection
+    container_ids = collection['container_set']
+    cids = []
+    for c in range(0,len(container_ids)):
+        container_id = container_ids[c]
+        cids.append(container_id)
+        manifest = shub.get_container(container_id)
+        container_uri = '%s-%s' %(container_name,manifest['version'])
+        containers[container_uri] = manifest
+        image = shub.pull_container(manifest,
+                                    download_folder=storage,
+                                    name="%s.img.gz" %(manifest['version']))
+        metrics = shub.load_metrics(manifest)
+        top_os = get_top_os(metrics['os_sims'])
+        entry = [container_name,
+                 metrics['build_time_seconds'],
+                 metrics['size'],
+                 manifest['version'],
+                 top_os]
+        df.loc[container_uri] = entry
+
+    result['containers'] = containers
+    results[container_name] = result
+
+results['df'] = df
+result_file = '%s/results.pkl' %(base)
+pickle.dump(results,open(result_file,'wb'))
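A quick way to verify the pickle written above is to load it back and inspect the summary table; a minimal sketch, assuming the script has already run and `base` points at the same folder:

import pickle

# results['df'] is the pandas DataFrame built in Task 1
with open('%s/results.pkl' % base, 'rb') as handle:
    results = pickle.load(handle)

df = results['df']
print(df[['build_time_seconds', 'size', 'estimated_os']])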
Binary file not shown (39.1 KB).

singularity/hub/data/reproduce_levels_2-3.json

Lines changed: 10 additions & 10 deletions
@@ -9,9 +9,9 @@
     "regexp": "^/usr|^/bin|^/boot|^/lib64|/proc|^/run|^/dev|^/opt|^/sbin|^/srv|^/sys",
     "skip_files":["/etc/resolv.conf",
                   "/etc/hosts",
-                  "/.singularity/actions/exec",
-                  "/.singularity/actions/run",
-                  "/.singularity/actions/shell",
+                  "/singularity.d/actions/exec",
+                  "/singularity.d/actions/run",
+                  "/singularity.d/actions/shell",
                   "/etc/mtab"]
   },
 
@@ -21,18 +21,18 @@
   },
   "ENVIRONMENT": {
     "description": "only look at the container's environment. This level will only look at the environment files when assessing similarity.",
-    "regexp": "/.singularity/env"
+    "regexp": "/singularity.d/env"
   },
   "LABELS": {
     "description": "only look at the container labels, if they exist (singularity version 2.3)",
-    "include_files": "/.singularity/labels.json"
+    "include_files": "/singularity.d/labels.json"
   },
   "RECIPE": {
     "description": "singularity looks at everything on the level of the Singularity image, meaning the runscript, environment, and labels.",
-    "regexp": "/.singularity/env",
-    "include_files":["/.singularity/actions/exec",
-                     "/.singularity/actions/run",
-                     "/.singularity/actions/shell",
-                     "/.singularity/labels.json"]
+    "regexp": "/singularity.d/env",
+    "include_files":["/singularity.d/actions/exec",
+                     "/singularity.d/actions/run",
+                     "/singularity.d/actions/shell",
+                     "/singularity.d/labels.json"]
   }
 }
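For illustration, here is one plausible way a level definition like the ones above could be applied to decide whether a path gets hashed, assuming a level dict with the `regexp` and `skip_files` keys shown in this file; `passes_level` is a hypothetical helper, not the library's actual logic:

import json
import re

# path is relative to singularity/hub/data in the repo
with open('reproduce_levels_2-3.json', 'r') as handle:
    levels = json.load(handle)

def passes_level(level, path):
    # skip_files lists exact paths to exclude; regexp selects paths to include
    if path in level.get('skip_files', []):
        return False
    regexp = level.get('regexp')
    return bool(regexp and re.search(regexp, path))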

singularity/package.py

Lines changed: 18 additions & 8 deletions
@@ -16,7 +16,10 @@
 
 from singularity.cli import Singularity
 from singularity.reproduce import (
-    get_image_hash
+    get_image_hash,
+    get_image_hashes,
+    get_memory_tar
+
 )
 import tempfile
 import tarfile
@@ -85,10 +88,10 @@ def build_from_spec(spec_file=None,build_dir=None,size=None,sudopw=None,build_fo
     image_path = "%s/image" %(build_dir)
 
     # Run create image and bootstrap with Singularity command line tool.
-    if sudopw != None:
+    if sudopw is not None:
         cli = Singularity(sudopw=sudopw,debug=debug)
     else:
-        cli = Singularity(debug=debug) # This command will ask the user for sudo
+        cli = Singularity(debug=debug)
 
     print("\nCreating and bootstrapping image...")
 
@@ -130,24 +133,29 @@ def package(image_path,spec_path=None,output_folder=None,runscript=True,
         S = Singularity(sudopw=sudopw,debug=verbose)
     else:
         S = Singularity(debug=verbose) # This command will ask the user for sudo
-    tmptar = S.export(image_path=image_path,pipe=False)
-    tar = tarfile.open(tmptar)
+
+    tar = get_memory_tar(image_path)
     members = tar.getmembers()
     image_name = os.path.basename(image_path)
     zip_name = "%s.zip" %(image_name.replace(" ","_"))
+
     # Include the image in the package?
     if remove_image:
         to_package = dict()
     else:
         to_package = {"files":[image_path]}
+
     # If the specfile is provided, it should also be packaged
     if spec_path != None:
         singularity_spec = "".join(read_file(spec_path))
         to_package['Singularity'] = singularity_spec
-    # Package the image with an md5 sum as VERSION
-    version = get_image_hash(image_path)
-    to_package["VERSION"] = version
+
+    # Package the image with a sha1, replication standard, as VERSION
+    hashes = get_image_hashes(image_path)
+    to_package["VERSION"] = hashes['REPLICATE']
+    to_package["HASHES"] = hashes
     # Look for runscript
+
     if runscript == True:
         try:
             runscript_member = tar.getmember("./singularity")
bot.logger.debug("Found runscript.")
158166
except KeyError:
159167
bot.logger.warning("No runscript found")
168+
160169
if software == True:
161170
bot.logger.info("Adding software list to package.")
162171
files = [x.path for x in members if x.isfile()]
163172
folders = [x.path for x in members if x.isdir()]
164173
to_package["files.txt"] = files
165174
to_package["folders.txt"] = folders
175+
166176
# Do zip up here - let's start with basic structures
167177
zipfile = zip_up(to_package,zip_name=zip_name,output_folder=output_folder)
168178
bot.logger.debug("Package created at %s" %(zipfile))

0 commit comments