Skip to content

Commit 5840398

Browse files
committed
adding reproduce.py
1 parent 1377f6b commit 5840398

File tree

8 files changed

+332
-45
lines changed

8 files changed

+332
-45
lines changed

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
include README.md
22
include LICENSE
3+
recursive-include singularity/hub *
34
recursive-include singularity/templates *
45
recursive-include singularity/static *
56
recursive-include singularity/build *

examples/singularity_hub/compare_builds.py

Lines changed: 116 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,50 +1,140 @@
11
# Compare Singularity Hub containers
22

3-
# This is a simple script to use the singularity command line tool to download containers
4-
# (using Singularity, Section 1) and compare build specs (using Singularity Hub API, Section 2) and to
5-
# compare the containers themselves using singularity python (Section 3)
3+
# This is a simple script to use the singularity command line tool to obtain manifests
4+
# and compare build specs (using Singularity Hub API)
65

7-
container_names = ['vsoch/singularity-hello-world',
8-
'researchapps/quantum_state_diffusion',
9-
'vsoch/pefinder']
6+
container_name = 'vsoch/singularity-hello-world'
107

118
from singularity.hub.client import Client
12-
from singularity.package import get_image_hash
139

14-
import tempfile
10+
import pickle
1511
import os
16-
import demjson
1712
import pandas
1813
import shutil
1914

2015
shub = Client() # Singularity Hub Client
21-
results = dict()
2216

2317
# Let's keep images in a temporary folder
24-
storage = tempfile.mkdtemp()
18+
base = "/home/vanessa/Documents/Work/singularity/hub"
19+
storage = "%s/containers" %base
20+
if not os.path.exists(storage):
21+
os.mkdir(storage)
2522
os.chdir(storage)
2623

2724
# We will keep a table of information
28-
columns = ['name','build_time_seconds','hash','size','commit','estimated_os']
25+
columns = ['name','build_time_seconds','size','commit','estimated_os']
2926
df = pandas.DataFrame(columns=columns)
27+
containers = dict()
28+
results = dict()
3029

31-
for container_name in container_names:
32-
33-
# Retrieve the container based on the name
34-
collection = shub.get_collection(container_name)
35-
container_ids = collection['container_set']
36-
containers = []
37-
for container_id in container_ids:
30+
def get_top_os(x):
31+
return sorted(x.items(), key=lambda x: (x[1],x[0]), reverse=True)[0][0]
32+
33+
#############################################################################
34+
# Task 1: Download the containers and metadata! (different images)
35+
#############################################################################
36+
37+
# Retrieve the container based on the name
38+
collection = shub.get_collection(container_name)
39+
results['repo_name'] = container_name
40+
results['collection'] = collection
41+
container_ids = collection['container_set']
42+
cids = []
43+
for c in range(0,len(container_ids)):
44+
try:
45+
container_id = container_ids[c]
46+
cids.append(container_id)
3847
manifest = shub.get_container(container_id)
39-
containers.append(manifest)
48+
container_uri = '%s-%s' %(container_name,manifest['version'])
49+
containers[container_uri] = manifest
4050
image = shub.pull_container(manifest,
4151
download_folder=storage,
4252
name="%s.img.gz" %(manifest['version']))
43-
# Get hash of file
44-
hashes.append(get_image_hash(image))
45-
df.loc['%s-%s' %(container_name,manifest['version'])]
53+
metrics = shub.load_metrics(manifest)
54+
top_os = get_top_os(metrics['os_sims'])
55+
entry = [container_name,
56+
metrics['build_time_seconds'],
57+
metrics['size'],
58+
manifest['version'],
59+
top_os]
60+
df.loc[container_uri] = entry
61+
except:
62+
pass
63+
64+
results['containers'] = containers
65+
results['df'] = df
66+
pickle.dump(results,open('%s/results.pkl' %storage,'wb'))
67+
68+
69+
#############################################################################
70+
# Task 2: Develop levels of reproducibility
71+
#############################################################################
72+
73+
from singularity.reproduce import (
74+
get_content_hashes,
75+
get_levels
76+
)
77+
78+
levels = get_levels()
79+
results = pickle.load(open('%s/results.pkl' %storage,'rb'))
80+
81+
os.chdir(storage)
82+
image_files = glob("*.img")
83+
84+
# Question 1: What files are consistent across the same image?
85+
# LEVEL IDENTICAL
86+
# Here we will download the same image 10 times, create a sha1 sum of the files,
87+
# and determine which sets of files should be consistent for the same image file
88+
89+
5665
90+
5673
91+
92+
93+
# Question 2: What files are consistent across the same image, different downloads?
94+
# LEVEL REPLICATE
95+
# Here we will use the 100 files in the folder, and find files/folders consistent across
96+
# we will not include the runscript, since we know this was changed.
97+
identical_across = get_content_hashes(image_files[0],level='IDENTICAL')
98+
image_files.pop(0)
99+
not_identical = []
100+
101+
for image_file in image_files:
102+
hashes = get_content_hashes(image_file,level='IDENTICAL')
103+
for hash_path,hash_val in hashes.items():
104+
if hash_path in identical_across:
105+
if not identical_across[hash_path] == hashes[hash_path]:
106+
del identical_across[hash_path]
107+
not_identical.append(hash_path)
108+
109+
110+
start = time.time()
111+
hashy=get_image_hash(image_file)
112+
end = time.time()
113+
114+
# Question 3: What files are consistent between the same operating systems?
115+
# LEVEL BASE
116+
# A base similarity means the base of the images (the OS) are likely the same
117+
118+
119+
# Outputs:
120+
# A function that exports, reads tarfile into memory (or disk?) and generates a list of
121+
# key (file) and value (sha1 sum)
122+
0) I'll first experiment with different patterns of files/folders and figure out which are consistent across images. I'll probably do this by doing a content hash of all individual files, and then finding the set that is consistent across 1) the same exact image, and 2) different images but same builds, and 3) different images different builds. We could even give each some kind of score to determine the right set it belongs in.
123+
1) at the highest level of reproducibility (eg same file) we get equivalent hashes - to do this I'll just download exactly the same image
124+
2) at a "working" (aka, reasonable to use) level of reproducibility, we should get equivalent hashes given the same build, but different files (eg, I built my thing twice from the same spec)
125+
3) at the lowest level of reproducibility (eg, base operating system) we should see some overlap if the base operating systems are largely the same.
126+
127+
We can then allow the user to use our functions, and go a bit deeper into image comparison and assess, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all Singularity Hub public images and assess the total level of redundancy!
128+
129+
Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.
130+
46131

47-
results[container_name] = {'collection':collection,
48-
'containers':containers}
132+
from glob import glob
133+
image_files=glob('*.img')
134+
sums = []
135+
for image_file in image_files:
136+
os.system('sudo singularity export %s > tmp.tar' %(image_file))
137+
summy = tarsum('tmp.tar')
138+
print(summy)
139+
sums.append(summy)
49140

50-
shutil.rmtree(storage)

singularity/cli.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,10 @@ def export(self,image_path,pipe=False,output_file=None,export_format="tar"):
144144
will generate temporary directory.
145145
:param export_format: the export format (only tar currently supported)
146146
'''
147-
sudo = True
147+
sudo = False
148148
cmd = ['singularity','export']
149149

150-
if export_format != "tar":
150+
if export_format is not "tar":
151151
print("Currently only supported export format is tar.")
152152
return None
153153

@@ -166,7 +166,7 @@ def export(self,image_path,pipe=False,output_file=None,export_format="tar"):
166166
return None
167167

168168
# if user has specified output file, move it there, return path
169-
if output_file != None:
169+
if output_file is not None:
170170
shutil.copyfile(tmptar,output_file)
171171
return output_file
172172
else:

singularity/hub/base.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
'''
88

99
from singularity.hub.utils import (
10+
download_stream_atomically,
1011
parse_container_name,
1112
is_number,
1213
api_get,
@@ -61,17 +62,19 @@ def download_image(manifest,download_folder=None,extract=True,name=None):
6162
print("Found image %s:%s" %(manifest['name'],manifest['branch']))
6263
print("Downloading image... %s" %(image_file))
6364

64-
if download_folder != None:
65+
#TODO: add temporary file here
66+
if download_folder is not None:
6567
image_file = "%s/%s" %(download_folder,image_file)
6668
url = manifest['image']
67-
image_file = api_get(url,stream_to=image_file)
69+
image_file = download_stream_atomically(url,file_name=image_file)
6870
if extract == True:
6971
print("Decompressing %s" %image_file)
7072
os.system('gzip -d -f %s' %(image_file))
7173
image_file = image_file.replace('.gz','')
7274
return image_file
7375

7476

77+
7578
# Various Helpers ---------------------------------------------------------------------------------
7679
def get_image_name(manifest,extension='img.gz',use_hash=False):
7780
'''get_image_name will return the image name for a manifest

singularity/hub/utils.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,35 @@ def parse_container_name(image):
166166
'user':user }
167167

168168
return parsed
169+
170+
171+
######################################################################
172+
# Downloading
173+
######################################################################
174+
175+
176+
def download_atomically(url,file_name,headers=None):
177+
'''download atomically will stream to a temporary file, and
178+
rename only upon successful completion. This is to ensure that
179+
errored downloads are not found as complete in the cache
180+
:param file_name: the file name to stream to
181+
:param url: the url to stream from
182+
:param headers: additional headers to add to the get (default None)
183+
'''
184+
try: # file_name.tmp.XXXXXX
185+
fd, tmp_file = tempfile.mkstemp(prefix=("%s.tmp." % file_name))
186+
os.close(fd)
187+
response = api_get(url,headers=headers,stream=tmp_file)
188+
if isinstance(response, HTTPError):
189+
logger.error("Error downloading %s, exiting.", url)
190+
sys.exit(1)
191+
os.rename(tmp_file, file_name)
192+
except:
193+
download_folder = os.path.dirname(os.path.abspath(file_name))
194+
logger.error("Error downloading %s. Do you have permission to write to %s?", url, download_folder)
195+
try:
196+
os.remove(tmp_file)
197+
except:
198+
pass
199+
sys.exit(1)
200+
return file_name

singularity/package.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,19 @@
1515
)
1616

1717
from singularity.cli import Singularity
18+
from singularity.reproduce import (
19+
get_image_hash
20+
)
1821
import tempfile
1922
import tarfile
2023
import hashlib
2124
import zipfile
2225
import shutil
2326
import json
27+
import io
2428
import os
29+
import re
30+
import sys
2531

2632

2733
def estimate_image_size(spec_file,sudopw=None,padding=None):
@@ -208,16 +214,3 @@ def load_package(package_path,get=None):
208214
bot.logger.debug("Unknown extension %s, skipping %s", ext,g)
209215

210216
return retrieved
211-
212-
213-
def get_image_hash(image_path):
214-
'''get_image_hash will return an md5 hash of the file. Since we don't have git commits
215-
this seems like a reasonable option to "version" an image, since we can easily say yay or nay
216-
if the image matches the spec file
217-
:param image_path: full path to the singularity image
218-
'''
219-
hash_md5 = hashlib.md5()
220-
with open(image_path, "rb") as f:
221-
for chunk in iter(lambda: f.read(4096), b""):
222-
hash_md5.update(chunk)
223-
return hash_md5.hexdigest()

0 commit comments

Comments
 (0)