# Compare Singularity Hub containers

# This is a simple script that uses the singularity command line tool to obtain
# manifests and compare build specs (using the Singularity Hub API)

container_name = 'vsoch/singularity-hello-world'

from singularity.hub.client import Client
from singularity.package import get_image_hash  # used to time a full image hash below
from glob import glob

import pickle
import os
import time
import pandas

shub = Client() # Singularity Hub Client

# Let's keep images in a local storage folder
base = "/home/vanessa/Documents/Work/singularity/hub"
storage = "%s/containers" % base
if not os.path.exists(storage):
    os.mkdir(storage)
os.chdir(storage)

# We will keep a table of information
columns = ['name','build_time_seconds','size','commit','estimated_os']
df = pandas.DataFrame(columns=columns)
containers = dict()
results = dict()

def get_top_os(x):
    # Return the OS name with the highest similarity score
    return sorted(x.items(), key=lambda item: (item[1], item[0]), reverse=True)[0][0]
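# For example (hypothetical scores): get_top_os({'ubuntu:16.04': 0.97, 'centos:7': 0.6})
# returns 'ubuntu:16.04'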

#############################################################################
# Task 1: Download the containers and metadata! (different images)
#############################################################################

# Retrieve the container collection based on the name
collection = shub.get_collection(container_name)
results['repo_name'] = container_name
results['collection'] = collection
container_ids = collection['container_set']
cids = []
for container_id in container_ids:
    try:
        cids.append(container_id)
        manifest = shub.get_container(container_id)
        container_uri = '%s-%s' % (container_name, manifest['version'])
        containers[container_uri] = manifest
        image = shub.pull_container(manifest,
                                    download_folder=storage,
                                    name="%s.img.gz" % (manifest['version']))
        metrics = shub.load_metrics(manifest)
        top_os = get_top_os(metrics['os_sims'])
        entry = [container_name,
                 metrics['build_time_seconds'],
                 metrics['size'],
                 manifest['version'],
                 top_os]
        df.loc[container_uri] = entry
    except Exception:
        # Skip containers that fail to download or are missing metrics
        pass

results['containers'] = containers
results['df'] = df
pickle.dump(results, open('%s/results.pkl' % storage, 'wb'))


#############################################################################
# Task 2: Develop levels of reproducibility
#############################################################################

from singularity.reproduce import (
    get_content_hashes,
    get_levels
)

levels = get_levels()
results = pickle.load(open('%s/results.pkl' % storage, 'rb'))

os.chdir(storage)
image_files = glob("*.img")

# Question 1: What files are consistent across the same image?
# LEVEL IDENTICAL
# Here we will download the same image 10 times, create a sha1 sum of its files,
# and determine which sets of files should be consistent for the same image file
# (see the sketch below). Container ids noted for this test: 5665, 5673
| 91 | + |
| 92 | + |
| 93 | +# Question 2: What files are consistent across the same image, different downloads? |
| 94 | +# LEVEL REPLICATE |
| 95 | +# Here we will use the 100 files in the folder, and find files/folders consistent across |
| 96 | +# we will not include the runscript, since we know this was changed. |
| 97 | +identical_across = get_content_hashes(image_files[0],level='IDENTICAL') |
| 98 | +image_files.pop(0) |
| 99 | +not_identical = [] |

for image_file in image_files:
    hashes = get_content_hashes(image_file, level='IDENTICAL')
    for hash_path, hash_val in hashes.items():
        if hash_path in identical_across:
            if identical_across[hash_path] != hash_val:
                del identical_across[hash_path]
                not_identical.append(hash_path)
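
# At this point identical_across holds the path -> sha1 entries that agreed across
# every image, and not_identical the paths that differed:
print('%s paths consistent, %s paths differ' % (len(identical_across),
                                                len(set(not_identical))))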

# Time how long a full image hash takes to generate
start = time.time()
hashy = get_image_hash(image_file)
end = time.time()
print('get_image_hash: %.2f seconds' % (end - start))

# Question 3: What files are consistent between the same operating systems?
# LEVEL BASE
# A base similarity means the bases of the images (the OS) are likely the same
# (see the sketch below)
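
# A sketch for this question, assuming 'BASE' is one of the levels returned by
# get_levels() and is accepted by get_content_hashes like 'IDENTICAL' above, and
# that two images built from the same OS are on disk (ubuntu_a.img and
# ubuntu_b.img are hypothetical file names):
shared_base = consistent_hashes(['ubuntu_a.img', 'ubuntu_b.img'], level='BASE')
print('%s files shared at the BASE level' % len(shared_base))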


# Outputs:
# A function that exports an image, reads the tarfile into memory (or onto disk?),
# and generates a mapping of key (file) to value (sha1 sum) -- see the tarsum sketch below.
#
# 0) I'll first experiment with different patterns of files/folders and figure out which
#    are consistent across images. I'll probably do this by taking a content hash of all
#    individual files, and then finding the set that is consistent across 1) the same
#    exact image, 2) different images but the same build, and 3) different images,
#    different builds. We could even give each some kind of score to determine the right
#    set it belongs in.
# 1) at the highest level of reproducibility (eg, the same file) we get equivalent
#    hashes - to do this I'll just download exactly the same image
# 2) at a "working" (aka, reasonable to use) level of reproducibility, we should get
#    equivalent hashes given the same build, but different files (eg, I built my thing
#    twice from the same spec)
# 3) at the lowest level of reproducibility (eg, base operating system) we should see
#    some overlap if the bases of the operating systems are largely the same.
#
# We can then allow the user to use our functions, and go a bit deeper into image
# comparison to assess, given equal file paths, which are actually equal in content
# across two images. The user could even save a definition of "how they are assessing
# reproducibility" of an image by way of a list of regular expressions, and a hash for
# their image generated from it. I think it would be interesting, given this algorithm,
# to parse all Singularity Hub public images and assess the total level of redundancy!
#
# Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.
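
# The loop below calls a tarsum() helper that isn't defined anywhere in this script.
# Here is a minimal sketch of what it could look like: a single sha1 digest over the
# file contents of the exported tar, visited in sorted name order so that the same
# content always yields the same sum. (The per-file variant described under Outputs
# above would return a dict of name -> sha1 instead of one digest.)
import hashlib
import tarfile

def tarsum(tar_path):
    sha1 = hashlib.sha1()
    with tarfile.open(tar_path, 'r') as tar:
        for member in sorted(tar.getmembers(), key=lambda m: m.name):
            if member.isfile():
                sha1.update(tar.extractfile(member).read())
    return sha1.hexdigest()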

image_files = glob('*.img')
sums = []
for image_file in image_files:
    # Export the image to a tarfile and take a checksum of its contents
    os.system('sudo singularity export %s > tmp.tar' % (image_file))
    summy = tarsum('tmp.tar')
    print(summy)
    sums.append(summy)