
from singularity.hub.client import Client

- import pickle
+ from glob import glob
import os
import pandas
+ import pickle
import shutil

shub = Client()  # Singularity Hub Client
@@ -60,7 +61,8 @@ def get_top_os(x):

results['containers'] = containers
results['df'] = df
- pickle.dump(results, open('%s/results.pkl' % storage, 'wb'))
+ result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
+ pickle.dump(results, open(result_file, 'wb'))


#############################################################################
@@ -69,45 +71,116 @@ def get_top_os(x):

from singularity.reproduce import (
    get_content_hashes,
+     get_image_hash,
    get_levels
)

- levels = get_levels()
- results = pickle.load(open('%s/results.pkl' % storage, 'rb'))
+ levels = get_levels(version=2.2)
+ result_file = '%s/results-%s.pkl' % (base, container_name.replace('/', '-'))
+ results = pickle.load(open(result_file, 'rb'))

os.chdir(storage)
image_files = glob("*.img")

- # Question 1: What files are consistent across the same image?
- # LEVEL IDENTICAL
- # Here we will download the same image 10 times, create a sha1 sum of the files,
- # and determine which sets of files should be consistent for the same image file
-

- # Question 2: What files are consistent across the same image, different downloads?
- # LEVEL REPLICATE
+ # Let's assess what files are identical across the images. We can use this to develop
+ # our subsequent levels.
# Here we will use the 100 files in the folder, and find files/folders consistent
# across them. We will not include the runscript, since we know this was changed.
- identical_across = get_content_hashes(image_files[0], level='IDENTICAL')
+ identical_across = get_content_hashes(image_files[0], level='IDENTICAL', version=2.2)
image_files.pop(0)
not_identical = []

for image_file in image_files:
-     hashes = get_content_hashes(image_file, level='IDENTICAL')
+     hashes = get_content_hashes(image_file, level='IDENTICAL', version=2.2)
    for hash_path, hash_val in hashes.items():
        if hash_path in identical_across:
            if identical_across[hash_path] != hashes[hash_path]:
                del identical_across[hash_path]
                not_identical.append(hash_path)

+ # From the above we learn that all files are identical except for those
+ # in:

- start = time.time()
- hashy = get_image_hash(image_file)
- end = time.time()
+ # ['./.run',
+ #  './etc/hosts',
+ #  './singularity',
+ #  './etc/mtab',
+ #  './.exec',
+ #  './etc/resolv.conf',
+ #  './.shell',
+ #  './environment']
+
+ # Since we know that the images were produced by changing the runscript,
+ # and this influences the singularity metadata files, we can conclude that we would
+ # see differences for REPLICATE in /etc/hosts, /etc/mtab, and /etc/resolv.conf.
+
+ # Identical: logically, if we compare an image to itself, all files are the same.
+ # Replicate: if we produce an equivalent image at a different time, we might have
+ # variance in package directories (anything that varies with mirrors, etc.).
+ # Environment/Runscript/Labels: these are logical to compare; we compare the hash of
+ # just a few specific files in the image.
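
To make these levels concrete, here is a minimal sketch of the underlying idea. It assumes the image filesystem has been exported to a tarball, and uses a hypothetical level_hash helper rather than the actual singularity.reproduce API: a level amounts to a set of path filters plus a content hash over the files that survive them.

import hashlib
import re
import tarfile

def level_hash(tar_path, skip_patterns):
    """Hash file contents in an exported image tarball, skipping any
    path that matches one of the given regular expressions."""
    skip = [re.compile(p) for p in skip_patterns]
    digest = hashlib.sha1()
    with tarfile.open(tar_path) as tar:
        # Sort members so the hash does not depend on archive order
        for member in sorted(tar.getmembers(), key=lambda m: m.name):
            if not member.isfile():
                continue
            if any(p.search(member.name) for p in skip):
                continue
            digest.update(tar.extractfile(member).read())
    return digest.hexdigest()

# A REPLICATE-like level would then skip the paths we just found to vary:
replicate_skip = [r'/etc/hosts$', r'/etc/mtab$', r'/etc/resolv\.conf$',
                  r'/\.run$', r'/\.exec$', r'/\.shell$',
                  r'/singularity$', r'/environment$']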
+
+
+ #############################################################################
+ # Task 3: Assess levels of reproducibility
+ #############################################################################
+
+ # The first thing we want to do is evaluate our metrics for reproducibility.
+
+ # Question 1: What files are consistent across the same image?
+ # LEVEL IDENTICAL
+ # Here we will download the same image 10 times, create a sha1 sum of the files,
+ # and determine which sets of files should be consistent for the same image file.
+
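
For reference, the sha1 sum of a single file with nothing but the standard library (this is the generic recipe, not the package's internal implementation):

import hashlib

def file_sha1(path):
    # Stream the file in chunks so large image members do not load at once
    digest = hashlib.sha1()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()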
+
+ # Question 2: What files are consistent across the same image, different downloads?
+ # LEVEL REPLICATE
+ # An image that is a replicate should be assessed as identical using the "REPLICATE"
+ # criteria.
+
+ image_files = glob("*.img")
+
+ # For each level, we compare every pair of images and record which pairs hash
+ # the same; the runscript is the one file we know was changed.
+ level_names = ['IDENTICAL',
+                'REPLICATE',
+                'RUNSCRIPT']
+
+ dfs = dict()
+
+ def generate_replication_df(level_name, image_files, version, skip_files=None):
+     # Build a symmetric matrix: cell (i, j) is 1 when image i and image j
+     # hash the same at this level. Each hash is computed once up front
+     # rather than once per pair. Note that skip_files is accepted but not
+     # yet applied in this version.
+     print("CALCULATING COMPARISONS FOR LEVEL %s" % level_name)
+     hashes = dict()
+     for image_file in image_files:
+         hashes[image_file] = get_image_hash(image_file, level=level_name, version=version)
+     df = pandas.DataFrame(0, index=image_files, columns=image_files)
+     for image_file1 in image_files:
+         for image_file2 in image_files:
+             if hashes[image_file1] == hashes[image_file2]:
+                 df.loc[image_file1, image_file2] = 1
+                 df.loc[image_file2, image_file1] = 1
+     return df
+
+
+ dfs['IDENTICAL'] = generate_replication_df('IDENTICAL', image_files, version=2.2)
+ dfs['REPLICATE'] = generate_replication_df('REPLICATE', image_files, version=2.2, skip_files=['/singularity'])
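
A quick way to read these matrices (a usage note, not part of the commit): the cells are 0/1, so row sums count how many images each image matches at a given level, itself included.

print(dfs['REPLICATE'].sum(axis=1))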
+
+ # Finally, if we compare runscripts only, we should see two container versions
+ hashes = []
+
+ for image_file in image_files:
+     hashy = get_image_hash(image_file, level="RUNSCRIPT", version=2.2)
+     hashes.append(hashy)

- # Question 3: What files are consistent between the same operating systems?
- # LEVEL BASE
- # A base similarity means the base of the images (the OS) is likely the same
+ uniques = dict()
+ for hashy in hashes:
+     if hashy in uniques:
+         uniques[hashy] += 1
+     else:
+         uniques[hashy] = 1
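
The counting loop above is equivalent to the standard library's Counter, if a shorter form is preferred:

from collections import Counter
uniques = Counter(hashes)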


# Outputs:
@@ -120,7 +193,6 @@ def get_top_os(x):

We can then allow the user to use our functions, and go a bit deeper into image comparison to assess, given equal file paths, which are actually equal in content across two images. The user could even save a definition of "how they are assessing reproducibility" of the image by way of a list of regular expressions, and a hash for their image generated from it. I think it would be interesting, given this algorithm, to parse all Singularity Hub public images and assess the total level of redundancy!
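
As a rough sketch of what that saved definition could look like (reusing the hypothetical level_hash helper sketched earlier, with made-up file names), it might be a named list of regular expressions plus the hash they produce:

import json

# Hypothetical: a user-defined reproducibility level and its resulting hash
definition = {
    'name': 'my-replicate-level',
    'skip_patterns': [r'/etc/hosts$', r'/etc/mtab$', r'/etc/resolv\.conf$'],
}
definition['hash'] = level_hash('container.img.tar', definition['skip_patterns'])

with open('reproducibility-definition.json', 'w') as handle:
    json.dump(definition, handle, indent=2)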

- Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.


from glob import glob