# Compare Singularity Hub containers

# This is a simple script that uses the singularity command line tool to obtain
# manifests and compare build specs (using the Singularity Hub API)

container_name = 'vsoch/singularity-hello-world'

from singularity.hub.client import Client
from singularity.package import get_image_hash  # used to time a full image hash below
from glob import glob

import pickle
import os
import time
import pandas

shub = Client() # Singularity Hub Client

# Let's keep images in a local storage folder
base = "/home/vanessa/Documents/Work/singularity/hub"
storage = "%s/containers" % base
if not os.path.exists(storage):
    os.mkdir(storage)
os.chdir(storage)

# We will keep a table of information
columns = ['name','build_time_seconds','size','commit','estimated_os']
df = pandas.DataFrame(columns=columns)
containers = dict()
results = dict()

def get_top_os(x):
    # Return the OS name with the highest similarity score
    return sorted(x.items(), key=lambda item: (item[1], item[0]), reverse=True)[0][0]
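# For example (hypothetical scores): get_top_os({'ubuntu:16.04': 0.97, 'centos:7': 0.6})
# returns 'ubuntu:16.04'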

#############################################################################
# Task 1: Download the containers and metadata! (different images)
#############################################################################

# Retrieve the container collection based on the name
collection = shub.get_collection(container_name)
results['repo_name'] = container_name
results['collection'] = collection
container_ids = collection['container_set']
cids = []
for container_id in container_ids:
    try:
        cids.append(container_id)
        manifest = shub.get_container(container_id)
        container_uri = '%s-%s' % (container_name, manifest['version'])
        containers[container_uri] = manifest
        image = shub.pull_container(manifest,
                                    download_folder=storage,
                                    name="%s.img.gz" % (manifest['version']))
        metrics = shub.load_metrics(manifest)
        top_os = get_top_os(metrics['os_sims'])
        entry = [container_name,
                 metrics['build_time_seconds'],
                 metrics['size'],
                 manifest['version'],
                 top_os]
        df.loc[container_uri] = entry
    except Exception:
        # Skip containers that fail to download or are missing metrics
        pass

results['containers'] = containers
results['df'] = df
pickle.dump(results, open('%s/results.pkl' % storage, 'wb'))


#############################################################################
# Task 2: Develop levels of reproducibility
#############################################################################

from singularity.reproduce import (
    get_content_hashes,
    get_levels
)

levels = get_levels()
results = pickle.load(open('%s/results.pkl' % storage, 'rb'))

os.chdir(storage)
image_files = glob("*.img")

# Question 1: What files are consistent across the same image?
# LEVEL IDENTICAL
# Here we will download the same image 10 times, create a sha1 sum of its files,
# and determine which sets of files should be consistent for the same image file
# (see the sketch below). Container ids noted for this test: 5665, 5673
| 91 | + |
| 92 | + |
| 93 | +# Question 2: What files are consistent across the same image, different downloads? |
| 94 | +# LEVEL REPLICATE |
| 95 | +# Here we will use the 100 files in the folder, and find files/folders consistent across |
| 96 | +# we will not include the runscript, since we know this was changed. |
| 97 | +identical_across = get_content_hashes(image_files[0],level='IDENTICAL') |
| 98 | +image_files.pop(0) |
| 99 | +not_identical = [] |

for image_file in image_files:
    hashes = get_content_hashes(image_file, level='IDENTICAL')
    for hash_path, hash_val in hashes.items():
        if hash_path in identical_across:
            if identical_across[hash_path] != hash_val:
                del identical_across[hash_path]
                not_identical.append(hash_path)
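
# At this point identical_across holds the path -> sha1 entries that agreed across
# every image, and not_identical the paths that differed:
print('%s paths consistent, %s paths differ' % (len(identical_across),
                                                len(set(not_identical))))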

# Time how long a full image hash takes to generate
start = time.time()
hashy = get_image_hash(image_file)
end = time.time()
print('get_image_hash: %.2f seconds' % (end - start))

# Question 3: What files are consistent between the same operating systems?
# LEVEL BASE
# A base similarity means the bases of the images (the OS) are likely the same
# (see the sketch below)
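
# A sketch for this question, assuming 'BASE' is one of the levels returned by
# get_levels() and is accepted by get_content_hashes like 'IDENTICAL' above, and
# that two images built from the same OS are on disk (ubuntu_a.img and
# ubuntu_b.img are hypothetical file names):
shared_base = consistent_hashes(['ubuntu_a.img', 'ubuntu_b.img'], level='BASE')
print('%s files shared at the BASE level' % len(shared_base))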


# Outputs:
# A function that exports an image, reads the tarfile into memory (or onto disk?),
# and generates a mapping of key (file) to value (sha1 sum) -- see the tarsum sketch below.
#
# 0) I'll first experiment with different patterns of files/folders and figure out which
#    are consistent across images. I'll probably do this by taking a content hash of all
#    individual files, and then finding the set that is consistent across 1) the same
#    exact image, 2) different images but the same build, and 3) different images,
#    different builds. We could even give each some kind of score to determine the right
#    set it belongs in.
# 1) at the highest level of reproducibility (eg, the same file) we get equivalent
#    hashes - to do this I'll just download exactly the same image
# 2) at a "working" (aka, reasonable to use) level of reproducibility, we should get
#    equivalent hashes given the same build, but different files (eg, I built my thing
#    twice from the same spec)
# 3) at the lowest level of reproducibility (eg, base operating system) we should see
#    some overlap if the bases of the operating systems are largely the same.
#
# We can then allow the user to use our functions, and go a bit deeper into image
# comparison to assess, given equal file paths, which are actually equal in content
# across two images. The user could even save a definition of "how they are assessing
# reproducibility" of an image by way of a list of regular expressions, and a hash for
# their image generated from it. I think it would be interesting, given this algorithm,
# to parse all Singularity Hub public images and assess the total level of redundancy!
#
# Anyhoo, I think I'm going to go to sleep now, I keep doing this not sleeping thing, lol.
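
# The loop below calls a tarsum() helper that isn't defined anywhere in this script.
# Here is a minimal sketch of what it could look like: a single sha1 digest over the
# file contents of the exported tar, visited in sorted name order so that the same
# content always yields the same sum. (The per-file variant described under Outputs
# above would return a dict of name -> sha1 instead of one digest.)
import hashlib
import tarfile

def tarsum(tar_path):
    sha1 = hashlib.sha1()
    with tarfile.open(tar_path, 'r') as tar:
        for member in sorted(tar.getmembers(), key=lambda m: m.name):
            if member.isfile():
                sha1.update(tar.extractfile(member).read())
    return sha1.hexdigest()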

image_files = glob('*.img')
sums = []
for image_file in image_files:
    # Export the image to a tarfile and take a checksum of its contents
    os.system('sudo singularity export %s > tmp.tar' % (image_file))
    summy = tarsum('tmp.tar')
    print(summy)
    sums.append(summy)