Skip to content

Commit aecbd3e

Browse files
committed
updating all examples (minus original paper)
Signed-off-by: Vanessa Sochat <[email protected]>
1 parent 71dbf6d commit aecbd3e

File tree

11 files changed

+101
-81
lines changed

11 files changed

+101
-81
lines changed

examples/classify/classify_image/count_files.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,11 @@
77
extension_counts
88
)
99

10-
container = "ubuntu.simg"
10+
# singularity pull docker://busybox
11+
container = "busybox_latest.sif"
1112

1213
# Now we might be interested in counting different things
13-
readme_count = file_counts(container)
14-
copyright_count = file_counts(container, patterns=['copyright'])
15-
authors_count = file_counts(container, patterns=['authors','thanks','credit'])
16-
todo_count = file_counts(container, patterns=['todo'])
14+
bin_count = file_counts(container, patterns=['bin'])
1715

1816
# Or getting a complete dict of extensions
1917
extensions = extension_counts(container)

examples/classify/classify_image/derive_tags.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,9 @@
1515
# Default tags will be returned as software in "bin"
1616
tags = get_tags(container)
1717

18-
# We can also get the raw list of files
19-
file_list = get_container_contents(container)['all']
20-
2118
# We can specify other folders of interest
2219
folders = ['init','init.d','bin','systemd']
2320
tags = get_tags(container, search_folders=folders)
21+
22+
# We can also get the raw list of files
23+
file_list = get_container_contents(container)['all']
Lines changed: 14 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,23 @@
11
from glob import glob
22

3-
from singularity.analysis.reproduce import assess_differences
3+
from singularity.analysis.reproduce import assess_differences, get_level
44

5-
image_files=glob('*.img')
5+
# singularity pull docker://ubuntu:14.04
6+
# singularity pull docker://ubuntu:12.04
67

7-
# ASSESS DIFFERENCES #######################################
8-
# returns dictionary with
9-
10-
report = assess_differences(image_files[0],image_files[1])
8+
image_files = glob('ubuntu*.sif')
119

12-
report.keys()
13-
# dict_keys(['different', 'missing', 'same'])
1410

15-
# These files are equivalent between the images
16-
print(len(report['same']))
17-
5663
11+
# Choose a level that you want to assess based on
12+
level_filter = {"RECIPE": get_level('RECIPE')}
1813

19-
# These files are present in both, but different
20-
print(report['different'])
21-
['./etc/hosts',
22-
'./.exec',
23-
'./environment',
24-
'./etc/mtab',
25-
'./etc/resolv.conf',
26-
'./.run',
27-
'./.shell',
28-
'./singularity']
14+
# ASSESS DIFFERENCES #######################################
2915

30-
# These files are found in the first image, but not the second
31-
print(report['missing'])
32-
['./var/lib/apt/lists/.wh.archive.ubuntu.com_ubuntu_dists_xenial-updates_main_i18n_Translation-en',
33-
'./bin/gunzip']
16+
# Running for all levels, this will take a few minutes
17+
report = assess_differences(image_files[0], image_files[1], levels=level_filter)
3418

19+
# {'RECIPE': {'difference': [],
20+
# 'intersect_different': [],
21+
# 'same': 7,
22+
# 'union': 14},
23+
# 'scores': {'RECIPE': 1.0}}

examples/reproducibility/generate_image_hash.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
get_image_file_hash
88
)
99

10-
image_files = glob("*.simg")
10+
image_files = glob("*.sif")
1111
image_path = image_files[0]
1212

1313
########################################################
@@ -30,7 +30,7 @@
3030
# We can, then generate an image hash, and by default the level "REPLICATION" will be used:
3131

3232
get_image_hash(image_path)
33-
#'bf8e242931e25ae9496015868ab2e8cc8d156ffd'
33+
# '4c252c8fb818e4b854a478a1a0df5991'
3434

3535
# But we can also specify a level that we want:
3636
get_image_hash(image_path,level="IDENTICAL")
@@ -45,10 +45,10 @@
4545
# of one container at one level!
4646
digest = get_content_hashes(image_path)
4747
digest['hashes']['/usr/bin/chfn']
48-
# 'ee2b438c278011bdac1a3a927e2d37519a8ed9c7'
48+
# '4b5ee4db88c3b8bfb0cb7cb3a90a7793'
4949

5050
# We can also get a hash of the entire image file, this is done on the
5151
# binary file and not contents inside.
5252

5353
file_hash = get_image_file_hash(image_path)
54-
# e'13775a83962ae60744d691eb7f7fd1e96599e656'
54+
# 'd5349c37fdc2e6f2dca8793732e1c420'

examples/shub/paper/README.md

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,13 @@
1-
21
# Singularity Hub Paper
32

43
This is the code for the Singularity Hub paper. You should be able to view the comparison tree at [result/index.html](result/index.html).
4+
Note that the version of Singularity Python used for this is older than currently on master,
5+
you will need to install:
6+
7+
```bash
8+
git clone -b v2.5 https://www.github.com/singularityhub/singularity-python.git
9+
cd singularity-python
10+
python setup.py install
11+
```
12+
13+
And the images used were also Singularity version 2.2 and 2.3.

singularity/analysis/reproduce/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from .utils import (
1717
extract_content,
1818
delete_image_tar,
19-
get_memory_tar,
2019
get_image_tar
2120
)
2221

singularity/analysis/reproduce/hash.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def get_content_hashes(image_path,
164164
file_filter = level_filter
165165

166166
elif level is None:
167-
file_filter = get_level("REPLICATE",version=version,
167+
file_filter = get_level("REPLICATE", version=version,
168168
skip_files=skip_files,
169169
include_files=include_files)
170170

singularity/analysis/reproduce/levels.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def get_custom_level(regexp=None,description=None,skip_files=None,include_files=
6161
return custom
6262

6363

64-
def get_level(level,version=None,include_files=None,skip_files=None):
64+
def get_level(level, version=None, include_files=None, skip_files=None):
6565
'''get_level returns a single level, with option to customize files
6666
added and skipped.
6767
'''

singularity/analysis/reproduce/metrics.py

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
'''
22
3-
Copyright (C) 2017 The Board of Trustees of the Leland Stanford Junior
4-
University.
5-
Copyright (C) 2016-2017 Vanessa Sochat.
3+
Copyright (C) 2016-2019 Vanessa Sochat.
64
75
This program is free software: you can redistribute it and/or modify it
86
under the terms of the GNU Affero General Public License as published by
@@ -19,6 +17,7 @@
1917
2018
'''
2119

20+
from spython.utils import get_singularity_version
2221
from spython.main import Client
2322
from singularity.logger import bot
2423
from .levels import get_levels
@@ -47,6 +46,11 @@ def assess_differences(image_file1,
4746
reports = dict()
4847
scores = dict()
4948

49+
# For version 3, export sandboxes
50+
if 'version 3' in get_singularity_version():
51+
image_file1 = Client.export(image_file1)
52+
image_file2 = Client.export(image_file2)
53+
5054
for level_name, level_filter in levels.items():
5155
contenders = []
5256
different = []
@@ -64,22 +68,25 @@ def assess_differences(image_file1,
6468
guts2 = get_content_hashes(image_path=image_file2,
6569
level_filter=level_filter)
6670

67-
print(level_name)
6871
files = list(set(list(guts1['hashes'].keys()) + list(guts2['hashes'].keys())))
6972

7073
for file_name in files:
7174

72-
# If it's not in one or the other
75+
# If it's not in one or the other, we can't directly compare
7376
if file_name not in guts1['hashes'] or file_name not in guts2['hashes']:
7477
setdiff.append(file_name)
7578

7679
else:
80+
81+
# We can directly compare - and they are the same
7782
if guts1['hashes'][file_name] == guts2['hashes'][file_name]:
7883
same+=1
84+
7985
else:
8086

8187
# If the file is root owned, we compare based on size
8288
if size_heuristic == True:
89+
8390
if guts1['root_owned'][file_name] or guts2['root_owned'][file_name]:
8491
if guts1['sizes'][file_name] == guts2['sizes'][file_name]:
8592
same+=1
@@ -88,28 +95,39 @@ def assess_differences(image_file1,
8895
else:
8996
# Otherwise, we can assess the bytes content by reading it
9097
contenders.append(file_name)
98+
99+
# We don't use a size heuristic, we will just compare based on bytes
91100
else:
92101
contenders.append(file_name)
93102

94103
# If the user wants identical (meaning extraction order and timestamps)
95104
if level_name == "IDENTICAL":
96-
different = different + contenders
105+
different = different + contenders
97106

98107
# Otherwise we need to check based on byte content
99108
else:
100109
if len(contenders) > 0:
110+
101111
for rogue in contenders:
102-
hashy1 = extract_content(image_file1, rogue, return_hash=True)
103-
hashy2 = extract_content(image_file2, rogue, return_hash=True)
104-
112+
113+
if 'version 3' in get_singularity_version():
114+
hashy1 = extract_content(image_file1 + rogue, return_hash=True)
115+
hashy2 = extract_content(image_file2 + rogue, return_hash=True)
116+
else:
117+
hashy1 = extract_content(rogue, return_hash=True)
118+
hashy2 = extract_content(rogue, return_hash=True)
119+
105120
# If we can't compare, we use size as a heuristic
106121
if hashy1 is None or hashy2 is None: # if one is symlink, could be None
107-
different.append(file_name)
122+
different.append(file_name)
123+
124+
# We still fall back to size heuristic if not possible
108125
elif len(hashy1) == 0 or len(hashy2) == 0:
109126
if guts1['sizes'][file_name] == guts2['sizes'][file_name]:
110127
same+=1
111128
else:
112129
different.append(file_name)
130+
113131
elif hashy1 != hashy2:
114132
different.append(rogue)
115133
else:

singularity/analysis/reproduce/utils.py

Lines changed: 31 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
'''
22
3-
Copyright (C) 2017 The Board of Trustees of the Leland Stanford Junior
4-
University.
5-
Copyright (C) 2016-2017 Vanessa Sochat.
3+
Copyright (C) 2016-2019 Vanessa Sochat.
64
75
This program is free software: you can redistribute it and/or modify it
86
under the terms of the GNU Affero General Public License as published by
@@ -43,8 +41,14 @@ def extract_guts(image_path,
4341
tag_root=True,
4442
include_sizes=True):
4543

46-
'''extract the file guts from an in memory tarfile. The file is not closed.
47-
This should not be done for large images.
44+
'''extract the file guts from an image.
45+
46+
Parameters
47+
==========
48+
image_path: can be a tar, a Singularity image (sif) or a sandbox
49+
file_filter: the file filter to extract guts for.
50+
tag_root: if True (default) include if root owned or not.
51+
include_sizes: include content sizes (defaults to True)
4852
'''
4953
if file_filter is None:
5054
file_filter = get_level('IDENTICAL')
@@ -59,33 +63,44 @@ def extract_guts(image_path,
5963
if include_sizes:
6064
sizes = dict()
6165

62-
# Export the image
63-
sandbox = Client.export(image_path)
66+
# Option 1: We are given a sandbox
67+
if os.path.isdir(image_path):
68+
sandbox = image_path
69+
70+
# Option 2: it's not a sandbox, and we need to export.
71+
elif 'version 3' in get_singularity_version():
72+
sandbox = Client.export(image_path)
73+
else:
74+
sandbox = Client.image.export(image_path)
6475

6576
# If it's tar, extract
66-
if sandbox.endswith('tar'):
77+
if os.path.isfile(sandbox) and sandbox.endswith('tar'):
6778
with tarfile.open(sandbox) as tar:
6879
sandbox = os.path.join(os.path.dirname(sandbox), 'sandbox')
6980
tar.extractall(path=sandbox)
7081

7182
# Recursively walk through sandbox
7283
for root, dirnames, filenames in os.walk(sandbox):
7384
for filename in filenames:
74-
member_name = os.path.join(root, filename)
85+
sandbox_name = os.path.join(root, filename)
86+
87+
# Remove the sandbox base
88+
member_name = sandbox_name.lstrip(sandbox)
89+
7590
allfiles.append(member_name)
7691
included = False
7792

7893
# Skip over directories and symbolic links
79-
if os.path.isdir(member_name) or os.path.islink(member_name):
94+
if os.path.isdir(sandbox_name) or os.path.islink(sandbox_name):
8095
continue
8196

8297
# If we have flagged to include, and not flagged to skip
83-
elif assess_content(member_name, file_filter):
84-
digest[member_name] = extract_content(member_name, return_hash=True)
98+
elif assess_content(sandbox_name, file_filter):
99+
digest[member_name] = extract_content(sandbox_name, return_hash=True)
85100
included = True
86-
elif include_file(member_name, file_filter):
101+
elif include_file(sandbox_name, file_filter):
87102
hasher = hashlib.md5()
88-
with open(member_name, 'rb') as filey:
103+
with open(sandbox_name, 'rb') as filey:
89104
buf = filey.read()
90105
hasher.update(buf)
91106
digest[member_name] = hasher.hexdigest()
@@ -94,9 +109,9 @@ def extract_guts(image_path,
94109
# Derive size, and if root owned
95110
if included:
96111
if include_sizes:
97-
sizes[member_name] = os.stat(member_name).st_size
112+
sizes[member_name] = os.stat(sandbox_name).st_size
98113
if tag_root:
99-
roots[member_name] = is_root_owned(member_name)
114+
roots[member_name] = is_root_owned(sandbox_name)
100115

101116
results['all'] = allfiles
102117
results['hashes'] = digest
@@ -107,16 +122,6 @@ def extract_guts(image_path,
107122
return results
108123

109124

110-
111-
def get_memory_tar(image_path):
112-
'''get an in memory tar of an image. Use carefully, not as reliable
113-
as get_image_tar
114-
'''
115-
byte_array = Client.export(image_path)
116-
file_object = io.BytesIO(byte_array)
117-
tar = tarfile.open(mode="r|*", fileobj=file_object)
118-
return (file_object, tar)
119-
120125
def create_tarfile(source_dir, output_filename=None):
121126
''' create a tarfile from a source directory'''
122127
if output_filename == None:

0 commit comments

Comments
 (0)