Skip to content

Commit 74b5649

Browse files
committed
adding examples to count files and extensions
1 parent 89737a7 commit 74b5649

File tree

5 files changed

+163
-26
lines changed

5 files changed

+163
-26
lines changed
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python
2+
3+
# This is an example of counting files using an image diff
4+
5+
from singularity.analysis.classify import (
6+
get_diff,
7+
file_counts,
8+
extension_counts
9+
)
10+
11+
image_package = "python:3.6.0.img.zip"
12+
13+
# The diff is a dict of folders --> files that differ between
14+
# image and its closest OS
15+
diff = get_diff(image_package=image_package)
16+
17+
# Now we might be interested in counting different things
18+
readme_count = file_counts(diff=diff)
19+
copyright_count = file_counts(diff=diff,patterns=['copyright'])
20+
authors_count = file_counts(diff=diff,patterns=['authors','thanks','credit'])
21+
todo_count = file_counts(diff=diff,patterns=['todo'])
22+
23+
# Or getting a complete dict of extensions
24+
extensions = extension_counts(diff=diff)
25+
26+
# Return files instead of counts
27+
extensions = extension_counts(diff=diff,return_counts=False)

examples/classify_image/derive_tags.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from singularity.analysis.classify import get_tags
99

10-
package = "python:3.6.0.img.zip"
10+
image_package = "python:3.6.0.img.zip"
1111

1212
# The algorithm works as follows:
1313
# 1) first compare package to set of base OS (provided with shub)
@@ -16,6 +16,6 @@
1616
# 4) return search_folders as tags
1717

1818
# Default tags will be returned as software in "bin"
19-
tags = get_tags(package=package)
19+
tags = get_tags(image_package=image_package)
2020

2121
# Most similar OS found to be debian:7.11

examples/classify_image/estimate_os.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44

55
from singularity.analysis.classify import estimate_os
66

7-
package = "python:3.6.0.img.zip"
7+
image_package = "python:3.6.0.img.zip"
88

99
# We can obtain the estimated os (top match)
10-
estimated_os = estimate_os(package=package)
10+
estimated_os = estimate_os(image_package=image_package)
1111
# Most similar OS found to be debian:7.11
1212

1313
# We can also get the whole list and values
14-
os_similarity = estimate_os(package=package,return_top=False)
14+
os_similarity = estimate_os(image_package=image_package,return_top=False)

singularity/analysis/classify.py

Lines changed: 98 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,12 @@
1818
)
1919
from singularity.analysis.utils import get_package_base
2020
from singularity.package import package as make_package
21-
from singularity.utils import get_installdir
21+
from singularity.utils import (
22+
get_installdir,
23+
update_dict,
24+
update_dict_sum
25+
)
26+
2227
from singularity.views.utils import get_container_contents
2328

2429
from singularity.package import (
@@ -35,12 +40,12 @@
3540

3641

3742

38-
def get_diff(container=None,package=None,sudopw=None):
43+
def get_diff(container=None,image_package=None,sudopw=None):
3944
'''get diff will return a dictionary of folder paths and files that
4045
are in an image or package vs. all standard operating systems. The
4146
algorithm is explained below.
4247
:param container: if provided, will use container as image. Can also provide
43-
:param package: if provided, can be used instead of container
48+
:param image_package: if provided, can be used instead of container
4449
:param sudopw: needed if a package isn't provided (will prompt user)
4550
4651
::notes
@@ -51,14 +56,14 @@ def get_diff(container=None,package=None,sudopw=None):
5156
3) organize custom files into dict based on folder name
5257
5358
'''
54-
if package == None:
55-
package = make_package(container,remove_image=True,sudopw=sudopw)
59+
if image_package == None:
60+
image_package = make_package(container,remove_image=True,sudopw=sudopw)
5661

5762
# Find the most similar os
58-
most_similar = estimate_os(package=package,sudopw=sudopw)
63+
most_similar = estimate_os(image_package=image_package,sudopw=sudopw)
5964
similar_package = "%s/docker-os/%s.img.zip" %(get_package_base(),most_similar)
6065

61-
comparison = compare_containers(image_package1=package,
66+
comparison = compare_containers(image_package1=image_package,
6267
image_package2=similar_package,
6368
by='files.txt')['files.txt']
6469

@@ -86,30 +91,34 @@ def get_diff(container=None,package=None,sudopw=None):
8691
###################################################################################
8792

8893

89-
def estimate_os(container=None,package=None,sudopw=None,return_top=True):
94+
def estimate_os(container=None,image_package=None,sudopw=None,return_top=True):
9095
'''estimate os will compare a package to singularity python's database of
9196
operating system images, and return the docker image most similar
9297
:param return_top: return only the most similar (estimated os) default True
93-
:param package: the package created from the image to estimate.
98+
:param image_package: the package created from the image to estimate.
9499
'''
95-
if package == None:
96-
package = make_package(container,remove_image=True,sudopw=sudopw)
100+
if image_package == None:
101+
image_package = make_package(container,remove_image=True,sudopw=sudopw)
97102

98-
comparison = compare_packages(packages_set1=[package])['files.txt'].transpose()
103+
comparison = compare_packages(packages_set1=[image_package])['files.txt'].transpose()
99104
comparison.columns = ['SCORE']
100105
most_similar = comparison['SCORE'].idxmax()
101-
print("Most similar OS found to be ",most_similar)
106+
print("Most similar OS found to be ", most_similar)
102107
if return_top == True:
103108
return most_similar
104109
return comparison
105110

106111

107-
def get_tags(container=None,package=None,sudopw=None,search_folders=None):
112+
def get_tags(container=None,image_package=None,sudopw=None,search_folders=None,diff=None,
113+
return_unique=True):
108114
'''get tags will return a list of tags that describe the software in an image,
109115
meaning inside of a particular folder. If search_folders is not defined, uses bin
110116
:param container: if provided, will use container as image. Can also provide
111-
:param package: if provided, can be used instead of container
117+
:param image_package: if provided, can be used instead of container
112118
:param search_folders: specify one or more folders to look for tags
119+
:param diff: the difference between a container and its parent OS from get_diff
120+
if None, will be derived.
121+
:param return_unique: return unique files in folders. Default True.
113122
Default is 'bin'
114123
115124
::notes
@@ -120,9 +129,10 @@ def get_tags(container=None,package=None,sudopw=None,search_folders=None):
120129
3) organize custom files into dict based on folder name
121130
4) return search_folders as tags
122131
'''
123-
folders = get_diff(container=container,
124-
package=package,
125-
sudopw=sudopw)
132+
if diff == None:
133+
diff = get_diff(container=container,
134+
image_package=image_package,
135+
sudopw=sudopw)
126136

127137
if search_folders == None:
128138
search_folders = 'bin'
@@ -132,10 +142,77 @@ def get_tags(container=None,package=None,sudopw=None,search_folders=None):
132142

133143
tags = []
134144
for search_folder in search_folders:
135-
if search_folder in folders:
145+
if search_folder in diff:
136146
bot.logger.info("Adding tags for folder %s",search_folder)
137-
tags = tags + folders[search_folder]
147+
tags = tags + diff[search_folder]
138148
else:
139149
bot.logger.info("Did not find folder %s in difference.",search_folder)
140-
tags = numpy.unique(tags).tolist()
150+
151+
if return_unique == True:
152+
tags = numpy.unique(tags).tolist()
141153
return tags
154+
155+
156+
###################################################################################
157+
# COUNTING ########################################################################
158+
###################################################################################
159+
160+
161+
def file_counts(container=None,patterns=None,image_package=None,sudopw=None,diff=None):
    '''file counts will return the number of files in a diff whose names match one
    or more regular expressions. If no patterns are defined, a default of readme
    is used. Matching is case insensitive.
    :param container: if provided, will use container as image.
    :param image_package: if provided, can be used instead of container
    :param patterns: one or more patterns (str or list) of files to search for.
    :param sudopw: passed on to get_diff when the diff has to be generated
    :param diff: the difference between a container and its parent OS from get_diff
    if not provided, will be generated.
    :returns: the total number of matches (int). A file that matches several
    patterns is counted once for each pattern it matches.
    '''
    if diff is None:
        diff = get_diff(container=container,
                        image_package=image_package,
                        sudopw=sudopw)

    if patterns is None:
        patterns = 'readme'

    if not isinstance(patterns,list):
        patterns = [patterns]

    # Compile once with IGNORECASE instead of lower-casing the pattern text:
    # lower-casing the pattern would turn escapes such as \S or \D into \s or \d
    # and silently change what the expression matches.
    regexes = [re.compile(pattern, re.IGNORECASE) for pattern in patterns]

    count = 0
    for items in diff.values():
        for regex in regexes:
            count += sum(1 for item in items if regex.search(item))
    bot.logger.info("Total files matching patterns is %s",count)
    return count
188+
189+
190+
def extension_counts(container=None,image_package=None,sudopw=None,diff=None,return_counts=True):
    '''extension counts groups the files of a diff by file extension and returns
    the result as a dictionary keyed by extension. Files without an extension
    are collected under the key 'no-extension'.
    :param container: if provided, will use container as image.
    :param image_package: if provided, can be used instead of container
    :param sudopw: passed on to get_diff when the diff has to be generated
    :param diff: the difference between a container and its parent OS from get_diff
    if not provided, will be generated.
    :param return_counts: if True (default) the values are produced with
    update_dict_sum (a tally per extension); if False they are produced with
    update_dict (the list of files per extension).
    '''
    if diff == None:
        diff = get_diff(container=container,
                        image_package=image_package,
                        sudopw=sudopw)

    extensions = dict()
    for files in diff.values():
        for path in files:
            suffix = os.path.splitext(path)[1]
            # Files with no suffix share a single bucket
            key = 'no-extension' if suffix == '' else suffix
            if return_counts == False:
                extensions = update_dict(extensions,key,path)
            else:
                extensions = update_dict_sum(extensions,key)

    return extensions

singularity/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,39 @@ def remove_unicode_dict(input_dict):
265265
return input_dict
266266

267267

268+
def update_dict(input_dict,key,value):
    '''update_dict appends a value to the list stored under a key. If the key
    is not in the dictionary yet, a new list holding only the value is created.
    :param input_dict: the dict to update (changed in place and also returned)
    :param key: the key whose list receives the value
    :param value: the value to append
    '''
    # setdefault hands back the existing list, or installs a fresh one first
    input_dict.setdefault(key, []).append(value)
    return input_dict
279+
280+
281+
def update_dict_sum(input_dict,key,increment=None,initial_value=None):
    '''update_dict_sum will increment the value stored under a key. If the key
    is not in the dictionary yet, it starts from the initial value and is then
    incremented, so the first call for a key is counted as well.
    :param input_dict: the dict to update (changed in place and also returned)
    :param key: the key whose value should be incremented
    :param increment: the value to increment by. Default is 1
    :param initial_value: value a new key starts from. Default is 0
    '''
    if increment is None:
        increment = 1

    if initial_value is None:
        initial_value = 0

    # A missing key starts at initial_value and still receives this call's
    # increment; storing initial_value alone would drop the first occurrence
    # and leave every tally one short.
    input_dict[key] = input_dict.get(key, initial_value) + increment
    return input_dict
299+
300+
268301
def format_container_name(name,special_characters=None):
269302
'''format_container_name will take a name supplied by the user,
270303
remove all special characters (except for those defined by "special-characters"

0 commit comments

Comments
 (0)