|
30 | 30 | import re |
31 | 31 | from singularity.logger import bot |
32 | 32 | from singularity.utils import get_installdir |
33 | | -from singularity.package import ( |
34 | | - get_packages, |
35 | | - get_container_contents, |
36 | | - load_package, |
37 | | - package as make_package |
38 | | -) |
39 | 33 | from singularity.analysis.reproduce import ( |
40 | 34 | get_image_tar, |
41 | 35 | delete_image_tar |
42 | 36 | ) |
43 | 37 |
|
44 | 38 | from .metrics import information_coefficient |
45 | | - |
46 | 39 | import pandas |
47 | 40 |
|
48 | 41 |
|
49 | | -################################################################################### |
50 | | -# CONTAINER COMPARISONS ########################################################### |
51 | | -################################################################################### |
52 | | - |
53 | | -def container_similarity_vector(container1=None,packages_set=None,by=None,custom_set=None): |
54 | | - '''container similarity_vector is similar to compare_packages, but intended |
55 | | - to compare a container object (singularity image or singularity hub container) |
56 | | - to a list of packages. If packages_set is not provided, the default used is |
57 | | - 'docker-os'. This can be changed to 'docker-library', or if the user wants a custom |
58 | | - list, should define custom_set. |
59 | | - :param container1: singularity image or singularity hub container. |
60 | | - :param packages_set: a name of a package set, provided are docker-os and docker-library |
61 | | - :param custom_set: a list of package files, used first if provided. |
62 | | - :by: metrics to compare by (files.txt and or folders.txt) |
63 | | - ''' |
64 | | - if custom_set == None: |
65 | | - if packages_set == None: |
66 | | - packages_set = get_packages('docker-os') |
67 | | - else: |
68 | | - packages_set = custom_set |
69 | | - |
70 | | - if by == None: |
71 | | - by = ['files.txt'] |
72 | | - |
73 | | - if not isinstance(by,list): |
74 | | - by = [by] |
75 | | - if not isinstance(packages_set,list): |
76 | | - packages_set = [packages_set] |
77 | | - |
78 | | - comparisons = dict() |
79 | | - |
80 | | - for b in by: |
81 | | - bot.debug("Starting comparisons for %s" %b) |
82 | | - df = pandas.DataFrame(columns=packages_set) |
83 | | - for package2 in packages_set: |
84 | | - sim = calculate_similarity(container1=container1, |
85 | | - image_package2=package2, |
86 | | - by=b)[b] |
87 | | - |
88 | | - name1 = os.path.basename(package2).replace('.img.zip','') |
89 | | - bot.debug("container vs. %s: %s" %(name1,sim)) |
90 | | - df.loc["container",package2] = sim |
91 | | - df.columns = [os.path.basename(x).replace('.img.zip','') for x in df.columns.tolist()] |
92 | | - comparisons[b] = df |
93 | | - return comparisons |
94 | | - |
95 | 42 |
|
96 | 43 | def compare_singularity_images(image_paths1,image_paths2=None): |
97 | 44 | '''compare_singularity_images is a wrapper for compare_containers to compare |
@@ -136,148 +83,3 @@ def compare_singularity_images(image_paths1,image_paths2=None): |
136 | 83 | comparisons_done.append(comparison_id) |
137 | 84 | delete_image_tar(fileobj1, tar1) |
138 | 85 | return dfs |
139 | | - |
140 | | - |
141 | | -def compare_containers(container1=None,container2=None,by=None, |
142 | | - image_package1=None,image_package2=None): |
143 | | - '''compare_containers will generate a data structure with common and unique files to |
144 | | - two images. If environmental variable SINGULARITY_HUB is set, will use container |
145 | | - database objects. |
146 | | - :param container1: first container for comparison |
147 | | - :param container2: second container for comparison if either not defined must include |
148 | | - :param image_package1: a packaged container1 (produced by package) |
149 | | - :param image_package2: a packaged container2 (produced by package) |
150 | | - :param by: what to compare, one or more of 'files.txt' or 'folders.txt' |
151 | | - default compares just files |
152 | | - ''' |
153 | | - if by == None: |
154 | | - by = ["files.txt"] |
155 | | - if not isinstance(by,list): |
156 | | - by = [by] |
157 | | - |
158 | | - # Get files and folders for each |
159 | | - container1_guts = get_container_contents(gets=by, |
160 | | - split_delim="\n", |
161 | | - container=container1, |
162 | | - image_package=image_package1) |
163 | | - container2_guts = get_container_contents(gets=by, |
164 | | - split_delim="\n", |
165 | | - container=container2, |
166 | | - image_package=image_package2) |
167 | | - |
168 | | - # Do the comparison for each metric |
169 | | - comparisons = dict() |
170 | | - for b in by: |
171 | | - if b in container1_guts and b in container2_guts: |
172 | | - comparisons[b] = compare_lists(container1_guts[b],container2_guts[b]) |
173 | | - |
174 | | - return comparisons |
175 | | - |
176 | | - |
177 | | -def compare_lists(list1,list2): |
178 | | - '''compare lists is the lowest level that drives compare_containers and |
179 | | - compare_packages. It returns a comparison object (dict) with the unique, |
180 | | - total, and intersecting things between two lists |
181 | | - :param list1: the list for container1 |
182 | | - :param list2: the list for container2 |
183 | | - ''' |
184 | | - intersect = list(set(list1).intersection(list2)) |
185 | | - unique1 = list(set(list1).difference(list2)) |
186 | | - unique2 = list(set(list2).difference(list1)) |
187 | | - |
188 | | - # Return data structure |
189 | | - comparison = {"intersect":intersect, |
190 | | - "unique1": unique1, |
191 | | - "unique2": unique2, |
192 | | - "total1": len(list1), |
193 | | - "total2": len(list2)} |
194 | | - return comparison |
195 | | - |
196 | | - |
197 | | -def calculate_similarity(container1=None,container2=None,image_package1=None, |
198 | | - image_package2=None,by="files.txt",comparison=None, |
199 | | - metric=None): |
200 | | - '''calculate_similarity will calculate similarity of two containers by files content, default will calculate |
201 | | - 2.0*len(intersect) / total package1 + total package2 |
202 | | - :param container1: container 1 |
203 | | - :param container2: container 2 must be defined or |
204 | | - :param image_package1: a zipped package for image 1, created with package |
205 | | - :param image_package2: a zipped package for image 2, created with package |
206 | | - :param by: the one or more metrics (eg files.txt) list to use to compare |
207 | | - :param metric: a function to take a total1, total2, and intersect count |
208 | | - (we can make this more general if / when more are added) |
209 | | - valid are currently files.txt or folders.txt |
210 | | - :param comparison: the comparison result object for the tree. If provided, |
211 | | - will skip over function to obtain it. |
212 | | - ''' |
213 | | - if not isinstance(by,list): |
214 | | - by = [by] |
215 | | - |
216 | | - if metric is None: |
217 | | - metric = information_coefficient |
218 | | - |
219 | | - if comparison == None: |
220 | | - comparison = compare_containers(container1=container1, |
221 | | - container2=container2, |
222 | | - image_package1=image_package1, |
223 | | - image_package2=image_package2, |
224 | | - by=by) |
225 | | - scores = dict() |
226 | | - |
227 | | - for b in by: |
228 | | - scores[b] = metric(total1=comparison[b]['total1'], |
229 | | - total2=comparison[b]['total2'], |
230 | | - intersect=comparison[b]["intersect"]) |
231 | | - return scores |
232 | | - |
233 | | - |
234 | | -################################################################################### |
235 | | -# PACKAGE COMPARISONS ############################################################# |
236 | | -################################################################################### |
237 | | - |
238 | | -def compare_packages(packages_set1=None,packages_set2=None,by=None): |
239 | | - '''compare_packages will compare one image or package to one image or package. If |
240 | | - the folder isn't specified, the default singularity packages (included with install) |
241 | | - will be used (os vs. docker library). Images will take preference over packages |
242 | | - :param packages_set1: a list of package files not defined uses docker-library |
243 | | - :param packages_set2: a list of package files, not defined uses docker-os |
244 | | - :by: metrics to compare by (files.txt and or folders.txt) |
245 | | - ''' |
246 | | - if packages_set1 == None: |
247 | | - packages_set1 = get_packages('docker-library') |
248 | | - if packages_set2 == None: |
249 | | - packages_set2 = get_packages('docker-os') |
250 | | - |
251 | | - if by == None: |
252 | | - by = ['files.txt'] |
253 | | - |
254 | | - if not isinstance(by,list): |
255 | | - by = [by] |
256 | | - if not isinstance(packages_set1,list): |
257 | | - packages_set1 = [packages_set1] |
258 | | - if not isinstance(packages_set2,list): |
259 | | - packages_set2 = [packages_set2] |
260 | | - |
261 | | - comparisons = dict() |
262 | | - |
263 | | - for b in by: |
264 | | - bot.debug("Starting comparisons for %s" %b) |
265 | | - df = pandas.DataFrame(index=packages_set1,columns=packages_set2) |
266 | | - for package1 in packages_set1: |
267 | | - for package2 in packages_set2: |
268 | | - if package1 != package2: |
269 | | - sim = calculate_similarity(image_package1=package1, |
270 | | - image_package2=package2, |
271 | | - by=b)[b] |
272 | | - else: |
273 | | - sim = 1.0 |
274 | | - |
275 | | - name1 = os.path.basename(package1).replace('.img.zip','') |
276 | | - name2 = os.path.basename(package2).replace('.img.zip','') |
277 | | - bot.debug("%s vs. %s: %s" %(name1,name2,sim)) |
278 | | - df.loc[package1,package2] = sim |
279 | | - df.index = [os.path.basename(x).replace('.img.zip','') for x in df.index.tolist()] |
280 | | - df.columns = [os.path.basename(x).replace('.img.zip','') for x in df.columns.tolist()] |
281 | | - comparisons[b] = df |
282 | | - return comparisons |
283 | | - |
0 commit comments