Skip to content

Commit fea65ed

Browse files
authored
Merge pull request #170 from jgrewe/terminology
Extensions to the terminology handling
2 parents 41b096a + c25ed13 commit fea65ed

File tree

3 files changed

+246
-20
lines changed

3 files changed

+246
-20
lines changed

odml/section.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,8 @@ def include(self, new_value):
7676
return
7777

7878
term = terminology.load(url)
79+
if not term:
80+
return
7981
new_section = term.get_section_by_path(path) if path is not None else term.sections[0]
8082

8183
if self._include is not None:

odml/terminology.py

Lines changed: 206 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,31 +9,48 @@
99
import odml.tools.xmlparser
1010
from hashlib import md5
1111
py3 = True
12-
1312
try:
1413
from urllib.request import urlopen
1514
except ImportError:
1615
from urllib import urlopen
17-
1816
import threading
1917

20-
CACHE_AGE = datetime.timedelta(days=1)
18+
CACHE_AGE = datetime.timedelta(days=14)
19+
CACHE_DIR = os.path.join(tempfile.gettempdir(), "odml.cache")
20+
FILE_MAP_FILE = os.path.join(CACHE_DIR, "odml_filemap.csv")
21+
22+
if not os.path.exists(CACHE_DIR):
23+
try:
24+
os.makedirs(CACHE_DIR)
25+
except OSError: # might happen due to concurrency
26+
if not os.path.exists(CACHE_DIR):
27+
raise
28+
29+
30+
def open_file_map():
    """
    Read the file map stored in the cache and return it as a dict.

    The file map (FILE_MAP_FILE) holds one entry per line in the form
    "<cached file name>; <url>", mapping the cached file names to the urls of
    the respective terminologies.

    @return dict mapping cached file names to urls. The dict is empty if the
            file map does not exist.
    """
    file_map = {}
    if not os.path.exists(FILE_MAP_FILE):
        return file_map

    with open(FILE_MAP_FILE, 'r') as f:
        for line in f:
            # Split on the first ';' only, so that a ';' occurring later in
            # the url does not truncate it.
            filename, sep, url = line.partition(';')
            if not sep:
                # Blank or malformed line: skip it instead of failing.
                continue
            file_map[filename.strip()] = url.strip()
    return file_map
2144

2245

2346
def cache_load(url):
2447
"""
25-
load the url and store it in a temporary cache directory
48+
Load the url and store it in a temporary cache directory
2649
subsequent requests for this url will use the cached version
2750
"""
28-
filename = md5(url.encode()).hexdigest() + os.path.basename(url)
29-
cache_dir = os.path.join(tempfile.gettempdir(), "odml.cache")
30-
if not os.path.exists(cache_dir):
31-
try:
32-
os.makedirs(cache_dir)
33-
except OSError: # might happen due to concurrency
34-
if not os.path.exists(cache_dir):
35-
raise
36-
cache_file = os.path.join(cache_dir, filename)
51+
filename = md5(url.encode()).hexdigest() + '__' + os.path.basename(url)
52+
cache_file = os.path.join(CACHE_DIR, filename)
53+
3754
if not os.path.exists(cache_file) \
3855
or datetime.datetime.fromtimestamp(os.path.getmtime(cache_file)) < \
3956
datetime.datetime.now() - CACHE_AGE:
@@ -42,18 +59,69 @@ def cache_load(url):
4259
except Exception as e:
4360
print("Failed loading '%s': %s" % (url, e))
4461
return
45-
4662
fp = open(cache_file, "w")
4763
fp.write(data)
4864
fp.close()
49-
65+
with open(FILE_MAP_FILE, 'a') as fm:
66+
fm.write(filename + "; " + url + "\n")
5067
return open(cache_file)
5168

5269

70+
def cached_files():
    """
    Collect the names of all locally cached terminology files.

    @return list of the names of the regular files in CACHE_DIR that end
            with ".xml".
    """
    cached = []
    for name in os.listdir(CACHE_DIR):
        if not name.endswith(".xml"):
            continue
        if os.path.isfile(os.path.join(CACHE_DIR, name)):
            cached.append(name)
    return cached
77+
78+
79+
def show_cache():
    """
    Show all locally cached files. Just for display.

    Prints a header followed by one line per cached file consisting of the
    display name (padded or truncated to 30 characters) and the time the
    cached file was last modified.
    """
    name_width = 30
    print("terminology %s \t updated" % (19 * " "))
    print(60 * "-")
    for f in cached_files():
        cache_file = os.path.join(CACHE_DIR, f)
        file_timestamp = datetime.datetime.fromtimestamp(os.path.getmtime(cache_file))
        # Cached names look like "<md5>__<basename>"; drop only the leading
        # hash so that a "__" inside the basename is left untouched. Files
        # without the prefix are shown under their full name.
        disp_name = f.split('__', 1)[-1]
        if len(disp_name) > name_width:
            # Truncate to exactly the column width, ellipsis included.
            disp_name = disp_name[:name_width - 3] + "..."
        disp_name = disp_name.ljust(name_width)
        print(" %s \t %s" % (disp_name, file_timestamp))
95+
96+
97+
def clear_cache():
    """
    Empty the cache: remove every cached file as well as the file map.
    The cache folder itself is kept.
    """
    for name in cached_files():
        os.remove(os.path.join(CACHE_DIR, name))

    if os.path.exists(FILE_MAP_FILE):
        os.remove(FILE_MAP_FILE)
106+
107+
108+
def from_cache(term):
    """
    Fills the terminology with the definitions stored in the cache.

    Every url listed in the cache's file map that is not yet contained in
    term is loaded via term.load().

    @param term the Terminologies instance to fill
    """
    assert isinstance(term, Terminologies)
    # Only the urls are needed here; the file map is keyed by the cached
    # file names.
    for url in open_file_map().values():
        if url not in term:
            term.load(url)
118+
119+
53120
class Terminologies(dict):
54121
loading = {}
122+
types = None
55123

56-
def load(self, url):
124+
def load(self, url="http://portal.g-node.org/odml/terminologies/v1.0/terminologies.xml"):
57125
"""
58126
load and cache a terminology-url
59127
@@ -62,11 +130,14 @@ def load(self, url):
62130
if url in self:
63131
return self[url]
64132

133+
encode_name = md5(url.encode()).hexdigest() + '__' + os.path.basename(url)
134+
if encode_name in self:
135+
return self[encode_name]
136+
65137
if url in self.loading:
66138
self.loading[url].join()
67139
self.loading.pop(url, None)
68140
return self.load(url)
69-
70141
return self._load(url)
71142

72143
def _load(self, url):
@@ -95,14 +166,129 @@ def deferred_load(self, url):
95166
self.loading[url] = threading.Thread(target=self._load, args=(url,))
96167
self.loading[url].start()
97168

169+
def empty(self):
    """
    Tells whether there are no terminologies stored.
    """
    return not self
174+
175+
def type_list(self):
    """
    Returns a dict of all section types found in the stored terminologies
    together with the places they are defined in.

    If no terminologies are stored yet, they are first filled in from the
    cache. The result is built once and kept in self.types; later calls
    return that same dict.

    @return dict mapping a section type to a list of (key, path) tuples,
            where key is the key under which the terminology is stored and
            path is the result of the section's get_path().
    """
    if self.empty():
        from_cache(self)
    if not self.types:
        # Build the index locally and publish it only once it is complete,
        # so that a failure midway does not leave a partial index behind.
        types = {}
        for key, doc in self.items():
            for sec in doc.itersections():
                types.setdefault(sec.type, []).append((key, sec.get_path()))
        self.types = types
    return self.types
190+
191+
def _compare_repo(self, candidate_repo, candidate_path, pattern, relaxed):
192+
parts = pattern.lower().split()
193+
match = True
194+
repo = candidate_repo.lower()
195+
path = candidate_path.lower()
196+
for p in parts:
197+
if p.startswith("!"):
198+
if relaxed:
199+
match = match or (p[1:] not in repo.lower() and p[1:] not in path)
200+
else:
201+
match = match and (p[1:] not in repo and p[1:] not in path)
202+
else:
203+
if relaxed:
204+
match = match or (p in repo or p in path)
205+
else:
206+
match = match and (p in repo or p in path)
207+
return match
208+
209+
def _find_match(self, type_matches, pattern, relaxed=False):
210+
if pattern:
211+
matches = []
212+
for i, (r, p) in enumerate(type_matches):
213+
if self._compare_repo(r, p, pattern, relaxed):
214+
matches.append(type_matches[i])
215+
return matches
216+
else: # simply return first
217+
return type_matches
218+
return []
219+
220+
def _get_section_by_type(self, section_type, pattern=None, relaxed=False, find_all=False):
    """
    Looks up a section type in the stored terminologies and returns clones
    of the matching sections.

    @param section_type the section type, must be an exact key of the type list
    @param pattern      optional filter pattern, see _compare_repo
    @param relaxed      optional, passed on to the pattern matching
    @param find_all     optional, if True and more than one section matches,
                        all of them are returned

    @return None if nothing matches; a list of cloned sections if find_all
            is set and there is more than one match; otherwise a clone of
            the first matching section.
    """
    # self.types is None until the type list was built once; test its truth
    # value instead of taking len() of None.
    if self.empty() or not self.types:
        self.type_list()

    matches = []
    if section_type in self.types:
        matches = self._find_match(self.types[section_type], pattern, relaxed)
    if not matches:
        return None

    if find_all and len(matches) > 1:
        return [self[key].get_section_by_path(path).clone()
                for key, path in matches]

    key, path = matches[0]
    return self[key].get_section_by_path(path).clone()
236+
237+
98238
terminologies = Terminologies()
99239
load = terminologies.load
100240
deferred_load = terminologies.deferred_load
101241

102242

243+
def get_section_by_type(section_type, pattern=None, relaxed=False, find_all=False):
244+
"""
245+
Finds a section type in the cached repositories and returns it.
246+
247+
@param section_type the type of the section must be a valid full match. Returns the
248+
first match.
249+
@param pattern      an optional filter pattern, i.e. a string with characteristics
250+
regarding the repository the section should originate from
251+
and its path in the file (see below)
252+
@param relaxed optional, defines whether all criteria must be met or not.
253+
@param find_all optional, sets whether all possible matches are returned
254+
255+
@return Section or list of sections depending on the find_all parameter, None,
256+
if no match was found.
257+
258+
Example:
259+
Suppose we are looking for a section type 'analysis' and it should be from the g-node
260+
terminologies.
261+
s = get_section_by_type("analysis", "g-node")
262+
print(s)
263+
<Section Analysis[analysis] (0)>
264+
If we want to exclude the g-node terminologies, simply put an ! in front of the pattern
265+
s = get_section_by_type("analysis", "!g-node")
266+
267+
Multiple criteria can be combined (e.g. get_section_by_type("setup/daq", "g-node blackrock !cerebus")).
268+
The relaxed parameter controls whether all criteria have to match.
269+
"""
270+
return terminologies._get_section_by_type(section_type, pattern, relaxed, find_all)
271+
272+
def find_definitions(section_type):
    """
    Looks up which repositories define the given section type.

    @param section_type the requested section type

    @return list of tuples, each holding the repository and the path at which
            the respective section can be found. The list is empty if the
            type is unknown.
    """
    return terminologies.type_list().get(section_type, [])
286+
103287
if __name__ == "__main__":
288+
from IPython import embed
104289
print ("Terminologies!")
105-
t = Terminologies()
106-
t.load('http://portal.g-node.org/odml/terminologies/v1.0/terminologies.xml')
290+
from_cache(terminologies)
291+
# t.load('http://portal.g-node.org/odml/terminologies/v1.0/terminologies.xml')
107292
# t.load('http://portal.g-node.org/odml/terminologies/v1.0/analysis/power_spectrum.xml')
108-
293+
find_definitions("analysis")
294+
embed()

test/test_terminology.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import unittest
2+
import odml
3+
4+
class TestTerminolgies(unittest.TestCase):
    """
    Tests for the terminology cache handling and the type lookup offered by
    odml.terminology.
    """

    def setUp(self):
        pass

    def test_load(self):
        # An emptied terminologies dict must report itself as empty.
        if not odml.terminology.terminologies.empty():
            odml.terminology.terminologies.clear()
        self.assertTrue(odml.terminology.terminologies.empty())

    def test_cached_files(self):
        # After clearing, the cache holds no files; loading fills it again.
        odml.terminology.clear_cache()
        self.assertEqual(len(odml.terminology.cached_files()), 0)
        odml.terminology.load()
        self.assertGreater(len(odml.terminology.cached_files()), 0)

    def test_from_cache(self):
        if not odml.terminology.terminologies.empty():
            odml.terminology.terminologies.clear()
        odml.terminology.from_cache(odml.terminology.terminologies)
        self.assertFalse(odml.terminology.terminologies.empty())

    def test_find_difinitions(self):
        if odml.terminology.terminologies.empty():
            odml.terminology.from_cache(odml.terminology.terminologies)

        self.assertGreater(len(odml.terminology.find_definitions("analysis")), 0)
        self.assertEqual(len(odml.terminology.find_definitions("foo")), 0)

    def test_get_section(self):
        self.assertIsNone(odml.terminology.get_section_by_type("analyses"))
        self.assertIsNotNone(odml.terminology.get_section_by_type("analysis"))
        self.assertIsInstance(odml.terminology.get_section_by_type("analysis"), odml.section.BaseSection)
        self.assertIsNone(odml.terminology.get_section_by_type("analysis", "!g-node"))
        self.assertGreater(len(odml.terminology.get_section_by_type("setup/daq", "g-node")), 0)
38+

0 commit comments

Comments
 (0)