Skip to content

Commit cdcd36d

Browse files
author
Marcin Kardas
committed
Add default evidences for all datasets and metrics
* add evidences for all datasets and metrics found in taxonomy
* refactor evidence finding to EvidenceFinder
* fix manual evidences for semantic segmentation metrics
1 parent dec9aa2 commit cdcd36d

File tree

4 files changed

+114
-93
lines changed

4 files changed

+114
-93
lines changed
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .taxonomy import Taxonomy
22
from .linker import Linker
3-
from .context_search import ContextSearch, DatasetExtractor
3+
from .context_search import ContextSearch, DatasetExtractor, EvidenceFinder
44
from .proposals_filters import *
55

6-
__all__ = ["Taxonomy", "Linker", "ContextSearch", "DatasetExtractor", "ProposalsFilter", "NopFilter",
6+
__all__ = ["Taxonomy", "Linker", "ContextSearch", "DatasetExtractor", "EvidenceFinder", "ProposalsFilter", "NopFilter",
77
"BestResultFilter", "StructurePredictionFilter", "ConfidenceFilter", "CompoundFilter"]

sota_extractor2/models/linking/context_search.py

Lines changed: 96 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
from sota_extractor2.models.linking.acronym_extractor import AcronymExtractor
55
from sota_extractor2.models.linking.probs import get_probs, reverse_probs
6-
from sota_extractor2.models.linking.utils import normalize_dataset, normalize_cell, normalize_cell_ws
6+
from sota_extractor2.models.linking.utils import normalize_dataset, normalize_dataset_ws, normalize_cell, normalize_cell_ws
77
from scipy.special import softmax
88
import re
99
import pandas as pd
@@ -13,81 +13,95 @@
1313

1414
from sota_extractor2.pipeline_logger import pipeline_logger
1515

16-
from sota_extractor2.models.linking.manual_dicts import metrics, datasets, tasks
17-
18-
datasets = {k:(v+['test']) for k,v in datasets.items()}
19-
datasets.update({
20-
'LibriSpeech dev-clean': ['libri speech dev clean', 'libri speech', 'dev', 'clean', 'dev clean', 'development'],
21-
'LibriSpeech dev-other': ['libri speech dev other', 'libri speech', 'dev', 'other', 'dev other', 'development', 'noisy'],
22-
})
23-
24-
# escaped_ws_re = re.compile(r'\\\s+')
25-
# def name_to_re(name):
26-
# return re.compile(r'(?:^|\s+)' + escaped_ws_re.sub(r'\\s*', re.escape(name.strip())) + r'(?:$|\s+)', re.I)
27-
28-
#all_datasets = set(k for k,v in merged_p.items() if k != '' and not re.match("^\d+$", k) and v.get('NOMATCH', 0.0) < 0.9)
29-
all_datasets = set(normalize_cell_ws(normalize_dataset(y)) for x in datasets.values() for y in x)
30-
all_metrics = set(normalize_cell_ws(y) for x in metrics.values() for y in x)
31-
all_tasks = set(normalize_cell_ws(normalize_dataset(y)) for x in tasks.values() for y in x)
32-
33-
#all_metrics = set(metrics_p.keys())
34-
35-
# all_datasets_re = {x:name_to_re(x) for x in all_datasets}
36-
# all_metrics_re = {x:name_to_re(x) for x in all_metrics}
37-
#all_datasets = set(x for v in merged_p.values() for x in v)
38-
39-
# def find_names(text, names_re):
40-
# return set(name for name, name_re in names_re.items() if name_re.search(text))
41-
42-
43-
def make_trie(names):
44-
trie = ahocorasick.Automaton()
45-
for name in names:
46-
norm = name.replace(" ", "")
47-
trie.add_word(norm, (len(norm), name))
48-
trie.make_automaton()
49-
return trie
50-
51-
52-
single_letter_re = re.compile(r"\b\w\b")
53-
init_letter_re = re.compile(r"\b\w")
54-
end_letter_re = re.compile(r"\w\b")
55-
letter_re = re.compile(r"\w")
56-
57-
58-
def find_names(text, names_trie):
59-
text = text.lower()
60-
profile = letter_re.sub("i", text)
61-
profile = init_letter_re.sub("b", profile)
62-
profile = end_letter_re.sub("e", profile)
63-
profile = single_letter_re.sub("x", profile)
64-
text = text.replace(" ", "")
65-
profile = profile.replace(" ", "")
66-
s = set()
67-
for (end, (l, word)) in names_trie.iter(text):
68-
if profile[end] in ['e', 'x'] and profile[end - l + 1] in ['b', 'x']:
69-
s.add(word)
70-
return s
71-
72-
73-
all_datasets_trie = make_trie(all_datasets)
74-
all_metrics_trie = make_trie(all_metrics)
75-
all_tasks_trie = make_trie(all_tasks)
76-
77-
78-
def find_datasets(text):
79-
return find_names(text, all_datasets_trie)
80-
81-
def find_metrics(text):
82-
return find_names(text, all_metrics_trie)
83-
84-
def find_tasks(text):
85-
return find_names(text, all_tasks_trie)
16+
from sota_extractor2.models.linking import manual_dicts
8617

8718
def dummy_item(reason):
    """Return a single-row placeholder proposal.

    The dataset, task and metric columns all carry *reason* (e.g. an
    explanation of why no real proposal was produced), with empty
    evidence and zero confidence.
    """
    placeholder = dict(
        dataset=[reason],
        task=[reason],
        metric=[reason],
        evidence=[""],
        confidence=[0.0],
    )
    return pd.DataFrame(placeholder)
8920

9021

22+
class EvidenceFinder:
    """Finds mentions ("evidences") of known tasks, datasets and metrics in text.

    Evidence dictionaries map each taxonomy name to a collection of normalized
    surface forms; Aho-Corasick automatons built over those forms allow fast,
    whitespace-insensitive matching against captions and paper text.
    """

    # Regexes used to build a word-boundary "profile" of the text, so matches
    # are only accepted when they start and end on whole-word boundaries.
    single_letter_re = re.compile(r"\b\w\b")
    init_letter_re = re.compile(r"\b\w")
    end_letter_re = re.compile(r"\w\b")
    letter_re = re.compile(r"\w")

    def __init__(self, taxonomy):
        self._init_structs(taxonomy)

    @staticmethod
    def evidences_from_name(key):
        """Derive evidences from a taxonomy name: the normalized name itself,
        plus its individual words when there is more than one."""
        whole = normalize_dataset_ws(key)
        parts = whole.split()
        if len(parts) > 1:
            return [whole] + parts
        return [whole]

    @staticmethod
    def get_basic_dicts(taxonomy):
        """Build default evidence dicts for every task, dataset and metric
        present in *taxonomy*."""
        tasks = {name: [normalize_dataset_ws(name)] for name in taxonomy.tasks}
        datasets = {name: EvidenceFinder.evidences_from_name(name)
                    for name in taxonomy.datasets}
        metrics = {name: EvidenceFinder.evidences_from_name(name)
                    for name in taxonomy.metrics}
        return tasks, datasets, metrics

    @staticmethod
    def merge_evidences(target, source):
        """Extend *target*'s evidence lists in place with those of *source*."""
        for name, evidences in source.items():
            target.setdefault(name, []).extend(evidences)

    @staticmethod
    def make_trie(names):
        """Build an Aho-Corasick automaton over space-stripped names.

        Each hit payload is ``(len(stripped_name), original_name)`` so that
        match boundaries can later be checked against the text profile.
        """
        trie = ahocorasick.Automaton()
        for name in names:
            stripped = name.replace(" ", "")
            trie.add_word(stripped, (len(stripped), name))
        trie.make_automaton()
        return trie

    @staticmethod
    def find_names(text, names_trie):
        """Return the set of names from *names_trie* that occur in *text*
        on whole-word boundaries."""
        text = text.lower()
        # Per-character profile: b = word-initial, e = word-final,
        # x = single-letter word, i = word-internal.
        profile = EvidenceFinder.letter_re.sub("i", text)
        profile = EvidenceFinder.init_letter_re.sub("b", profile)
        profile = EvidenceFinder.end_letter_re.sub("e", profile)
        profile = EvidenceFinder.single_letter_re.sub("x", profile)
        # Drop spaces from text and profile in lock-step so multi-word names
        # match regardless of spacing; indices stay aligned between the two.
        text = text.replace(" ", "")
        profile = profile.replace(" ", "")
        found = set()
        for end, (length, word) in names_trie.iter(text):
            starts_on_boundary = profile[end - length + 1] in ['b', 'x']
            ends_on_boundary = profile[end] in ['e', 'x']
            if starts_on_boundary and ends_on_boundary:
                found.add(word)
        return found

    def find_datasets(self, text):
        return EvidenceFinder.find_names(text, self.all_datasets_trie)

    def find_metrics(self, text):
        return EvidenceFinder.find_names(text, self.all_metrics_trie)

    def find_tasks(self, text):
        return EvidenceFinder.find_names(text, self.all_tasks_trie)

    def _init_structs(self, taxonomy):
        """Combine taxonomy-derived and manually curated evidences, then
        build the lookup tries."""
        self.tasks, self.datasets, self.metrics = EvidenceFinder.get_basic_dicts(taxonomy)
        EvidenceFinder.merge_evidences(self.tasks, manual_dicts.tasks)
        EvidenceFinder.merge_evidences(self.datasets, manual_dicts.datasets)
        EvidenceFinder.merge_evidences(self.metrics, manual_dicts.metrics)
        # Datasets whose name mentions 'val' get validation-style evidences;
        # every other split is assumed to be a test split.
        self.datasets = {
            name: evs + (['validation', 'dev', 'development'] if 'val' in name else ['test'])
            for name, evs in self.datasets.items()
        }
        self.datasets.update({
            'LibriSpeech dev-clean': ['libri speech dev clean', 'libri speech', 'dev', 'clean', 'dev clean', 'development'],
            'LibriSpeech dev-other': ['libri speech dev other', 'libri speech', 'dev', 'other', 'dev other', 'development', 'noisy'],
        })

        # Deduplicate evidences per name.
        self.datasets = {name: set(evs) for name, evs in self.datasets.items()}
        self.metrics = {name: set(evs) for name, evs in self.metrics.items()}
        self.tasks = {name: set(evs) for name, evs in self.tasks.items()}

        self.all_datasets = set(normalize_cell_ws(normalize_dataset(ev))
                                for evs in self.datasets.values() for ev in evs)
        self.all_metrics = set(normalize_cell_ws(ev)
                               for evs in self.metrics.values() for ev in evs)
        self.all_tasks = set(normalize_cell_ws(normalize_dataset(ev))
                             for evs in self.tasks.values() for ev in evs)

        self.all_datasets_trie = EvidenceFinder.make_trie(self.all_datasets)
        self.all_metrics_trie = EvidenceFinder.make_trie(self.all_metrics)
        self.all_tasks_trie = EvidenceFinder.make_trie(self.all_tasks)
91105

92106
@njit
93107
def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, reverse_task_p,
@@ -114,17 +128,18 @@ def compute_logprobs(taxonomy, reverse_merged_p, reverse_metrics_p, reverse_task
114128

115129

116130
class ContextSearch:
117-
def __init__(self, taxonomy, context_noise=(0.5, 0.2, 0.1), metrics_noise=None, task_noise=None,
131+
def __init__(self, taxonomy, evidence_finder, context_noise=(0.5, 0.2, 0.1), metrics_noise=None, task_noise=None,
118132
ds_pb=0.001, ms_pb=0.01, ts_pb=0.01, debug_gold_df=None):
119133
merged_p = \
120-
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in datasets.items()})[1]
134+
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in evidence_finder.datasets.items()})[1]
121135
metrics_p = \
122-
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in metrics.items()})[1]
136+
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in evidence_finder.metrics.items()})[1]
123137
tasks_p = \
124-
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in tasks.items()})[1]
138+
get_probs({k: Counter([normalize_cell(normalize_dataset(x)) for x in v]) for k, v in evidence_finder.tasks.items()})[1]
125139

126140
self.queries = {}
127141
self.taxonomy = taxonomy
142+
self.evidence_finder = evidence_finder
128143
self._taxonomy = typed.List()
129144
for t in self.taxonomy.taxonomy:
130145
self._taxonomy.append(t)
@@ -158,9 +173,9 @@ def compute_context_logprobs(self, context, noise, ms_noise, ts_noise, logprobs)
158173
context = context or ""
159174
abbrvs = self.extract_acronyms(context)
160175
context = normalize_cell_ws(normalize_dataset(context))
161-
dss = set(find_datasets(context)) | set(abbrvs.keys())
162-
mss = set(find_metrics(context))
163-
tss = set(find_tasks(context))
176+
dss = set(self.evidence_finder.find_datasets(context)) | set(abbrvs.keys())
177+
mss = set(self.evidence_finder.find_metrics(context))
178+
tss = set(self.evidence_finder.find_tasks(context))
164179
dss -= mss
165180
dss -= tss
166181
dss = [normalize_cell(ds) for ds in dss]
@@ -248,7 +263,8 @@ def __call__(self, query, datasets, caption, topk=1, debug_info=None):
248263

249264
# todo: compare regex approach (old) with find_datasets(.) (current)
250265
class DatasetExtractor:
251-
def __init__(self, evidence_finder):
    """Dataset-mention extractor backed by *evidence_finder*.

    The regexes support an alternative, pattern-based way of spotting
    "the <name> ... dataset" phrases in running text.
    """
    self.evidence_finder = evidence_finder
    # Heuristic for dataset-like tokens: capitalized, camel-cased or numeric.
    self.dataset_prefix_re = re.compile(r"[A-Z]|[a-z]+[A-Z]+|[0-9]")
    self.dataset_name_re = re.compile(r"\b(the)\b\s*(?P<name>((?!(the)\b)\w+\W+){1,10}?)(test|val(\.|idation)?|dev(\.|elopment)?|train(\.|ing)?\s+)?\bdata\s*set\b", re.IGNORECASE)
254270

@@ -260,4 +276,4 @@ def from_paper(self, paper):
260276

261277
def __call__(self, text):
    """Return all dataset and task evidences found in *text*."""
    normalized = normalize_cell_ws(normalize_dataset(text))
    dataset_hits = self.evidence_finder.find_datasets(normalized)
    task_hits = self.evidence_finder.find_tasks(normalized)
    return dataset_hits | task_hits

sota_extractor2/models/linking/manual_dicts.py

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,13 @@
2626
'BLINDS-II': ['blinds-ii'],
2727
'FSIM': ['fsim'],
2828
# SEMANTIC SEGMENTATION
29-
'Mean iOU': ['miou', 'mean iou', 'mean iu'],
30-
'Pixel Accuracy': ['pixel accuracy', 'pixel acc', 'pixel acc.'],
31-
'Class iOU': ['class iou', 'iou cla.'],
32-
'Category iOU': ['cat iou', 'iou cat.'],
33-
'Class iiOU': ['class iiou', 'iiou cla.'],
34-
'Category iiOU': ['cat iiou', 'iiou cat.'],
29+
'Mean IoU': ['miou', 'mean iou', 'mean iu', 'class iou', 'iou cla', 'cla iou'],
30+
'Pixel Accuracy': ['pixel accuracy', 'pixel acc', 'pixel acc.', 'pixacc', 'pixel'],
31+
'Category IoU': ['cat iou', 'iou cat'],
32+
'class iIoU': ['class iiou', 'iiou cla'],
33+
'Category iIoU': ['cat iiou', 'iiou cat'],
34+
'Mean Accuracy': ['mean acc', 'mean', 'acc']
35+
3536
}
3637

3738
# datasets[taxonomy name] is a list of normalized evidences for taxonomy name
@@ -126,6 +127,4 @@
126127
'SUN RGB-D': ['sun rgbd', 'sunrgbd', 'sunrgb d']
127128
}
128129

129-
tasks = {
130-
'Speech Recognition': ['speech recognition']
131-
}
130+
tasks = {}

sota_extractor2/models/linking/taxonomy.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,20 @@ class Taxonomy:
88
def __init__(self, taxonomy, metrics_info):
    """Load the taxonomy and metrics info from JSON files and expose the
    distinct tasks, datasets and metrics as sets."""
    self.taxonomy = self._read_taxonomy(taxonomy)
    self.metrics_info = self._read_metrics_info(metrics_info)
    # _get_axis reads the records cached by _read_taxonomy, so it must
    # run after the call above.
    for axis in ('task', 'dataset', 'metric'):
        setattr(self, axis + 's', self._get_axis(axis))
1114

1215
def _read_json(self, path):
    """Parse the JSON file at *path* and return its content."""
    with open(path, "rt") as fp:
        raw = fp.read()
    return json.loads(raw)
1518

1619
def _read_taxonomy(self, path):
    """Read taxonomy records from *path*, cache them for ``_get_axis``,
    and return them as (task, dataset, metric) triples."""
    self._records = self._read_json(path)
    return [(record["task"], record["dataset"], record["metric"])
            for record in self._records]
22+
23+
def _get_axis(self, axis):
    """Return the distinct values of *axis* ('task', 'dataset' or
    'metric') across all cached taxonomy records."""
    return {record[axis] for record in self._records}
1925

2026
def _read_metrics_info(self, path):
2127
records = self._read_json(path)

0 commit comments

Comments
 (0)