
Commit 5a9255a

Marcin Kardas committed
Add experiments functionality
1 parent c60f382 commit 5a9255a

File tree

4 files changed (+297, -22 lines)


sota_extractor2/helpers/training.py

Lines changed: 3 additions & 2 deletions
@@ -1,8 +1,9 @@
 
-def set_seed(seed, name):
+def set_seed(seed, name, quiet=False):
     import torch
     import numpy as np
-    print(f"Setting {name} seed to {seed}")
+    if not quiet:
+        print(f"Setting {name} seed to {seed}")
     torch.manual_seed(seed)
     torch.backends.cudnn.deterministic = True
     torch.backends.cudnn.benchmark = False
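
A quick usage sketch of the new flag (illustrative values, not part of the commit). The quiet flag only suppresses the log line; the seeding itself is unchanged:

    from sota_extractor2.helpers.training import set_seed

    set_seed(42, "training")               # prints: Setting training seed to 42
    set_seed(42, "val_split", quiet=True)  # seeds the RNGs the same way, silently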

sota_extractor2/models/structure/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from ... import config
 
 def split_by_cell_content(df, seed=42, split_column="cell_content"):
-    set_seed(seed, "val_split")
+    set_seed(seed, "val_split", quiet=True)
     contents = np.random.permutation(df[split_column].unique())
     val_split = int(len(contents)*0.1)
     val_keys = contents[:val_split]
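
For context on why this split stays reproducible: with a fixed seed, np.random.permutation is deterministic, so repeated calls produce the same train/validation partition. A self-contained illustration (toy data, not from the repo):

    import numpy as np

    np.random.seed(42)
    first = np.random.permutation(["a", "b", "c", "d"])
    np.random.seed(42)
    second = np.random.permutation(["a", "b", "c", "d"])
    assert (first == second).all()  # same seed, same ordering, same split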
New file

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@
+import dataclasses
+from dataclasses import dataclass
+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from sota_extractor2.models.structure.nbsvm import *
+from sklearn.metrics import confusion_matrix
+from matplotlib import pyplot as plt
+import seaborn as sn
+from enum import Enum
+import pickle
+
+class Labels(Enum):
+    OTHER = 0
+    DATASET = 1
+    PAPER_MODEL = 2
+    COMPETING_MODEL = 3
+
+label_map = {
+    "dataset": Labels.DATASET.value,
+    "dataset-sub": Labels.DATASET.value,
+    "model-paper": Labels.PAPER_MODEL.value,
+    "model-best": Labels.PAPER_MODEL.value,
+    "model-competing": Labels.COMPETING_MODEL.value
+}
+
+@dataclass
+class Experiment:
+    vectorizer: str = "tfidf"
+    this_paper: bool = False
+    merge_fragments: bool = False
+    evidence_source: str = "text"  # "text" or "text_highlited"
+    split_btags: bool = False  # <b>Test</b> -> <b> Test </b>
+    fixed_tokenizer: bool = False  # if True, <b> and </b> are not split
+
+    class_weight: str = None
+    multinomial_type: str = "manual"  # "manual", "ovr", "multinomial"
+    solver: str = "liblinear"  # 'lbfgs' for large datasets, 'liblinear' for small ones
+    C: float = 4.0
+    dual: bool = True
+    penalty: str = "l2"
+    ngram_range: tuple = (1, 2)
+    min_df: int = 3
+    max_df: float = 0.9
+    max_iter: int = 1000
+
+    results: dict = dataclasses.field(default_factory=dict)
+
+    has_model: bool = False  # either there's already a pretrained model or it's a saved experiment with a saved model
+    name: str = None
+
+    def _get_next_exp_name(self, dir_path):
+        dir_path = Path(dir_path)
+        files = [f.name for f in dir_path.glob("*.exp.json")]
+        for i in range(100000):
+            name = f"{i:05d}.exp.json"
+            if name not in files:
+                return dir_path / name
+        raise Exception("You have too many files in this dir, really!")
+
+    def _save_model(self, path):
+        with open(path, 'wb') as f:
+            pickle.dump(self._model, f)
+
+    def _load_model(self, path):
+        with open(path, 'rb') as f:
+            self._model = pickle.load(f)
+        return self._model
+
+    def load_model(self):
+        path = self._path.parent / f"{self._path.stem}.model"
+        return self._load_model(path)
+
+    def save(self, dir_path):
+        dir_path = Path(dir_path)
+        dir_path.mkdir(exist_ok=True, parents=True)
+        filename = self._get_next_exp_name(dir_path)
+        j = dataclasses.asdict(self)
+        with open(filename, "wt") as f:
+            json.dump(j, f)
+        if hasattr(self, "_model"):
+            fn = filename.stem
+            self._save_model(dir_path / f"{fn}.model")
+        return filename.name
+
+    def to_df(self):
+        d = dataclasses.asdict(self)
+        res = d.pop("results")
+        d.update(res)
+        row = pd.DataFrame({k: [v] for k, v in d.items()})
+        return row
+
+    def new_experiment(self, **kwargs):
+        # reset these fields unless they're provided in load()
+        kwargs.setdefault("has_model", False)
+        kwargs.setdefault("results", {})
+        return dataclasses.replace(self, **kwargs)
+
+    def update_results(self, **kwargs):
+        self.results.update(**kwargs)
+
+    def get_trained_model(self, train_df):
+        nbsvm = NBSVM(experiment=self)
+        nbsvm.fit(train_df["text"], train_df["label"])
+        self._model = nbsvm
+        self.has_model = True
+        return nbsvm
+
+    def _transform_df(self, df):
+        df = df[df["cell_type"] != "table-meta"]  # otherwise we get precision 0 on the test set
+        if self.evidence_source != "text":
+            df = df.copy(True)
+            df["text"] = df[self.evidence_source]
+        if self.merge_fragments:
+            df = df.groupby(by=["ext_id", "cell_content", "cell_type", "this_paper"]).text.apply(
+                lambda x: "\n".join(x.values)).reset_index()
+        df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("")
+        if self.this_paper:
+            df = df[df.this_paper]
+        if self.split_btags:
+            df["text"] = df["text"].replace(re.compile(r"(\</?b\>)"), r" \1 ")
+        df = df.replace(re.compile(r"(xxref|xxanchor)-[\w\d-]*"), "\\1 ")
+        df = df.replace(re.compile(r"(^|[ ])\d+\.\d+(\b|%)"), " xxnum ")
+        df = df.replace(re.compile(r"(^|[ ])\d+(\b|%)"), " xxnum ")
+        df = df.replace(re.compile(r"\bdata set\b"), " dataset ")
+        df["label"] = df["cell_type"].apply(lambda x: label_map.get(x, 0))
+        df["label"] = pd.Categorical(df["label"])
+        return df
+
+    def transform_df(self, *dfs):
+        return [self._transform_df(df) for df in dfs]
+
+    def evaluate(self, model, train_df, valid_df, test_df):
+        for prefix, tdf in zip(["train", "valid", "test"], [train_df, valid_df, test_df]):
+            probs = model.predict_proba(tdf["text"])
+            preds = np.argmax(probs, axis=1)
+            true_y = tdf["label"]
+
+            m = metrics(preds, tdf.label)
+            r = {}
+            r[f"{prefix}_accuracy"] = m["accuracy"]
+            r[f"{prefix}_precision"] = m["precision"]
+            r[f"{prefix}_cm"] = confusion_matrix(true_y, preds).tolist()
+            self.update_results(**r)
+
+    def show_results(self, *ds):
+        if not len(ds):
+            ds = ["train", "valid", "test"]
+        for prefix in ds:
+            print(f"{prefix} dataset")
+            print(f" * accuracy: {self.results[f'{prefix}_accuracy']}")
+            print(f" * precision: {self.results[f'{prefix}_precision']}")
+            self._plot_confusion_matrix(np.array(self.results[f'{prefix}_cm']), normalize=True)
+
+    def _plot_confusion_matrix(self, cm, normalize):
+        if normalize:
+            cm = cm / cm.sum(axis=1)[:, None]
+        target_names = ["OTHER", "DATASET", "MODEL (paper)", "MODEL (comp.)"]
+        df_cm = pd.DataFrame(cm, index=[i for i in target_names],
+                             columns=[i for i in target_names])
+        plt.figure(figsize=(10, 10))
+        ax = sn.heatmap(df_cm,
+                        annot=True,
+                        square=True,
+                        fmt="0.2f" if normalize else "d",
+                        cmap="YlGnBu",
+                        mask=cm == 0,
+                        linecolor="black",
+                        linewidths=0.01)
+        ax.set_ylabel("True")
+        ax.set_xlabel("Predicted")
+
+    @classmethod
+    def load_all(cls, dir_path):
+        dir_path = Path(dir_path)
+        return [cls.load(f) for f in dir_path.glob("*.exp.json")]
+
+    @classmethod
+    def load(cls, path):
+        # a new field added to the class should not change
+        # the default behaviour of an experiment, so that we
+        # can load older experiments by setting missing fields
+        # to their default values
+        e = cls()
+        path = Path(path)
+        with open(path, "rt") as f:
+            j = json.load(f)
+        j["name"] = path.name
+        e = e.new_experiment(**j)
+        e._path = path
+        return e
+
+    @classmethod
+    def experiments_to_df(cls, exps):
+        dfs = [e.to_df() for e in exps]
+        df = pd.concat(dfs)
+        return df
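
Taken together, the new Experiment dataclass drives the whole configure/train/evaluate/persist loop. A minimal end-to-end sketch, under assumptions: the import path is hypothetical (the new module's file name is not shown above), train_df/valid_df/test_df are evidence DataFrames with the columns used in the code ("text", "cell_type", "cell_content", "ext_id", "this_paper"), and "experiments/structure" is an arbitrary output directory:

    # Hypothetical import path for the new module.
    from sota_extractor2.models.structure.experiment import Experiment

    e = Experiment(vectorizer="tfidf", this_paper=True, merge_fragments=True)
    train, valid, test = e.transform_df(train_df, valid_df, test_df)

    model = e.get_trained_model(train)     # fits an NBSVM configured by this experiment
    e.evaluate(model, train, valid, test)  # stores accuracy/precision/confusion matrices in e.results
    e.show_results("valid")

    name = e.save("experiments/structure")  # writes NNNNN.exp.json (and a .model pickle)

    # Later: load all saved runs and compare them in a single DataFrame.
    runs = Experiment.load_all("experiments/structure")
    summary = Experiment.experiments_to_df(runs)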

sota_extractor2/models/structure/nbsvm.py

Lines changed: 95 additions & 19 deletions
@@ -39,55 +39,115 @@ def get_number_of_classes(y):
     return y.shape[1]
 
 class NBSVM:
-    def __init__(self, solver='liblinear', dual=True, C=4, ngram_range=(1, 2)):
-        self.solver = solver  # 'lbfgs' - large, liblinear for small datasets
-        self.dual = dual
-        self.C = C
-        self.ngram_range = ngram_range
+    def __init__(self, experiment):
+        self.experiment = experiment
 
     re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
+    re_tok_fixed = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])'.replace('<', '').replace('>', '').replace('/', ''))
 
-    def tokenize(self, s):
+    def tokenize(self, s):
         return self.re_tok.sub(r' \1 ', s).split()
 
+    def tokenize_fixed(self, s):
+        return self.re_tok_fixed.sub(r' \1 ', s).split()
+
     def pr(self, y_i, y):
         p = self.trn_term_doc[y == y_i].sum(0)
         return (p+1) / ((y == y_i).sum()+1)
 
     def get_mdl(self, y):
         y = y.values
         r = np.log(self.pr(1, y) / self.pr(0, y))
-        m = LogisticRegression(C=self.C, dual=self.dual, solver=self.solver, max_iter=1000)
+        m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty,
+                               dual=self.experiment.dual, solver=self.experiment.solver,
+                               max_iter=self.experiment.max_iter)
         x_nb = self.trn_term_doc.multiply(r)
         return m.fit(x_nb, y), r
 
     def bow(self, X_train):
         self.n = X_train.shape[0]
-        self.vec = TfidfVectorizer(ngram_range=self.ngram_range, tokenizer=self.tokenize,
-                                   min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
-                                   smooth_idf=1, sublinear_tf=1)
+
+        if self.experiment.vectorizer == "tfidf":
+            self.vec = TfidfVectorizer(ngram_range=self.experiment.ngram_range,
+                                       tokenizer=self.tokenize_fixed if self.experiment.fixed_tokenizer else self.tokenize,
+                                       min_df=self.experiment.min_df, max_df=self.experiment.max_df,
+                                       strip_accents='unicode', use_idf=1,
+                                       smooth_idf=1, sublinear_tf=1)
+        elif self.experiment.vectorizer == "count":
+            self.vec = CountVectorizer(ngram_range=self.experiment.ngram_range, tokenizer=self.tokenize,
+                                       min_df=self.experiment.min_df, max_df=self.experiment.max_df,
+                                       strip_accents='unicode')
+        else:
+            raise Exception(f"Unknown vectorizer type: {self.experiment.vectorizer}")
+
         return self.vec.fit_transform(X_train)
 
     def train_models(self, y_train):
         self.models = []
-        for i in range(0, self.c):
-            print('fit', i)
-            m, r = self.get_mdl(get_class_column(y_train, i))
-            self.models.append((m, r))
+        if self.experiment.multinomial_type == "manual":
+            for i in range(0, self.c):
+                # print('fit', i)
+                m, r = self.get_mdl(get_class_column(y_train, i))
+                self.models.append((m, r))
+        elif self.experiment.multinomial_type == "multinomial":
+            m = LogisticRegression(C=self.experiment.C, penalty=self.experiment.penalty,
+                                   dual=self.experiment.dual, solver=self.experiment.solver,
+                                   max_iter=self.experiment.max_iter,
+                                   multi_class="multinomial", class_weight=self.experiment.class_weight)
+            x_nb = self.trn_term_doc
+            self.models.append(m.fit(x_nb, y_train))
+        else:
+            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")
 
     def fit(self, X_train, y_train):
         self.trn_term_doc = self.bow(X_train)
         self.c = get_number_of_classes(y_train)
         self.train_models(y_train)
 
     def predict_proba(self, X_test):
-        preds = np.zeros((len(X_test), self.c))
         test_term_doc = self.vec.transform(X_test)
-        for i in range(0, self.c):
-            m, r = self.models[i]
-            preds[:, i] = m.predict_proba(test_term_doc.multiply(r))[:, 1]
+        if self.experiment.multinomial_type == "manual":
+            preds = np.zeros((len(X_test), self.c))
+            for i in range(0, self.c):
+                m, r = self.models[i]
+                preds[:, i] = m.predict_proba(test_term_doc.multiply(r))[:, 1]
+        elif self.experiment.multinomial_type == "multinomial":
+            preds = self.models[0].predict_proba(test_term_doc)
+        else:
+            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")
         return preds
-
+
+    def sort_features_by_importance(self, label):
+        label = label.value
+        names = np.array(self.vec.get_feature_names())
+        if self.experiment.multinomial_type == "manual":
+            m, r = self.models[label]
+            f = m.coef_[0] * np.array(r[0])
+        elif self.experiment.multinomial_type == "multinomial":
+            f = self.models[0].coef_[label]
+        else:
+            raise Exception(f"Unsupported multinomial_type {self.experiment.multinomial_type}")
+        if self.experiment.vectorizer == "tfidf":
+            f *= self.vec.idf_
+        indices = f.argsort()[::-1]
+        return names[indices], f[indices]
+
+    def get_mismatched(self, df, true_label, predicted_label):
+        true_label = true_label.value
+        predicted_label = predicted_label.value
+
+        probs = self.predict_proba(df["text"])
+        preds = np.argmax(probs, axis=1)
+        true_y = df["label"]
+
+        mismatched_indices = (true_y == true_label) & (preds == predicted_label)
+        mismatched = df[mismatched_indices]
+        diff = probs[mismatched_indices, true_label] - probs[mismatched_indices, predicted_label]
+        indices = diff.argsort()
+        mismatched = mismatched.iloc[indices]
+        mismatched["pr_diff"] = diff[indices]
+        return mismatched
+
     def validate(self, X_test, y_test):
         acc = (np.argmax(self.predict_proba(X_test), axis=1) == y_test).mean()
         return acc
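
The pr/get_mdl pair above implements the NB-SVM idea: each one-vs-rest classifier is a logistic regression fit on the term-document matrix rescaled by naive-Bayes log-count ratios. A toy sketch of that computation (invented numbers, mirroring the code above):

    import numpy as np
    from scipy.sparse import csr_matrix

    X = csr_matrix(np.array([[2, 0],     # 3 documents, 2 features
                             [0, 1],
                             [1, 1]]))
    y = np.array([1, 0, 1])

    p = (X[y == 1].sum(0) + 1) / ((y == 1).sum() + 1)  # smoothed feature counts, positive class
    q = (X[y == 0].sum(0) + 1) / ((y == 0).sum() + 1)  # smoothed feature counts, negative class
    r = np.log(p / q)     # positive weight = feature is evidence for the class
    x_nb = X.multiply(r)  # the rescaled matrix that LogisticRegression is fit on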
@@ -98,10 +158,14 @@ def metrics(preds, true_y):
     acc = (p == y).mean()
     tp = ((y != 0) & (p == y)).sum()
     fp = ((p != 0) & (p != y)).sum()
+    fn = ((y != 0) & (p == 0)).sum()
+
     prec = tp / (fp + tp)
+    reca = tp / (fn + tp)
     return {
         "precision": prec,
         "accuracy": acc,
+        "recall": reca,
         "TP": tp,
         "FP": fp,
     }
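
In metrics, label 0 (OTHER) plays the role of the negative class, so the new recall counts as misses only the non-OTHER cells that were predicted as OTHER. A worked toy example (invented values):

    import numpy as np

    p = np.array([0, 1, 2, 2])  # predicted labels
    y = np.array([0, 1, 1, 0])  # true labels

    tp = ((y != 0) & (p == y)).sum()  # 1: the correctly recovered class-1 cell
    fp = ((p != 0) & (p != y)).sum()  # 2: both class-2 predictions are wrong
    fn = ((y != 0) & (p == 0)).sum()  # 0: no non-OTHER cell was predicted as OTHER
    prec = tp / (fp + tp)             # 1/3
    reca = tp / (fn + tp)             # 1.0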
@@ -130,6 +194,18 @@ def preds_for_cell_content_multi(test_df, probs, group_by=["cell_content"]):
                             'counts': grouped_counts})
     return results
 
+def preds_for_cell_content_best(test_df, probs, group_by=["cell_content"]):
+    test_df = test_df.copy()
+    probs_df = pd.DataFrame(probs, index=test_df.index)
+    test_df = pd.concat([test_df, probs_df], axis=1)
+    grouped_preds = np.argmax(test_df.groupby(
+        group_by)[probs_df.columns].sum().values, axis=1)
+    grouped_counts = test_df.groupby(group_by)["label"].count()
+    results = pd.DataFrame({'true': test_df.groupby(group_by)["label"].agg(lambda x: x.value_counts().index[0]),
+                            'pred': grouped_preds,
+                            'counts': grouped_counts})
+    return results
+
 def test_model(model, tdf):
     probs = model(tdf["text"])
     preds = np.argmax(probs, axis=1)
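
preds_for_cell_content_best sums the fragment-level probabilities within each group before taking the argmax, so all evidence fragments for the same cell content vote jointly. A toy sketch (invented data; assumes the function is imported from the module above):

    import numpy as np
    import pandas as pd
    from sota_extractor2.models.structure.nbsvm import preds_for_cell_content_best

    test_df = pd.DataFrame({"cell_content": ["41.2", "41.2", "ours"],
                            "label": [1, 1, 2]})
    probs = np.array([[0.2, 0.7, 0.1],   # two fragments of the "41.2" cell
                      [0.1, 0.6, 0.3],
                      [0.0, 0.1, 0.9]])  # one fragment of the "ours" cell

    res = preds_for_cell_content_best(test_df, probs)
    # res["pred"] -> [1, 2]: the "41.2" fragments pool to [0.3, 1.3, 0.4]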
