Skip to content

Commit 7de755c

Browse files
committed
Add metric class and empty evidences
1 parent 1db8c1f commit 7de755c

File tree

2 files changed

+50
-17
lines changed

2 files changed

+50
-17
lines changed

sota_extractor2/data/structure.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ def clear_cell(s):
3232
return s
3333

3434

35+
def empty_fragment(paper_id):
36+
fragment = Fragment(paper_id=paper_id)
37+
fragment.meta['highlight'] = {'text': ['']}
38+
return fragment
39+
40+
3541
def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpus_limit=10):
3642
cell_content = clear_cell(cell_content)
3743
if cell_content == "" and cell_reference == "":
@@ -61,6 +67,8 @@ def fetch_evidence(cell_content, cell_reference, paper_id, paper_limit=10, corpu
6167
.query('match_phrase', text=query)[:corpus_limit])
6268
if not len(paper_fragments) and not len(reference_fragments) and not len(other_fagements):
6369
print(f"No evidences for '{cell_content}' of {paper_id}")
70+
if not len(paper_fragments) and not len(reference_fragments):
71+
paper_fragments = [empty_fragment(paper_id)]
6472
return paper_fragments + reference_fragments + other_fagements
6573

6674
fix_refs_re = re.compile('\(\?\)|\s[?]+(\s|$)')
@@ -124,12 +132,13 @@ def get_limits(cell_type):
124132

125133

126134
def prepare_data(tables, csv_path, limit_type='interesting'):
127-
df = pd.concat([evidence_for_table(table,
135+
df = pd.concat([evidence_for_table(table,
128136
paper_limit=100,
129137
corpus_limit=20,
130138
limit_type=limit_type) for table in progress_bar(tables)])
131-
df = df.drop_duplicates(
132-
["cell_content", "text_highlited", "cell_type", "this_paper"])
139+
#moved to experiment preprocessing
140+
#df = df.drop_duplicates(
141+
# ["cell_content", "text_highlited", "cell_type", "this_paper"])
133142
print("Number of text fragments ", len(df))
134143
csv_path.parent.mkdir(parents=True, exist_ok=True)
135144
df.to_csv(csv_path, index=None)

sota_extractor2/models/structure/experiment.py

Lines changed: 38 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,15 @@ class Labels(Enum):
1616
DATASET=1
1717
PAPER_MODEL=2
1818
COMPETING_MODEL=3
19+
METRIC=4
1920

2021
label_map = {
2122
"dataset": Labels.DATASET.value,
2223
"dataset-sub": Labels.DATASET.value,
2324
"model-paper": Labels.PAPER_MODEL.value,
2425
"model-best": Labels.PAPER_MODEL.value,
25-
"model-competing": Labels.COMPETING_MODEL.value
26+
"model-competing": Labels.COMPETING_MODEL.value,
27+
"dataset-metric": Labels.METRIC.value
2628
}
2729

2830
# put here to avoid recompiling, used only in _limit_context
@@ -43,6 +45,9 @@ class Experiment:
4345
context_tokens: int = None # max. number of words before <b> and after </b>
4446
analyzer: str = "word" # "char", "word" or "char_wb"
4547
lowercase: bool = True
48+
remove_num: bool = True
49+
drop_duplicates: bool = True
50+
mark_this_paper: bool = False
4651

4752
class_weight: str = None
4853
multinomial_type: str = "manual" # "manual", "ovr", "multinomial"
@@ -142,6 +147,8 @@ def _limit_context(self, text):
142147
def _transform_df(self, df):
143148
if self.merge_type not in ["concat", "vote_maj", "vote_avg", "vote_max"]:
144149
raise Exception(f"merge_type must be one of concat, vote_maj, vote_avg, vote_max, but {self.merge_type} was given")
150+
if self.mark_this_paper and (self.merge_type != "concat" or self.this_paper):
151+
raise Exception("merge_type must be 'concat' and this_paper must be false")
145152
#df = df[df["cell_type"] != "table-meta"] # otherwise we get precision 0 on test set
146153
if self.evidence_limit is not None:
147154
df = df.groupby(by=["ext_id", "this_paper"]).head(self.evidence_limit)
@@ -154,14 +161,25 @@ def _transform_df(self, df):
154161
df["text"] = df[self.evidence_source].replace(re.compile("<b>.*?</b>"), " xxmask ")
155162
else:
156163
df["text"] = df[self.evidence_source]
157-
158164
elif self.mask:
159165
raise Exception("Masking with evidence_source='text' makes no sense")
160-
if not self.fixed_this_paper:
166+
167+
if self.mark_this_paper:
168+
df = df.groupby(by=["ext_id", "cell_content", "cell_type", "this_paper"]).text.apply(
169+
lambda x: "\n".join(x.values)).reset_index()
170+
this_paper_map = {
171+
True: "this paper",
172+
False: "other paper"
173+
}
174+
df.text = "xxfld 3 " + df.this_paper.apply(this_paper_map.get) + " " + df.text
175+
df = df.groupby(by=["ext_id", "cell_content", "cell_type"]).text.apply(
176+
lambda x: " ".join(x.values)).reset_index()
177+
elif not self.fixed_this_paper:
161178
if self.merge_fragments and self.merge_type == "concat":
162179
df = df.groupby(by=["ext_id", "cell_content", "cell_type", "this_paper"]).text.apply(
163180
lambda x: "\n".join(x.values)).reset_index()
164-
df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("")
181+
if self.drop_duplicates:
182+
df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("")
165183
if self.this_paper:
166184
df = df[df.this_paper]
167185
else:
@@ -170,13 +188,15 @@ def _transform_df(self, df):
170188
if self.merge_fragments and self.merge_type == "concat":
171189
df = df.groupby(by=["ext_id", "cell_content", "cell_type"]).text.apply(
172190
lambda x: "\n".join(x.values)).reset_index()
173-
df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("")
191+
if self.drop_duplicates:
192+
df = df.drop_duplicates(["text", "cell_content", "cell_type"]).fillna("")
174193

175194
if self.split_btags:
176195
df["text"] = df["text"].replace(re.compile(r"(\</?b\>)"), r" \1 ")
177196
df = df.replace(re.compile(r"(xxref|xxanchor)-[\w\d-]*"), "\\1 ")
178-
df = df.replace(re.compile(r"(^|[ ])\d+\.\d+(\b|%)"), " xxnum ")
179-
df = df.replace(re.compile(r"(^|[ ])\d+(\b|%)"), " xxnum ")
197+
if self.remove_num:
198+
df = df.replace(re.compile(r"(^|[ ])\d+\.\d+(\b|%)"), " xxnum ")
199+
df = df.replace(re.compile(r"(^|[ ])\d+(\b|%)"), " xxnum ")
180200
df = df.replace(re.compile(r"\bdata set\b"), " dataset ")
181201
df["label"] = df["cell_type"].apply(lambda x: label_map.get(x, 0))
182202
df["label"] = pd.Categorical(df["label"])
@@ -193,6 +213,7 @@ def _set_results(self, prefix, preds, true_y):
193213
r = {}
194214
r[f"{prefix}_accuracy"] = m["accuracy"]
195215
r[f"{prefix}_precision"] = m["precision"]
216+
r[f"{prefix}_recall"] = m["recall"]
196217
r[f"{prefix}_cm"] = confusion_matrix(true_y, preds).tolist()
197218
self.update_results(**r)
198219

@@ -214,26 +235,29 @@ def evaluate(self, model, train_df, valid_df, test_df):
214235
true_y = tdf["label"]
215236
self._set_results(prefix, preds, true_y)
216237

217-
def show_results(self, *ds):
238+
def show_results(self, *ds, normalize=True):
218239
if not len(ds):
219240
ds = ["train", "valid", "test"]
220241
for prefix in ds:
221242
print(f"{prefix} dataset")
222-
print(f" * accuracy: {self.results[f'{prefix}_accuracy']}")
223-
print(f" * precision: {self.results[f'{prefix}_precision']}")
224-
self._plot_confusion_matrix(np.array(self.results[f'{prefix}_cm']), normalize=True)
243+
print(f" * accuracy: {self.results[f'{prefix}_accuracy']:.3f}")
244+
print(f" * μ-precision: {self.results[f'{prefix}_precision']:.3f}")
245+
print(f" * μ-recall: {self.results[f'{prefix}_recall']:.3f}")
246+
self._plot_confusion_matrix(np.array(self.results[f'{prefix}_cm']), normalize=normalize)
225247

226-
def _plot_confusion_matrix(self, cm, normalize):
248+
def _plot_confusion_matrix(self, cm, normalize, fmt=None):
227249
if normalize:
228250
cm = cm / cm.sum(axis=1)[:, None]
229-
target_names = ["OTHER", "DATASET", "MODEL (paper)", "MODEL (comp.)"]
251+
if fmt is None:
252+
fmt = "0.2f" if normalize else "d"
253+
target_names = ["OTHER", "DATASET", "MODEL (paper)", "MODEL (comp.)", "METRIC"]
230254
df_cm = pd.DataFrame(cm, index=[i for i in target_names],
231255
columns=[i for i in target_names])
232256
plt.figure(figsize=(10, 10))
233257
ax = sn.heatmap(df_cm,
234258
annot=True,
235259
square=True,
236-
fmt="0.2f" if normalize else "d",
260+
fmt=fmt,
237261
cmap="YlGnBu",
238262
mask=cm == 0,
239263
linecolor="black",

0 commit comments

Comments
 (0)