Commit 7ed52f9

visualization: add register labels

1 parent c8032af

2 files changed, 137 insertions(+), 97 deletions(-)

ac_dc/visualization/get_data_for_visualization.py (8 additions, 5 deletions)

@@ -53,7 +53,10 @@ def compute_stats(self):
         stats_document = {}
 
         try:
-            document = next(dataset)["text"]
+            example = next(dataset)
+            document = example["text"]
+            if "labels" in example:
+                stats_document["labels"] = example["labels"]
 
             words = ModifyingDocuments.get_words_from_document(
                 document,
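The net effect of this hunk is that `compute_stats` now keeps the whole example dict instead of immediately indexing `["text"]`, so optional register labels survive into `stats_document`. A minimal sketch of the pattern, with made-up documents standing in for the real iterator:

```python
# Sketch only: `dataset` is assumed to be an iterator over dicts, as in the
# script. The first dict mimics a TurkuNLP/register_oscar example, the second
# a plain OSCAR example without labels.
dataset = iter(
    [
        {"text": "Once upon a time ...", "labels": ["NA"]},
        {"text": "A plain OSCAR document ..."},
    ]
)

stats_document = {}
example = next(dataset)
document = example["text"]
if "labels" in example:
    # Carry the register labels along with the computed statistics so the
    # Streamlit app can later filter documents by label.
    stats_document["labels"] = example["labels"]
```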
@@ -164,17 +167,17 @@ def compute_stats(self):
 
 if __name__ == "__main__":
 
-    dataset_name = "oscar"
-    config_name = "unshuffled_deduplicated_en"
-    data_files = None
+    dataset_name = "TurkuNLP/register_oscar"  # "oscar"
+    config_name = None  # "unshuffled_deduplicated_en"
+    data_files = "en/en_00000.jsonl.gz"  # None
     split = "train"
     num_iter = 15000
 
     lang_dataset_id = "en"
     path_fasttext_model = "ac_dc/lid.176.bin"
     path_sentencepiece_model = f"ac_dc/en.sp.model"
     path_kenlm_model = f"ac_dc/en.arpa.bin"
-    path_save_stats = f"ac_dc/visualization/en_examples_with_stats.json"
+    path_save_stats = f"ac_dc/visualization/en_examples_with_stats_register.json"
 
     dataset = load_dataset(
         dataset_name,
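For reference, the new `__main__` configuration points `load_dataset` at a single gzipped JSONL shard of TurkuNLP/register_oscar instead of the `unshuffled_deduplicated_en` OSCAR config. A hedged sketch of a quick sanity check; `streaming=True` and the `"labels"` field are assumptions consistent with how the script calls `next(dataset)`:

```python
from datasets import load_dataset

# Assumed setup: streaming, so the dataset behaves as an iterator,
# matching the next(dataset) call in compute_stats above.
dataset = load_dataset(
    "TurkuNLP/register_oscar",
    data_files="en/en_00000.jsonl.gz",
    split="train",
    streaming=True,
)
example = next(iter(dataset))
print(example["text"][:80])
print(example.get("labels"))  # expected: a list of register codes, e.g. ["NA"]
```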

ac_dc/visualization/visualization.py (129 additions, 92 deletions)

@@ -128,8 +128,6 @@ def display_dataset(dataframe, cond, description, type_of_examples):
         st.dataframe(displayed_examples)
 
     def filtering_of_docs(self):
-        st.sidebar.subheader("Parameters of the filtering on documents")
-
         def set_sliders():
             columns = list(self.docs)
             keys = []
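Context for the hunk below: `set_sliders` returns `conds`, a dict mapping each filter key to a list of boolean arrays, one per slider, each of length `len(self.docs)`. A toy illustration with made-up arrays of how those conditions are flattened and ANDed, exactly as in the `all_conds` lines being moved:

```python
import numpy as np

# Toy data: two filters, one boolean array each, over four documents.
conds = {
    "number_words": [np.array([True, True, False, True])],
    "perplexity_score": [np.array([True, False, False, True])],
}

# Flatten every per-slider condition, then AND them document-wise.
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
all_conds = np.all(all_conds, axis=0)

print(all_conds)             # [ True False False  True] -> retained documents
print(np.invert(all_conds))  # [False  True  True False] -> discarded documents
```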
@@ -385,114 +383,153 @@ def get_cond(key, cutoff, max_cutoff):
 
             return keys, conds
 
-        self.keys, conds = set_sliders()
-        self.parameters = self.keys * 1
-
-        all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
-        all_conds = np.all(all_conds, axis=0)
-
         with st.expander(
             f"Filtering on documents, for {self.num_docs} {self.lang} documents"
         ):
             st.header(
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, np.invert(all_conds), "Discarded documents", "docs"
-            )
+            if "labels" in list(self.docs):
+                chosen_label = st.selectbox(
+                    label="Consider only documents that include the following label",
+                    options=[
+                        "All",
+                        "NA: Narrative",
+                        "IN: Informational Description",
+                        "OP: Opinion",
+                        "ID: Interactive Discussion",
+                        "HI: How-to/Instruction",
+                        "IP: Informational Persuasion",
+                        "LY: Lyrical",
+                        "SP: Spoken",
+                    ],
+                )
+                chosen_label = chosen_label.split(":")[0]
+                if chosen_label != "All":
+                    cond_label = list(
+                        self.docs["labels"].apply(
+                            lambda x: True if chosen_label in x else False
+                        )
+                    )
+                    self.docs = self.docs[cond_label]
 
-            # st.subheader("Display discarded documents by filter")
-            display_discarded_documents_by_filter = st.checkbox(
-                "Display discarded documents by filter"
-            )
+            if self.docs.empty:
+                st.markdown(
+                    "No document to display, please try to select a different label."
+                )
+                self.keys = []
+                self.parameters = []
 
-            if display_discarded_documents_by_filter:
-                columns = list(self.docs)
+            else:
+                st.sidebar.subheader("Parameters of the filtering on documents")
+                self.keys, conds = set_sliders()
+                self.parameters = self.keys * 1
 
-                if "number_words" in columns:
-                    cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the number of words",
-                        "docs",
-                    )
+                all_conds = [
+                    subcond for cond in list(conds.values()) for subcond in cond
+                ]
+                all_conds = np.all(all_conds, axis=0)
 
-                if "character_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["character_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the character repetition ratio",
-                        "docs",
-                    )
+                Visualization_for_lang.display_dataset(
+                    self.docs, np.invert(all_conds), "Discarded documents", "docs"
+                )
 
-                if "word_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["word_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the word repetition ratio",
-                        "docs",
-                    )
+                # st.subheader("Display discarded documents by filter")
+                display_discarded_documents_by_filter = st.checkbox(
+                    "Display discarded documents by filter"
+                )
 
-                if "special_characters_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["special_characters_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the special characters ratio",
-                        "docs",
-                    )
+                if display_discarded_documents_by_filter:
+                    columns = list(self.docs)
 
-                if "stopwords_ratio" in columns:
-                    cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the stop words ratio",
-                        "docs",
-                    )
+                    if "number_words" in columns:
+                        cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the number of words",
+                            "docs",
+                        )
 
-                if "flagged_words_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["flagged_words_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the flagged words ratio",
-                        "docs",
-                    )
+                    if "character_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["character_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the character repetition ratio",
+                            "docs",
+                        )
 
-                if "lang_id_score" in columns:
-                    cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the language identification confidence score",
-                        "docs",
-                    )
+                    if "word_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["word_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the word repetition ratio",
+                            "docs",
+                        )
 
-                if "perplexity_score" in columns:
-                    cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the perplexity score",
-                        "docs",
-                    )
+                    if "special_characters_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["special_characters_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the special characters ratio",
+                            "docs",
+                        )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, all_conds, "Retained documents", "docs"
-            )
+                    if "stopwords_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["stopwords_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the stop words ratio",
+                            "docs",
+                        )
+
+                    if "flagged_words_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["flagged_words_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the flagged words ratio",
+                            "docs",
+                        )
+
+                    if "lang_id_score" in columns:
+                        cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the language identification confidence score",
+                            "docs",
+                        )
+
+                    if "perplexity_score" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["perplexity_score"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the perplexity score",
+                            "docs",
+                        )
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, all_conds, "Retained documents", "docs"
+                )
 
         st.header("Download data")
 
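The core of the new feature is the label filter at the top of the hunk: the selectbox value is reduced to its register code, and rows whose `labels` list lacks that code are dropped before the sliders run. A self-contained toy example of that pandas logic (the rows are made up):

```python
import pandas as pd

# Made-up documents; each "labels" cell holds a list of register codes.
docs = pd.DataFrame(
    {
        "text": ["a short story", "a how-to guide", "a forum thread"],
        "labels": [["NA"], ["HI"], ["ID", "OP"]],
    }
)

chosen_label = "ID: Interactive Discussion"  # value as returned by st.selectbox
chosen_label = chosen_label.split(":")[0]    # keep only the code, "ID"

if chosen_label != "All":
    cond_label = list(docs["labels"].apply(lambda x: chosen_label in x))
    docs = docs[cond_label]

print(docs)  # only the forum thread remains
```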