Commit 7ed52f9

visualization: add register labels

1 parent c8032af

2 files changed, 137 insertions(+), 97 deletions(-)

ac_dc/visualization/get_data_for_visualization.py (8 additions, 5 deletions)

@@ -53,7 +53,10 @@ def compute_stats(self):
         stats_document = {}
 
         try:
-            document = next(dataset)["text"]
+            example = next(dataset)
+            document = example["text"]
+            if "labels" in example:
+                stats_document["labels"] = example["labels"]
 
             words = ModifyingDocuments.get_words_from_document(
                 document,
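The net effect of this hunk is that `compute_stats` now keeps the whole example dict instead of immediately indexing `["text"]`, so optional register labels survive into `stats_document`. A minimal sketch of the pattern, with made-up documents standing in for the real iterator:

```python
# Sketch only: `dataset` is assumed to be an iterator over dicts, as in the
# script. The first dict mimics a TurkuNLP/register_oscar example, the second
# a plain OSCAR example without labels.
dataset = iter(
    [
        {"text": "Once upon a time ...", "labels": ["NA"]},
        {"text": "A plain OSCAR document ..."},
    ]
)

stats_document = {}
example = next(dataset)
document = example["text"]
if "labels" in example:
    # Carry the register labels along with the computed statistics so the
    # Streamlit app can later filter documents by label.
    stats_document["labels"] = example["labels"]
```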
@@ -164,17 +167,17 @@ def compute_stats(self):
 
 if __name__ == "__main__":
 
-    dataset_name = "oscar"
-    config_name = "unshuffled_deduplicated_en"
-    data_files = None
+    dataset_name = "TurkuNLP/register_oscar"  # "oscar"
+    config_name = None  # "unshuffled_deduplicated_en"
+    data_files = "en/en_00000.jsonl.gz"  # None
     split = "train"
     num_iter = 15000
 
     lang_dataset_id = "en"
     path_fasttext_model = "ac_dc/lid.176.bin"
     path_sentencepiece_model = f"ac_dc/en.sp.model"
     path_kenlm_model = f"ac_dc/en.arpa.bin"
-    path_save_stats = f"ac_dc/visualization/en_examples_with_stats.json"
+    path_save_stats = f"ac_dc/visualization/en_examples_with_stats_register.json"
 
     dataset = load_dataset(
         dataset_name,
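For reference, the new `__main__` configuration points `load_dataset` at a single gzipped JSONL shard of TurkuNLP/register_oscar instead of the `unshuffled_deduplicated_en` OSCAR config. A hedged sketch of a quick sanity check; `streaming=True` and the `"labels"` field are assumptions consistent with how the script calls `next(dataset)`:

```python
from datasets import load_dataset

# Assumed setup: streaming, so the dataset behaves as an iterator,
# matching the next(dataset) call in compute_stats above.
dataset = load_dataset(
    "TurkuNLP/register_oscar",
    data_files="en/en_00000.jsonl.gz",
    split="train",
    streaming=True,
)
example = next(iter(dataset))
print(example["text"][:80])
print(example.get("labels"))  # expected: a list of register codes, e.g. ["NA"]
```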

ac_dc/visualization/visualization.py (129 additions, 92 deletions)

@@ -128,8 +128,6 @@ def display_dataset(dataframe, cond, description, type_of_examples):
         st.dataframe(displayed_examples)
 
     def filtering_of_docs(self):
-        st.sidebar.subheader("Parameters of the filtering on documents")
-
         def set_sliders():
             columns = list(self.docs)
             keys = []
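Context for the hunk below: `set_sliders` returns `conds`, a dict mapping each filter key to a list of boolean arrays, one per slider, each of length `len(self.docs)`. A toy illustration with made-up arrays of how those conditions are flattened and ANDed, exactly as in the `all_conds` lines being moved:

```python
import numpy as np

# Toy data: two filters, one boolean array each, over four documents.
conds = {
    "number_words": [np.array([True, True, False, True])],
    "perplexity_score": [np.array([True, False, False, True])],
}

# Flatten every per-slider condition, then AND them document-wise.
all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
all_conds = np.all(all_conds, axis=0)

print(all_conds)             # [ True False False  True] -> retained documents
print(np.invert(all_conds))  # [False  True  True False] -> discarded documents
```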
@@ -385,114 +383,153 @@ def get_cond(key, cutoff, max_cutoff):
 
             return keys, conds
 
-        self.keys, conds = set_sliders()
-        self.parameters = self.keys * 1
-
-        all_conds = [subcond for cond in list(conds.values()) for subcond in cond]
-        all_conds = np.all(all_conds, axis=0)
-
         with st.expander(
             f"Filtering on documents, for {self.num_docs} {self.lang} documents"
         ):
             st.header(
                 f"Filtering on documents, for {self.num_docs} {self.lang} documents"
             )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, np.invert(all_conds), "Discarded documents", "docs"
-            )
+            if "labels" in list(self.docs):
+                chosen_label = st.selectbox(
+                    label="Consider only documents that include the following label",
+                    options=[
+                        "All",
+                        "NA: Narrative",
+                        "IN: Informational Description",
+                        "OP: Opinion",
+                        "ID: Interactive Discussion",
+                        "HI: How-to/Instruction",
+                        "IP: Informational Persuasion",
+                        "LY: Lyrical",
+                        "SP: Spoken",
+                    ],
+                )
+                chosen_label = chosen_label.split(":")[0]
+                if chosen_label != "All":
+                    cond_label = list(
+                        self.docs["labels"].apply(
+                            lambda x: True if chosen_label in x else False
+                        )
+                    )
+                    self.docs = self.docs[cond_label]
 
-            # st.subheader("Display discarded documents by filter")
-            display_discarded_documents_by_filter = st.checkbox(
-                "Display discarded documents by filter"
-            )
+            if self.docs.empty:
+                st.markdown(
+                    "No document to display, please try to select a different label."
+                )
+                self.keys = []
+                self.parameters = []
 
-            if display_discarded_documents_by_filter:
-                columns = list(self.docs)
+            else:
+                st.sidebar.subheader("Parameters of the filtering on documents")
+                self.keys, conds = set_sliders()
+                self.parameters = self.keys * 1
 
-                if "number_words" in columns:
-                    cond_filter = np.invert(np.all(conds["number_words"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the number of words",
-                        "docs",
-                    )
+                all_conds = [
+                    subcond for cond in list(conds.values()) for subcond in cond
+                ]
+                all_conds = np.all(all_conds, axis=0)
 
-                if "character_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["character_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the character repetition ratio",
-                        "docs",
-                    )
+                Visualization_for_lang.display_dataset(
+                    self.docs, np.invert(all_conds), "Discarded documents", "docs"
+                )
 
-                if "word_repetition_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["word_repetition_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the word repetition ratio",
-                        "docs",
-                    )
+                # st.subheader("Display discarded documents by filter")
+                display_discarded_documents_by_filter = st.checkbox(
+                    "Display discarded documents by filter"
+                )
 
-                if "special_characters_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["special_characters_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the special characters ratio",
-                        "docs",
-                    )
+                if display_discarded_documents_by_filter:
+                    columns = list(self.docs)
 
-                if "stopwords_ratio" in columns:
-                    cond_filter = np.invert(np.all(conds["stopwords_ratio"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the stop words ratio",
-                        "docs",
-                    )
+                    if "number_words" in columns:
+                        cond_filter = np.invert(np.all(conds["number_words"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the number of words",
+                            "docs",
+                        )
 
-                if "flagged_words_ratio" in columns:
-                    cond_filter = np.invert(
-                        np.all(conds["flagged_words_ratio"], axis=0)
-                    )
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the flagged words ratio",
-                        "docs",
-                    )
+                    if "character_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["character_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the character repetition ratio",
+                            "docs",
+                        )
 
-                if "lang_id_score" in columns:
-                    cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the language identification confidence score",
-                        "docs",
-                    )
+                    if "word_repetition_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["word_repetition_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the word repetition ratio",
+                            "docs",
+                        )
 
-                if "perplexity_score" in columns:
-                    cond_filter = np.invert(np.all(conds["perplexity_score"], axis=0))
-                    Visualization_for_lang.display_dataset(
-                        self.docs,
-                        cond_filter,
-                        "Discarded documents for the filter on the perplexity score",
-                        "docs",
-                    )
+                    if "special_characters_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["special_characters_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the special characters ratio",
+                            "docs",
+                        )
 
-            Visualization_for_lang.display_dataset(
-                self.docs, all_conds, "Retained documents", "docs"
-            )
+                    if "stopwords_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["stopwords_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the stop words ratio",
+                            "docs",
+                        )
+
+                    if "flagged_words_ratio" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["flagged_words_ratio"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the flagged words ratio",
+                            "docs",
+                        )
+
+                    if "lang_id_score" in columns:
+                        cond_filter = np.invert(np.all(conds["lang_id_score"], axis=0))
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the language identification confidence score",
+                            "docs",
+                        )
+
+                    if "perplexity_score" in columns:
+                        cond_filter = np.invert(
+                            np.all(conds["perplexity_score"], axis=0)
+                        )
+                        Visualization_for_lang.display_dataset(
+                            self.docs,
+                            cond_filter,
+                            "Discarded documents for the filter on the perplexity score",
+                            "docs",
+                        )
+
+                Visualization_for_lang.display_dataset(
+                    self.docs, all_conds, "Retained documents", "docs"
+                )
 
         st.header("Download data")
 
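The core of the new feature is the label filter at the top of the hunk: the selectbox value is reduced to its register code, and rows whose `labels` list lacks that code are dropped before the sliders run. A self-contained toy example of that pandas logic (the rows are made up):

```python
import pandas as pd

# Made-up documents; each "labels" cell holds a list of register codes.
docs = pd.DataFrame(
    {
        "text": ["a short story", "a how-to guide", "a forum thread"],
        "labels": [["NA"], ["HI"], ["ID", "OP"]],
    }
)

chosen_label = "ID: Interactive Discussion"  # value as returned by st.selectbox
chosen_label = chosen_label.split(":")[0]    # keep only the code, "ID"

if chosen_label != "All":
    cond_label = list(docs["labels"].apply(lambda x: chosen_label in x))
    docs = docs[cond_label]

print(docs)  # only the forum thread remains
```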