@@ -128,8 +128,6 @@ def display_dataset(dataframe, cond, description, type_of_examples):
128128 st .dataframe (displayed_examples )
129129
130130 def filtering_of_docs (self ):
131- st .sidebar .subheader ("Parameters of the filtering on documents" )
132-
133131 def set_sliders ():
134132 columns = list (self .docs )
135133 keys = []
@@ -385,114 +383,153 @@ def get_cond(key, cutoff, max_cutoff):
385383
386384 return keys , conds
387385
388- self .keys , conds = set_sliders ()
389- self .parameters = self .keys * 1
390-
391- all_conds = [subcond for cond in list (conds .values ()) for subcond in cond ]
392- all_conds = np .all (all_conds , axis = 0 )
393-
394386 with st .expander (
395387 f"Filtering on documents, for { self .num_docs } { self .lang } documents"
396388 ):
397389 st .header (
398390 f"Filtering on documents, for { self .num_docs } { self .lang } documents"
399391 )
400392
401- Visualization_for_lang .display_dataset (
402- self .docs , np .invert (all_conds ), "Discarded documents" , "docs"
403- )
393+ if "labels" in list (self .docs ):
394+ chosen_label = st .selectbox (
395+ label = "Consider only documents that include the following label" ,
396+ options = [
397+ "All" ,
398+ "NA: Narrative" ,
399+ "IN: Informational Description" ,
400+ "OP: Opinion" ,
401+ "ID: Interactive Discussion" ,
402+ "HI: How-to/Instruction" ,
403+ "IP: Informational Persuasion" ,
404+ "LY: Lyrical" ,
405+ "SP: Spoken" ,
406+ ],
407+ )
408+ chosen_label = chosen_label .split (":" )[0 ]
409+ if chosen_label != "All" :
410+ cond_label = list (
411+ self .docs ["labels" ].apply (
412+ lambda x : True if chosen_label in x else False
413+ )
414+ )
415+ self .docs = self .docs [cond_label ]
404416
405- # st.subheader("Display discarded documents by filter")
406- display_discarded_documents_by_filter = st .checkbox (
407- "Display discarded documents by filter"
408- )
417+ if self .docs .empty :
418+ st .markdown (
419+ "No document to display, please try to select a different label."
420+ )
421+ self .keys = []
422+ self .parameters = []
409423
410- if display_discarded_documents_by_filter :
411- columns = list (self .docs )
424+ else :
425+ st .sidebar .subheader ("Parameters of the filtering on documents" )
426+ self .keys , conds = set_sliders ()
427+ self .parameters = self .keys * 1
412428
413- if "number_words" in columns :
414- cond_filter = np .invert (np .all (conds ["number_words" ], axis = 0 ))
415- Visualization_for_lang .display_dataset (
416- self .docs ,
417- cond_filter ,
418- "Discarded documents for the filter on the number of words" ,
419- "docs" ,
420- )
429+ all_conds = [
430+ subcond for cond in list (conds .values ()) for subcond in cond
431+ ]
432+ all_conds = np .all (all_conds , axis = 0 )
421433
422- if "character_repetition_ratio" in columns :
423- cond_filter = np .invert (
424- np .all (conds ["character_repetition_ratio" ], axis = 0 )
425- )
426- Visualization_for_lang .display_dataset (
427- self .docs ,
428- cond_filter ,
429- "Discarded documents for the filter on the character repetition ratio" ,
430- "docs" ,
431- )
434+ Visualization_for_lang .display_dataset (
435+ self .docs , np .invert (all_conds ), "Discarded documents" , "docs"
436+ )
432437
433- if "word_repetition_ratio" in columns :
434- cond_filter = np .invert (
435- np .all (conds ["word_repetition_ratio" ], axis = 0 )
436- )
437- Visualization_for_lang .display_dataset (
438- self .docs ,
439- cond_filter ,
440- "Discarded documents for the filter on the word repetition ratio" ,
441- "docs" ,
442- )
438+ # st.subheader("Display discarded documents by filter")
439+ display_discarded_documents_by_filter = st .checkbox (
440+ "Display discarded documents by filter"
441+ )
443442
444- if "special_characters_ratio" in columns :
445- cond_filter = np .invert (
446- np .all (conds ["special_characters_ratio" ], axis = 0 )
447- )
448- Visualization_for_lang .display_dataset (
449- self .docs ,
450- cond_filter ,
451- "Discarded documents for the filter on the special characters ratio" ,
452- "docs" ,
453- )
443+ if display_discarded_documents_by_filter :
444+ columns = list (self .docs )
454445
455- if "stopwords_ratio" in columns :
456- cond_filter = np .invert (np .all (conds ["stopwords_ratio" ], axis = 0 ))
457- Visualization_for_lang .display_dataset (
458- self .docs ,
459- cond_filter ,
460- "Discarded documents for the filter on the stop words ratio" ,
461- "docs" ,
462- )
446+ if "number_words" in columns :
447+ cond_filter = np .invert (np .all (conds ["number_words" ], axis = 0 ))
448+ Visualization_for_lang .display_dataset (
449+ self .docs ,
450+ cond_filter ,
451+ "Discarded documents for the filter on the number of words" ,
452+ "docs" ,
453+ )
463454
464- if "flagged_words_ratio" in columns :
465- cond_filter = np .invert (
466- np .all (conds ["flagged_words_ratio" ], axis = 0 )
467- )
468- Visualization_for_lang .display_dataset (
469- self .docs ,
470- cond_filter ,
471- "Discarded documents for the filter on the flagged words ratio" ,
472- "docs" ,
473- )
455+ if "character_repetition_ratio" in columns :
456+ cond_filter = np .invert (
457+ np .all (conds ["character_repetition_ratio" ], axis = 0 )
458+ )
459+ Visualization_for_lang .display_dataset (
460+ self .docs ,
461+ cond_filter ,
462+ "Discarded documents for the filter on the character repetition ratio" ,
463+ "docs" ,
464+ )
474465
475- if "lang_id_score" in columns :
476- cond_filter = np .invert (np .all (conds ["lang_id_score" ], axis = 0 ))
477- Visualization_for_lang .display_dataset (
478- self .docs ,
479- cond_filter ,
480- "Discarded documents for the filter on the language identification confidence score" ,
481- "docs" ,
482- )
466+ if "word_repetition_ratio" in columns :
467+ cond_filter = np .invert (
468+ np .all (conds ["word_repetition_ratio" ], axis = 0 )
469+ )
470+ Visualization_for_lang .display_dataset (
471+ self .docs ,
472+ cond_filter ,
473+ "Discarded documents for the filter on the word repetition ratio" ,
474+ "docs" ,
475+ )
483476
484- if "perplexity_score" in columns :
485- cond_filter = np .invert (np .all (conds ["perplexity_score" ], axis = 0 ))
486- Visualization_for_lang .display_dataset (
487- self .docs ,
488- cond_filter ,
489- "Discarded documents for the filter on the perplexity score" ,
490- "docs" ,
491- )
477+ if "special_characters_ratio" in columns :
478+ cond_filter = np .invert (
479+ np .all (conds ["special_characters_ratio" ], axis = 0 )
480+ )
481+ Visualization_for_lang .display_dataset (
482+ self .docs ,
483+ cond_filter ,
484+ "Discarded documents for the filter on the special characters ratio" ,
485+ "docs" ,
486+ )
492487
493- Visualization_for_lang .display_dataset (
494- self .docs , all_conds , "Retained documents" , "docs"
495- )
488+ if "stopwords_ratio" in columns :
489+ cond_filter = np .invert (
490+ np .all (conds ["stopwords_ratio" ], axis = 0 )
491+ )
492+ Visualization_for_lang .display_dataset (
493+ self .docs ,
494+ cond_filter ,
495+ "Discarded documents for the filter on the stop words ratio" ,
496+ "docs" ,
497+ )
498+
499+ if "flagged_words_ratio" in columns :
500+ cond_filter = np .invert (
501+ np .all (conds ["flagged_words_ratio" ], axis = 0 )
502+ )
503+ Visualization_for_lang .display_dataset (
504+ self .docs ,
505+ cond_filter ,
506+ "Discarded documents for the filter on the flagged words ratio" ,
507+ "docs" ,
508+ )
509+
510+ if "lang_id_score" in columns :
511+ cond_filter = np .invert (np .all (conds ["lang_id_score" ], axis = 0 ))
512+ Visualization_for_lang .display_dataset (
513+ self .docs ,
514+ cond_filter ,
515+ "Discarded documents for the filter on the language identification confidence score" ,
516+ "docs" ,
517+ )
518+
519+ if "perplexity_score" in columns :
520+ cond_filter = np .invert (
521+ np .all (conds ["perplexity_score" ], axis = 0 )
522+ )
523+ Visualization_for_lang .display_dataset (
524+ self .docs ,
525+ cond_filter ,
526+ "Discarded documents for the filter on the perplexity score" ,
527+ "docs" ,
528+ )
529+
530+ Visualization_for_lang .display_dataset (
531+ self .docs , all_conds , "Retained documents" , "docs"
532+ )
496533
497534 st .header ("Download data" )
498535
0 commit comments