Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
from typing import Tuple, Optional, Callable
from datasets.utils.logging import set_verbosity_info
from clean_helpers import build_small_docs_filter, filter_wiki_non_text_type, filter_wiki_user_titles, \
replace_newline_with_space, build_dedup_template, dedup_document, build_bad_substring_remover

replace_newline_with_space, build_dedup_template, dedup_document, build_bad_substring_remover, \
build_short_line_remover

set_verbosity_info()
logger = logging.getLogger(__name__)
Expand All @@ -18,7 +18,8 @@
MAPS = {
"replace_newline_with_space": replace_newline_with_space,
"remove_lines_with_code": build_bad_substring_remover(["{", "}", "[if", "<script"]),
"remove_html_spans": build_bad_substring_remover(["<span", "</span>", "<div", "</div>", "<a", "</a>", "br>"])
"remove_html_spans": build_bad_substring_remover(["<span", "</span>", "<div", "</div>", "<a", "</a>", "br>"]),
"remove_short_lines_pt_bwarc": build_short_line_remover(min_length=10)
}
# Filter functions: function(batch: Dict) -> Dict
FILTERS = {
Expand All @@ -37,6 +38,10 @@
min_template_line_size=0,
min_template_line_occurence=2,
),
"dedup_template_pt_bwarc": build_dedup_template(
min_template_line_occurence=50,
min_template_line_size=0
),
"dedup_document": dedup_document
}

Expand Down
2 changes: 1 addition & 1 deletion clean_helpers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from .filter_wiki_meta import filter_wiki_user_titles, filter_wiki_non_text_type
from .filter_small_docs_in_datasets import build_small_docs_filter
from .map_arabic import replace_newline_with_space
from .clean_lines import build_bad_substring_remover
from .clean_lines import build_bad_substring_remover, build_short_line_remover
from .deduplication import build_dedup_template, dedup_document
11 changes: 11 additions & 0 deletions clean_helpers/clean_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,14 @@ def remove_bad_substring(batch):
]
}
return remove_bad_substring

def build_short_line_remover(min_length: int):
    """Build a batched map function that drops short lines from each text.

    A line is kept only when it contains at least ``min_length``
    whitespace-separated words. Note that ``min_length`` is a word count,
    not a character count.

    Args:
        min_length: Minimum number of words a line must contain to be kept.

    Returns:
        A function taking a batch (a dict with a ``"text"`` list of strings)
        and returning a new dict with the same keys, where each ``"text"``
        entry has had its short lines removed. Other columns are passed
        through unchanged.
    """

    def remove_short_lines(batch):
        cleaned_texts = []
        for text in batch["text"]:
            kept_lines = [
                line
                for line in text.split("\n")
                # ``split()`` without a separator ignores leading/trailing
                # whitespace and collapses runs of whitespace, so padding
                # cannot inflate the word count the way ``split(" ")`` does
                # (which yields an empty token per extra space).
                if len(line.split()) >= min_length
            ]
            cleaned_texts.append("\n".join(kept_lines))
        return {**batch, "text": cleaned_texts}

    return remove_short_lines