Skip to content

Commit 0c1c586

Browse files
committed
Add additional noise words; sort unmapped legal tools and unmapped languages by count (descending)
1 parent 734eab7 commit 0c1c586

File tree

1 file changed

+19
-0
lines changed

1 file changed

+19
-0
lines changed

scripts/1-fetch/internetarchive_fetch.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -77,13 +77,17 @@
7777
"-handwritten",
7878
"-spoken",
7979
"=",
80+
"dialects",
8081
"english patch",
8182
"hand write",
8283
"hand written",
8384
"hand-written",
8485
"handwritten",
8586
"instrumental",
87+
"intertitles",
8688
"language",
89+
"minimal",
90+
"no dialog",
8791
"no speech",
8892
"no spoken word",
8993
"no voice",
@@ -93,6 +97,7 @@
9397
"subbed",
9498
"subtitle",
9599
"subtitles?",
100+
"titlecards",
96101
"universal",
97102
"with subtitles?",
98103
]
@@ -347,6 +352,13 @@ def query_internet_archive(args, session, license_mapping):
347352
"Number of unmapped legal tools: "
348353
f"{sum(unmapped_licenseurl_counter.values()):,}"
349354
)
355+
unmapped_licenseurl_counter = dict(
356+
sorted(
357+
unmapped_licenseurl_counter.items(),
358+
key=lambda item: item[1],
359+
reverse=True,
360+
)
361+
)
350362
for license, count in unmapped_licenseurl_counter.items():
351363
LOGGER.warning(f" Unmapped legal tools: {license}: {count:,}")
352364

@@ -355,6 +367,13 @@ def query_internet_archive(args, session, license_mapping):
355367
"Number of unmapped languages: "
356368
f"{sum(unmapped_language_counter.values()):,}"
357369
)
370+
unmapped_language_counter = dict(
371+
sorted(
372+
unmapped_language_counter.items(),
373+
key=lambda item: item[1],
374+
reverse=True,
375+
)
376+
)
358377
for lang, count in unmapped_language_counter.items():
359378
cleaned = normalize_key(strip_noise(lang))
360379
LOGGER.warning(

0 commit comments

Comments
 (0)