-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathutils.py
More file actions
57 lines (48 loc) · 1.66 KB
/
utils.py
File metadata and controls
57 lines (48 loc) · 1.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import os
from typing import List
import pandas as pd
def get_language_data(language_code: str, test_folder: str) -> str:
    """
    Read the text sample stored for a language code.

    The sample is the file named ``language_code`` inside ``test_folder``.
    The file is decoded as UTF-8 first; if that fails with a decoding
    error it is re-read as latin-1.

    Args:
        language_code: Name of the data file to read.
        test_folder: Directory that contains the data files.

    Returns:
        The file contents, or an empty string if the path is not a regular
        file or the file cannot be read (a message is printed in both cases).
    """
    filename = os.path.join(test_folder, language_code)
    if not os.path.isfile(filename):
        print(
            f"Error: File not found for language code {language_code} at {filename}"
        )
        return ""

    # Try each encoding under the SAME error handling. Previously the latin-1
    # retry ran inside the UnicodeDecodeError handler, so an I/O error during
    # the retry escaped instead of producing the documented "" result.
    for encoding in ("utf-8", "latin-1"):
        try:
            with open(filename, "r", encoding=encoding) as f:
                return f.read()
        except UnicodeDecodeError:
            # Not valid in this encoding; fall through to the next candidate.
            continue
        except Exception as e:
            # Deliberately broad: this function is best-effort by contract.
            print(f"Error reading file {filename}: {e}")
            return ""

    # latin-1 can decode any byte sequence, so this is only a safety net.
    return ""
def load_languages(
    test_folder: str = "Test/data", csv_path: str = "lang_code.csv"
) -> pd.DataFrame:
    """
    Load the language table and keep only languages that have a data file.

    The CSV must contain ``code`` and ``language`` columns (surrounding
    whitespace in the header names and in the values is stripped). A row is
    kept when its ``code`` equals the name of a regular file directly inside
    ``test_folder``.

    Args:
        test_folder: Directory holding one data file per language code.
        csv_path: Path of the CSV file to read. Defaults to ``lang_code.csv``
            in the current working directory.

    Returns:
        The rows of the CSV whose ``code`` has a matching data file.

    Raises:
        FileNotFoundError: If ``csv_path`` or ``test_folder`` does not exist.
        KeyError: If the CSV lacks a ``code`` or ``language`` column.
    """
    df = pd.read_csv(csv_path)
    df.columns = df.columns.str.strip()

    # Drop missing values BEFORE casting to str: astype(str) turns NaN into
    # the literal text "nan", after which dropna() can no longer see it.
    df = df.dropna(subset=["code", "language"]).assign(
        code=lambda d: d["code"].astype(str).str.strip(),
        language=lambda d: d["language"].astype(str).str.strip(),
    )

    # Regular files only; sub-directories are never treated as data files.
    with os.scandir(test_folder) as entries:
        available = {entry.name for entry in entries if entry.is_file()}

    return df[df["code"].isin(available)]
def preprocess_text(text: str) -> List[str]:
    """
    Turn raw text into a list of lowercase, letters-only words.

    The text is lowercased and split on whitespace. Every non-alphabetic
    character is removed from each token, and tokens that end up empty
    (i.e. contained no letters at all) are discarded.
    """
    tokens = []
    for raw in text.lower().split():
        letters = "".join(filter(str.isalpha, raw))
        # An empty result means the token had no alphabetic characters.
        if letters:
            tokens.append(letters)
    return tokens