Skip to content

Commit 91f6e3a

Browse files
Added a script that adds words from the given file that do not yet exist in the main dictionary.
1 parent 2d80aef commit 91f6e3a

File tree

1 file changed

+118
-0
lines changed

1 file changed

+118
-0
lines changed

scripts/add_to_dictionary.py

Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
"""
2+
Script to update the custom dictionary 'main.txt' with new words from a given .po file.
3+
4+
The script scans a specified .po file, ignoring certain metadata lines (e.g., lines starting with "#:").
5+
It extracts all unique Greek and English words from the msgstr entries, compares them against the custom dictionary
6+
under the 'dictionaries/' directory (sibling to the 'scripts/' directory), and adds any new words in alphabetical order.
7+
"""
8+
9+
import sys
10+
import os
11+
import re
12+
13+
# C-style escapes inside PO strings (\n, \t, \", \\). They are replaced by a
# space before word extraction so that the escape letter is neither taken for
# a word of its own ("n") nor glued onto the word that follows it.
_PO_ESCAPE = re.compile(r"\\.")

# Runs of ASCII letters and monotonic Greek letters. U+0386 is "Ά" and the
# range U+0388..U+03CE spans "Έ".."ώ", which also covers the dialytika forms
# (ΐ, Ϊ, Ϋ, ϊ, ϋ, ΰ) while leaving out the punctuation mark U+0387.
_WORD = re.compile(r"\b[a-zA-Z\u0386\u0388-\u03CE]+\b")


def _extract_msgstr_words(lines):
    """Return the set of lowercase words found in the msgstr strings of *lines*.

    A msgstr block starts on a line beginning with ``msgstr`` and continues
    over the directly following lines that begin with a double quote. Each
    line is scanned on its own, so a word never spans two lines.
    """
    words = set()
    in_msgstr = False
    for line in lines:
        if line.startswith("#:"):
            continue  # source-reference comments hold file paths, not text
        if line.startswith("msgstr"):
            in_msgstr = True
            text = line[len("msgstr"):]
        elif in_msgstr and line.startswith('"'):
            text = line  # continuation line of a multi-line msgstr
        else:
            in_msgstr = False  # blank line, comment, msgid, ... ends the block
            continue
        text = _PO_ESCAPE.sub(" ", text)
        words.update(word.lower() for word in _WORD.findall(text))
    return words


def scan_and_update(file_path, dictionary_path=None):
    """
    Scan the given .po file, extract words, and update the main dictionary.

    Only the translated strings (msgstr entries) are scanned. Words are
    compared in lowercase, and the dictionary is rewritten in lowercase,
    sorted order, one word per line, whenever new words are found.

    Args:
        file_path (str): Path to the .po file.
        dictionary_path (str, optional): Path to the dictionary file. Defaults
            to 'dictionaries/main.txt' next to the directory that contains
            this script.

    Returns:
        int: The number of new words added to the dictionary (0 when the
            input file does not exist).
    """
    if dictionary_path is None:
        # Default location: ../dictionaries/main.txt relative to this script.
        script_dir = os.path.dirname(os.path.abspath(__file__))
        dictionary_path = os.path.join(script_dir, "..", "dictionaries", "main.txt")
    dictionary_path = os.path.abspath(dictionary_path)

    # Step 1: Ensure the directory holding the dictionary exists
    os.makedirs(os.path.dirname(dictionary_path), exist_ok=True)

    # Step 2: Read the existing dictionary (blank lines are skipped)
    try:
        with open(dictionary_path, 'r', encoding='utf-8') as dict_file:
            dictionary = {line.strip().lower() for line in dict_file if line.strip()}
    except FileNotFoundError:
        print(f"Dictionary file not found at {dictionary_path}. Creating a new one.")
        dictionary = set()

    # Step 3: Extract the words of the input .po file
    try:
        with open(file_path, 'r', encoding='utf-8') as input_file:
            found_words = _extract_msgstr_words(input_file)
    except FileNotFoundError:
        print(f"Input file {file_path} not found.")
        return 0

    # Step 4: Keep only the words the dictionary does not know yet
    new_words = found_words - dictionary

    # Step 5: Update the dictionary with new words
    if new_words:
        dictionary.update(new_words)
        with open(dictionary_path, 'w', encoding='utf-8') as dict_file:
            # One word per line, newline-terminated so the file stays a
            # well-formed text file.
            dict_file.write("\n".join(sorted(dictionary)) + "\n")
        print(f"Added {len(new_words)} new word{'s' if len(new_words) != 1 else ''} to the dictionary.")
    else:
        print("No new words to add to the dictionary.")

    # Return the count of new words added
    return len(new_words)
106+
107+
if __name__ == "__main__":
    # Exactly one argument is expected: the path of the .po file to scan.
    if len(sys.argv) != 2:
        # Report misuse on stderr and exit non-zero, like the invalid-path
        # branch below, so calling scripts can detect the failure.
        print("Usage: python add_to_dictionary.py <file_path>", file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]
    # Validate that the provided path is a file
    if not os.path.isfile(file_path):
        print(f"The provided path '{file_path}' is not a valid file.")
        sys.exit(1)

    # Process the input file and update the dictionary
    scan_and_update(file_path)

0 commit comments

Comments
 (0)