1
"""
Script to update the custom dictionary 'main.txt' with new words from a given .po file.

The script scans the translated text (the msgstr entries) of the specified .po file, skipping
metadata lines that start with "#:". It extracts all unique Greek and English words, lowercases
them, compares them against the custom dictionary under the 'dictionaries/' directory (sibling
to the 'scripts/' directory), and rewrites the dictionary in sorted order with any new words added.
"""
8
+
9
+ import sys
10
+ import os
11
+ import re
12
+
13
def _extract_words(text_parts, word_pattern):
    """Return the set of lowercase words found in buffered msgstr fragments.

    The fragments are joined with a single space. Every backslash escape
    (a backslash followed by one character, e.g. backslash-n or an escaped
    double quote) is blanked out first, so the character after the
    backslash is never mistaken for a word or glued onto the next word.
    """
    full_text = re.sub(r'\\.', ' ', " ".join(text_parts))
    return {word.lower() for word in word_pattern.findall(full_text)}


def scan_and_update(file_path, dictionary_path=None):
    """
    Scan the given .po file, extract words, and update the main dictionary.

    Only the text of ``msgstr`` entries (including their quoted continuation
    lines) is scanned; lines starting with "#:" are skipped. Words are
    lowercased before being compared with, and added to, the dictionary.
    If the dictionary does not exist, it is created when there are words to add.

    Args:
        file_path (str): Path to the .po file.
        dictionary_path (str, optional): Path of the dictionary file to
            update. Defaults to ``dictionaries/main.txt`` in the directory
            that is a sibling of this script's directory.

    Returns:
        int: The number of new words added to the dictionary (0 when the
        input file cannot be found).
    """
    if dictionary_path is None:
        # Default location: <parent of this script's directory>/dictionaries/main.txt
        script_dir = os.path.dirname(os.path.abspath(__file__))
        dictionaries_dir = os.path.abspath(os.path.join(script_dir, "..", "dictionaries"))
        dictionary_path = os.path.join(dictionaries_dir, "main.txt")
    else:
        dictionaries_dir = os.path.dirname(os.path.abspath(dictionary_path))

    # Step 1: Ensure the dictionaries directory exists
    os.makedirs(dictionaries_dir, exist_ok=True)

    # Step 2: Read the existing dictionary (entries are normalised to lowercase)
    try:
        with open(dictionary_path, 'r', encoding='utf-8') as dict_file:
            dictionary = {line.strip().lower() for line in dict_file if line.strip()}
    except FileNotFoundError:
        print(f"Dictionary file not found at { dictionary_path } . Creating a new one.")
        dictionary = set()

    # Step 3: Read the input .po file
    try:
        with open(file_path, 'r', encoding='utf-8') as input_file:
            lines = input_file.readlines()
    except FileNotFoundError:
        print(f"Input file { file_path } not found.")
        return 0

    # Latin letters plus Greek letters, including accented forms.
    # U+0390, U+03AA and U+03AB (iota/upsilon with dialytika) fall outside the
    # four ranges and must be listed explicitly; otherwise the \b anchors
    # reject every word that contains one of them.
    word_pattern = re.compile(
        r'\b[a-zA-Zα-ωά-ώΑ-ΩΆ-Ώ\u0390\u03AA\u03AB]+\b', re.UNICODE
    )

    new_words = set()
    entry_buffer = []
    collecting_msgstr = False

    # Step 4: Extract words from the msgstr blocks of the .po file
    for line in lines:
        if line.startswith("#:"):
            continue  # Ignore source-reference metadata lines

        if line.startswith("msgstr"):
            collecting_msgstr = True
            # Keep what follows the keyword, without the surrounding quotes
            msgstr_content = line.strip().partition('msgstr')[2].strip().strip('"')
            if msgstr_content:
                entry_buffer.append(msgstr_content)
        elif collecting_msgstr:
            if line.startswith('"'):
                # Quoted continuation line of a multiline msgstr
                entry_buffer.append(line.strip().strip('"'))
            else:
                # Blank line or a new keyword: the msgstr block is finished
                collecting_msgstr = False
                new_words |= _extract_words(entry_buffer, word_pattern) - dictionary
                entry_buffer = []

    # Flush a msgstr block that runs to the end of the file
    new_words |= _extract_words(entry_buffer, word_pattern) - dictionary

    # Step 5: Update the dictionary with new words
    if new_words:
        dictionary.update(new_words)
        # One word per line, sorted, with no padding and a final newline
        with open(dictionary_path, 'w', encoding='utf-8') as dict_file:
            dict_file.write("\n".join(sorted(dictionary)) + "\n")
        print(f"Added { len(new_words) } new word{ 's' if len(new_words) != 1 else '' } to the dictionary.")
    else:
        print("No new words to add to the dictionary.")

    # Return the count of new words added
    return len(new_words)
106
+
107
if __name__ == "__main__":
    # The script takes exactly one positional argument: the .po file to scan.
    if len(sys.argv) == 2:
        file_path = sys.argv[1]
        # Stop with exit status 1 unless the argument names an existing file.
        if not os.path.isfile(file_path):
            print(f"The provided path '{ file_path } ' is not a valid file.")
            sys.exit(1)
        # Scan the file and merge any unseen words into the dictionary.
        new_word_count = scan_and_update(file_path)
    else:
        # Wrong number of arguments: show how the script is meant to be called.
        print("Usage: python add_to_dictionary.py <file_path>")
0 commit comments