1
+ """
2
+ Autocomplete System using SQLite3 for Persistence with N-gram optimization
3
+
4
+ This module implements an autocomplete system that learns word sequences from training sentences
5
+ and predicts the most likely next word based on the learned patterns. It uses SQLite3 for
6
+ persistent storage of word mappings and predictions.
7
+ """
8
+
9
+ import sqlite3
10
+ import json
11
+ from typing import Dict , List , Optional , Tuple , Union
12
+
13
class AutoComplete:
    """
    An autocomplete system that trains on text data and predicts the next word
    with an N-gram model.

    The system works by:
    1. Building an N-gram map that counts how often each N-gram is followed by
       a given word.
    2. Maintaining, for each N-gram, the follower word with the highest count.
    3. Storing both structures as JSON text in an SQLite database so they
       persist between runs.

    Words are lower-cased and split on whitespace both when training and when
    predicting, so lookups are case-insensitive.
    """

    # (table name, row name) of the single row holding each JSON document.
    # These are fixed identifiers, never user input, which is why they may be
    # interpolated into SQL text below (SQLite cannot bind table names).
    _MAP_STORE: Tuple[str, str] = ("NGramMap", "ngramsmap")
    _PREDICTION_STORE: Tuple[str, str] = ("NGramPrediction", "ngrampredictions")

    def __init__(self, n: int = 2, db_path: str = "autocompleteDB.sqlite3") -> None:
        """
        Initialize the AutoComplete system and set up the database.

        Opens the SQLite database and makes sure both tables (NGramMap and
        NGramPrediction) exist and each holds its seed row containing an empty
        JSON object.

        Args:
            n: Number of words per N-gram. Must be at least 1.
            db_path: Database file to open; ":memory:" gives a throwaway
                in-memory database.

        Raises:
            ValueError: If n is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        self.n = n

        # isolation_level=None selects autocommit mode; unlike the
        # ``autocommit=`` keyword it is accepted by Python versions before 3.12.
        self.conn: sqlite3.Connection = sqlite3.connect(db_path, isolation_level=None)
        cursor: sqlite3.Cursor = self.conn.cursor()

        # Check each table and seed row independently so a partially
        # initialised database is repaired instead of failing later.
        for table, row_name in (self._MAP_STORE, self._PREDICTION_STORE):
            cursor.execute(f"CREATE TABLE IF NOT EXISTS {table}(name TEXT, value TEXT)")
            cursor.execute(f"SELECT 1 FROM {table} WHERE name = ?", (row_name,))
            if cursor.fetchone() is None:
                cursor.execute(f"INSERT INTO {table} VALUES (?, ?)", (row_name, "{}"))

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self) -> "AutoComplete":
        """Return self so the instance can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the database connection when the ``with`` block ends."""
        self.close()

    def _load(self, store: Tuple[str, str]) -> Dict[Tuple[str, ...], Dict]:
        """Read one JSON document and turn its top-level keys into word tuples."""
        table, row_name = store
        cursor: sqlite3.Cursor = self.conn.cursor()
        cursor.execute(f"SELECT value FROM {table} WHERE name = ?", (row_name,))
        row = cursor.fetchone()
        raw: str = row[0] if row else "{}"
        # Only the OUTER keys are space-joined N-grams. An object_hook would
        # also rewrite the keys of every nested dict, so convert them here.
        return {tuple(key.split()): value for key, value in json.loads(raw).items()}

    def _save(self, store: Tuple[str, str], data: Dict[Tuple[str, ...], Dict]) -> None:
        """Write one mapping back as JSON, joining each N-gram key with spaces."""
        table, row_name = store
        payload: str = json.dumps({" ".join(key): value for key, value in data.items()})
        self.conn.execute(
            f"UPDATE {table} SET value = ? WHERE name = ?", (payload, row_name)
        )

    def generate_ngrams(self, words_list: List[str]) -> List[Tuple[str, ...]]:
        """
        Generate N-grams from a list of words.

        Args:
            words_list: The words, in order.

        Returns:
            Every run of ``self.n`` consecutive words as a tuple, in order of
            appearance; an empty list when there are fewer than ``self.n`` words.
        """
        window_count = len(words_list) - self.n + 1
        return [tuple(words_list[i:i + self.n]) for i in range(window_count)]

    def train(self, sentence: str) -> str:
        """
        Train the autocomplete system with a single sentence.

        Processes the input sentence to update:
        1. N-gram transition counts (NGramMap)
        2. Most likely next word predictions (NGramPrediction)

        Args:
            sentence: The training text. It is lower-cased and split on any
                whitespace, so repeated spaces do not produce empty words.

        Returns:
            Confirmation message indicating training completion.
        """
        # Lower-case here because predict() lower-cases its input as well.
        words_list: List[str] = sentence.lower().split()

        ngrams_map: Dict[Tuple[str, ...], Dict[str, int]] = self._load(self._MAP_STORE)
        predictions: Dict[Tuple[str, ...], Dict[str, Union[str, int]]] = self._load(
            self._PREDICTION_STORE
        )

        ngrams = self.generate_ngrams(words_list)

        # Pair each N-gram with the word right after it. The final N-gram has
        # no following word, and zip drops it because the slice is one shorter.
        for curr_ngram, next_word in zip(ngrams, words_list[self.n:]):
            followers = ngrams_map.setdefault(curr_ngram, {})
            count = followers.get(next_word, 0) + 1
            followers[next_word] = count

            # Replace the stored prediction only when this word is strictly
            # more frequent, so on a tie the earlier word is kept.
            best = predictions.get(curr_ngram)
            if best is None or count > best['completion_count']:
                predictions[curr_ngram] = {
                    'completion_word': next_word,
                    'completion_count': count,
                }

        self._save(self._MAP_STORE, ngrams_map)
        self._save(self._PREDICTION_STORE, predictions)

        return "training complete"

    def predict(self, words: str) -> Optional[str]:
        """
        Predict the most likely next word for a given input sequence of words.

        Only the last ``self.n`` words are used: they form the N-gram that
        immediately precedes the word being predicted.

        Args:
            words: The input sequence of words to generate a completion for.

        Returns:
            The most likely next word, or None if the input has fewer than
            ``self.n`` words or its trailing N-gram was never seen in training.
        """
        input_words: List[str] = words.lower().split()
        if len(input_words) < self.n:
            return None

        predictions = self._load(self._PREDICTION_STORE)
        entry = predictions.get(tuple(input_words[-self.n:]))
        if entry is None:
            return None
        return str(entry['completion_word'])
150
+
151
+
152
def main() -> None:
    """Train the autocomplete system on a sample sentence and print one prediction."""
    training_sentence: str = (
        "It is not enough to just know how tools work and what they worth, "
        "we have got to learn how to use them and to use them well. And with "
        "all these new weapons in your arsenal, we would better get those profits fired up"
    )

    # Initialize and train the autocomplete system
    autocomplete: AutoComplete = AutoComplete(n=2)
    try:
        autocomplete.train(training_sentence)

        # Test prediction
        test_words: str = "to use"
        prediction: Optional[str] = autocomplete.predict(test_words)
        print(f"Prediction for '{test_words}': {prediction}")
    finally:
        # Release the SQLite handle even if training or prediction raises.
        autocomplete.conn.close()


if __name__ == "__main__":
    main()
0 commit comments