|
17 | 17 | */ |
18 | 18 |
|
19 | 19 | #include <algorithm> |
20 | | -#include <map> |
21 | | - |
22 | 20 | #include "Lexicon.hpp" |
23 | 21 |
|
24 | 22 | namespace opencc { |
25 | 23 |
|
26 | 24 | namespace { |
27 | 25 |
|
28 | | -enum class LineType { Empty, Comment, Entry }; |
29 | | - |
30 | | -struct ParsedLine { |
31 | | - LineType type; |
32 | | - std::string content; // Raw line content |
33 | | - DictEntry* entry; // Parsed entry (nullptr for non-entry lines) |
34 | | - |
35 | | - ParsedLine() : type(LineType::Empty), entry(nullptr) {} |
36 | | -}; |
37 | | - |
38 | | -// Determine line type when preserving comments |
39 | | -LineType DetermineLineType(const char* buff) { |
40 | | - if (buff == nullptr || UTF8Util::IsLineEndingOrFileEnding(*buff)) { |
41 | | - return LineType::Empty; |
42 | | - } |
43 | | - // Comment lines start with # |
44 | | - if (*buff == '#') { |
45 | | - return LineType::Comment; |
46 | | - } |
47 | | - // Check if it's an entry line (must have a tab) |
48 | | - const char* pbuff = UTF8Util::FindNextInline(buff, '\t'); |
49 | | - if (!UTF8Util::IsLineEndingOrFileEnding(*pbuff)) { |
50 | | - return LineType::Entry; |
51 | | - } |
52 | | - // Line with content but no tab - could be empty or malformed |
53 | | - // Check if it's all whitespace |
54 | | - const char* p = buff; |
55 | | - while (!UTF8Util::IsLineEndingOrFileEnding(*p)) { |
56 | | - if (*p != ' ' && *p != '\t') { |
57 | | - // Non-whitespace character without tab = malformed |
58 | | - return LineType::Entry; // Will fail in ParseKeyValues |
59 | | - } |
60 | | - p++; |
61 | | - } |
62 | | - return LineType::Empty; |
63 | | -} |
64 | 26 |
|
65 | 27 | DictEntry* ParseKeyValues(const char* buff, size_t lineNum) { |
66 | 28 | size_t length; |
@@ -91,15 +53,6 @@ DictEntry* ParseKeyValues(const char* buff, size_t lineNum) { |
91 | 53 | } |
92 | 54 | } |
93 | 55 |
|
94 | | -std::string TrimLineEnding(const char* buff) { |
95 | | - std::string line(buff); |
96 | | - // Remove trailing \r\n or \n |
97 | | - while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) { |
98 | | - line.pop_back(); |
99 | | - } |
100 | | - return line; |
101 | | -} |
102 | | - |
103 | 56 | } // namespace |
104 | 57 |
|
105 | 58 | void Lexicon::Sort() { |
@@ -129,239 +82,19 @@ LexiconPtr Lexicon::ParseLexiconFromFile(FILE* fp) { |
129 | 82 | LexiconPtr lexicon(new Lexicon); |
130 | 83 | UTF8Util::SkipUtf8Bom(fp); |
131 | 84 |
|
132 | | - // Preserve comments: use detailed parsing |
133 | | - std::vector<ParsedLine> allLines; |
134 | 85 | size_t lineNum = 1; |
135 | | - |
136 | | - // Phase 1: Parse all lines and determine their types |
137 | 86 | while (fgets(buff, ENTRY_BUFF_SIZE, fp)) { |
138 | | - ParsedLine line; |
139 | | - line.type = DetermineLineType(buff); |
140 | | - line.content = TrimLineEnding(buff); |
141 | | - |
142 | | - if (line.type == LineType::Entry) { |
143 | | - line.entry = ParseKeyValues(buff, lineNum); |
144 | | - if (line.entry != nullptr) { |
145 | | - lexicon->Add(line.entry); |
146 | | - } |
| 87 | + if (*buff == '#') { |
| 88 | + lineNum++; |
| 89 | + continue; |
147 | 90 | } |
148 | | - |
149 | | - allLines.push_back(std::move(line)); |
150 | | - lineNum++; |
151 | | - } |
152 | | - |
153 | | - // Phase 2: Build comment blocks and classify them |
154 | | - std::vector<CommentBlock> headerBlocks; |
155 | | - std::vector<CommentBlock> footerBlocks; |
156 | | - std::vector<AnnotatedEntry> annotatedEntries; |
157 | | - std::vector<std::pair<size_t, CommentBlock>> floatingBlocks; // (anchor_idx, block) |
158 | | - |
159 | | - // Find first and last entry line indices |
160 | | - int firstEntryIdx = -1; |
161 | | - int lastEntryIdx = -1; |
162 | | - for (size_t i = 0; i < allLines.size(); ++i) { |
163 | | - if (allLines[i].type == LineType::Entry && allLines[i].entry != nullptr) { |
164 | | - if (firstEntryIdx == -1) { |
165 | | - firstEntryIdx = static_cast<int>(i); |
166 | | - } |
167 | | - lastEntryIdx = static_cast<int>(i); |
168 | | - } |
169 | | - } |
170 | | - |
171 | | - if (firstEntryIdx == -1) { |
172 | | - // No entries, all comments are header or footer |
173 | | - // For simplicity, treat them as header |
174 | | - std::vector<std::string> commentLines; |
175 | | - for (const auto& line : allLines) { |
176 | | - if (line.type == LineType::Comment) { |
177 | | - commentLines.push_back(line.content); |
178 | | - } else if (line.type == LineType::Empty && !commentLines.empty()) { |
179 | | - headerBlocks.emplace_back(std::move(commentLines)); |
180 | | - commentLines.clear(); |
181 | | - } |
| 91 | + DictEntry* entry = ParseKeyValues(buff, lineNum); |
| 92 | + if (entry != nullptr) { |
| 93 | + lexicon->Add(entry); |
182 | 94 | } |
183 | | - if (!commentLines.empty()) { |
184 | | - headerBlocks.emplace_back(std::move(commentLines)); |
185 | | - } |
186 | | - lexicon->SetHeaderBlocks(std::move(headerBlocks)); |
187 | | - return lexicon; |
188 | | - } |
189 | | - |
190 | | - // Find the last empty line before first entry |
191 | | - int headerEndIdx = -1; |
192 | | - for (int i = firstEntryIdx - 1; i >= 0; --i) { |
193 | | - if (allLines[i].type == LineType::Empty) { |
194 | | - headerEndIdx = i; |
195 | | - break; |
196 | | - } |
197 | | - } |
198 | | - |
199 | | - // Build header blocks (before headerEndIdx) |
200 | | - std::vector<std::string> currentBlock; |
201 | | - for (int i = 0; i <= headerEndIdx; ++i) { |
202 | | - if (allLines[i].type == LineType::Comment) { |
203 | | - currentBlock.push_back(allLines[i].content); |
204 | | - } else if (allLines[i].type == LineType::Empty) { |
205 | | - if (!currentBlock.empty()) { |
206 | | - headerBlocks.emplace_back(std::move(currentBlock)); |
207 | | - currentBlock.clear(); |
208 | | - } |
209 | | - } |
210 | | - } |
211 | | - if (!currentBlock.empty()) { |
212 | | - headerBlocks.emplace_back(std::move(currentBlock)); |
213 | | - currentBlock.clear(); |
214 | | - } |
215 | | - |
216 | | - // Build footer blocks (after lastEntryIdx) |
217 | | - for (size_t i = lastEntryIdx + 1; i < allLines.size(); ++i) { |
218 | | - if (allLines[i].type == LineType::Comment) { |
219 | | - currentBlock.push_back(allLines[i].content); |
220 | | - } else if (allLines[i].type == LineType::Empty) { |
221 | | - if (!currentBlock.empty()) { |
222 | | - footerBlocks.emplace_back(std::move(currentBlock)); |
223 | | - currentBlock.clear(); |
224 | | - } |
225 | | - } |
226 | | - } |
227 | | - if (!currentBlock.empty()) { |
228 | | - footerBlocks.emplace_back(std::move(currentBlock)); |
229 | | - } |
230 | | - |
231 | | - // Build annotated entries (between first and last entry) |
232 | | - // Scan from headerEndIdx+1 to lastEntryIdx |
233 | | - size_t entryIndex = 0; |
234 | | - for (int i = headerEndIdx + 1; i <= lastEntryIdx; ++i) { |
235 | | - if (allLines[i].type == LineType::Comment) { |
236 | | - currentBlock.push_back(allLines[i].content); |
237 | | - } else if (allLines[i].type == LineType::Entry && allLines[i].entry != nullptr) { |
238 | | - // Check if current comment block should attach to this entry |
239 | | - CommentBlock* attachedComment = nullptr; |
240 | | - if (!currentBlock.empty()) { |
241 | | - // Check if there's an empty line between comment and entry |
242 | | - bool hasEmptyLineBetween = false; |
243 | | - for (int j = i - 1; j >= 0 && allLines[j].type != LineType::Entry; --j) { |
244 | | - if (allLines[j].type == LineType::Empty) { |
245 | | - hasEmptyLineBetween = true; |
246 | | - break; |
247 | | - } |
248 | | - if (allLines[j].type == LineType::Comment) { |
249 | | - break; // reached the comment block |
250 | | - } |
251 | | - } |
252 | | - |
253 | | - if (!hasEmptyLineBetween) { |
254 | | - // Attached comment |
255 | | - attachedComment = new CommentBlock(std::move(currentBlock)); |
256 | | - } else { |
257 | | - // Floating comment |
258 | | - floatingBlocks.emplace_back(entryIndex, CommentBlock(currentBlock)); |
259 | | - } |
260 | | - currentBlock.clear(); |
261 | | - } |
262 | | - |
263 | | - // Create annotated entry |
264 | | - DictEntry* entryCopy = DictEntryFactory::New(allLines[i].entry); |
265 | | - annotatedEntries.emplace_back(entryCopy, attachedComment); |
266 | | - entryIndex++; |
267 | | - } else if (allLines[i].type == LineType::Empty) { |
268 | | - if (!currentBlock.empty()) { |
269 | | - // Comment block followed by empty line - it's floating |
270 | | - // Find next entry to determine anchor |
271 | | - size_t anchorIdx = entryIndex; |
272 | | - for (int j = i + 1; j <= lastEntryIdx; ++j) { |
273 | | - if (allLines[j].type == LineType::Entry && allLines[j].entry != nullptr) { |
274 | | - break; // anchorIdx is already correct |
275 | | - } |
276 | | - } |
277 | | - floatingBlocks.emplace_back(anchorIdx, CommentBlock(currentBlock)); |
278 | | - currentBlock.clear(); |
279 | | - } |
280 | | - } |
281 | | - } |
282 | | - |
283 | | - // Handle any remaining comment block as floating |
284 | | - if (!currentBlock.empty()) { |
285 | | - floatingBlocks.emplace_back(entryIndex, CommentBlock(currentBlock)); |
| 95 | + lineNum++; |
286 | 96 | } |
287 | | - |
288 | | - // Store results |
289 | | - lexicon->SetHeaderBlocks(std::move(headerBlocks)); |
290 | | - lexicon->SetFooterBlocks(std::move(footerBlocks)); |
291 | | - lexicon->SetAnnotatedEntries(std::move(annotatedEntries)); |
292 | | - lexicon->SetFloatingBlocks(std::move(floatingBlocks)); |
293 | | - |
294 | 97 | return lexicon; |
295 | 98 | } |
296 | 99 |
|
297 | | -void Lexicon::SortWithAnnotations() { |
298 | | - if (!HasAnnotations() || annotatedEntries.empty()) { |
299 | | - // No annotations, just sort entries normally |
300 | | - Sort(); |
301 | | - return; |
302 | | - } |
303 | | - |
304 | | - std::vector<std::string> originalKeys; |
305 | | - originalKeys.reserve(annotatedEntries.size()); |
306 | | - for (const auto& annotated : annotatedEntries) { |
307 | | - originalKeys.push_back(annotated.Key()); |
308 | | - } |
309 | | - |
310 | | - // Create a mapping from old entry pointers to their annotated counterparts |
311 | | - std::map<std::string, size_t> keyToAnnotatedIndex; |
312 | | - for (size_t i = 0; i < annotatedEntries.size(); ++i) { |
313 | | - keyToAnnotatedIndex[annotatedEntries[i].Key()] = i; |
314 | | - } |
315 | | - |
316 | | - // Sort the regular entries |
317 | | - Sort(); |
318 | | - |
319 | | - // Rebuild annotatedEntries in the new order |
320 | | - std::vector<AnnotatedEntry> sortedAnnotated; |
321 | | - sortedAnnotated.reserve(annotatedEntries.size()); |
322 | | - std::map<std::string, size_t> keyToNewIndex; |
323 | | - |
324 | | - for (const auto& entry : entries) { |
325 | | - auto it = keyToAnnotatedIndex.find(entry->Key()); |
326 | | - if (it != keyToAnnotatedIndex.end()) { |
327 | | - size_t oldIndex = it->second; |
328 | | - // Move the annotated entry (with its comment) to the new sorted order |
329 | | - DictEntry* entryCopy = DictEntryFactory::New(entry.get()); |
330 | | - CommentBlock* commentCopy = nullptr; |
331 | | - if (annotatedEntries[oldIndex].attachedComment) { |
332 | | - commentCopy = new CommentBlock(annotatedEntries[oldIndex].attachedComment->lines); |
333 | | - } |
334 | | - sortedAnnotated.emplace_back(entryCopy, commentCopy); |
335 | | - } else { |
336 | | - // Entry without annotation |
337 | | - DictEntry* entryCopy = DictEntryFactory::New(entry.get()); |
338 | | - sortedAnnotated.emplace_back(entryCopy, nullptr); |
339 | | - } |
340 | | - keyToNewIndex[entry->Key()] = sortedAnnotated.size() - 1; |
341 | | - } |
342 | | - |
343 | | - annotatedEntries = std::move(sortedAnnotated); |
344 | | - |
345 | | - if (!floatingBlocks.empty()) { |
346 | | - std::vector<std::pair<size_t, CommentBlock>> updatedFloating; |
347 | | - updatedFloating.reserve(floatingBlocks.size()); |
348 | | - const size_t newCount = annotatedEntries.size(); |
349 | | - for (const auto& pair : floatingBlocks) { |
350 | | - size_t anchor = pair.first; |
351 | | - if (anchor >= originalKeys.size()) { |
352 | | - updatedFloating.emplace_back(newCount, pair.second); |
353 | | - continue; |
354 | | - } |
355 | | - const std::string& anchorKey = originalKeys[anchor]; |
356 | | - auto newIt = keyToNewIndex.find(anchorKey); |
357 | | - if (newIt != keyToNewIndex.end()) { |
358 | | - updatedFloating.emplace_back(newIt->second, pair.second); |
359 | | - } else { |
360 | | - updatedFloating.emplace_back(newCount, pair.second); |
361 | | - } |
362 | | - } |
363 | | - floatingBlocks = std::move(updatedFloating); |
364 | | - } |
365 | | -} |
366 | | - |
367 | 100 | } // namespace opencc |
0 commit comments