Skip to content

Commit d96a31d

Browse files
committed
大幅简化 C++ 关于字典文件注释的逻辑,直接忽略 # 开头的行;排序可由 Python 脚本进行
1 parent 34b4af5 commit d96a31d

File tree

5 files changed

+21
-508
lines changed

5 files changed

+21
-508
lines changed

src/DictConverter.cpp

Lines changed: 0 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -61,12 +61,6 @@ SerializableDictPtr ConvertDict(const std::string& format,
6161
const DictPtr dict,
6262
const std::string& formatFrom) {
6363
if (format == "text") {
64-
if (formatFrom == "text") {
65-
TextDictPtr textDict = std::static_pointer_cast<TextDict>(dict);
66-
if (textDict->GetLexicon()->HasAnnotations()) {
67-
return std::static_pointer_cast<SerializableDict>(textDict);
68-
}
69-
}
7064
return TextDict::NewFromDict(*dict.get());
7165
} else if (format == "ocd") {
7266
#ifdef ENABLE_DARTS

src/Lexicon.cpp

Lines changed: 7 additions & 274 deletions
Original file line number | Diff line number | Diff line change
@@ -17,50 +17,12 @@
1717
*/
1818

1919
#include <algorithm>
20-
#include <map>
21-
2220
#include "Lexicon.hpp"
2321

2422
namespace opencc {
2523

2624
namespace {
2725

28-
enum class LineType { Empty, Comment, Entry };
29-
30-
struct ParsedLine {
31-
LineType type;
32-
std::string content; // Raw line content
33-
DictEntry* entry; // Parsed entry (nullptr for non-entry lines)
34-
35-
ParsedLine() : type(LineType::Empty), entry(nullptr) {}
36-
};
37-
38-
// Determine line type when preserving comments
39-
LineType DetermineLineType(const char* buff) {
40-
if (buff == nullptr || UTF8Util::IsLineEndingOrFileEnding(*buff)) {
41-
return LineType::Empty;
42-
}
43-
// Comment lines start with #
44-
if (*buff == '#') {
45-
return LineType::Comment;
46-
}
47-
// Check if it's an entry line (must have a tab)
48-
const char* pbuff = UTF8Util::FindNextInline(buff, '\t');
49-
if (!UTF8Util::IsLineEndingOrFileEnding(*pbuff)) {
50-
return LineType::Entry;
51-
}
52-
// Line with content but no tab - could be empty or malformed
53-
// Check if it's all whitespace
54-
const char* p = buff;
55-
while (!UTF8Util::IsLineEndingOrFileEnding(*p)) {
56-
if (*p != ' ' && *p != '\t') {
57-
// Non-whitespace character without tab = malformed
58-
return LineType::Entry; // Will fail in ParseKeyValues
59-
}
60-
p++;
61-
}
62-
return LineType::Empty;
63-
}
6426

6527
DictEntry* ParseKeyValues(const char* buff, size_t lineNum) {
6628
size_t length;
@@ -91,15 +53,6 @@ DictEntry* ParseKeyValues(const char* buff, size_t lineNum) {
9153
}
9254
}
9355

94-
std::string TrimLineEnding(const char* buff) {
95-
std::string line(buff);
96-
// Remove trailing \r\n or \n
97-
while (!line.empty() && (line.back() == '\n' || line.back() == '\r')) {
98-
line.pop_back();
99-
}
100-
return line;
101-
}
102-
10356
} // namespace
10457

10558
void Lexicon::Sort() {
@@ -129,239 +82,19 @@ LexiconPtr Lexicon::ParseLexiconFromFile(FILE* fp) {
12982
LexiconPtr lexicon(new Lexicon);
13083
UTF8Util::SkipUtf8Bom(fp);
13184

132-
// Preserve comments: use detailed parsing
133-
std::vector<ParsedLine> allLines;
13485
size_t lineNum = 1;
135-
136-
// Phase 1: Parse all lines and determine their types
13786
while (fgets(buff, ENTRY_BUFF_SIZE, fp)) {
138-
ParsedLine line;
139-
line.type = DetermineLineType(buff);
140-
line.content = TrimLineEnding(buff);
141-
142-
if (line.type == LineType::Entry) {
143-
line.entry = ParseKeyValues(buff, lineNum);
144-
if (line.entry != nullptr) {
145-
lexicon->Add(line.entry);
146-
}
87+
if (*buff == '#') {
88+
lineNum++;
89+
continue;
14790
}
148-
149-
allLines.push_back(std::move(line));
150-
lineNum++;
151-
}
152-
153-
// Phase 2: Build comment blocks and classify them
154-
std::vector<CommentBlock> headerBlocks;
155-
std::vector<CommentBlock> footerBlocks;
156-
std::vector<AnnotatedEntry> annotatedEntries;
157-
std::vector<std::pair<size_t, CommentBlock>> floatingBlocks; // (anchor_idx, block)
158-
159-
// Find first and last entry line indices
160-
int firstEntryIdx = -1;
161-
int lastEntryIdx = -1;
162-
for (size_t i = 0; i < allLines.size(); ++i) {
163-
if (allLines[i].type == LineType::Entry && allLines[i].entry != nullptr) {
164-
if (firstEntryIdx == -1) {
165-
firstEntryIdx = static_cast<int>(i);
166-
}
167-
lastEntryIdx = static_cast<int>(i);
168-
}
169-
}
170-
171-
if (firstEntryIdx == -1) {
172-
// No entries, all comments are header or footer
173-
// For simplicity, treat them as header
174-
std::vector<std::string> commentLines;
175-
for (const auto& line : allLines) {
176-
if (line.type == LineType::Comment) {
177-
commentLines.push_back(line.content);
178-
} else if (line.type == LineType::Empty && !commentLines.empty()) {
179-
headerBlocks.emplace_back(std::move(commentLines));
180-
commentLines.clear();
181-
}
91+
DictEntry* entry = ParseKeyValues(buff, lineNum);
92+
if (entry != nullptr) {
93+
lexicon->Add(entry);
18294
}
183-
if (!commentLines.empty()) {
184-
headerBlocks.emplace_back(std::move(commentLines));
185-
}
186-
lexicon->SetHeaderBlocks(std::move(headerBlocks));
187-
return lexicon;
188-
}
189-
190-
// Find the last empty line before first entry
191-
int headerEndIdx = -1;
192-
for (int i = firstEntryIdx - 1; i >= 0; --i) {
193-
if (allLines[i].type == LineType::Empty) {
194-
headerEndIdx = i;
195-
break;
196-
}
197-
}
198-
199-
// Build header blocks (before headerEndIdx)
200-
std::vector<std::string> currentBlock;
201-
for (int i = 0; i <= headerEndIdx; ++i) {
202-
if (allLines[i].type == LineType::Comment) {
203-
currentBlock.push_back(allLines[i].content);
204-
} else if (allLines[i].type == LineType::Empty) {
205-
if (!currentBlock.empty()) {
206-
headerBlocks.emplace_back(std::move(currentBlock));
207-
currentBlock.clear();
208-
}
209-
}
210-
}
211-
if (!currentBlock.empty()) {
212-
headerBlocks.emplace_back(std::move(currentBlock));
213-
currentBlock.clear();
214-
}
215-
216-
// Build footer blocks (after lastEntryIdx)
217-
for (size_t i = lastEntryIdx + 1; i < allLines.size(); ++i) {
218-
if (allLines[i].type == LineType::Comment) {
219-
currentBlock.push_back(allLines[i].content);
220-
} else if (allLines[i].type == LineType::Empty) {
221-
if (!currentBlock.empty()) {
222-
footerBlocks.emplace_back(std::move(currentBlock));
223-
currentBlock.clear();
224-
}
225-
}
226-
}
227-
if (!currentBlock.empty()) {
228-
footerBlocks.emplace_back(std::move(currentBlock));
229-
}
230-
231-
// Build annotated entries (between first and last entry)
232-
// Scan from headerEndIdx+1 to lastEntryIdx
233-
size_t entryIndex = 0;
234-
for (int i = headerEndIdx + 1; i <= lastEntryIdx; ++i) {
235-
if (allLines[i].type == LineType::Comment) {
236-
currentBlock.push_back(allLines[i].content);
237-
} else if (allLines[i].type == LineType::Entry && allLines[i].entry != nullptr) {
238-
// Check if current comment block should attach to this entry
239-
CommentBlock* attachedComment = nullptr;
240-
if (!currentBlock.empty()) {
241-
// Check if there's an empty line between comment and entry
242-
bool hasEmptyLineBetween = false;
243-
for (int j = i - 1; j >= 0 && allLines[j].type != LineType::Entry; --j) {
244-
if (allLines[j].type == LineType::Empty) {
245-
hasEmptyLineBetween = true;
246-
break;
247-
}
248-
if (allLines[j].type == LineType::Comment) {
249-
break; // reached the comment block
250-
}
251-
}
252-
253-
if (!hasEmptyLineBetween) {
254-
// Attached comment
255-
attachedComment = new CommentBlock(std::move(currentBlock));
256-
} else {
257-
// Floating comment
258-
floatingBlocks.emplace_back(entryIndex, CommentBlock(currentBlock));
259-
}
260-
currentBlock.clear();
261-
}
262-
263-
// Create annotated entry
264-
DictEntry* entryCopy = DictEntryFactory::New(allLines[i].entry);
265-
annotatedEntries.emplace_back(entryCopy, attachedComment);
266-
entryIndex++;
267-
} else if (allLines[i].type == LineType::Empty) {
268-
if (!currentBlock.empty()) {
269-
// Comment block followed by empty line - it's floating
270-
// Find next entry to determine anchor
271-
size_t anchorIdx = entryIndex;
272-
for (int j = i + 1; j <= lastEntryIdx; ++j) {
273-
if (allLines[j].type == LineType::Entry && allLines[j].entry != nullptr) {
274-
break; // anchorIdx is already correct
275-
}
276-
}
277-
floatingBlocks.emplace_back(anchorIdx, CommentBlock(currentBlock));
278-
currentBlock.clear();
279-
}
280-
}
281-
}
282-
283-
// Handle any remaining comment block as floating
284-
if (!currentBlock.empty()) {
285-
floatingBlocks.emplace_back(entryIndex, CommentBlock(currentBlock));
95+
lineNum++;
28696
}
287-
288-
// Store results
289-
lexicon->SetHeaderBlocks(std::move(headerBlocks));
290-
lexicon->SetFooterBlocks(std::move(footerBlocks));
291-
lexicon->SetAnnotatedEntries(std::move(annotatedEntries));
292-
lexicon->SetFloatingBlocks(std::move(floatingBlocks));
293-
29497
return lexicon;
29598
}
29699

297-
void Lexicon::SortWithAnnotations() {
298-
if (!HasAnnotations() || annotatedEntries.empty()) {
299-
// No annotations, just sort entries normally
300-
Sort();
301-
return;
302-
}
303-
304-
std::vector<std::string> originalKeys;
305-
originalKeys.reserve(annotatedEntries.size());
306-
for (const auto& annotated : annotatedEntries) {
307-
originalKeys.push_back(annotated.Key());
308-
}
309-
310-
// Create a mapping from old entry pointers to their annotated counterparts
311-
std::map<std::string, size_t> keyToAnnotatedIndex;
312-
for (size_t i = 0; i < annotatedEntries.size(); ++i) {
313-
keyToAnnotatedIndex[annotatedEntries[i].Key()] = i;
314-
}
315-
316-
// Sort the regular entries
317-
Sort();
318-
319-
// Rebuild annotatedEntries in the new order
320-
std::vector<AnnotatedEntry> sortedAnnotated;
321-
sortedAnnotated.reserve(annotatedEntries.size());
322-
std::map<std::string, size_t> keyToNewIndex;
323-
324-
for (const auto& entry : entries) {
325-
auto it = keyToAnnotatedIndex.find(entry->Key());
326-
if (it != keyToAnnotatedIndex.end()) {
327-
size_t oldIndex = it->second;
328-
// Move the annotated entry (with its comment) to the new sorted order
329-
DictEntry* entryCopy = DictEntryFactory::New(entry.get());
330-
CommentBlock* commentCopy = nullptr;
331-
if (annotatedEntries[oldIndex].attachedComment) {
332-
commentCopy = new CommentBlock(annotatedEntries[oldIndex].attachedComment->lines);
333-
}
334-
sortedAnnotated.emplace_back(entryCopy, commentCopy);
335-
} else {
336-
// Entry without annotation
337-
DictEntry* entryCopy = DictEntryFactory::New(entry.get());
338-
sortedAnnotated.emplace_back(entryCopy, nullptr);
339-
}
340-
keyToNewIndex[entry->Key()] = sortedAnnotated.size() - 1;
341-
}
342-
343-
annotatedEntries = std::move(sortedAnnotated);
344-
345-
if (!floatingBlocks.empty()) {
346-
std::vector<std::pair<size_t, CommentBlock>> updatedFloating;
347-
updatedFloating.reserve(floatingBlocks.size());
348-
const size_t newCount = annotatedEntries.size();
349-
for (const auto& pair : floatingBlocks) {
350-
size_t anchor = pair.first;
351-
if (anchor >= originalKeys.size()) {
352-
updatedFloating.emplace_back(newCount, pair.second);
353-
continue;
354-
}
355-
const std::string& anchorKey = originalKeys[anchor];
356-
auto newIt = keyToNewIndex.find(anchorKey);
357-
if (newIt != keyToNewIndex.end()) {
358-
updatedFloating.emplace_back(newIt->second, pair.second);
359-
} else {
360-
updatedFloating.emplace_back(newCount, pair.second);
361-
}
362-
}
363-
floatingBlocks = std::move(updatedFloating);
364-
}
365-
}
366-
367100
} // namespace opencc

0 commit comments

Comments
 (0)