Skip to content

Commit 8e38c07

Browse files
pwilkinfirecoperana
authored and committed
Autoparser - complete refactoring of parser architecture
Autoparser: add optional argument reshuffle capability
1 parent d1a4b71 commit 8e38c07

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

58 files changed

+13241
-9983
lines changed

common/CMakeLists.txt

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -52,12 +52,10 @@ set(TARGET common)
5252

5353
add_library(${TARGET} STATIC
5454
base64.hpp
55-
chat.cpp
56-
chat.h
57-
chat-parser.cpp
58-
chat-parser.h
59-
chat-parser-xml-toolcall.h
60-
chat-parser-xml-toolcall.cpp
55+
chat-auto-parser-generator.cpp
56+
chat-auto-parser-helpers.cpp
57+
chat-auto-parser.h
58+
chat-diff-analyzer.cpp
6159
chat-peg-parser.cpp
6260
chat-peg-parser.h
6361
common.cpp
@@ -77,15 +75,17 @@ add_library(${TARGET} STATIC
7775
ngram-cache.h
7876
ngram-map.cpp
7977
ngram-map.h
80-
peg-parser.cpp
78+
peg-parser.cpp
8179
peg-parser.h
82-
speculative.cpp
80+
speculative.cpp
8381
unicode.cpp
8482
unicode.h
8583
ngram-mod.cpp
8684
ngram-mod.h
8785
regex-partial.cpp
8886
regex-partial.h
87+
chat.cpp
88+
chat.h
8989
jinja/lexer.cpp
9090
jinja/lexer.h
9191
jinja/parser.cpp

common/chat-auto-parser-generator.cpp

Lines changed: 424 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 347 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,347 @@
1+
#include "chat-auto-parser-helpers.h"
2+
3+
#include "chat-auto-parser.h"
4+
#include "chat.h"
5+
#include "log.h"
6+
#include "nlohmann/json.hpp"
7+
8+
#include <cctype>
9+
#include <numeric>
10+
11+
using json = nlohmann::ordered_json;
12+
13+
// Returns a copy of `str` with whitespace (as classified by std::isspace)
// removed from both ends. An empty or all-whitespace input yields "".
std::string trim_whitespace(const std::string & str) {
    const auto is_ws = [](char c) { return std::isspace(static_cast<unsigned char>(c)) != 0; };

    // [first, last) is the half-open range of characters that survive trimming
    size_t first = 0;
    size_t last  = str.size();

    while (first < last && is_ws(str[first])) {
        ++first;
    }
    while (last > first && is_ws(str[last - 1])) {
        --last;
    }

    return str.substr(first, last - first);
}
30+
31+
// Returns a copy of `str` without its leading whitespace (std::isspace).
// Trailing whitespace is left untouched.
std::string trim_leading_whitespace(const std::string & str) {
    size_t first = 0;
    for (; first < str.size(); ++first) {
        if (!std::isspace(static_cast<unsigned char>(str[first]))) {
            break;
        }
    }
    // first == str.size() for empty / all-whitespace input, which yields ""
    return str.substr(first);
}
39+
40+
// Returns a copy of `str` without its trailing whitespace (std::isspace).
// Leading whitespace is left untouched; an all-whitespace input yields "".
std::string trim_trailing_whitespace(const std::string & str) {
    // `keep` is the number of leading characters that are retained
    size_t keep = str.size();
    while (keep > 0 && std::isspace(static_cast<unsigned char>(str[keep - 1]))) {
        --keep;
    }
    return str.substr(0, keep);
}
57+
58+
// Returns a copy of `str` with every trailing '\n' removed.
// Only the line-feed character is stripped; '\r' and other whitespace are kept.
std::string trim_trailing_newlines(const std::string & str) {
    const size_t last_kept = str.find_last_not_of('\n');
    if (last_kept == std::string::npos) {
        // empty input, or nothing but newlines
        return std::string();
    }
    return str.substr(0, last_kept + 1);
}
66+
67+
// Number of leading bytes that `left` and `right` have in common.
static size_t common_prefix_len(const std::string & left, const std::string & right) {
    const size_t limit = left.size() < right.size() ? left.size() : right.size();
    for (size_t i = 0; i < limit; ++i) {
        if (left[i] != right[i]) {
            return i;
        }
    }
    // the shorter string is entirely a prefix of the longer one
    return limit;
}
75+
76+
// Number of trailing bytes that `left` and `right` have in common.
static size_t common_suffix_len(const std::string & left, const std::string & right) {
    const size_t limit = left.size() < right.size() ? left.size() : right.size();

    size_t matched = 0;
    auto   l_it    = left.rbegin();
    auto   r_it    = right.rbegin();
    // `matched < limit` is tested first, so neither reverse iterator is
    // dereferenced once the shorter string has been exhausted
    while (matched < limit && *l_it == *r_it) {
        ++matched;
        ++l_it;
        ++r_it;
    }
    return matched;
}
84+
85+
// Splits `left` and `right` into a shared prefix, a shared suffix and the two
// differing middle parts, such that
//     result.prefix + result.left  + result.suffix == left
//     result.prefix + result.right + result.suffix == right
//
// Matching is done in two stages:
//   1. Both strings are cut into segments by segmentize_markers() and whole
//      segments are matched from the front (-> prefix) and from the back
//      (-> suffix).
//   2. Whatever is left over is compared byte-wise, but only on a side where
//      the boundary segment of BOTH inputs is TEXT, so a marker is never split
//      in the middle.
//
// Special cases:
//   - if one input produces no segments, the other input is returned whole as
//     its own difference (prefix and suffix stay empty);
//   - if the inputs turn out to be identical, the result is normalised to
//     prefix == left, suffix == "", left == right == "".
diff_split calculate_diff_split(const std::string & left, const std::string & right) {
    diff_split result;

    auto left_seg  = segmentize_markers(left);
    auto right_seg = segmentize_markers(right);

    if (left_seg.empty()) {
        result.right = right;
        return result;
    }
    if (right_seg.empty()) {
        result.left = left;
        return result;
    }

    // [*_start, *_end] is the inclusive window of segments not yet assigned
    // to the prefix or the suffix.
    auto left_start  = left_seg.begin();
    auto left_end    = left_seg.end() - 1;
    auto right_start = right_seg.begin();
    auto right_end   = right_seg.end() - 1;

    // Set once the single segment remaining in a window (start == end) has
    // itself been assigned; the inclusive window cannot express "empty".
    bool left_fully_consumed  = false;
    bool right_fully_consumed = false;

    // Stage 1a: peel matching segments off both ends while both windows still
    // hold at least two segments.
    while (left_start != left_end && right_start != right_end) {
        bool advanced = false;
        if (*left_start == *right_start) {
            result.prefix.append(left_start->value);
            ++left_start;
            ++right_start;
            advanced = true;
        }
        if (*left_end == *right_end) {
            result.suffix = left_end->value + result.suffix;
            if (left_start != left_end) {
                --left_end;
            } else {
                left_fully_consumed = true;
            }
            if (right_start != right_end) {
                --right_end;
            } else {
                right_fully_consumed = true;
            }
            advanced = true;
        }
        if (!advanced) {
            break;
        }
    }

    // Stage 1b: one window is down to a single segment. Try to match that
    // segment against either end of the other window.
    //
    // The `!*_fully_consumed` guards matter: without them, a segment that the
    // loop above already moved into the suffix would be matched a second time,
    // duplicating it in prefix/suffix and breaking the reconstruction
    // invariant (e.g. "<a><b>" vs "<a><b><b>" was reported as "no difference").
    if (left_start == left_end && right_start != right_end) {
        if (!left_fully_consumed) {
            if (*left_start == *right_end) {
                result.suffix = right_end->value + result.suffix;
                --right_end;
                left_fully_consumed = true;
            } else if (*left_start == *right_start) {
                result.prefix.append(right_start->value);
                ++right_start;
                left_fully_consumed = true;
            }
        }
    } else if (right_start == right_end && left_start != left_end) {
        if (!right_fully_consumed) {
            if (*left_end == *right_start) {
                result.suffix = left_end->value + result.suffix;
                --left_end;
                right_fully_consumed = true;
            } else if (*left_start == *right_start) {
                result.prefix.append(left_start->value);
                ++left_start;
                right_fully_consumed = true;
            }
        }
    } else if (left_start == left_end && right_start == right_end &&
               !left_fully_consumed && !right_fully_consumed &&
               *left_start == *right_start && left_start->type == segment_type::MARKER) {
        // Both windows hold one identical marker. (Identical TEXT segments are
        // handled by the byte-wise comparison below.)
        result.prefix.append(right_start->value);
        left_fully_consumed  = true;
        right_fully_consumed = true;
    }

    // Byte-wise matching is only allowed where both boundary segments are text.
    const bool can_have_text_suffix = left_end->type == segment_type::TEXT && right_end->type == segment_type::TEXT;
    const bool can_have_text_prefix = right_start->type == segment_type::TEXT && left_start->type == segment_type::TEXT;

    // Concatenates the values of the segments in [first, last).
    auto join_segments = [](auto first, auto last) {
        std::string joined;
        for (; first != last; ++first) {
            joined += first->value;
        }
        return joined;
    };

    // A fully consumed window contributes nothing; otherwise include *_end.
    const std::string remainder_left =
        join_segments(left_start, left_fully_consumed ? left_end : left_end + 1);
    const std::string remainder_right =
        join_segments(right_start, right_fully_consumed ? right_end : right_end + 1);

    // Stage 2: byte-wise common suffix first, then the common prefix of what
    // precedes it, so that prefix and suffix can never overlap.
    const size_t suffix_len = can_have_text_suffix ? common_suffix_len(remainder_left, remainder_right) : 0;
    const size_t prefix_len = can_have_text_prefix
        ? common_prefix_len(remainder_left.substr(0, remainder_left.size() - suffix_len),
                            remainder_right.substr(0, remainder_right.size() - suffix_len))
        : 0;

    result.prefix.append(remainder_left.substr(0, prefix_len));
    result.suffix = remainder_left.substr(remainder_left.length() - suffix_len, suffix_len) + result.suffix;
    result.left   = remainder_left.substr(prefix_len, remainder_left.length() - prefix_len - suffix_len);
    result.right  = remainder_right.substr(prefix_len, remainder_right.length() - prefix_len - suffix_len);

    if (result.left.empty() && result.right.empty()) {
        // degenerate case, no diff: represent it as "everything is prefix"
        result.prefix = left;
        result.suffix = "";
    }
    return result;
}
191+
192+
// Returns the part of `full` that precedes the first occurrence of the longest
// common prefix of `left` and `right`.
// Returns "" when `left` and `right` share no prefix, or when that prefix does
// not occur in `full`.
std::string until_common_prefix(const std::string & full, const std::string & left, const std::string & right) {
    // length of the longest common prefix of left/right
    const size_t limit  = left.size() < right.size() ? left.size() : right.size();
    size_t       shared = 0;
    while (shared < limit && left[shared] == right[shared]) {
        ++shared;
    }

    if (shared == 0) {
        return std::string();
    }

    // search for the first `shared` bytes of `left` inside `full`
    const size_t where = full.find(left.data(), 0, shared);
    if (where == std::string::npos) {
        return std::string();
    }

    return full.substr(0, where);
}
218+
219+
// Returns the part of `full` that follows the last occurrence of the longest
// common suffix of `left` and `right`.
// Returns "" when `left` and `right` share no suffix, or when that suffix does
// not occur in `full`.
std::string after_common_suffix(const std::string & full, const std::string & left, const std::string & right) {
    // length of the longest common suffix of left/right, scanning backwards
    const size_t limit  = left.size() < right.size() ? left.size() : right.size();
    size_t       shared = 0;
    while (shared < limit && left[left.size() - 1 - shared] == right[right.size() - 1 - shared]) {
        ++shared;
    }

    if (shared == 0) {
        return std::string();
    }

    // search backwards for the last `shared` bytes of `left` inside `full`
    const char * tail  = left.data() + (left.size() - shared);
    const size_t where = full.rfind(tail, std::string::npos, shared);
    if (where == std::string::npos) {
        return std::string();
    }

    return full.substr(where + shared);
}
248+
249+
// TODO: segmentize will treat a JSON array inside tags as a tag: <calls>[{ "fun": { ... } }]</calls> will be three markers
250+
// not too worried about that because it hasn't turned out as a problem anywhere, but noting here in case it will
251+
// Might have to put some restrictions on tag contents as well (like "no { }")
252+
std::vector<segment> segmentize_markers(const std::string & text) {
253+
std::vector<segment> retval;
254+
bool in_marker = false;
255+
char marker_opener = '\0';
256+
257+
auto is_marker_opener = [](char c) -> bool { return c == '<' || c == '['; };
258+
auto is_marker_closer = [](char op, char c) -> bool { return (op == '<' && c == '>') || (op == '[' && c == ']'); };
259+
260+
size_t last_border = 0;
261+
262+
for (size_t cur_pos = 0; cur_pos < text.length(); cur_pos++) {
263+
if (!in_marker && is_marker_opener(text[cur_pos])) {
264+
if (last_border < cur_pos) {
265+
retval.push_back(segment(segment_type::TEXT, text.substr(last_border, cur_pos - last_border)));
266+
}
267+
last_border = cur_pos;
268+
in_marker = true;
269+
marker_opener = text[cur_pos];
270+
} else if (in_marker && is_marker_closer(marker_opener, text[cur_pos])) {
271+
// no need to check because last_border will always be smaller
272+
retval.push_back(segment(segment_type::MARKER, text.substr(last_border, cur_pos - last_border + 1)));
273+
last_border = cur_pos + 1;
274+
in_marker = false;
275+
marker_opener = '\0';
276+
}
277+
}
278+
if (last_border < text.length()) {
279+
retval.push_back(segment(segment_type::TEXT, text.substr(last_border)));
280+
}
281+
return retval;
282+
}
283+
284+
std::vector<segment> prune_whitespace_segments(const std::vector<segment> & segments) {
285+
std::vector<segment> result;
286+
for (const auto & seg : segments) {
287+
if (!trim_whitespace(seg.value).empty()) {
288+
result.push_back(seg);
289+
}
290+
}
291+
return result;
292+
}
293+
294+
namespace autoparser {
295+
296+
std::string apply_template(const common_chat_template & tmpl, const template_params & params) {
297+
templates_params tmpl_params;
298+
tmpl_params.messages = params.messages;
299+
tmpl_params.tools = params.tools;
300+
tmpl_params.add_generation_prompt = params.add_generation_prompt;
301+
tmpl_params.enable_thinking = params.enable_thinking;
302+
303+
if (params.extra_context) {
304+
tmpl_params.extra_context = *params.extra_context;
305+
}
306+
tmpl_params.extra_context["enable_thinking"] = params.enable_thinking;
307+
308+
try {
309+
return common_chat_template_direct_apply(tmpl, tmpl_params);
310+
} catch (const std::exception & e) {
311+
LOG_DBG("Template application failed: %s\n", e.what());
312+
return "";
313+
}
314+
}
315+
316+
std::optional<compare_variants_result> compare_variants(
317+
const common_chat_template & tmpl,
318+
const template_params & params_A,
319+
const std::function<void(template_params &)> & params_modifier) {
320+
// Create variant B by copying A
321+
template_params params_B = params_A;
322+
323+
// Apply modifier to create variant B
324+
if (params_modifier) {
325+
params_modifier(params_B);
326+
}
327+
328+
// Apply template to both variants
329+
std::string output_A = apply_template(tmpl, params_A);
330+
std::string output_B = apply_template(tmpl, params_B);
331+
332+
// Check for template application failures
333+
if (output_A.empty() || output_B.empty()) {
334+
return std::nullopt;
335+
}
336+
337+
// Calculate diff and return result with both outputs
338+
compare_variants_result result;
339+
result.diff = calculate_diff_split(output_A, output_B);
340+
result.output_A = output_A;
341+
result.output_B = output_B;
342+
343+
return result;
344+
}
345+
346+
} // namespace autoparser
347+

0 commit comments

Comments
 (0)