Skip to content

Commit a1526ab

Browse files
authored
Merge pull request #516 from katef/kate/interpolate_groups
Add re_interpolate groups()
2 parents d817464 + 74ab9a7 commit a1526ab

File tree

9 files changed

+469
-0
lines changed

9 files changed

+469
-0
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ SUBDIR += tests/sql
147147
SUBDIR += tests/queue
148148
SUBDIR += tests/aho_corasick
149149
SUBDIR += tests/retest
150+
SUBDIR += tests/re_interpolate_groups
150151
SUBDIR += tests
151152
.if make(theft) || make(${BUILD}/theft/theft)
152153
SUBDIR += theft

include/re/groups.h

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#ifndef RE_GROUPS_H
8+
#define RE_GROUPS_H
9+
10+
struct re_pos;
11+
12+
/*
 * Interpolate group references in a format string.
 *
 * fmt is copied to the output with each group reference replaced by
 * the text of that group. A reference is esc followed by one or more
 * decimal digits giving the group number; leading zeros are permitted.
 * esc followed by esc produces a single literal esc. esc followed by
 * anything else, or esc at the very end of fmt, is a syntax error.
 *
 * esc is the character for escaping group references,
 * typically '\\' or '$'. It may not be '\0'.
 *
 * group0 is passed separately for caller convenience,
 * so you don't have to construct a single array for
 * all groups. It's supposed to be the entire string
 * that matched. group0 may not be NULL.
 *
 * groupv is 0-indexed meaning group $1 onwards.
 * groupc is the count of elements in groupv.
 * Elements of groupv[] that are referenced may not be NULL.
 *
 * nonexistent is what to do about references to groups
 * that are outside the bounds of the array. NULL means
 * to error, otherwise the string value will be used.
 * Typically this would be passed as "".
 *
 * start,end are only meaningful on error, and either may be NULL.
 * start->byte is the offset in fmt of the escape character that
 * begins the offending reference, or 0 when the error is that the
 * output buffer is too small. end->byte is the offset in fmt at
 * which the error was detected. Note start->byte may also be
 * written to when the call succeeds.
 *
 * You can distinguish compile-time errors (that is,
 * syntax errors in the format string) vs. runtime errors
 * (that is, nonexistent groups) by calling
 * re_interpolate_groups() ahead of time with groupc = 0
 * and passing a non-NULL nonexistent value.
 *
 * The output string will always be less than or equal in
 * length to the format string when all interpolated
 * values are the empty string. That is, when groupc is 0
 * and nonexistent is the empty string, or when all groups
 * used from groupv[] are the empty string.
 *
 * The output is \0-terminated. outn is the size of the outs
 * buffer and includes room for the \0. If the buffer is too
 * small for the interpolated string and its terminator, that
 * is an error.
 *
 * outs may be NULL in which case outn must be 0, and no
 * output is made.
 *
 * Returns true on success. On error the function returns
 * false and the output buffer is indeterminate.
 */
51+
bool
52+
re_interpolate_groups(const char *fmt, char esc,
53+
const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent,
54+
char *outs, size_t outn,
55+
struct re_pos *start, struct re_pos *end);
56+
57+
#endif
58+

src/libre/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ SRC += src/libre/ast_new_from_fsm.c
1010
SRC += src/libre/ast_rewrite.c
1111
SRC += src/libre/ac.c
1212
SRC += src/libre/print.c
13+
SRC += src/libre/re_interpolate_groups.c
1314
SRC += src/libre/re_strings.c
1415

1516
# generated

src/libre/libre.syms

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ re_flags
44
re_strerror
55
re_perror
66
re_is_anchored
7+
re_interpolate_groups
78

89
ast_print
910
ast_print_dot

src/libre/re_interpolate_groups.c

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#include <assert.h>
8+
#include <stdbool.h>
9+
#include <string.h>
10+
#include <limits.h>
11+
#include <stdio.h>
12+
#include <ctype.h>
13+
14+
#include <re/re.h>
15+
#include <re/groups.h>
16+
17+
#define OUT_CHAR(c) \
18+
do if (outs != NULL) { \
19+
if (outn < 1) goto overflow; \
20+
*outs++ = (c); \
21+
outn--; \
22+
} while (0)
23+
24+
#define OUT_GROUP(s) \
25+
do if (outs != NULL) { \
26+
size_t n = strlen((s)); \
27+
if (outn < n) goto overflow; \
28+
(void) memcpy(outs, s, n); \
29+
outs += n; \
30+
outn -= n; \
31+
} while (0)
32+
33+
bool
34+
re_interpolate_groups(const char *fmt, char esc,
35+
const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent,
36+
char *outs, size_t outn,
37+
struct re_pos *start, struct re_pos *end)
38+
{
39+
unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc
40+
char *outs_orig;
41+
const char *p;
42+
43+
enum {
44+
STATE_LIT,
45+
STATE_ESC,
46+
STATE_DIGIT
47+
} state;
48+
49+
assert(esc != '\0');
50+
assert(group0 != NULL || groupc == 0);
51+
assert(groupc < UINT_MAX / 10 - 1);
52+
assert(outs != NULL || outn == 0);
53+
54+
state = STATE_LIT;
55+
group = 0;
56+
57+
outs_orig = outn > 0 ? outs : NULL;
58+
59+
if (start != NULL) {
60+
start->byte = 0;
61+
}
62+
63+
p = fmt;
64+
65+
do {
66+
switch (state) {
67+
case STATE_LIT:
68+
if (*p == '\0') {
69+
break;
70+
}
71+
72+
if (*p == esc) {
73+
if (start != NULL) {
74+
start->byte = p - fmt;
75+
}
76+
77+
state = STATE_ESC;
78+
continue;
79+
}
80+
81+
OUT_CHAR(*p);
82+
continue;
83+
84+
case STATE_ESC:
85+
if (*p == '\0') {
86+
goto error;
87+
}
88+
89+
if (*p == esc) {
90+
OUT_CHAR(esc);
91+
state = STATE_LIT;
92+
continue;
93+
}
94+
95+
if (isdigit((unsigned char) *p)) {
96+
group = *p - '0';
97+
state = STATE_DIGIT;
98+
continue;
99+
}
100+
101+
goto error;
102+
103+
case STATE_DIGIT:
104+
if (isdigit((unsigned char) *p)) {
105+
group *= 10;
106+
group += *p - '0';
107+
108+
/*
109+
* We need to handle numeric overflow somehow here,
110+
* as we would with using strtol() or similar. But
111+
* we don't need to distinguish this as a special
112+
* error code, semantically it's the same as a group
113+
* that doesn't exist.
114+
*
115+
* groupc + 1 is always out of bounds. So we cap to that,
116+
* using it as a simple way to avoid needing to handle
117+
* numeric overflow for subsequent digits. This assumes
118+
* groupc *= 10 is <= UINT_MAX.
119+
*/
120+
if (group > groupc) {
121+
group = groupc + 1;
122+
}
123+
continue;
124+
}
125+
126+
if (group == 0) {
127+
OUT_GROUP(group0);
128+
} else if (group <= groupc) {
129+
assert(groupv[group - 1] != NULL);
130+
OUT_GROUP(groupv[group - 1]);
131+
} else if (nonexistent == NULL) {
132+
/*
133+
* We could indicate this independently from syntax errors,
134+
* with some way to return different error codes.
135+
*
136+
* But there's no need, you can pre-check the fmt syntax
137+
* by running ahead of time with groupc == 0 and pass
138+
* nonexistent != NULL, because that eliminates the
139+
* possibility for group-related errors.
140+
*/
141+
goto error;
142+
} else {
143+
OUT_GROUP(nonexistent);
144+
}
145+
146+
group = 0;
147+
state = STATE_LIT;
148+
149+
if (*p == '\0') {
150+
break;
151+
}
152+
153+
if (*p == esc) {
154+
if (start != NULL) {
155+
start->byte = p - fmt;
156+
}
157+
158+
state = STATE_ESC;
159+
continue;
160+
}
161+
162+
OUT_CHAR(*p);
163+
continue;
164+
165+
default:
166+
assert(!"unreached");
167+
goto error;
168+
}
169+
} while (*p != '\0' && p++);
170+
171+
if (state != STATE_LIT) {
172+
goto error;
173+
}
174+
175+
OUT_CHAR('\0');
176+
177+
return true;
178+
179+
overflow:
180+
181+
/* we're blaming the entire fmt string for overflow */
182+
if (start != NULL) {
183+
start->byte = 0;
184+
}
185+
186+
error:
187+
188+
if (end != NULL) {
189+
end->byte = p - fmt;
190+
}
191+
192+
if (outs_orig != NULL) {
193+
*outs_orig = '\0';
194+
}
195+
196+
return false;
197+
}
198+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.include "../../share/mk/top.mk"

# Every re_interpolate_groups*.c in this directory is a standalone test program.
TEST.tests/re_interpolate_groups != ls -1 tests/re_interpolate_groups/re_interpolate_groups*.c
TEST_SRCDIR.tests/re_interpolate_groups = tests/re_interpolate_groups
TEST_OUTDIR.tests/re_interpolate_groups = ${BUILD}/tests/re_interpolate_groups

# ${n} is whatever follows the "re_interpolate_groups" prefix in each
# test's file name, with the directory and the .c suffix removed.
.for n in ${TEST.tests/re_interpolate_groups:T:R:C/^re_interpolate_groups//}
test:: ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}
SRC += ${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c
#CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -UNDEBUG
CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -std=c99

# The test links directly against the single libre object it exercises,
# rather than the library, so that object must be a prerequisite too;
# otherwise the link may run before it is built, or miss a rebuild.
${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o ${BUILD}/src/libre/re_interpolate_groups.o
	${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o ${BUILD}/src/libre/re_interpolate_groups.o

# The result file records PASS or FAIL by the test's exit status;
# the test's own stdout is redirected to stderr.
${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}
	( ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}

#.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
#${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${BUILD}/lib/${lib:R}.a
#.endfor
.endfor
23+
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#include <assert.h>
8+
#include <stdbool.h>
9+
#include <string.h>
10+
#include <stdio.h>
11+
12+
#include <re/re.h>
13+
#include <re/groups.h>
14+
15+
static unsigned failed;
16+
17+
static void
18+
test(const char *fmt, size_t groupc, const char *groupv[], const char *expected)
19+
{
20+
char outs[40];
21+
bool r;
22+
23+
assert(fmt != NULL);
24+
assert(expected != NULL);
25+
26+
if (!re_interpolate_groups(fmt, '$', "<g0>", groupc, groupv, "<ne>", outs, sizeof outs, NULL, NULL)) {
27+
printf("%s/%zu XXX\n", fmt, groupc);
28+
failed++;
29+
return;
30+
}
31+
32+
failed += r = 0 != strcmp(outs, expected);
33+
34+
printf("%s/%zu => %s%s\n", fmt, groupc, outs,
35+
r ? " XXX" : "");
36+
}
37+
38+
int main(void) {
39+
const char *gn[] = { "one", "two", "three", "four" };
40+
const char **g0 = NULL;
41+
const char *ga[] = { "1" };
42+
const char *gb[] = { "" };
43+
// const char *gc[] = { NULL }; // XXX: not permitted
44+
45+
test("", 0, g0, "");
46+
test("", 4, gn, "");
47+
48+
test("x", 0, g0, "x");
49+
test("x", 4, gn, "x");
50+
51+
test("\001", 0, g0, "\001");
52+
test("\001", 4, gn, "\001");
53+
54+
test("$0", 0, gn, "<g0>");
55+
test("x$000000000000000000000x", 0, gn, "x<g0>x");
56+
test("x$000000000000000000001x", 1, gn, "xonex");
57+
test("x$100000000000000000000x", 1, gn, "x<ne>x");
58+
59+
test("$$$1$1$2$1$3$4$3$2$1$$$$", 4, gn, "$oneonetwoonethreefourthreetwoone$$");
60+
test("$$$$$$$$$$$$$$$$$$$$", 4, gn, "$$$$$$$$$$");
61+
62+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 4, gn, "xyz_one..three;three,$.one-four=<ne>");
63+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 3, gn, "xyz_one..three;three,$.one-<ne>=<ne>");
64+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 2, gn, "xyz_one..<ne>;<ne>,$.one-<ne>=<ne>");
65+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gn, "xyz_one..<ne>;<ne>,$.one-<ne>=<ne>");
66+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, g0, "xyz_<ne>..<ne>;<ne>,$.<ne>-<ne>=<ne>");
67+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, ga, "xyz_1..<ne>;<ne>,$.1-<ne>=<ne>");
68+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gb, "xyz_..<ne>;<ne>,$.-<ne>=<ne>");
69+
70+
return failed;
71+
}
72+

0 commit comments

Comments
 (0)