Skip to content

Commit 1a43f9c

Browse files
committed
First cut at re_interpolate_groups()
1 parent 27802dc commit 1a43f9c

File tree

8 files changed

+355
-0
lines changed

8 files changed

+355
-0
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ SUBDIR += tests/sql
147147
SUBDIR += tests/queue
148148
SUBDIR += tests/aho_corasick
149149
SUBDIR += tests/retest
150+
SUBDIR += tests/re_interpolate_groups
150151
SUBDIR += tests
151152
.if make(theft) || make(${BUILD}/theft/theft)
152153
SUBDIR += theft

include/re/groups.h

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#ifndef RE_GROUPS_H
8+
#define RE_GROUPS_H
9+
10+
struct re_pos;
11+
12+
/*
13+
* esc is the character for escaping group references,
14+
* typically '\\' or '$'.
15+
*
16+
* group0 is passed separately for caller convenience,
17+
* so you don't have to construct a single array for
18+
* all groups. It's supposed to be the entire string
19+
* that matched. group0 may not be NULL.
20+
*
21+
* groupv is 0-indexed: groupv[0] holds group $1, groupv[1] holds $2, and so on.
22+
* groupc is the count of elements in groupv.
23+
*
24+
* nonexistent is what to do about references to groups
25+
* that are outside the bounds of the array. NULL means
26+
* to error, otherwise the string value will be used.
27+
* Typically this would be passed as "".
28+
*
29+
* You can distinguish compile-time errors (that is,
30+
* syntax errors in the format string) vs. runtime errors
31+
* (that is, nonexistent groups) by calling
32+
* re_interpolate_groups() ahead of time with groupc = 0
33+
* and passing a non-NULL nonexistent value.
34+
*
35+
* The output string may be longer than the format string,
35+
* because each group reference expands to that group's text.
36+
* The output is \0-terminated; outn is the size of outs and includes the \0.
38+
*/
39+
bool
40+
re_interpolate_groups(const char *fmt, char esc,
41+
const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent,
42+
char *outs, size_t outn,
43+
struct re_pos *pos);
44+
45+
#endif
46+

src/libre/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ SRC += src/libre/ast_new_from_fsm.c
1010
SRC += src/libre/ast_rewrite.c
1111
SRC += src/libre/ac.c
1212
SRC += src/libre/print.c
13+
SRC += src/libre/re_interpolate_groups.c
1314
SRC += src/libre/re_strings.c
1415

1516
# generated

src/libre/libre.syms

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ re_flags
44
re_strerror
55
re_perror
66
re_is_anchored
7+
re_interpolate_groups
78

89
ast_print
910
ast_print_dot

src/libre/re_interpolate_groups.c

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#include <assert.h>
8+
#include <stdbool.h>
9+
#include <string.h>
10+
#include <limits.h>
11+
#include <stdio.h>
12+
#include <ctype.h>
13+
14+
#include <re/re.h>
15+
#include <re/groups.h>
16+
17+
// TODO
18+
#define OUT_CHAR(c) do { if (outn < 1) { goto error; } *outs++ = (c); outn--; } while (0)
19+
#define OUT_GROUP(s) do { if (outn < strlen((s))) { goto error; } outs += sprintf(outs, "%s", (s)); outn -= strlen((s)); } while (0)
20+
21+
// TODO: return values: syntax error, nonexistent group error (digit overflow is the same thing), success
22+
bool
23+
re_interpolate_groups(const char *fmt, char esc,
24+
const char *group0, unsigned groupc, const char *groupv[], const char *nonexistent,
25+
char *outs, size_t outn,
26+
struct re_pos *pos)
27+
{
28+
unsigned group; // 0 meaning group0, 1 meaning groupv[0], etc
29+
const char *p;
30+
31+
enum {
32+
STATE_LIT,
33+
STATE_ESC,
34+
STATE_DIGIT
35+
} state;
36+
37+
assert(esc != '\0');
38+
assert(group0 != NULL || groupc == 0);
39+
assert(groupc < UINT_MAX / 10 - 1);
40+
assert(outs != NULL);
41+
42+
state = STATE_LIT;
43+
group = 0;
44+
45+
p = fmt;
46+
do {
47+
switch (state) {
48+
case STATE_LIT:
49+
if (*p == '\0') {
50+
break;
51+
}
52+
53+
if (*p == esc) {
54+
state = STATE_ESC;
55+
continue;
56+
}
57+
58+
OUT_CHAR(*p);
59+
continue;
60+
61+
case STATE_ESC:
62+
if (*p == '\0') {
63+
goto error;
64+
}
65+
66+
if (*p == esc) {
67+
OUT_CHAR(esc);
68+
state = STATE_LIT;
69+
continue;
70+
}
71+
72+
if (isdigit((unsigned char) *p)) {
73+
group = *p - '0';
74+
state = STATE_DIGIT;
75+
continue;
76+
}
77+
78+
goto error;
79+
80+
case STATE_DIGIT:
81+
if (isdigit((unsigned char) *p)) {
82+
group *= 10;
83+
group += *p - '0';
84+
85+
// TODO: explain this
86+
// digit overflow, we cap to groupc + 1
87+
// groupc + 1 is always out of bounds
88+
// this is a simple way to avoid needing to handle digit overflow for subsequent digits,
89+
// assuming groupc *= 10 is <= UINT_MAX
90+
if (group > groupc) {
91+
group = groupc + 1;
92+
}
93+
continue;
94+
}
95+
96+
if (group == 0) {
97+
OUT_GROUP(group0);
98+
} else if (group <= groupc) {
99+
assert(groupv[group - 1] != NULL);
100+
OUT_GROUP(groupv[group - 1]);
101+
} else if (nonexistent == NULL) {
102+
// TODO: maybe want to indicate this independently from syntax errors
103+
// TODO: no need, you can pre-check the entire syntax by running with 0 groups
104+
goto error;
105+
} else {
106+
OUT_GROUP(nonexistent);
107+
}
108+
109+
group = 0;
110+
state = STATE_LIT;
111+
112+
if (*p == '\0') {
113+
break;
114+
}
115+
116+
if (*p == esc) {
117+
state = STATE_ESC;
118+
continue;
119+
}
120+
121+
OUT_CHAR(*p);
122+
continue;
123+
124+
default:
125+
assert(!"unreached");
126+
goto error;
127+
}
128+
} while (*p++);
129+
130+
if (state != STATE_LIT) {
131+
goto error;
132+
}
133+
134+
OUT_CHAR('\0');
135+
136+
return true;
137+
138+
error:
139+
140+
// TODO: track start,end independently
141+
if (pos != NULL) {
142+
pos->byte = p - fmt;
143+
}
144+
145+
return false;
146+
}
147+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
.include "../../share/mk/top.mk"
2+
3+
TEST.tests/re_interpolate_groups != ls -1 tests/re_interpolate_groups/re_interpolate_groups*.c
4+
TEST_SRCDIR.tests/re_interpolate_groups = tests/re_interpolate_groups
5+
TEST_OUTDIR.tests/re_interpolate_groups = ${BUILD}/tests/re_interpolate_groups
6+
7+
.for n in ${TEST.tests/re_interpolate_groups:T:R:C/^re_interpolate_groups//}
8+
test:: ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}
9+
SRC += ${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c
10+
#CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -UNDEBUG
11+
CFLAGS.${TEST_SRCDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.c = -std=c99
12+
13+
${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o
14+
${CC} ${CFLAGS} -o ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} ${TEST_OUTDIR.tests/re_interpolate_groups}/re_interpolate_groups${n}.o ${BUILD}/src/libre/re_interpolate_groups.o
15+
16+
${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}: ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}
17+
( ${TEST_OUTDIR.tests/re_interpolate_groups}/run${n} 1>&2 && echo PASS || echo FAIL ) > ${TEST_OUTDIR.tests/re_interpolate_groups}/res${n}
18+
19+
#.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre}
20+
#${TEST_OUTDIR.tests/re_interpolate_groups}/run${n}: ${BUILD}/lib/${lib:R}.a
21+
#.endfor
22+
.endfor
23+
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#include <assert.h>
8+
#include <stdbool.h>
9+
#include <string.h>
10+
#include <stdio.h>
11+
12+
#include <re/re.h>
13+
#include <re/groups.h>
14+
15+
static unsigned failed;
16+
17+
static void
18+
test(const char *fmt, size_t groupc, const char *groupv[], const char *expected)
19+
{
20+
struct re_pos pos;
21+
char outs[40];
22+
bool r;
23+
24+
assert(fmt != NULL);
25+
assert(expected != NULL);
26+
27+
if (!re_interpolate_groups(fmt, '$', "<g0>", groupc, groupv, "<ne>", outs, sizeof outs, &pos)) {
28+
printf("%s/%zu XXX\n", fmt, groupc);
29+
failed++;
30+
return;
31+
}
32+
33+
failed += r = 0 != strcmp(outs, expected);
34+
35+
printf("%s/%zu => %s%s\n", fmt, groupc, outs,
36+
r ? " XXX" : "");
37+
}
38+
39+
int main(void) {
40+
const char *gn[] = { "one", "two", "three", "four" };
41+
const char **g0 = NULL;
42+
const char *ga[] = { "1" };
43+
const char *gb[] = { "" };
44+
// const char *gc[] = { NULL }; // XXX: not permitted
45+
46+
test("", 0, g0, "");
47+
test("", 4, gn, "");
48+
49+
test("x", 0, g0, "x");
50+
test("x", 4, gn, "x");
51+
52+
test("\001", 0, g0, "\001");
53+
test("\001", 4, gn, "\001");
54+
55+
test("$0", 0, gn, "<g0>");
56+
test("x$000000000000000000000x", 0, gn, "x<g0>x");
57+
test("x$000000000000000000001x", 1, gn, "xone");
58+
test("x$100000000000000000000x", 1, gn, "x<ne>");
59+
60+
test("$$$1$1$2$1$3$4$3$2$1$$$$", 4, gn, "$oneonetwoonethreefourthreetwoone$$");
61+
62+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 4, gn, "xyz_one..three;three,$.one-four=<ne>");
63+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 3, gn, "xyz_one..three;three,$.one-<ne>=<ne>");
64+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 2, gn, "xyz_one..<ne>;<ne>,$.one-<ne>=<ne>");
65+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gn, "xyz_one..<ne>;<ne>,$.one-<ne>=<ne>");
66+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, g0, "xyz_<ne>..<ne>;<ne>,$.<ne>-<ne>=<ne>");
67+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, ga, "xyz_1..<ne>;<ne>,$.1-<ne>=<ne>");
68+
test("xyz_$1..$0003;$3,$$.$1-$4=$123", 1, gb, "xyz_..<ne>;<ne>,$.-<ne>=<ne>");
69+
// test("xyz_$1..$0003;$3,$$.$1-$4=$123", 0, gc, "xyz_<ne>..<ne>;<ne>,$.<ne>-<ne>=<ne>");
70+
71+
return failed;
72+
}
73+
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
/*
2+
* Copyright 2026 Katherine Flavel
3+
*
4+
* See LICENCE for the full copyright terms.
5+
*/
6+
7+
#include <assert.h>
8+
#include <stdbool.h>
9+
#include <string.h>
10+
#include <stdio.h>
11+
12+
#include <re/re.h>
13+
#include <re/groups.h>
14+
15+
static unsigned failed;
16+
17+
static void
18+
test_err(const char *fmt, size_t groupc, const char *groupv[], const char *ne, unsigned expected_pos)
19+
{
20+
struct re_pos pos;
21+
char outs[10];
22+
bool r;
23+
24+
assert(fmt != NULL);
25+
26+
/* for these tests we're expecting to error */
27+
if (re_interpolate_groups(fmt, '$', "<g0>", groupc, groupv, ne, outs, sizeof outs, &pos)) {
28+
printf("%s/%zu XXX\n", fmt, groupc);
29+
failed++;
30+
return;
31+
}
32+
33+
failed += r = expected_pos != pos.byte;
34+
35+
printf("%s/%zu => :%u :%u%s\n", fmt, groupc,
36+
pos.byte, expected_pos,
37+
r ? " XXX" : "");
38+
}
39+
40+
int main(void) {
41+
const char *ne = "<ne>";
42+
43+
const char *gn[] = { "one", "two", "three", "four" };
44+
const char **g0 = NULL;
45+
const char *ga[] = { "1" };
46+
const char *gb[] = { "" };
47+
// const char *gc[] = { NULL }; // XXX: not permitted
48+
49+
test_err("$", 0, g0, ne, 1);
50+
test_err("$x", 0, g0, ne, 1);
51+
test_err("$ ", 4, gn, ne, 1);
52+
test_err("$\\01", 0, g0, ne, 1);
53+
54+
test_err("$$$x", 4, gn, ne, 3);
55+
56+
test_err("xyz$1", 0, gn, NULL, 5);
57+
test_err("xyz$2", 1, gn, NULL, 5);
58+
59+
test_err("01234567890", 1, gn, ne, 10);
60+
61+
return failed;
62+
}
63+

0 commit comments

Comments
 (0)