Skip to content

Commit ede6b5f

Browse files
committed
rewrite whitespace-only
1 parent 2e5ad07 commit ede6b5f

File tree

5 files changed

+993
-27
lines changed

5 files changed

+993
-27
lines changed

CMakeLists.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,4 +84,14 @@ add_test(NAME lws5
8484
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
8585
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/5)
8686

87+
add_test(NAME lws6
88+
COMMAND ${CMAKE_COMMAND}
89+
-DCMD=$<TARGET_FILE:${PROJECT_NAME}>
90+
-DSRC=b-comms.c
91+
-DPATCH=gemini.patch
92+
-DEXPSHA=6ea83a67aba0358099752cfaf83a28d5d983b50855e93352ae9c04d656c7911e
93+
-DEXPSHA_WIN=2e6b9b12ae0128c9edfc109744b9c67848712b0521c322a45104895aa4cbc3b1
94+
-P ${CMAKE_CURRENT_SOURCE_DIR}/tests/runtest.cmake
95+
WORKING_DIRECTORY ${PROJECT_SOURCE_DIR}/tests/6)
96+
8797

README.md

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# fixdiff
22

3-
Andy Green <[email protected]> 2025
4-
See MIT license in LICENSE
3+
Copyright (C) 2025 Andy Green <[email protected]>
4+
Licensed under MIT license, see LICENSE
55

66
```
77
$ cat llm-patch.diff | fixdiff | patch -p1
@@ -31,15 +31,17 @@ headers with accurate line counts on stdout.
3131

3232
It silently repairs:
3333

34-
- added empty lines with only whitespace become blank lines
35-
- wrong "before" line in original stanza header
36-
- wrong "before" line count in original stanza header
37-
- wrong "after" line in original stanza header
38-
- wrong "after" line count in original stanza header
39-
- removes extra lead-in context lines in stanza
40-
- for diffs adding to end of file, corrects mismatching context caused by
41-
LLM losing blank lines at the original EOF (by checking the original
42-
source file for extra lines and adding them to the stanza as context)
34+
1. added lines containing only whitespace, by rewriting them as blank lines
35+
2. original lines in the diff that differ from the real line in the file only by
36+
whitespace are rewritten to contain the correct whitespace
37+
3. wrong "before" line in original stanza header
38+
4. wrong "before" line count in original stanza header
39+
5. wrong "after" line in original stanza header
40+
6. wrong "after" line count in original stanza header
41+
7. extra lead-in context lines in a stanza, by removing them until only 3 remain
42+
8. diffs adding to end of file with missing or wrong context caused by
43+
LLM losing blank lines at the original EOF are rewritten by checking
44+
the original source file for extra lines and adding them to the stanza as context
4345

4446
It finds and scans the sources the patches apply to and uses the diff stanza to
4547
find the original line it applied to by itself, along with the original line

fixdiff.c

Lines changed: 157 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -84,11 +84,21 @@ typedef struct {
8484
int li;
8585
} lbuf_t;
8686

87+
typedef struct rewriter { /* one pending replacement for a stanza line that differs from the source only by whitespace */
88+
struct rewriter *next; /* next entry in the list; new entries are pushed at pdp->rewriter_head */
89+
size_t len; /* number of bytes at text that are written out in place of the stanza line */
90+
int line; /* stanza line number (lb_temp.li when recorded) that this replacement applies to */
91+
char *text; /* replacement bytes: the stanza line's first (diff prefix) char, then the source line */
92+
} rewriter_t;
93+
/* text points at storage overallocated directly after the struct (allocation is sizeof(*rwt) + ls + 1) */
94+
8795
typedef struct {
8896
off_t flo;
8997

9098
const char *reason;
9199

100+
rewriter_t *rewriter_head;
101+
92102
dss_t d;
93103
int pre;
94104
int post;
@@ -102,6 +112,8 @@ typedef struct {
102112

103113
int fd_temp;
104114

115+
int li_out;
116+
105117
char ongoing;
106118
char skip_this_one;
107119
char lead_in_active;
@@ -313,10 +325,26 @@ fixdiff_stanza_start(dp_t *pdp, char *sh, size_t len)
313325
return 0;
314326
}
315327

328+
/*
 * Copy the string `in` into `dest` for use in diagnostics, replacing every
 * tab character with '>' so that whitespace differences are visible.
 *
 * dest: destination buffer of at least `len` bytes
 * in:   NUL-terminated source string
 * len:  size of `dest` in bytes; at most len - 1 characters are copied and
 *       the result is always NUL-terminated
 *
 * If `len` is 0 nothing is written: without this guard `len - 1` would wrap
 * around to SIZE_MAX and the copy / terminator store would be out of bounds.
 */
static void
stain_copy(char *dest, const char *in, size_t len)
{
	size_t n = 0;

	if (!len)
		return;

	/* single bounded pass: copy and "stain" tabs at the same time */
	while (n < len - 1 && in[n]) {
		dest[n] = in[n] == '\t' ? '>' : in[n];
		n++;
	}

	/* terminate, and zero the remainder as the strncpy() version did */
	memset(dest + n, 0, len - n);
}
343+
316344
static int
317345
fixdiff_find_original(dp_t *pdp, int *line_start)
318346
{
319-
char in_src[4096], in_temp[4096], b1[256], b2[256], hit = 0;
347+
char in_src[4096], in_temp[4096], b1[256], b2[256], f1[256], f2[256], hit = 0;
320348
int ret = 1, mc = 0, lmc = 0, lis = 0, lg_lis = 0;
321349
lbuf_t lb_temp, lb_src, lb;
322350
size_t lt, ls;
@@ -329,6 +357,8 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
329357
lb_src.fd = lb.fd = -1;
330358
b1[0] = '\0';
331359
b2[0] = '\0';
360+
f1[0] = '\0';
361+
f2[0] = '\0';
332362

333363
init_lbuf(&lb_temp, "temp");
334364
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDWR));
@@ -402,26 +432,92 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
402432
break;
403433

404434
if (!ls) {
405-
elog("failed to match, best chunk %d lines at %s:%d\n",
435+
elog("failed to match, best chunk %d lines at %s:%d (tabs shown below as >)\n",
406436
lmc, pdp->pf, lg_lis);
407-
elog("patch: '%s', source '%s'\n", b1, b2);
437+
elog("last match: patch: '%s', source: '%s'\n", b1, b2);
438+
elog("divergence: patch: '%s', source: '%s'\n", f1, f2);
408439
mc = 0;
409440
break;
410441
}
411442

412443
if (fixdiff_strcmp(in_temp + 1, lt - 1, &let, in_src, ls, &les)) {
413-
if (mc > pdp->pre + pdp->post)
414-
elog("match failed after %d: '%s' / '%s'", mc, in_temp + 1, in_src);
444+
/*
445+
* It's still possible we only differ by whitespace.
446+
* Does it match if we treat any whitespace as a single
447+
* whitespace match token?
448+
*/
449+
450+
char *p1 = in_temp + 1, *p1_end = p1 + lt - 1 - (int)let,
451+
*p2 = in_src, *p2_end = p2 + ls - (int)les;
452+
453+
while (p1 < p1_end && p2 < p2_end) {
454+
char wst1 = 0, wst2 = 0;
455+
456+
while (*p1 == ' ' || *p1 == '\t' && p1 < p1_end) {
457+
p1++;
458+
wst1 = 1;
459+
}
460+
while (*p2 == ' ' || *p2 == '\t' && p2 < p2_end) {
461+
p2++;
462+
wst2 = 1;
463+
}
464+
465+
if (wst1 != wst2)
466+
goto record_breakage;
467+
468+
if (*p1 != *p2)
469+
goto record_breakage;
470+
471+
p1++;
472+
p2++;
473+
}
474+
475+
if ((p1 < p1_end) != (p2 < p2_end))
476+
goto record_breakage;
477+
478+
elog("(fixable whitespace-only difference at stanza line %d)\n", lb_temp.li);
479+
480+
{
481+
rewriter_t *rwt = malloc(sizeof(*rwt) + ls + 1);
482+
if (!rwt) {
483+
elog("OOM\n");
484+
return -1;
485+
}
486+
rwt->next = pdp->rewriter_head;
487+
pdp->rewriter_head = rwt;
488+
rwt->line = lb_temp.li;
489+
rwt->text = (char *)&rwt[1];
490+
rwt->text[0] = *in_temp;
491+
rwt->len = ls + 1;
492+
memcpy(rwt->text + 1, in_src, ls);
493+
}
494+
goto allow_match_ws;
495+
496+
record_breakage:
497+
if (mc + 1 > lmc) {
498+
stain_copy(f1, in_temp + 1, sizeof(f1));
499+
stain_copy(f2, in_src, sizeof(f2));
500+
}
415501
mc = 0;
502+
{
503+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
504+
505+
while (rwt) {
506+
rwt1 = rwt->next;
507+
free(rwt);
508+
rwt = rwt1;
509+
}
510+
511+
pdp->rewriter_head = NULL;
512+
}
416513
break;
417514
}
418515

516+
allow_match_ws:
419517
mc++;
420518
if (mc > lmc) {
421-
strncpy(b1, in_temp + 1, sizeof(b1) - 1);
422-
b1[sizeof(b1) - 1] = '\0';
423-
strncpy(b2, in_src + 1, sizeof(b2) - 1);
424-
b2[sizeof(b2) - 1] = '\0';
519+
stain_copy(b1, in_temp + 1, sizeof(b1));
520+
stain_copy(b2, in_src, sizeof(b2));
425521
lmc++;
426522
lg_lis = lis;
427523
}
@@ -512,8 +608,9 @@ fixdiff_find_original(dp_t *pdp, int *line_start)
512608
static int
513609
fixdiff_stanza_end(dp_t *pdp)
514610
{
611+
int orig, nope = 0;
612+
lbuf_t lb_temp;
515613
char buf[256];
516-
int orig;
517614

518615
if (!pdp->ongoing)
519616
return 0;
@@ -554,21 +651,64 @@ fixdiff_stanza_end(dp_t *pdp)
554651

555652
/* dump the temp side-buffer into stdout */
556653

557-
lseek(pdp->fd_temp, pdp->flo, SEEK_SET);
654+
init_lbuf(&lb_temp, "lb_temp");
655+
lb_temp.fd = open(pdp->temp, OFLAGS(O_RDONLY));
656+
lseek(lb_temp.fd, pdp->flo, SEEK_SET);
657+
558658
while (1) {
559-
ssize_t l = read(pdp->fd_temp, buf, sizeof(buf));
659+
char buf[4096];
660+
ssize_t l = fixdiff_get_line(&lb_temp, buf, sizeof(buf));
661+
rewriter_t *rwt = pdp->rewriter_head;
662+
560663
if (!l)
561664
break;
562665

563-
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
564-
pdp->reason = "failed to write to stdout";
565-
return 1;
666+
// elog("dumping %d (len %d)\n", (int)pdp->li_out, (int)l);
667+
668+
while (rwt) {
669+
// elog("%d %d\n", rwt->line, pdp->li_out);
670+
if (rwt->line == lb_temp.li /*pdp->li_out*/) /* we need to rewrite this line */
671+
break;
672+
673+
rwt = rwt->next;
674+
}
675+
676+
if (rwt) {
677+
// elog("rewriting '%.*s' to '%.*s'\n", (int)l, buf, (int)rwt->len, rwt->text);
678+
if (write(1, rwt->text, TO_POSLEN(rwt->len)) != (ssize_t)rwt->len) {
679+
pdp->reason = "failed to write to stdout";
680+
nope = 1;
681+
break;
682+
}
683+
} else {
684+
if (write(1, buf, TO_POSLEN(l)) != (ssize_t)l) {
685+
pdp->reason = "failed to write to stdout";
686+
nope = 1;
687+
break;
688+
}
566689
}
690+
691+
pdp->li_out++;
567692
}
568693

569-
close(pdp->fd_temp);
694+
{
695+
rewriter_t *rwt = pdp->rewriter_head, *rwt1;
696+
697+
while (rwt) {
698+
rwt1 = rwt->next;
699+
free(rwt);
700+
rwt = rwt1;
701+
}
702+
703+
pdp->rewriter_head = NULL;
704+
}
705+
706+
close(lb_temp.fd);
570707
pdp->fd_temp = -1;
571708

709+
if (nope)
710+
return 1;
711+
572712
/* track the effect stanza changes are having on line offsets */
573713
pdp->delta += pdp->post - pdp->pre;
574714

@@ -611,6 +751,7 @@ main(int argc, char *argv[])
611751
dp.d = DSS_WAIT_MMM;
612752
dp.lb.fd = 0; /* stdin */
613753
dp.fd_temp = -1;
754+
dp.li_out = 1;
614755

615756
while (1) {
616757
size_t l = fixdiff_get_line(&dp.lb, in, sizeof(in));

0 commit comments

Comments
 (0)