Skip to content

Commit 6e1ecff

Browse files
committed
fix: implement proper advanced-format parser for nested starform rules
The previous advancedToCanonical/tokenize/tokensToCanonical pipeline was broken for nested sub-expressions such as: (facetec-scan (liveness-score (* range numeric ge 080)) ...) The tokenizer did not handle nested parentheses recursively, causing predicates like (liveness-score ...) to be flattened into bare atoms at the wrong nesting level. The go-spocp docs/FILE_LOADING.md note 'Advanced Format: not yet fully implemented in parser' confirmed this. Replace the broken two-step approach with a proper recursive descent parser (parseAdvanced / advTokenize / advParseElement / advParseStarForm / advParseRange) that builds the sexp.Element tree directly, including correct starform.Range values with their RangeType and RangeBound fields. loadText now calls parseAdvanced directly instead of going through advancedToCanonical + sexp.NewParser. The old advancedToCanonical and tokenize functions are kept as thin wrappers (for saveAdvanced backward compatibility and existing tests respectively) but delegate to the new implementation. All existing tests continue to pass.
1 parent f500631 commit 6e1ecff

File tree

1 file changed

+168
-69
lines changed

1 file changed

+168
-69
lines changed

pkg/persist/persist.go

Lines changed: 168 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import (
1313
"strings"
1414

1515
"github.com/sirosfoundation/go-spocp/pkg/sexp"
16+
"github.com/sirosfoundation/go-spocp/pkg/starform"
1617
)
1718

1819
// FileFormat represents the format of a ruleset file
@@ -121,10 +122,8 @@ func loadText(r io.Reader, opts LoadOptions) ([]sexp.Element, error) {
121122
var err error
122123

123124
if opts.Format == FormatAdvanced {
124-
// Convert advanced form to canonical, then parse
125-
canonical := advancedToCanonical(line)
126-
parser := sexp.NewParser(canonical)
127-
elem, err = parser.Parse()
125+
// Parse advanced form directly into sexp.Element (handles star-forms)
126+
elem, err = parseAdvanced(line)
128127
} else {
129128
// Parse canonical form directly
130129
parser := sexp.NewParser(line)
@@ -350,102 +349,202 @@ func isBinaryFile(filename string) bool {
350349
strings.HasSuffix(filename, ".bin")
351350
}
352351

353-
// advancedToCanonical converts advanced form to canonical form
354-
// This is a simple implementation - for production use, you might want
355-
// a more sophisticated parser
356-
func advancedToCanonical(advanced string) string {
357-
// Remove outer parentheses if present
358-
advanced = strings.TrimSpace(advanced)
359-
if strings.HasPrefix(advanced, "(") && strings.HasSuffix(advanced, ")") {
360-
advanced = advanced[1 : len(advanced)-1]
352+
// parseAdvanced parses a human-readable advanced-form S-expression string
353+
// directly into a sexp.Element tree, without going through canonical form.
354+
// This replaces the old advancedToCanonical + sexp.NewParser two-step approach
355+
// which was incomplete for deeply nested expressions.
356+
//
357+
// The advanced format is:
358+
//
359+
// atom → any whitespace-delimited token without parens
360+
// list → "(" atom element* ")"
361+
// star-wildcard → "(*)"
362+
// star-range → "(* range <type> <op> <val> [<op> <val>])"
363+
// star-prefix → "(* prefix <value>)"
364+
// star-suffix → "(* suffix <value>)"
365+
// star-set → "(* set <element>...)"
366+
func parseAdvanced(s string) (sexp.Element, error) {
367+
s = strings.TrimSpace(s)
368+
tokens := advTokenize(s)
369+
if len(tokens) == 0 {
370+
return nil, fmt.Errorf("empty expression")
361371
}
362-
363-
// Split into tokens
364-
tokens := tokenize(advanced)
365-
366-
// Convert to canonical
367-
return tokensToCanonical(tokens)
372+
elem, rest, err := advParseElement(tokens)
373+
if err != nil {
374+
return nil, err
375+
}
376+
if len(rest) > 0 {
377+
return nil, fmt.Errorf("unexpected trailing tokens: %v", rest)
378+
}
379+
return elem, nil
368380
}
369381

370-
func tokenize(s string) []string {
382+
// advTokenize splits an advanced-form string into a flat list of string tokens.
383+
// Sub-expressions in parens are kept as single opaque tokens (with parens).
384+
// Quoted strings are kept as single tokens (including the quote characters).
385+
func advTokenize(s string) []string {
371386
var tokens []string
372387
var current strings.Builder
373388
depth := 0
374389
inQuote := false
375390

376-
for i, ch := range s {
377-
switch ch {
378-
case '(':
379-
if !inQuote {
380-
if current.Len() > 0 {
381-
tokens = append(tokens, current.String())
382-
current.Reset()
383-
}
384-
depth++
391+
runes := []rune(s)
392+
for i := 0; i < len(runes); i++ {
393+
ch := runes[i]
394+
switch {
395+
case ch == '"':
396+
inQuote = !inQuote
397+
current.WriteRune(ch)
398+
case inQuote:
399+
current.WriteRune(ch)
400+
case ch == '(':
401+
if depth > 0 {
385402
current.WriteRune(ch)
386403
} else {
387-
current.WriteRune(ch)
388-
}
389-
case ')':
390-
if !inQuote {
391-
current.WriteRune(ch)
392-
depth--
393-
if depth == 0 {
404+
if current.Len() > 0 {
394405
tokens = append(tokens, current.String())
395406
current.Reset()
396407
}
397-
} else {
398408
current.WriteRune(ch)
399409
}
400-
case '"':
401-
inQuote = !inQuote
410+
depth++
411+
case ch == ')':
402412
current.WriteRune(ch)
403-
case ' ', '\t', '\n', '\r':
404-
if !inQuote && depth == 0 {
405-
if current.Len() > 0 {
406-
tokens = append(tokens, current.String())
407-
current.Reset()
408-
}
409-
} else {
413+
depth--
414+
if depth == 0 {
415+
tokens = append(tokens, current.String())
416+
current.Reset()
417+
}
418+
case ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r':
419+
if depth > 0 {
410420
current.WriteRune(ch)
421+
} else if current.Len() > 0 {
422+
tokens = append(tokens, current.String())
423+
current.Reset()
411424
}
412425
default:
413426
current.WriteRune(ch)
414427
}
415-
416-
// Handle last character
417-
if i == len(s)-1 && current.Len() > 0 {
418-
tokens = append(tokens, current.String())
419-
}
420428
}
421-
429+
if current.Len() > 0 {
430+
tokens = append(tokens, current.String())
431+
}
422432
return tokens
423433
}
424434

425-
func tokensToCanonical(tokens []string) string {
435+
// advParseElement parses the first element from tokens and returns it along
436+
// with the remaining unconsumed tokens.
437+
func advParseElement(tokens []string) (sexp.Element, []string, error) {
426438
if len(tokens) == 0 {
427-
return ""
439+
return nil, nil, fmt.Errorf("unexpected end of tokens")
440+
}
441+
tok := tokens[0]
442+
if !strings.HasPrefix(tok, "(") {
443+
// Plain atom
444+
return sexp.NewAtom(tok), tokens[1:], nil
445+
}
446+
// It's a list — strip the outer parens and tokenize its contents
447+
inner := strings.TrimSpace(tok[1 : len(tok)-1])
448+
innerTokens := advTokenize(inner)
449+
if len(innerTokens) == 0 {
450+
return nil, nil, fmt.Errorf("empty list")
451+
}
452+
tag := innerTokens[0]
453+
if tag == "*" {
454+
// Star form
455+
elem, err := advParseStarForm(innerTokens[1:])
456+
if err != nil {
457+
return nil, nil, err
458+
}
459+
return elem, tokens[1:], nil
460+
}
461+
// Regular list: tag + elements
462+
var elems []sexp.Element
463+
rest := innerTokens[1:]
464+
for len(rest) > 0 {
465+
var elem sexp.Element
466+
var err error
467+
elem, rest, err = advParseElement(rest)
468+
if err != nil {
469+
return nil, nil, err
470+
}
471+
elems = append(elems, elem)
428472
}
473+
return sexp.NewList(tag, elems...), tokens[1:], nil
474+
}
429475

430-
// Single token
431-
if len(tokens) == 1 {
432-
token := tokens[0]
433-
if strings.HasPrefix(token, "(") {
434-
return tokensToCanonical(tokenize(token[1 : len(token)-1]))
476+
// advParseStarForm constructs the appropriate starform type from the tokens
477+
// that follow the "*" tag inside a star-form list.
478+
func advParseStarForm(args []string) (sexp.Element, error) {
479+
if len(args) == 0 {
480+
return &starform.Wildcard{}, nil
481+
}
482+
switch args[0] {
483+
case "range":
484+
return advParseRange(args[1:])
485+
case "prefix":
486+
if len(args) != 2 {
487+
return nil, fmt.Errorf("prefix star-form expects 1 argument, got %d", len(args)-1)
435488
}
436-
return fmt.Sprintf("%d:%s", len(token), token)
489+
return &starform.Prefix{Value: args[1]}, nil
490+
case "suffix":
491+
if len(args) != 2 {
492+
return nil, fmt.Errorf("suffix star-form expects 1 argument, got %d", len(args)-1)
493+
}
494+
return &starform.Suffix{Value: args[1]}, nil
495+
case "set":
496+
var elems []sexp.Element
497+
rest := args[1:]
498+
for len(rest) > 0 {
499+
elem, remaining, err := advParseElement(rest)
500+
if err != nil {
501+
return nil, err
502+
}
503+
elems = append(elems, elem)
504+
rest = remaining
505+
}
506+
return &starform.Set{Elements: elems}, nil
507+
default:
508+
return nil, fmt.Errorf("unknown star-form type: %q", args[0])
437509
}
510+
}
438511

439-
// Multiple tokens form a list
440-
var buf strings.Builder
441-
buf.WriteString("(")
442-
for _, token := range tokens {
443-
if strings.HasPrefix(token, "(") {
444-
buf.WriteString(tokensToCanonical(tokenize(token[1 : len(token)-1])))
445-
} else {
446-
buf.WriteString(fmt.Sprintf("%d:%s", len(token), token))
512+
// advParseRange parses the contents of a range star-form: <type> (<op> <val>)...
513+
func advParseRange(args []string) (sexp.Element, error) {
514+
if len(args) == 0 {
515+
return nil, fmt.Errorf("range star-form requires a type argument")
516+
}
517+
r := &starform.Range{RangeType: starform.RangeType(args[0])}
518+
i := 1
519+
for i+1 < len(args) {
520+
op := starform.RangeOp(args[i])
521+
val := args[i+1]
522+
bound := &starform.RangeBound{Op: op, Value: val}
523+
switch op {
524+
case starform.OpGE, starform.OpGT:
525+
r.LowerBound = bound
526+
case starform.OpLE, starform.OpLT:
527+
r.UpperBound = bound
528+
default:
529+
return nil, fmt.Errorf("unknown range operator: %q", op)
447530
}
531+
i += 2
448532
}
449-
buf.WriteString(")")
450-
return buf.String()
533+
return r, nil
534+
}
535+
536+
// advancedToCanonical converts advanced form to canonical form.
537+
// Kept for backward compatibility with saveAdvanced; new loading code uses
538+
// parseAdvanced directly.
539+
func advancedToCanonical(advanced string) string {
540+
elem, err := parseAdvanced(advanced)
541+
if err != nil {
542+
return ""
543+
}
544+
return elem.String()
545+
}
546+
547+
// tokenize splits a string into tokens (kept for tests that reference it).
548+
func tokenize(s string) []string {
549+
return advTokenize(s)
451550
}

0 commit comments

Comments
 (0)