diff --git a/.gitignore b/.gitignore index d262e9c..a5a239e 100644 --- a/.gitignore +++ b/.gitignore @@ -44,4 +44,7 @@ go.work.sum # node_modules **/node_modules/ -**/*.class \ No newline at end of file +**/*.class + +# No binary files +**/bin/** \ No newline at end of file diff --git a/go.mod b/go.mod index ff4a681..8cefc8a 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.24.5 require ( github.com/antlr4-go/antlr/v4 v4.13.1 + github.com/pkg/errors v0.9.1 github.com/stretchr/testify v1.10.0 ) diff --git a/go.sum b/go.sum index c96f3a5..7367760 100644 --- a/go.sum +++ b/go.sum @@ -2,6 +2,8 @@ github.com/bytebase/antlr/v4 v4.0.0-20240827034948-8c385f108920 h1:IfmPt5o5R70NK github.com/bytebase/antlr/v4 v4.0.0-20240827034948-8c385f108920/go.mod h1:ykhjIPiv0IWpu3OGXCHdz2eUSe8UNGGD6baqjs8jSuU= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= diff --git a/tools/fuzzing/DESIGN.md b/tools/fuzzing/DESIGN.md deleted file mode 100644 index d5fc9c0..0000000 --- a/tools/fuzzing/DESIGN.md +++ /dev/null @@ -1,230 +0,0 @@ -# Grammar-Aware Fuzzing Tool Design - -## Overview - -A fuzzing tool that generates valid SQL inputs by analyzing ANTLR v4 grammar files, ensuring comprehensive parser testing with syntactically correct queries that can stress-test parsing performance and correctness. - -## Goals - -- **Valid Input Generation**: Generate syntactically correct SQL queries based on grammar rules -- **Performance Testing**: Create complex queries to test parser performance limits -- **Coverage Maximization**: Exercise all grammar rules and edge cases -- **Automated Testing**: Integrate with CI for continuous parser validation - -## Architecture - -``` -tools/fuzzing/ -├── generator/ # Core generation logic -│ ├── grammar_analyzer.go # Parse ANTLR grammar files -│ ├── rule_expander.go # Expand grammar rules to concrete syntax -│ └── query_builder.go # Build SQL queries from rule expansions -├── strategies/ # Different generation strategies -│ ├── depth_first.go # Generate deeply nested structures -│ ├── breadth_first.go # Generate wide, complex queries -│ └── weighted.go # Probability-based rule selection -├── corpus/ # Generated test cases and seeds -│ ├── seeds/ # Hand-crafted seed inputs -│ └── generated/ # Auto-generated test cases -└── cmd/ # CLI tools - └── fuzzer/ # Main fuzzer executable -``` - -## Core Components - -### 1. Grammar Analyzer - -Leverages the existing `tools/grammar/` ANTLR v4 parser to: -- Parse target grammar files (e.g., `postgresql.g4`, `cql.g4`) -- Extract production rules and their alternatives -- Build dependency graph between rules -- Identify terminal vs non-terminal symbols - -```go -type GrammarAnalyzer struct { - parser *grammar.ANTLRv4Parser - rules map[string]*Rule -} - -type Rule struct { - Name string - Alternatives []Alternative - Type RuleType // LEXER, PARSER, FRAGMENT -} -``` - -### 2. 
Rule Expander - -Recursively expands grammar rules into concrete syntax trees: -- Handles rule recursion with configurable depth limits -- Supports probability-weighted alternative selection -- Manages lexer rules and literal generation -- Tracks generation context for smart decisions - -```go -type RuleExpander struct { - grammar *ParsedGrammar - maxDepth int - weights map[string]float64 - random *rand.Rand -} -``` - -### 3. Query Builder - -Converts syntax trees to executable SQL strings: -- Handles whitespace and formatting -- Manages identifier generation (table names, columns) -- Ensures semantic consistency where possible -- Outputs parseable query strings - -## Generation Strategies - -### Depth-First Strategy -- Generates deeply nested subqueries, expressions -- Tests parser stack limits and recursion handling -- Focuses on structural complexity - -### Breadth-First Strategy -- Creates wide queries with many clauses, joins, columns -- Tests parser memory usage and performance -- Focuses on query size and breadth - -### Weighted Strategy -- Uses probability weights for rule selection -- Biases toward commonly used constructs -- Configurable via weight files per dialect - -## Integration Points - -### With Existing Grammar Parser -```go -// Reuse tools/grammar/ for parsing target grammars -analyzer := NewGrammarAnalyzer() -targetGrammar, err := analyzer.ParseGrammarFile("postgresql/PostgreSQLLexer.g4") -``` - -### With Parser Testing -```go -// Generate test cases for specific parser -fuzzer := NewFuzzer(postgresqlGrammar) -queries := fuzzer.GenerateQueries(1000) - -for _, query := range queries { - // Test against postgresql parser - result := postgresqlParser.Parse(query) - // Collect metrics, detect crashes -} -``` - -## Configuration - -### Fuzzer Config -```yaml -target_grammar: "postgresql" -strategies: - - name: "depth_first" - weight: 0.3 - max_depth: 15 - - name: "breadth_first" - weight: 0.4 - max_width: 50 - - name: "weighted" - weight: 0.3 - weights_file: "postgresql_weights.yaml" - -generation: - count: 10000 - max_query_length: 100000 - seed: 42 - -output: - format: "sql" - directory: "corpus/generated" -``` - -### Grammar Weights -```yaml -# postgresql_weights.yaml -rules: - selectStmt: 0.4 - insertStmt: 0.2 - updateStmt: 0.2 - deleteStmt: 0.1 - createStmt: 0.1 - - # Bias toward complex expressions - expr: - binaryOp: 0.4 - functionCall: 0.3 - subquery: 0.2 - literal: 0.1 -``` - -## CLI Interface - -```bash -# Generate queries for PostgreSQL -./fuzzer generate --grammar postgresql --count 1000 --strategy weighted - -# Run continuous fuzzing with performance metrics -./fuzzer fuzz --grammar cql --duration 1h --metrics - -# Validate existing corpus against parser -./fuzzer validate --grammar postgresql --corpus corpus/postgresql/ -``` - -## Performance Metrics - -### Generation Metrics -- Queries generated per second -- Grammar rule coverage percentage -- Distribution of query complexity (depth, width) - -### Parser Testing Metrics -- Parse success rate -- Average parse time per query -- Memory usage during parsing -- Parser crash/error detection - -## Implementation Phases - -### Phase 1: Foundation (Week 1-2) -- Basic grammar analyzer using existing ANTLR parser -- Simple rule expander with depth-first strategy -- Command-line interface for manual testing - -### Phase 2: Core Features (Week 3-4) -- Multiple generation strategies -- Configuration system -- Basic corpus management -- Integration with existing parser tests - -### Phase 3: Advanced Features (Week 5-6) -- 
Weighted generation with probability tuning -- Performance metrics collection -- CI integration for continuous fuzzing -- Corpus minimization and deduplication - -### Phase 4: Optimization (Week 7-8) -- Generation performance optimization -- Advanced semantic awareness -- Custom mutation strategies -- Comprehensive documentation - -## Future Enhancements - -- **Semantic Awareness**: Generate queries with valid schema references -- **Mutation-Based Fuzzing**: Mutate existing queries to explore edge cases -- **Differential Testing**: Compare parser outputs across database dialects -- **Performance Regression Detection**: Track parser performance over time -- **Grammar Evolution**: Adapt fuzzing as grammars evolve - -## Dependencies - -- Existing `tools/grammar/` ANTLR v4 parser -- Go standard library (`rand`, `fmt`, `strings`) -- YAML configuration parsing -- CLI framework (e.g., `cobra`) - -This design provides a solid foundation for grammar-aware fuzzing while leveraging our existing ANTLR infrastructure. \ No newline at end of file diff --git a/tools/fuzzing/Makefile b/tools/fuzzing/Makefile new file mode 100644 index 0000000..227ca53 --- /dev/null +++ b/tools/fuzzing/Makefile @@ -0,0 +1,38 @@ +.PHONY: all test clean help + +all: test + +# Run tests +test: + @echo "Running tests..." + go test -v github.com/bytebase/parser/tools/fuzzing/... + +# Clean build artifacts +clean: + @echo "Cleaning..." + go clean + +# Install dependencies +deps: + @echo "Installing dependencies..." + cd ../.. && go mod tidy && go mod download + +# Format code +fmt: + @echo "Formatting code..." + go fmt github.com/bytebase/parser/tools/fuzzing/... + +# Run linter +lint: + @echo "Running linter..." + golangci-lint run + +# Show help +help: + @echo "Available targets:" + @echo " test - Run all tests" + @echo " clean - Clean build artifacts" + @echo " deps - Install/update dependencies" + @echo " fmt - Format all Go code" + @echo " lint - Run golangci-lint" + @echo " help - Show this help message" \ No newline at end of file diff --git a/tools/fuzzing/internal/config/config.go b/tools/fuzzing/internal/config/config.go new file mode 100644 index 0000000..3d4e27a --- /dev/null +++ b/tools/fuzzing/internal/config/config.go @@ -0,0 +1,116 @@ +package config + +import ( + "fmt" + + "github.com/pkg/errors" +) + +// OutputFormat represents different output formatting options +type OutputFormat int + +const ( + // CompactOutput shows cleaner, more readable output (default) + CompactOutput OutputFormat = iota + // VerboseOutput shows full grammar rule traversal with comments + VerboseOutput +) + +// ParseOutputFormat parses a string into an OutputFormat +func ParseOutputFormat(s string) OutputFormat { + switch s { + case "compact", "": + return CompactOutput + case "verbose": + return VerboseOutput + default: + return CompactOutput + } +} + +// Config holds all configuration options for the fuzzer +type Config struct { + GrammarFiles []string // Can be one file (combined) or two files (lexer,parser) + StartRule string + Count int + MaxDepth int + OptionalProb float64 + MaxQuantifier int + MinQuantifier int + QuantifierCount int + Output string + OutputFormat OutputFormat // How to format the output + Seed int64 +} + +// Validate checks if the configuration is valid +func (c *Config) Validate() error { + if len(c.GrammarFiles) == 0 { + return errors.New("--grammar is required") + } + + if len(c.GrammarFiles) > 2 { + return errors.New("--grammar accepts maximum 2 files (lexer,parser)") + } + + if c.StartRule == "" { + return 
errors.New("--start-rule is required") + } + + if c.Count <= 0 { + return errors.New("--count must be positive") + } + + if c.MaxDepth <= 0 { + return errors.New("--max-depth must be positive") + } + + if c.OptionalProb < 0.0 || c.OptionalProb > 1.0 { + return errors.New("--optional-prob must be between 0.0 and 1.0") + } + + if c.MaxQuantifier <= 0 { + return errors.New("--max-quantifier must be positive") + } + + if c.MinQuantifier < 0 { + return errors.New("--min-quantifier must be non-negative") + } + + if c.MinQuantifier > c.MaxQuantifier { + return errors.New("--min-quantifier cannot be greater than --max-quantifier") + } + + if c.QuantifierCount < 0 { + return errors.New("--quantifier-count must be non-negative") + } + + return nil +} + +// Print displays the configuration +func (c *Config) Print() { + fmt.Printf("Grammar-Aware Fuzzer\n") + if len(c.GrammarFiles) == 1 { + fmt.Printf("Grammar File: %s\n", c.GrammarFiles[0]) + } else if len(c.GrammarFiles) == 2 { + fmt.Printf("Lexer File: %s\n", c.GrammarFiles[0]) + fmt.Printf("Parser File: %s\n", c.GrammarFiles[1]) + } + fmt.Printf("Start Rule: %s\n", c.StartRule) + fmt.Printf("Count: %d\n", c.Count) + fmt.Printf("Max Depth: %d\n", c.MaxDepth) + fmt.Printf("Optional Probability: %.2f\n", c.OptionalProb) + fmt.Printf("Max Quantifier: %d\n", c.MaxQuantifier) + if c.MinQuantifier > 0 { + fmt.Printf("Min Quantifier: %d\n", c.MinQuantifier) + } + if c.QuantifierCount > 0 { + fmt.Printf("Fixed Quantifier Count: %d\n", c.QuantifierCount) + } + if c.Output != "" { + fmt.Printf("Output: %s\n", c.Output) + } + fmt.Printf("Seed: %d\n", c.Seed) + fmt.Println() +} \ No newline at end of file diff --git a/tools/fuzzing/internal/generator/generator.go b/tools/fuzzing/internal/generator/generator.go new file mode 100644 index 0000000..9eb2ea6 --- /dev/null +++ b/tools/fuzzing/internal/generator/generator.go @@ -0,0 +1,466 @@ +package generator + +import ( + "fmt" + "math/rand" + "strings" + + "github.com/bytebase/parser/tools/fuzzing/internal/config" + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" + "github.com/pkg/errors" +) + +// Generator handles the fuzzing logic +type Generator struct { + config *config.Config + random *rand.Rand + grammar *grammar.ParsedGrammar +} + +// WorkItem represents a unit of work in the generation stack +type WorkItem struct { + RuleName string + Depth int + Result *string // Pointer to where the result should be stored +} + +// New creates a new generator with the given configuration +func New(cfg *config.Config) *Generator { + return &Generator{ + config: cfg, + random: rand.New(rand.NewSource(cfg.Seed)), + grammar: nil, + } +} + +// Generate produces the specified number of queries +func (g *Generator) Generate() error { + fmt.Println("Initializing grammar parser...") + + // Parse and merge all grammar files into a single grammar + var err error + g.grammar, err = grammar.ParseAndMergeGrammarFiles(g.config.GrammarFiles) + if err != nil { + return errors.Wrap(err, "failed to parse and merge grammar files") + } + + fmt.Printf("Parsed and merged %d grammar files into single grammar\n", len(g.config.GrammarFiles)) + + // Validate start rule exists + if g.grammar.GetRule(g.config.StartRule) == nil { + return errors.Errorf("start rule '%s' not found in merged grammar", g.config.StartRule) + } + + fmt.Printf("Generating %d queries from rule '%s'...\n", g.config.Count, g.config.StartRule) + + // Generate queries + for i := 0; i < g.config.Count; i++ { + query := g.generateQuery(i + 1) + fmt.Printf("Query %d: 
%s\n", i+1, query) + } + + return nil +} + +// getRule gets a rule from the merged grammar +func (g *Generator) getRule(ruleName string) *grammar.Rule { + return g.grammar.GetRule(ruleName) +} + + +// generateQuery creates a single query using grammar rules +func (g *Generator) generateQuery(index int) string { + // Start generation from the specified start rule with no recursion limit for now + result := g.generateFromRule(g.config.StartRule, 0) + return result +} + +// generateFromRule generates text from a grammar rule +func (g *Generator) generateFromRule(ruleName string, currentDepth int) string { + // Check depth limit to prevent infinite recursion + if currentDepth >= g.config.MaxDepth { + return fmt.Sprintf("<%s_MAX_DEPTH>", ruleName) + } + + // Get the rule + rule := g.getRule(ruleName) + if rule == nil { + // If rule not found, return placeholder + return fmt.Sprintf("<%s>", ruleName) + } + + // Select a random alternative + if len(rule.Alternatives) == 0 { + return fmt.Sprintf("<%s>", ruleName) + } + + altIndex := g.random.Intn(len(rule.Alternatives)) + alternative := rule.Alternatives[altIndex] + + // Generate from all elements in the alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + // Format output based on configuration + switch g.config.OutputFormat { + case config.CompactOutput: + // Clean, readable output without verbose comments (default) + return joinWithSpaces(result) + case config.VerboseOutput: + // Full grammar rule traversal with comments + return fmt.Sprintf("/* %s */ %s", ruleName, joinWithSpaces(result)) + default: + // Default to compact + return joinWithSpaces(result) + } +} + +// generateFromElement generates text from a single grammar element +func (g *Generator) generateFromElement(element *grammar.Element, currentDepth int) string { + // Handle optional elements + if element.IsOptional() && g.random.Float64() > g.config.OptionalProb { + return "" // Skip optional element + } + + // Handle quantified elements + if element.IsQuantified() { + return g.generateQuantified(element, currentDepth) + } + + // Generate single element + if element.IsRule() { + if refValue, ok := element.Value.(grammar.ReferenceValue); ok { + return g.generateFromRuleOrToken(refValue.Name, currentDepth+1) + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + return g.generateFromBlock(blockValue, currentDepth) + } + return g.generateFromRuleOrToken(element.Value.String(), currentDepth+1) + } else if element.IsTerminal() { + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + return cleanLiteral(litValue.Text) + } + return cleanLiteral(element.Value.String()) + } + + return element.Value.String() +} + +// generateQuantified handles quantified elements (* +) +func (g *Generator) generateQuantified(element *grammar.Element, currentDepth int) string { + var count int + + // Use fixed count if specified, otherwise use random count + if g.config.QuantifierCount > 0 { + count = g.config.QuantifierCount + } else { + switch element.Quantifier { + case grammar.ZERO_MORE: // * + count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier + case grammar.ONE_MORE: // + + count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier + default: + count = 1 + } + } + + var results []string + for i := 0; i < count; i++ { + if element.IsRule() { + if refValue, ok := 
element.Value.(grammar.ReferenceValue); ok { + result := g.generateFromRuleOrToken(refValue.Name, currentDepth+1) + results = append(results, result) + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + result := g.generateFromBlock(blockValue, currentDepth+1) + results = append(results, result) + } else { + result := g.generateFromRuleOrToken(element.Value.String(), currentDepth+1) + results = append(results, result) + } + } else if element.IsTerminal() { + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + results = append(results, cleanLiteral(litValue.Text)) + } else { + results = append(results, cleanLiteral(element.Value.String())) + } + } + } + + return joinWithSpaces(results) +} + +// generateFromBlock generates content from a block value +func (g *Generator) generateFromBlock(blockValue grammar.BlockValue, currentDepth int) string { + if len(blockValue.Alternatives) == 0 { + return "" + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(blockValue.Alternatives)) + alternative := blockValue.Alternatives[altIndex] + + // Generate from all elements in the selected alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + return joinWithSpaces(result) +} + + +// generateFromRuleOrToken generates from a rule using standard rule-based generation +func (g *Generator) generateFromRuleOrToken(ruleName string, currentDepth int) string { + // Check if this is a lexer rule and generate concrete token + if rule := g.grammar.GetRule(ruleName); rule != nil && rule.IsLexer { + return g.generateConcreteToken(ruleName) + } + + // Otherwise expand as parser rule + return g.generateFromRule(ruleName, currentDepth) +} + +// generateConcreteToken generates concrete tokens by expanding lexer rules +func (g *Generator) generateConcreteToken(ruleName string) string { + // Get the lexer rule + rule := g.grammar.GetRule(ruleName) + if rule == nil || !rule.IsLexer { + return fmt.Sprintf("<%s>", ruleName) + } + + // For lexer rules, we need to expand them but generate concrete characters + // at the terminal level (character sets, literals, etc.) 
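+	// For example, given a lexer rule such as
+	//   IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]* ;
+	// the character-set terminals are expanded into concrete characters,
+	// producing a token like "_ab3" instead of the "<IDENTIFIER>" placeholder.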
+ return g.generateFromLexerRule(rule, 0) +} + +// generateFromLexerRule generates content from a lexer rule +func (g *Generator) generateFromLexerRule(rule *grammar.Rule, currentDepth int) string { + if len(rule.Alternatives) == 0 { + return "" + } + + // Select a random alternative + altIndex := g.random.Intn(len(rule.Alternatives)) + alternative := rule.Alternatives[altIndex] + + // Generate from all elements in the alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromLexerElement(&element, currentDepth) + if elementResult != "" { + result = append(result, elementResult) + } + } + + return strings.Join(result, "") +} + +// generateFromLexerElement generates content from a lexer element +func (g *Generator) generateFromLexerElement(element *grammar.Element, currentDepth int) string { + // Handle optional elements + if element.IsOptional() && g.random.Float64() > g.config.OptionalProb { + return "" // Skip optional element + } + + // Handle quantified elements + if element.IsQuantified() { + return g.generateQuantifiedLexer(element, currentDepth) + } + + // Generate single element + if element.IsRule() { + if refValue, ok := element.Value.(grammar.ReferenceValue); ok { + // Check if referenced rule is lexer or parser + if referencedRule := g.grammar.GetRule(refValue.Name); referencedRule != nil && referencedRule.IsLexer { + return g.generateFromLexerRule(referencedRule, currentDepth+1) + } else { + // Parser rule - shouldn't happen in lexer context, but handle it + return g.generateFromRule(refValue.Name, currentDepth+1) + } + } else if blockValue, ok := element.Value.(grammar.BlockValue); ok { + return g.generateFromLexerBlock(blockValue, currentDepth) + } + return element.Value.String() + } else if element.IsTerminal() { + if litValue, ok := element.Value.(grammar.LiteralValue); ok { + return g.generateFromLiteral(litValue.Text) + } + return g.generateFromLiteral(element.Value.String()) + } + + return element.Value.String() +} + +// generateQuantifiedLexer handles quantified lexer elements +func (g *Generator) generateQuantifiedLexer(element *grammar.Element, currentDepth int) string { + var count int + + // Use fixed count if specified, otherwise use random count + if g.config.QuantifierCount > 0 { + count = g.config.QuantifierCount + } else { + switch element.Quantifier { + case grammar.ZERO_MORE: // * + count = g.random.Intn(g.config.MaxQuantifier + 1) // 0 to MaxQuantifier + case grammar.ONE_MORE: // + + count = 1 + g.random.Intn(g.config.MaxQuantifier) // 1 to MaxQuantifier + default: + count = 1 + } + } + + var results []string + for i := 0; i < count; i++ { + result := g.generateFromLexerElement(&grammar.Element{ + Value: element.Value, + Quantifier: grammar.NONE, // Remove quantifier for individual generation + }, currentDepth+1) + if result != "" { + results = append(results, result) + } + } + + return strings.Join(results, "") +} + +// generateFromLexerBlock generates content from a lexer block +func (g *Generator) generateFromLexerBlock(blockValue grammar.BlockValue, currentDepth int) string { + if len(blockValue.Alternatives) == 0 { + return "" + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(blockValue.Alternatives)) + alternative := blockValue.Alternatives[altIndex] + + // Generate from all elements in the selected alternative + var result []string + for _, element := range alternative.Elements { + elementResult := g.generateFromLexerElement(&element, currentDepth) + if 
elementResult != "" { + result = append(result, elementResult) + } + } + + return strings.Join(result, "") +} + +// generateFromLiteral generates concrete characters from lexer literals and character sets +func (g *Generator) generateFromLiteral(literal string) string { + // Handle character sets like ~[\u0000"] or [a-zA-Z_] + if strings.HasPrefix(literal, "~[") && strings.HasSuffix(literal, "]") { + return g.generateFromNegatedSet(literal[2 : len(literal)-1]) + } else if strings.HasPrefix(literal, "[") && strings.HasSuffix(literal, "]") { + return g.generateFromCharacterSet(literal[1 : len(literal)-1]) + } + + // Handle string literals + if strings.HasPrefix(literal, "'") && strings.HasSuffix(literal, "'") && len(literal) >= 2 { + return literal[1 : len(literal)-1] // Remove quotes + } + + // Handle special escape sequences + switch literal { + case "\\r": + return "\r" + case "\\n": + return "\n" + case "\\t": + return "\t" + case "\\\"": + return "\"" + case "\\'": + return "'" + case "\\\\": + return "\\" + } + + // Return as-is for other cases + return literal +} + +// generateFromCharacterSet generates a random character from a character set like [a-zA-Z_] +func (g *Generator) generateFromCharacterSet(charset string) string { + chars := []rune{} + + // Simple character set expansion - handle ranges like a-z, A-Z, 0-9 + i := 0 + for i < len(charset) { + if i+2 < len(charset) && charset[i+1] == '-' { + // Handle range like a-z + start := rune(charset[i]) + end := rune(charset[i+2]) + for r := start; r <= end; r++ { + chars = append(chars, r) + } + i += 3 + } else { + // Single character + chars = append(chars, rune(charset[i])) + i++ + } + } + + if len(chars) == 0 { + return "x" // Fallback + } + + return string(chars[g.random.Intn(len(chars))]) +} + +// generateFromNegatedSet generates a character NOT in the specified set +func (g *Generator) generateFromNegatedSet(negatedSet string) string { + // For simplicity, generate common safe characters that are typically not in negated sets + safeChars := []string{"a", "b", "c", "x", "y", "z", "_", "1", "2", "3"} + + // TODO: Implement proper negated set handling by expanding the set and excluding those characters + // For now, just return a safe character + return safeChars[g.random.Intn(len(safeChars))] +} + + +// cleanLiteral removes quotes from literal strings +func cleanLiteral(literal string) string { + // Remove single quotes from literals like 'SELECT' + if len(literal) >= 2 && literal[0] == '\'' && literal[len(literal)-1] == '\'' { + return literal[1 : len(literal)-1] + } + return literal +} + +// joinWithSpaces joins strings with spaces, skipping empty strings +func joinWithSpaces(strs []string) string { + var nonEmpty []string + for _, s := range strs { + if s != "" { + nonEmpty = append(nonEmpty, s) + } + } + if len(nonEmpty) == 0 { + return "" + } + return joinStrings(nonEmpty, " ") +} + +// joinStrings joins strings with a separator +func joinStrings(strs []string, sep string) string { + if len(strs) == 0 { + return "" + } + if len(strs) == 1 { + return strs[0] + } + + result := strs[0] + for i := 1; i < len(strs); i++ { + result += sep + strs[i] + } + return result +} \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/parser.go b/tools/fuzzing/internal/grammar/parser.go new file mode 100644 index 0000000..cd43d1c --- /dev/null +++ b/tools/fuzzing/internal/grammar/parser.go @@ -0,0 +1,828 @@ +package grammar + +import ( + "fmt" + "os" + "strings" + + "github.com/antlr4-go/antlr/v4" + "github.com/pkg/errors" 
+ grammar "github.com/bytebase/parser/tools/grammar" +) + +// ParsedGrammar represents a parsed grammar with extracted rules +type ParsedGrammar struct { + LexerRules map[string]*Rule + ParserRules map[string]*Rule + FilePath string + // BlockAltMap stores temporary block rules for debugging + // Key: block ID (e.g., "block_1_alts"), Value: the block alternatives + BlockAltMap map[string][]Alternative +} + +// Rule represents a grammar rule with its alternatives +type Rule struct { + Name string + Alternatives []Alternative + IsLexer bool +} + +// Alternative represents one alternative of a rule +type Alternative struct { + Elements []Element +} + +// Global block ID counter for generating unique block names +var globalBlockID = 0 + +// ElementValue represents different types of element values +type ElementValue interface { + // String returns a string representation for display/debugging + String() string +} + +// LiteralValue represents a literal string (e.g., 'SELECT') +type LiteralValue struct { + Text string +} + +func (l LiteralValue) String() string { return l.Text } + +// ReferenceValue represents a reference to a rule or token (e.g., IDENTIFIER, selectStmt) +type ReferenceValue struct { + Name string +} + +func (r ReferenceValue) String() string { return r.Name } + +// BlockValue represents a generated block (e.g., (',' column)*) +type BlockValue struct { + ID string // Global unique ID like "block_1_alts" + Alternatives []Alternative +} + +func (b BlockValue) String() string { + if len(b.Alternatives) == 0 { + return "" + } + if len(b.Alternatives) == 1 { + elements := []string{} + for _, elem := range b.Alternatives[0].Elements { + elements = append(elements, elem.Value.String()) + } + return fmt.Sprintf("(%s)", strings.Join(elements, " ")) + } + return b.ID +} + + +// WildcardValue represents a wildcard (.) +type WildcardValue struct{} + +func (w WildcardValue) String() string { return "." } + +// Element represents an element within an alternative +type Element struct { + Value ElementValue + Quantifier Quantifier +} + +// Quantifier indicates repetition type +type Quantifier int + +const ( + NONE Quantifier = iota + OPTIONAL_Q // ? 
+ ZERO_MORE // * + ONE_MORE // + +) + +// ParseGrammarFile parses a .g4 file and extracts rules for fuzzing +func ParseGrammarFile(filePath string) (*ParsedGrammar, error) { + // Read file content + content, err := os.ReadFile(filePath) + if err != nil { + return nil, errors.Wrap(err, "failed to read grammar file") + } + + if len(content) == 0 { + return nil, errors.New("grammar file is empty") + } + + // Create input stream + input := antlr.NewInputStream(string(content)) + + // Create lexer + lexer := grammar.NewANTLRv4Lexer(input) + + // Add error listener + errorListener := &GrammarErrorListener{} + lexer.RemoveErrorListeners() + lexer.AddErrorListener(errorListener) + + // Create token stream + stream := antlr.NewCommonTokenStream(lexer, 0) + + // Create parser + parser := grammar.NewANTLRv4Parser(stream) + + // Add error listener to parser + parser.RemoveErrorListeners() + parser.AddErrorListener(errorListener) + + // Parse the grammar + tree := parser.GrammarSpec() + + // Check for parsing errors + if errorListener.HasErrors() { + return nil, errors.Errorf("failed to parse grammar: %v", errorListener.GetErrors()) + } + + if tree == nil { + return nil, errors.New("parser returned nil tree") + } + + // Extract rules from parse tree + visitor := NewGrammarExtractorVisitor() + visitor.VisitGrammarSpec(tree) + + + + return &ParsedGrammar{ + LexerRules: visitor.lexerRules, + ParserRules: visitor.parserRules, + FilePath: filePath, + BlockAltMap: visitor.blockAltMap, + }, nil +} + +// GetRule gets a rule by name from either lexer or parser rules +func (g *ParsedGrammar) GetRule(name string) *Rule { + if rule, ok := g.ParserRules[name]; ok { + return rule + } + if rule, ok := g.LexerRules[name]; ok { + return rule + } + return nil +} + +// GetAllRules returns all rules (both lexer and parser) +func (g *ParsedGrammar) GetAllRules() map[string]*Rule { + allRules := make(map[string]*Rule) + for name, rule := range g.LexerRules { + allRules[name] = rule + } + for name, rule := range g.ParserRules { + allRules[name] = rule + } + return allRules +} + +// GetBlockAlternatives returns the alternatives for a generated block ID +func (g *ParsedGrammar) GetBlockAlternatives(blockID string) ([]Alternative, bool) { + alts, exists := g.BlockAltMap[blockID] + return alts, exists +} + +// IsGeneratedBlock checks if a name refers to a generated block +func (g *ParsedGrammar) IsGeneratedBlock(name string) bool { + _, exists := g.BlockAltMap[name] + return exists +} + +// MergeGrammar merges another grammar into this one +func (g *ParsedGrammar) MergeGrammar(other *ParsedGrammar) error { + // Merge lexer rules + for name, rule := range other.LexerRules { + if _, exists := g.LexerRules[name]; exists { + return fmt.Errorf("duplicate lexer rule '%s' found in grammars '%s' and '%s'", name, g.FilePath, other.FilePath) + } + g.LexerRules[name] = rule + } + + // Merge parser rules + for name, rule := range other.ParserRules { + if _, exists := g.ParserRules[name]; exists { + return fmt.Errorf("duplicate parser rule '%s' found in grammars '%s' and '%s'", name, g.FilePath, other.FilePath) + } + g.ParserRules[name] = rule + } + + // Merge block alternatives map + for blockID, alternatives := range other.BlockAltMap { + if _, exists := g.BlockAltMap[blockID]; exists { + return fmt.Errorf("duplicate block ID '%s' found in grammars '%s' and '%s'", blockID, g.FilePath, other.FilePath) + } + g.BlockAltMap[blockID] = alternatives + } + + // Update file path to indicate it's a merged grammar + if g.FilePath != other.FilePath { 
+ g.FilePath = fmt.Sprintf("%s + %s", g.FilePath, other.FilePath) + } + + return nil +} + +// ParseAndMergeGrammarFiles parses multiple grammar files and merges them into a single ParsedGrammar +func ParseAndMergeGrammarFiles(filePaths []string) (*ParsedGrammar, error) { + if len(filePaths) == 0 { + return nil, errors.New("no grammar files provided") + } + + // Parse the first grammar file + mergedGrammar, err := ParseGrammarFile(filePaths[0]) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse first grammar file %s", filePaths[0]) + } + + // Merge additional grammar files + for i := 1; i < len(filePaths); i++ { + filePath := filePaths[i] + grammar, err := ParseGrammarFile(filePath) + if err != nil { + return nil, errors.Wrapf(err, "failed to parse grammar file %s", filePath) + } + + if err := mergedGrammar.MergeGrammar(grammar); err != nil { + return nil, errors.Wrapf(err, "failed to merge grammar file %s", filePath) + } + } + + return mergedGrammar, nil +} + +// IsRule checks if an element refers to another rule or generated block +func (e *Element) IsRule() bool { + _, isRef := e.Value.(ReferenceValue) + _, isBlock := e.Value.(BlockValue) + return isRef || isBlock +} + +// IsTerminal checks if an element is a terminal (literal) +func (e *Element) IsTerminal() bool { + _, isLit := e.Value.(LiteralValue) + _, isWild := e.Value.(WildcardValue) + return isLit || isWild +} + +// IsOptional checks if an element has optional quantifier +func (e *Element) IsOptional() bool { + return e.Quantifier == OPTIONAL_Q +} + +// IsQuantified checks if an element has repetition quantifiers +func (e *Element) IsQuantified() bool { + return e.Quantifier == ZERO_MORE || e.Quantifier == ONE_MORE +} + +// GrammarErrorListener collects parsing errors +type GrammarErrorListener struct { + errors []string +} + +func (l *GrammarErrorListener) SyntaxError(recognizer antlr.Recognizer, offendingSymbol interface{}, line, column int, msg string, e antlr.RecognitionException) { + l.errors = append(l.errors, fmt.Sprintf("line %d:%d %s", line, column, msg)) +} + +func (l *GrammarErrorListener) ReportAmbiguity(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex int, exact bool, ambigAlts *antlr.BitSet, configs *antlr.ATNConfigSet) { + // Ignore ambiguity for fuzzing purposes +} + +func (l *GrammarErrorListener) ReportAttemptingFullContext(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex int, conflictingAlts *antlr.BitSet, configs *antlr.ATNConfigSet) { + // Ignore for fuzzing purposes +} + +func (l *GrammarErrorListener) ReportContextSensitivity(recognizer antlr.Parser, dfa *antlr.DFA, startIndex, stopIndex, prediction int, configs *antlr.ATNConfigSet) { + // Ignore for fuzzing purposes +} + +func (l *GrammarErrorListener) HasErrors() bool { + return len(l.errors) > 0 +} + +func (l *GrammarErrorListener) GetErrors() []string { + return l.errors +} + +// GrammarExtractorVisitor extracts rules from the parse tree +type GrammarExtractorVisitor struct { + *grammar.BaseANTLRv4ParserVisitor + lexerRules map[string]*Rule + parserRules map[string]*Rule + blockAltMap map[string][]Alternative +} + +// NewGrammarExtractorVisitor creates a new visitor +func NewGrammarExtractorVisitor() *GrammarExtractorVisitor { + v := &GrammarExtractorVisitor{ + BaseANTLRv4ParserVisitor: &grammar.BaseANTLRv4ParserVisitor{}, + lexerRules: make(map[string]*Rule), + parserRules: make(map[string]*Rule), + blockAltMap: make(map[string][]Alternative), + } + return v +} + +// VisitGrammarSpec visits the grammar 
specification +func (v *GrammarExtractorVisitor) VisitGrammarSpec(ctx grammar.IGrammarSpecContext) interface{} { + // Visit rules section + if rulesCtx := ctx.Rules(); rulesCtx != nil { + v.VisitRules(rulesCtx) + } + return nil +} + +// VisitRules visits the rules section +func (v *GrammarExtractorVisitor) VisitRules(ctx grammar.IRulesContext) interface{} { + // Visit all rule specifications + for _, ruleSpecCtx := range ctx.AllRuleSpec() { + v.VisitRuleSpec(ruleSpecCtx) + } + return nil +} + +// VisitRuleSpec visits a rule specification (could be parser or lexer rule) +func (v *GrammarExtractorVisitor) VisitRuleSpec(ctx grammar.IRuleSpecContext) interface{} { + // Handle parser rules + if parserRuleCtx := ctx.ParserRuleSpec(); parserRuleCtx != nil { + v.VisitParserRuleSpec(parserRuleCtx) + } + // Handle lexer rules + if lexerRuleCtx := ctx.LexerRuleSpec(); lexerRuleCtx != nil { + v.VisitLexerRuleSpec(lexerRuleCtx) + } + return nil +} + +// VisitParserRuleSpec visits a parser rule specification +func (v *GrammarExtractorVisitor) VisitParserRuleSpec(ctx grammar.IParserRuleSpecContext) interface{} { + // Get rule name + ruleNameToken := ctx.RULE_REF() + if ruleNameToken == nil { + return nil + } + ruleName := ruleNameToken.GetText() + + // Get rule block (alternatives) + ruleBlockCtx := ctx.RuleBlock() + if ruleBlockCtx == nil { + return nil + } + + // Extract alternatives + alternatives := v.extractAlternatives(ruleBlockCtx) + + // Create rule + rule := &Rule{ + Name: ruleName, + IsLexer: false, + Alternatives: alternatives, + } + + // Store rule + v.parserRules[ruleName] = rule + + return nil +} + +// VisitLexerRuleSpec visits a lexer rule specification +func (v *GrammarExtractorVisitor) VisitLexerRuleSpec(ctx grammar.ILexerRuleSpecContext) interface{} { + // Get rule name + ruleNameToken := ctx.TOKEN_REF() + if ruleNameToken == nil { + return nil + } + ruleName := ruleNameToken.GetText() + + // Get lexer rule block (alternatives) + lexerRuleBlockCtx := ctx.LexerRuleBlock() + if lexerRuleBlockCtx == nil { + return nil + } + + // Extract alternatives from lexer rule block + alternatives := v.extractLexerAlternatives(lexerRuleBlockCtx) + + // Create rule + rule := &Rule{ + Name: ruleName, + IsLexer: true, + Alternatives: alternatives, + } + + // Store rule + v.lexerRules[ruleName] = rule + + return nil +} + +// extractAlternatives extracts alternatives from a rule block +func (v *GrammarExtractorVisitor) extractAlternatives(ruleBlockCtx grammar.IRuleBlockContext) []Alternative { + var alternatives []Alternative + + // Get rule alternative list + ruleAltListCtx := ruleBlockCtx.RuleAltList() + if ruleAltListCtx == nil { + return alternatives + } + + // Process each labeled alternative + for _, labeledAltCtx := range ruleAltListCtx.AllLabeledAlt() { + alternative := v.extractAlternative(labeledAltCtx) + alternatives = append(alternatives, alternative) + } + + return alternatives +} + +// extractLexerAlternatives extracts alternatives from a lexer rule block +func (v *GrammarExtractorVisitor) extractLexerAlternatives(lexerRuleBlockCtx grammar.ILexerRuleBlockContext) []Alternative { + var alternatives []Alternative + + // Get lexer alternative list + lexerAltListCtx := lexerRuleBlockCtx.LexerAltList() + if lexerAltListCtx == nil { + return alternatives + } + + // Process each lexer alternative + for _, lexerAltCtx := range lexerAltListCtx.AllLexerAlt() { + alternative := v.extractLexerAlternative(lexerAltCtx) + alternatives = append(alternatives, alternative) + } + + return alternatives +} + +// 
extractLexerAlternative extracts a single lexer alternative +func (v *GrammarExtractorVisitor) extractLexerAlternative(lexerAltCtx grammar.ILexerAltContext) Alternative { + var elements []Element + + // Get lexer elements context + lexerElementsCtx := lexerAltCtx.LexerElements() + if lexerElementsCtx != nil { + // Process each lexer element + for _, lexerElementCtx := range lexerElementsCtx.AllLexerElement() { + element := v.extractLexerElement(lexerElementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + + return Alternative{ + Elements: elements, + } +} + +// extractAlternative extracts a single alternative +func (v *GrammarExtractorVisitor) extractAlternative(labeledAltCtx grammar.ILabeledAltContext) Alternative { + var elements []Element + + // Get alternative context + altCtx := labeledAltCtx.Alternative() + if altCtx != nil { + // Process each element in the alternative + for _, elementCtx := range altCtx.AllElement() { + element := v.extractElement(elementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + + return Alternative{ + Elements: elements, + } +} + +// extractElement extracts an element from an element context +func (v *GrammarExtractorVisitor) extractElement(elementCtx grammar.IElementContext) *Element { + // Handle labeled elements + if labeledElementCtx := elementCtx.LabeledElement(); labeledElementCtx != nil { + return v.extractLabeledElement(labeledElementCtx) + } + + // Handle atoms (terminals/non-terminals) + if atomCtx := elementCtx.Atom(); atomCtx != nil { + element := v.extractAtom(atomCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(elementCtx.EbnfSuffix()) + } + return element + } + + // Handle EBNF constructs (blocks with quantifiers) + if ebnfCtx := elementCtx.Ebnf(); ebnfCtx != nil { + return v.extractEbnf(ebnfCtx) + } + + return nil +} + +// extractLexerElement extracts a lexer element from a lexer element context +func (v *GrammarExtractorVisitor) extractLexerElement(lexerElementCtx grammar.ILexerElementContext) *Element { + // Handle lexer atoms (character ranges, terminals, etc.) + if lexerAtomCtx := lexerElementCtx.LexerAtom(); lexerAtomCtx != nil { + element := v.extractLexerAtom(lexerAtomCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(lexerElementCtx.EbnfSuffix()) + } + return element + } + + // Handle lexer blocks (grouped alternatives) + if lexerBlockCtx := lexerElementCtx.LexerBlock(); lexerBlockCtx != nil { + element := v.extractLexerBlock(lexerBlockCtx) + // Check for quantifiers + if element != nil { + element.Quantifier = v.extractQuantifier(lexerElementCtx.EbnfSuffix()) + } + return element + } + + // Handle action blocks (for now, just return nil as they don't generate text) + if lexerElementCtx.ActionBlock() != nil { + // Action blocks don't contribute to generated text, so we skip them + return nil + } + + return nil +} + +// extractLexerAtom extracts a lexer atom (character ranges, terminals, etc.) 
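+// The dispatch mirrors the alternatives of the ANTLR meta-grammar's lexerAtom
+// rule: terminal definitions, character ranges ('a'..'z'), negated sets
+// (~[abc]), LEXER_CHAR_SET tokens ([abc]), and the wildcard '.'.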
+func (v *GrammarExtractorVisitor) extractLexerAtom(lexerAtomCtx grammar.ILexerAtomContext) *Element { + // Handle terminal definition (string literal or token reference) + if terminalDefCtx := lexerAtomCtx.TerminalDef(); terminalDefCtx != nil { + return v.extractTerminalDef(terminalDefCtx) + } + + // Handle character range (e.g., [a-z]) + if characterRangeCtx := lexerAtomCtx.CharacterRange(); characterRangeCtx != nil { + return v.extractCharacterRange(characterRangeCtx) + } + + // Handle not set (e.g., ~[abc]) + if notSetCtx := lexerAtomCtx.NotSet(); notSetCtx != nil { + return v.extractNotSet(notSetCtx) + } + + // Handle lexer character set (e.g., [abc]) + if lexerCharSetToken := lexerAtomCtx.LEXER_CHAR_SET(); lexerCharSetToken != nil { + return &Element{ + Value: LiteralValue{Text: lexerCharSetToken.GetText()}, + } + } + + // Handle wildcard (.) + if wildcardCtx := lexerAtomCtx.Wildcard(); wildcardCtx != nil { + return &Element{ + Value: WildcardValue{}, + } + } + + return nil +} + +// extractLexerBlock extracts a lexer block (grouped alternatives) +func (v *GrammarExtractorVisitor) extractLexerBlock(lexerBlockCtx grammar.ILexerBlockContext) *Element { + // Get the lexer alternative list from the block + lexerAltListCtx := lexerBlockCtx.LexerAltList() + if lexerAltListCtx == nil { + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all lexer alternatives from the block + lexerAlts := lexerAltListCtx.AllLexerAlt() + if len(lexerAlts) == 0 { + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all alternatives + blockAlternatives := []Alternative{} + for _, lexerAltCtx := range lexerAlts { + elements := []Element{} + if lexerElementsCtx := lexerAltCtx.LexerElements(); lexerElementsCtx != nil { + for _, lexerElementCtx := range lexerElementsCtx.AllLexerElement() { + element := v.extractLexerElement(lexerElementCtx) + if element != nil { + elements = append(elements, *element) + } + } + } + blockAlternatives = append(blockAlternatives, Alternative{Elements: elements}) + } + + // Generate global unique block ID and store mapping + globalBlockID++ + blockID := fmt.Sprintf("lexer_block_%d_alts", globalBlockID) + v.blockAltMap[blockID] = blockAlternatives + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: blockAlternatives}, + } +} + +// extractCharacterRange extracts a character range (e.g., 'a'..'z') +func (v *GrammarExtractorVisitor) extractCharacterRange(characterRangeCtx grammar.ICharacterRangeContext) *Element { + // Get the start and end of the range + stringLiterals := characterRangeCtx.AllSTRING_LITERAL() + if len(stringLiterals) == 2 { + startChar := stringLiterals[0].GetText() + endChar := stringLiterals[1].GetText() + rangeText := fmt.Sprintf("%s..%s", startChar, endChar) + return &Element{ + Value: LiteralValue{Text: rangeText}, + } + } + return nil +} + +// extractNotSet extracts a not set (e.g., ~[abc]) +func (v *GrammarExtractorVisitor) extractNotSet(notSetCtx grammar.INotSetContext) *Element { + // For now, represent as a literal text + // In a real implementation, this would need more sophisticated handling + return &Element{ + Value: LiteralValue{Text: "~[...]"}, + } 
+} + +// extractLabeledElement extracts a labeled element (e.g., label=atom) +func (v *GrammarExtractorVisitor) extractLabeledElement(labeledElementCtx grammar.ILabeledElementContext) *Element { + // For now, just extract the atom part and ignore the label + if atomCtx := labeledElementCtx.Atom(); atomCtx != nil { + return v.extractAtom(atomCtx) + } + if blockCtx := labeledElementCtx.Block(); blockCtx != nil { + return v.extractBlock(blockCtx) + } + return nil +} + +// extractAtom extracts an atom (terminal or non-terminal) +func (v *GrammarExtractorVisitor) extractAtom(atomCtx grammar.IAtomContext) *Element { + // Handle terminal definition (string literal or token reference) + if terminalDefCtx := atomCtx.TerminalDef(); terminalDefCtx != nil { + return v.extractTerminalDef(terminalDefCtx) + } + + // Handle rule reference + if rulerefCtx := atomCtx.Ruleref(); rulerefCtx != nil { + return v.extractRuleRef(rulerefCtx) + } + + // Handle wildcard (.) + if wildcardCtx := atomCtx.Wildcard(); wildcardCtx != nil { + return &Element{ + Value: WildcardValue{}, + } + } + + // Handle not sets, ranges, etc. - for now just return nil + return nil +} + +// extractTerminalDef extracts a terminal definition (literal string or token reference) +func (v *GrammarExtractorVisitor) extractTerminalDef(terminalDefCtx grammar.ITerminalDefContext) *Element { + if stringLiteralToken := terminalDefCtx.STRING_LITERAL(); stringLiteralToken != nil { + return &Element{ + Value: LiteralValue{Text: stringLiteralToken.GetText()}, + } + } + if tokenRefToken := terminalDefCtx.TOKEN_REF(); tokenRefToken != nil { + return &Element{ + Value: ReferenceValue{Name: tokenRefToken.GetText()}, + } + } + return nil +} + + +// extractRuleRef extracts a rule reference +func (v *GrammarExtractorVisitor) extractRuleRef(rulerefCtx grammar.IRulerefContext) *Element { + if ruleRefToken := rulerefCtx.RULE_REF(); ruleRefToken != nil { + return &Element{ + Value: ReferenceValue{Name: ruleRefToken.GetText()}, + } + } + return nil +} + +// extractBlock extracts a block (grouped alternatives) +func (v *GrammarExtractorVisitor) extractBlock(blockCtx grammar.IBlockContext) *Element { + // Get the alternative list from the block + altListCtx := blockCtx.AltList() + if altListCtx == nil { + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all alternatives from the block + alts := altListCtx.AllAlternative() + if len(alts) == 0 { + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + emptyAlts := []Alternative{} + v.blockAltMap[blockID] = emptyAlts + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: emptyAlts}, + } + } + + // Extract all alternatives + blockAlternatives := []Alternative{} + for _, altCtx := range alts { + elements := []Element{} + for _, elementCtx := range altCtx.AllElement() { + element := v.extractElement(elementCtx) + if element != nil { + elements = append(elements, *element) + } + } + blockAlternatives = append(blockAlternatives, Alternative{Elements: elements}) + } + + // If it's a single element in a single alternative, we can simplify + if len(blockAlternatives) == 1 && len(blockAlternatives[0].Elements) == 1 { + return &blockAlternatives[0].Elements[0] + } + + // Generate global unique block ID and store mapping + globalBlockID++ + blockID := fmt.Sprintf("block_%d_alts", globalBlockID) + 
v.blockAltMap[blockID] = blockAlternatives + + return &Element{ + Value: BlockValue{ID: blockID, Alternatives: blockAlternatives}, + } +} + +// extractEbnf extracts EBNF constructs (blocks with suffixes) +func (v *GrammarExtractorVisitor) extractEbnf(ebnfCtx grammar.IEbnfContext) *Element { + // Get the block + blockCtx := ebnfCtx.Block() + if blockCtx == nil { + return nil + } + + element := v.extractBlock(blockCtx) + if element != nil { + // Apply quantifier from block suffix + if blockSuffixCtx := ebnfCtx.BlockSuffix(); blockSuffixCtx != nil { + if ebnfSuffixCtx := blockSuffixCtx.EbnfSuffix(); ebnfSuffixCtx != nil { + element.Quantifier = v.extractQuantifier(ebnfSuffixCtx) + } + } + } + + return element +} + +// extractQuantifier extracts quantifier from EBNF suffix +func (v *GrammarExtractorVisitor) extractQuantifier(ebnfSuffixCtx grammar.IEbnfSuffixContext) Quantifier { + if ebnfSuffixCtx == nil { + return NONE + } + + // Check for question mark (optional) + if ebnfSuffixCtx.QUESTION(0) != nil { + return OPTIONAL_Q + } + + // Check for star (zero or more) + if ebnfSuffixCtx.STAR() != nil { + return ZERO_MORE + } + + // Check for plus (one or more) + if ebnfSuffixCtx.PLUS() != nil { + return ONE_MORE + } + + return NONE +} \ No newline at end of file diff --git a/tools/fuzzing/internal/grammar/parser_test.go b/tools/fuzzing/internal/grammar/parser_test.go new file mode 100644 index 0000000..15cb127 --- /dev/null +++ b/tools/fuzzing/internal/grammar/parser_test.go @@ -0,0 +1,557 @@ +package grammar + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +// TestCompleteGrammarIR tests the complete intermediate representation of parsed grammar +func TestCompleteGrammarIR(t *testing.T) { + grammarContent := ` +parser grammar CompleteIRTest; + +// Simple rule with literals +greeting: 'Hello' 'World'; + +// Rule with alternatives +statement: selectStmt | insertStmt | 'DELETE'; + +// Rule with quantifiers and mixed elements +selectStmt: 'SELECT' columnList 'FROM' IDENTIFIER whereClause?; + +// Rule with quantified elements +columnList: column (',' column)*; + +// Rule with token reference +column: IDENTIFIER ('AS' IDENTIFIER)?; + +// Rule with optional and alternatives +whereClause: 'WHERE' expr; + +// Complex rule with multiple alternatives and quantifiers +expr: expr '+' expr + | expr '*' expr + | '(' expr ')' + | IDENTIFIER + | NUMBER; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if grammar.FilePath != tmpFile { + t.Errorf("Expected file path %s, got %s", tmpFile, grammar.FilePath) + } + if len(grammar.LexerRules) != 0 { + t.Errorf("Expected 0 lexer rules, got %d", len(grammar.LexerRules)) + } + if len(grammar.ParserRules) != 7 { + t.Errorf("Expected 7 parser rules, got %d", len(grammar.ParserRules)) + } + + // Test cases for rule validation + tests := []struct { + ruleName string + alternatives int + elements []elementTest + }{ + { + ruleName: "greeting", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'Hello'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "'World'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "statement", + alternatives: 3, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "selectStmt", quantifier: NONE, 
elementType: "reference"}, + {altIndex: 1, elementIndex: 0, value: "insertStmt", quantifier: NONE, elementType: "reference"}, + {altIndex: 2, elementIndex: 0, value: "'DELETE'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "selectStmt", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'SELECT'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "columnList", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 2, value: "'FROM'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 3, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 4, value: "whereClause", quantifier: OPTIONAL_Q, elementType: "reference"}, + }, + }, + { + ruleName: "columnList", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "column", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 1, value: "(',' column)", quantifier: ZERO_MORE, elementType: "block"}, + }, + }, + { + ruleName: "column", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 1, value: "('AS' IDENTIFIER)", quantifier: OPTIONAL_Q, elementType: "block"}, + }, + }, + { + ruleName: "whereClause", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'WHERE'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "expr", quantifier: NONE, elementType: "reference"}, + }, + }, + { + ruleName: "expr", + alternatives: 5, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 0, elementIndex: 1, value: "'+'", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 2, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 1, elementIndex: 1, value: "'*'", quantifier: NONE, elementType: "literal"}, + {altIndex: 2, elementIndex: 0, value: "'('", quantifier: NONE, elementType: "literal"}, + {altIndex: 2, elementIndex: 1, value: "expr", quantifier: NONE, elementType: "reference"}, + {altIndex: 2, elementIndex: 2, value: "')'", quantifier: NONE, elementType: "literal"}, + {altIndex: 3, elementIndex: 0, value: "IDENTIFIER", quantifier: NONE, elementType: "reference"}, + {altIndex: 4, elementIndex: 0, value: "NUMBER", quantifier: NONE, elementType: "reference"}, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.ruleName, func(t *testing.T) { + rule := grammar.GetRule(tc.ruleName) + if rule == nil { + t.Fatalf("rule %s not found", tc.ruleName) + } + if rule.Name != tc.ruleName || rule.IsLexer { + t.Errorf("rule %s has incorrect metadata", tc.ruleName) + } + if len(rule.Alternatives) != tc.alternatives { + t.Errorf("%s: expected %d alternatives, got %d", tc.ruleName, tc.alternatives, len(rule.Alternatives)) + } + + for _, elem := range tc.elements { + altIndex := elem.altIndex + elementIndex := elem.elementIndex + + if altIndex >= len(rule.Alternatives) { + t.Errorf("%s: alternative %d out of range", tc.ruleName, altIndex) + continue + } + + elements := rule.Alternatives[altIndex].Elements + if elementIndex >= len(elements) { + t.Errorf("%s alt %d: element %d out of range", tc.ruleName, altIndex, elementIndex) + continue + } + + element := elements[elementIndex] + if elem.value != "" && element.Value.String() != elem.value { + 
t.Errorf("%s alt %d elem %d: expected value %s, got %s", tc.ruleName, altIndex, elementIndex, elem.value, element.Value.String()) + } + if element.Quantifier != elem.quantifier { + t.Errorf("%s alt %d elem %d: expected quantifier %v, got %v", tc.ruleName, altIndex, elementIndex, elem.quantifier, element.Quantifier) + } + + // Validate element type using type assertions + switch elem.elementType { + case "literal": + if _, ok := element.Value.(LiteralValue); !ok { + t.Errorf("%s alt %d elem %d: expected LiteralValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "reference": + if _, ok := element.Value.(ReferenceValue); !ok { + t.Errorf("%s alt %d elem %d: expected ReferenceValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "block": + if _, ok := element.Value.(BlockValue); !ok { + t.Errorf("%s alt %d elem %d: expected BlockValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + } + } + }) + } + + // Test GetAllRules method + allRules := grammar.GetAllRules() + if len(allRules) != 7 { + t.Errorf("GetAllRules: expected 7 rules, got %d", len(allRules)) + } +} + +type elementTest struct { + altIndex int + elementIndex int + value string + quantifier Quantifier + elementType string // "literal", "reference", or "block" +} + +// Helper functions + +func createTempGrammarFile(t *testing.T, content string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, "test_grammar.g4") + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} + +func createTempGrammarFileWithName(t *testing.T, content string, filename string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, filename) + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} + +// TestLexerRuleParsing tests the parsing of lexer rules +func TestLexerRuleParsing(t *testing.T) { + grammarContent := ` +lexer grammar TestLexer; + +// Simple string literal +SELECT: 'SELECT'; + +// Character range +LETTER: [a-zA-Z]; + +// Complex rule with alternatives and quantifiers +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; + +// Rule with character set +DIGIT: [0-9]; + +// Rule with wildcard and quantifier +COMMENT: '//' .*? 
'\n'; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse lexer grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if len(grammar.ParserRules) != 0 { + t.Errorf("Expected 0 parser rules, got %d", len(grammar.ParserRules)) + } + if len(grammar.LexerRules) != 5 { + t.Errorf("Expected 5 lexer rules, got %d", len(grammar.LexerRules)) + } + + // Test cases for lexer rule validation + tests := []struct { + ruleName string + alternatives int + elements []elementTest + }{ + { + ruleName: "SELECT", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "'SELECT'", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "LETTER", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[a-zA-Z]", quantifier: NONE, elementType: "literal"}, + }, + }, + { + ruleName: "IDENTIFIER", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[a-zA-Z_]", quantifier: NONE, elementType: "literal"}, + {altIndex: 0, elementIndex: 1, value: "[a-zA-Z0-9_]", quantifier: ZERO_MORE, elementType: "literal"}, + }, + }, + { + ruleName: "DIGIT", + alternatives: 1, + elements: []elementTest{ + {altIndex: 0, elementIndex: 0, value: "[0-9]", quantifier: NONE, elementType: "literal"}, + }, + }, + } + + for _, tc := range tests { + t.Run(tc.ruleName, func(t *testing.T) { + rule := grammar.GetRule(tc.ruleName) + if rule == nil { + t.Fatalf("rule %s not found", tc.ruleName) + } + if rule.Name != tc.ruleName || !rule.IsLexer { + t.Errorf("rule %s has incorrect metadata: IsLexer=%v", tc.ruleName, rule.IsLexer) + } + if len(rule.Alternatives) != tc.alternatives { + t.Errorf("%s: expected %d alternatives, got %d", tc.ruleName, tc.alternatives, len(rule.Alternatives)) + } + + for _, elem := range tc.elements { + altIndex := elem.altIndex + elementIndex := elem.elementIndex + + if altIndex >= len(rule.Alternatives) { + t.Errorf("%s: alternative %d out of range", tc.ruleName, altIndex) + continue + } + + elements := rule.Alternatives[altIndex].Elements + if elementIndex >= len(elements) { + t.Errorf("%s alt %d: element %d out of range", tc.ruleName, altIndex, elementIndex) + continue + } + + element := elements[elementIndex] + if elem.value != "" && element.Value.String() != elem.value { + t.Errorf("%s alt %d elem %d: expected value %s, got %s", tc.ruleName, altIndex, elementIndex, elem.value, element.Value.String()) + } + if element.Quantifier != elem.quantifier { + t.Errorf("%s alt %d elem %d: expected quantifier %v, got %v", tc.ruleName, altIndex, elementIndex, elem.quantifier, element.Quantifier) + } + + // Validate element type using type assertions + switch elem.elementType { + case "literal": + if _, ok := element.Value.(LiteralValue); !ok { + t.Errorf("%s alt %d elem %d: expected LiteralValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "reference": + if _, ok := element.Value.(ReferenceValue); !ok { + t.Errorf("%s alt %d elem %d: expected ReferenceValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + case "block": + if _, ok := element.Value.(BlockValue); !ok { + t.Errorf("%s alt %d elem %d: expected BlockValue, got %T", tc.ruleName, altIndex, elementIndex, element.Value) + } + } + } + }) + } +} + +// TestCombinedGrammarParsing tests parsing of combined grammar with both parser and lexer rules 
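+// ANTLR decides the split by rule-name case: lowercase rules become parser rules,
+// UPPERCASE rules become lexer tokens, which is what the IsLexer assertions below verify.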
+func TestCombinedGrammarParsing(t *testing.T) { + grammarContent := ` +grammar CombinedTest; + +// Parser rules +statement: selectStmt; +selectStmt: 'SELECT' IDENTIFIER; + +// Lexer rules +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; +WS: [ \t\r\n]+ -> skip; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + grammar, err := ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse combined grammar: %v", err) + } + + // Basic grammar properties + if grammar == nil { + t.Fatal("Grammar is nil") + } + if len(grammar.ParserRules) != 2 { + t.Errorf("Expected 2 parser rules, got %d", len(grammar.ParserRules)) + } + if len(grammar.LexerRules) != 2 { + t.Errorf("Expected 2 lexer rules, got %d", len(grammar.LexerRules)) + } + + // Test parser rule + statement := grammar.GetRule("statement") + if statement == nil { + t.Fatal("Parser rule 'statement' not found") + } + if statement.IsLexer { + t.Error("Parser rule incorrectly marked as lexer rule") + } + + // Test lexer rule + identifier := grammar.GetRule("IDENTIFIER") + if identifier == nil { + t.Fatal("Lexer rule 'IDENTIFIER' not found") + } + if !identifier.IsLexer { + t.Error("Lexer rule incorrectly marked as parser rule") + } + + // Test that GetAllRules returns both types + allRules := grammar.GetAllRules() + if len(allRules) != 4 { + t.Errorf("Expected 4 total rules, got %d", len(allRules)) + } +} + +// TestGrammarMerging tests merging multiple grammar files +func TestGrammarMerging(t *testing.T) { + // Create first grammar file (parser rules) + parserGrammarContent := ` +parser grammar ParserTest; + +options { + tokenVocab = LexerTest; +} + +statement: selectStmt; +selectStmt: 'SELECT' IDENTIFIER; +` + + // Create second grammar file (lexer rules) + lexerGrammarContent := ` +lexer grammar LexerTest; + +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; +WS: [ \t\r\n]+ -> skip; +` + + // Create temporary files with unique names + tmpParserFile := createTempGrammarFileWithName(t, parserGrammarContent, "test_parser.g4") + defer os.Remove(tmpParserFile) + + tmpLexerFile := createTempGrammarFileWithName(t, lexerGrammarContent, "test_lexer.g4") + defer os.Remove(tmpLexerFile) + + // Test parsing and merging + filePaths := []string{tmpParserFile, tmpLexerFile} + mergedGrammar, err := ParseAndMergeGrammarFiles(filePaths) + if err != nil { + t.Fatalf("Failed to parse and merge grammar files: %v", err) + } + + // Verify merged grammar properties + if mergedGrammar == nil { + t.Fatal("Merged grammar is nil") + } + + if len(mergedGrammar.ParserRules) != 2 { + t.Errorf("Expected 2 parser rules, got %d", len(mergedGrammar.ParserRules)) + } + + if len(mergedGrammar.LexerRules) != 2 { + t.Errorf("Expected 2 lexer rules, got %d", len(mergedGrammar.LexerRules)) + } + + // Test that both parser and lexer rules are accessible + statement := mergedGrammar.GetRule("statement") + if statement == nil || statement.IsLexer { + t.Error("Parser rule 'statement' not found or incorrectly marked") + } + + identifier := mergedGrammar.GetRule("IDENTIFIER") + if identifier == nil || !identifier.IsLexer { + t.Error("Lexer rule 'IDENTIFIER' not found or incorrectly marked") + } + + // Test that merged path is updated + if !strings.Contains(mergedGrammar.FilePath, "+") { + t.Errorf("Expected merged file path to contain '+', got: %s", mergedGrammar.FilePath) + } + + // Test GetAllRules on merged grammar + allRules := mergedGrammar.GetAllRules() + if len(allRules) != 4 { + t.Errorf("Expected 4 total rules in merged grammar, got %d", len(allRules)) + 
} +} + +// TestGrammarMergingWithConflicts tests handling of duplicate rule names +func TestGrammarMergingWithConflicts(t *testing.T) { + // Create two grammars with conflicting rule names + grammar1Content := ` +lexer grammar Test1; +IDENTIFIER: [a-z]+; +` + + grammar2Content := ` +lexer grammar Test2; +IDENTIFIER: [A-Z]+; // Conflict with first grammar +` + + tmpFile1 := createTempGrammarFileWithName(t, grammar1Content, "conflict1.g4") + defer os.Remove(tmpFile1) + + tmpFile2 := createTempGrammarFileWithName(t, grammar2Content, "conflict2.g4") + defer os.Remove(tmpFile2) + + // Test that merging fails with duplicate rule names + filePaths := []string{tmpFile1, tmpFile2} + _, err := ParseAndMergeGrammarFiles(filePaths) + if err == nil { + t.Error("Expected error when merging grammars with duplicate rule names") + } + + if !strings.Contains(err.Error(), "duplicate") { + t.Errorf("Expected error about duplicate rules, got: %v", err) + } +} + +// TestParseAndMergeGrammarFilesEdgeCases tests edge cases +func TestParseAndMergeGrammarFilesEdgeCases(t *testing.T) { + // Test with empty file list + _, err := ParseAndMergeGrammarFiles([]string{}) + if err == nil { + t.Error("Expected error with empty file list") + } + + // Test with single file + grammarContent := ` +lexer grammar SingleTest; +TOKEN: 'test'; +` + + tmpFile := createTempGrammarFileWithName(t, grammarContent, "single.g4") + defer os.Remove(tmpFile) + + grammar, err := ParseAndMergeGrammarFiles([]string{tmpFile}) + if err != nil { + t.Fatalf("Failed to parse single grammar file: %v", err) + } + + if len(grammar.LexerRules) != 1 { + t.Errorf("Expected 1 lexer rule, got %d", len(grammar.LexerRules)) + } + + if grammar.GetRule("TOKEN") == nil { + t.Error("TOKEN rule not found in single file grammar") + } +} \ No newline at end of file diff --git a/tools/fuzzing/internal/lexer/token_generator.go b/tools/fuzzing/internal/lexer/token_generator.go new file mode 100644 index 0000000..c201152 --- /dev/null +++ b/tools/fuzzing/internal/lexer/token_generator.go @@ -0,0 +1,351 @@ +package lexer + +import ( + "fmt" + "math/rand" + "regexp" + "strings" + + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" +) + +// TokenGenerator generates tokens from lexer rules +type TokenGenerator struct { + random *rand.Rand + config *TokenGeneratorConfig +} + +// TokenGeneratorConfig controls token generation behavior +type TokenGeneratorConfig struct { + // MaxQuantifierCount limits how many times quantified elements repeat + MaxQuantifierCount int + // MinQuantifierCount sets minimum repetitions for + quantifiers + MinQuantifierCount int + // OptionalProbability controls likelihood of including optional elements (0.0-1.0) + OptionalProbability float64 + // MaxDepth limits recursion depth to prevent infinite loops + MaxDepth int +} + +// NewTokenGenerator creates a new token generator +func NewTokenGenerator(seed int64, config *TokenGeneratorConfig) *TokenGenerator { + if config == nil { + config = &TokenGeneratorConfig{ + MaxQuantifierCount: 5, + MinQuantifierCount: 1, + OptionalProbability: 0.7, + MaxDepth: 10, + } + } + return &TokenGenerator{ + random: rand.New(rand.NewSource(seed)), + config: config, + } +} + +// GenerateToken generates a token string from a lexer rule +func (g *TokenGenerator) GenerateToken(rule *grammar.Rule) (string, error) { + if !rule.IsLexer { + return "", fmt.Errorf("rule %s is not a lexer rule", rule.Name) + } + + if len(rule.Alternatives) == 0 { + return "", fmt.Errorf("rule %s has no alternatives", rule.Name) + } + + 
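+ // Selection below is uniform across alternatives; per-rule weighting, if
+ // ever wanted, would be layered in here.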
// Select a random alternative
+ altIndex := g.random.Intn(len(rule.Alternatives))
+ alternative := rule.Alternatives[altIndex]
+
+ // Generate from the selected alternative
+ return g.generateFromAlternative(&alternative, 0)
+}
+
+// generateFromAlternative generates text from a lexer rule alternative
+func (g *TokenGenerator) generateFromAlternative(alt *grammar.Alternative, depth int) (string, error) {
+ if depth > g.config.MaxDepth {
+ return "", fmt.Errorf("maximum depth exceeded")
+ }
+
+ var result strings.Builder
+ for _, element := range alt.Elements {
+ text, err := g.generateFromElement(&element, depth+1)
+ if err != nil {
+ return "", err
+ }
+ result.WriteString(text)
+ }
+ return result.String(), nil
+}
+
+// generateFromElement generates text from a single lexer element
+func (g *TokenGenerator) generateFromElement(element *grammar.Element, depth int) (string, error) {
+ if depth > g.config.MaxDepth {
+ return "", fmt.Errorf("maximum depth exceeded")
+ }
+
+ // Handle quantifiers
+ switch element.Quantifier {
+ case grammar.OPTIONAL_Q: // ?
+ if g.random.Float64() > g.config.OptionalProbability {
+ return "", nil // Skip optional element
+ }
+ return g.generateElementContent(element, depth)
+
+ case grammar.ZERO_MORE: // *
+ count := g.random.Intn(g.config.MaxQuantifierCount + 1) // 0 to MaxQuantifierCount
+ return g.generateRepeated(element, count, depth)
+
+ case grammar.ONE_MORE: // +
+ count := g.config.MinQuantifierCount + g.random.Intn(g.config.MaxQuantifierCount-g.config.MinQuantifierCount+1) // MinQuantifierCount to MaxQuantifierCount inclusive
+ return g.generateRepeated(element, count, depth)
+
+ default: // NONE
+ return g.generateElementContent(element, depth)
+ }
+}
+
+// generateRepeated generates repeated content for quantified elements
+func (g *TokenGenerator) generateRepeated(element *grammar.Element, count int, depth int) (string, error) {
+ var result strings.Builder
+ for i := 0; i < count; i++ {
+ text, err := g.generateElementContent(element, depth)
+ if err != nil {
+ return "", err
+ }
+ result.WriteString(text)
+ }
+ return result.String(), nil
+}
+
+// generateElementContent generates the actual content for an element
+func (g *TokenGenerator) generateElementContent(element *grammar.Element, depth int) (string, error) {
+ switch value := element.Value.(type) {
+ case grammar.LiteralValue:
+ return g.generateFromLiteral(value)
+ case grammar.BlockValue:
+ return g.generateFromBlock(value, depth)
+ case grammar.WildcardValue:
+ return g.generateFromWildcard()
+ case grammar.ReferenceValue:
+ // For lexer rules, this typically shouldn't happen unless it's a fragment reference
+ // For now, return the reference name as placeholder
+ return fmt.Sprintf("<%s>", value.Name), nil
+ default:
+ return "", fmt.Errorf("unsupported element value type: %T", value)
+ }
+}
+
+// generateFromLiteral generates text from a literal value
+func (g *TokenGenerator) generateFromLiteral(literal grammar.LiteralValue) (string, error) {
+ text := literal.Text
+
+ // Handle string literals - remove quotes
+ if len(text) >= 2 && text[0] == '\'' && text[len(text)-1] == '\'' {
+ return text[1 : len(text)-1], nil
+ }
+
+ // Handle negated sets like ~[...] FIRST (before checking for ..)
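+ // (a negated set can itself contain '-' ranges and quoted characters, so the
+ // full ~[...] shape must be matched before the generic [...] and '..' checks below)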
+ if strings.HasPrefix(text, "~[") && strings.HasSuffix(text, "]") { + return g.generateFromNegatedSet(text) + } + + // Handle character sets like [a-zA-Z] + if len(text) >= 2 && text[0] == '[' && text[len(text)-1] == ']' { + return g.generateFromCharacterSet(text[1 : len(text)-1]) + } + + // Handle character ranges like 'a'..'z' + if strings.Contains(text, "..") { + return g.generateFromCharacterRange(text) + } + + // Default: return the literal as-is + return text, nil +} + +// generateFromCharacterSet generates a character from a character set like [a-zA-Z0-9_] +func (g *TokenGenerator) generateFromCharacterSet(charset string) (string, error) { + chars, err := g.expandCharacterSet(charset) + if err != nil { + return "", err + } + if len(chars) == 0 { + return "", fmt.Errorf("empty character set") + } + + // Select a random character from the set + index := g.random.Intn(len(chars)) + return string(chars[index]), nil +} + +// expandCharacterSet expands a character set specification into actual characters +func (g *TokenGenerator) expandCharacterSet(charset string) ([]rune, error) { + var chars []rune + i := 0 + + for i < len(charset) { + // Handle escape sequences + if i < len(charset) && charset[i] == '\\' && i+1 < len(charset) { + switch charset[i+1] { + case 'r': + chars = append(chars, '\r') + case 'n': + chars = append(chars, '\n') + case 't': + chars = append(chars, '\t') + case '\\': + chars = append(chars, '\\') + case '"': + chars = append(chars, '"') + case '\'': + chars = append(chars, '\'') + default: + // For unknown escapes, use the escaped character literally + chars = append(chars, rune(charset[i+1])) + } + i += 2 + } else if i+2 < len(charset) && charset[i+1] == '-' && charset[i+2] != '\\' { + // Handle range like a-z (but not when second char is an escape) + start := rune(charset[i]) + end := rune(charset[i+2]) + + if start > end { + return nil, fmt.Errorf("invalid character range: %c-%c", start, end) + } + + for c := start; c <= end; c++ { + chars = append(chars, c) + } + i += 3 + } else { + // Handle single character + chars = append(chars, rune(charset[i])) + i++ + } + } + + return chars, nil +} + +// generateFromCharacterRange generates from a character range like 'a'..'z' +func (g *TokenGenerator) generateFromCharacterRange(rangeText string) (string, error) { + // Extract start and end characters from 'a'..'z' format + parts := strings.Split(rangeText, "..") + if len(parts) != 2 { + return "", fmt.Errorf("invalid character range format: %s", rangeText) + } + + start := strings.Trim(parts[0], "'\"") + end := strings.Trim(parts[1], "'\"") + + if len(start) != 1 || len(end) != 1 { + return "", fmt.Errorf("character range must be single characters: %s", rangeText) + } + + startChar := rune(start[0]) + endChar := rune(end[0]) + + if startChar > endChar { + return "", fmt.Errorf("invalid character range: %c > %c", startChar, endChar) + } + + // Generate random character in range + rangeSize := int(endChar - startChar + 1) + offset := g.random.Intn(rangeSize) + result := startChar + rune(offset) + + return string(result), nil +} + +// generateFromBlock generates text from a block value +func (g *TokenGenerator) generateFromBlock(block grammar.BlockValue, depth int) (string, error) { + if len(block.Alternatives) == 0 { + return "", nil + } + + // Select a random alternative from the block + altIndex := g.random.Intn(len(block.Alternatives)) + alternative := &block.Alternatives[altIndex] + + return g.generateFromAlternative(alternative, depth) +} + +// generateFromWildcard 
generates a character for wildcard (.) +func (g *TokenGenerator) generateFromWildcard() (string, error) { + // Generate a random printable ASCII character + // Range: 32-126 (space to tilde) + char := rune(32 + g.random.Intn(95)) + return string(char), nil +} + +// generateFromNegatedSet generates a character NOT in the specified set +func (g *TokenGenerator) generateFromNegatedSet(negatedSet string) (string, error) { + // Extract the character set from ~[...] format + if len(negatedSet) < 4 || !strings.HasPrefix(negatedSet, "~[") || !strings.HasSuffix(negatedSet, "]") { + return "", fmt.Errorf("invalid negated set format: %s", negatedSet) + } + + charset := negatedSet[2 : len(negatedSet)-1] // Remove ~[ and ] + + // Expand the excluded character set + excludedChars, err := g.expandCharacterSet(charset) + if err != nil { + return "", fmt.Errorf("failed to expand excluded character set: %v", err) + } + + // Create a map for quick lookup + excluded := make(map[rune]bool) + for _, c := range excludedChars { + excluded[c] = true + } + + // Generate a character that's not in the excluded set + // Try common printable ASCII characters first + candidates := []rune{} + + // Add letters + for c := 'a'; c <= 'z'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + for c := 'A'; c <= 'Z'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + // Add digits + for c := '0'; c <= '9'; c++ { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + // Add some special characters + specialChars := []rune{' ', '!', '#', '$', '%', '&', '*', '+', '/', '=', '?', '@', '^', '_', '`', '|', '~'} + for _, c := range specialChars { + if !excluded[c] { + candidates = append(candidates, c) + } + } + + if len(candidates) == 0 { + return "", fmt.Errorf("no valid characters available (all excluded)") + } + + // Select a random candidate + index := g.random.Intn(len(candidates)) + return string(candidates[index]), nil +} + +// ValidateCharacterSet validates if a character set specification is valid +func ValidateCharacterSet(charset string) error { + // Use regex to validate basic character set patterns + validPattern := regexp.MustCompile(`^[a-zA-Z0-9_\-\[\]\\^]+$`) + if !validPattern.MatchString(charset) { + return fmt.Errorf("invalid characters in character set: %s", charset) + } + return nil +} \ No newline at end of file diff --git a/tools/fuzzing/internal/lexer/token_generator_test.go b/tools/fuzzing/internal/lexer/token_generator_test.go new file mode 100644 index 0000000..c944268 --- /dev/null +++ b/tools/fuzzing/internal/lexer/token_generator_test.go @@ -0,0 +1,344 @@ +package lexer + +import ( + "os" + "path/filepath" + "regexp" + "strings" + "testing" + + "github.com/bytebase/parser/tools/fuzzing/internal/grammar" +) + +// TestTokenGeneratorBasic tests basic token generation functionality +func TestTokenGeneratorBasic(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 3, + MinQuantifierCount: 1, + OptionalProbability: 1.0, // Always include optional elements for testing + MaxDepth: 5, + } + generator := NewTokenGenerator(12345, config) + + tests := []struct { + ruleName string + grammarText string + validator func(string) bool + description string + }{ + { + ruleName: "SELECT", + grammarText: "SELECT: 'SELECT';", + validator: func(s string) bool { return s == "SELECT" }, + description: "simple string literal", + }, + { + ruleName: "LETTER", + grammarText: "LETTER: [a-z];", + validator: func(s string) bool { return len(s) == 1 && s[0] >= 
'a' && s[0] <= 'z' }, + description: "single character range", + }, + { + ruleName: "DIGIT", + grammarText: "DIGIT: [0-9];", + validator: func(s string) bool { return len(s) == 1 && s[0] >= '0' && s[0] <= '9' }, + description: "digit character range", + }, + { + ruleName: "IDENTIFIER", + grammarText: "IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*;", + validator: func(s string) bool { + if len(s) == 0 { + return false + } + // First character must be letter or underscore + first := s[0] + if !((first >= 'a' && first <= 'z') || (first >= 'A' && first <= 'Z') || first == '_') { + return false + } + // Rest must be letters, digits, or underscore + for _, c := range s[1:] { + if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { + return false + } + } + return true + }, + description: "identifier with quantifier", + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + // Create a temporary grammar file + grammarContent := "lexer grammar Test;\n\n" + tt.grammarText + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + // Parse the grammar + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Get the rule + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + } + + // Generate multiple tokens to test consistency + for i := 0; i < 10; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token: %v", err) + continue + } + + if !tt.validator(token) { + t.Errorf("Generated token '%s' does not match expected pattern for %s", token, tt.description) + } + } + }) + } +} + +// TestQuantifierHandling tests EBNF quantifier handling +func TestQuantifierHandling(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 5, + MinQuantifierCount: 2, + OptionalProbability: 0.5, + MaxDepth: 5, + } + generator := NewTokenGenerator(54321, config) + + tests := []struct { + ruleName string + grammarText string + validator func(string) bool + description string + }{ + { + ruleName: "OPTIONAL", + grammarText: "OPTIONAL: 'A' 'B'?;", + validator: func(s string) bool { + return s == "A" || s == "AB" + }, + description: "optional element with ?", + }, + { + ruleName: "ZERO_MORE", + grammarText: "ZERO_MORE: 'X' 'Y'*;", + validator: func(s string) bool { + if !strings.HasPrefix(s, "X") { + return false + } + rest := s[1:] + for _, c := range rest { + if c != 'Y' { + return false + } + } + return true + }, + description: "zero or more with *", + }, + { + ruleName: "ONE_MORE", + grammarText: "ONE_MORE: 'Z' 'W'+;", + validator: func(s string) bool { + if !strings.HasPrefix(s, "Z") { + return false + } + rest := s[1:] + if len(rest) == 0 { + return false // + requires at least one + } + for _, c := range rest { + if c != 'W' { + return false + } + } + return true + }, + description: "one or more with +", + }, + } + + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + // Create a temporary grammar file + grammarContent := "lexer grammar Test;\n\n" + tt.grammarText + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + // Parse the grammar + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse grammar: %v", err) + } + + // Get the rule + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + 
} + + // Generate multiple tokens to test quantifier behavior + validCount := 0 + for i := 0; i < 20; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token: %v", err) + continue + } + + if tt.validator(token) { + validCount++ + } else { + t.Logf("Generated token '%s' for %s (validation failed but continuing)", token, tt.description) + } + } + + // At least 50% of generated tokens should be valid + if validCount < 10 { + t.Errorf("Too few valid tokens generated (%d/20) for %s", validCount, tt.description) + } + }) + } +} + +// TestCharacterSetExpansion tests character set expansion functionality +func TestCharacterSetExpansion(t *testing.T) { + generator := NewTokenGenerator(9999, nil) + + tests := []struct { + charset string + expected []rune + }{ + {"abc", []rune{'a', 'b', 'c'}}, + {"a-c", []rune{'a', 'b', 'c'}}, + {"0-2", []rune{'0', '1', '2'}}, + {"a-cX", []rune{'a', 'b', 'c', 'X'}}, + {"A-Z_", append(makeRange('A', 'Z'), '_')}, + } + + for _, tt := range tests { + t.Run(tt.charset, func(t *testing.T) { + result, err := generator.expandCharacterSet(tt.charset) + if err != nil { + t.Fatalf("Failed to expand character set '%s': %v", tt.charset, err) + } + + if len(result) != len(tt.expected) { + t.Errorf("Expected %d characters, got %d", len(tt.expected), len(result)) + return + } + + for i, expected := range tt.expected { + if result[i] != expected { + t.Errorf("At position %d: expected '%c', got '%c'", i, expected, result[i]) + } + } + }) + } +} + +// TestComplexLexerRules tests complex lexer rules with multiple elements +func TestComplexLexerRules(t *testing.T) { + config := &TokenGeneratorConfig{ + MaxQuantifierCount: 3, + MinQuantifierCount: 1, + OptionalProbability: 0.8, + MaxDepth: 10, + } + generator := NewTokenGenerator(11111, config) + + grammarContent := ` +lexer grammar ComplexTest; + +// Complex identifier rule +IDENTIFIER: [a-zA-Z_] [a-zA-Z0-9_]*; + +// Number with optional decimal part +NUMBER: [0-9]+ ('.' 
[0-9]+)?; + +// String with escaped quotes +STRING: '"' (~'"')* '"'; + +// Comment line +COMMENT: '//' (~[\r\n])*; +` + + tmpFile := createTempGrammarFile(t, grammarContent) + defer os.Remove(tmpFile) + + parsedGrammar, err := grammar.ParseGrammarFile(tmpFile) + if err != nil { + t.Fatalf("Failed to parse complex grammar: %v", err) + } + + tests := []struct { + ruleName string + pattern string + }{ + {"IDENTIFIER", `^[a-zA-Z_][a-zA-Z0-9_]*$`}, + {"NUMBER", `^[0-9]+(\.[0-9]+)?$`}, + {"STRING", `^"[^"]*"$`}, + {"COMMENT", `^//.*$`}, + } + + for _, tt := range tests { + t.Run(tt.ruleName, func(t *testing.T) { + rule := parsedGrammar.GetRule(tt.ruleName) + if rule == nil { + t.Fatalf("Rule %s not found", tt.ruleName) + } + + regex := regexp.MustCompile(tt.pattern) + validCount := 0 + + for i := 0; i < 10; i++ { + token, err := generator.GenerateToken(rule) + if err != nil { + t.Errorf("Failed to generate token for %s: %v", tt.ruleName, err) + continue + } + + t.Logf("Generated token for %s: '%s'", tt.ruleName, token) + + if regex.MatchString(token) { + validCount++ + } + } + + // Expect at least some valid tokens + if validCount == 0 { + t.Errorf("No valid tokens generated for %s", tt.ruleName) + } + }) + } +} + +// Helper functions + +func createTempGrammarFile(t *testing.T, content string) string { + tmpDir := os.TempDir() + tmpFile := filepath.Join(tmpDir, "test_lexer.g4") + + err := os.WriteFile(tmpFile, []byte(content), 0644) + if err != nil { + t.Fatalf("Failed to create temp grammar file: %v", err) + } + + return tmpFile +} + +func makeRange(start, end rune) []rune { + var result []rune + for c := start; c <= end; c++ { + result = append(result, c) + } + return result +} \ No newline at end of file diff --git a/tools/fuzzing/tests/postgresql_test.go b/tools/fuzzing/tests/postgresql_test.go new file mode 100644 index 0000000..fa067a4 --- /dev/null +++ b/tools/fuzzing/tests/postgresql_test.go @@ -0,0 +1,192 @@ +package tests + +import ( + "fmt" + "path/filepath" + "runtime" + "testing" + + "github.com/bytebase/parser/tools/fuzzing/internal/config" + "github.com/bytebase/parser/tools/fuzzing/internal/generator" +) + +// getRepoRoot finds the repository root directory +func getRepoRoot() string { + _, filename, _, _ := runtime.Caller(0) + // Go up from tools/fuzzing/tests to the repo root + return filepath.Join(filepath.Dir(filename), "..", "..", "..") +} + +func TestPostgreSQLSelectStmt(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + tests := []struct { + name string + startRule string + count int + maxDepth int + optionalProb float64 + seed int64 + }{ + { + name: "Simple SELECT statements", + startRule: "selectstmt", + count: 3, + maxDepth: 5, + optionalProb: 0.7, + seed: 42, + }, + { + name: "Deep SELECT statements", + startRule: "selectstmt", + count: 2, + maxDepth: 8, + optionalProb: 0.5, + seed: 123, + }, + { + name: "Minimal SELECT statements", + startRule: "selectstmt", + count: 5, + maxDepth: 3, + optionalProb: 0.3, + seed: 456, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: tt.startRule, + Count: tt.count, + MaxDepth: tt.maxDepth, + OptionalProb: tt.optionalProb, + MaxQuantifier: 3, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: tt.seed, + 
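+ // A fixed Seed keeps the generated SQL identical across runs of this test.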
} + + fmt.Printf("\n=== %s ===\n", tt.name) + fmt.Printf("Config: MaxDepth=%d, OptionalProb=%.1f, Count=%d, Seed=%d\n", + tt.maxDepth, tt.optionalProb, tt.count, tt.seed) + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL %s: %v", tt.startRule, err) + } else { + t.Logf("Successfully generated %d PostgreSQL %s statements", tt.count, tt.startRule) + } + }) + } +} + +func TestPostgreSQLExpressions(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "a_expr", // PostgreSQL expression rule + Count: 5, + MaxDepth: 4, + OptionalProb: 0.6, + MaxQuantifier: 2, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: 789, + } + + fmt.Printf("\n=== PostgreSQL Expressions ===\n") + fmt.Printf("Generating %d expressions with max depth %d\n", cfg.Count, cfg.MaxDepth) + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL expressions: %v", err) + } else { + t.Logf("Successfully generated %d PostgreSQL expressions", cfg.Count) + } +} + +func TestPostgreSQLVerboseOutput(t *testing.T) { + repoRoot := getRepoRoot() + + // PostgreSQL grammar file paths + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "selectstmt", + Count: 2, + MaxDepth: 4, + OptionalProb: 0.8, + MaxQuantifier: 2, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.VerboseOutput, // Show rule traversal + Seed: 999, + } + + fmt.Printf("\n=== PostgreSQL Verbose Output ===\n") + fmt.Printf("Generating with verbose output to show rule traversal\n") + fmt.Println() + + gen := generator.New(cfg) + err := gen.Generate() + + if err != nil { + t.Errorf("Failed to generate PostgreSQL statements with verbose output: %v", err) + } else { + t.Logf("Successfully generated PostgreSQL statements with verbose output") + } +} + +// Benchmark test for performance measurement +func BenchmarkPostgreSQLGeneration(b *testing.B) { + repoRoot := getRepoRoot() + + lexerPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLLexer.g4") + parserPath := filepath.Join(repoRoot, "postgresql", "PostgreSQLParser.g4") + + cfg := &config.Config{ + GrammarFiles: []string{lexerPath, parserPath}, + StartRule: "selectstmt", + Count: 1, + MaxDepth: 6, + OptionalProb: 0.5, + MaxQuantifier: 3, + MinQuantifier: 1, + QuantifierCount: 0, + OutputFormat: config.CompactOutput, + Seed: 42, + } + + gen := generator.New(cfg) + + // Reset the timer to exclude setup time + b.ResetTimer() + + for i := 0; i < b.N; i++ { + err := gen.Generate() + if err != nil { + b.Fatalf("Generation failed: %v", err) + } + } +} \ No newline at end of file diff --git a/tools/grammar/README.md b/tools/grammar/README.md deleted file mode 100644 index 2641250..0000000 --- a/tools/grammar/README.md +++ /dev/null @@ -1,36 +0,0 @@ -# ANTLR v4 Grammar Parser - -A Go implementation to parse ANTLR v4 grammar files (`.g4` files) in this repository. 
- -## Source - -The lexer and parser grammars come from: https://github.com/antlr/grammars-v4/blob/master/antlr/antlr4 - -## Why Custom NextToken()? - -We added `func (l *LexerAdaptor) NextToken() antlr.Token` in `lexer_adaptor.go` because: - -- ANTLR grammar parsing requires context-sensitive lexing -- Need to convert `ID` tokens to `TOKEN_REF` (uppercase) or `RULE_REF` (lowercase) -- Go ANTLR doesn't automatically call `Emit()` like Java ANTLR does -- Go tokens are immutable, so we use a `TokenTypeWrapper` to override token types - -## Why Sed Command in Makefile? - -We added this sed command in the Makefile: -```bash -sed -i '' 's/l\.BaseLexer = antlr\.NewBaseLexer(input)/l.LexerAdaptor = *NewLexerAdaptor(input)/' antlrv4_lexer.go -``` - -Because: -- ANTLR code generation creates `l.BaseLexer = antlr.NewBaseLexer(input)` -- We need `l.LexerAdaptor = *NewLexerAdaptor(input)` to use our custom lexer -- This automatically fixes the generated constructor after each regeneration - -## Usage - -```bash -make build # Generate parser and apply fixes -make test # Test all .g4 files in repository (should show 100% success) -make all # Build and test -``` \ No newline at end of file
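
Taken together, the grammar parser and token generator introduced above compose into a small pipeline: parse the `.g4` files, look up a lexer rule, and emit random tokens from it. The sketch below shows that flow end to end. It is a minimal illustration under stated assumptions, not part of the diff: the grammar paths and the `IDENTIFIER` rule name are placeholders, and the file is assumed to live under `tools/fuzzing/` so the `internal/` packages remain importable.

```go
package main

import (
	"fmt"
	"log"

	"github.com/bytebase/parser/tools/fuzzing/internal/grammar"
	"github.com/bytebase/parser/tools/fuzzing/internal/lexer"
)

func main() {
	// Merge a split lexer/parser grammar pair, as TestGrammarMerging does.
	g, err := grammar.ParseAndMergeGrammarFiles([]string{
		"postgresql/PostgreSQLLexer.g4",  // placeholder path
		"postgresql/PostgreSQLParser.g4", // placeholder path
	})
	if err != nil {
		log.Fatalf("parse grammars: %v", err)
	}

	// A nil config selects the defaults baked into NewTokenGenerator.
	gen := lexer.NewTokenGenerator(42, nil)

	rule := g.GetRule("IDENTIFIER") // placeholder lexer rule name
	if rule == nil {
		log.Fatal("lexer rule IDENTIFIER not found")
	}

	// The fixed seed above makes this token sequence reproducible.
	for i := 0; i < 5; i++ {
		token, err := gen.GenerateToken(rule)
		if err != nil {
			log.Fatalf("generate token: %v", err)
		}
		fmt.Println(token)
	}
}
```

Because both the seed and the config are explicit, the same call sequence can be replayed later, which is useful for regenerating or minimizing a fuzzing corpus.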