Skip to content

Commit cf489ca

Browse files
committed
Introduce analyze command to extract field names
When converting large numbers of Sigma rules, being able to check whether a query's results contain the fields that are searched for in the converted rule is integral to validating that the query will work correctly. This introduces a new analyze subcommand, `fields`, which is given a backend, any number of pipelines, and Sigma files, and returns a list of the unique fields that appear after the pipeline transformations are taken into account.
1 parent 6026358 commit cf489ca

File tree

3 files changed

+347
-2
lines changed

3 files changed

+347
-2
lines changed

sigma/analyze/fields.py

Lines changed: 205 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,205 @@
1+
"""Extract field names from Sigma rules."""
2+
from operator import add
3+
from typing import List, Set, Tuple
4+
from sigma.rule import SigmaRule, SigmaDetection, SigmaDetectionItem
5+
from sigma.collection import SigmaCollection
6+
from sigma.correlations import SigmaCorrelationRule
7+
from sigma.exceptions import SigmaError, SigmaPlaceholderError
8+
from sigma.modifiers import SigmaExpandModifier
9+
from sigma.types import SigmaString
10+
from sigma.processing.pipeline import ProcessingPipeline
11+
12+
13+
def get_fields(
    backend,
    rule: "SigmaRule | SigmaCorrelationRule",
    collect_errors: bool = True,
) -> "Tuple[List[str], List[SigmaError]]":
    """Extract field names from a Sigma rule.

    Args:
        backend: A Backend instance used to escape and quote field names
        rule: A SigmaRule or SigmaCorrelationRule to extract fields from
        collect_errors: Whether to collect errors. Defaults to True.

    Returns:
        Tuple[List[str], List[SigmaError]]: A list of fields and any errors found
    """
    fields: List[str] = []
    errors: List[SigmaError] = []

    def _identity(field: str) -> str:
        """Fallback escaper that returns the field unchanged."""
        return field

    # Resolve the backend's field escaper; fall back to the identity
    # function when the attribute is missing or not callable.
    escape_and_quote_field = getattr(backend, "escape_and_quote_field", None)
    if not callable(escape_and_quote_field):
        escape_and_quote_field = _identity

    if isinstance(rule, SigmaRule):
        if not rule.detection:
            return fields, errors

        # Iterate the detections dict directly so the resulting field
        # order is deterministic (insertion order) instead of the
        # arbitrary order produced by a frozenset of keys.
        for detection in rule.detection.detections.values():
            _fields, _errors = _get_fields_from_detection_items(
                backend,
                detection.detection_items,
                collect_errors,
            )
            fields.extend(_fields)
            errors.extend(_errors)

    elif isinstance(rule, SigmaCorrelationRule):
        # Correlation rules expose fields via group-by and alias mappings.
        if rule.group_by:
            fields.extend(escape_and_quote_field(field) for field in rule.group_by)

        if rule.aliases:
            # An alias name that already appears among the collected fields
            # is replaced by the concrete fields it maps to.
            aliases_to_remove = set()
            for field_alias in rule.aliases:
                esc_field_alias = escape_and_quote_field(field_alias.alias)
                if esc_field_alias in fields:
                    aliases_to_remove.add(esc_field_alias)
                fields.extend(
                    escape_and_quote_field(field)
                    for field in field_alias.mapping.values()
                )
            fields = [f for f in fields if f not in aliases_to_remove]

    return fields, errors
73+
74+
75+
def _get_fields_from_detection_items(
76+
backend,
77+
detection_items: List[SigmaDetectionItem | SigmaDetection],
78+
collect_errors: bool = True,
79+
) -> Tuple[List[str], List[SigmaError]]:
80+
"""Extract fields from detection items recursively.
81+
82+
Args:
83+
backend: A Backend instance used to escape and quote field names
84+
detection_items: A list of SigmaDetectionItem or SigmaDetection
85+
collect_errors: Whether to collect errors. Defaults to True.
86+
87+
Returns:
88+
Tuple[List[str], List[SigmaError]]: A list of fields and any errors found
89+
"""
90+
fields: List[str] = []
91+
errors: List[SigmaError] = []
92+
93+
def noop(field: str) -> str:
94+
"""A no-op function that returns the field as-is."""
95+
return field
96+
97+
escape_and_quote_field = getattr(backend, "escape_and_quote_field", lambda x: x)
98+
if not callable(escape_and_quote_field):
99+
escape_and_quote_field = noop
100+
101+
for di in detection_items:
102+
if isinstance(di, SigmaDetectionItem) and hasattr(di, "field") and di.field:
103+
if collect_errors:
104+
# Check for unexpanded placeholders
105+
has_placeholder_modifier = any(
106+
[
107+
is_sem
108+
for mod in di.modifiers
109+
if (is_sem := issubclass(mod, SigmaExpandModifier))
110+
]
111+
)
112+
has_placeholder_value = any(
113+
[
114+
is_placeholder
115+
for val in di.value
116+
if (
117+
is_placeholder := isinstance(val, SigmaString)
118+
and (
119+
hasattr(val, "contains_placeholder")
120+
and val.contains_placeholder()
121+
)
122+
)
123+
]
124+
)
125+
if all([has_placeholder_modifier, has_placeholder_value]):
126+
errors.append(
127+
SigmaPlaceholderError(
128+
"Cannot extract fields from Sigma rule with unexpanded placeholders."
129+
)
130+
)
131+
fields.append(escape_and_quote_field(di.field))
132+
elif isinstance(di, SigmaDetection):
133+
# Recursively extract fields from nested detections
134+
_fields, _errors = _get_fields_from_detection_items(
135+
backend, di.detection_items, collect_errors
136+
)
137+
fields.extend(_fields)
138+
errors.extend(_errors)
139+
140+
return fields, errors
141+
142+
143+
def extract_fields_from_collection(
    collection: "SigmaCollection",
    backend,
    collect_errors: bool = True,
) -> "Tuple[Set[str], List[SigmaError]]":
    """Extract all unique field names from a Sigma collection.

    Args:
        collection: A SigmaCollection to extract fields from
        backend: A Backend instance used to escape and quote field names
        collect_errors: Whether to collect errors. Defaults to True.

    Returns:
        Tuple[Set[str], List[SigmaError]]: A set of unique field names and any errors found
    """
    all_fields: Set[str] = set()
    all_errors: List[SigmaError] = []

    for rule in collection:
        # Prefer a pipeline already attached to the rule by a previous
        # conversion; otherwise assemble one from the backend's parts.
        last_processing_pipeline = getattr(rule, "last_processing_pipeline", None)
        if not last_processing_pipeline:
            # Falsy attribute values are normalized to a fresh, empty pipeline.
            backend_processing_pipeline = (
                getattr(backend, "backend_processing_pipeline", None)
                or ProcessingPipeline()
            )
            processing_pipeline = (
                getattr(backend, "processing_pipeline", None) or ProcessingPipeline()
            )
            output_format_processing_pipeline = (
                getattr(backend, "output_format_processing_pipeline", None) or None
            )

            # Per-format pipelines are stored as a dict keyed by format name.
            if isinstance(output_format_processing_pipeline, dict):
                output_format_processing_pipeline = (
                    output_format_processing_pipeline.get(
                        getattr(backend, "format", "default")
                    )
                )
            if output_format_processing_pipeline is None:
                output_format_processing_pipeline = ProcessingPipeline()

            # ProcessingPipeline defines concatenation via "+"; the order
            # mirrors the order pipelines are applied during conversion.
            last_processing_pipeline = backend_processing_pipeline + (
                processing_pipeline + output_format_processing_pipeline
            )

        # Best effort: if the pipeline fails to apply, fall back to
        # extracting fields from the unprocessed rule.
        try:
            rule = last_processing_pipeline.apply(rule)
        except Exception:
            pass

        # Extract fields from the rule
        fields, errors = get_fields(backend, rule, collect_errors)
        all_fields.update(fields)
        all_errors.extend(errors)

    return all_fields, all_errors
205+

sigma/cli/analyze.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,20 @@
11
import json
22
import pathlib
33
import click
4+
from sigma.processing.resolver import SigmaPipelineNotFoundError
45

6+
from sigma.cli.convert import pipeline_resolver
57
from sigma.cli.rules import check_rule_errors, load_rules
68
from sigma.analyze.attack import score_functions, calculate_attack_scores
9+
from sigma.analyze.fields import extract_fields_from_collection
710
from sigma.data.mitre_attack import (
811
mitre_attack_techniques_tactics_mapping,
912
mitre_attack_version,
1013
)
1114
from sigma.analyze.stats import create_logsourcestats, format_row
1215
from sigma.rule import SigmaLevel, SigmaStatus
16+
from sigma.plugins import InstalledSigmaPlugins
17+
from sigma.conversion.base import Backend
1318

1419

1520
@click.group(name="analyze", help="Analyze Sigma rule sets")
@@ -207,3 +212,102 @@ def analyze_logsource(
207212
print("-+-".join("-" * width for width in column_widths), file=output)
208213
for row in rows:
209214
print(format_row(row, column_widths), file=output)
215+
216+
217+
@analyze_group.command(
    name="fields",
    help="Extract field names from Sigma rules for a given target backend and processing pipeline(s).",
)
@click.option(
    "--file-pattern",
    "-P",
    default="*.yml",
    show_default=True,
    help="Pattern for file names to be included in recursion into directories.",
)
@click.option(
    "--target",
    "-t",
    type=str,
    required=True,
    help="Target backend to use for field name escaping and quoting.",
)
@click.option(
    "--pipeline",
    "-p",
    multiple=True,
    help="Specify processing pipelines as identifiers ("
    + click.style("sigma list pipelines", bold=True, fg="green")
    + ") or YAML files or directories",
)
@click.option(
    "--pipeline-check/--disable-pipeline-check",
    default=True,
    help="Verify if a pipeline is used that is intended for another backend.",
)
@click.argument(
    "input",
    nargs=-1,
    required=True,
    type=click.Path(exists=True, allow_dash=True, path_type=pathlib.Path),
)
def analyze_fields(file_pattern, target, pipeline, pipeline_check, input):
    """Extract field names from Sigma rule sets.

    This command extracts and outputs all unique field names present in the given
    Sigma rule collection, formatted for the specified target backend.
    """
    # Load plugins and get available backends
    plugins = InstalledSigmaPlugins.autodiscover()
    backends = plugins.backends

    if target not in backends:
        available_targets = ", ".join(sorted(backends.keys()))
        raise click.ClickException(
            f"Unknown target '{target}'. Available targets are: {available_targets}"
        )

    # Load and validate rules before doing any backend work.
    rules = load_rules(input, file_pattern)
    check_rule_errors(rules)

    # Resolve pipelines; pipeline_check=False disables the backend
    # compatibility verification.
    try:
        processing_pipeline = pipeline_resolver.resolve(
            pipeline, target if pipeline_check else None
        )
    except SigmaPipelineNotFoundError as e:
        # All literal fragments are joined explicitly with "+" — the
        # original mixed "+" with implicit adjacent-literal concatenation.
        raise click.UsageError(
            f"The pipeline '{e.spec}' was not found.\n"
            + "List all installed processing pipelines with: "
            + click.style(f"sigma list pipelines {target}", bold=True, fg="green")
            + "\n"
            + "List pipeline plugins for installation with: "
            + click.style(
                "sigma plugin list --plugin-type pipeline", bold=True, fg="green"
            )
            + "\n"
            + "Pipelines not listed here are treated as file names."
        ) from e

    # Initialize backend
    backend_class = backends[target]
    try:
        backend: Backend = backend_class(
            processing_pipeline=processing_pipeline,
            collect_errors=True,
        )
    except Exception as e:
        raise click.ClickException(
            f"Failed to initialize backend '{target}': {str(e)}"
        ) from e

    # Extract fields
    all_fields, errors = extract_fields_from_collection(rules, backend)

    # Report non-fatal extraction warnings on stderr so stdout stays
    # machine-readable.
    if errors:
        click.echo("Warnings during field extraction:", err=True)
        for error in errors:
            click.echo(f"* {error}", err=True)

    # Output the unique fields sorted, one per line.
    click.echo("\n".join(sorted(all_fields)))

tests/test_analyze.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import pytest
22
from click.testing import CliRunner
3-
from sigma.cli.analyze import analyze_group, analyze_attack, analyze_logsource
3+
from sigma.cli.analyze import analyze_group, analyze_attack, analyze_logsource, analyze_fields
44
from sigma.rule import (
55
SigmaRule,
66
SigmaLogSource,
@@ -208,4 +208,40 @@ def test_logsource_invalid_rule():
208208
cli = CliRunner()
209209
result = cli.invoke(analyze_logsource, ["-", "tests/files/sigma_rule_without_condition.yml"])
210210
assert result.exit_code != 0
211-
assert "at least one condition" in result.stdout
211+
assert "at least one condition" in result.stdout
212+
213+
214+
def test_fields_help():
    """The fields subcommand exposes a non-trivial --help text."""
    runner = CliRunner()
    outcome = runner.invoke(analyze_fields, ["--help"])
    assert outcome.exit_code == 0
    assert len(outcome.stdout.split()) > 8
219+
220+
221+
def test_fields_extract():
    """Extracting fields from a directory of valid rules yields output."""
    runner = CliRunner()
    outcome = runner.invoke(
        analyze_fields, ["-t", "text_query_test", "-", "tests/files/valid"]
    )
    assert outcome.exit_code == 0
    # Should have extracted at least some fields
    assert len(outcome.stdout.split()) > 0
227+
228+
229+
def test_fields_extract_correlation_rule():
    """Correlation rules also contribute extractable fields."""
    runner = CliRunner()
    outcome = runner.invoke(
        analyze_fields,
        ["-t", "text_query_test", "-", "tests/files/sigma_correlation_rules.yml"],
    )
    assert outcome.exit_code == 0
    assert len(outcome.stdout.split()) > 0
234+
235+
236+
def test_fields_extract_with_pipelines():
    """Field extraction works with file-based and named pipelines combined."""
    runner = CliRunner()
    outcome = runner.invoke(
        analyze_fields,
        [
            "-t",
            "text_query_test",
            "-p",
            "tests/files/custom_pipeline.yml",
            "-p",
            "dummy_test",
            "-",
            "tests/files/valid",
        ],
    )
    assert outcome.exit_code == 0
    assert len(outcome.stdout.split()) > 0
241+
242+
243+
def test_fields_invalid_rule():
    """An invalid rule aborts the command with a condition error."""
    runner = CliRunner()
    outcome = runner.invoke(
        analyze_fields,
        ["-t", "text_query_test", "-", "tests/files/sigma_rule_without_condition.yml"],
    )
    assert outcome.exit_code != 0
    assert "at least one condition" in outcome.stdout

0 commit comments

Comments
 (0)