curl
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 0 deletions b/‎pyproject.toml
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/curl_fuzzer_tools/corpus_to_yaml.py
Lines changed: 288 additions & 0 deletions b/‎src/curl_fuzzer_tools/corpus_to_yaml.py
Lines changed: 288 additions & 0 deletions
diff --git a/‎yaml/curl_fuzzer/test1.yaml
Lines changed: 11 additions & 0 deletions b/‎yaml/curl_fuzzer/test1.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/‎yaml/curl_fuzzer/test10.yaml
Lines changed: 15 additions & 0 deletions b/‎yaml/curl_fuzzer/test10.yaml
Lines changed: 15 additions & 0 deletions
diff --git a/‎yaml/curl_fuzzer/test100.yaml
Lines changed: 11 additions & 0 deletions b/‎yaml/curl_fuzzer/test100.yaml
Lines changed: 11 additions & 0 deletions
diff --git a/‎yaml/curl_fuzzer/test100_2.yaml
Lines changed: 31 additions & 0 deletions b/‎yaml/curl_fuzzer/test100_2.yaml
Lines changed: 31 additions & 0 deletions
@@ -29,6 +29,7 @@ read_corpus = "curl_fuzzer_tools.read_corpus:run"
 generate_corpus = "curl_fuzzer_tools.generate_corpus:run"
 corpus_to_pcap = "curl_fuzzer_tools.corpus_to_pcap:run"
 generate_matrix = "curl_fuzzer_tools.generate_matrix:run"
+corpus_to_yaml = "curl_fuzzer_tools.corpus_to_yaml:run"
 
 [build-system]
 requires = ["setuptools>=61.0"]
 
@@ -0,0 +1,288 @@
+#!/usr/bin/env python3
+"""Tool to read corpus files and convert them to YAML format."""
+
+import argparse
+import logging
+import re
+import yaml
+from pathlib import Path
+from typing import Dict, Any, List, Union
+
+from curl_fuzzer_tools import common_logging
+from curl_fuzzer_tools.corpus import TLVDecoder
+
+log = logging.getLogger(__name__)
+
+
+def extract_tlv_types_from_header(header_file_path: Path) -> Dict[int, str]:
+    """Extract TLV type definitions from curl_fuzzer.h header file."""
+    tlv_types = {}
+
+    if not header_file_path.exists():
+        log.warning(f"Header file {header_file_path} not found, using built-in types")
+        return {}
+
+    with open(header_file_path, "r") as f:
+        content = f.read()
+
+    # Pattern to match #define TLV_TYPE_NAME value
+    pattern = r'#define\s+TLV_TYPE_(\w+)\s+(\d+)'
+
+    for match in re.finditer(pattern, content):
+        type_name = match.group(1)
+        type_value = int(match.group(2))
+        tlv_types[type_value] = type_name
+
+    log.info(f"Extracted {len(tlv_types)} TLV type definitions")
+    return tlv_types
+
+
+def get_tlv_type_name(tlv_type: int, tlv_types: Dict[int, str]) -> str:
+    """Get the human-readable name for a TLV type."""
+    return tlv_types.get(tlv_type, f"UNKNOWN_{tlv_type}")
+
+
+def format_value_for_yaml(data: bytes, verbose: bool = False) -> Union[str, int, Dict[str, Any]]:
+    """Format TLV value appropriately for YAML output."""
+    if len(data) == 0:
+        return ""
+
+    # Try to decode as UTF-8 string
+    try:
+        decoded = data.decode('utf-8')
+        # Check if it's printable ASCII/UTF-8
+        if decoded.isprintable():
+            return decoded
+    except UnicodeDecodeError:
+        pass
+
+    # Check if it's a 4-byte integer (common in TLVs)
+    if len(data) == 4:
+        # Try big-endian first (network byte order)
+        import struct
+        try:
+            value = struct.unpack('!I', data)[0]
+            # Return both representations for clarity if verbose
+            if verbose:
+                return {
+                    'integer': value,
+                    'hex': data.hex()
+                }
+            else:
+                return value
+        except struct.error:
+            pass
+
+    # For non-verbose mode, just return hex for binary data
+    if not verbose:
+        return data.hex()
+
+    # Check if it's binary data that might be partially printable
+    try:
+        decoded = data.decode('utf-8', errors='replace')
+        if any(c.isprintable() or c.isspace() for c in decoded):
+            return {
+                'hex': data.hex(),
+                'partial_text': decoded,
+                'note': 'Binary data with some printable characters'
+            }
+    except:
+        pass
+
+    # For pure binary data, represent as hex
+    return {
+        'hex': data.hex(),
+        'note': 'Binary data'
+    }
+
+
+def corpus_to_yaml(corpus_file: Path, tlv_types: Dict[int, str], verbose: bool = False) -> Dict[str, Any]:
+    """Convert a corpus file to a YAML-ready dictionary."""
+    result = {
+        'corpus_file': str(corpus_file),
+        'tlvs': []
+    }
+
+    with open(corpus_file, "rb") as f:
+        data = f.read()
+
+    result['file_size'] = len(data)
+
+    try:
+        decoder = TLVDecoder(data)
+        for tlv in decoder:
+            tlv_entry = {
+                'type': get_tlv_type_name(tlv.type, tlv_types),
+                'type_id_v1': tlv.type,
+                'length': tlv.length
+            }
+
+            if tlv.length > 0:
+                tlv_entry['value'] = format_value_for_yaml(tlv.data, verbose)
+
+            result['tlvs'].append(tlv_entry)
+
+    except Exception as e:
+        log.error(f"Error parsing corpus file {corpus_file}: {e}")
+        result['error'] = str(e)
+
+    return result
+
+
+def main() -> None:
+    """Main function."""
+    parser = argparse.ArgumentParser(
+        description="Convert curl fuzzer corpus files to YAML format"
+    )
+    parser.add_argument(
+        "input",
+        help="Corpus file or directory to convert"
+    )
+    parser.add_argument(
+        "-o", "--output",
+        help="Output file (default: create yaml directory structure)"
+    )
+    parser.add_argument(
+        "--header",
+        help="Path to curl_fuzzer.h header file (default: auto-detect)",
+        type=Path
+    )
+    parser.add_argument(
+        "--pretty",
+        action="store_true",
+        help="Pretty-print YAML output"
+    )
+    parser.add_argument(
+        "--verbose",
+        action="store_true",
+        help="Include verbose output with multiple data representations"
+    )
+    parser.add_argument(
+        "--yaml-dir",
+        help="Base directory for YAML output (default: yaml/)",
+        type=Path,
+        default=Path("yaml")
+    )
+    parser.add_argument(
+        "--stdout",
+        action="store_true",
+        help="Output to stdout instead of creating files"
+    )
+
+    args = parser.parse_args()
+
+    # Auto-detect header file if not provided
+    if args.header is None:
+        header_file = Path(__file__).parent.parent.parent / "curl_fuzzer.h"
+    else:
+        header_file = args.header
+
+    # Extract TLV type definitions
+    tlv_types = extract_tlv_types_from_header(header_file)
+
+    input_path = Path(args.input)
+
+    # Configure YAML output
+    yaml_args = {
+        'default_flow_style': False,
+        'allow_unicode': True,
+    }
+
+    if args.pretty:
+        yaml_args['indent'] = 2
+        yaml_args['width'] = 120
+
+    if input_path.is_file():
+        # Single file processing
+        yaml_data = corpus_to_yaml(input_path, tlv_types, args.verbose)
+
+        if args.output:
+            # Use specified output file
+            with open(args.output, 'w', encoding='utf-8') as f:
+                yaml.dump(yaml_data, f, **yaml_args)
+            log.info(f"Output written to {args.output}")
+        elif args.stdout:
+            # Output to stdout
+            import sys
+            yaml.dump(yaml_data, sys.stdout, **yaml_args)
+        else:
+            # Create corresponding YAML file in yaml directory
+            yaml_file = create_yaml_path(input_path, args.yaml_dir)
+            yaml_file.parent.mkdir(parents=True, exist_ok=True)
+            with open(yaml_file, 'w', encoding='utf-8') as f:
+                yaml.dump(yaml_data, f, **yaml_args)
+            log.info(f"Output written to {yaml_file}")
+
+    elif input_path.is_dir():
+        # Directory processing - create yaml directory structure
+        if args.stdout:
+            log.error("Cannot output directory contents to stdout. Use --output for single file or remove --stdout.")
+            return
+
+        corpus_files = list(input_path.rglob("*"))
+        corpus_files = [f for f in corpus_files if f.is_file()]
+
+        processed_count = 0
+        for corpus_file in corpus_files:
+            try:
+                yaml_data = corpus_to_yaml(corpus_file, tlv_types, args.verbose)
+
+                if args.output:
+                    # For directory input with single output file, create a list
+                    if processed_count == 0:
+                        all_yaml_data = []
+                    all_yaml_data.append(yaml_data)
+                else:
+                    # Create corresponding YAML file maintaining directory structure
+                    # Use the input directory itself as the base, so subdirectories are preserved
+                    yaml_file = create_yaml_path(corpus_file, args.yaml_dir, input_path)
+                    yaml_file.parent.mkdir(parents=True, exist_ok=True)
+                    with open(yaml_file, 'w', encoding='utf-8') as f:
+                        yaml.dump(yaml_data, f, **yaml_args)
+                    log.debug(f"Converted {corpus_file} -> {yaml_file}")
+
+                processed_count += 1
+
+            except Exception as e:
+                log.error(f"Error processing {corpus_file}: {e}")
+
+        if args.output and processed_count > 0:
+            with open(args.output, 'w', encoding='utf-8') as f:
+                yaml.dump(all_yaml_data, f, **yaml_args)
+            log.info(f"Output written to {args.output}")
+        elif not args.output:
+            log.info(f"Processed {processed_count} files to {args.yaml_dir}/")
+
+    else:
+        raise FileNotFoundError(f"Input path {args.input} does not exist")
+
+
+def create_yaml_path(corpus_file: Path, yaml_base_dir: Path, corpus_base_dir: Path = None) -> Path:
+    """Create the corresponding YAML file path maintaining directory structure."""
+    if corpus_base_dir:
+        # For directory processing, maintain relative structure
+        try:
+            relative_path = corpus_file.relative_to(corpus_base_dir)
+            # Create YAML filename while preserving directory structure
+            yaml_path = yaml_base_dir / relative_path.with_suffix(relative_path.suffix + '.yaml')
+            return yaml_path
+        except ValueError:
+            # If corpus_file is not relative to corpus_base_dir, use just the filename
+            relative_path = corpus_file.name
+    else:
+        # For single file processing, use just the filename
+        relative_path = corpus_file.name
+
+    # Create YAML filename
+    yaml_filename = f"{relative_path}.yaml"
+    return yaml_base_dir / yaml_filename
+
+
+def run() -> None:
+    """Set up common logging and run the main function.
+
+    This is the target named by the ``corpus_to_yaml`` entry in
+    pyproject.toml (``curl_fuzzer_tools.corpus_to_yaml:run``).
+    """
+    # common_logging is imported from curl_fuzzer_tools and is not visible
+    # here; presumably it configures logging for this module - TODO confirm.
+    common_logging(__name__, __file__)
+    main()
+
+
+# Allow the module to be executed directly as a script.
+if __name__ == "__main__":
+    run()
@@ -0,0 +1,11 @@
+corpus_file: corpora/curl_fuzzer/test1
+file_size: 289
+tlvs:
+- length: 21
+  type: URL
+  type_id_v1: 1
+  value: http://127.0.0.1:80/1
+- length: 256
+  type: RESPONSE0
+  type_id_v1: 2
+  value: 485454502f312e3120323030204f4b0a446174653a205468752c203039204e6f7620323031302031343a34393a303020474d540a5365727665723a20746573742d7365727665722f66616b650a4c6173742d4d6f6469666965643a205475652c203133204a756e20323030302031323a31303a303020474d540a455461673a202232313032352d6463372d3339343632343938220a4163636570742d52616e6765733a2062797465730a436f6e74656e742d4c656e6774683a20360a436f6e6e656374696f6e3a20636c6f73650a436f6e74656e742d547970653a20746578742f68746d6c0a46756e6e792d686561643a207965737965730a0a2d666f6f2d0a
@@ -0,0 +1,15 @@
+corpus_file: corpora/curl_fuzzer/test10
+file_size: 226
+tlvs:
+- length: 32
+  type: URL
+  type_id_v1: 1
+  value: http://127.0.0.1:8990/we/want/10
+- length: 98
+  type: RESPONSE0
+  type_id_v1: 2
+  value: 485454502f312e3020323030204f4b20737773636c6f73650a446174653a205468752c203039204e6f7620323031302031343a34393a303020474d540a5365727665723a20746573742d7365727665722f66616b650a0a626c61626c61626c610a0a
+- length: 78
+  type: UPLOAD1
+  type_id_v1: 8
+  value: 57656972640a202020202066696c650a202020202020202020746f0a20202075706c6f61640a666f720a20202074657374696e670a7468650a2020205055540a202020202020666561747572650a
@@ -0,0 +1,11 @@
+corpus_file: corpora/curl_fuzzer/test100
+file_size: 675
+tlvs:
+- length: 30
+  type: URL
+  type_id_v1: 1
+  value: ftp://127.0.0.1:8992/test-100/
+- length: 633
+  type: RESPONSE0
+  type_id_v1: 2
+  value: 746f74616c2032300d0a64727778722d78722d78202020382039382020202020202039382020202020202020202020353132204f63742032322031333a3036202e0d0a64727778722d78722d78202020382039382020202020202039382020202020202020202020353132204f63742032322031333a3036202e2e0d0a64727778722d78722d78202020322039382020202020202039382020202020202020202020353132204d6179202032202031393936206375726c2d72656c65617365730d0a2d722d2d722d2d722d2d202020312030202020202020202031202020202020202020202020203335204a756c20313620203139393620524541444d450d0a6c727778727778727778202020312030202020202020202031202020202020202020202020202037204465632020392020313939392062696e202d3e207573722f62696e0d0a64722d78722d78722d78202020322030202020202020202031202020202020202020202020353132204f6374202031202031393937206465760d0a64727778727778727778202020322039382020202020202039382020202020202020202020353132204d61792032392031363a303420646f776e6c6f61642e68746d6c0d0a64722d78722d78722d78202020322030202020202020202031202020202020202020202020353132204e6f76203330202031393935206574630d0a64727778727778727778202020322039382020202020202031202020202020202020202020353132204f63742033302031343a3333207075620d0a64722d78722d78722d78202020352030202020202020202031202020202020202020202020353132204f6374202031202031393937207573720d0a0d0a
@@ -0,0 +1,31 @@
+corpus_file: corpora/curl_fuzzer/test100_2
+file_size: 128
+tlvs:
+- length: 30
+  type: URL
+  type_id_v1: 1
+  value: ftp://127.0.0.1:8992/test-100/
+- length: 11
+  type: RESPONSE0
+  type_id_v1: 2
+  value: 3232302048656c6c6f210a
+- length: 9
+  type: RESPONSE1
+  type_id_v1: 17
+  value: 32303020537572650a
+- length: 9
+  type: RESPONSE2
+  type_id_v1: 18
+  value: 32303020537572650a
+- length: 9
+  type: RESPONSE3
+  type_id_v1: 19
+  value: 32303020537572650a
+- length: 9
+  type: RESPONSE4
+  type_id_v1: 20
+  value: 34303020537572650a
+- length: 9
+  type: RESPONSE5
+  type_id_v1: 21
+  value: 32303020537572650a