Skip to content

Commit a97f196

Browse files
committed
Convert corpora cases to YAML format while in repo. They get converted to
binary format during the oss-fuzz process.
1 parent f070104 commit a97f196

File tree

148 files changed

+102666
-0
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

148 files changed

+102666
-0
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ read_corpus = "curl_fuzzer_tools.read_corpus:run"
2929
generate_corpus = "curl_fuzzer_tools.generate_corpus:run"
3030
corpus_to_pcap = "curl_fuzzer_tools.corpus_to_pcap:run"
3131
generate_matrix = "curl_fuzzer_tools.generate_matrix:run"
32+
corpus_to_yaml = "curl_fuzzer_tools.corpus_to_yaml:run"
3233

3334
[build-system]
3435
requires = ["setuptools>=61.0"]
Lines changed: 288 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
#!/usr/bin/env python3
2+
"""Tool to read corpus files and convert them to YAML format."""
3+
4+
import argparse
5+
import logging
6+
import re
7+
import yaml
8+
from pathlib import Path
9+
from typing import Dict, Any, List, Union
10+
11+
from curl_fuzzer_tools import common_logging
12+
from curl_fuzzer_tools.corpus import TLVDecoder
13+
14+
# Module-level logger shared by every function in this tool.
log = logging.getLogger(__name__)
15+
16+
17+
def extract_tlv_types_from_header(header_file_path: Path) -> Dict[int, str]:
    """Extract TLV type definitions from the curl_fuzzer.h header file.

    Scans the header text for ``#define TLV_TYPE_<NAME> <value>`` entries and
    builds a reverse lookup from the numeric value to ``<NAME>``. The value
    may be written in decimal or in C hexadecimal notation (``0x...``).

    Args:
        header_file_path: Location of the header file to scan.

    Returns:
        Mapping of TLV type number to its name with the ``TLV_TYPE_`` prefix
        removed. An empty mapping is returned (and a warning logged) when the
        header does not exist. If several names share one value, the
        definition appearing last in the file wins.
    """
    if not header_file_path.exists():
        log.warning(f"Header file {header_file_path} not found, using built-in types")
        return {}

    # Only ASCII tokens are matched below, so decode with a fixed encoding and
    # tolerate stray bytes instead of depending on the platform default.
    content = header_file_path.read_text(encoding="utf-8", errors="replace")

    # The hexadecimal alternative must come first: with a plain ``\d+`` a
    # value such as "0x10" would be captured as the decimal literal "0".
    pattern = r'#define\s+TLV_TYPE_(\w+)\s+(0[xX][0-9a-fA-F]+|\d+)'

    tlv_types: Dict[int, str] = {}
    for match in re.finditer(pattern, content):
        literal = match.group(2)
        base = 16 if literal[:2].lower() == "0x" else 10
        tlv_types[int(literal, base)] = match.group(1)

    log.info(f"Extracted {len(tlv_types)} TLV type definitions")
    return tlv_types
38+
39+
40+
def get_tlv_type_name(tlv_type: int, tlv_types: Dict[int, str]) -> str:
    """Return the symbolic name of *tlv_type*.

    Falls back to ``UNKNOWN_<tlv_type>`` when the number is not present in
    *tlv_types*.
    """
    if tlv_type in tlv_types:
        return tlv_types[tlv_type]
    return f"UNKNOWN_{tlv_type}"
43+
44+
45+
def format_value_for_yaml(data: bytes, verbose: bool = False) -> Union[str, int, Dict[str, Any]]:
    """Format a TLV value appropriately for YAML output.

    The representation is chosen in this order:

    1. Empty data becomes the empty string.
    2. Data that is valid UTF-8 and fully printable is returned as text.
    3. Any other 4-byte value is read as an unsigned big-endian integer.
    4. Everything else is rendered as a hexadecimal string.

    Args:
        data: Raw TLV payload.
        verbose: When true, non-text values are returned as dictionaries
            carrying several representations (``integer``/``hex``, or
            ``hex`` plus ``partial_text``/``note``) instead of a bare scalar.

    Returns:
        A string, an integer, or (in verbose mode) a dictionary.

    NOTE(review): in non-verbose mode a hex dump and genuine printable text
    are both plain strings, so text that happens to look like hex cannot be
    told apart from binary data by the output alone - confirm the consumer
    of this YAML copes with that.
    """
    if not data:
        return ""

    # Strict decode: only fully valid, fully printable UTF-8 counts as text.
    try:
        text = data.decode('utf-8')
    except UnicodeDecodeError:
        text = None
    if text is not None and text.isprintable():
        return text

    hex_repr = data.hex()

    # 4-byte payloads are treated as unsigned integers in network byte order.
    if len(data) == 4:
        value = int.from_bytes(data, byteorder='big')
        if verbose:
            return {
                'integer': value,
                'hex': hex_repr
            }
        return value

    if not verbose:
        return hex_repr

    # Lossy decode never raises; it is only used to show a readable hint
    # next to the authoritative hex dump.
    lossy_text = data.decode('utf-8', errors='replace')
    if any(c.isprintable() or c.isspace() for c in lossy_text):
        return {
            'hex': hex_repr,
            'partial_text': lossy_text,
            'note': 'Binary data with some printable characters'
        }

    return {
        'hex': hex_repr,
        'note': 'Binary data'
    }
97+
98+
99+
def corpus_to_yaml(corpus_file: Path, tlv_types: Dict[int, str], verbose: bool = False) -> Dict[str, Any]:
    """Read one corpus file and describe it as a YAML-ready dictionary.

    The result always carries ``corpus_file``, ``tlvs`` and ``file_size``.
    Each decoded TLV contributes an entry with ``type``, ``type_id_v1`` and
    ``length``, plus ``value`` when the TLV is non-empty. If decoding fails
    part-way, the error is logged, stored under ``error``, and the entries
    decoded so far are kept.
    """
    result = {'corpus_file': str(corpus_file), 'tlvs': []}

    with open(corpus_file, "rb") as handle:
        raw = handle.read()
    result['file_size'] = len(raw)

    entries = result['tlvs']
    try:
        for tlv in TLVDecoder(raw):
            entry = {
                'type': get_tlv_type_name(tlv.type, tlv_types),
                'type_id_v1': tlv.type,
                'length': tlv.length,
            }
            # Zero-length TLVs carry no payload, so no 'value' key is emitted.
            if tlv.length > 0:
                entry['value'] = format_value_for_yaml(tlv.data, verbose)
            entries.append(entry)
    except Exception as exc:
        log.error(f"Error parsing corpus file {corpus_file}: {exc}")
        result['error'] = str(exc)

    return result
130+
131+
132+
def _write_yaml_file(yaml_data: Any, destination: Union[str, Path], yaml_args: Dict[str, Any]) -> None:
    """Dump *yaml_data* to *destination* as UTF-8 encoded YAML."""
    with open(destination, 'w', encoding='utf-8') as f:
        yaml.dump(yaml_data, f, **yaml_args)


def main() -> None:
    """Parse command-line arguments and convert corpus input to YAML.

    For a single input file the result goes to ``--output`` when given,
    otherwise to stdout when ``--stdout`` is set, otherwise to
    ``<yaml-dir>/<name>.yaml``.

    For a directory input every file found recursively is converted. With
    ``--output`` all results are collected into one YAML list; otherwise each
    corpus file gets its own YAML file under ``--yaml-dir``, mirroring the
    input directory layout. Files that fail are logged and skipped.

    Raises:
        FileNotFoundError: If the input path is neither a file nor a
            directory.
    """
    parser = argparse.ArgumentParser(
        description="Convert curl fuzzer corpus files to YAML format"
    )
    parser.add_argument(
        "input",
        help="Corpus file or directory to convert"
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file (default: create yaml directory structure)"
    )
    parser.add_argument(
        "--header",
        help="Path to curl_fuzzer.h header file (default: auto-detect)",
        type=Path
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print YAML output"
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Include verbose output with multiple data representations"
    )
    parser.add_argument(
        "--yaml-dir",
        help="Base directory for YAML output (default: yaml/)",
        type=Path,
        default=Path("yaml")
    )
    parser.add_argument(
        "--stdout",
        action="store_true",
        help="Output to stdout instead of creating files"
    )

    args = parser.parse_args()

    # Auto-detect header file if not provided
    if args.header is None:
        header_file = Path(__file__).parent.parent.parent / "curl_fuzzer.h"
    else:
        header_file = args.header

    # Extract TLV type definitions
    tlv_types = extract_tlv_types_from_header(header_file)

    input_path = Path(args.input)

    # Configure YAML output
    yaml_args: Dict[str, Any] = {
        'default_flow_style': False,
        'allow_unicode': True,
    }

    if args.pretty:
        yaml_args['indent'] = 2
        yaml_args['width'] = 120

    if input_path.is_file():
        # Single file processing
        yaml_data = corpus_to_yaml(input_path, tlv_types, args.verbose)

        if args.output:
            # Use specified output file
            _write_yaml_file(yaml_data, args.output, yaml_args)
            log.info(f"Output written to {args.output}")
        elif args.stdout:
            # Output to stdout
            import sys
            yaml.dump(yaml_data, sys.stdout, **yaml_args)
        else:
            # Create corresponding YAML file in yaml directory
            yaml_file = create_yaml_path(input_path, args.yaml_dir)
            yaml_file.parent.mkdir(parents=True, exist_ok=True)
            _write_yaml_file(yaml_data, yaml_file, yaml_args)
            log.info(f"Output written to {yaml_file}")

    elif input_path.is_dir():
        # Directory processing - create yaml directory structure
        if args.stdout:
            log.error("Cannot output directory contents to stdout. Use --output for single file or remove --stdout.")
            return

        # rglob order depends on the filesystem; sort so the aggregated
        # output and the log sequence are reproducible between runs.
        corpus_files = sorted(p for p in input_path.rglob("*") if p.is_file())

        # Only filled when --output requests a single aggregated file.
        all_yaml_data: List[Dict[str, Any]] = []
        processed_count = 0
        for corpus_file in corpus_files:
            try:
                yaml_data = corpus_to_yaml(corpus_file, tlv_types, args.verbose)

                if args.output:
                    # For directory input with single output file, create a list
                    all_yaml_data.append(yaml_data)
                else:
                    # Create corresponding YAML file maintaining directory structure
                    # Use the input directory itself as the base, so subdirectories are preserved
                    yaml_file = create_yaml_path(corpus_file, args.yaml_dir, input_path)
                    yaml_file.parent.mkdir(parents=True, exist_ok=True)
                    _write_yaml_file(yaml_data, yaml_file, yaml_args)
                    log.debug(f"Converted {corpus_file} -> {yaml_file}")

                processed_count += 1

            except Exception as e:
                # Best effort: one unreadable file must not abort the batch.
                log.error(f"Error processing {corpus_file}: {e}")

        if args.output and processed_count > 0:
            _write_yaml_file(all_yaml_data, args.output, yaml_args)
            log.info(f"Output written to {args.output}")
        elif not args.output:
            log.info(f"Processed {processed_count} files to {args.yaml_dir}/")

    else:
        raise FileNotFoundError(f"Input path {args.input} does not exist")
258+
259+
260+
def create_yaml_path(corpus_file: Path, yaml_base_dir: Path, corpus_base_dir: Path = None) -> Path:
    """Map a corpus file to the location of its YAML counterpart.

    When *corpus_base_dir* is given and *corpus_file* lies beneath it, the
    relative sub-path is reproduced under *yaml_base_dir* with ``.yaml``
    appended to the file name. In every other case the YAML file is placed
    directly in *yaml_base_dir* as ``<file name>.yaml``.
    """
    if corpus_base_dir:
        try:
            inner = corpus_file.relative_to(corpus_base_dir)
            # Append rather than replace, so "a.bin" becomes "a.bin.yaml".
            return yaml_base_dir / inner.with_suffix(inner.suffix + '.yaml')
        except ValueError:
            # Not located under the base directory: use the flat layout below.
            pass

    return yaml_base_dir / f"{corpus_file.name}.yaml"
279+
280+
281+
def run() -> None:
    """Entry point: invoke the shared logging setup, then hand over to main()."""
    common_logging(__name__, __file__)
    main()


# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    run()

yaml/curl_fuzzer/test1.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
corpus_file: corpora/curl_fuzzer/test1
2+
file_size: 289
3+
tlvs:
4+
- length: 21
5+
type: URL
6+
type_id_v1: 1
7+
value: http://127.0.0.1:80/1
8+
- length: 256
9+
type: RESPONSE0
10+
type_id_v1: 2
11+
value: 485454502f312e3120323030204f4b0a446174653a205468752c203039204e6f7620323031302031343a34393a303020474d540a5365727665723a20746573742d7365727665722f66616b650a4c6173742d4d6f6469666965643a205475652c203133204a756e20323030302031323a31303a303020474d540a455461673a202232313032352d6463372d3339343632343938220a4163636570742d52616e6765733a2062797465730a436f6e74656e742d4c656e6774683a20360a436f6e6e656374696f6e3a20636c6f73650a436f6e74656e742d547970653a20746578742f68746d6c0a46756e6e792d686561643a207965737965730a0a2d666f6f2d0a

yaml/curl_fuzzer/test10.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
corpus_file: corpora/curl_fuzzer/test10
2+
file_size: 226
3+
tlvs:
4+
- length: 32
5+
type: URL
6+
type_id_v1: 1
7+
value: http://127.0.0.1:8990/we/want/10
8+
- length: 98
9+
type: RESPONSE0
10+
type_id_v1: 2
11+
value: 485454502f312e3020323030204f4b20737773636c6f73650a446174653a205468752c203039204e6f7620323031302031343a34393a303020474d540a5365727665723a20746573742d7365727665722f66616b650a0a626c61626c61626c610a0a
12+
- length: 78
13+
type: UPLOAD1
14+
type_id_v1: 8
15+
value: 57656972640a202020202066696c650a202020202020202020746f0a20202075706c6f61640a666f720a20202074657374696e670a7468650a2020205055540a202020202020666561747572650a

yaml/curl_fuzzer/test100.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
corpus_file: corpora/curl_fuzzer/test100
2+
file_size: 675
3+
tlvs:
4+
- length: 30
5+
type: URL
6+
type_id_v1: 1
7+
value: ftp://127.0.0.1:8992/test-100/
8+
- length: 633
9+
type: RESPONSE0
10+
type_id_v1: 2
11+
value: 746f74616c2032300d0a64727778722d78722d78202020382039382020202020202039382020202020202020202020353132204f63742032322031333a3036202e0d0a64727778722d78722d78202020382039382020202020202039382020202020202020202020353132204f63742032322031333a3036202e2e0d0a64727778722d78722d78202020322039382020202020202039382020202020202020202020353132204d6179202032202031393936206375726c2d72656c65617365730d0a2d722d2d722d2d722d2d202020312030202020202020202031202020202020202020202020203335204a756c20313620203139393620524541444d450d0a6c727778727778727778202020312030202020202020202031202020202020202020202020202037204465632020392020313939392062696e202d3e207573722f62696e0d0a64722d78722d78722d78202020322030202020202020202031202020202020202020202020353132204f6374202031202031393937206465760d0a64727778727778727778202020322039382020202020202039382020202020202020202020353132204d61792032392031363a303420646f776e6c6f61642e68746d6c0d0a64722d78722d78722d78202020322030202020202020202031202020202020202020202020353132204e6f76203330202031393935206574630d0a64727778727778727778202020322039382020202020202031202020202020202020202020353132204f63742033302031343a3333207075620d0a64722d78722d78722d78202020352030202020202020202031202020202020202020202020353132204f6374202031202031393937207573720d0a0d0a

yaml/curl_fuzzer/test100_2.yaml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
corpus_file: corpora/curl_fuzzer/test100_2
2+
file_size: 128
3+
tlvs:
4+
- length: 30
5+
type: URL
6+
type_id_v1: 1
7+
value: ftp://127.0.0.1:8992/test-100/
8+
- length: 11
9+
type: RESPONSE0
10+
type_id_v1: 2
11+
value: 3232302048656c6c6f210a
12+
- length: 9
13+
type: RESPONSE1
14+
type_id_v1: 17
15+
value: 32303020537572650a
16+
- length: 9
17+
type: RESPONSE2
18+
type_id_v1: 18
19+
value: 32303020537572650a
20+
- length: 9
21+
type: RESPONSE3
22+
type_id_v1: 19
23+
value: 32303020537572650a
24+
- length: 9
25+
type: RESPONSE4
26+
type_id_v1: 20
27+
value: 34303020537572650a
28+
- length: 9
29+
type: RESPONSE5
30+
type_id_v1: 21
31+
value: 32303020537572650a

0 commit comments

Comments
 (0)