Skip to content

Commit 386aa65

Browse files
authored
Update indexer. (#13429)
1 parent 28f1997 commit 386aa65

File tree

4 files changed

+212
-21
lines changed

4 files changed

+212
-21
lines changed

infra/base-images/base-builder/indexer/clang_wrapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,7 @@ def read_cdb_fragments(cdb_path: Path) -> Any:
259259
)
260260
time.sleep(2**(i + 1))
261261
else:
262-
error = "CDB fragment {file} is invalid even after retries: {data}"
262+
error = f"CDB fragment {file} is invalid even after retries: {data}"
263263
if "test.c" in file.name or "conftest.c" in file.name:
264264
# Some build systems seem to have a weird issue where the autotools
265265
# generated `test.c` or `conftest.c` for testing compilers doesn't

infra/base-images/base-builder/indexer/index_build.py

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -450,8 +450,11 @@ def archive_target(target: BinaryMetadata) -> Path | None:
450450
manifest_types.Manifest(
451451
name=name,
452452
uuid=uuid,
453-
binary_name=target.name,
454-
binary_args=target.binary_args,
453+
binary_config=manifest_types.CommandLineBinaryConfig(
454+
kind=manifest_types.BinaryConfigKind.OSS_FUZZ,
455+
binary_name=target.name,
456+
binary_args=target.binary_args,
457+
),
455458
source_map=manifest_types.source_map_from_dict(json.loads(source_map)),
456459
lib_mount_path=lib_mount_path,
457460
).save_build(
@@ -476,6 +479,9 @@ def test_and_archive(target_args: list[str],
476479
targets = enumerate_build_targets(target_args)
477480
if targets_to_index:
478481
targets = [t for t in targets if t.name in targets_to_index]
482+
missing_targets = set(targets_to_index) - set(t.name for t in targets)
483+
if missing_targets:
484+
raise ValueError(f'Could not find specified targets {missing_targets}.')
479485

480486
logging.info('targets %s', targets)
481487
for target in targets:
@@ -521,13 +527,24 @@ def main():
521527
)
522528
parser.add_argument(
523529
'--target-args',
524-
default=manifest_types.INPUT_FILE,
530+
default=None,
525531
help=('Arguments to pass to the target when executing it. '
526532
'This string is shell-escaped (interpreted with `shlex.split`). '
533+
'The substring <input_file> will be replaced with the input path.'
534+
'Note: This is deprecated, use --target-arg instead.'),
535+
)
536+
parser.add_argument(
537+
'--target-arg',
538+
action='append',
539+
help=('An argument to pass to the target binary. '
527540
'The substring <input_file> will be replaced with the input path.'),
528541
)
529542
args = parser.parse_args()
530543

544+
if args.target_args and args.target_arg:
545+
raise ValueError(
546+
'Only one of --target-args or --target-arg can be specified.')
547+
531548
targets_to_index = None
532549
if args.targets:
533550
targets_to_index = args.targets.split(',')
@@ -542,7 +559,17 @@ def main():
542559
# We don't have an existing /out dir on oss-fuzz's build infra.
543560
OUT.mkdir(parents=True, exist_ok=True)
544561
build_project(targets_to_index)
545-
test_and_archive(shlex.split(args.target_args), targets_to_index)
562+
563+
if args.target_arg:
564+
target_args = args.target_arg
565+
elif args.target_args:
566+
logging.warning('--target-args is deprecated, use --target-arg instead.')
567+
target_args = shlex.split(args.target_args)
568+
else:
569+
logging.info('No target args specified.')
570+
target_args = []
571+
572+
test_and_archive(target_args, targets_to_index)
546573

547574
for snapshot in SNAPSHOT_DIR.iterdir():
548575
shutil.move(str(snapshot), OUT)

infra/base-images/base-builder/indexer/index_build_test.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,9 @@ def _check_archive(self, archive_path: Path):
7878
if file: # Make type checkers happy.
7979
manifest = json.load(file)
8080
self.assertTrue(manifest['lib_mount_path'])
81-
self.assertIsNotNone(tar.getmember('obj/' +
82-
manifest['binary_name']))
81+
self.assertIsNotNone(
82+
tar.getmember('obj/' +
83+
manifest['binary_config']['binary_name']))
8384

8485
self.assertTrue(has_obj_lib, "obj/lib/ was not found in the archive.")
8586
self.assertTrue(has_idx_sqlite,

infra/base-images/base-builder/indexer/manifest_types.py

Lines changed: 177 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
import shutil
3333
import tarfile
3434
import tempfile
35-
from typing import Any, Callable, Self
35+
from typing import Any, Callable, Mapping, Self, Sequence
36+
import urllib.request
3637

3738
import pathlib
3839

@@ -51,7 +52,12 @@
5152
# Min archive version we currently support.
5253
_MIN_SUPPORTED_ARCHIVE_VERSION = 1
5354
# The current version of the build archive format.
54-
ARCHIVE_VERSION = 3
55+
ARCHIVE_VERSION = 4
56+
# OSS-Fuzz $OUT dir.
57+
OUT = pathlib.Path(os.getenv("OUT", "/out"))
58+
# OSS-Fuzz coverage info.
59+
_COVERAGE_INFO_URL = ("https://storage.googleapis.com/oss-fuzz-coverage/"
60+
f"latest_report_info/{os.getenv('PROJECT_NAME')}.json")
5561

5662
# Will be replaced with the input file for target execution.
5763
INPUT_FILE = "<input_file>"
@@ -128,14 +134,85 @@ def from_dict(self, data: dict[str, Any]) -> Self:
128134
)
129135

130136

137+
class BinaryConfigKind(enum.StrEnum):
138+
"""The kind of binary configurations."""
139+
140+
OSS_FUZZ = enum.auto()
141+
BINARY = enum.auto()
142+
143+
def validate_in(self, options: list[Self]):
144+
if self not in options:
145+
raise ValueError(
146+
f"Expected one of the following binary config kinds: {options}, "
147+
f"but got {self}")
148+
149+
150+
@dataclasses.dataclass(frozen=True, kw_only=True)
class BinaryConfig:
  """Base class shared by all binary configurations.

  Attributes:
    kind: The kind of binary configuration.
  """

  kind: BinaryConfigKind

  @classmethod
  def from_dict(cls, config_dict: Mapping[Any, Any]) -> Self:
    """Deserializes the `BinaryConfig` subclass matching `config_dict["kind"]`.

    A `binary_args` entry given as a single string is split with `shlex.split`
    into a list (and a deprecation warning is logged) before the subclass
    deserializer runs. The input mapping itself is not modified.

    Args:
      config_dict: The serialized binary configuration.

    Returns:
      The configuration built by the subclass registered for the kind.

    Raises:
      ValueError: If `config_dict["kind"]` is not a known kind.
    """
    subclass_by_kind = {
        BinaryConfigKind.OSS_FUZZ: CommandLineBinaryConfig,
        BinaryConfigKind.BINARY: CommandLineBinaryConfig,
    }
    kind = config_dict["kind"]
    if kind not in subclass_by_kind:
      raise ValueError(f"Unknown BinaryConfigKind: {kind}")

    raw_args = config_dict.get("binary_args")
    if not isinstance(raw_args, str):
      # Nothing to upgrade; hand the mapping over untouched.
      return subclass_by_kind[kind].from_dict(config_dict)

    logging.warning(
        "BinaryConfig: binary_args is type string instead of list."
        " This is deprecated. Converting to list. Args: %s",
        raw_args,
    )
    upgraded = dict(config_dict, binary_args=shlex.split(raw_args))
    return subclass_by_kind[kind].from_dict(upgraded)
181+
182+
183+
@dataclasses.dataclass(frozen=True, kw_only=True)
class CommandLineBinaryConfig(BinaryConfig):
  """Binary configuration for a userspace binary run from the command line.

  Attributes:
    binary_name: The name of the binary.
    binary_args: The arguments to pass to the binary.
  """

  binary_name: str
  binary_args: list[str]

  @classmethod
  def from_dict(cls, config_dict: Mapping[Any, Any]) -> Self:
    """Builds a `CommandLineBinaryConfig` from its serialized dict form.

    Args:
      config_dict: A mapping with the keys `kind`, `binary_name` and
        `binary_args`.

    Returns:
      The deserialized configuration.

    Raises:
      ValueError: If `kind` is not a valid `BinaryConfigKind`, or is not one
        of the kinds accepted for command-line binaries.
      KeyError: If a required key is absent from `config_dict`.
    """
    accepted_kinds = [BinaryConfigKind.OSS_FUZZ, BinaryConfigKind.BINARY]
    parsed_kind = BinaryConfigKind(config_dict["kind"])
    parsed_kind.validate_in(accepted_kinds)
    field_values = {
        "kind": parsed_kind,
        "binary_name": config_dict["binary_name"],
        "binary_args": config_dict["binary_args"],
    }
    return CommandLineBinaryConfig(**field_values)
200+
201+
131202
@dataclasses.dataclass(frozen=True)
132203
class Manifest:
133204
"""Contains general meta-information about the snapshot."""
134205

206+
# The name of the target.
135207
name: str
208+
# A unique identifier for the snapshot (not necessarily a valid UUID).
136209
uuid: str
137-
binary_name: str
138-
binary_args: list[str] | None = None
210+
# A fixed path that shared libraries stored at `./obj/lib` should be mounted
211+
# at before running the target.
212+
lib_mount_path: pathlib.Path | None
213+
214+
# The binary configuration used to build the snapshot.
215+
binary_config: BinaryConfig
139216

140217
# The path prefix of the actual build directory (e.g., a temporary file in
141218
# the build host). It's used during replay to remove noisy source-file
@@ -154,7 +231,8 @@ class Manifest:
154231
# }
155232
# }
156233
source_map: dict[pathlib.Path, SourceRef] | None = None
157-
lib_mount_path: pathlib.Path | None = None
234+
235+
# Version of the manifest spec.
158236
version: int = ARCHIVE_VERSION
159237

160238
@classmethod
@@ -172,22 +250,42 @@ def from_dict(cls, data: dict[str, Any]) -> Self:
172250
binary_args = _get_mapped(data, "binary_args", shlex.split)
173251
else:
174252
binary_args = data.get("binary_args")
253+
if data["version"] < 4:
254+
binary_config = CommandLineBinaryConfig(
255+
kind=BinaryConfigKind.BINARY,
256+
binary_name=data["binary_name"],
257+
binary_args=binary_args or [],
258+
)
259+
else:
260+
binary_config = _get_mapped(data, "binary_config", BinaryConfig.from_dict)
261+
262+
version = data["version"]
263+
if _MIN_SUPPORTED_ARCHIVE_VERSION <= version <= ARCHIVE_VERSION:
264+
# Upgrade archive version - we have upgraded all necessary fields.
265+
version = ARCHIVE_VERSION
266+
else:
267+
logging.warning(
268+
"Unsupported manifest version %s detected. Not upgrading.", version)
175269
return Manifest(
270+
version=version,
176271
name=data["name"],
177272
uuid=data["uuid"],
178-
binary_name=data["binary_name"],
179-
binary_args=binary_args,
180273
lib_mount_path=lib_mount_path,
181274
source_map=_get_mapped(data, "source_map", source_map_from_dict),
182275
source_dir_prefix=data.get("source_dir_prefix"),
183276
reproducibility=_get_mapped(data, "reproducibility",
184277
Reproducibility.from_dict),
185-
version=data["version"],
278+
binary_config=binary_config,
186279
)
187280

188281
def to_dict(self) -> dict[str, Any]:
189282
"""Converts a Manifest object to a serializable dict."""
190283
data = dataclasses.asdict(self)
284+
285+
patches = data["binary_config"].get("patches")
286+
if patches:
287+
patches[:] = [path.as_posix() for path in patches]
288+
191289
data["lib_mount_path"] = _get_mapped(data, "lib_mount_path",
192290
lambda x: x.as_posix())
193291
data["source_map"] = _get_mapped(data, "source_map", source_map_to_dict)
@@ -211,7 +309,7 @@ def validate(self) -> None:
211309
raise RuntimeError(
212310
"Build archive with version 1 has an alternative lib_mount_path set"
213311
f" ({self.lib_mount_path}). This is not a valid archive.")
214-
if not self.name or not self.uuid or not self.binary_name:
312+
if not self.name or not self.uuid or not self.binary_config:
215313
raise RuntimeError(
216314
"Attempting to load a manifest with missing fields. Expected all"
217315
" fields to be set, but got {self}")
@@ -229,10 +327,12 @@ def validate(self) -> None:
229327
raise RuntimeError(f"Type mismatch for field {k}: expected {v}, got"
230328
f" {type(getattr(self, k))}")
231329
# We updated from string to list in version 3, make sure this propagated.
232-
if self.binary_args is not None and not isinstance(self.binary_args, list):
233-
raise RuntimeError(
234-
"Type mismatch for field binary_args: expected list, got"
235-
f" {type(self.binary_args)}")
330+
binary_config = self.binary_config
331+
if hasattr(binary_config, "binary_args"):
332+
if not isinstance(binary_config.binary_args, list):
333+
raise RuntimeError(
334+
"Type mismatch for field binary_config.binary_args: expected list,"
335+
f"got {type(binary_config.binary_args)}")
236336

237337
def save_build(
238338
self,
@@ -246,6 +346,12 @@ def save_build(
246346
) -> None:
247347
"""Saves a build archive with this Manifest."""
248348
self.validate()
349+
350+
if not hasattr(self.binary_config, "binary_name"):
351+
raise RuntimeError(
352+
"Attempting to save a binary config type without binary_name."
353+
" This is not yet supported. Kind: {self.binary_config.kind}.")
354+
249355
with tempfile.NamedTemporaryFile() as tmp:
250356
mode = "w:gz" if archive_path.suffix.endswith("gz") else "w"
251357
with tarfile.open(tmp.name, mode) as tar:
@@ -294,15 +400,72 @@ def _save_dir(
294400
_save_dir(source_dir, SRC_DIR, exclude_build_artifacts=True)
295401
# Only include the relevant target for the snapshot, to save on disk
296402
# space.
297-
_save_dir(build_dir, OBJ_DIR, only_include_target=self.binary_name)
403+
_save_dir(
404+
build_dir,
405+
OBJ_DIR,
406+
only_include_target=self.binary_config.binary_name,
407+
)
298408
_save_dir(index_dir, INDEX_DIR)
409+
if self.binary_config.kind == BinaryConfigKind.OSS_FUZZ:
410+
copied_files = [tar_info.name for tar_info in tar.getmembers()]
411+
try:
412+
report_missing_source_files(self.binary_config.binary_name,
413+
copied_files, tar)
414+
except Exception: # pylint: disable=broad-except
415+
logging.exception("Failed to report missing source files.")
299416

300417
if os.path.exists(archive_path) and not overwrite:
301418
logging.warning("Skipping existing archive %s", archive_path)
302419
else:
303420
shutil.copyfile(tmp.name, archive_path)
304421

305422

423+
def report_missing_source_files(binary_name: str, copied_files: list[str],
                                tar: tarfile.TarFile):
  """Saves a report of missing source files to the snapshot tarball.

  Files are compared by their (parent directory name, file name) pair, as
  computed by `_get_comparable_path`. A covered file is reported as missing
  when no entry of `copied_files` has the same pair. When at least one file is
  missing, the sorted, space-separated covered paths are stored as
  `<binary_name>_missing_files.txt` both in `tar` and in the `OUT` directory.
  Nothing is written when no file is missing.

  Args:
    binary_name: The target whose covered files are looked up; also used as
      the prefix of the report file name.
    copied_files: The names of the files already present in the archive.
    tar: The open tarball the report is added to.
  """
  copied_keys = {_get_comparable_path(file) for file in copied_files}
  covered_files = {
      _get_comparable_path(path): path
      for path in get_covered_files(binary_name)
  }
  missing = set(covered_files) - copied_keys
  if not missing:
    return
  logging.info("Reporting missing files: %s", missing)

  missing_report_lines = sorted(covered_files[key] for key in missing)
  report_name = f"{binary_name}_missing_files.txt"
  missing_report = " ".join(missing_report_lines)
  missing_report_bytes = missing_report.encode("utf-8")

  tar_info = tarfile.TarInfo(name=report_name)
  # `TarFile.addfile` reads exactly `tarinfo.size` bytes from `fileobj`, and a
  # freshly created `TarInfo` has size 0. Without setting the size the report
  # would be stored as an empty file.
  tar_info.size = len(missing_report_bytes)
  tar.addfile(tarinfo=tar_info, fileobj=io.BytesIO(missing_report_bytes))

  # Write the on-disk copy with the same encoding as the tarball member.
  with open(os.path.join(OUT, report_name), "w", encoding="utf-8") as fp:
    fp.write(missing_report)
443+
444+
445+
def _get_comparable_path(path: str) -> tuple[str, str]:
446+
return os.path.basename(os.path.dirname(path)), os.path.basename(path)
447+
448+
449+
def get_covered_files(target: str, timeout: float = 60.0) -> Sequence[str]:
  """Returns the files covered by fuzzing on OSS-Fuzz by the target.

  Downloads the JSON document at `_COVERAGE_INFO_URL`, rewrites its
  `fuzzer_stats_dir` entry from a `gs://` location to an HTTPS URL, downloads
  `<target>.json` from there and collects the file names whose summary reports
  a non-zero number of covered regions.

  Args:
    target: The name of the fuzz target to look up.
    timeout: Timeout in seconds applied to each of the two HTTP requests, so
      that a stalled connection cannot block the caller indefinitely.

  Returns:
    The `filename` of every file with at least one covered region.

  Raises:
    ValueError: If the coverage info has no `fuzzer_stats_dir` entry.
    urllib.error.URLError: If one of the downloads fails.
  """
  with urllib.request.urlopen(_COVERAGE_INFO_URL, timeout=timeout) as resp:
    latest_info = json.load(resp)

  stats_dir = latest_info.get("fuzzer_stats_dir")
  if not stats_dir:
    # Fail with a descriptive error rather than an AttributeError on None.
    raise ValueError(
        f"No 'fuzzer_stats_dir' in coverage info at {_COVERAGE_INFO_URL}")
  stats_url = stats_dir.replace("gs://", "https://storage.googleapis.com/")

  target_url = f"{stats_url}/{target}.json"
  with urllib.request.urlopen(target_url, timeout=timeout) as resp:
    target_cov = json.load(resp)

  files = target_cov["data"][0]["files"]
  return [
      file["filename"]
      for file in files
      if file["summary"]["regions"]["covered"]
  ]
467+
468+
306469
def _get_mapped(data: dict[str, Any], key: str,
307470
mapper: Callable[[Any], Any]) -> Any | None:
308471
"""Get a value from a dict and apply a mapper to it, if it's not None."""

0 commit comments

Comments
 (0)