3232import shutil
3333import tarfile
3434import tempfile
35- from typing import Any , Callable , Self
35+ from typing import Any , Callable , Mapping , Self , Sequence
36+ import urllib .request
3637
3738import pathlib
3839
5152# Min archive version we currently support.
5253_MIN_SUPPORTED_ARCHIVE_VERSION = 1
5354# The current version of the build archive format.
54- ARCHIVE_VERSION = 3
55+ ARCHIVE_VERSION = 4
# OSS-Fuzz $OUT dir: taken from the `OUT` environment variable, falling back
# to "/out" when the variable is not set.
OUT = pathlib.Path(os.getenv("OUT", "/out"))
# OSS-Fuzz coverage info: URL of the "latest_report_info" JSON document for
# the project named by the `PROJECT_NAME` environment variable.
# NOTE(review): the URL is built once at import time; when `PROJECT_NAME` is
# unset `os.getenv` returns None and the URL ends in "None.json" -- confirm
# that a failed fetch is acceptable for every caller.
_COVERAGE_INFO_URL = ("https://storage.googleapis.com/oss-fuzz-coverage/"
                      f"latest_report_info/{os.getenv('PROJECT_NAME')}.json")
5561
5662# Will be replaced with the input file for target execution.
5763INPUT_FILE = "<input_file>"
@@ -128,14 +134,85 @@ def from_dict(self, data: dict[str, Any]) -> Self:
128134 )
129135
130136
137+ class BinaryConfigKind (enum .StrEnum ):
138+ """The kind of binary configurations."""
139+
140+ OSS_FUZZ = enum .auto ()
141+ BINARY = enum .auto ()
142+
143+ def validate_in (self , options : list [Self ]):
144+ if self not in options :
145+ raise ValueError (
146+ f"Expected one of the following binary config kinds: { options } , "
147+ f"but got { self } " )
148+
149+
@dataclasses.dataclass(frozen=True, kw_only=True)
class BinaryConfig:
  """Base class for binary configurations.

  Attributes:
    kind: The kind of binary configuration.
  """

  kind: BinaryConfigKind

  @classmethod
  def from_dict(cls, config_dict: Mapping[Any, Any]) -> Self:
    """Deserializes the matching `BinaryConfig` subclass from a dict.

    The subclass is selected by `config_dict["kind"]`. A `binary_args` value
    given as a single string (deprecated) is split with `shlex.split` into a
    list before being handed to the subclass; `config_dict` itself is never
    modified.

    Args:
      config_dict: The serialized configuration; must contain a "kind" key.

    Returns:
      The configuration built by the subclass registered for the kind.

    Raises:
      ValueError: If the kind has no registered subclass.
    """
    subclass_by_kind = {
        BinaryConfigKind.OSS_FUZZ: CommandLineBinaryConfig,
        BinaryConfigKind.BINARY: CommandLineBinaryConfig,
    }
    kind = config_dict["kind"]
    if kind not in subclass_by_kind:
      raise ValueError(f"Unknown BinaryConfigKind: {kind}")

    raw_args = config_dict.get("binary_args")
    if not isinstance(raw_args, str):
      return subclass_by_kind[kind].from_dict(config_dict)

    logging.warning(
        "BinaryConfig: binary_args is type string instead of list."
        " This is deprecated. Converting to list. Args: %s",
        raw_args,
    )
    # Build an upgraded copy so the caller's mapping stays untouched.
    upgraded = dict(config_dict, binary_args=shlex.split(raw_args))
    return subclass_by_kind[kind].from_dict(upgraded)
181+
182+
@dataclasses.dataclass(frozen=True, kw_only=True)
class CommandLineBinaryConfig(BinaryConfig):
  """Configuration for a command-line userspace binary.

  Attributes:
    binary_name: The name of the binary.
    binary_args: The arguments to pass to the binary, for example
      "<input_file>".
  """

  binary_name: str
  binary_args: list[str]

  @classmethod
  def from_dict(cls, config_dict: Mapping[Any, Any]) -> Self:
    """Deserializes the `CommandLineBinaryConfig` from a dict.

    Args:
      config_dict: Mapping with the keys "kind", "binary_name" and
        "binary_args".

    Returns:
      An instance of `cls` populated from `config_dict`.

    Raises:
      KeyError: If one of the required keys is missing.
      ValueError: If "kind" is not a valid `BinaryConfigKind` or is not one of
        the kinds this class supports.
    """
    kind = BinaryConfigKind(config_dict["kind"])
    kind.validate_in([BinaryConfigKind.OSS_FUZZ, BinaryConfigKind.BINARY])
    # Construct through `cls` so that the declared `Self` return type also
    # holds when the method is invoked on a subclass.
    return cls(
        kind=kind,
        binary_name=config_dict["binary_name"],
        binary_args=config_dict["binary_args"],
    )
200+
201+
131202@dataclasses .dataclass (frozen = True )
132203class Manifest :
133204 """Contains general meta-information about the snapshot."""
134205
206+ # The name of the target.
135207 name : str
208+ # A unique identifier for the snapshot (not necessarily a valid UUID).
136209 uuid : str
137- binary_name : str
138- binary_args : list [str ] | None = None
210+ # A fixed path that shared libraries stored at `./obj/lib` should be mounted
211+ # at before running the target.
212+ lib_mount_path : pathlib .Path | None
213+
214+ # The binary configuration used to build the snapshot.
215+ binary_config : BinaryConfig
139216
140217 # The path prefix of the actual build directory (e.g., a temporary file in
141218 # the build host). It's used during replay to remove noisy source-file
@@ -154,7 +231,8 @@ class Manifest:
154231 # }
155232 # }
156233 source_map : dict [pathlib .Path , SourceRef ] | None = None
157- lib_mount_path : pathlib .Path | None = None
234+
235+ # Version of the manifest spec.
158236 version : int = ARCHIVE_VERSION
159237
160238 @classmethod
@@ -172,22 +250,42 @@ def from_dict(cls, data: dict[str, Any]) -> Self:
172250 binary_args = _get_mapped (data , "binary_args" , shlex .split )
173251 else :
174252 binary_args = data .get ("binary_args" )
253+ if data ["version" ] < 4 :
254+ binary_config = CommandLineBinaryConfig (
255+ kind = BinaryConfigKind .BINARY ,
256+ binary_name = data ["binary_name" ],
257+ binary_args = binary_args or [],
258+ )
259+ else :
260+ binary_config = _get_mapped (data , "binary_config" , BinaryConfig .from_dict )
261+
262+ version = data ["version" ]
263+ if _MIN_SUPPORTED_ARCHIVE_VERSION <= version <= ARCHIVE_VERSION :
264+ # Upgrade archive version - we have upgraded all necessary fields.
265+ version = ARCHIVE_VERSION
266+ else :
267+ logging .warning (
268+ "Unsupported manifest version %s detected. Not upgrading." , version )
175269 return Manifest (
270+ version = version ,
176271 name = data ["name" ],
177272 uuid = data ["uuid" ],
178- binary_name = data ["binary_name" ],
179- binary_args = binary_args ,
180273 lib_mount_path = lib_mount_path ,
181274 source_map = _get_mapped (data , "source_map" , source_map_from_dict ),
182275 source_dir_prefix = data .get ("source_dir_prefix" ),
183276 reproducibility = _get_mapped (data , "reproducibility" ,
184277 Reproducibility .from_dict ),
185- version = data [ "version" ] ,
278+ binary_config = binary_config ,
186279 )
187280
188281 def to_dict (self ) -> dict [str , Any ]:
189282 """Converts a Manifest object to a serializable dict."""
190283 data = dataclasses .asdict (self )
284+
285+ patches = data ["binary_config" ].get ("patches" )
286+ if patches :
287+ patches [:] = [path .as_posix () for path in patches ]
288+
191289 data ["lib_mount_path" ] = _get_mapped (data , "lib_mount_path" ,
192290 lambda x : x .as_posix ())
193291 data ["source_map" ] = _get_mapped (data , "source_map" , source_map_to_dict )
@@ -211,7 +309,7 @@ def validate(self) -> None:
211309 raise RuntimeError (
212310 "Build archive with version 1 has an alternative lib_mount_path set"
213311 f" ({ self .lib_mount_path } ). This is not a valid archive." )
214- if not self .name or not self .uuid or not self .binary_name :
312+ if not self .name or not self .uuid or not self .binary_config :
215313 raise RuntimeError (
216314 "Attempting to load a manifest with missing fields. Expected all"
217315 " fields to be set, but got {self}" )
@@ -229,10 +327,12 @@ def validate(self) -> None:
229327 raise RuntimeError (f"Type mismatch for field { k } : expected { v } , got"
230328 f" { type (getattr (self , k ))} " )
231329 # We updated from string to list in version 3, make sure this propagated.
232- if self .binary_args is not None and not isinstance (self .binary_args , list ):
233- raise RuntimeError (
234- "Type mismatch for field binary_args: expected list, got"
235- f" { type (self .binary_args )} " )
330+ binary_config = self .binary_config
331+ if hasattr (binary_config , "binary_args" ):
332+ if not isinstance (binary_config .binary_args , list ):
333+ raise RuntimeError (
334+ "Type mismatch for field binary_config.binary_args: expected list,"
335+ f"got { type (binary_config .binary_args )} " )
236336
237337 def save_build (
238338 self ,
@@ -246,6 +346,12 @@ def save_build(
246346 ) -> None :
247347 """Saves a build archive with this Manifest."""
248348 self .validate ()
349+
350+ if not hasattr (self .binary_config , "binary_name" ):
351+ raise RuntimeError (
352+ "Attempting to save a binary config type without binary_name."
353+ " This is not yet supported. Kind: {self.binary_config.kind}." )
354+
249355 with tempfile .NamedTemporaryFile () as tmp :
250356 mode = "w:gz" if archive_path .suffix .endswith ("gz" ) else "w"
251357 with tarfile .open (tmp .name , mode ) as tar :
@@ -294,15 +400,72 @@ def _save_dir(
294400 _save_dir (source_dir , SRC_DIR , exclude_build_artifacts = True )
295401 # Only include the relevant target for the snapshot, to save on disk
296402 # space.
297- _save_dir (build_dir , OBJ_DIR , only_include_target = self .binary_name )
403+ _save_dir (
404+ build_dir ,
405+ OBJ_DIR ,
406+ only_include_target = self .binary_config .binary_name ,
407+ )
298408 _save_dir (index_dir , INDEX_DIR )
409+ if self .binary_config .kind == BinaryConfigKind .OSS_FUZZ :
410+ copied_files = [tar_info .name for tar_info in tar .getmembers ()]
411+ try :
412+ report_missing_source_files (self .binary_config .binary_name ,
413+ copied_files , tar )
414+ except Exception : # pylint: disable=broad-except
415+ logging .exception ("Failed to report missing source files." )
299416
300417 if os .path .exists (archive_path ) and not overwrite :
301418 logging .warning ("Skipping existing archive %s" , archive_path )
302419 else :
303420 shutil .copyfile (tmp .name , archive_path )
304421
305422
def report_missing_source_files(binary_name: str, copied_files: list[str],
                                tar: tarfile.TarFile):
  """Saves a report of missing source files to the snapshot tarball.

  Files covered by the target (per `get_covered_files`) are matched against
  the archived files by their (parent directory name, file name) pair. If any
  covered file has no match, the original covered paths are sorted, joined
  with single spaces and stored as `<binary_name>_missing_files.txt` both in
  `tar` and in the `OUT` directory. Nothing is written when no file is
  missing.

  Args:
    binary_name: Name of the target binary; used to look up coverage and to
      name the report.
    copied_files: Names of the members already added to the archive.
    tar: The open, writable archive that receives the report.
  """
  copied = {_get_comparable_path(file) for file in copied_files}
  covered_files = {
      _get_comparable_path(path): path
      for path in get_covered_files(binary_name)
  }
  missing = set(covered_files) - copied
  if not missing:
    return
  logging.info("Reporting missing files: %s", missing)
  missing_report_lines = sorted(covered_files[k] for k in missing)
  report_name = f"{binary_name}_missing_files.txt"
  missing_report = " ".join(missing_report_lines)
  missing_report_bytes = missing_report.encode("utf-8")
  tar_info = tarfile.TarInfo(name=report_name)
  # `TarFile.addfile` copies exactly `tarinfo.size` bytes from `fileobj`, and
  # a fresh `TarInfo` has size 0 -- without this the archived report is empty.
  tar_info.size = len(missing_report_bytes)
  tar.addfile(tarinfo=tar_info, fileobj=io.BytesIO(missing_report_bytes))
  # Write the on-disk copy with the same encoding as the archived one.
  with open(os.path.join(OUT, report_name), "w", encoding="utf-8") as fp:
    fp.write(missing_report)
443+
444+
445+ def _get_comparable_path (path : str ) -> tuple [str , str ]:
446+ return os .path .basename (os .path .dirname (path )), os .path .basename (path )
447+
448+
def get_covered_files(target: str) -> Sequence[str]:
  """Returns the files covered by fuzzing on OSS-Fuzz by the target.

  Two JSON documents are downloaded: the report info at `_COVERAGE_INFO_URL`,
  whose "fuzzer_stats_dir" entry (a gs:// location rewritten to its
  storage.googleapis.com HTTPS form) locates the per-target stats, and then
  `<stats dir>/<target>.json` itself.

  Args:
    target: Name of the fuzz target whose coverage stats are fetched.

  Returns:
    The "filename" of every file entry whose covered-region count is truthy.
  """
  with urllib.request.urlopen(_COVERAGE_INFO_URL) as resp:
    latest_info = json.load(resp)

  stats_dir = latest_info.get("fuzzer_stats_dir")
  stats_url = stats_dir.replace("gs://", "https://storage.googleapis.com/")

  with urllib.request.urlopen(f"{stats_url}/{target}.json") as resp:
    target_cov = json.load(resp)

  covered = []
  for entry in target_cov["data"][0]["files"]:
    if entry["summary"]["regions"]["covered"]:
      covered.append(entry["filename"])
  return covered
467+
468+
306469def _get_mapped (data : dict [str , Any ], key : str ,
307470 mapper : Callable [[Any ], Any ]) -> Any | None :
308471 """Get a value from a dict and apply a mapper to it, if it's not None."""
0 commit comments