Skip to content

Commit 4f8de76

Browse files
committed
[Centipede] Perform corpus minimization during a fuzzing session
This change calls Centipede's minimize_corpus method on each fuzzing round. This allows us to distill the corpus and only add useful units. On a local run, this decreased the number of units added from 2000+ to 5 with the same coverage. The CL also extracts some common functionality into engine_common.
1 parent 0cd0f6e commit 4f8de76

File tree

8 files changed

+221
-140
lines changed

8 files changed

+221
-140
lines changed

src/clusterfuzz/_internal/bot/fuzzers/centipede/constants.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@
3939
EXTRA_BINARIES_FLAGNAME = 'extra_binaries'
4040
EXIT_ON_CRASH_FLAGNAME = 'exit_on_crash'
4141

42+
MAX_LEN_FLAGNAME = 'max_len'
43+
RUNS_FLAGNAME = 'runs'
44+
BATCH_SIZE_FLAGNAME = 'batch_size'
45+
4246
NUM_RUNS_PER_MINIMIZATION = 100000
4347

4448

src/clusterfuzz/_internal/bot/fuzzers/centipede/engine.py

Lines changed: 114 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from clusterfuzz._internal.metrics import logs
3333
from clusterfuzz._internal.system import environment
3434
from clusterfuzz._internal.system import new_process
35+
from clusterfuzz._internal.system import shell
3536
from clusterfuzz.fuzz import engine
3637
from clusterfuzz.stacktraces import constants as stacktraces_constants
3738

@@ -46,6 +47,17 @@ class CentipedeError(Exception):
4647
"""Base exception class."""
4748

4849

50+
class CentipedeOptions(engine.FuzzOptions):
51+
"""Centipede engine options."""
52+
53+
def __init__(self, corpus_dir, arguments, strategies, workdir,
54+
new_corpus_dir):
55+
super().__init__(corpus_dir, arguments, strategies)
56+
# Directory to add new units
57+
self.new_corpus_dir = new_corpus_dir
58+
self.workdir = workdir
59+
60+
4961
def _get_runner(target_path):
5062
"""Gets the Centipede runner."""
5163
centipede_path = pathlib.Path(target_path).parent / 'centipede'
@@ -198,11 +210,15 @@ def prepare(self, corpus_dir, target_path, build_dir):
198210
# 1. Centipede-readable corpus file;
199211
# 2. Centipede-readable feature file;
200212
# 3. Crash reproducing inputs.
201-
workdir = self._create_temp_dir('workdir')
213+
workdir = engine_common.create_temp_fuzzing_dir('workdir')
202214
arguments[constants.WORKDIR_FLAGNAME] = str(workdir)
203215

204-
# Directory corpus_dir saves the corpus files required by ClusterFuzz.
205-
arguments[constants.CORPUS_DIR_FLAGNAME] = corpus_dir
216+
# Directory to place new units. While fuzzing, the new corpus
217+
# elements are written to the first dir in the list of corpus directories.
218+
new_corpus_dir = engine_common.create_temp_fuzzing_dir('new')
219+
corpus_dirs = [new_corpus_dir, corpus_dir]
220+
arguments[constants.CORPUS_DIR_FLAGNAME] = ','.join(
221+
dir for dir in corpus_dirs)
206222

207223
target_binaries = self._get_binary_paths(target_path)
208224
if target_binaries.unsanitized is None:
@@ -214,7 +230,8 @@ def prepare(self, corpus_dir, target_path, build_dir):
214230
arguments[constants.EXTRA_BINARIES_FLAGNAME] = str(
215231
target_binaries.sanitized)
216232

217-
return engine.FuzzOptions(corpus_dir, arguments.list(), {})
233+
return CentipedeOptions(corpus_dir, arguments.list(), {}, workdir,
234+
new_corpus_dir)
218235

219236
def _get_binary_paths(self, target_path):
220237
"""Gets the paths to the main and auxiliary binaries based on |target_path|
@@ -284,11 +301,44 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
284301
runner = _get_runner(target_path)
285302
_set_sanitizer_options(target_path)
286303
timeout = max_time + _CLEAN_EXIT_SECS
304+
305+
old_corpus_len = shell.get_directory_file_count(options.corpus_dir)
306+
logs.info(f'Corpus length before fuzzing: {old_corpus_len}')
307+
287308
fuzz_result = runner.run_and_wait(
288309
additional_args=options.arguments, timeout=timeout)
289310
log_lines = fuzz_result.output.splitlines()
290311
fuzz_result.output = Engine.trim_logs(fuzz_result.output)
291312

313+
workdir = options.workdir
314+
315+
corpus_minimization_failed = False
316+
try:
317+
time_for_minimize = timeout - fuzz_result.time_executed
318+
319+
self.minimize_corpus(
320+
target_path=target_path,
321+
arguments=[],
322+
# New units, in addition to the main corpus units,
323+
# are placed in new_corpus_dir. Minimize and merge back
324+
# to the main corpus_dir.
325+
input_dirs=[options.new_corpus_dir],
326+
output_dir=options.corpus_dir,
327+
reproducers_dir=reproducers_dir,
328+
max_time=time_for_minimize,
329+
# Use the same workdir that was used for fuzzing.
330+
# This allows us to skip rerunning the fuzzing inputs.
331+
workdir=workdir)
332+
except Exception as e:
333+
corpus_minimization_failed = True
334+
logs.error(f'corpus minimization failed: {e}')
335+
336+
if corpus_minimization_failed:
337+
# If we fail to minimize, fall back to moving the new units
338+
# from the new corpus_dir to the main corpus_dir.
339+
engine_common.move_mergeable_units(options.new_corpus_dir,
340+
options.corpus_dir)
341+
292342
reproducer_path = _get_reproducer_path(fuzz_result.output, reproducers_dir)
293343
crashes = []
294344
if reproducer_path:
@@ -298,11 +348,7 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
298348
int(fuzz_result.time_executed)))
299349

300350
stats_filename = f'fuzzing-stats-{os.path.basename(target_path)}.000000.csv'
301-
args = fuzzer_options.FuzzerArguments.from_list(options.arguments)
302-
assert args is not None
303-
assert constants.WORKDIR_FLAGNAME in args
304351

305-
workdir = args[constants.WORKDIR_FLAGNAME]
306352
stats_file = os.path.join(workdir, stats_filename)
307353
stats = _parse_centipede_stats(stats_file)
308354
if not stats:
@@ -321,6 +367,11 @@ def fuzz(self, target_path, options, reproducers_dir, max_time): # pylint: disa
321367
num_execs_avg = stats.get('NumExecs_Avg', 0.0)
322368
stats['average_exec_per_sec'] = num_execs_avg / fuzz_time_secs_avg
323369
stats.update(_parse_centipede_logs(log_lines))
370+
371+
new_corpus_len = shell.get_directory_file_count(options.corpus_dir)
372+
logs.info(f'Corpus length after fuzzing: {new_corpus_len}')
373+
new_units_added = new_corpus_len - old_corpus_len
374+
stats['new_units_added'] = new_units_added
324375
return engine.FuzzResult(fuzz_result.output, fuzz_result.command, crashes,
325376
stats, fuzz_result.time_executed)
326377

@@ -379,14 +430,28 @@ def reproduce(self, target_path, input_path, arguments, max_time): # pylint: di
379430
return engine.ReproduceResult(result.command, result.return_code,
380431
result.time_executed, result.output)
381432

382-
def _create_temp_dir(self, name):
383-
"""Creates temporary directory for fuzzing."""
384-
new_directory = pathlib.Path(fuzzer_utils.get_temp_dir(), name)
385-
engine_common.recreate_directory(new_directory)
386-
return new_directory
433+
def _strip_fuzzing_arguments(self, arguments):
434+
"""Remove arguments only needed for fuzzing."""
435+
for argument in [
436+
constants.FORK_SERVER_FLAGNAME,
437+
constants.MAX_LEN_FLAGNAME,
438+
constants.RUNS_FLAGNAME,
439+
constants.EXIT_ON_CRASH_FLAGNAME,
440+
constants.BATCH_SIZE_FLAGNAME,
441+
]:
442+
if argument in arguments:
443+
del arguments[argument]
444+
445+
return arguments
387446

388-
def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
389-
reproducers_dir, max_time):
447+
def minimize_corpus(self,
448+
target_path,
449+
arguments,
450+
input_dirs,
451+
output_dir,
452+
reproducers_dir,
453+
max_time,
454+
workdir=None):
390455
"""Runs corpus minimization.
391456
Args:
392457
target_path: Path to the target.
@@ -401,16 +466,29 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
401466
A FuzzResult object.
402467
"""
403468
runner = _get_runner(target_path)
469+
_set_sanitizer_options(target_path)
470+
471+
minimize_arguments = self._get_arguments(target_path)
472+
self._strip_fuzzing_arguments(minimize_arguments)
473+
environment.set_value('ASAN_OPTIONS', 'detect_odr_violation=0')
404474

405475
# Step 1: Generate corpus file for Centipede.
406-
full_corpus_workdir = self._create_temp_dir('full_corpus_workdir')
476+
# When calling this during a fuzzing session, use the existing workdir.
477+
# This avoids us having to re-run inputs and waste time unnecessarily.
478+
# This saves a lot of time when the input corpus contains thousands
479+
# of files.
480+
full_corpus_workdir = workdir
481+
if not full_corpus_workdir:
482+
full_corpus_workdir = engine_common.create_temp_fuzzing_dir(
483+
'full_corpus_workdir')
407484
input_dirs_param = ','.join(str(dir) for dir in input_dirs)
408-
args = [
485+
args = minimize_arguments.list() + [
409486
f'--workdir={full_corpus_workdir}',
410487
f'--binary={target_path}',
411488
f'--corpus_dir={input_dirs_param}',
412489
'--num_runs=0',
413490
]
491+
logs.info(f'Running Generate Corpus file for Centipede with args: {args}')
414492
result = runner.run_and_wait(additional_args=args, timeout=max_time)
415493
max_time -= result.time_executed
416494

@@ -422,11 +500,12 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
422500
raise TimeoutError('Minimization timed out.')
423501

424502
# Step 2: Distill.
425-
args = [
503+
args = minimize_arguments.list() + [
426504
f'--workdir={full_corpus_workdir}',
427505
f'--binary={target_path}',
428-
'--distill',
506+
'--distill=true',
429507
]
508+
logs.info(f'Running Corpus Distillation with args: {args}')
430509
result = runner.run_and_wait(additional_args=args, timeout=max_time)
431510
max_time -= result.time_executed
432511

@@ -438,17 +517,21 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
438517

439518
# Step 3: Generate corpus files for output_dir.
440519
os.makedirs(output_dir, exist_ok=True)
441-
minimized_corpus_workdir = self._create_temp_dir('minimized_corpus_workdir')
520+
minimized_corpus_workdir = engine_common.create_temp_fuzzing_dir(
521+
'minimized_corpus_workdir')
522+
logs.info(f'Created a temporary minimized corpus '
523+
f'workdir {minimized_corpus_workdir}')
442524
distilled_file = os.path.join(
443525
full_corpus_workdir,
444526
f'distilled-{os.path.basename(target_path)}.000000')
445527
corpus_file = os.path.join(minimized_corpus_workdir, 'corpus.000000')
446528
shutil.copyfile(distilled_file, corpus_file)
447529

448-
args = [
530+
args = minimize_arguments.list() + [
449531
f'--workdir={minimized_corpus_workdir}',
450532
f'--corpus_to_files={output_dir}',
451533
]
534+
logs.info(f'Converting corpus to files with the following args: {args}')
452535
result = runner.run_and_wait(additional_args=args, timeout=max_time)
453536

454537
if result.timed_out or max_time < 0:
@@ -461,11 +544,16 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
461544
# Step 4: Copy reproducers from full_corpus_workdir.
462545
os.makedirs(reproducers_dir, exist_ok=True)
463546
crashes_dir = os.path.join(full_corpus_workdir, 'crashes')
464-
for file in os.listdir(crashes_dir):
465-
crasher_path = os.path.join(crashes_dir, file)
466-
shutil.copy(crasher_path, reproducers_dir)
467-
shutil.rmtree(full_corpus_workdir)
547+
548+
if os.path.exists(crashes_dir):
549+
for file in os.listdir(crashes_dir):
550+
crasher_path = os.path.join(crashes_dir, file)
551+
shutil.copy(crasher_path, reproducers_dir)
552+
468553
shutil.rmtree(minimized_corpus_workdir)
554+
if not workdir:
555+
# Only remove this directory if it was created in this method.
556+
shutil.rmtree(full_corpus_workdir)
469557

470558
return engine.ReproduceResult(result.command, result.return_code,
471559
result.time_executed, result.output)
@@ -507,7 +595,7 @@ def minimize_testcase(self, target_path, arguments, input_path, output_path,
507595
TimeoutError: If the testcase minimization exceeds max_time.
508596
"""
509597
runner = _get_runner(target_path)
510-
workdir = self._create_temp_dir('workdir')
598+
workdir = engine_common.create_temp_fuzzing_dir('workdir')
511599
args = [
512600
f'--binary={target_path}',
513601
f'--workdir={workdir}',

src/clusterfuzz/_internal/bot/fuzzers/engine_common.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import re
2222
import shlex
2323
import shutil
24+
import string
2425
import sys
2526
import time
2627

@@ -656,3 +657,34 @@ def get_log_header(command, time_executed):
656657
"""Get the log header."""
657658
quoted_command = get_command_quoted(command)
658659
return f'Command: {quoted_command}\nTime ran: {time_executed}\n'
660+
661+
662+
def is_sha1_hash(possible_hash):
663+
"""Returns True if |possible_hash| looks like a valid sha1 hash."""
664+
if len(possible_hash) != 40:
665+
return False
666+
667+
hexdigits_set = set(string.hexdigits)
668+
return all(char in hexdigits_set for char in possible_hash)
669+
670+
671+
def move_mergeable_units(merge_directory, corpus_directory):
672+
"""Move new units in |merge_directory| into |corpus_directory|."""
673+
initial_units = {
674+
os.path.basename(filename)
675+
for filename in shell.get_files_list(corpus_directory)
676+
}
677+
678+
for unit_path in shell.get_files_list(merge_directory):
679+
unit_name = os.path.basename(unit_path)
680+
if unit_name in initial_units and is_sha1_hash(unit_name):
681+
continue
682+
dest_path = os.path.join(corpus_directory, unit_name)
683+
shell.move(unit_path, dest_path)
684+
685+
686+
def create_temp_fuzzing_dir(name):
687+
"""Create a temporary directory for fuzzing."""
688+
new_corpus_directory = os.path.join(fuzzer_utils.get_temp_dir(), name)
689+
recreate_directory(new_corpus_directory)
690+
return new_corpus_directory

src/clusterfuzz/_internal/bot/fuzzers/honggfuzz/engine.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
from clusterfuzz._internal.base import utils
2222
from clusterfuzz._internal.bot.fuzzers import dictionary_manager
2323
from clusterfuzz._internal.bot.fuzzers import engine_common
24-
from clusterfuzz._internal.bot.fuzzers import utils as fuzzer_utils
2524
from clusterfuzz._internal.metrics import logs
2625
from clusterfuzz._internal.system import environment
2726
from clusterfuzz._internal.system import new_process
@@ -219,12 +218,6 @@ def reproduce(self, target_path, input_path, arguments, max_time): # pylint: di
219218
return engine.ReproduceResult(result.command, result.return_code,
220219
result.time_executed, result.output)
221220

222-
def _create_temp_corpus_dir(self, name):
223-
"""Creates temporary corpus directory."""
224-
new_corpus_directory = os.path.join(fuzzer_utils.get_temp_dir(), name)
225-
engine_common.recreate_directory(new_corpus_directory)
226-
return new_corpus_directory
227-
228221
def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
229222
reproducers_dir, max_time):
230223
"""Optional (but recommended): run corpus minimization.
@@ -244,7 +237,8 @@ def minimize_corpus(self, target_path, arguments, input_dirs, output_dir,
244237
del reproducers_dir
245238

246239
runner = _get_runner()
247-
combined_corpus_dir = self._create_temp_corpus_dir('minimize-workdir')
240+
combined_corpus_dir = engine_common.create_temp_fuzzing_dir(
241+
'minimize-workdir')
248242

249243
# Copy all of the seeds into corpus.
250244
idx = 0

0 commit comments

Comments
 (0)