-
Notifications
You must be signed in to change notification settings - Fork 65
Need Comprehensive Error Handling and Logging #9
Copy link
Copy link
Open
Description
Description
The benchmark code currently lacks comprehensive error handling and logging infrastructure. This makes debugging difficult and provides poor user experience when things go wrong.
Current Issues
-
Minimal Error Handling:
- Few try-except blocks in critical sections
- Errors propagate as uncaught exceptions
- No graceful degradation or recovery
- Cryptic error messages for users
-
No Logging Infrastructure:
- No structured logging system
- Difficult to debug issues
- No audit trail of benchmark execution
- Can't track progress for long-running benchmarks
- No way to adjust verbosity
-
Poor User Experience:
- Stack traces instead of helpful error messages
- No indication of progress for long operations
- Difficult to diagnose configuration issues
- Silent failures in some cases
Examples of Missing Error Handling
listing_folder_benchmarks/src/run.py:
# No validation if root directory is accessible
if not os.path.exists(args.root):
make_tree(args.root, args.entries_per_dir, args.depth)
# What if make_tree fails? No error handling

checkpointing_benchmarks/src/checkpoint_runner.py:
def _write_shard(path: Path, size_bytes: int, chunk_bytes: int, fsync: bool):
# No error handling for disk full, permission denied, etc.
with open(path, "wb") as fh:
while remaining > 0:
fh.write(to_write) # Could fail

serving_benchmarks/src/train.py:
# No error handling for invalid configurations
microbatches_per_step = max(1, args.gbs // max(1, args.mbs))
# What if gbs < mbs? Silent incorrect behavior

Proposed Solution
1. Add Structured Logging
Create utils/logging.py:
import logging
import sys
from typing import Optional
def setup_logger(
    name: str,
    level: str = "INFO",
    log_file: Optional[str] = None
) -> logging.Logger:
    """
    Set up structured logger for benchmarks.

    Args:
        name: Logger name (typically __name__)
        level: Log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional file path for log output

    Returns:
        Configured logger instance

    Raises:
        AttributeError: If ``level`` is not a valid logging level name.
    """
    logger = logging.getLogger(name)
    numeric_level = getattr(logging, level.upper())
    logger.setLevel(numeric_level)

    # Reset handlers so repeated calls (tests, multiple runs in one
    # process) don't emit every message once per prior call.
    if logger.handlers:
        logger.handlers.clear()

    # Console handler honours the requested level. (Previously this was
    # hard-coded to INFO, which silently ignored level="DEBUG" and broke
    # the intended --verbose behaviour.)
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(numeric_level)

    # Detailed format for file, simple for console.
    console_format = logging.Formatter(
        '%(levelname)s: %(message)s'
    )
    file_format = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    console_handler.setFormatter(console_format)
    logger.addHandler(console_handler)

    # File handler if specified; always captures DEBUG for post-mortems.
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(file_format)
        logger.addHandler(file_handler)
    return logger
# 2. Add Error Handling Framework
Create utils/errors.py:
# Exception hierarchy for benchmark failures: catching BenchmarkError
# handles every benchmark-specific error in one place.
class BenchmarkError(Exception):
    """Base exception for benchmark errors."""


class ConfigurationError(BenchmarkError):
    """Raised when configuration is invalid."""


class StorageError(BenchmarkError):
    """Raised when storage operations fail."""


class DataGenerationError(BenchmarkError):
    """Raised when synthetic data generation fails."""
def handle_error(logger, error: Exception, context: str) -> None:
    """
    Centralized error handling with user-friendly messages.

    Args:
        logger: Logger instance
        error: Exception that occurred
        context: Description of what was being done
    """
    # Lazy %-style args: the message is only formatted if the level is
    # enabled (and avoids the previous eager f-string formatting).
    logger.error("Error during %s: %s", context, error)
    # Full traceback only at DEBUG so normal output stays readable.
    # (Was f"Full traceback:" — an f-string with no placeholders.)
    logger.debug("Full traceback:", exc_info=True)

    # Provide helpful suggestions based on error type.
    if isinstance(error, PermissionError):
        logger.error(
            "Permission denied. Please check:\n"
            " - Directory permissions\n"
            " - Available disk space\n"
            " - User has write access"
        )
    elif isinstance(error, FileNotFoundError):
        logger.error(
            "File or directory not found. Please check:\n"
            " - Path is correct\n"
            " - Directory exists\n"
            " - Configuration file path"
        )
# 3. Enhanced Run Scripts with Error Handling
Example for listing_folder_benchmarks/src/run.py:
import os
import sys
from pathlib import Path
from utils.logging import setup_logger
from utils.errors import ConfigurationError, DataGenerationError, handle_error
def main():
    # NOTE(review): this example assumes `args` was produced by argparse
    # before this point (the issue snippet omits the parser setup).
    log_level = "DEBUG" if args.verbose else "INFO"

    # Create the output directory *before* configuring the file logger:
    # logging.FileHandler raises if the parent directory does not exist,
    # which is exactly the situation on a fresh run. (The original set up
    # the logger first and only created the directory afterwards.)
    run_dir = Path(args.outdir) / args.run_name
    try:
        run_dir.mkdir(parents=True, exist_ok=True)
    except PermissionError as e:
        # No logger exists yet, so report directly and exit.
        print(f"Cannot create output directory {run_dir}: {e}", file=sys.stderr)
        sys.exit(1)

    log_file = os.path.join(args.outdir, args.run_name, "benchmark.log")
    logger = setup_logger(__name__, log_level, log_file)

    logger.info(f"Starting listing benchmark: {args.run_name}")
    logger.debug(f"Configuration: {vars(args)}")
    logger.debug(f"Output directory: {run_dir}")

    try:
        # Validate configuration before doing any work.
        validate_config(args, logger)

        # Build synthetic tree if needed.
        if not os.path.exists(args.root):
            logger.info(f"Creating synthetic tree at {args.root}")
            try:
                make_tree(args.root, args.entries_per_dir, args.depth)
                logger.info(
                    f"Created tree with {args.entries_per_dir} entries "
                    f"per directory, depth {args.depth}"
                )
            except Exception as e:
                # Chain the cause so the original traceback is preserved.
                raise DataGenerationError(
                    f"Failed to create synthetic tree: {e}"
                ) from e

        # Run benchmark with progress logging.
        logger.info("Starting listing benchmark...")
        records = list_tree(
            args.root, args.page_size, args.concurrency, logger
        )
        logger.info(f"Completed {len(records)} listing operations")

        # Generate outputs.
        logger.info("Writing outputs...")
        write_outputs(run_dir, records, args, logger)
        logger.info(f"Results written to {run_dir}")
        logger.info("Benchmark completed successfully")

    # Distinct exit codes per failure class so wrappers can branch on them.
    except ConfigurationError as e:
        handle_error(logger, e, "configuration validation")
        sys.exit(1)
    except DataGenerationError as e:
        handle_error(logger, e, "synthetic tree generation")
        sys.exit(2)
    except Exception as e:
        handle_error(logger, e, "benchmark execution")
        sys.exit(3)
def validate_config(args, logger):
    """Validate configuration parameters.

    Raises:
        ConfigurationError: if any parameter is out of range.
    """
    # Table of (value, message) pairs — every knob must be a positive int.
    # Checked in the same order as before so the first failure reported
    # is unchanged.
    checks = (
        (args.concurrency, "Concurrency must be >= 1"),
        (args.page_size, "Page size must be >= 1"),
        (args.depth, "Tree depth must be >= 1"),
        (args.entries_per_dir, "Entries per directory must be >= 1"),
    )
    for value, message in checks:
        if value < 1:
            raise ConfigurationError(message)
    logger.debug("Configuration validation passed")
# 4. Add Progress Logging for Long Operations
from tqdm import tqdm # Add to requirements
def list_tree(root: str, page_size: int, concurrency: int, logger):
    """List tree with progress indication.

    Walks ``root``, pages each directory's file entries into chunks of
    ``page_size``, and lists the chunks on a thread pool of size
    ``concurrency``. Failed chunks are logged and skipped; results are
    returned sorted by the numeric first field of each record.
    """
    logger.info("Enumerating directories...")
    dirs = []
    # Subdirectory names are not needed here — os.walk recurses for us.
    for p, _subdirs, files in os.walk(root):
        entries = [os.path.join(p, f) for f in files]
        dirs.append((p, entries))
    logger.info(f"Found {len(dirs)} directories to process")

    records = []
    with ThreadPoolExecutor(max_workers=concurrency) as ex:
        futures = []
        for path, entries in dirs:
            for chunk in _chunks(entries, page_size):
                futures.append(ex.submit(_list_chunk, path, chunk))

        # Progress bar for user feedback.
        with tqdm(total=len(futures), desc="Listing") as pbar:
            for fut in as_completed(futures):
                try:
                    records.append(fut.result())
                except Exception as e:
                    # Continue processing other chunks.
                    logger.error(f"Failed to list chunk: {e}")
                finally:
                    # Advance on failure too, so the bar always reaches
                    # 100% (previously only successes were counted).
                    pbar.update(1)
    return sorted(records, key=lambda r: float(r[0]))
# 5. Add CLI Arguments for Logging Control
# Logging-control CLI flags, declared together so --help groups them.
parser.add_argument("--verbose", "-v", action="store_true",
                    help="Enable verbose debug logging")
parser.add_argument("--quiet", "-q", action="store_true",
                    help="Suppress all output except errors")
parser.add_argument("--log-file", type=str, default=None,
                    help="Write logs to specified file")
# Benefits
- Better debugging capabilities
- Improved user experience with clear error messages
- Audit trail for benchmark runs
- Progress feedback for long-running operations
- Easier troubleshooting
- Professional error handling
- Graceful failure modes
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels