diff --git a/examples/csv_validation_example.py b/examples/csv_validation_example.py
new file mode 100644
index 0000000..81f5520
--- /dev/null
+++ b/examples/csv_validation_example.py
@@ -0,0 +1,143 @@
+"""
+Example usage of the CSV validator for price and signals files.
+
+This script demonstrates how to use the validate_input_csv function
+to validate user-provided CSV files before backtesting.
+"""
+
+import pandas as pd
+
+from quant_research_starter.data import validate_input_csv, validate_price_csv
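+
+# NOTE: the file paths used below (e.g. "data/sample_prices.csv") are
+# placeholders; point them at your own CSV files when running this script.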
+
+
+def example_validate_price_file():
+    """Example: Validate a price CSV file."""
+    print("=" * 60)
+    print("Example 1: Validating Price CSV File")
+    print("=" * 60)
+
+    # Simulate a price file path
+    price_file = "data/sample_prices.csv"
+
+    # Validate using the general function
+    result = validate_input_csv(price_file, csv_type="price")
+
+    # Check results
+    if result["valid"]:
+        print("✓ File is valid!")
+        print(f"  Rows: {result['row_count']}")
+        print(f"  Columns: {result['column_count']}")
+    else:
+        print(f"✗ File has {len(result['errors'])} error(s):")
+        for error in result["errors"]:
+            print(f"  - [{error['type']}] {error['message']}")
+
+    # Check warnings
+    if result["warnings"]:
+        print(f"\n⚠ {len(result['warnings'])} warning(s):")
+        for warning in result["warnings"]:
+            print(f"  - {warning['message']}")
+
+    print()
+
+
+def example_validate_with_required_columns():
+    """Example: Validate with specific required columns."""
+    print("=" * 60)
+    print("Example 2: Validating with Required Columns")
+    print("=" * 60)
+
+    # Validate price file requiring specific symbols
+    price_file = "data/sample_prices.csv"
+    required_symbols = ["AAPL", "GOOGL", "MSFT"]
+
+    is_valid, errors = validate_price_csv(price_file, required_symbols=required_symbols)
+
+    if is_valid:
+        print(f"✓ All required symbols present: {', '.join(required_symbols)}")
+    else:
+        print("✗ Validation failed:")
+        for err in errors:
+            print(f"  {err}")
+
+    print()
+
+
+def example_error_handling():
+    """Example: Proper error handling in production code."""
+    print("=" * 60)
+    print("Example 3: Error Handling in Production")
+    print("=" * 60)
+
+    def load_and_validate_prices(file_path: str):
+        """Load price data with validation."""
+        # Validate first
+        result = validate_input_csv(file_path, csv_type="price")
+
+        if not result["valid"]:
+            # Surface all error messages at once
+            error_messages = [err["message"] for err in result["errors"]]
+            raise ValueError("Invalid price file:\n" + "\n".join(error_messages))
+
+        # If valid, proceed with loading
+        prices = pd.read_csv(file_path, index_col=0, parse_dates=True)
+        print(f"✓ Loaded {len(prices)} rows of price data")
+        return prices
+
+    # Try with a file
+    try:
+        load_and_validate_prices("data/sample_prices.csv")
+    except ValueError as e:
+        print(f"✗ Error: {e}")
+    except FileNotFoundError:
+        print("✗ File not found (expected for this example)")
+
+    print()
+
+
+def example_detailed_error_info():
+    """Example: Accessing detailed error information."""
+    print("=" * 60)
+    print("Example 4: Detailed Error Information")
+    print("=" * 60)
+
+    result = validate_input_csv("invalid_file.csv", csv_type="price")
+
+    print("Validation Summary:")
+    print(f"  Valid: {result['valid']}")
+    print(f"  Errors: {len(result['errors'])}")
+    print(f"  Warnings: {len(result['warnings'])}")
+    print(f"  File: {result['file_path']}")
+
+    # Access structured error data
+    for error in result["errors"]:
+        print(f"\nError Type: {error['type']}")
+        print(f"Message: {error['message']}")
+        if error.get("column"):
+            print(f"Column: {error['column']}")
+        if error.get("sample"):
+            print(f"Sample data: {error['sample']}")
+
+    print()
+
+
+def main():
+    """Run all examples."""
+    print("\n")
+    print("=" * 60)
+    print("CSV VALIDATOR USAGE EXAMPLES")
+    print("=" * 60)
+    print()
+
+    example_validate_price_file()
+    example_validate_with_required_columns()
+    example_error_handling()
+    example_detailed_error_info()
+
+    print("=" * 60)
+    print("For more information, see the documentation:")
+    print("src/quant_research_starter/data/validator.py")
+    print("=" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/quant_research_starter/data/__init__.py b/src/quant_research_starter/data/__init__.py
index 0f726c3..885e16f 100644
--- a/src/quant_research_starter/data/__init__.py
+++ b/src/quant_research_starter/data/__init__.py
@@ -1,12 +1,24 @@
-"""Data module public API."""
-
-from .downloaders import AlphaVantageDownloader, YahooDownloader
-from .sample_loader import SampleDataLoader
-from .synthetic import SyntheticDataGenerator
-
-__all__ = [
-    "SyntheticDataGenerator",
-    "SampleDataLoader",
-    "YahooDownloader",
-    "AlphaVantageDownloader",
-]
+"""Data module public API."""
+
+from .downloaders import AlphaVantageDownloader, YahooDownloader
+from .sample_loader import SampleDataLoader
+from .synthetic import SyntheticDataGenerator
+from .validator import (
+    CSVValidator,
+    ValidationError,
+    validate_input_csv,
+    validate_price_csv,
+    validate_signals_csv,
+)
+
+__all__ = [
+    "SyntheticDataGenerator",
+    "SampleDataLoader",
+    "YahooDownloader",
+    "AlphaVantageDownloader",
+    "CSVValidator",
+    "ValidationError",
+    "validate_input_csv",
+    "validate_price_csv",
+    "validate_signals_csv",
+]
diff --git a/src/quant_research_starter/data/validator.py b/src/quant_research_starter/data/validator.py
new file mode 100644
index 0000000..0d6079b
--- /dev/null
+++ b/src/quant_research_starter/data/validator.py
@@ -0,0 +1,396 @@
+"""CSV validator for user-provided historical price and signals files."""
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+import pandas as pd
+
+
+class ValidationError:
+    """Container for validation error information."""
+
+    def __init__(
+        self,
+        error_type: str,
+        message: str,
+        sample_rows: Optional[pd.DataFrame] = None,
+        column: Optional[str] = None,
+    ):
+        self.error_type = error_type
+        self.message = message
+        self.sample_rows = sample_rows
+        self.column = column
+
+    def __repr__(self) -> str:
+        """String representation of the error."""
+        lines = [f"[{self.error_type}] {self.message}"]
+        if self.sample_rows is not None and not self.sample_rows.empty:
+            lines.append("\nSample of offending rows (first 5):")
+            lines.append(self.sample_rows.head(5).to_string())
+        return "\n".join(lines)
+
+
+class CSVValidator:
+    """
+    Validator for user-provided CSV files containing price or signal data.
+
+    Validates:
+    - File existence and readability
+    - Required columns presence
+    - Date parsing and format
+    - Timezone handling
+    - Data types
+    - Missing values
+    - Duplicate dates
+    """
+
+    def __init__(
+        self,
+        required_columns: Optional[List[str]] = None,
+        date_column: str = "date",
+        allow_timezone: bool = True,
+        min_rows: int = 10,
+    ):
+        """
+        Initialize validator.
+
+        Args:
+            required_columns: List of required column names (besides date)
+            date_column: Name of the date/datetime column
+            allow_timezone: Whether to allow timezone-aware datetimes
+            min_rows: Minimum number of data rows required
+        """
+        self.required_columns = required_columns or []
+        self.date_column = date_column
+        self.allow_timezone = allow_timezone
+        self.min_rows = min_rows
+        self.errors: List[ValidationError] = []
+
+    def validate(self, file_path: str) -> Tuple[bool, List[ValidationError]]:
+        """
+        Validate a CSV file.
+
+        Args:
+            file_path: Path to CSV file
+
+        Returns:
+            Tuple of (is_valid, list_of_errors)
+        """
+        self.errors = []
+        path = Path(file_path)
+
+        # Check file existence
+        if not self._check_file_exists(path):
+            return False, self.errors
+
+        # Try to read the CSV
+        df = self._read_csv(path)
+        if df is None:
+            return False, self.errors
+
+        # Validate structure
+        self._validate_columns(df)
+        self._validate_date_index(df)
+        self._validate_data_types(df)
+        self._validate_missing_values(df)
+        self._validate_duplicates(df)
+        self._validate_row_count(df)
+
+        is_valid = len(self.errors) == 0
+        return is_valid, self.errors
+
+    def _check_file_exists(self, path: Path) -> bool:
+        """Check if file exists and is readable."""
+        if not path.exists():
+            self.errors.append(
+                ValidationError("FILE_NOT_FOUND", f"File not found: {path.absolute()}")
+            )
+            return False
+
+        if not path.is_file():
+            self.errors.append(
+                ValidationError(
+                    "INVALID_FILE", f"Path is not a file: {path.absolute()}"
+                )
+            )
+            return False
+
+        return True
+
+    def _read_csv(self, path: Path) -> Optional[pd.DataFrame]:
+        """Attempt to read the CSV file, recording a structured error on failure."""
+        try:
+            # Try reading with the first column as a parsed date index
+            df = pd.read_csv(path, index_col=0, parse_dates=True)
+            return df
+        except pd.errors.EmptyDataError:
+            self.errors.append(
+                ValidationError("EMPTY_FILE", f"File is empty: {path.name}")
+            )
+            return None
+        except pd.errors.ParserError as e:
+            self.errors.append(
+                ValidationError("PARSE_ERROR", f"Failed to parse CSV: {str(e)}")
+            )
+            return None
+        except Exception as e:
+            self.errors.append(
+                ValidationError("READ_ERROR", f"Failed to read file: {str(e)}")
+            )
+            return None
+
+    def _validate_columns(self, df: pd.DataFrame) -> None:
+        """Validate that required columns are present."""
+        if not self.required_columns:
+            return
+
+        actual_columns = set(df.columns)
+        required_set = set(self.required_columns)
+        missing = required_set - actual_columns
+
+        if missing:
+            for col in sorted(missing):
+                self.errors.append(
+                    ValidationError(
+                        "MISSING_COLUMN",
+                        f"Missing required column: '{col}' - required for price/signal series",
+                        column=col,
+                    )
+                )
+
+    def _validate_date_index(self, df: pd.DataFrame) -> None:
+        """Validate date index format and properties."""
+        # Check that the index parsed as datetimes
+        if not isinstance(df.index, pd.DatetimeIndex):
+            sample_df = pd.DataFrame({"index": df.index[:5]})
+            self.errors.append(
+                ValidationError(
+                    "INVALID_DATE_FORMAT",
+                    f"Index is not a valid datetime. Expected datetime index, got {type(df.index).__name__}",
+                    sample_rows=sample_df,
+                )
+            )
+            return
+
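+        # Note: a timezone-aware index (e.g. "2020-01-01 00:00:00+00:00") is
+        # only rejected when allow_timezone=False; the default accepts it.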
+        # Check timezone
+        if df.index.tz is not None and not self.allow_timezone:
+            self.errors.append(
+                ValidationError(
+                    "TIMEZONE_NOT_ALLOWED",
+                    f"Timezone-aware datetime not allowed. Index has timezone: {df.index.tz}",
+                )
+            )
+
+        # Check for NaT (Not a Time) values
+        nat_mask = df.index.isna()
+        if nat_mask.any():
+            nat_indices = df[nat_mask].index[:5]
+            sample_df = pd.DataFrame(
+                {"row_number": range(len(nat_indices)), "invalid_date": nat_indices}
+            )
+            self.errors.append(
+                ValidationError(
+                    "INVALID_DATES",
+                    f"Found {nat_mask.sum()} invalid date(s) (NaT) in index",
+                    sample_rows=sample_df,
+                )
+            )
+
+    def _validate_data_types(self, df: pd.DataFrame) -> None:
+        """Validate data types of columns."""
+        for col in df.columns:
+            # Check if column is numeric
+            if not pd.api.types.is_numeric_dtype(df[col]):
+                # Try to show a sample of the non-numeric values
+                try:
+                    non_numeric_mask = pd.to_numeric(df[col], errors="coerce").isna()
+                    non_numeric_rows = df[non_numeric_mask].head(5)
+                    self.errors.append(
+                        ValidationError(
+                            "INVALID_DTYPE",
+                            f"Column '{col}' contains non-numeric values. "
+                            f"Expected float/int, got {df[col].dtype}",
+                            sample_rows=non_numeric_rows[[col]],
+                            column=col,
+                        )
+                    )
+                except Exception:
+                    self.errors.append(
+                        ValidationError(
+                            "INVALID_DTYPE",
+                            f"Column '{col}' has invalid data type: {df[col].dtype}",
+                            column=col,
+                        )
+                    )
+
+    def _validate_missing_values(self, df: pd.DataFrame) -> None:
+        """Validate and report missing values."""
+        for col in df.columns:
+            missing_count = df[col].isna().sum()
+            if missing_count > 0:
+                missing_pct = (missing_count / len(df)) * 100
+                # Show sample rows with missing values
+                missing_rows = df[df[col].isna()].head(5)
+                self.errors.append(
+                    ValidationError(
+                        "MISSING_VALUES",
+                        f"Column '{col}' has {missing_count} missing values ({missing_pct:.1f}% of data)",
+                        sample_rows=missing_rows[[col]],
+                        column=col,
+                    )
+                )
+
+    def _validate_duplicates(self, df: pd.DataFrame) -> None:
+        """Check for duplicate date indices."""
+        duplicates = df.index.duplicated()
+        if duplicates.any():
+            dup_count = duplicates.sum()
+            dup_dates = df[duplicates].index[:5]
+            sample_df = pd.DataFrame(
+                {"duplicate_date": dup_dates, "occurrence": "duplicate"}
+            )
+            self.errors.append(
+                ValidationError(
+                    "DUPLICATE_DATES",
+                    f"Found {dup_count} duplicate date(s) in index. Each date should appear only once.",
+                    sample_rows=sample_df,
+                )
+            )
+
+    def _validate_row_count(self, df: pd.DataFrame) -> None:
+        """Validate minimum number of rows."""
+        if len(df) < self.min_rows:
+            self.errors.append(
+                ValidationError(
+                    "INSUFFICIENT_DATA",
+                    f"File has only {len(df)} rows. Minimum required: {self.min_rows}",
+                )
+            )
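+
+
+# Typical direct usage of CSVValidator (a sketch; the path below is a
+# placeholder):
+#
+#     validator = CSVValidator(required_columns=["AAPL"], min_rows=20)
+#     is_valid, errors = validator.validate("data/prices.csv")
+#     for err in errors:
+#         print(err)  # the repr includes type, message, and sample rows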
+
+
+def validate_price_csv(
+    file_path: str, required_symbols: Optional[List[str]] = None
+) -> Tuple[bool, List[ValidationError]]:
+    """
+    Validate a price CSV file.
+
+    Expected format:
+    - First column (index): date
+    - Other columns: symbol names with price data
+    - All values numeric (float/int)
+    - No missing dates
+
+    Args:
+        file_path: Path to the CSV file
+        required_symbols: Optional list of required symbol columns
+
+    Returns:
+        Tuple of (is_valid, list_of_errors)
+    """
+    validator = CSVValidator(
+        required_columns=required_symbols, date_column="date", min_rows=20
+    )
+    return validator.validate(file_path)
+
+
+def validate_signals_csv(
+    file_path: str, required_columns: Optional[List[str]] = None
+) -> Tuple[bool, List[ValidationError]]:
+    """
+    Validate a signals CSV file.
+
+    Expected format:
+    - First column (index): date
+    - Other columns: signal names or symbol signals
+    - All values numeric (float/int)
+
+    Args:
+        file_path: Path to the CSV file
+        required_columns: Optional list of required column names
+
+    Returns:
+        Tuple of (is_valid, list_of_errors)
+    """
+    validator = CSVValidator(
+        required_columns=required_columns, date_column="date", min_rows=20
+    )
+    return validator.validate(file_path)
+
+
+def validate_input_csv(
+    file_path: str, csv_type: str = "price", **kwargs
+) -> Dict[str, Any]:
+    """
+    General-purpose CSV validator with structured error reporting.
+
+    Args:
+        file_path: Path to CSV file
+        csv_type: Type of CSV - 'price' or 'signals'
+        **kwargs: Additional arguments passed to specific validators
+
+    Returns:
+        Dictionary with validation results:
+        {
+            'valid': bool,
+            'errors': List[Dict],    # Structured error information
+            'warnings': List[Dict],  # Non-fatal issues (e.g. missing values)
+            'file_path': str,
+            'row_count': int (if file readable),
+            'column_count': int (if file readable)
+        }
+    """
+    path = Path(file_path)
+
+    # Choose the appropriate validator
+    if csv_type == "price":
+        _, errors = validate_price_csv(file_path, **kwargs)
+    elif csv_type == "signals":
+        _, errors = validate_signals_csv(file_path, **kwargs)
+    else:
+        validator = CSVValidator(**kwargs)
+        _, errors = validator.validate(file_path)
+
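+    # Note: the file is read a second time here purely to report row/column
+    # counts; for very large files this doubles the I/O.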
+    # Try to get basic file info
+    row_count = None
+    column_count = None
+    try:
+        if path.exists():
+            df = pd.read_csv(file_path, index_col=0)
+            row_count = len(df)
+            column_count = len(df.columns)
+    except Exception:
+        pass
+
+    # Structure errors for output
+    error_dicts = []
+    warnings = []
+
+    for err in errors:
+        err_dict = {
+            "type": err.error_type,
+            "message": err.message,
+            "column": err.column,
+        }
+
+        # Include sample data if available
+        if err.sample_rows is not None:
+            err_dict["sample"] = err.sample_rows.to_dict()
+
+        # Categorize recoverable issues as warnings rather than errors
+        if err.error_type == "MISSING_VALUES":
+            warnings.append(err_dict)
+        else:
+            error_dicts.append(err_dict)
+
+    return {
+        # Valid means no blocking errors remain once warnings are separated out
+        "valid": len(error_dicts) == 0,
+        "errors": error_dicts,
+        "warnings": warnings,
+        "file_path": str(path.absolute()),
+        "row_count": row_count,
+        "column_count": column_count,
+    }
diff --git a/tests/test_validator.py b/tests/test_validator.py
new file mode 100644
index 0000000..dbb2f08
--- /dev/null
+++ b/tests/test_validator.py
@@ -0,0 +1,352 @@
+"""Tests for CSV validator."""
+
+import numpy as np
+import pandas as pd
+import pytest
+
+from quant_research_starter.data.validator import (
+    CSVValidator,
+    ValidationError,
+    validate_input_csv,
+    validate_price_csv,
+    validate_signals_csv,
+)
+
+
+@pytest.fixture
+def valid_price_csv(tmp_path):
+    """Create a valid price CSV file."""
+    dates = pd.date_range("2020-01-01", periods=50, freq="D")
+    data = {
+        "AAPL": np.random.uniform(100, 200, 50),
+        "GOOGL": np.random.uniform(1000, 2000, 50),
+        "MSFT": np.random.uniform(200, 300, 50),
+    }
+    df = pd.DataFrame(data, index=dates)
+    df.index.name = "date"
+
+    file_path = tmp_path / "valid_prices.csv"
+    df.to_csv(file_path)
+    return file_path
+
+
+@pytest.fixture
+def valid_signals_csv(tmp_path):
+    """Create a valid signals CSV file."""
+    dates = pd.date_range("2020-01-01", periods=50, freq="D")
+    data = {
+        "momentum": np.random.normal(0, 1, 50),
+        "value": np.random.normal(0, 1, 50),
+        "composite": np.random.normal(0, 1, 50),
+    }
+    df = pd.DataFrame(data, index=dates)
+    df.index.name = "date"
+
+    file_path = tmp_path / "valid_signals.csv"
+    df.to_csv(file_path)
+    return file_path
+
+
+class TestValidationError:
+    """Test ValidationError class."""
+
+    def test_error_creation(self):
+        """Test creating a validation error."""
+        err = ValidationError("TEST_ERROR", "This is a test error")
+        assert err.error_type == "TEST_ERROR"
+        assert err.message == "This is a test error"
+        assert err.sample_rows is None
+        assert err.column is None
+
+    def test_error_with_sample(self):
+        """Test error with sample rows."""
+        df = pd.DataFrame({"col1": [1, 2, 3]})
+        err = ValidationError(
+            "TEST_ERROR", "Error with sample", sample_rows=df, column="col1"
+        )
+        assert err.sample_rows is not None
+        assert err.column == "col1"
+
+    def test_error_repr(self):
+        """Test error string representation."""
+        err = ValidationError("TEST_ERROR", "Test message")
+        repr_str = repr(err)
+        assert "TEST_ERROR" in repr_str
+        assert "Test message" in repr_str
+
+
+class TestCSVValidator:
+    """Test CSV validator class."""
+
+    def test_file_not_found(self):
+        """Test validation of non-existent file."""
+        validator = CSVValidator()
+        is_valid, errors = validator.validate("nonexistent_file.csv")
+
+        assert not is_valid
+        assert len(errors) > 0
+        assert errors[0].error_type == "FILE_NOT_FOUND"
+
+    def test_valid_price_file(self, valid_price_csv):
+        """Test validation of valid price CSV."""
+        validator = CSVValidator(min_rows=10)
+        is_valid, errors = validator.validate(str(valid_price_csv))
+
+        assert is_valid
+        assert len(errors) == 0
+
+    def test_missing_required_columns(self, tmp_path):
+        """Test detection of missing required columns."""
+        dates = pd.date_range("2020-01-01", periods=20, freq="D")
+        df = pd.DataFrame({"AAPL": np.random.uniform(100, 200, 20)}, index=dates)
+        df.index.name = "date"
+
+        file_path = tmp_path / "missing_cols.csv"
+        df.to_csv(file_path)
+
+        validator = CSVValidator(required_columns=["AAPL", "GOOGL", "MSFT"])
+        is_valid, errors = validator.validate(str(file_path))
+
+        assert not is_valid
+        assert any(e.error_type == "MISSING_COLUMN" for e in errors)
+        # Should find 2 missing columns: GOOGL and MSFT
+        missing_errors = [e for e in errors if e.error_type == "MISSING_COLUMN"]
+        assert len(missing_errors) == 2
+
+    def test_non_numeric_data(self, tmp_path):
+        """Test detection of non-numeric data."""
+        dates = pd.date_range("2020-01-01", periods=20, freq="D")
+        df = pd.DataFrame(
+            {
+                "AAPL": ["100.5", "invalid", "102.3"] + list(range(17)),
+                "GOOGL": np.random.uniform(1000, 2000, 20),
+            },
+            index=dates,
+        )
+        df.index.name = "date"
+
+        file_path = tmp_path / "non_numeric.csv"
+        df.to_csv(file_path)
+
+        validator = CSVValidator()
+        is_valid, errors = validator.validate(str(file_path))
+
+        assert not is_valid
+        assert any(e.error_type == "INVALID_DTYPE" for e in errors)
+
+    def test_missing_values_detection(self, tmp_path):
+        """Test detection of missing values."""
+        dates = pd.date_range("2020-01-01", periods=30, freq="D")
+        data = np.random.uniform(100, 200, 30)
+        data[5:10] = np.nan  # Add some NaN values
+
+        df = pd.DataFrame({"AAPL": data}, index=dates)
+        df.index.name = "date"
+
+        file_path = tmp_path / "missing_values.csv"
+        df.to_csv(file_path)
+
+        validator = CSVValidator()
+        is_valid, errors = validator.validate(str(file_path))
+
+        # Missing values generate errors at the validator level
+        assert any(e.error_type == "MISSING_VALUES" for e in errors)
+
+    def test_duplicate_dates(self, tmp_path):
+        """Test detection of duplicate dates."""
+        dates = pd.date_range("2020-01-01", periods=20, freq="D")
+        # Create duplicates by repeating some dates
+        dates_with_dups = dates.tolist() + [dates[5], dates[10]]
+
+        df = pd.DataFrame(
+            {"AAPL": np.random.uniform(100, 200, 22)}, index=dates_with_dups
+        )
+        df.index.name = "date"
+
+        file_path = tmp_path / "duplicate_dates.csv"
+        df.to_csv(file_path)
+
+        validator = CSVValidator()
+        is_valid, errors = validator.validate(str(file_path))
+
+        assert not is_valid
+        assert any(e.error_type == "DUPLICATE_DATES" for e in errors)
+
+    def test_insufficient_data(self, tmp_path):
+        """Test detection of insufficient data rows."""
+        dates = pd.date_range("2020-01-01", periods=5, freq="D")
+        df = pd.DataFrame({"AAPL": np.random.uniform(100, 200, 5)}, index=dates)
+        df.index.name = "date"
+
+        file_path = tmp_path / "insufficient.csv"
+        df.to_csv(file_path)
+
+        validator = CSVValidator(min_rows=10)
+        is_valid, errors = validator.validate(str(file_path))
+
+        assert not is_valid
+        assert any(e.error_type == "INSUFFICIENT_DATA" for e in errors)
+
+    def test_invalid_date_format(self, tmp_path):
+        """Test detection of invalid date format."""
+        # Create CSV with a non-date index
+        df = pd.DataFrame(
+            {
+                "date": ["not-a-date", "2020-01-02", "2020-01-03"],
+                "AAPL": [100, 101, 102],
+            }
+        )
+
+        file_path = tmp_path / "invalid_dates.csv"
+        df.to_csv(file_path, index=False)
+
+        validator = CSVValidator()
+        is_valid, errors = validator.validate(str(file_path))
+
+        # Should have date format issues
+        assert not is_valid
+
+    def test_empty_file(self, tmp_path):
+        """Test validation of empty CSV file."""
+        file_path = tmp_path / "empty.csv"
+        file_path.write_text("")
+
+        validator = CSVValidator()
+        is_valid, errors = validator.validate(str(file_path))
+
+        assert not is_valid
+        assert any(e.error_type == "EMPTY_FILE" for e in errors)
+
+
+class TestValidatePriceCSV:
+    """Test price CSV validation function."""
+
+    def test_valid_price_csv(self, valid_price_csv):
+        """Test validation of valid price CSV."""
+        is_valid, errors = validate_price_csv(str(valid_price_csv))
+
+        assert is_valid
+        assert len(errors) == 0
+
+    def test_price_csv_with_required_symbols(self, valid_price_csv):
+        """Test price CSV validation with required symbols."""
+        # Valid - has AAPL
+        is_valid, errors = validate_price_csv(
+            str(valid_price_csv), required_symbols=["AAPL"]
+        )
+        assert is_valid
+
+        # Invalid - missing TSLA
+        is_valid, errors = validate_price_csv(
+            str(valid_price_csv), required_symbols=["AAPL", "TSLA"]
+        )
+        assert not is_valid
+        assert any(e.error_type == "MISSING_COLUMN" for e in errors)
+
+
+class TestValidateSignalsCSV:
+    """Test signals CSV validation function."""
+
+    def test_valid_signals_csv(self, valid_signals_csv):
+        """Test validation of valid signals CSV."""
+        is_valid, errors = validate_signals_csv(str(valid_signals_csv))
+
+        assert is_valid
+        assert len(errors) == 0
+
+    def test_signals_csv_with_required_columns(self, valid_signals_csv):
+        """Test signals CSV validation with required columns."""
+        # Valid - has momentum
+        is_valid, errors = validate_signals_csv(
+            str(valid_signals_csv), required_columns=["momentum"]
+        )
+        assert is_valid
+
+        # Invalid - missing size
+        is_valid, errors = validate_signals_csv(
+            str(valid_signals_csv), required_columns=["momentum", "size"]
+        )
+        assert not is_valid
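+
+
+# Sketch of a possible timezone test (not part of the original suite; exact
+# read_csv timezone behaviour varies across pandas versions, so it is left
+# commented out):
+#
+# def test_timezone_not_allowed(tmp_path):
+#     dates = pd.date_range("2020-01-01", periods=20, freq="D", tz="UTC")
+#     df = pd.DataFrame({"AAPL": np.random.uniform(100, 200, 20)}, index=dates)
+#     df.index.name = "date"
+#     file_path = tmp_path / "tz_prices.csv"
+#     df.to_csv(file_path)
+#     validator = CSVValidator(allow_timezone=False)
+#     is_valid, errors = validator.validate(str(file_path))
+#     assert any(e.error_type == "TIMEZONE_NOT_ALLOWED" for e in errors)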
+
+
+class TestValidateInputCSV:
+    """Test general-purpose validate_input_csv function."""
+
+    def test_validate_price_type(self, valid_price_csv):
+        """Test validation with price type."""
+        result = validate_input_csv(str(valid_price_csv), csv_type="price")
+
+        assert result["valid"]
+        assert len(result["errors"]) == 0
+        assert result["row_count"] == 50
+        assert result["column_count"] == 3
+        assert "file_path" in result
+
+    def test_validate_signals_type(self, valid_signals_csv):
+        """Test validation with signals type."""
+        result = validate_input_csv(str(valid_signals_csv), csv_type="signals")
+
+        assert result["valid"]
+        assert len(result["errors"]) == 0
+        assert result["row_count"] == 50
+        assert result["column_count"] == 3
+
+    def test_validate_with_errors(self, tmp_path):
+        """Test validation that returns errors."""
+        # Create an invalid CSV (too few rows)
+        dates = pd.date_range("2020-01-01", periods=5, freq="D")
+        df = pd.DataFrame({"AAPL": [100, 101, 102, 103, 104]}, index=dates)
+        df.index.name = "date"
+
+        file_path = tmp_path / "invalid.csv"
+        df.to_csv(file_path)
+
+        result = validate_input_csv(str(file_path), csv_type="price")
+
+        assert not result["valid"]
+        assert len(result["errors"]) > 0
+        assert result["row_count"] == 5
+
+    def test_validate_with_warnings(self, tmp_path):
+        """Test validation with warnings (missing values)."""
+        dates = pd.date_range("2020-01-01", periods=25, freq="D")
+        data = np.random.uniform(100, 200, 25)
+        data[10] = np.nan  # Add one missing value
+
+        df = pd.DataFrame({"AAPL": data}, index=dates)
+        df.index.name = "date"
+
+        file_path = tmp_path / "with_warnings.csv"
+        df.to_csv(file_path)
+
+        result = validate_input_csv(str(file_path), csv_type="price")
+
+        # Missing values are reported as warnings and the file stays valid
+        assert len(result["warnings"]) > 0
+        assert result["valid"]
+
+    def test_nonexistent_file(self):
+        """Test validation of non-existent file."""
+        result = validate_input_csv("nonexistent.csv", csv_type="price")
+
+        assert not result["valid"]
+        assert len(result["errors"]) > 0
+        assert result["row_count"] is None
+        assert result["column_count"] is None
+
+    def test_structured_error_output(self, tmp_path):
+        """Test that errors are properly structured."""
+        # Create file with multiple error types
+        dates = ["2020-01-01", "invalid-date", "2020-01-03"]
+        df = pd.DataFrame({"date": dates, "AAPL": [100, "invalid", 102]})
+
+        file_path = tmp_path / "multi_error.csv"
+        df.to_csv(file_path, index=False)
+
+        result = validate_input_csv(str(file_path), csv_type="price")
+
+        assert not result["valid"]
+        # Check error structure
+        for error in result["errors"]:
+            assert "type" in error
+            assert "message" in error
+            assert "column" in error