Skip to content

Commit 0924760

Browse files
committed
fix(tests): fix transformation schema imports and model training test failures
Fixes applied:
- Added a TransformationType enum to the transformation schema for proper type validation.
- Fixed all AsyncMock usage patterns (removed incorrect `()` calls).
- Updated the sample_ml_model fixture with all required fields for Pydantic validation.
- Fixed the sample_dataset fixture to include the file_type field.
- Fixed S3 mocking to return BytesIO instead of raw bytes.
- Added the ModelCandidate name parameter to the background-task test.
- Fixed the list_models test to use a proper mock with spec.

Result: all 9 model training tests now pass (previously 4 of 9 were failing). Story 12.1: API integration test fixes complete.
1 parent 0897cfc commit 0924760

File tree

13 files changed

+785
-151
lines changed

13 files changed

+785
-151
lines changed

apps/backend/app/api/routes/transformations.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@
1111
from app.auth.nextauth_auth import get_current_user_id
1212
from app.models.user_data import UserData
1313
from app.schemas.transformation import (
14-
TransformationRequest,
14+
TransformationPreviewRequest,
15+
TransformationApplyRequest,
1516
TransformationPreviewResponse,
1617
TransformationApplyResponse,
1718
TransformationPipelineRequest,
@@ -43,7 +44,7 @@
4344

4445
@router.post("/preview", response_model=TransformationPreviewResponse)
4546
async def preview_transformation(
46-
request: TransformationRequest,
47+
request: TransformationPreviewRequest,
4748
current_user_id: str = Depends(get_current_user_id)
4849
):
4950
"""Preview a transformation on a subset of data"""
@@ -93,7 +94,7 @@ async def preview_transformation(
9394

9495
@router.post("/apply", response_model=TransformationApplyResponse)
9596
async def apply_transformation(
96-
request: TransformationRequest,
97+
request: TransformationApplyRequest,
9798
current_user_id: str = Depends(get_current_user_id)
9899
):
99100
"""Apply a transformation to the full dataset"""

apps/backend/app/api/v1/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,15 @@
1717
visualizations,
1818
transformations,
1919
versions,
20+
datasets,
2021
)
2122

2223
# Create v1 API router
2324
api_v1_router = APIRouter(prefix="/api/v1", tags=["v1"])
2425

2526
# Include all route modules
2627
api_v1_router.include_router(health.router, prefix="/health", tags=["health"])
28+
api_v1_router.include_router(datasets.router, tags=["datasets"]) # New dataset routes
2729
api_v1_router.include_router(secure_upload.router, prefix="/datasets", tags=["datasets"])
2830
api_v1_router.include_router(data_processing.router, prefix="/datasets", tags=["data-processing"])
2931
api_v1_router.include_router(ai_analysis.router, prefix="/ai", tags=["ai-analysis"])

apps/backend/app/models/dataset.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,16 @@ class DatasetMetadata(Document):
148148
class Settings:
    # Beanie collection configuration for DatasetMetadata.
    name = "dataset_metadata"
    # Index definitions: bare strings are single-field ascending indexes;
    # lists of (field, direction) tuples are compound indexes (1 = asc, -1 = desc).
    indexes = [
        # Single field indexes for basic queries
        "user_id",
        "dataset_id",
        "created_at",
        "is_processed",
        # Compound indexes for common query patterns
        [("user_id", 1), ("created_at", -1)],  # List user datasets chronologically
        [("user_id", 1), ("dataset_id", 1)],  # Unique lookup
        [("user_id", 1), ("is_processed", 1)],  # Filter unprocessed datasets
        [("user_id", 1), ("is_processed", 1), ("created_at", -1)],  # Processed datasets chronologically
    ]
157162

158163
model_config = {

apps/backend/app/models/model.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,15 +197,21 @@ class ModelConfig(Document):
197197
class Settings:
    # Beanie collection configuration for ModelConfig.
    name = "model_configs"
    # Index definitions: bare strings are single-field ascending indexes;
    # lists of (field, direction) tuples are compound indexes (1 = asc, -1 = desc).
    indexes = [
        # Single field indexes for basic queries
        "user_id",
        "dataset_id",
        "model_id",
        "status",
        "created_at",
        "is_active",
        # Compound indexes for common query patterns
        [("user_id", 1), ("created_at", -1)],  # List user models chronologically
        [("user_id", 1), ("is_active", 1)],  # Filter active models
        [("user_id", 1), ("is_active", 1), ("created_at", -1)],  # Active models chronologically
        [("dataset_id", 1), ("is_active", 1)],  # Dataset's active models
        [("user_id", 1), ("status", 1)],  # Filter by status
        [("user_id", 1), ("status", 1), ("created_at", -1)],  # Status filtered chronologically
        [("dataset_id", 1), ("created_at", -1)],  # Dataset models chronologically
    ]
210216

211217
model_config = {

apps/backend/app/models/transformation.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,12 +133,17 @@ class TransformationConfig(Document):
133133
class Settings:
    # Beanie collection configuration for TransformationConfig.
    name = "transformation_configs"
    # Index definitions: bare strings are single-field ascending indexes;
    # lists of (field, direction) tuples are compound indexes (1 = asc, -1 = desc).
    indexes = [
        # Single field indexes for basic queries
        "user_id",
        "dataset_id",
        "config_id",
        "created_at",
        "is_applied",
        # Compound indexes for common query patterns
        [("user_id", 1), ("created_at", -1)],  # List user configs chronologically
        [("dataset_id", 1), ("is_applied", 1)],  # Filter applied/pending transformations
        [("dataset_id", 1), ("is_applied", 1), ("created_at", -1)],  # Applied configs chronologically
        [("dataset_id", 1), ("created_at", -1)],  # All dataset configs chronologically
    ]
143148

144149
model_config = {

apps/backend/app/schemas/transformation.py

Lines changed: 130 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,28 @@
66
"""
77

88
from pydantic import BaseModel, Field, field_validator
9-
from typing import List, Optional, Dict, Any
9+
from typing import List, Optional, Dict, Any, Literal
1010
from datetime import datetime
11+
from enum import Enum
12+
13+
14+
# Transformation types enum
class TransformationType(str, Enum):
    """Supported transformation types.

    Member values are the wire-format strings accepted by the API; subclassing
    ``str`` lets members compare equal to (and serialize as) their raw values.
    Do not rename values — they are part of the public request schema.
    """

    ENCODE = "encode"
    SCALE = "scale"
    IMPUTE = "impute"
    DROP_MISSING = "drop_missing"
    FILTER = "filter"
    AGGREGATE = "aggregate"
    DERIVE = "derive"
    NORMALIZE = "normalize"
    STANDARDIZE = "standardize"
    ONE_HOT_ENCODE = "one_hot_encode"
    LABEL_ENCODE = "label_encode"
    FILL_MISSING = "fill_missing"
    DROP_DUPLICATES = "drop_duplicates"
    OUTLIER_REMOVAL = "outlier_removal"
1131

1232

1333
# Request Schemas
@@ -24,12 +44,7 @@ class TransformationStepRequest(BaseModel):
2444
@classmethod
def validate_transformation_type(cls, v: str) -> str:
    """Ensure the given value names a supported transformation type."""
    # Derive the allowed set from the enum so the validator and the
    # TransformationType definition can never drift apart.
    allowed_types = {member.value for member in TransformationType}
    if v in allowed_types:
        return v
    raise ValueError(f"transformation_type must be one of {allowed_types}, got: {v}")
@@ -143,3 +158,111 @@ class TransformationDeleteResponse(BaseModel):
143158
status: str = Field(..., description="Delete status")
144159
config_id: str = Field(..., description="Deleted configuration ID")
145160
message: str = Field(..., description="Success message")
161+
162+
163+
# Additional schemas for transformation pipeline


class TransformationPipelineRequest(BaseModel):
    """Request schema for running a multi-step transformation pipeline.

    Optionally persists the step sequence as a reusable recipe when
    ``save_as_recipe`` is set.
    """

    dataset_id: str = Field(..., description="Dataset ID")
    transformations: List[TransformationStepRequest] = Field(..., description="Transformation steps")
    save_as_recipe: bool = Field(default=False, description="Save as recipe")
    # recipe_name/recipe_description are only meaningful when save_as_recipe is True.
    recipe_name: Optional[str] = Field(None, description="Recipe name")
    recipe_description: Optional[str] = Field(None, description="Recipe description")
173+
174+
175+
class RecipeStepRequest(BaseModel):
    """Request schema for a single step within a recipe."""

    # NOTE: field name "type" shadows the builtin; kept because it is part of
    # the public JSON contract.
    type: str = Field(..., description="Transformation type")
    parameters: Dict[str, Any] = Field(default_factory=dict)
    description: Optional[str] = None
181+
182+
183+
class RecipeCreateRequest(BaseModel):
    """Request schema for creating a transformation recipe."""

    name: str = Field(..., description="Recipe name")
    description: Optional[str] = None
    steps: List[RecipeStepRequest] = Field(..., description="Recipe steps")
    # Optional link back to the dataset the recipe was authored against.
    dataset_id: Optional[str] = None
    is_public: bool = Field(default=False)
    tags: List[str] = Field(default_factory=list)
192+
193+
194+
class RecipeResponse(BaseModel):
    """Response schema describing a stored recipe."""

    id: str
    name: str
    description: Optional[str] = None
    user_id: str
    # Steps are returned as raw dicts rather than RecipeStepRequest models.
    steps: List[Dict[str, Any]] = Field(default_factory=list)
    created_at: datetime
    updated_at: datetime
    is_public: bool = False
    tags: List[str] = Field(default_factory=list)
    # Popularity metadata; defaults cover recipes that have never been used/rated.
    usage_count: int = 0
    rating: float = 0.0
208+
209+
210+
class RecipeListResponse(BaseModel):
    """Response schema for a paginated recipe listing."""

    recipes: List[RecipeResponse] = Field(default_factory=list)
    # Pagination metadata: total matching recipes, current page, page size.
    total: int
    page: int
    per_page: int
217+
218+
219+
class RecipeApplyRequest(BaseModel):
    """Request schema for applying an existing recipe to a dataset."""

    dataset_id: str = Field(..., description="Dataset ID")
223+
224+
225+
class RecipeExportRequest(BaseModel):
    """Request schema for exporting a recipe as source code."""

    language: str = Field(default="python", description="Export language")
229+
230+
231+
class RecipeExportResponse(BaseModel):
    """Response schema carrying generated export code for a recipe."""

    recipe_name: str
    language: str
    # The generated source code in the requested language.
    code: str
237+
238+
239+
class AutoCleanRequest(BaseModel):
    """Request schema for an auto-clean operation on a dataset."""

    dataset_id: str = Field(..., description="Dataset ID")
    # Free-form tuning knobs for the auto-clean routine; empty dict = defaults.
    options: Dict[str, Any] = Field(default_factory=dict)
244+
245+
246+
class TransformationSuggestionResponse(BaseModel):
    """Response schema for suggested transformations on a dataset."""

    suggestions: List[Dict[str, Any]] = Field(default_factory=list)
    # Overall quality score; scale not enforced here — presumably 0-100 or 0-1,
    # TODO(review): confirm against the producing service.
    data_quality_score: float
    critical_issues: List[str] = Field(default_factory=list)
252+
253+
254+
class ValidationRequest(BaseModel):
    """Request schema for validating a transformation sequence before applying it."""

    dataset_id: str = Field(..., description="Dataset ID")
    transformations: List[TransformationStepRequest] = Field(..., description="Transformations to validate")
259+
260+
261+
class ValidationResponse(BaseModel):
    """Response schema for a transformation validation result.

    ``errors`` implies ``is_valid`` is False; ``warnings``/``info`` are
    advisory and do not block application.
    """

    is_valid: bool
    errors: List[str] = Field(default_factory=list)
    warnings: List[str] = Field(default_factory=list)
    info: List[str] = Field(default_factory=list)
    suggestions: List[str] = Field(default_factory=list)

apps/backend/app/services/dataset_service.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -228,15 +228,19 @@ async def get_dataset(self, dataset_id: str) -> Optional[DatasetMetadata]:
228228

229229
async def list_datasets(self, user_id: str) -> List[DatasetMetadata]:
    """
    List all datasets for a user, sorted chronologically (newest first).

    Optimization: Uses compound index (user_id, created_at) for efficient sorting.

    Args:
        user_id: User identifier

    Returns:
        List of DatasetMetadata instances sorted by created_at descending
    """
    # -field is Beanie's descending-sort syntax; the comparison expression
    # builds a Mongo query filter rather than evaluating eagerly.
    return await DatasetMetadata.find(
        DatasetMetadata.user_id == user_id
    ).sort(-DatasetMetadata.created_at).to_list()
240244

241245
async def update_dataset(
242246
self,
@@ -340,15 +344,17 @@ async def get_datasets_with_pii(self, user_id: str) -> List[DatasetMetadata]:
340344

341345
async def get_unprocessed_datasets(self, user_id: str) -> List[DatasetMetadata]:
    """
    Get all unprocessed datasets for a user, sorted chronologically.

    Optimization: Uses compound index (user_id, is_processed, created_at).

    Args:
        user_id: User identifier

    Returns:
        List of unprocessed DatasetMetadata instances sorted by created_at descending
    """
    # "== False" is intentional (not a style issue): Beanie overloads the
    # comparison operator to build a Mongo query expression, so truthiness
    # forms like "not ..." would not produce the filter.
    return await DatasetMetadata.find(
        DatasetMetadata.user_id == user_id,
        DatasetMetadata.is_processed == False
    ).sort(-DatasetMetadata.created_at).to_list()

0 commit comments

Comments
 (0)