
Commit e6ffbeb

Author: marwan37
Message: fix broken links and format
Parent: 7ae4cfa

File tree: 6 files changed, 12 additions, 16 deletions

research-radar/README.md

Lines changed: 1 addition & 1 deletion

@@ -230,7 +230,7 @@ The project follows the recommended ZenML project structure:
 The project includes detailed documentation in various subdirectories:
 - **[Data Documentation](data/README.md)**: Details on dataset storage and processing.
 - **[Classification Results Documentation](classification_results/README.md)**: Explanation of classification outputs, metrics, and the checkpoint system.
-- **[Model Comparison Documentation](model_comparison/README.md)**: Details on the model comparison.
+- **[Model Comparison Metrics Documentation](model_compare_metrics/README.md)**: Details on the model comparison.
 - **[Pipelines Documentation](pipelines/README.md)**: Details on the pipeline definitions.
 - **[Prompts Documentation](prompts/README.md)**: Details on the prompts used in the pipeline.
 - **[Schemas Documentation](schemas/README.md)**: Details on data models and validation.
research-radar/pipelines/README.md

Lines changed: 2 additions & 4 deletions

@@ -1,13 +1,12 @@
 # Pipeline Usage
 
 - For detailed implementation of each step, see the individual Python files
-- For pipeline configurations and settings, refer to [`base_config.yaml`](../base_config.yaml)
+- For pipeline configurations and settings, refer to [`base_config.yaml`](../configs/base_config.yaml)
 
 ## Classification Pipeline
 
 Runs the following steps:
 
-- [`load_classification_dataset`](../steps/load_classification_dataset.py) - Loads articles based on classification mode
 - [`classify_articles`](../steps/classify_articles.py) - Classifies articles using DeepSeek R1
 - [`save_classifications`](../steps/save_classifications.py) - Saves classification results to JSON
 - [`merge_classifications`](../steps/merge_classifications.py) - Merges new classifications with existing dataset (augmentation mode)

@@ -20,7 +19,6 @@ Runs the following steps:
 
 Runs the following steps:
 
-- [`load_training_dataset`](../steps/load_training_dataset.py) - Automatically selects augmented dataset if available, otherwise uses composite dataset
 - [`data_preprocessor`](../steps/data_preprocessor.py) - Prepares text for model training
 - [`data_splitter`](../steps/data_splitter.py) - Creates train/validation/test splits
 - [`save_test_set`](../steps/save_test_set.py) - Optionally saves test set for later evaluation

@@ -37,6 +35,6 @@ Runs the following step:
 
 Runs the following steps:
 
-- [`load_test_set_from_artifact`](../steps/load_test_set_from_artifact.py)
+- [`load_test_set`](../steps/load_test_set.py)
 - [`compare_models`](../steps/compare_models.py)
 - [`save_comparison_metrics`](../steps/save_comparison_metrics.py)
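
For orientation, here is a minimal sketch of how the renamed evaluation steps could be wired together with ZenML's `@pipeline`/`@step` decorators. The step names mirror the README, but the signatures, bodies, and the pipeline name are placeholder assumptions, not the repository's actual code:

```python
# Hypothetical wiring of the evaluation pipeline; signatures and bodies
# are placeholders, not the repository's actual implementations.
from datasets import Dataset
from zenml import pipeline, step


@step
def load_test_set() -> Dataset:
    # Placeholder: the real step loads from disk or a ZenML artifact.
    return Dataset.from_dict({"text": ["example article"], "label": [0]})


@step
def compare_models(test_set: Dataset) -> dict:
    # Placeholder: the real step evaluates each candidate model.
    return {"accuracy": {"model_a": 0.0, "model_b": 0.0}}


@step
def save_comparison_metrics(metrics: dict) -> None:
    # Placeholder: the real step writes metrics to JSON.
    print(metrics)


@pipeline
def evaluation_pipeline():
    test_set = load_test_set()
    metrics = compare_models(test_set)
    save_comparison_metrics(metrics)


if __name__ == "__main__":
    # Requires an initialized ZenML stack to actually run.
    evaluation_pipeline()
```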

research-radar/schemas/README.md

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ Two-part schema for article data:
 - `InputArticle`: Article text with metadata and validation rules
 - Ensures text is non-empty with field validation
 
-### [`training_arguments_config.py`](training_arguments_config.py)
+### [`training_config.py`](training_config.py)
 
 Configuration schema for Hugging Face `TrainingArguments`:
 
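The diff only renames this file, so as a rough illustration of the pattern it describes, a Pydantic model can validate the fields it needs and hand them to Hugging Face directly. Every field name below is an assumption; the actual contents of `training_config.py` are not shown in this commit:

```python
# Illustrative sketch only; the real fields in training_config.py are
# not part of this diff, so everything here is an assumption.
from pydantic import BaseModel, field_validator
from transformers import TrainingArguments


class TrainingConfig(BaseModel):
    output_dir: str
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 8
    learning_rate: float = 5e-5

    @field_validator("learning_rate")
    @classmethod
    def validate_learning_rate(cls, v):
        """Reject non-positive learning rates."""
        if v <= 0:
            raise ValueError("learning_rate must be positive")
        return v


# A validated config feeds straight into Hugging Face.
config = TrainingConfig(output_dir="./results")
args = TrainingArguments(**config.model_dump())
```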
research-radar/schemas/config_models.py

Lines changed: 6 additions & 5 deletions

@@ -15,9 +15,7 @@
 # limitations under the License.
 #
 
-"""
-Pydantic models for configuration validation.
-"""
+"""Pydantic models for configuration validation."""
 
 from typing import Dict, List, Literal, Optional
 
@@ -41,6 +39,7 @@ class BatchProcessingConfig(BaseModel):
     @field_validator("batch_size")
     @classmethod
     def validate_batch_size(cls, v):
+        """Validate the batch size."""
         if v <= 0:
             raise ValueError("batch_size must be greater than 0")
         return v
@@ -57,6 +56,7 @@ class ParallelProcessingConfig(BaseModel):
     @field_validator("workers")
     @classmethod
     def validate_workers(cls, v):
+        """Validate the number of workers."""
         if v < 1:
             raise ValueError("Number of workers must be at least 1")
         return v
@@ -82,6 +82,7 @@ class InferenceParamsConfig(BaseModel):
     @field_validator("temperature", "top_p")
     @classmethod
     def validate_probability_params(cls, v):
+        """Validate the probability parameters."""
         if not 0.0 <= v <= 1.0:
             raise ValueError(
                 "Probability parameters must be between 0.0 and 1.0"
@@ -90,6 +91,7 @@ def validate_probability_params(cls, v):
 
     @model_validator(mode="after")
     def validate_token_lengths(self):
+        """Validate the token lengths."""
        if self.max_new_tokens >= self.max_sequence_length:
            raise ValueError(
                "max_new_tokens must be less than max_sequence_length"
@@ -330,8 +332,7 @@ class AppConfig(BaseModel):
 
 
 def validate_config(config: Dict) -> AppConfig:
-    """
-    Validate configuration dictionary against Pydantic models.
+    """Validate configuration dictionary against Pydantic models.
 
     Args:
         config: Raw configuration dictionary loaded from base_config.yaml
research-radar/steps/data_splitter.py

Lines changed: 1 addition & 2 deletions

@@ -33,8 +33,7 @@ def data_splitter(
     Annotated[Dataset, "validation_set"],
     Annotated[Dataset, "test_set"],
 ]:
-    """
-    Performs stratified dataset splitting.
+    """Performs stratified dataset splitting.
 
     Args:
         dataset: Input dataset to split
research-radar/steps/load_test_set.py

Lines changed: 1 addition & 3 deletions

@@ -45,12 +45,10 @@ def load_test_set(
         source_type: Type of data source ('disk' or 'artifact')
         path: Path to dataset on disk (required when source_type is 'disk')
         artifact_name: Name of the ZenML artifact (required when source_type is 'artifact')
+        version: Version of the ZenML artifact (optional, defaults to latest)
 
     Returns:
         Dataset: The loaded dataset
-
-    Raises:
-        ValueError: If the parameters are invalid or the dataset cannot be loaded
     """
     source_type = source_type.lower()
     if source_type not in ["disk", "artifact"]:
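
A rough sketch of the disk/artifact branching the docstring describes. The surrounding logic is inferred from the docstring alone; only `datasets.load_from_disk` and ZenML's `Client.get_artifact_version` are known APIs here, and the function body is not the repository's actual implementation:

```python
# Inferred from the docstring; not the repository's actual implementation.
from typing import Optional

from datasets import Dataset, load_from_disk
from zenml.client import Client


def load_test_set_sketch(
    source_type: str,
    path: Optional[str] = None,
    artifact_name: Optional[str] = None,
    version: Optional[str] = None,
) -> Dataset:
    """Load a test set from disk or from a ZenML artifact."""
    source_type = source_type.lower()
    if source_type == "disk":
        if path is None:
            raise ValueError("path is required when source_type is 'disk'")
        return load_from_disk(path)
    if source_type == "artifact":
        if artifact_name is None:
            raise ValueError(
                "artifact_name is required when source_type is 'artifact'"
            )
        # version=None resolves to the latest artifact version.
        artifact = Client().get_artifact_version(artifact_name, version)
        return artifact.load()
    raise ValueError("source_type must be 'disk' or 'artifact'")
```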
