
Commit 616a24d

Merge pull request #92 from digital-land/1780-new-pipeline
Changing over to new Pipeline Class Management
2 parents e18e4f0 + 429e991 commit 616a24d

File tree

5 files changed (+266, −172 lines)


.vscode/DEBUG_SETUP.md

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
# Debug Setup Guide for the Request Processor URL Check

### Prerequisites

You need these services running:

- PostgreSQL (or the Docker version with port 54320 exposed)
- Redis (the Celery broker), or the Docker version with port 6379 exposed
- LocalStack (for S3/SQS), or the Docker version with port 4566 exposed
- **Do not** have the request-processor running.

Keep your Docker Compose stack running with:

```bash
docker compose up -d localstack request-db redis
```

- Make sure `.venv` is installed locally for request-processor, so that `launch.json` points to the right interpreter.

### Start Debugging CheckURL

Click **"Run"** in the VS Code debug panel.

The script will:

1. Invoke the `check_dataurl` task directly (no Celery broker needed)
2. Pass your POST request body payload (set the `request_body` variable in `debug_trigger.py` to the data you want to test)
3. Pause at any breakpoints you've set

## Environment Variables

The launch configuration sets these automatically:

```
DATABASE_URL=postgresql://postgres:password@localhost:54320/request_database
CELERY_BROKER_URL=redis://localhost:6379/0
AWS_ENDPOINT_URL=http://localhost:4566
AWS_DEFAULT_REGION=eu-west-2
SENTRY_ENABLED=false
```

If your local setup differs, edit `.vscode/launch.json` and update the `env` section of the request-processor debug configuration.

---

## Notes

- The `debug_trigger.py` script calls `check_dataurl()` **synchronously**, without Celery. This is intentional; it makes debugging simpler.
- The request database transaction is created when the task runs, so you can inspect the DB afterward.
- After the task completes, the `docker_volume` folder should contain the downloaded resources under `/opt/collection/resource/<request-id>/`.

.vscode/launch.json

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
{
  "version": "0.2.0",
  "python": "${workspaceFolder}/request-processor/.venv/bin/python",
  "configurations": [
    {
      "name": "Debug Request Processor - Manual Task Trigger",
      "type": "python",
      "request": "launch",
      "python": "${workspaceFolder}/request-processor/.venv/bin/python",
      "program": "${workspaceFolder}/scripts/debug_trigger.py",
      "console": "integratedTerminal",
      "justMyCode": false,
      "env": {
        "PYTHONPATH": "${workspaceFolder}:${workspaceFolder}/request-processor:${workspaceFolder}/request-processor/.venv/src/digital-land",
        "DATABASE_URL": "postgresql://postgres:password@localhost:54320/request_database",
        "CELERY_BROKER_URL": "redis://localhost:6379/0",
        "AWS_ENDPOINT_URL": "http://localhost:4566",
        "AWS_DEFAULT_REGION": "eu-west-2",
        "AWS_ACCESS_KEY_ID": "example",
        "AWS_SECRET_ACCESS_KEY": "example",
        "REQUEST_FILES_BUCKET_NAME": "dluhc-data-platform-request-files-local",
        "SENTRY_ENABLED": "false"
      },
      "args": [],
      "cwd": "${workspaceFolder}/request-processor"
    },
    {
      "name": "Debug Request Processor - Celery Worker",
      "type": "python",
      "request": "launch",
      "python": "${workspaceFolder}/request-processor/.venv/bin/python",
      "program": "-m",
      "args": [
        "celery",
        "-A",
        "src.tasks",
        "worker",
        "--loglevel=debug",
        "--concurrency=1"
      ],
      "console": "integratedTerminal",
      "justMyCode": false,
      "env": {
        "PYTHONPATH": "${workspaceFolder}:${workspaceFolder}/request-processor:${workspaceFolder}/request-processor/.venv/src/digital-land",
        "DATABASE_URL": "postgresql://postgres:password@localhost:54320/request_database",
        "CELERY_BROKER_URL": "redis://localhost:6379/0",
        "AWS_ENDPOINT_URL": "http://localhost:4566",
        "AWS_DEFAULT_REGION": "eu-west-2",
        "AWS_ACCESS_KEY_ID": "example",
        "AWS_SECRET_ACCESS_KEY": "example",
        "REQUEST_FILES_BUCKET_NAME": "dluhc-data-platform-request-files-local",
        "SENTRY_ENABLED": "false"
      },
      "cwd": "${workspaceFolder}/request-processor"
    }
  ]
}

README.md

Lines changed: 4 additions & 0 deletions
@@ -103,3 +103,7 @@ To include new dependencies, update the requirements.in file with the desired pa
 ```
 
 This ensures that your project accurately reflects its dependencies, including any transitive dependencies required by the newly added packages.
+
+## Debug
+
+Because this is a monorepo, debugging can be challenging. The primary approach is to review the logs for each sub-repository through Docker. However, if you need breakpoint debugging, a debug configuration is available under the `.vscode/` folder; it is currently set up only for the request-processor service.

request-processor/src/application/core/pipeline.py

Lines changed: 22 additions & 172 deletions
@@ -2,35 +2,10 @@
 import csv
 from application.logging.logger import get_logger
 from digital_land.specification import Specification
-from digital_land.log import DatasetResourceLog, IssueLog, ColumnFieldLog
 from digital_land.organisation import Organisation
-from digital_land.phase.combine import FactCombinePhase
-from digital_land.phase.concat import ConcatFieldPhase
-from digital_land.phase.convert import ConvertPhase
-from digital_land.phase.default import DefaultPhase
-from digital_land.phase.factor import FactorPhase
-from digital_land.phase.filter import FilterPhase
-from digital_land.phase.harmonise import HarmonisePhase
-from digital_land.phase.lookup import (
-    EntityLookupPhase,
-    FactLookupPhase,
-)
-
-from digital_land.phase.map import MapPhase
-from digital_land.phase.migrate import MigratePhase
-from digital_land.phase.normalise import NormalisePhase
-from digital_land.phase.organisation import OrganisationPhase
-from digital_land.phase.parse import ParsePhase
-from digital_land.phase.patch import PatchPhase
-from digital_land.phase.pivot import PivotPhase
-from digital_land.phase.prefix import EntityPrefixPhase
-from digital_land.phase.prune import FieldPrunePhase, FactPrunePhase
-from digital_land.phase.reference import EntityReferencePhase, FactReferencePhase
-from digital_land.phase.save import SavePhase
-from digital_land.phase.priority import PriorityPhase
-from digital_land.pipeline import run_pipeline, Pipeline, Lookups
+
+from digital_land.pipeline import Pipeline, Lookups
 from digital_land.commands import get_resource_unidentified_lookups
-from digital_land.check import duplicate_reference_check
 from digital_land.api import API
 from application.core.utils import append_endpoint, append_source
 from datetime import datetime
@@ -55,9 +30,10 @@ def fetch_response_data(
     additional_col_mappings,
     additional_concats,
 ):
-    # define variables for Pipeline and specification
-    pipeline = Pipeline(pipeline_dir, dataset)
+    # define variables for Pipeline Execution
     specification = Specification(specification_dir)
+    pipeline = Pipeline(pipeline_dir, dataset, specification=specification)
+    api = API(specification=specification)
 
     input_path = os.path.join(collection_dir, "resource", request_id)
     # List all files in the "resource" directory
@@ -99,155 +75,29 @@ def fetch_response_data(
         os.path.join(dataset_resource_dir, dataset, request_id), exist_ok=True
     )
     try:
-        pipeline_run(
-            dataset=dataset,
-            pipeline=pipeline,
-            request_id=request_id,
-            specification_dir=specification_dir,
+        resource = resource_from_path(file_path)
+        issue_log = pipeline.transform(
             input_path=file_path,
             output_path=os.path.join(
-                transformed_dir, dataset, request_id, f"{file_name}.csv"
+                transformed_dir, dataset, request_id, f"{resource}.csv"
             ),
-            issue_dir=os.path.join(issue_dir, dataset, request_id),
-            column_field_dir=os.path.join(column_field_dir, dataset, request_id),
-            dataset_resource_dir=os.path.join(
-                dataset_resource_dir, dataset, request_id
+            organisation=Organisation(os.path.join(cache_dir, "organisation.csv"), Path(pipeline.path)),
+            resource=resource,
+            valid_category_values=api.get_valid_category_values(dataset, pipeline),
+            converted_path=os.path.join(converted_dir, request_id, f"{resource}.csv"),
+            disable_lookups=True,
+        )
+        # Issue log needs severity column added, so manually added and saved here
+        issue_log.add_severity_column(os.path.join(specification_dir, "issue-type.csv"))
+        issue_log.save(os.path.join(issue_dir, dataset, request_id, resource + ".csv"))
+        pipeline.save_logs(
+            column_field_path=os.path.join(column_field_dir, dataset, request_id, resource + ".csv"),
+            dataset_resource_path=os.path.join(
+                dataset_resource_dir, dataset, request_id, resource + ".csv"
             ),
-            organisation_path=os.path.join(cache_dir, "organisation.csv"),
-            save_harmonised=False,
-            organisations=[organisation],
-            converted_dir=converted_dir,
         )
     except Exception as err:
-        logger.error("An exception occured during pipeline_run: ", str(err))
-
-
-def pipeline_run(
-    dataset,
-    pipeline,
-    request_id,
-    specification_dir,
-    input_path,
-    output_path,
-    organisations,
-    converted_dir,
-    null_path=None,  # TBD: remove this
-    issue_dir=None,
-    organisation_path=None,
-    save_harmonised=False,
-    column_field_dir=None,
-    dataset_resource_dir=None,
-    endpoints=[],
-    entry_date="",
-):
-    resource = resource_from_path(input_path)
-
-    specification = Specification(specification_dir)
-    schema = specification.pipeline[pipeline.name]["schema"]
-    intermediate_fieldnames = specification.intermediate_fieldnames(pipeline)
-    issue_log = IssueLog(dataset=dataset, resource=resource)
-    column_field_log = ColumnFieldLog(dataset=dataset, resource=resource)
-    dataset_resource_log = DatasetResourceLog(dataset=dataset, resource=resource)
-
-    api = API(specification=specification)
-    # load pipeline configuration
-    skip_patterns = pipeline.skip_patterns(resource)
-    columns = pipeline.columns(resource, endpoints=endpoints)
-    concats = pipeline.concatenations(resource, endpoints=endpoints)
-    patches = pipeline.patches(resource=resource)
-    lookups = pipeline.lookups(resource=resource)
-    default_fields = pipeline.default_fields(resource=resource)
-    default_values = pipeline.default_values(endpoints=endpoints)
-    combine_fields = pipeline.combine_fields(endpoints=endpoints)
-
-    # load organisations
-    organisation = Organisation(organisation_path, Path(pipeline.path))
-
-    severity_csv_path = os.path.join(specification_dir, "issue-type.csv")
-
-    # Load valid category values
-    valid_category_values = api.get_valid_category_values(dataset, pipeline)
-    # resource specific default values
-    if len(organisations) == 1:
-        default_values["organisation"] = organisations[0]
-
-    run_pipeline(
-        ConvertPhase(
-            path=input_path,
-            dataset_resource_log=dataset_resource_log,
-            output_path=os.path.join(converted_dir, request_id, f"{resource}.csv"),
-        ),
-        NormalisePhase(skip_patterns=skip_patterns, null_path=null_path),
-        ParsePhase(),
-        ConcatFieldPhase(concats=concats, log=column_field_log),
-        MapPhase(
-            fieldnames=intermediate_fieldnames,
-            columns=columns,
-            log=column_field_log,
-        ),
-        FilterPhase(filters=pipeline.filters(resource)),
-        PatchPhase(
-            issues=issue_log,
-            patches=patches,
-        ),
-        HarmonisePhase(
-            field_datatype_map=specification.get_field_datatype_map(),
-            issues=issue_log,
-            dataset=dataset,
-            valid_category_values=valid_category_values,
-        ),
-        DefaultPhase(
-            default_fields=default_fields,
-            default_values=default_values,
-            issues=issue_log,
-        ),
-        # TBD: move migrating columns to fields to be immediately after map
-        # this will simplify harmonisation and remove intermediate_fieldnames
-        # but effects brownfield-land and other pipelines which operate on columns
-        MigratePhase(
-            fields=specification.schema_field[schema],
-            migrations=pipeline.migrations(),
-        ),
-        OrganisationPhase(organisation=organisation, issues=issue_log),
-        FieldPrunePhase(fields=specification.current_fieldnames(schema)),
-        EntityReferencePhase(
-            dataset=dataset,
-            prefix=specification.dataset_prefix(dataset),
-        ),
-        EntityPrefixPhase(dataset=dataset),
-        EntityLookupPhase(lookups),
-        SavePhase(
-            default_output_path("harmonised", input_path),
-            fieldnames=intermediate_fieldnames,
-            enabled=save_harmonised,
-        ),
-        PriorityPhase(config=None),
-        PivotPhase(),
-        FactCombinePhase(issue_log=issue_log, fields=combine_fields),
-        FactorPhase(),
-        FactReferencePhase(
-            field_typology_map=specification.get_field_typology_map(),
-            field_prefix_map=specification.get_field_prefix_map(),
-        ),
-        FactLookupPhase(
-            lookups=lookups,
-            odp_collections=specification.get_odp_collections(),
-        ),
-        FactPrunePhase(),
-        SavePhase(
-            output_path,
-            fieldnames=specification.factor_fieldnames(),
-        ),
-    )
-
-    issue_log = duplicate_reference_check(issues=issue_log, csv_path=output_path)
-
-    # Add the 'severity' and 'description' column based on the mapping
-    issue_log.add_severity_column(severity_csv_path)
-
-    issue_log.save(os.path.join(issue_dir, resource + ".csv"))
-    column_field_log.save(os.path.join(column_field_dir, resource + ".csv"))
-    dataset_resource_log.save(os.path.join(dataset_resource_dir, resource + ".csv"))
+        logger.error("An exception occurred during Pipeline Transform: %s", str(err))
 
 
 def resource_from_path(path):
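The net effect of this hunk is architectural: the phase orchestration that lived in a local `pipeline_run` function now sits behind a single `Pipeline.transform` call that returns the issue log. A toy illustration of that pattern follows; all classes below are illustrative stand-ins, not the digital-land API.

```python
# Toy illustration of the refactor: orchestration moves from a free function
# into a class method that hands back only the issue log. Stand-in classes.
class IssueLog:
    def __init__(self):
        self.rows = []

class Pipeline:
    def __init__(self, dataset):
        self.dataset = dataset
        self.transformed = []

    def transform(self, input_rows):
        """Run the (stand-in) phases internally; return the issue log."""
        issue_log = IssueLog()
        for row in input_rows:
            # stand-in for the normalise/map/harmonise phases
            cleaned = {k.strip().lower(): v.strip() for k, v in row.items()}
            if not cleaned.get("reference"):
                issue_log.rows.append({"issue": "missing reference"})
            self.transformed.append(cleaned)
        return issue_log

# Callers no longer wire up phases; one call runs them and yields the log.
pipeline = Pipeline(dataset="example-dataset")
issues = pipeline.transform([{" Reference ": " ref-1 "}, {"reference": ""}])
print(len(issues.rows))  # → 1
```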
