Skip to content

Commit fabc849

Browse files
Add data_normalisation_test and load_postgres_test
1 parent 39d1365 commit fabc849

File tree

2 files changed

+167
-0
lines changed

2 files changed

+167
-0
lines changed

tests/test_data_normalisation.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import csv
2+
import os
3+
import tempfile
4+
import builtins
5+
import pytest
6+
7+
import nifipulse.config as config
8+
from nifipulse.data_normalisation import process_data
9+
10+
11+
# -----------------------------
12+
# Helper to create temp CSV data
13+
# -----------------------------
14+
def create_csv(path, rows):
    """Write *rows* (dicts) to *path* as a CSV with the fixed metric header.

    The column order matches what the normalisation pipeline expects:
    timestamp, instance, metric_name, component_name, component_type,
    component_id, value.
    """
    header = [
        "timestamp",
        "instance",
        "metric_name",
        "component_name",
        "component_type",
        "component_id",
        "value",
    ]

    with open(path, "w", newline='', encoding='utf-8') as handle:
        csv_writer = csv.DictWriter(handle, fieldnames=header)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
23+
24+
25+
# -----------------------------
26+
# TEST: process_data filters, maps, converts and writes results
27+
# -----------------------------
28+
def test_process_data(tmp_path, monkeypatch):
    """End-to-end check of process_data: it must drop the zero-valued
    nifi_amount row, keep the second row, map the metric name, detect the
    unit, convert the value to float, and copy id/timestamp fields into
    the cleaned output CSV.
    """
    # Temporary input/output locations for the run.
    source_csv = tmp_path / "input.csv"
    cleaned_csv = tmp_path / "clean.csv"

    # Row expected to be filtered out: value == 0 and the metric name
    # starts with "nifi_amount".
    dropped = {
        "timestamp": "2025-01-01T00:00:00Z",
        "instance": "nifi1",
        "metric_name": "nifi_amount_flowfiles_received",
        "component_name": "ProcessorA",
        "component_type": "PROCESSOR",
        "component_id": "abc123",
        "value": "0",
    }
    # Row expected to survive and be normalised.
    kept = {
        "timestamp": "2025-01-01T00:01:00Z",
        "instance": "nifi1",
        "metric_name": "nifi_amount_bytes_read",
        "component_name": "ProcessorB",
        "component_type": "PROCESSOR",
        "component_id": "xyz999",
        "value": "150",
    }
    create_csv(source_csv, [dropped, kept])

    # Point config.env at the temporary files for the duration of the test.
    fake_env = type("E", (), {
        "CSV_SINK": str(source_csv),
        "CLEAN_DATA": str(cleaned_csv),
    })
    monkeypatch.setattr(config, "env", fake_env)

    process_data()

    # ----------- Validate output CSV -----------
    assert cleaned_csv.exists(), "Output file should be created"

    with open(cleaned_csv, encoding='utf-8') as handle:
        remaining = list(csv.DictReader(handle))

    # Only the non-zero row should survive filtering.
    assert len(remaining) == 1
    record = remaining[0]

    # Metric name mapping ("nifi_amount_bytes_read" -> "bytes_read").
    assert record["metric_name"] == "bytes_read"
    # Unit detected from the metric name.
    assert record["original_unit"] == "bytes"
    # Value converted to float and re-serialised.
    assert record["value"] == "150.0"
    # Component id propagated as the unique id.
    assert record["unique_id"] == "xyz999"
    # Original timestamp carried over unchanged.
    assert record["timestamp_utc"] == "2025-01-01T00:01:00Z"

tests/test_load_postgre.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
import pytest
2+
import pandas as pd
3+
from unittest.mock import patch, MagicMock
4+
from nifipulse.load_postgres import load_postgres
5+
6+
7+
def test_load_postgres_success(tmp_path):
    """load_postgres on a valid CSV should issue INSERTs for all four
    dimension tables and the fact table, using a mocked engine and
    mocked pd.read_sql lookups.
    """
    # Fake input CSV containing one metric observation.
    sample_csv = tmp_path / "sample.csv"
    sample_csv.write_text(
        "timestamp_utc,instance,metric_name,original_unit,component_name,component_type,value\n"
        "2025-01-01 12:00:00,server1,cpu,%,cpu_component,system,45\n"
    )

    # Mock the SQLAlchemy engine so engine.begin() yields a mock connection.
    engine_mock = MagicMock()
    conn_mock = MagicMock()
    engine_mock.begin.return_value.__enter__.return_value = conn_mock

    # One DataFrame per expected pd.read_sql call: the four dimension
    # lookups followed by the final fact export.
    read_sql_results = [
        pd.DataFrame({
            "instance_id": [1],
            "instance_name": ["server1"],
        }),
        pd.DataFrame({
            "metric_id": [10],
            "metric_name": ["cpu"],
            "original_unit": ["%"],
        }),
        pd.DataFrame({
            "component_id": [20],
            "component_name": ["cpu_component"],
            "component_type": ["system"],
        }),
        pd.DataFrame({
            "date_id": [100],
            "timestamp_utc": ["2025-01-01 12:00:00"],
        }),
        pd.DataFrame({
            "fact_id": [999],
            "timestamp_utc": ["2025-01-01 12:00:00"],
            "instance_name": ["server1"],
            "metric_name": ["cpu"],
            "original_unit": ["%"],
            "component_name": ["cpu_component"],
            "component_type": ["system"],
            "value": [45],
        }),
    ]

    with patch("nifipulse.load_postgres.create_engine", return_value=engine_mock), \
         patch("nifipulse.load_postgres.pd.read_sql", side_effect=read_sql_results):
        load_postgres(str(sample_csv))

    # Collect the SQL text of every statement executed on the connection.
    executed = [call.args[0].text for call in conn_mock.execute.call_args_list]

    # Each dimension table and the fact table must receive an INSERT.
    for table in ("dim_instance", "dim_metric", "dim_component",
                  "dim_date", "fact_metrics"):
        assert any("INSERT INTO " + table in sql for sql in executed)

0 commit comments

Comments
 (0)