|
2 | 2 |
|
3 | 3 | import hashlib |
4 | 4 | import json |
| 5 | +from datetime import datetime |
5 | 6 | from pathlib import Path |
6 | 7 | from unittest.mock import MagicMock, patch |
7 | 8 |
|
@@ -36,6 +37,24 @@ def _make_cache_file(data_dir: Path, rel_path: str, content: bytes) -> Path: |
36 | 37 | _FILE_B = f"cache/v1/{_TS1}/{_TS2}/uploads/data.parquet" |
37 | 38 |
|
38 | 39 |
|
| 40 | +def _find_log_file(data_dir: Path) -> Path | None: |
| 41 | + """Find the single JSONL log file under state/logs/.""" |
| 42 | + log_dir = data_dir / "state" / "logs" |
| 43 | + if not log_dir.exists(): |
| 44 | + return None |
| 45 | + files = list(log_dir.glob("*_pull.jsonl")) |
| 46 | + return files[0] if len(files) == 1 else None |
| 47 | + |
| 48 | + |
def _read_spans(log_file: Path) -> list[dict]:
    """Read all spans from a JSONL log file.

    Args:
        log_file: Path to a file holding one JSON document per line.

    Returns:
        The decoded JSON value of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8, so decode explicitly instead of relying on the
    # platform's locale encoding.
    with log_file.open(encoding="utf-8") as handle:
        # Iterating the handle splits on real newlines only. str.splitlines()
        # would also split on U+2028, U+2029 and U+0085, which may appear
        # unescaped inside a JSON string and would cut a record in half.
        return [json.loads(line) for line in handle if line.strip()]
| 56 | + |
| 57 | + |
39 | 58 | def _fake_response(content: bytes) -> MagicMock: |
40 | 59 | """Create a mock response that yields content in chunks.""" |
41 | 60 | resp = MagicMock() |
@@ -82,6 +101,15 @@ def test_downloads_only_remote(self, mock_session_cls: MagicMock, tmp_path: Path |
82 | 101 | # Session.get was called with the right URL |
83 | 102 | mock_session.get.assert_called_once_with(url, stream=True) |
84 | 103 |
|
| 104 | + # Metrics log should exist with one span |
| 105 | + log_file = _find_log_file(tmp_path) |
| 106 | + assert log_file is not None |
| 107 | + spans = _read_spans(log_file) |
| 108 | + assert len(spans) == 1 |
| 109 | + assert spans[0]["ok"] is True |
| 110 | + assert spans[0]["file"] == _FILE_A |
| 111 | + assert spans[0]["bytes"] == len(content) |
| 112 | + |
85 | 113 |
|
86 | 114 | class TestCachePullMatchingSkipped: |
87 | 115 | """MATCHING entries are not downloaded.""" |
@@ -178,6 +206,15 @@ def test_sha256_failure(self, mock_session_cls: MagicMock, tmp_path: Path): |
178 | 206 | # File should NOT exist on disk (atomic write prevented partial file) |
179 | 207 | assert not (tmp_path / _FILE_A).exists() |
180 | 208 |
|
| 209 | + # Metrics log should record the failure |
| 210 | + log_file = _find_log_file(tmp_path) |
| 211 | + assert log_file is not None |
| 212 | + spans = _read_spans(log_file) |
| 213 | + assert len(spans) == 1 |
| 214 | + assert spans[0]["ok"] is False |
| 215 | + assert spans[0]["error"] is not None |
| 216 | + assert "SHA256 mismatch" in spans[0]["error"] |
| 217 | + |
181 | 218 |
|
182 | 219 | class TestCachePullMultipleFiles: |
183 | 220 | """Multiple ONLY_REMOTE entries are all downloaded.""" |
@@ -243,3 +280,87 @@ def test_no_partial_file_on_failure(self, mock_session_cls: MagicMock, tmp_path: |
243 | 280 |
|
244 | 281 | # No partial file should exist |
245 | 282 | assert not (tmp_path / _FILE_A).exists() |
| 283 | + |
| 284 | + # Metrics log should record the connection failure |
| 285 | + log_file = _find_log_file(tmp_path) |
| 286 | + assert log_file is not None |
| 287 | + spans = _read_spans(log_file) |
| 288 | + assert len(spans) == 1 |
| 289 | + assert spans[0]["ok"] is False |
| 290 | + assert "connection refused" in spans[0]["error"] |
| 291 | + |
| 292 | + |
class TestCachePullMetrics:
    """Check that the pull command writes a JSONL metrics log with the expected schema."""

    # Every span record must expose exactly these fields, no more and no fewer.
    _EXPECTED_KEYS = {
        "t0",
        "t",
        "worker_id",
        "file",
        "url",
        "content_length",
        "bytes",
        "ok",
        "error",
    }

    @patch("iqb.cli.cache_pull.requests.Session")
    def test_two_files_produce_two_spans(self, mock_session_cls: MagicMock, tmp_path: Path):
        """Pull two manifest entries and expect one successful span for each."""
        url_a, url_b = "https://example.com/a", "https://example.com/b"
        content_a, content_b = b"content a", b"content b"
        manifest_entries = {
            _FILE_A: {"sha256": _sha256(content_a), "url": url_a},
            _FILE_B: {"sha256": _sha256(content_b), "url": url_b},
        }
        _write_manifest(tmp_path, manifest_entries)

        def serve(url: str, **kwargs):
            # Any URL other than url_a is answered with the second payload.
            body = content_a if url == url_a else content_b
            return _fake_response(body)

        mock_session = MagicMock()
        mock_session.get.side_effect = serve
        mock_session_cls.return_value = mock_session

        result = CliRunner().invoke(cli, ["cache", "pull", "-d", str(tmp_path)])
        assert result.exit_code == 0

        log_file = _find_log_file(tmp_path)
        assert log_file is not None
        assert log_file.name.endswith("_pull.jsonl")
        spans = _read_spans(log_file)
        assert len(spans) == 2

        timestamp_format = "%Y-%m-%d %H:%M:%S %z"
        known_urls = (url_a, url_b)
        for record in spans:
            # Schema: exactly the expected field names
            assert set(record.keys()) == self._EXPECTED_KEYS

            # Outcome: success with no error recorded
            assert record["ok"] is True
            assert record["error"] is None

            # worker_id must be an integer
            assert isinstance(record["worker_id"], int)

            # Both timestamps parse, and t0 does not come after t
            t_first = datetime.strptime(record["t0"], timestamp_format)
            t_last = datetime.strptime(record["t"], timestamp_format)
            assert t_first <= t_last

            # URL is one of the two manifest URLs
            assert record["url"] in known_urls

        # Byte counts must match the payload served for each file
        by_file = {}
        for record in spans:
            by_file[record["file"]] = record
        for rel_path, payload in ((_FILE_A, content_a), (_FILE_B, content_b)):
            assert by_file[rel_path]["bytes"] == len(payload)
            assert by_file[rel_path]["content_length"] == len(payload)
0 commit comments