Skip to content

Commit 7c050f3

Browse files
authored
refactor(library): make cache depend on pipeline (#43)
Currently, the pipeline depends on the cache, but it should be the other way around. We need the cache to be able to import the pipeline so that we define the disk logic only once. While there, rename CacheEntry to PipelineCacheEntry for clarity.
1 parent b47324a commit 7c050f3

File tree

4 files changed

+44
-43
lines changed

4 files changed

+44
-43
lines changed

library/src/iqb/cache.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,7 @@
2525
from datetime import datetime
2626
from pathlib import Path
2727

28-
29-
def data_dir_or_default(data_dir: str | Path | None) -> Path:
30-
"""
31-
Return data_dir as a Path if not empty. Otherwise return the
32-
default value for the data_dir (i.e., `./.iqb` like git).
33-
"""
34-
return Path.cwd() / ".iqb" if data_dir is None else Path(data_dir)
28+
from . import pipeline
3529

3630

3731
class IQBCache:
@@ -45,7 +39,7 @@ def __init__(self, data_dir: str | Path | None = None):
4539
data_dir: Path to directory containing cached data files.
4640
If None, defaults to .iqb/ in current working directory.
4741
"""
48-
self.data_dir = data_dir_or_default(data_dir)
42+
self.data_dir = pipeline.data_dir_or_default(data_dir)
4943

5044
def get_data(
5145
self,

library/src/iqb/pipeline.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@
8888
from google.cloud import bigquery, bigquery_storage_v1
8989
from google.cloud.bigquery import job, table
9090

91-
from . import cache, queries
91+
from . import queries
9292

9393
VALID_TEMPLATE_NAMES: Final[set[str]] = {
9494
"downloads_by_country",
@@ -110,7 +110,7 @@ class ParsedTemplateName:
110110

111111

112112
@dataclass(frozen=True)
113-
class CacheEntry:
113+
class PipelineCacheEntry:
114114
"""
115115
Reference to a cache entry containing query results and metadata.
116116
@@ -232,7 +232,7 @@ def __init__(self, project_id: str, data_dir: str | Path | None = None):
232232
"""
233233
self.client = bigquery.Client(project=project_id)
234234
self.bq_read_clnt = bigquery_storage_v1.BigQueryReadClient()
235-
self.data_dir = cache.data_dir_or_default(data_dir)
235+
self.data_dir = data_dir_or_default(data_dir)
236236

237237
def _cache_dir_path(
238238
self,
@@ -252,7 +252,7 @@ def get_cache_entry(
252252
end_date: str,
253253
*,
254254
fetch_if_missing: bool = False,
255-
) -> CacheEntry:
255+
) -> PipelineCacheEntry:
256256
"""
257257
Get or create a cache entry for the given query.
258258
@@ -264,7 +264,7 @@ def get_cache_entry(
264264
Default is False (do not fetch automatically).
265265
266266
Returns:
267-
CacheEntry with paths to data.parquet and stats.json.
267+
PipelineCacheEntry with paths to data.parquet and stats.json.
268268
269269
Raises:
270270
FileNotFoundError: if cache doesn't exist and fetch_if_missing is False.
@@ -282,7 +282,7 @@ def get_cache_entry(
282282

283283
# 4. check if cache exists
284284
if data_path.exists() and stats_path.exists():
285-
return CacheEntry(data_path=data_path, stats_path=stats_path)
285+
return PipelineCacheEntry(data_path=data_path, stats_path=stats_path)
286286

287287
# 5. handle missing cache without auto-fetching
288288
if not fetch_if_missing:
@@ -298,7 +298,7 @@ def get_cache_entry(
298298
result.save_stats()
299299

300300
# 7. return information about the cache entry
301-
return CacheEntry(data_path=data_path, stats_path=stats_path)
301+
return PipelineCacheEntry(data_path=data_path, stats_path=stats_path)
302302

303303
def execute_query_template(
304304
self,
@@ -360,6 +360,14 @@ def _execute_query_template(
360360
)
361361

362362

363+
def data_dir_or_default(data_dir: str | Path | None) -> Path:
364+
"""
365+
Return data_dir as a Path if not empty. Otherwise return the
366+
default value for the data_dir (i.e., `./.iqb` like git).
367+
"""
368+
return Path.cwd() / ".iqb" if data_dir is None else Path(data_dir)
369+
370+
363371
def _parse_both_dates(start_date: str, end_date: str) -> tuple[datetime, datetime]:
364372
"""Parses both dates and ensures start_date <= end_date."""
365373
start_time = _parse_date(start_date)

library/tests/iqb/cache_test.py

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,10 @@
11
"""Tests for the IQBCache data fetching module."""
22

33
from datetime import datetime
4-
from pathlib import Path
54

65
import pytest
76

87
from iqb import IQBCache
9-
from iqb.cache import data_dir_or_default
10-
11-
12-
class TestDataDirOrDefault:
13-
"""Test pure functions without external dependencies."""
14-
15-
def test_data_dir_or_default_with_none(self):
16-
"""Test default behavior when data_dir is None."""
17-
result = data_dir_or_default(None)
18-
expected = Path.cwd() / ".iqb"
19-
assert result == expected
20-
21-
def test_data_dir_or_default_with_string(self, tmp_path):
22-
"""Test conversion of string path."""
23-
test_path = str(tmp_path / "test")
24-
result = data_dir_or_default(test_path)
25-
assert result == Path(test_path)
26-
27-
def test_data_dir_or_default_with_path(self, tmp_path):
28-
"""Test pass-through of Path object."""
29-
input_path = tmp_path / "test"
30-
result = data_dir_or_default(input_path)
31-
assert result == input_path
328

339

3410
class TestIQBCacheInitialization:

library/tests/iqb/pipeline_test.py

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,41 @@
88
import pytest
99

1010
from iqb.pipeline import (
11-
CacheEntry,
1211
IQBPipeline,
1312
ParquetFileInfo,
1413
ParsedTemplateName,
14+
PipelineCacheEntry,
1515
QueryResult,
1616
_load_query_template,
1717
_parse_both_dates,
1818
_parse_date,
1919
_parse_template_name,
20+
data_dir_or_default,
2021
)
2122

2223

24+
class TestDataDirOrDefault:
25+
"""Test for data_dir_or_default function."""
26+
27+
def test_data_dir_or_default_with_none(self):
28+
"""Test default behavior when data_dir is None."""
29+
result = data_dir_or_default(None)
30+
expected = Path.cwd() / ".iqb"
31+
assert result == expected
32+
33+
def test_data_dir_or_default_with_string(self, tmp_path):
34+
"""Test conversion of string path."""
35+
test_path = str(tmp_path / "test")
36+
result = data_dir_or_default(test_path)
37+
assert result == Path(test_path)
38+
39+
def test_data_dir_or_default_with_path(self, tmp_path):
40+
"""Test pass-through of Path object."""
41+
input_path = tmp_path / "test"
42+
result = data_dir_or_default(input_path)
43+
assert result == input_path
44+
45+
2346
class TestHelperFunctions:
2447
"""Test pure functions without external dependencies."""
2548

@@ -514,7 +537,7 @@ def test_save_stats_creates_directory(self, tmp_path):
514537
assert stats_path.exists()
515538

516539

517-
class TestIQBPipelineGetCacheEntry:
540+
class TestIQBPipelineGetPipelineCacheEntry:
518541
"""Test get_cache_entry method."""
519542

520543
@patch("iqb.pipeline.bigquery.Client")
@@ -540,7 +563,7 @@ def test_get_cache_entry_when_exists(self, mock_storage, mock_client, tmp_path):
540563
# Get cache entry (should not execute query)
541564
entry = pipeline.get_cache_entry("downloads_by_country", "2024-10-01", "2024-11-01")
542565

543-
assert isinstance(entry, CacheEntry)
566+
assert isinstance(entry, PipelineCacheEntry)
544567
assert entry.data_path == cache_dir / "data.parquet"
545568
assert entry.stats_path == cache_dir / "stats.json"
546569
assert entry.data_path.exists()
@@ -608,7 +631,7 @@ def test_get_cache_entry_fetch_if_missing(self, mock_storage, mock_client, tmp_p
608631
mock_client_instance.query.assert_called_once()
609632

610633
# Entry should be returned with correct paths
611-
assert isinstance(entry, CacheEntry)
634+
assert isinstance(entry, PipelineCacheEntry)
612635
expected_cache_dir = (
613636
data_dir
614637
/ "cache"

0 commit comments

Comments (0)