Skip to content

Commit 691321b

Browse files
authored
refactor: make data a pyproject (#125)
With this refactoring, we can move data-only dependencies inside the proper pyproject rather than artificially inflating the library dependencies for no good reason. While there, add quality checks for data like we do for the library.
1 parent 3a39b73 commit 691321b

File tree

9 files changed

+230
-31
lines changed

9 files changed

+230
-31
lines changed

.github/workflows/ci.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,33 @@ jobs:
5252
exit 1
5353
}
5454
55+
- name: Check code formatting with ruff (data)
56+
working-directory: data
57+
run: |
58+
uv run ruff format --check || {
59+
echo "❌ Code formatting check failed!"
60+
echo "To fix locally, run: cd data && uv run ruff format"
61+
exit 1
62+
}
63+
64+
- name: Lint code with ruff (data)
65+
working-directory: data
66+
run: |
67+
uv run ruff check || {
68+
echo "❌ Linting check failed!"
69+
echo "To fix locally, run: cd data && uv run ruff check --fix"
70+
exit 1
71+
}
72+
73+
- name: Type check with pyright (data)
74+
working-directory: data
75+
run: |
76+
uv run pyright || {
77+
echo "❌ Type checking failed!"
78+
echo "To see errors locally, run: cd data && uv run pyright"
79+
exit 1
80+
}
81+
5582
test-all:
5683
name: Test All Components
5784
runs-on: ubuntu-latest

data/generate_data.py

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,9 @@
1010
import os
1111
import sys
1212
from dataclasses import dataclass
13+
from datetime import date, datetime
1314
from pathlib import Path
1415

15-
# Add library to path so we can import iqb modules
16-
sys.path.insert(0, str(Path(__file__).parent.parent / "library" / "src"))
17-
1816
import click
1917
import dacite
2018
import yaml
@@ -48,6 +46,11 @@ class PipelineConfig:
4846
def load_pipeline_config(config_path):
4947
"""Load pipeline configuration matrix from YAML script."""
5048

49+
def coerce_str(value):
50+
if isinstance(value, (date, datetime)):
51+
return value.isoformat()
52+
return value
53+
5154
try:
5255
content = config_path.read_text()
5356
except FileNotFoundError as exc:
@@ -62,26 +65,24 @@ def load_pipeline_config(config_path):
6265
raise click.ClickException("Pipeline config must be a mapping.")
6366

6467
try:
65-
config = dacite.from_dict(PipelineConfig, data)
68+
config = dacite.from_dict(
69+
PipelineConfig,
70+
data,
71+
config=dacite.Config(type_hooks={str: coerce_str}),
72+
)
6673
except dacite.DaciteError as exc:
6774
raise click.ClickException(f"Invalid pipeline config: {exc}") from exc
6875

6976
if config.version != "v0":
70-
raise click.ClickException(
71-
f"Unsupported pipeline config version: {config.version}"
72-
)
77+
raise click.ClickException(f"Unsupported pipeline config version: {config.version}")
7378

7479
time_periods = [(entry.start, entry.end) for entry in config.matrix.dates]
7580
if not time_periods:
76-
raise click.ClickException(
77-
"Pipeline config matrix must include non-empty dates."
78-
)
81+
raise click.ClickException("Pipeline config matrix must include non-empty dates.")
7982

8083
granularities = tuple(grain.strip() for grain in config.matrix.granularities)
8184
if not granularities or any(not grain for grain in granularities):
82-
raise click.ClickException(
83-
"Pipeline config matrix must include non-empty granularities."
84-
)
85+
raise click.ClickException("Pipeline config matrix must include non-empty granularities.")
8586

8687
return time_periods, granularities
8788

data/ghcache.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@
4141
import sys
4242
from pathlib import Path
4343

44-
4544
MANIFEST_PATH = Path("state") / "ghremote" / "manifest.json"
4645
CACHE_DIR = Path("cache/v1")
4746
SHA256_PREFIX_LENGTH = 12
@@ -96,10 +95,7 @@ def validate_cache_path(path: str) -> bool:
9695
return False
9796

9897
# Component 6: data.parquet or stats.json
99-
if parts[5] not in ("data.parquet", "stats.json"):
100-
return False
101-
102-
return True
98+
return parts[5] in ("data.parquet", "stats.json")
10399

104100

105101
def mangle_path(local_path: str, sha256: str) -> str:
@@ -121,7 +117,7 @@ def load_manifest() -> dict:
121117
if not MANIFEST_PATH.exists():
122118
return {"v": 0, "files": {}}
123119

124-
with open(MANIFEST_PATH, "r") as f:
120+
with open(MANIFEST_PATH) as f:
125121
return json.load(f)
126122

127123

@@ -219,9 +215,7 @@ def cmd_scan(args) -> int:
219215

220216
# Prepare manifest entry (URL will need to be filled in manually or via script)
221217
# For now, use placeholder URL
222-
url_placeholder = (
223-
f"https://github.com/m-lab/iqb/releases/download/v0.2.0/{mangled_name}"
224-
)
218+
url_placeholder = f"https://github.com/m-lab/iqb/releases/download/v0.2.0/{mangled_name}"
225219

226220
files_dict[rel_path] = {"sha256": sha256, "url": url_placeholder}
227221
files_to_upload.append(mangled_name)

data/pyproject.toml

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
[project]
2+
name = "iqb-data"
3+
version = "0.1.0"
4+
requires-python = ">=3.13"
5+
dependencies = [
6+
"click>=8.3.0",
7+
"mlab-iqb",
8+
"pyyaml>=6.0.0",
9+
]
10+
11+
[tool.uv.sources]
12+
mlab-iqb = { workspace = true }
13+
14+
[tool.ruff]
15+
line-length = 100
16+
target-version = "py313"
17+
18+
[tool.ruff.format]
19+
indent-style = "space"
20+
quote-style = "double"
21+
line-ending = "lf"
22+
23+
[tool.ruff.lint]
24+
select = [
25+
"E", # pycodestyle errors
26+
"W", # pycodestyle warnings
27+
"F", # pyflakes
28+
"I", # isort
29+
"N", # pep8-naming
30+
"UP", # pyupgrade
31+
"B", # flake8-bugbear
32+
"C4", # flake8-comprehensions
33+
"SIM", # flake8-simplify
34+
]
35+
ignore = [
36+
"E501", # line too long (handled by formatter)
37+
"SIM114", # combine if branches - prefer explicit branches for domain clarity
38+
]
39+
40+
[tool.ruff.lint.isort]
41+
known-first-party = ["iqb"]
42+
43+
[tool.pyright]
44+
include = ["generate_data.py", "tests"]
45+
exclude = ["**/__pycache__", ".venv"]
46+
pythonVersion = "3.13"
47+
typeCheckingMode = "basic"
48+
reportMissingTypeStubs = false

data/run_query.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,10 @@
1010

1111
from iqb.scripting import iqb_exception, iqb_logging, iqb_pipeline
1212

13+
DEFAULT_PROJECT_ID = "measurement-lab"
1314

14-
def main():
15-
DEFAULT_PROJECT_ID = "measurement-lab"
1615

16+
def main():
1717
parser = argparse.ArgumentParser(
1818
description="Execute BigQuery query template and save results to v1 Parquet cache"
1919
)
@@ -47,6 +47,7 @@ def main():
4747
pipeline = iqb_pipeline.create(data_dir=data_dir, project=args.project_id)
4848
pipeline.sync_mlab(
4949
args.granularity,
50+
enable_bigquery=True,
5051
start_date=args.start_date,
5152
end_date=args.end_date,
5253
)

data/tests/generate_data_test.py

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
from __future__ import annotations
2+
3+
import importlib.util
4+
from pathlib import Path
5+
6+
import pytest
7+
8+
9+
def load_generate_data_module():
10+
module_path = Path(__file__).parents[1] / "generate_data.py"
11+
spec = importlib.util.spec_from_file_location("generate_data", module_path)
12+
if spec is None or spec.loader is None:
13+
raise RuntimeError(f"Unable to load module from {module_path}")
14+
module = importlib.util.module_from_spec(spec)
15+
spec.loader.exec_module(module)
16+
return module
17+
18+
19+
def test_load_pipeline_config_valid(tmp_path: Path):
20+
module = load_generate_data_module()
21+
config_path = tmp_path / "pipeline.yaml"
22+
config_path.write_text(
23+
"\n".join(
24+
[
25+
'version: "v0"',
26+
"matrix:",
27+
" dates:",
28+
" - start: 2024-01-01",
29+
" end: 2024-02-01",
30+
" granularities:",
31+
" - day",
32+
" - week",
33+
"",
34+
]
35+
)
36+
)
37+
38+
time_periods, granularities = module.load_pipeline_config(config_path)
39+
40+
assert time_periods == [("2024-01-01", "2024-02-01")]
41+
assert granularities == ("day", "week")
42+
43+
44+
def test_load_pipeline_config_rejects_wrong_version(tmp_path: Path):
45+
module = load_generate_data_module()
46+
config_path = tmp_path / "pipeline.yaml"
47+
config_path.write_text(
48+
"\n".join(
49+
[
50+
'version: "v1"',
51+
"matrix:",
52+
" dates:",
53+
" - start: 2024-01-01",
54+
" end: 2024-02-01",
55+
" granularities:",
56+
" - day",
57+
"",
58+
]
59+
)
60+
)
61+
62+
with pytest.raises(module.click.ClickException, match="Unsupported pipeline config"):
63+
module.load_pipeline_config(config_path)
64+
65+
66+
def test_load_pipeline_config_rejects_empty_dates(tmp_path: Path):
67+
module = load_generate_data_module()
68+
config_path = tmp_path / "pipeline.yaml"
69+
config_path.write_text(
70+
"\n".join(
71+
[
72+
'version: "v0"',
73+
"matrix:",
74+
" dates: []",
75+
" granularities:",
76+
" - day",
77+
"",
78+
]
79+
)
80+
)
81+
82+
with pytest.raises(module.click.ClickException, match="matrix must include non-empty"):
83+
module.load_pipeline_config(config_path)
84+
85+
86+
def test_load_pipeline_config_rejects_blank_granularity(tmp_path: Path):
87+
module = load_generate_data_module()
88+
config_path = tmp_path / "pipeline.yaml"
89+
config_path.write_text(
90+
"\n".join(
91+
[
92+
'version: "v0"',
93+
"matrix:",
94+
" dates:",
95+
" - start: 2024-01-01",
96+
" end: 2024-02-01",
97+
" granularities:",
98+
" - day",
99+
' - ""',
100+
"",
101+
]
102+
)
103+
)
104+
105+
with pytest.raises(module.click.ClickException, match="matrix must include non-empty"):
106+
module.load_pipeline_config(config_path)
107+
108+
109+
def test_load_pipeline_config_rejects_non_mapping(tmp_path: Path):
110+
module = load_generate_data_module()
111+
config_path = tmp_path / "pipeline.yaml"
112+
config_path.write_text("- not-a-mapping")
113+
114+
with pytest.raises(module.click.ClickException, match="must be a mapping"):
115+
module.load_pipeline_config(config_path)

library/pyproject.toml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,10 @@ dependencies = [
1616
"pyarrow>=14.0.0",
1717
"pandas>=2.0.0",
1818
"db-dtypes>=1.0.0",
19-
"pyyaml>=6.0.0",
2019
"python-dateutil>=2.9.0.post0",
2120
"dacite>=1.9.2",
2221
"filelock>=3.20.1",
2322
"rich>=14.2.0",
24-
"click>=8.3.0",
2523
]
2624

2725
[project.urls]

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
[tool.uv.workspace]
2-
members = ["library", "prototype", "analysis"]
2+
members = ["library", "prototype", "analysis", "data"]
33

44
# Pytest configuration for workspace-level test execution
55
# This allows running "uv run pytest" from the root to test all workspace members
@@ -8,6 +8,7 @@ testpaths = [
88
"library/tests",
99
"prototype/tests",
1010
"analysis/tests",
11+
"data/tests",
1112
]
1213
python_files = ["*_test.py", "test_*.py"]
1314
python_classes = ["Test*"]

0 commit comments

Comments
 (0)