Skip to content

Commit 9945106

Browse files
authored
chore: add missing data (#128)
This pull request adds more data to the GitHub remote cache index. While there, it also improves logging and docs.
1 parent fda2564 commit 9945106

File tree

10 files changed

+973
-12
lines changed

10 files changed

+973
-12
lines changed

data/README.md

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,18 +64,21 @@ Then manually:
6464

6565
- Python 3.13 using `uv` as documented in the top-level [README.md](../README.md)
6666

67-
**Complete Pipeline** (recommended):
67+
**Complete Pipeline**:
6868

6969
```bash
70-
uv run python ./data/generate_data.py
70+
uv run python ./data/generate_data.py -B
7171
```
7272

7373
This orchestrates the complete pipeline:
7474

7575
1. Loads `./data/pipeline.yaml` to determine dates and granularities (edit this
7676
file to change the matrix)
77-
2. Queries both download and upload metrics for each dataset
78-
3. Saves results to v1 Parquet cache with query metadata
77+
2. Attempts to fetch from the GitHub cache first
78+
3. Otherwise, if `-B` is present, queries BigQuery for both download and upload metrics for each dataset
79+
4. Saves results to v1 Parquet cache with query metadata
80+
81+
Omit the `-B` flag to avoid querying BigQuery.
7982

8083
## Future Improvements (Phase 2+)
8184

data/ghcache.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@ def cmd_scan(args) -> int:
217217

218218
# Prepare manifest entry (URL will need to be filled in manually or via script)
219219
# For now, use placeholder URL
220-
url_placeholder = f"https://github.com/m-lab/iqb/releases/download/v0.2.0/{mangled_name}"
220+
url_placeholder = f"https://github.com/m-lab/iqb/releases/download/v0.5.0/{mangled_name}"
221221

222222
files_dict[rel_path] = {"sha256": sha256, "url": url_placeholder}
223223
files_to_upload.append(str(dest_path))
@@ -231,7 +231,7 @@ def cmd_scan(args) -> int:
231231
for f in files_to_upload:
232232
print(f" {f}")
233233
print("\nNext steps:")
234-
print("1. Upload mangled files to GitHub release v0.2.0")
234+
print("1. Upload mangled files to GitHub release v0.5.0")
235235
print("2. Update URLs in state/ghremote/manifest.json if needed")
236236
print("3. Commit updated state/ghremote/manifest.json to repository")
237237

data/pipeline.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,20 @@ matrix:
1717
end: "2025-07-01"
1818
- start: "2025-07-01"
1919
end: "2025-08-01"
20+
- start: "2025-08-01"
21+
end: "2025-09-01"
22+
- start: "2025-09-01"
23+
end: "2025-10-01"
2024
- start: "2025-10-01"
2125
end: "2025-11-01"
26+
- start: "2025-11-01"
27+
end: "2025-12-01"
28+
- start: "2025-12-01"
29+
end: "2026-01-01"
2230
granularities:
2331
- country
32+
- country_asn
33+
- subdivision1
34+
- subdivision1_asn
35+
- city
36+
- city_asn

data/state/ghremote/manifest.json

Lines changed: 944 additions & 0 deletions
Large diffs are not rendered by default.

library/src/iqb/ghremote/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@
2525

2626
from ..pipeline.cache import PipelineCacheEntry, data_dir_or_default
2727

28-
log = logging.getLogger("ghremote/cache")
28+
log = logging.getLogger("iqb.ghremote.cache")
2929

3030

3131
@dataclass(frozen=True, kw_only=True)

library/src/iqb/pipeline/bqpq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
TimeRemainingColumn,
2424
)
2525

26-
log = logging.getLogger("pipeline/bqpq")
26+
log = logging.getLogger("iqb.pipeline.bqpq")
2727

2828

2929
def _rows_progress_columns(total_rows: int | None):

library/src/iqb/pipeline/cache.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ def sync(self) -> None:
7777
"""
7878
if self.syncers:
7979
if not any(sync(self) for sync in self.syncers):
80-
raise PipelineEntrySyncError(f"Cannot sync {self}: see above logs")
80+
raise PipelineEntrySyncError(f"Cannot sync {self}: see above warnings")
8181
return
8282

8383
if not self.exists():

library/src/iqb/pipeline/pipeline.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
PipelineRemoteCache,
1818
)
1919

20-
log = logging.getLogger("pipeline/bq")
20+
log = logging.getLogger("iqb.pipeline.bq")
2121

2222

2323
class IQBPipeline:

library/src/iqb/scripting/iqb_logging.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ def configure(verbose: bool) -> None:
4444
handlers=[handler],
4545
force=True,
4646
)
47+
logging.captureWarnings(True)
4748

4849

49-
log = logging.getLogger("scripting")
50+
log = logging.getLogger("iqb.scripting")
5051
"""Logger that the scripting package should use."""

library/tests/iqb/scripting/iqb_logging_test.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,4 +52,4 @@ def fake_log_render(*_args, **kwargs):
5252

5353

5454
def test_log_is_named_scripting() -> None:
55-
assert iqb_logging.log.name == "scripting"
55+
assert iqb_logging.log.name == "iqb.scripting"

0 commit comments

Comments
 (0)