Skip to content
This repository was archived by the owner on Jan 12, 2024. It is now read-only.

Commit 06e115f

Browse files
authored
Merge pull request #18 from catalyst-cooperative/dev
Transition to using hand-crafted versioned data URLs
2 parents e485093 + 5869403 commit 06e115f

File tree

4 files changed

+20
-53
lines changed

4 files changed

+20
-53
lines changed

README.rst

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,10 +110,6 @@ this is weird).
110110

111111
.. code:: py
112112
113-
# Environment variables that tell Intake where to find and cache data
114-
os.environ["PUDL_INTAKE_CACHE"] = str(Path.home() / ".cache/intake")
115-
os.environ["PUDL_INTAKE_PATH"] = "gs://intake.catalyst.coop/test"
116-
117113
import intake
118114
import pandas as pd
119115
from pudl_catalog.helpers import year_state_filter
@@ -143,7 +139,7 @@ but with all the Jinja template fields interpolated and filled in:
143139
storage_options:
144140
simplecache:
145141
cache_storage: /home/zane/.cache/intake
146-
urlpath: simplecache::gs://intake.catalyst.coop/test/hourly_emissions_epacems.parquet
142+
urlpath: simplecache::gs://intake.catalyst.coop/dev/hourly_emissions_epacems.parquet
147143
description: Hourly pollution emissions and plant operational data reported via
148144
Continuous Emissions Monitoring Systems (CEMS) as required by 40 CFR Part 75.
149145
Includes CO2, NOx, and SO2, as well as the heat content of fuel consumed and gross

notebooks/pudl-catalog.ipynb

Lines changed: 15 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,6 @@
99
"# Notebook Preamble"
1010
]
1111
},
12-
{
13-
"cell_type": "markdown",
14-
"metadata": {},
15-
"source": [
16-
"## IPython Magic"
17-
]
18-
},
1912
{
2013
"cell_type": "code",
2114
"execution_count": null,
@@ -26,15 +19,6 @@
2619
"%autoreload 3"
2720
]
2821
},
29-
{
30-
"cell_type": "markdown",
31-
"metadata": {
32-
"tags": []
33-
},
34-
"source": [
35-
"## Notebook Imports"
36-
]
37-
},
3822
{
3923
"cell_type": "code",
4024
"execution_count": null,
@@ -47,13 +31,19 @@
4731
"import sys\n",
4832
"from pathlib import Path\n",
4933
"\n",
50-
"# We need to set these environment variables prior to importing our intake catalog.\n",
51-
"# You can also set them in your own shell environment instead.\n",
52-
"os.environ[\"PUDL_INTAKE_CACHE\"] = str(Path.home() / \".cache/intake\")\n",
53-
"os.environ[\"PUDL_INTAKE_PATH\"] = \"gs://intake.catalyst.coop/test\"\n",
34+
"logger = logging.getLogger()\n",
35+
"logger.setLevel(logging.INFO)\n",
36+
"handler = logging.StreamHandler(stream=sys.stdout)\n",
37+
"formatter = logging.Formatter(\"%(message)s\")\n",
38+
"handler.setFormatter(formatter)\n",
39+
"logger.handlers = [handler]\n",
40+
"\n",
41+
"# Where to cache downloaded data locally. Defaults to ~/.intake/cache\n",
42+
"# os.environ[\"PUDL_INTAKE_CACHE\"] = str(Path.home() / \".cache/intake\")\n",
5443
"\n",
55-
"# Local data if you've got it!\n",
44+
"# You can override the default path to the data in your environment if need be\n",
5645
"# os.environ[\"PUDL_INTAKE_PATH\"] = str(Path.cwd().parent / \"data\")\n",
46+
"# os.environ[\"PUDL_INTAKE_PATH\"] = \"gs://intake.catalyst.coop/dev\"\n",
5747
"\n",
5848
"# 3rd Party Imports:\n",
5949
"import intake\n",
@@ -68,28 +58,7 @@
6858
"cell_type": "markdown",
6959
"metadata": {},
7060
"source": [
71-
"## Set up a logger"
72-
]
73-
},
74-
{
75-
"cell_type": "code",
76-
"execution_count": null,
77-
"metadata": {},
78-
"outputs": [],
79-
"source": [
80-
"logger = logging.getLogger()\n",
81-
"logger.setLevel(logging.INFO)\n",
82-
"handler = logging.StreamHandler(stream=sys.stdout)\n",
83-
"formatter = logging.Formatter(\"%(message)s\")\n",
84-
"handler.setFormatter(formatter)\n",
85-
"logger.handlers = [handler]"
86-
]
87-
},
88-
{
89-
"cell_type": "markdown",
90-
"metadata": {},
91-
"source": [
92-
"## What Intake data sources are installed?"
61+
"# Explore installed Intake catalogs"
9362
]
9463
},
9564
{
@@ -155,7 +124,7 @@
155124
"cell_type": "markdown",
156125
"metadata": {},
157126
"source": [
158-
"## Normal usage"
127+
"# Reading some data from the catalog"
159128
]
160129
},
161130
{
@@ -212,7 +181,7 @@
212181
"outputs": [],
213182
"source": [
214183
"%%time\n",
215-
"df1 = pd.read_parquet(\"gs://intake.catalyst.coop/test/hourly_emissions_epacems/epacems-2020-ID.parquet\")"
184+
"df1 = pd.read_parquet(f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\")"
216185
]
217186
},
218187
{
@@ -251,7 +220,7 @@
251220
"from pprint import pprint\n",
252221
"import fsspec\n",
253222
"epacems_pq = pq.read_table(\n",
254-
" \"gs://intake.catalyst.coop/test/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
223+
" f\"{os.environ['PUDL_INTAKE_PATH']}/hourly_emissions_epacems/epacems-2020-ID.parquet\",\n",
255224
" filesystem=fsspec.filesystem(\"gs\"),\n",
256225
")\n",
257226
"dtype_dict = {name: dtype for name, dtype in zip(epacems_pq.schema.names, epacems_pq.schema.types)}\n",

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
url="https://github.com/catalyst-cooperative/pudl-catalog",
2121
project_urls={
2222
"Source": "https://github.com/catalyst-cooperative/pudl-catalog",
23+
"Documentation": "https://catalystcoop-pudl-catalog.readthedocs.io",
2324
"Issue Tracker": "https://github.com/catalyst-cooperative/pudl-catalog/issues",
2425
},
2526
classifiers=[

src/pudl_catalog/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
logger.addHandler(logging.NullHandler())
1313

1414
BASE_URLS = {
15-
"gs": "gs://intake.catalyst.coop/test",
15+
"gs": "gs://intake.catalyst.coop/v0.1.0",
1616
# HTTPS access doesn't really work well, so we're hiding it from users for now.
17-
"https": "https://storage.googleapis.com/intake.catalyst.coop/test",
17+
"https": "https://storage.googleapis.com/intake.catalyst.coop/v0.1.0",
1818
}
1919

2020
# Ensure that the user has set the relevant environment variables
@@ -23,6 +23,7 @@
2323
"Environment variable PUDL_INTAKE_PATH is not set. "
2424
f"Defaulting to {BASE_URLS['gs']}"
2525
)
26+
os.environ["PUDL_INTAKE_PATH"] = BASE_URLS["gs"]
2627

2728
if os.getenv("PUDL_INTAKE_CACHE") is None:
2829
logger.info(

0 commit comments

Comments
 (0)