2 changes: 1 addition & 1 deletion pymsis/__init__.py
@@ -7,4 +7,4 @@

__version__ = importlib.metadata.version("pymsis")

__all__ = ["__version__", "Variable", "calculate"]
__all__ = ["Variable", "__version__", "calculate"]
89 changes: 81 additions & 8 deletions pymsis/utils.py
@@ -1,7 +1,10 @@
"""Utilities for obtaining input datasets."""

import csv
import os
import urllib.request
import warnings
from datetime import datetime, timedelta
from io import BytesIO
from pathlib import Path

@@ -14,6 +17,9 @@
_DATA_FNAME: str = "SW-All.csv"
_F107_AP_URL: str = f"https://celestrak.org/SpaceData/{_DATA_FNAME}"
_F107_AP_PATH: Path = Path(pymsis.__file__).parent / _DATA_FNAME
_PARTIAL_DATA_FNAME: str = "SW-Last5Years.csv"
_PARTIAL_F107_AP_URL: str = f"https://celestrak.org/SpaceData/{_PARTIAL_DATA_FNAME}"
_PARTIAL_F107_AP_PATH: Path = Path(pymsis.__file__).parent / _PARTIAL_DATA_FNAME
_DATA: dict[str, npt.NDArray] | None = None


@@ -48,8 +54,60 @@ def download_f107_ap() -> None:
        f.write(req.read())


-def _load_f107_ap_data() -> dict[str, npt.NDArray]:
-    """Load data from disk, if it isn't present go out and download it first."""
def _refresh_f107_ap_data(last_obs_date: np.datetime64) -> None:
    """
    Refresh the existing SW-All file after last_obs_date.

    Parameters
    ----------
    last_obs_date : np.datetime64
        Date of the last observed parameter.
    """
    warnings.warn(
        f"Refreshing data using partial ap and F10.7 data from {_PARTIAL_F107_AP_URL}"
    )
    req = urllib.request.urlopen(_PARTIAL_F107_AP_URL)
    with _PARTIAL_F107_AP_PATH.open("wb") as f:
        f.write(req.read())

    # Keep rows up to and including last_obs_date in existing_data
    with _F107_AP_PATH.open() as f:
        reader = csv.DictReader(f)
        existing_data = [
            row for row in reader if np.datetime64(row["DATE"]) <= last_obs_date
        ]

    # Store rows with new dates in updated_data
    updated_data = []
    if _PARTIAL_F107_AP_PATH.exists():
        with _PARTIAL_F107_AP_PATH.open() as f:
            reader = csv.DictReader(f)
            updated_data = list(reader)

    # Merge existing_data and updated_data into all_data,
    # preferring the newly downloaded rows on date collisions
    all_data = {row["DATE"]: row for row in existing_data}
    all_data.update({row["DATE"]: row for row in updated_data})

    # Sort all_data by date and rewrite the full file
    sorted_data = sorted(all_data.values(), key=lambda x: np.datetime64(x["DATE"]))
    with _F107_AP_PATH.open("w", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=sorted_data[0].keys())
        writer.writeheader()
        writer.writerows(sorted_data)

    os.remove(_PARTIAL_F107_AP_PATH)
Member
Be wary of this because if there is any download error above, this file will be left hanging around.

A few quick thoughts:

  • Do we need to save it to disk, or could you store it in a BytesIO object in memory instead?
  • Would it make sense to save the _DATA arrays with np.savez() instead of the csv files for easier manipulation? (Then you wouldn't need to mess with the DictReader/DictWriter.) A rough sketch of both ideas follows this list.
  • Honestly, now that I'm thinking about it more, is this more headache than it is worth? Maybe we should just use your logic below and re-download the full file for the simplest case. More bandwidth, but at least we are getting the full file built for us rather than assembling it from bits and pieces.
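A rough sketch of the first two bullets, reusing the module constants from this diff; the helper names are hypothetical, not part of the PR:

```python
import urllib.request
from io import BytesIO

import numpy as np


def _download_partial_data_in_memory() -> BytesIO:
    # Hypothetical helper: keep the partial download in memory so a
    # failed request never leaves a half-written file on disk.
    with urllib.request.urlopen(_PARTIAL_F107_AP_URL) as req:
        return BytesIO(req.read())


def _save_data_arrays(data: dict) -> None:
    # Hypothetical helper: persist the parsed arrays directly with
    # np.savez instead of round-tripping through csv.DictReader/DictWriter.
    np.savez(_F107_AP_PATH.with_suffix(".npz"), **data)
```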



def _load_f107_ap_data(end_date: np.datetime64) -> dict[str, npt.NDArray]:
    """
    Load data from disk.

    - If it isn't present, download it first.
    - If it is present but not up to date, refresh it.

    Parameters
    ----------
    end_date : np.datetime64
        Date of the last requested epoch.
    """
    if not _F107_AP_PATH.exists():
        download_f107_ap()

@@ -92,19 +150,34 @@ def _load_f107_ap_data() -> dict[str, npt.NDArray]:
    # so we can't just go back in line lengths)
    with _F107_AP_PATH.open() as fin:
        with BytesIO() as fout:
+           last_obs_date: np.datetime64 | None = None
            for line in fin:
-               if "PRM" in line:
-                   # We don't want the monthly predicted values
-                   continue
-               if ",,,,,,,," in line:
-                   # We don't want lines with missing values
+               if "PRM" in line or ",,,,,,,," in line:
+                   # We don't want the monthly predicted values or missing values
                    continue
+               if ",OBS," in line:
+                   # Capture last observed date
+                   last_obs_date = np.datetime64(line.split(",")[0])
Member
Can we get this from the _DATA numpy arrays? Maybe warn_data contains this information already and we could do something like (have not tried this) _DATA["dates"][~_DATA["warn_data"]][-1].
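If that flag exists, a minimal sketch of the idea (untested, and assuming _DATA carries a boolean "warn_data" mask marking predicted rows alongside "dates"):

```python
def _last_observed_date(data: dict[str, npt.NDArray]) -> np.datetime64:
    # Last date whose row is not flagged as predicted data.
    return data["dates"][~data["warn_data"]][-1]
```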

                fout.write(line.encode("utf-8"))
            fout.seek(0)
            arr = np.loadtxt(
                fout, delimiter=",", dtype=dtype, usecols=usecols, skiprows=1
            )  # type: ignore

    # Check if the file needs to be refreshed after parsing
    if last_obs_date is not None:
        file_mod_time = datetime.fromtimestamp(os.path.getmtime(_F107_AP_PATH))
Member
Do we need file modification time? I think we should just leave that out and only rely on the requested date being greater than the observations we have.
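Inside _load_f107_ap_data, that simplification might look like the following sketch, which drops the mtime throttle entirely (hypothetical, not what this diff implements):

```python
    # Refresh purely on the date comparison: the requested end_date is
    # beyond the last observation currently on disk.
    if last_obs_date is not None and last_obs_date < end_date:
        _refresh_f107_ap_data(last_obs_date)
        return _load_f107_ap_data(end_date)
```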

        if last_obs_date < end_date and datetime.now() - file_mod_time >= timedelta(
            hours=3
        ):
            # Refresh file if:
            # - requested date is beyond the end of the current file
            # - file hasn't been refreshed in the last 3 hours
            _refresh_f107_ap_data(last_obs_date)

            # Re-parse the file after refresh
            return _load_f107_ap_data(end_date)

    # transform each day's 8 3-hourly ap values into a single column
    ap = np.empty(len(arr) * 8, dtype=float)
    daily_ap = arr["Ap"].astype(float)
@@ -207,7 +280,7 @@ def get_f107_ap(dates: npt.ArrayLike) -> tuple[npt.NDArray, npt.NDArray, npt.NDArray]:
    | prior to current time
    """
    dates = np.asarray(dates, dtype=np.datetime64)
-   data = _DATA or _load_f107_ap_data()
+   data = _DATA or _load_f107_ap_data(dates[-1])
Member
If we have a long-running process with _DATA already loaded, then this won't update the data for us. I think the logic should probably be a new if block here, after the original loading, instead?

Suggested change
-   data = _DATA or _load_f107_ap_data(dates[-1])
+   # Load/download data right away.
+   data = _DATA or _load_f107_ap_data()
+   if dates[-1] > data["dates"][-1]:
+       data = _new_function_to_download_recent_data()
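One possible shape for the helper named in this suggestion; the name comes from the comment itself, and this is a hypothetical sketch rather than part of the PR:

```python
def _new_function_to_download_recent_data() -> dict[str, npt.NDArray]:
    # Hypothetical: merge the recent partial file into the on-disk data
    # starting from the last date already cached, then reload and
    # refresh the module-level cache.
    global _DATA
    _refresh_f107_ap_data(_DATA["dates"][-1])
    _DATA = _load_f107_ap_data()
    return _DATA
```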


data_start = data["dates"][0]
data_end = data["dates"][-1]