diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f30fe01..79ed37a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,44 +1,52 @@ -exclude: ".*(.fits|.fts|.fit|.txt|.csv)$" +exclude: ".*(.csv|.fits|.fts|.fit|.header|.jpg|.jpeg|.json|.png|.svg)$" repos: - - repo: https://github.com/PyCQA/docformatter - rev: v1.7.7 - hooks: - - id: docformatter - args: [--in-place, --pre-summary-newline, --make-summary-multi] - repo: https://github.com/PyCQA/autoflake rev: v2.3.1 hooks: - id: autoflake - args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variable'] - exclude: ".*(.fits|.fts|.fit|.txt|tca.*|extern.*|.rst|.md|docs/conf.py)$" + args: + [ + "--in-place", + "--remove-all-unused-imports", + "--remove-unused-variable", + ] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: 'v0.14.11' + rev: "v0.14.14" + hooks: + - id: ruff-check + args: ["--fix", "--unsafe-fixes", "--preview"] + - id: ruff-format + args: ["--preview"] + - repo: https://github.com/PyCQA/docformatter + rev: master + hooks: + - id: docformatter + args: ["--make-summary-multi-line", "--pre-summary-newline", "-ri"] + - repo: https://github.com/JoC0de/pre-commit-prettier + rev: v3.8.1 + hooks: + - id: prettier + types_or: [css, scss, javascript, rst, json, yaml, toml, markdown] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 hooks: - - id: ruff - args: ['--fix', '--unsafe-fixes'] - - repo: https://github.com/psf/black-pre-commit-mirror - rev: 25.12.0 - hooks: - - id: black - - repo: https://github.com/PyCQA/isort - rev: 7.0.0 - hooks: - - id: isort - - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v6.0.0 - hooks: - id: check-ast - id: check-case-conflict - - id: trailing-whitespace - - id: mixed-line-ending - args: ['--fix=lf'] - - id: end-of-file-fixer + - id: check-json + - id: check-toml - id: check-yaml - id: debug-statements + - id: end-of-file-fixer + - id: mixed-line-ending + args: ["--fix=lf"] + - id: trailing-whitespace - repo: https://github.com/codespell-project/codespell rev: v2.4.1 hooks: - id: codespell + additional_dependencies: + - tomli + args: ["--write-changes"] ci: autofix_prs: false autoupdate_schedule: "quarterly" diff --git a/README.md b/README.md index a856151..d3d82b4 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ -# SDO Non-nominal Timeline # +# SDO Non-nominal Timeline -## Warning ## +## Warning This captures data from multiple sources and is not guaranteed to be accurate. It is intended to be a rough guide to the non-nominal periods of SDO. -## Requirements ## +## Requirements Requirements are in `requirements.txt`. -## Notes ## +## Notes Things to note: diff --git a/config.py b/config.py index d20fc9d..884f772 100644 --- a/config.py +++ b/config.py @@ -1,12 +1,16 @@ -from datetime import datetime +""" +Basic configuration file for SDO data scraping. +""" + +from datetime import UTC, datetime __all__ = ["CY_END", "DATASETS", "MAP_4", "TIME_FORMATS"] -CY_END = int(str(datetime.now().year + 1)[2:]) +CY_END = int(str(datetime.now(tz=UTC).year + 1)[2:]) TIME_FORMATS = [ "%d-%b-%y %H:%M:%S", # 06-Apr-10 21:11:55 "%Y.%m.%d", # 2010.05.18 - "%y-%j-%H:%M:%S", # YY-DOY-HH:MM:SS + "%y-%j-%H:%M:%S", # YY-DOY-HH:MM:SS, 10-096-06:26:28 "%Y.%m.%d_%H:%M:%S", # 2010.11.10_06:01:20 "%d-%b-%Y %H:%M:%S", # 9-Apr-2010 07:30:00 ] @@ -33,26 +37,14 @@ 19: "Misc Tests/Special Ops", } DATASETS = { - "hmi_obs_cov": { - "fURL": "http://jsoc.stanford.edu/doc/data/hmi/cov2/cov{}.html", + "jsocobs_info": { + "fURL": "https://aia.lmsal.com/public/jsocobs_info{}.html", "RANGE": range(10, CY_END), - "MONTH_RANGE": range(1, 13), }, "spacecraft_night": { "URL": "https://aia.lmsal.com/public/sdo_spacecraft_night.txt", "SKIP_ROWS": [0, 1, 2, 3], }, - "jsocobs_info": { - "fURL": "https://aia.lmsal.com/public/jsocobs_info{}.html", - "RANGE": range(10, CY_END), - }, - # This site has a whole range of text files and its easier to scrape the urls that way. - # Assumption is that each text file on this page has the same structure - "jsocinst_calibrations": { - "URL": "https://aia.lmsal.com/public/jsocinst_calibrations.html", - "SKIP_ROWS": [0], - "SCRAPE": True, - }, "text_block_1": { "URL": "./data_1.txt", }, @@ -65,4 +57,16 @@ "text_block_4": { "URL": "./data_4.txt", }, + "hmi_obs_cov": { + "fURL": "http://jsoc.stanford.edu/doc/data/hmi/cov2/cov{}.html", + "RANGE": range(10, CY_END), + "MONTH_RANGE": range(1, 13), + }, + # This site has a whole range of text files and its easier to scrape the urls that way. + # Assumption is that each text file on this page has the same structure + "jsocinst_calibrations": { + "URL": "https://aia.lmsal.com/public/jsocinst_calibrations.html", + "SKIP_ROWS": [0], + "SCRAPE": True, + }, } diff --git a/pyproject.toml b/pyproject.toml deleted file mode 100644 index 4b04e71..0000000 --- a/pyproject.toml +++ /dev/null @@ -1,92 +0,0 @@ -[tool.black] -line-length = 120 -include = '\.pyi?$' -exclude = ''' -( - /( - \.eggs - | \.git - | \.mypy_cache - | \.tox - | \.venv - | _build - | buck-out - | build - | dist - | docs - | .history - )/ - | ah_bootstrap.py -) -''' -target-version = ['py39'] - -[tool.ruff] -# Allow unused variables when underscore-prefixed. -dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" -target-version = "py39" -line-length = 120 -exclude=[ - ".git,", - "__pycache__", - "build", - "tools/**", -] -select = [ - "E", - "F", - "W", - "UP", - "PT", - "RET", - "TID", - "PLE", - "NPY", - "RUF", - "PGH", - "PTH", - "BLE", - "FBT", - "B", - "A", - "COM", - "C4", - "T20", - "RSE", - "ERA", -] -fixable = [ - "E", - "F", - "W", - "UP", - "PT", - "RET", - "TID", - "PLE", - "NPY", - "RUF", - "PGH", - "PTH", - "BLE", - "FBT", - "B", - "A", - "COM", - "C4", - "T20", - "RSE", - "ERA", -] -extend-ignore = [ - "E501", - "BLE001", - -] - -[tool.ruff.pydocstyle] -convention = "numpy" - -[tool.ruff.format] -quote-style = "double" -indent-style = "space" diff --git a/requirements.txt b/requirements.txt index 2081637..62a087f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ beautifulsoup4==4.14.3 -pandas==2.3.3 +pandas==3.0.0 requests==2.32.5 loguru==0.7.3 diff --git a/ruff.toml b/ruff.toml new file mode 100644 index 0000000..c06b5dd --- /dev/null +++ b/ruff.toml @@ -0,0 +1,28 @@ +# Allow unused variables when underscore-prefixed. +lint.dummy-variable-rgx = "^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$" +target-version = "py314" +line-length = 120 +extend-exclude=[ + "__pycache__", + "build", + "tools/**", +] +lint.select = [ + "ALL", +] +lint.ignore = [ + "TD", + "FIX", + "CPY001", +] +lint.extend-ignore = [ + "COM812", # The following rule may cause conflicts when used with the formatter +] + +[lint.pydocstyle] +convention = "numpy" + +[format] +docstring-code-format = true +indent-style = "space" +quote-style = "double" diff --git a/scraper.py b/scraper.py index 2e56fdb..b3d6d06 100644 --- a/scraper.py +++ b/scraper.py @@ -1,7 +1,12 @@ +""" +Simple scraper for SDO event timeline data. +""" + +import contextlib +import sys from datetime import datetime from itertools import product from pathlib import Path -from typing import Optional import pandas as pd import requests @@ -10,10 +15,13 @@ from config import DATASETS, MAP_4, TIME_FORMATS +logger.remove() +logger.add(sys.stderr, level="INFO") + -def _format_date(date: str, year: Optional[str], _hack: Optional[datetime] = None) -> pd.Timestamp: +def _format_date(date: str, year: str | None = None, start_date: datetime | None = None) -> pd.Timestamp: """ - Formats the given date. + Format the given date. Parameters ---------- @@ -21,25 +29,35 @@ def _format_date(date: str, year: Optional[str], _hack: Optional[datetime] = Non Date string from the html file. year : str, optional The year of the provided dates, if it is not present in the date. - _hack : datetime.datetime, optional - A workaround for some dates, by default None. + start_date : datetime.datetime + The start date of the dataset. + Defaults to None. Returns ------- pandas.Timestamp New date. + + Raises + ------ + ValueError + If ``start_date`` is not provided but required. """ if year is None: return pd.Timestamp(date) # Only date e., '11/2' assuming month/day - if len(date) in [4, 5]: + if len(date) in {4, 5}: # Deal with only times with a hack if "/" not in date: - new_date = pd.Timestamp(str(_hack.date()) + " " + date) + if start_date is None: + msg = f"Start date is required for this format: {date}" + raise ValueError(msg) + new_date = pd.Timestamp(str(start_date.date()) + " " + date) else: new_date = pd.Timestamp(f"{year}-{date}") + # Only time - e.g., '18:15' # Year missing - e.g., '12/10 18:15' - elif len(date) in [9, 10, 11, 12]: + elif len(date) in {9, 10, 11, 12}: new_date = date.split(" ") if "25" in new_date[1]: # This is a hack for 25 hour time @@ -51,7 +69,7 @@ def _format_date(date: str, year: Optional[str], _hack: Optional[datetime] = Non else: try: # This catches 2010.05.01 - 02 - new_date = pd.Timestamp(date.split("-")[0]) + new_date = pd.Timestamp(date.split("-", maxsplit=1)[0]) except ValueError: idx = len(date) // (len(date) // 10) new_date = pd.Timestamp(f"{year}-{date[:idx]}") @@ -60,7 +78,7 @@ def _format_date(date: str, year: Optional[str], _hack: Optional[datetime] = Non def _clean_date(date: str, *, extra_replace: bool = False) -> str: """ - Removes any non-numeric characters from the date. + Remove any non-numeric characters from the date. Parameters ---------- @@ -75,7 +93,8 @@ def _clean_date(date: str, *, extra_replace: bool = False) -> str: Cleaned date. """ date = ( - " ".join(date.split()) + " " + .join(date.split()) .replace("UT", "") .replace(" TBD", "") .replace("ongoing", "") @@ -105,17 +124,29 @@ def _process_time(data: pd.DataFrame, column: int = 0) -> pd.DataFrame: The dataframe with timestamps. column : int, optional The column to process, by default 0. + + Returns + ------- + pd.DataFrame + The dataframe with reformatted timestamps. + + Raises + ------ + ValueError + If no suitable time format is found. """ for time_format in TIME_FORMATS: try: - data.iloc[:, column] = data.iloc[:, column].apply( - lambda x, time_format=time_format: datetime.strptime(x, time_format), + data[data.columns[column]] = data.iloc[:, column].apply( + lambda x, time_format=time_format: datetime.strptime(x, time_format) # NOQA: DTZ007 ) - return data - except Exception: - pass - else: - raise ValueError(f"Could not find a suitable time format: {data.iloc[0, column]}") + return data # NOQA: TRY300 + except Exception as e: # NOQA: BLE001 + logger.debug(f"Time format {time_format} did not work for {data.iloc[0, column]} for column {column}: {e}") + msg = f"Could not find a suitable time format: {data.iloc[0, column]} or failed assignment to DataFrame." + raise ValueError( + msg, + ) def _process_end_time(data: pd.DataFrame, column: int = 1) -> pd.DataFrame: @@ -125,16 +156,16 @@ def _process_end_time(data: pd.DataFrame, column: int = 1) -> pd.DataFrame: ) # Increment date if end time is before start time timedelta = [ - pd.Timedelta(days=1) if x < y else pd.Timedelta(days=0) for x, y in zip(data.iloc[:, 0], data.iloc[:, 1]) + pd.Timedelta(days=1) if pd.Timestamp(x) < pd.Timestamp(y) else pd.Timedelta(days=0) + for x, y in zip(data.iloc[:, 0], data.iloc[:, 1], strict=False) ] - data[data.columns[column]] = data[data.columns[column]] + pd.to_timedelta(timedelta) + data[data.columns[column]] += pd.to_timedelta(timedelta) return data def _process_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: """ - Certain online text files have no comments or have a comment in the third - column. + Certain files have no comments or have a comment in the third column. Parameters ---------- @@ -155,11 +186,11 @@ def _process_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: else: data["Instrument"] = "SDO" if "Start Date/Time" in data.columns: - data.rename(columns={"Start Date/Time": "Start Time"}, inplace=True) + data = data.rename(columns={"Start Date/Time": "Start Time"}) if "FSN" in data.columns: - data.rename(columns={"FSN": "Comment"}, inplace=True) + data = data.rename(columns={"FSN": "Comment"}) if "Unnamed: 2" in data.columns: - data.rename(columns={"Unnamed: 2": "Comment"}, inplace=True) + data = data.rename(columns={"Unnamed: 2": "Comment"}) if data.columns[-1] == "Comment": data["Comment"].fillna(pd.read_fwf(filepath).columns[0]) else: @@ -170,8 +201,9 @@ def _process_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: def _reformat_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: """ - Due to the fact that the text files are not consistent, we need to reformat - them. + Due to the fact that the text files are not consistent. + + We need to reformat them. Parameters ---------- @@ -189,14 +221,12 @@ def _reformat_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: data["Start Time"] = [None] * len(data) data["End Time"] = [None] * len(data) for i, row in enumerate(data[0].str.split()): - data["Start Time"][i] = row[0] - data["End Time"][i] = row[1] - data.drop(columns=[0], inplace=True) + data.iloc[i, data.columns.get_loc("Start Time")] = row[0] + data.iloc[i, data.columns.get_loc("End Time")] = row[1] + data = data.drop(columns=[0]) data = data.iloc[:, [1, 2, 0]] data.columns = ["Start Time", "End Time", "Comment"] - elif "_2" in filepath: - data.columns = ["Start Time", "Comment"] - elif "_3" in filepath: + elif "_2" in filepath or "_3" in filepath: data.columns = ["Start Time", "Comment"] elif "_4" in filepath: data = data.iloc[:, [1, 0]] @@ -205,9 +235,9 @@ def _reformat_data(data: pd.DataFrame, filepath: str) -> pd.DataFrame: return data -def process_txt(filepath: str, skip_rows: Optional[list], data: pd.DataFrame) -> pd.DataFrame: +def process_txt(filepath: str, skip_rows: list | None, data: pd.DataFrame) -> pd.DataFrame: """ - Processes a text file. + Process a text file. Parameters ---------- @@ -235,16 +265,14 @@ def process_txt(filepath: str, skip_rows: Optional[list], data: pd.DataFrame) -> ) if "sdo_spacecraft_night" not in filepath: new_data = _process_end_time(new_data) - if len(new_data.columns) in [2, 3]: + if len(new_data.columns) in {2, 3}: new_data = _process_data(new_data, filepath) - elif len(new_data.columns) > 3: + elif len(new_data.columns) > 3: # NOQA: PLR2004 logger.debug(f"Unexpected number of columns for {filepath}, dropping all but first two") new_data = new_data.iloc[:, [0, 1]] new_data.columns = ["Start Time", "End Time"] - try: + with contextlib.suppress(Exception): new_data = _process_time(new_data, 1) - except Exception: - pass new_data = _process_data(new_data, filepath) else: new_data = pd.read_csv(filepath, header=None, sep=" ", skiprows=skip_rows, engine="python") @@ -252,19 +280,16 @@ def process_txt(filepath: str, skip_rows: Optional[list], data: pd.DataFrame) -> new_data = _process_time(new_data) new_data["Instrument"] = new_data["Comment"].apply(lambda x: "AIA" if "AIA" in x else None) new_data["Instrument"] = new_data["Comment"].apply(lambda x: "HMI" if "HMI" in x else None) - new_data["Source"] = filepath.split("/")[-1] + new_data["Source"] = filepath.rsplit("/", maxsplit=1)[-1] data = pd.concat([data, new_data], ignore_index=True) - if data.empty: - data = new_data - else: - data = pd.concat([data, new_data], ignore_index=True) - new_data["Source"] = filepath.split("/")[-1] + data = new_data if data.empty else pd.concat([data, new_data], ignore_index=True) + new_data["Source"] = filepath.rsplit("/", maxsplit=1)[-1] return pd.concat([data, new_data], ignore_index=True) -def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame: +def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame: # NOQA: PLR0914 """ - Processes an html file. + Process an html file. Parameters ---------- @@ -278,8 +303,9 @@ def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame: pd.DataFrame Dataframe with the data from the html file. """ - request = requests.get(url) - if request.status_code == 404: + request = requests.get(url, timeout=60) + if request.status_code == 404: # NOQA: PLR2004 + logger.warning(f"URL not found: {url}") return data soup = BeautifulSoup(request.text, "html.parser") table = soup.find_all("table") @@ -302,17 +328,18 @@ def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame: return data instrument = ["HMI" if "HMI" in new_row else "AIA" if "AIA" in new_row else "SDO" for new_row in text] comment = [new_row.replace("\n", " ") for new_row in text] + new_dates = dates.copy() for date in dates: # Hack workaround for http://jsoc.stanford.edu/doc/data/hmi/cov2/cov202503.html # where the date just "multiple" if "multiple" in date: - dates[dates.index(date)] = date.replace("multiple", dates[0]) - start_dates = [(_format_date(_clean_date(date), year)) for date in dates] - end_dates = [None] * len(dates) + new_dates[new_dates.index(date)] = date.replace("multiple", new_dates[0]) + start_dates = [(_format_date(_clean_date(date), year)) for date in new_dates] + end_dates = [None] * len(new_dates) new_data = pd.DataFrame( {"Start Time": start_dates, "End Time": end_dates, "Instrument": instrument, "Comment": comment}, ) - new_data["Source"] = url.split("/")[-1] + new_data["Source"] = url.rsplit("/", maxsplit=1)[-1] data = pd.concat([data, new_data]) else: for row in rows[1:]: @@ -336,7 +363,7 @@ def process_html(url: str, data: pd.DataFrame) -> pd.DataFrame: new_data = pd.Series( {"Start Time": start_date, "End Time": end_date, "Instrument": instrument, "Comment": comment}, ) - new_data["Source"] = url.split("/")[-1] + new_data["Source"] = url.rsplit("/", maxsplit=1)[-1] data = pd.concat([data, pd.DataFrame([new_data], columns=new_data.index)]).reset_index(drop=True) return data @@ -356,7 +383,7 @@ def scrape_url(url: str) -> list: List of all the urls scraped. """ base_url = str(Path(url).parent).replace("https:/", "https://") - request = requests.get(url) + request = requests.get(url, timeout=60) soup = BeautifulSoup(request.text, "html.parser") urls = [] for link in soup.find_all("a"): @@ -439,7 +466,8 @@ def drop_duplicates(data: pd.DataFrame) -> pd.DataFrame: elif "html" in url: final_timeline = process_html(url, final_timeline) else: - raise ValueError(f"Unknown file type for {url}") + msg = f"Unknown file type for {url}" + raise ValueError(msg) logger.info(f"{len(final_timeline.index)} rows in total") final_timeline = final_timeline.sort_values("Start Time")