Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
"metadata": {},
"outputs": [],
"source": [
"day_obs = 20250601\n",
"number_of_days = 14"
"day_obs_start = 20250501\n",
"day_obs_end = 20250515"
]
},
{
Expand Down Expand Up @@ -65,6 +65,12 @@
"m1m3_inside_cell_sal_index = 113\n",
"dome_inside_sal_index = 111\n",
"\n",
"# Number of columns to consider - Maximum 336\n",
"N_COLS = 45\n",
"\n",
"# CLT TimeZone Manual Offset\n",
"clt_tz_offset = -4 # hours\n",
"\n",
"# Set global font size for labels, titles, and ticks\n",
"plt.rcParams.update(\n",
" {\n",
Expand Down Expand Up @@ -101,67 +107,6 @@
"metadata": {},
"outputs": [],
"source": [
"def get_latest_forecast(_df, start_time=None, end_time=None):\n",
" \"\"\"\n",
" From a reformatted forecast DataFrame, selects forecast rows\n",
" where forecast_timestamp falls between two consecutive published timestamps.\n",
"\n",
" Parameters\n",
" ----------\n",
" _df : pd.DataFrame\n",
" DataFrame with columns: published_timestamp, forecast_timestamp, temperature, temperatureSpread.\n",
" start_time : datetime-like, optional\n",
" Minimum forecast_timestamp to include.\n",
" end_time : datetime-like, optional\n",
" Maximum forecast_timestamp to include.\n",
"\n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" DataFrame indexed by forecast_timestamp, containing only selected forecasts.\n",
" \"\"\"\n",
" if isinstance(start_time, Time):\n",
" start_time = start_time.to_datetime().replace(tzinfo=None)\n",
" if isinstance(end_time, Time):\n",
" end_time = end_time.to_datetime().replace(tzinfo=None)\n",
"\n",
" _df = _df.copy()\n",
" _df[\"forecast_timestamp\"] = pd.to_datetime(_df[\"forecast_timestamp\"])\n",
" _df[\"published_timestamp\"] = pd.to_datetime(_df[\"published_timestamp\"])\n",
"\n",
" _df = _df.sort_values(\"published_timestamp\")\n",
"\n",
" # Container for filtered rows\n",
" filtered = []\n",
"\n",
" # Group forecasts between each pair of published timestamps\n",
" for i in range(len(_df.published_timestamp.unique()) - 1):\n",
" t0 = _df.published_timestamp.unique()[i]\n",
" t1 = _df.published_timestamp.unique()[i + 1]\n",
" mask = (\n",
" (_df[\"forecast_timestamp\"] >= t0)\n",
" & (_df[\"forecast_timestamp\"] < t1)\n",
" & (_df[\"published_timestamp\"] == t0)\n",
" )\n",
" filtered.append(_df[mask])\n",
"\n",
" result_df = pd.concat(filtered, ignore_index=True)\n",
"\n",
" # Filter by time range if specified\n",
" if start_time is not None:\n",
" result_df = result_df[\n",
" result_df[\"forecast_timestamp\"] >= pd.to_datetime(start_time)\n",
" ]\n",
" if end_time is not None:\n",
" result_df = result_df[\n",
" result_df[\"forecast_timestamp\"] <= pd.to_datetime(end_time)\n",
" ]\n",
"\n",
" # Set index and return\n",
" result_df = result_df.set_index(\"forecast_timestamp\")\n",
" return result_df.sort_index()\n",
"\n",
"\n",
"def query_ess_inside_dome(client, start_time, end_time):\n",
" \"\"\"\n",
" Query the actual temperature inside the dome.\n",
Expand Down Expand Up @@ -230,25 +175,22 @@
" if day_obs_end is None:\n",
" day_obs_end = day_obs_start\n",
"\n",
" start_time = getDayObsStartTime(day_obs_start)\n",
" end_time = getDayObsEndTime(day_obs_end)\n",
"\n",
" NPOINTS = 336\n",
"\n",
" # The timedelta below helps including the first and last data point\n",
" start_time = getDayObsStartTime(day_obs_start) - pd.to_timedelta(\"30min\")\n",
" end_time = getDayObsEndTime(day_obs_end) + pd.to_timedelta(\"30min\")\n",
" \n",
" _df = getEfdData(\n",
" client=client,\n",
" topic=\"lsst.sal.WeatherForecast.hourlyTrend\",\n",
" columns=[f\"timestamp{i}\" for i in range(NPOINTS)]\n",
" + [f\"temperature{i}\" for i in range(NPOINTS)]\n",
" + [f\"temperatureSpread{i}\" for i in range(NPOINTS)],\n",
" columns=[f\"timestamp{i}\" for i in range(N_COLS)]\n",
" + [f\"temperature{i}\" for i in range(N_COLS)]\n",
" + [f\"temperatureSpread{i}\" for i in range(N_COLS)],\n",
" begin=start_time,\n",
" end=end_time,\n",
" )\n",
"\n",
" for i in range(NPOINTS):\n",
" _df[f\"timestamp{i}\"] = pd.to_datetime(\n",
" _df[f\"timestamp{i}\"], unit=\"s\", utc=True\n",
" ).dt.strftime(\"%Y-%m-%dT%H:%M:%S.%fZ\")\n",
" for i in range(N_COLS):\n",
" _df[f\"timestamp{i}\"] = pd.to_datetime(_df[f\"timestamp{i}\"], unit=\"s\", utc=True)\n",
"\n",
" return _df\n",
"\n",
Expand All @@ -269,53 +211,64 @@
" _df[\"index\"] = _df[\"key\"].str.extract(r\"(\\d+)$\")[0].astype(\"Int64\")\n",
"\n",
" # Pivot to final table\n",
" _df = _df.pivot(index=\"index\", columns=\"field\", values=\"value\").reset_index(\n",
" drop=True\n",
" )\n",
" _df = _df.pivot(index=\"index\", columns=\"field\", values=\"value\").reset_index(drop=True)\n",
" _df[\"timestamp\"] = pd.to_datetime(_df[\"timestamp\"])\n",
" _df = _df.where(pd.notnull(_df), pd.NA)\n",
"\n",
" mask = _df.timestamp < (_df.timestamp[0] + pd.Timedelta(hours=48))\n",
" _df = _df[mask]\n",
" _df.columns.name = None\n",
"\n",
" return _df\n",
"\n",
"\n",
"def unpack_hourly_weather_forecast_dataframe(df):\n",
"def unpack_hourly_weather_forecast_matching_timestamps(_df):\n",
" \"\"\"\n",
" Reformats a temperature forecast DataFrame into long format.\n",
" Use the published data to select the timestamps and temperatures \n",
" we want to include in the output dataframe.\n",
"\n",
" Parameters\n",
" ----------\n",
" df : pd.DataFrame\n",
" DataFrame with index as publication time and columns temperatureX and temperatureSpreadX.\n",
"\n",
" Returns\n",
" -------\n",
" pd.DataFrame\n",
" Reformatted DataFrame with columns:\n",
" - published_timestamp\n",
" - forecast_timestamp\n",
" - temperature\n",
" - temperatureSpread\n",
" _df : pd.DataFrame\n",
" The weather forecast data. \n",
" The index must contain the timestamp of the published data. \n",
" It must have at least 40 columns for the timestamp,\n",
" temperature, and temperatureSpread.\n",
" \"\"\"\n",
" records = []\n",
"\n",
" for published_time, row in df.iterrows():\n",
" for i in range(336):\n",
" temp = row.get(f\"temperature{i}\", None)\n",
" spread = row.get(f\"temperatureSpread{i}\", None)\n",
" forecast_time = published_time + pd.Timedelta(hours=i)\n",
" records.append(\n",
" {\n",
" \"published_timestamp\": published_time,\n",
" \"forecast_timestamp\": forecast_time,\n",
" \"temperature\": temp,\n",
" \"temperatureSpread\": spread,\n",
" }\n",
" )\n",
"\n",
" return pd.DataFrame(records)"
" # Round published time to make it easier to select good data\n",
" _df.index = _df.index.round(\"h\")\n",
"\n",
" # Exclude rows published at times different from expected.\n",
" _df = _df[_df.index.hour.isin([4, 16])]\n",
"\n",
" # Columns that starts with `timestamp`\n",
" cols = [col for col in _df.columns if col.startswith(\"timestamp\")]\n",
"\n",
" # Create empty dataframe\n",
" new_df = None\n",
"\n",
" # Extract time range associated with the time between published times\n",
" for published_time, row in _df[cols].iterrows():\n",
"\n",
" # Get the beginning of the data corresponding to this time range\n",
" index_start = (row - published_time).abs().argmin()\n",
Copy link

Copilot AI Jul 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Computing index_start this way returns a column label string instead of an integer offset, which will break the subsequent index_end calculation; consider extracting the numeric suffix or using positional indexing.

Suggested change
" index_start = (row - published_time).abs().argmin()\n",
" index_start = int((row - published_time).abs().argmin().lstrip('timestamp'))\n",

Copilot uses AI. Check for mistakes.
"\n",
" # Number of hours between published forecasts is always the same\n",
" index_end = index_start + 12\n",
"\n",
" new_cols = (\n",
" [f\"timestamp{i}\" for i in range(index_start, index_end)] + \n",
" [f\"temperature{i}\" for i in range(index_start, index_end)] + \n",
" [f\"temperatureSpread{i}\" for i in range(index_start, index_end)]\n",
" )\n",
" \n",
" sub_df = _df[new_cols]\n",
" sub_df = unpack_hourly_weather_forecast(sub_df.loc[published_time])\n",
" sub_df[\"published_time\"] = published_time\n",
"\n",
" if new_df is not None:\n",
" new_df = pd.concat([new_df, sub_df])\n",
" else:\n",
" new_df = sub_df\n",
"\n",
" return new_df"
Comment on lines +266 to +271
Copy link

Copilot AI Jul 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Repeatedly concatenating DataFrames inside a loop can be inefficient; consider collecting each sub_df in a list and performing one concat after the loop.

Suggested change
" if new_df is not None:\n",
" new_df = pd.concat([new_df, sub_df])\n",
" else:\n",
" new_df = sub_df\n",
"\n",
" return new_df"
" sub_dfs.append(sub_df)\n",
"\n",
" new_df = pd.concat(sub_dfs, ignore_index=True)\n",
" return new_df\n",

Copilot uses AI. Check for mistakes.
]
},
{
Expand All @@ -337,15 +290,75 @@
"Once near 4 UTC and once near 16h UTC."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aade2cde-4e54-40c4-8d4f-2bfc29227c0d",
"metadata": {},
"outputs": [],
"source": [
"t_start = getDayObsStartTime(day_obs_start)\n",
"t_end = getDayObsEndTime(day_obs_end)\n",
"\n",
"print(f\" Query datasets between {t_start.isot} and {t_end.isot}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09292bea-dc00-4003-b5a6-25cd7be9b74f",
"metadata": {},
"outputs": [],
"source": [
"df_hourly_weather_forecast = query_hourly_weather_forecast(efd_client, day_obs_start, day_obs_end)\n",
"df_hourly_weather_forecast.head()"
]
},
{
"cell_type": "markdown",
"id": "a916437e-2992-4972-9324-bf7d54591528",
"metadata": {},
"source": [
"Assuming that the timestamps are all correct, let's do some further analysis to understand the data. \n",
"First, let's compare two forecast rows that start at 00h UTC on that same day. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f061d145-5f63-4495-a0d2-616958f835a9",
"metadata": {},
"outputs": [],
"source": [
"_df1 = unpack_hourly_weather_forecast(df_hourly_weather_forecast.iloc[0])\n",
"_df2 = unpack_hourly_weather_forecast(df_hourly_weather_forecast.iloc[1])\n",
"\n",
"title = f\"Forecast Comparison for {df_hourly_weather_forecast[\"timestamp0\"].iloc[0]}\"\n",
"fig, ax = plt.subplots(num=title, ncols=1, nrows=1)\n",
"\n",
"ax.plot(_df1.timestamp, _df1.temperature, label=f\"Published at {df_hourly_weather_forecast.index[0]}\")\n",
"ax.plot(_df2.timestamp, _df2.temperature, label=f\"Published at {df_hourly_weather_forecast.index[1]}\")\n",
"\n",
"ax.set_xlabel(\"Time [UTC]\")\n",
"ax.set_ylabel(\"Temperature (ºC)\")\n",
"ax.legend()\n",
"ax.xaxis.set_major_formatter(mdates.DateFormatter(\"%Y-%m-%d %H:%M\"))\n",
"\n",
"fig.suptitle(title)\n",
"fig.autofmt_xdate()\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a20f54e-c166-47e8-9982-00b36c3189a1",
"metadata": {},
"outputs": [],
"source": [
"df_forecast = unpack_hourly_weather_forecast(\n",
" query_hourly_weather_forecast(efd_client, day_obs).iloc[0]\n",
"df_forecast = unpack_hourly_weather_forecast_matching_timestamps(\n",
" query_hourly_weather_forecast(efd_client, day_obs_start, day_obs_start+2)\n",
")\n",
"\n",
"df_weather_station = query_ess_weather_station(\n",
Expand Down Expand Up @@ -397,17 +410,21 @@
"metadata": {},
"outputs": [],
"source": [
"title = f\"MeteoBlue Reliability Study - {day_obs}\"\n",
"title = f\"MeteoBlue Reliability Study - {day_obs_start}\"\n",
"tz_offset = pd.to_timedelta(f\"{clt_tz_offset}h\")\n",
"\n",
"fig, ax = plt.subplots(num=title, ncols=1, nrows=1)\n",
"\n",
"ax.plot(df_forecast.timestamp, df_forecast.temperature, label=\"MeteoBlue Hourly Trend\")\n",
"ax.plot(df_forecast.timestamp - tz_offset, df_forecast.temperature, \"C0-\", label=\"MeteoBlue Hourly Trend\")\n",
"ax.plot(df_forecast.timestamp - tz_offset, df_forecast.temperature - df_forecast.temperatureSpread, \"C0-\", alpha=0.3)\n",
"ax.plot(df_forecast.timestamp - tz_offset, df_forecast.temperature + df_forecast.temperatureSpread, \"C0-\", alpha=0.3)\n",
"ax.plot(\n",
" df_weather_station.index,\n",
" df_weather_station[\"mean\"],\n",
" \"C1-\",\n",
" label=\"Weather Station Temperature\",\n",
")\n",
"ax.plot(df_inside_dome.index, df_inside_dome[\"mean\"], label=\"Inside Dome Temperature\")\n",
"ax.plot(df_inside_dome.index, df_inside_dome[\"mean\"], \"C2-\", label=\"Inside Dome Temperature\")\n",
"\n",
"ax.set_xlabel(\"Time [UTC]\")\n",
"ax.set_ylabel(\"Temperature (ºC)\")\n",
Expand All @@ -418,7 +435,7 @@
"fig.autofmt_xdate()\n",
"fig.tight_layout()\n",
"\n",
"plt.savefig(f\"{title}.png\")\n",
"plt.savefig(f\"{title}_t_offset.png\")\n",
"plt.show()"
]
},
Expand All @@ -437,18 +454,14 @@
"metadata": {},
"outputs": [],
"source": [
"time_start = getDayObsStartTime(day_obs) - pd.Timedelta(number_of_days, unit=\"days\")\n",
"time_end = getDayObsEndTime(day_obs)\n",
"time_start = getDayObsStartTime(day_obs_start)\n",
"time_end = getDayObsEndTime(day_obs_end)\n",
"\n",
"day_obs_start = getDayObsForTime(time_start)\n",
"day_obs_end = day_obs\n",
"print(f\"Query data from {day_obs_start} to {day_obs_end}\")\n",
"\n",
"df_forecast = get_latest_forecast(\n",
" unpack_hourly_weather_forecast_dataframe(\n",
"df_forecast = unpack_hourly_weather_forecast_matching_timestamps(\n",
" query_hourly_weather_forecast(efd_client, day_obs_start, day_obs_end)\n",
" )\n",
")\n",
"\n",
"df_weather_station = query_ess_weather_station(efd_client, time_start, time_end)\n",
"\n",
Expand All @@ -463,17 +476,21 @@
"outputs": [],
"source": [
"title = f\"Hourly Forecast from {day_obs_start} to {day_obs_end}\"\n",
"tz_offset = pd.to_timedelta(f\"{clt_tz_offset}h\")\n",
"\n",
"fig, ax = plt.subplots(num=title)\n",
"\n",
"# Plot forecast\n",
"ax.plot(df_forecast[\"temperature\"], label=\"Meteoblue Latests Forecast\")\n",
"ax.plot(df_forecast.timestamp - tz_offset , df_forecast.temperature, \"C0-\", label=\"Meteoblue Latests Forecast\")\n",
"ax.plot(df_forecast.timestamp - tz_offset , df_forecast.temperature - df_forecast.temperatureSpread, \"C0-\", alpha=0.3)\n",
"ax.plot(df_forecast.timestamp - tz_offset , df_forecast.temperature + df_forecast.temperatureSpread, \"C0-\", alpha=0.3)\n",
"\n",
"# Plot real data\n",
"ax.plot(df_weather_station[\"mean\"], label=\"Weather Station Temperature\")\n",
"ax.plot(df_inside_dome[\"mean\"], label=\"Inside Dome Temperature\")\n",
"ax.plot(df_weather_station[\"mean\"], \"C1-\", label=\"Weather Station Temperature\")\n",
"ax.plot(df_inside_dome[\"mean\"], \"C2-\", label=\"Inside Dome Temperature\")\n",
"\n",
"# Plot every time the data was published\n",
"for pub_t in df_forecast[\"published_timestamp\"].unique():\n",
"for pub_t in df_forecast[\"published_time\"].unique():\n",
" ax.axvline(x=pub_t, ls=\":\", alpha=0.5)\n",
"\n",
"# Add it to the existing legend\n",
Expand Down Expand Up @@ -536,7 +553,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.10"
"version": "3.12.11"
}
},
"nbformat": 4,
Expand Down