Skip to content

Commit d234696

Browse files
committed
feat: observability / logging
1 parent 549bcd8 commit d234696

File tree

6 files changed

+201
-23
lines changed

6 files changed

+201
-23
lines changed

utils/import_python_official.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import pandas as pd
88
import requests
99
from icalendar import Calendar
10-
from logging_config import get_logger
10+
from logging_config import get_tqdm_logger
1111
from tidy_conf import fuzzy_match
1212
from tidy_conf import load_conferences
1313
from tidy_conf import merge_conferences
@@ -18,7 +18,7 @@
1818
from tidy_conf.yaml import load_title_mappings
1919
from tidy_conf.yaml import write_df_yaml
2020

21-
logger = get_logger(__name__)
21+
logger = get_tqdm_logger(__name__)
2222

2323

2424
def ics_to_dataframe() -> pd.DataFrame:

utils/logging_config.py

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
import logging
44
import sys
55
from pathlib import Path
6+
from typing import TYPE_CHECKING
7+
8+
if TYPE_CHECKING:
9+
pass
610

711

812
def setup_logging(level: str = "INFO", log_file: str | None = None, include_timestamp: bool = True) -> logging.Logger:
@@ -50,6 +54,75 @@ def setup_logging(level: str = "INFO", log_file: str | None = None, include_time
5054
return logger
5155

5256

57+
class TqdmLoggingHandler(logging.Handler):
    """Logging handler that routes records through ``tqdm.write()``.

    Emitting via ``tqdm.write`` keeps log lines from corrupting any active
    tqdm progress bars. If tqdm is not installed, output falls back to
    ``print``. The redundant ``__init__`` override (which only forwarded the
    identical default to ``super().__init__``) has been removed; the handler
    keeps the same constructor signature via ``logging.Handler``.
    """

    def emit(self, record: logging.LogRecord) -> None:
        """Format *record* and write it without disturbing progress bars."""
        try:
            msg = self.format(record)
            try:
                # Imported lazily so the handler works even without tqdm.
                from tqdm import tqdm

                tqdm.write(msg)
            except ImportError:
                print(msg)
        except Exception:
            # A logging handler must never raise into application code;
            # delegate to the standard error-reporting hook.
            self.handleError(record)
75+
76+
77+
def setup_tqdm_logging(
    level: str = "INFO",
    log_file: str | None = None,
    include_timestamp: bool = True,
) -> logging.Logger:
    """Set up tqdm-compatible logging configuration for the project.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional log file path
        include_timestamp: Whether to include timestamps in log messages

    Returns
    -------
    logging.Logger: Configured logger instance with tqdm-compatible output
    """
    logger = logging.getLogger("python_deadlines")
    logger.setLevel(getattr(logging, level.upper()))

    # Drop previously attached handlers so repeated setup calls
    # do not produce duplicated output.
    logger.handlers.clear()

    # Choose the message layout up front; datefmt only matters when the
    # timestamped format is in use.
    if include_timestamp:
        fmt = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
        formatter = logging.Formatter(fmt, datefmt="%Y-%m-%d %H:%M:%S")
    else:
        formatter = logging.Formatter("%(levelname)s - %(message)s")

    # Console output goes through tqdm.write() so progress bars stay intact.
    console_handler = TqdmLoggingHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    # Optionally mirror everything into a UTF-8 log file, creating any
    # missing parent directories first.
    if log_file:
        log_path = Path(log_file)
        log_path.parent.mkdir(parents=True, exist_ok=True)
        file_handler = logging.FileHandler(log_path, encoding="utf-8")
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger
124+
125+
53126
def get_logger(name: str | None = None) -> logging.Logger:
54127
"""Get a logger instance with the project configuration.
55128
@@ -67,6 +140,29 @@ def get_logger(name: str | None = None) -> logging.Logger:
67140

68141
# Set up basic configuration if not already configured
69142
if not logger.handlers:
70-
return setup_logging()
143+
return setup_tqdm_logging()
144+
145+
return logger
146+
147+
148+
def get_tqdm_logger(name: str | None = None, level: str = "INFO") -> logging.Logger:
    """Get a tqdm-compatible logger instance.

    Args:
        name: Logger name (defaults to "python_deadlines")
        level: Logging level used when first-time setup is required

    Returns
    -------
    logging.Logger: Tqdm-compatible logger instance
    """
    if name is None:
        name = "python_deadlines"

    logger = logging.getLogger(name)

    # Already configured: hand back the named logger as-is.
    if logger.handlers:
        return logger

    # First use: install the tqdm-aware handlers.
    # NOTE(review): setup_tqdm_logging returns the "python_deadlines" logger,
    # not necessarily the logger requested via `name` — confirm this is the
    # intended contract for callers passing a module name.
    return setup_tqdm_logging(level=level)

utils/main.py

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,52 @@
11
import sys
2+
import time
23

34
sys.path.append(".")
45
from import_python_official import main as official_updater
56
from import_python_organizers import main as organizer_updater
7+
from logging_config import get_tqdm_logger
68
from sort_yaml import sort_data
79

10+
11+
def main():
    """Run the data processing pipeline with step-by-step logging.

    Steps, in order:
        1. Import conferences from the official Python events calendar.
        2. Sort and validate the merged data.
        3. Import conferences curated by Python organizers.
        4. Final sorting and validation pass.

    Exits the process with status 1 if any step raises.
    """
    logger = get_tqdm_logger(__name__)

    logger.info("🚀 Starting Python Deadlines data processing pipeline")
    start_time = time.time()

    # (start message, callable, completion label) for each pipeline step;
    # a data-driven loop replaces four copy-pasted timing stanzas.
    steps = [
        ("📅 Step 1: Importing from Python official calendar", official_updater, "Official calendar import"),
        ("🔄 Step 2: Sorting and validating data", lambda: sort_data(skip_links=True), "Data sorting"),
        ("👥 Step 3: Importing from Python organizers", organizer_updater, "Organizers import"),
        ("🔄 Step 4: Final sorting and validation", lambda: sort_data(skip_links=True), "Final sorting"),
    ]

    try:
        for start_msg, step, done_label in steps:
            logger.info(start_msg)
            step_start = time.time()
            step()
            # Lazy %-style args avoid f-string formatting when the level is off.
            logger.info("✅ %s completed in %.2fs", done_label, time.time() - step_start)

        total_time = time.time() - start_time
        logger.info("🎉 Data processing pipeline completed successfully in %.2fs", total_time)

    except Exception as e:
        # logger.exception logs the full traceback automatically from inside
        # an except block (preferred over error(..., exc_info=True)).
        logger.exception("❌ Pipeline failed with error: %s", e)
        sys.exit(1)


if __name__ == "__main__":
    main()

utils/sort_yaml.py

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import pydantic
1313
import pytz
1414
import yaml
15+
from logging_config import get_tqdm_logger
1516
from tidy_conf import auto_add_sub
1617
from tidy_conf import write_conference_yaml
1718
from tidy_conf.date import clean_dates
@@ -156,6 +157,8 @@ def check_links(data):
156157
# Sort:
157158
def sort_data(base="", prefix="", skip_links=False):
158159
"""Sort and clean the conference data."""
160+
logger = get_tqdm_logger(__name__)
161+
159162
# Load different data files
160163
current = Path(base, "_data", "conferences.yml")
161164
out_current = Path(base, "_data", f"{prefix}conferences.yml")
@@ -164,73 +167,101 @@ def sort_data(base="", prefix="", skip_links=False):
164167
legacy = Path(base, "_data", "legacy.yml")
165168
out_legacy = Path(base, "_data", f"{prefix}legacy.yml")
166169

170+
logger.info("📊 Loading conference data files")
167171
data = []
172+
files_loaded = 0
168173

169174
for url in (current, archive, legacy):
170-
with url.open(encoding="utf-8") as stream, contextlib.suppress(yaml.YAMLError):
171-
if stream:
172-
data += yaml.load(stream, Loader=Loader) # nosec B506 # noqa: S506
175+
if url.exists():
176+
with url.open(encoding="utf-8") as stream, contextlib.suppress(yaml.YAMLError):
177+
if stream:
178+
file_data = yaml.load(stream, Loader=Loader) # nosec B506 # noqa: S506
179+
if file_data:
180+
data += file_data
181+
files_loaded += 1
182+
logger.debug(f"Loaded {len(file_data)} entries from {url.name}")
183+
184+
logger.info(f"📋 Loaded {len(data)} conferences from {files_loaded} files")
173185

174186
from tidy_conf.schema import Conference
175187

188+
logger.debug("🔧 Ordering keywords")
176189
for i, q in enumerate(data.copy()):
177190
data[i] = order_keywords(q)
178191

179192
# Clean Dates
193+
logger.info("📅 Cleaning dates")
180194
data = tidy_dates(data)
181195

182196
# Clean Titles
197+
logger.info("🏷️ Cleaning titles")
183198
data = tidy_titles(data)
184199

185200
# Add Sub
201+
logger.info("🏢 Adding submission types")
186202
data = auto_add_sub(data)
187203

188204
# Geocode Data
205+
logger.info("🗺️ Adding geolocation data")
189206
data = add_latlon(data)
190207

191208
# Merge duplicates
209+
logger.info("🔄 Merging duplicates")
192210
data = merge_duplicates(data)
193211

194212
# Check Links
195213
if not skip_links:
214+
logger.info("🔗 Checking link availability")
196215
data = check_links(data)
216+
else:
217+
logger.info("⏭️ Skipping link checking")
197218

198219
for i, q in enumerate(data.copy()):
199220
data[i] = order_keywords(q)
200221

222+
logger.info("✅ Validating conference data with Pydantic schema")
201223
new_data = []
224+
validation_errors = 0
225+
202226
for q in data:
203227
try:
204228
new_data.append(Conference(**q))
205229
except pydantic.ValidationError as e: # noqa: PERF203
206-
print(f"Error: {e}")
207-
print(f"Data: \n{yaml.dump(q, default_flow_style=False)}")
208-
print("\n\n")
230+
validation_errors += 1
231+
logger.error(f"❌ Validation error in conference: {e}")
232+
logger.debug(f"Invalid data: \n{yaml.dump(q, default_flow_style=False)}")
209233
continue
234+
235+
if validation_errors > 0:
236+
logger.warning(f"⚠️ {validation_errors} conferences failed validation and were skipped")
237+
210238
data = new_data
239+
logger.info(f"✅ {len(data)} conferences passed validation")
211240

212241
# Split data by cfp
242+
logger.info("📂 Splitting data by CFP status")
213243
conf, tba, expired, legacy = split_data(data)
244+
logger.info(f"📊 Split results: {len(conf)} active, {len(tba)} TBA, {len(expired)} expired, {len(legacy)} legacy")
214245

215-
# just sort:
246+
# Sort data
247+
logger.info("🔄 Sorting conferences by CFP date")
216248
conf.sort(key=sort_by_cfp, reverse=True)
217-
# pretty_print("Date Sorting:", conf, tba, expired, legacy)
218249
conf.sort(key=sort_by_date_passed)
219-
# pretty_print("Date and Passed Deadline Sorting with tba:", conf, tba, expired)
220250
tba.sort(key=sort_by_date, reverse=True)
221251

252+
logger.info(f"💾 Writing {len(conf + tba)} active conferences to {out_current.name}")
222253
write_conference_yaml(conf + tba, out_current)
223254

224255
expired.sort(key=sort_by_date, reverse=True)
225-
226-
# pretty_print("New archive:", data)
256+
logger.info(f"📦 Writing {len(expired)} expired conferences to {out_archive.name}")
227257
write_conference_yaml(expired, out_archive)
228258

229259
legacy.sort(key=sort_by_name, reverse=True)
230-
231-
# pretty_print("New legacy:", data)
260+
logger.info(f"🗂️ Writing {len(legacy)} legacy conferences to {out_legacy.name}")
232261
write_conference_yaml(legacy, out_legacy)
233262

263+
logger.info("🎉 Conference data sorting and cleaning completed successfully")
264+
234265

235266
if __name__ == "__main__":
236267
import argparse

utils/tidy_conf/latlon.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,25 @@
1+
import sys
12
import time
23
import urllib
34

45
import requests
56
from tqdm import tqdm
67

8+
sys.path.append("..")
9+
from logging_config import get_tqdm_logger
10+
711

812
def add_latlon(data):
913
"""Add latitude and longitude to the data."""
14+
logger = get_tqdm_logger(__name__)
15+
1016
# Cache for locations
1117
cache = {}
1218
# Copy of data for unlocated conferences
1319
data_copy = []
1420

21+
logger.debug(f"Processing geolocation for {len(data)} conferences")
22+
1523
# Go through the data and check if the location is already in the cache
1624
for i, q in tqdm(enumerate(data), total=len(data)):
1725
if ("place" not in q) or ("online" in q["place"].lower()):
@@ -31,7 +39,7 @@ def add_latlon(data):
3139
try:
3240
q["place"] = q["place"].split(",")[0].strip() + ", " + q["place"].split(",")[-1].strip()
3341
except IndexError:
34-
tqdm.write(f"IndexError: {q['place']}")
42+
logger.error(f"IndexError processing place: {q['place']}")
3543

3644
# Check if the location is already in the cache
3745
places = [q["place"]]
@@ -69,11 +77,11 @@ def add_latlon(data):
6977
cache[place] = new_location[-1]
7078
except IndexError:
7179
cache[place] = None
72-
tqdm.write(f"No response from Openstreetmaps for {q['place']}")
80+
logger.warning(f"No response from OpenStreetMap for {q['place']}")
7381
time.sleep(2)
7482
else:
7583
cache[place] = None
76-
tqdm.write(f"No response from Openstreetmaps for {q['place']}")
84+
logger.warning(f"No response from OpenStreetMap for {q['place']}")
7785
else:
7886
if new_location:
7987
data[i]["location"] = new_location

utils/tidy_conf/links.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import sys
12
from datetime import datetime
23
from datetime import timedelta
34
from datetime import timezone
@@ -7,6 +8,8 @@
78
import requests
89
from tqdm import tqdm
910

11+
sys.path.append("..")
12+
1013

1114
def get_cache_location():
1215
# Check if the URL is cached

0 commit comments

Comments
 (0)