Commit 549bcd8

fix: CSV merging
1 parent 88b5af5 commit 549bcd8

File tree: 2 files changed, +144 −21 lines

utils/import_python_organizers.py
utils/tidy_conf/interactive_merge.py

utils/import_python_organizers.py

Lines changed: 71 additions & 20 deletions
@@ -54,37 +54,79 @@ def map_columns(df, reverse=False):
 
 def write_csv(df, year, csv_location):
     """Write the CSV files for the conferences."""
+    from logging_config import get_tqdm_logger
+
+    logger = get_tqdm_logger(__name__)
+
+    logger.info(f"Starting write_csv for year {year} with df shape: {df.shape}")
+    logger.debug(f"write_csv input columns: {df.columns.tolist()}")
+
+    # Validate conference names before processing
+    invalid_conferences = df[~df["conference"].apply(lambda x: isinstance(x, str) and len(str(x).strip()) > 0)]
+    if not invalid_conferences.empty:
+        logger.error(f"Found {len(invalid_conferences)} rows with invalid conference names in write_csv:")
+        for idx, row in invalid_conferences.iterrows():
+            logger.error(f"  Row {idx}: conference = {row['conference']} (type: {type(row['conference'])})")
+        # Fix invalid conference names
+        df.loc[invalid_conferences.index, "conference"] = df.loc[invalid_conferences.index, "conference"].apply(
+            lambda x: str(x) if pd.notna(x) else f"Conference_{invalid_conferences.index}",
+        )
+
     df["cfp"] = df["cfp"].str.slice(stop=10).str.replace(r"\b(TBA|None)\b", "", regex=True)
     df["tutorial_deadline"] = (
         df["tutorial_deadline"].fillna("").apply(str).str.slice(stop=10).str.replace(r"\b(TBA|None)\b", "", regex=True)
     )
     df = map_columns(df, reverse=True)
+    logger.debug(f"After map_columns, df shape: {df.shape}")
+
     for y in range(year, datetime.now(tz=timezone.utc).year + 10):
         if y in df["year"].unique():
-            df.loc[
-                df["year"] == y,
-                [
-                    "Subject",
-                    "Start Date",
-                    "End Date",
-                    "Location",
-                    "Country",
-                    "Venue",
-                    "Tutorial Deadline",
-                    "Talk Deadline",
-                    "Website URL",
-                    "Proposal URL",
-                    "Sponsorship URL",
-                ],
-            ].fillna("").astype(str).sort_values(by="Start Date").to_csv(Path(csv_location, f"{y}.csv"), index=False)
+            # Extract and prepare data for this year
+            df_year_subset = df.loc[df["year"] == y]
+            logger.debug(f"Year {y} subset shape: {df_year_subset.shape}")
+
+            csv_data = (
+                df_year_subset[
+                    [
+                        "Subject",
+                        "Start Date",
+                        "End Date",
+                        "Location",
+                        "Country",
+                        "Venue",
+                        "Tutorial Deadline",
+                        "Talk Deadline",
+                        "Website URL",
+                        "Proposal URL",
+                        "Sponsorship URL",
+                    ]
+                ]
+                .fillna("")
+                .astype(str)
+                .sort_values(by=["Start Date", "End Date", "Subject"])
+            )
+
+            logger.debug(f"Writing CSV for year {y} with {len(csv_data)} conferences")
+            logger.debug(f"Sample conference names: {csv_data['Subject'].head().tolist()}")
+
+            csv_data.to_csv(Path(csv_location, f"{y}.csv"), index=False)
+            logger.info(f"Successfully wrote {Path(csv_location, f'{y}.csv')}")
 
 
 def main(year=None, base=""):
     """Import Python conferences from a csv file Github."""
+    from logging_config import get_tqdm_logger
+
+    # Setup tqdm-compatible logging for this module
+    logger = get_tqdm_logger(__name__)
+    logger.info("🚀 Starting import_python_organizers main function")
+
     # If no year is provided, use the current year
     if year is None:
         year = datetime.now(tz=timezone.utc).year
 
+    logger.info(f"Processing conferences for year: {year}")
+
     # Load current conferences
     _data_path = Path(base, "_data")
     _utils_path = Path(base, "utils")

@@ -150,15 +192,24 @@ def main(year=None, base=""):
             )
             continue
 
-        df_merged, df_remote = fuzzy_match(
-            df_yml[df_yml["year"] == y],
-            df_csv_for_merge.loc[df_csv_for_merge["year"] == y],
-        )
+        logger.info(f"Processing year {y} merge operations")
+        df_yml_year = df_yml[df_yml["year"] == y]
+        df_csv_year = df_csv_for_merge.loc[df_csv_for_merge["year"] == y]
+        logger.debug(f"Year {y}: df_yml_year shape: {df_yml_year.shape}, df_csv_year shape: {df_csv_year.shape}")
+
+        df_merged, df_remote = fuzzy_match(df_yml_year, df_csv_year)
+        logger.info(f"Fuzzy match completed for year {y}. df_merged shape: {df_merged.shape}")
+
         df_merged["year"] = y
         df_merged = df_merged.drop(["conference"], axis=1)
+        logger.debug(f"After dropping conference column: {df_merged.shape}")
+
         df_merged = deduplicate(df_merged)
         df_remote = deduplicate(df_remote)
+        logger.debug(f"After deduplication - df_merged: {df_merged.shape}, df_remote: {df_remote.shape}")
+
         df_merged = merge_conferences(df_merged, df_remote)
+        logger.info(f"Merge conferences completed for year {y}. Final shape: {df_merged.shape}")
 
         df_new = pd.concat([df_new, df_merged], ignore_index=True)
 
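
Note: the added code imports get_tqdm_logger from a local logging_config module that is not part of this commit. The sketch below is only an assumption of what such a helper could look like (a tqdm-aware logging handler so log lines do not break progress bars); it is not the repository's actual implementation.

import logging

from tqdm import tqdm


class TqdmLoggingHandler(logging.Handler):
    """Route log records through tqdm.write so they don't break progress bars."""

    def emit(self, record):
        try:
            # tqdm.write prints above any active progress bar
            tqdm.write(self.format(record))
        except Exception:
            self.handleError(record)


def get_tqdm_logger(name):
    """Return a logger with a single tqdm-aware handler attached."""
    logger = logging.getLogger(name)
    if not logger.handlers:
        handler = TqdmLoggingHandler()
        handler.setFormatter(logging.Formatter("%(levelname)s %(name)s: %(message)s"))
        logger.addHandler(handler)
        logger.setLevel(logging.DEBUG)
    return logger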

utils/tidy_conf/interactive_merge.py

Lines changed: 73 additions & 1 deletion
@@ -1,3 +1,4 @@
+import logging
 import sys
 from collections import defaultdict
 

@@ -23,9 +24,16 @@ def fuzzy_match(df_yml, df_remote):
     Keeps temporary track of rejections to avoid asking the same question multiple
     times.
     """
+    logger = logging.getLogger(__name__)
+    logger.info(f"Starting fuzzy_match with df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+
     df_yml = tidy_df_names(df_yml)
     df_remote = tidy_df_names(df_remote)
 
+    logger.debug(f"After tidy_df_names - df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+    logger.debug(f"df_yml columns: {df_yml.columns.tolist()}")
+    logger.debug(f"df_remote columns: {df_remote.columns.tolist()}")
+
     _, known_rejections = load_title_mappings(path="utils/tidy_conf/data/.tmp/rejections.yml")
 
     new_mappings = defaultdict(list)

@@ -73,23 +81,46 @@ def fuzzy_match(df_yml, df_remote):
     update_title_mappings(new_rejections, path="utils/tidy_conf/data/.tmp/rejections.yml")
 
     # Combine dataframes
+    logger.info("Combining dataframes using title_match index")
     df.set_index("title_match", inplace=True)
+    logger.debug(f"df index after set_index: {df.index.tolist()[:5]}...")
+
     df_new = df.combine_first(df_remote)
+    logger.info(f"Combined dataframe shape: {df_new.shape}")
+    logger.debug(f"df_new index: {df_new.index.tolist()[:5]}...")
+
+    # Validate that the index contains actual conference names, not integers
+    integer_indices = [idx for idx in df_new.index if isinstance(idx, int)]
+    if integer_indices:
+        logger.warning(f"Found {len(integer_indices)} integer indices in df_new: {integer_indices[:5]}...")
 
     # Fill missing CFPs with "TBA"
     df_new.loc[df_new["cfp"].isna(), "cfp"] = "TBA"
 
+    logger.info("fuzzy_match completed successfully")
     return df_new, df_remote
 
 
 def merge_conferences(df_yml, df_remote):
     """Merge two dataframes on title and interactively resolve conflicts."""
+    logger = logging.getLogger(__name__)
+    logger.info(f"Starting merge_conferences with df_yml shape: {df_yml.shape}, df_remote shape: {df_remote.shape}")
+
+    # Data validation before merge
+    logger.debug(f"df_yml columns: {df_yml.columns.tolist()}")
+    logger.debug(f"df_remote columns: {df_remote.columns.tolist()}")
+    logger.debug(f"df_yml index: {df_yml.index.tolist()[:5]}...")  # Show first 5 indices
+    logger.debug(f"df_remote index: {df_remote.index.tolist()[:5]}...")
+
     df_new = get_schema()
     columns = df_new.columns.tolist()
+    logger.debug(f"Schema columns: {columns}")
 
     with contextlib.suppress(KeyError):
+        logger.debug("Dropping 'conference' column from df_yml")
         df_yml = df_yml.drop(["conference"], axis=1)
     with contextlib.suppress(KeyError):
+        logger.debug("Dropping 'conference' column from df_remote")
         df_remote = df_remote.drop(["conference"], axis=1)
 
     replacements = {

@@ -98,9 +129,32 @@ def merge_conferences(df_yml, df_remote):
         "Czech Republic": "Czechia",
     }
 
+    logger.info("Performing pandas merge on 'title_match'")
     df_merge = pd.merge(left=df_yml, right=df_remote, how="outer", on="title_match", validate="one_to_one")
+    logger.info(f"Merge completed. df_merge shape: {df_merge.shape}")
+    logger.debug(f"df_merge columns: {df_merge.columns.tolist()}")
+    logger.debug(f"df_merge index: {df_merge.index.tolist()[:5]}...")
+
     for i, row in df_merge.iterrows():
-        df_new.loc[i, "conference"] = i
+        # Use the actual conference name from title_match index, not the row index
+        conference_name = df_merge.index.name if hasattr(df_merge.index, "name") and df_merge.index.name else i
+        if hasattr(row, "name") and row.name:
+            conference_name = row.name
+            logger.debug(f"Using row.name for conference: {conference_name}")
+        elif "title_match" in row and pd.notna(row["title_match"]):
+            conference_name = row["title_match"]
+            logger.debug(f"Using title_match for conference: {conference_name}")
+        else:
+            logger.warning(f"Falling back to index {i} for conference name")
+            conference_name = i
+
+        # Validate conference name is a string
+        if not isinstance(conference_name, str):
+            logger.error(f"Conference name is not a string: {type(conference_name)} = {conference_name}")
+            conference_name = str(conference_name)
+
+        df_new.loc[i, "conference"] = conference_name
+        logger.debug(f"Set conference[{i}] = {conference_name}")
         for column in columns:
             cx, cy = column + "_x", column + "_y"
             # print(i,cx,cy,cx in df_merge.columns and cy in df_merge.columns,column in df_merge.columns,)

@@ -257,4 +311,22 @@ def merge_conferences(df_yml, df_remote):
 
     # Fill in missing CFPs with TBA
     df_new.loc[df_new.cfp.isna(), "cfp"] = "TBA"
+
+    # Final validation before returning
+    logger.info(f"Merge completed. Final df_new shape: {df_new.shape}")
+    logger.debug(f"Final df_new columns: {df_new.columns.tolist()}")
+
+    # Validate conference names
+    invalid_conferences = df_new[~df_new["conference"].apply(lambda x: isinstance(x, str) and len(str(x).strip()) > 0)]
+    if not invalid_conferences.empty:
+        logger.error(f"Found {len(invalid_conferences)} rows with invalid conference names:")
+        for idx, row in invalid_conferences.iterrows():
+            logger.error(f"  Row {idx}: conference = {row['conference']} (type: {type(row['conference'])})")
+
+    # Check for null conference names
+    null_conferences = df_new[df_new["conference"].isna()]
+    if not null_conferences.empty:
+        logger.error(f"Found {len(null_conferences)} rows with null conference names")
+
+    logger.info("Merge validation completed")
     return df_new
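
Note: the single deleted line, df_new.loc[i, "conference"] = i, wrote the iteration index into the conference column, which after pd.merge can be a plain row number rather than a conference title. A small, self-contained pandas illustration of that behavior (the sample data below is made up, not from the repository):

import pandas as pd

df_yml = pd.DataFrame(
    {"title_match": ["PyCon DE", "EuroSciPy"], "cfp": ["2025-01-15", None]},
).set_index("title_match")
df_remote = pd.DataFrame(
    {"title_match": ["PyCon DE", "PyData Berlin"], "cfp": [None, "2025-02-01"]},
).set_index("title_match")

# combine_first keeps df_yml values and fills gaps from df_remote,
# so the union of conference titles survives as the index.
print(df_yml.combine_first(df_remote))

# pd.merge(..., on="title_match") returns a result with a fresh integer
# RangeIndex, which is why assigning `i` stored row numbers instead of names.
df_merge = pd.merge(df_yml.reset_index(), df_remote.reset_index(), how="outer", on="title_match")
print(df_merge.index)  # RangeIndex(start=0, stop=3, step=1)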
