
Commit 1fccc23

fix(utils): variations
1 parent 3b67718 commit 1fccc23

2 files changed: +86 -35 lines

utils/import_python_organizers.py

Lines changed: 68 additions & 34 deletions
@@ -1,3 +1,4 @@
+import re
 import sys
 import urllib
 from datetime import datetime
@@ -13,8 +14,8 @@
 from tidy_conf import merge_conferences
 from tidy_conf.deduplicate import deduplicate
 from tidy_conf.schema import get_schema
-from tidy_conf.titles import tidy_df_names
 from tidy_conf.utils import fill_missing_required
+from tidy_conf.yaml import load_title_mappings
 from tidy_conf.yaml import write_df_yaml


@@ -98,7 +99,7 @@ def main(year=None, base=""):
     df_yml = load_conferences()
     df_schema = get_schema()
     df_new = pd.DataFrame(columns=df_schema.columns)
-    df_csv = pd.DataFrame(columns=df_schema.columns)
+    df_csv_raw = pd.DataFrame(columns=df_schema.columns)

     # Parse your csv file and iterate through year by year
     for y in range(year, datetime.now(tz=timezone.utc).year + 10):
@@ -107,42 +108,52 @@ def main(year=None, base=""):
             df["year"] = y
         except urllib.error.HTTPError:
             break
-        df_csv = pd.concat([df_csv, df], ignore_index=True)
-
-    # Load old ics dataframe from cached data
-    try:
-        # Load the old ics dataframe from cache
-        df_csv_old = pd.read_csv(cache_file)
-    except FileNotFoundError:
-        df_csv_old = pd.DataFrame(columns=df_csv.columns)
-
-    # Load and apply the title mappings, remove years from conference names
-    df_csv = tidy_df_names(df_csv)
+        df_csv_raw = pd.concat([df_csv_raw, df], ignore_index=True)
+
+    # Load old csv dataframe from cached data
+    # try:
+    # df_csv_old = pd.read_csv(cache_file)
+    # except FileNotFoundError:
+    # df_csv_old = pd.DataFrame(columns=df_csv_raw.columns)
+
+    # Create a copy for processing with standardized names
+    df_csv_standardized = df_csv_raw.copy()
+
+    # Load and apply the title mappings
+    _, known_mappings = load_title_mappings(reverse=True)
+    df_csv_standardized["conference"] = (
+        df_csv_standardized["conference"]
+        .replace(re.compile(r"\b\s+(19|20)\d{2}\s*\b"), "", regex=True)
+        .replace(known_mappings)
+    )

-    # Store the new ics dataframe to cache
-    df_cache = df_csv.copy()
+    # Store the new csv dataframe to cache (with original names)
+    df_cache = df_csv_raw.copy()

     # Get the difference between the old and new dataframes
-    df_diff = pd.concat([df_csv_old, df_csv]).drop_duplicates(keep=False)
+    # _ = pd.concat([df_csv_old, df_csv_raw]).drop_duplicates(keep=False)

-    # Deduplicate the new dataframe
-    df_csv = deduplicate(df_diff, "conference")
+    # Deduplicate the new dataframe (with standardized names for merging)
+    df_csv_for_merge = deduplicate(df_csv_standardized, "conference")

-    if df_csv.empty:
+    if df_csv_for_merge.empty:
         print("No new conferences found in Python organiser source.")
+        return

-    # Adjust deduplication and merging logic to retain valid data
+    # Process year by year
     for y in range(year, datetime.now(tz=timezone.utc).year + 10):
-        if df_csv.loc[df_csv["year"] == y].empty or df_yml[df_yml["year"] == y].empty:
+        if df_csv_for_merge.loc[df_csv_for_merge["year"] == y].empty or df_yml[df_yml["year"] == y].empty:
             # Concatenate the new data with the existing data
             df_new = pd.concat(
-                [df_new, df_yml[df_yml["year"] == y], df_csv.loc[df_csv["year"] == y]],
+                [df_new, df_yml[df_yml["year"] == y], df_csv_for_merge.loc[df_csv_for_merge["year"] == y]],
                 ignore_index=True,
             )
             continue

-        # Perform fuzzy matching and merge
-        df_merged, df_remote = fuzzy_match(df_yml[df_yml["year"] == y], df_csv.loc[df_csv["year"] == y])
+        df_merged, df_remote = fuzzy_match(
+            df_yml[df_yml["year"] == y],
+            df_csv_for_merge.loc[df_csv_for_merge["year"] == y],
+        )
         df_merged["year"] = y
         df_merged = df_merged.drop(["conference"], axis=1)
         df_merged = deduplicate(df_merged)
@@ -151,27 +162,50 @@ def main(year=None, base=""):

         df_new = pd.concat([df_new, df_merged], ignore_index=True)

-    # Write the new data to the YAML file
+    # Fill in missing required fields
     df_new = fill_missing_required(df_new)
+
+    # Write the new data to the YAML file
     write_df_yaml(df_new, target_file)

-    # Write the new data to the CSV file
-    df_new.loc[:, "Location"] = df_new.place
+    # Prepare CSV output with original names
+    df_csv_output = df_csv_raw.copy()
+
+    # Map from the standardized data back to original
+    mapping_dict = {}
+    for idx, row in df_csv_raw.iterrows():
+        standardized_conf = re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", row["conference"])
+        if standardized_conf in known_mappings:
+            standardized_conf = known_mappings[standardized_conf]
+        mapping_key = (standardized_conf, row["year"])
+        mapping_dict[mapping_key] = idx
+
+    # Update the CSV output with information from the merged data
+    for _, row in df_new.iterrows():
+        key = (row["conference"], row["year"])
+        if key in mapping_dict:
+            original_idx = mapping_dict[key]
+            # Update only fields that were potentially enriched during merge
+            for col in ["start", "end", "cfp", "link", "cfp_link", "sponsor", "finaid"]:
+                if col in row and pd.notna(row[col]):
+                    df_csv_output.at[original_idx, col] = row[col]
+
+    # Write the CSV with original names
+    df_csv_output.loc[:, "Location"] = df_csv_output.place
     try:
-        df_new.loc[:, "Country"] = (
-            df_new.place.str.split(",")
+        df_csv_output.loc[:, "Country"] = (
+            df_csv_output.place.str.split(",")
             .str[-1]
             .str.strip()
             .apply(lambda x: iso3166.countries_by_name.get(x.upper(), iso3166.Country("", "", "", "", "")).alpha3)
         )
     except AttributeError as e:
-        df_csv.loc[:, "Country"] = ""
-        print(f"Error: Country iso3 not found for {df_new.place} - {e}")
+        df_csv_output.loc[:, "Country"] = ""
+        print(f"Error: Country iso3 not found for {df_csv_output.place} - {e}")

-    print(f"Writing {len(df_new)} conferences to CSV file")
-    write_csv(df_new, year, csv_location)
+    write_csv(df_csv_output, year, csv_location)

-    # Save the new ics dataframe to cache
+    # Save the new dataframe to cache
     df_cache.to_csv(cache_file, index=False)
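For context, a minimal standalone sketch of the round trip this diff introduces: conference names are standardized (trailing year stripped, known aliases mapped to their global name) only for deduplication and fuzzy matching, while a (standardized name, year) key points each merged row back at its original row so the CSV keeps the source spelling. The sample frame and the known_mappings dict below are hypothetical; in the script itself the mapping comes from load_title_mappings(reverse=True).

import re

import pandas as pd

# Hypothetical alias map standing in for load_title_mappings(reverse=True)
known_mappings = {"PyCon United States": "PyCon US"}
YEAR_RE = re.compile(r"\b\s+(19|20)\d{2}\s*\b")  # same pattern as in the diff

df_csv_raw = pd.DataFrame(
    {"conference": ["PyCon United States 2025", "EuroPython 2025"], "year": [2025, 2025]},
)

# Standardize a copy for merging: strip years, then map aliases to global names
df_csv_standardized = df_csv_raw.copy()
df_csv_standardized["conference"] = (
    df_csv_standardized["conference"].replace(YEAR_RE, "", regex=True).replace(known_mappings)
)

# Key each original row by (standardized name, year) so merged rows can be mapped back
mapping_dict = {}
for idx, row in df_csv_raw.iterrows():
    name = re.sub(YEAR_RE, "", row["conference"])
    name = known_mappings.get(name, name)
    mapping_dict[(name, row["year"])] = idx

print(df_csv_standardized["conference"].tolist())  # ['PyCon US', 'EuroPython']
print(mapping_dict)  # {('PyCon US', 2025): 0, ('EuroPython', 2025): 1}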

utils/tidy_conf/yaml.py

Lines changed: 18 additions & 1 deletion
@@ -1,3 +1,4 @@
+import re
 import sys

 import yaml
@@ -92,9 +93,25 @@ def load_title_mappings(reverse=False, path="utils/tidy_conf/data/titles.yml"):

     for key, values in data.get("alt_name", {}).items():
         global_name = values.get("global")
-        variations = values.get("variations", [])
+        variations_raw = values.get("variations", [])
         regexes = values.get("regexes", [])

+        variations = []
+        for current_variation in (global_name, *variations_raw):
+            if not current_variation:
+                continue
+            current_variations = {current_variation.strip()}
+            current_variations.update(
+                variation.replace("Conference", "").strip().replace("Conf", "")
+                for variation in current_variations.copy()
+            )
+            current_variations.update(re.sub(r"\s+", "", variation).strip() for variation in current_variations.copy())
+            current_variations.update(re.sub(r"\W", "", variation).strip() for variation in current_variations.copy())
+            current_variations.update(
+                re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", variation).strip() for variation in current_variations.copy()
+            )
+            variations.extend(current_variations)
+
         if reverse:
             # Reverse mapping: map variations and regexes back to the global name
             if global_name:
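And a standalone sketch of what the new variation expansion in load_title_mappings produces; the alt_name entry below is hypothetical, but the steps mirror the diff. Each update pass feeds on the results of the previous ones, so order matters:

import re

# Hypothetical alt_name entry standing in for one parsed from titles.yml
global_name = "PyCon US"
variations_raw = ["PyCon US Conference", "PyCon US 2024"]

variations = []
for current_variation in (global_name, *variations_raw):
    if not current_variation:
        continue
    current_variations = {current_variation.strip()}
    # Drop "Conference"/"Conf" suffixes
    current_variations.update(
        v.replace("Conference", "").strip().replace("Conf", "") for v in current_variations.copy()
    )
    # Add whitespace-free, punctuation-free, and year-free spellings
    current_variations.update(re.sub(r"\s+", "", v) for v in current_variations.copy())
    current_variations.update(re.sub(r"\W", "", v) for v in current_variations.copy())
    current_variations.update(
        re.sub(r"\b\s+(19|20)\d{2}\s*\b", "", v).strip() for v in current_variations.copy()
    )
    variations.extend(current_variations)

print(sorted(set(variations)))
# ['PyCon US', 'PyCon US 2024', 'PyCon US Conference', 'PyConUS', 'PyConUS2024', 'PyConUSConference']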
