1+ import re
12import sys
23import urllib
34from datetime import datetime
1314from tidy_conf import merge_conferences
1415from tidy_conf .deduplicate import deduplicate
1516from tidy_conf .schema import get_schema
16- from tidy_conf .titles import tidy_df_names
1717from tidy_conf .utils import fill_missing_required
18+ from tidy_conf .yaml import load_title_mappings
1819from tidy_conf .yaml import write_df_yaml
1920
2021
@@ -98,7 +99,7 @@ def main(year=None, base=""):
9899 df_yml = load_conferences ()
99100 df_schema = get_schema ()
100101 df_new = pd .DataFrame (columns = df_schema .columns )
101- df_csv = pd .DataFrame (columns = df_schema .columns )
102+ df_csv_raw = pd .DataFrame (columns = df_schema .columns )
102103
103104 # Parse your csv file and iterate through year by year
104105 for y in range (year , datetime .now (tz = timezone .utc ).year + 10 ):
@@ -107,42 +108,52 @@ def main(year=None, base=""):
107108 df ["year" ] = y
108109 except urllib .error .HTTPError :
109110 break
110- df_csv = pd .concat ([df_csv , df ], ignore_index = True )
111-
112- # Load old ics dataframe from cached data
113- try :
114- # Load the old ics dataframe from cache
115- df_csv_old = pd .read_csv (cache_file )
116- except FileNotFoundError :
117- df_csv_old = pd .DataFrame (columns = df_csv .columns )
118-
119- # Load and apply the title mappings, remove years from conference names
120- df_csv = tidy_df_names (df_csv )
111+ df_csv_raw = pd .concat ([df_csv_raw , df ], ignore_index = True )
112+
113+ # Load old csv dataframe from cached data
114+ # try:
115+ # df_csv_old = pd.read_csv(cache_file)
116+ # except FileNotFoundError:
117+ # df_csv_old = pd.DataFrame(columns=df_csv_raw.columns)
118+
119+ # Create a copy for processing with standardized names
120+ df_csv_standardized = df_csv_raw .copy ()
121+
122+ # Load and apply the title mappings
123+ _ , known_mappings = load_title_mappings (reverse = True )
124+ df_csv_standardized ["conference" ] = (
125+ df_csv_standardized ["conference" ]
126+ .replace (re .compile (r"\b\s+(19|20)\d{2}\s*\b" ), "" , regex = True )
127+ .replace (known_mappings )
128+ )
121129
122- # Store the new ics dataframe to cache
123- df_cache = df_csv .copy ()
130+ # Store the new csv dataframe to cache (with original names)
131+ df_cache = df_csv_raw .copy ()
124132
125133 # Get the difference between the old and new dataframes
126- df_diff = pd .concat ([df_csv_old , df_csv ]).drop_duplicates (keep = False )
134+ # _ = pd.concat([df_csv_old, df_csv_raw ]).drop_duplicates(keep=False)
127135
128- # Deduplicate the new dataframe
129- df_csv = deduplicate (df_diff , "conference" )
136+ # Deduplicate the new dataframe (with standardized names for merging)
137+ df_csv_for_merge = deduplicate (df_csv_standardized , "conference" )
130138
131- if df_csv .empty :
139+ if df_csv_for_merge .empty :
132140 print ("No new conferences found in Python organiser source." )
141+ return
133142
134- # Adjust deduplication and merging logic to retain valid data
143+ # Process year by year
135144 for y in range (year , datetime .now (tz = timezone .utc ).year + 10 ):
136- if df_csv .loc [df_csv ["year" ] == y ].empty or df_yml [df_yml ["year" ] == y ].empty :
145+ if df_csv_for_merge .loc [df_csv_for_merge ["year" ] == y ].empty or df_yml [df_yml ["year" ] == y ].empty :
137146 # Concatenate the new data with the existing data
138147 df_new = pd .concat (
139- [df_new , df_yml [df_yml ["year" ] == y ], df_csv .loc [df_csv ["year" ] == y ]],
148+ [df_new , df_yml [df_yml ["year" ] == y ], df_csv_for_merge .loc [df_csv_for_merge ["year" ] == y ]],
140149 ignore_index = True ,
141150 )
142151 continue
143152
144- # Perform fuzzy matching and merge
145- df_merged , df_remote = fuzzy_match (df_yml [df_yml ["year" ] == y ], df_csv .loc [df_csv ["year" ] == y ])
153+ df_merged , df_remote = fuzzy_match (
154+ df_yml [df_yml ["year" ] == y ],
155+ df_csv_for_merge .loc [df_csv_for_merge ["year" ] == y ],
156+ )
146157 df_merged ["year" ] = y
147158 df_merged = df_merged .drop (["conference" ], axis = 1 )
148159 df_merged = deduplicate (df_merged )
@@ -151,27 +162,50 @@ def main(year=None, base=""):
151162
152163 df_new = pd .concat ([df_new , df_merged ], ignore_index = True )
153164
154- # Write the new data to the YAML file
165+ # Fill in missing required fields
155166 df_new = fill_missing_required (df_new )
167+
168+ # Write the new data to the YAML file
156169 write_df_yaml (df_new , target_file )
157170
158- # Write the new data to the CSV file
159- df_new .loc [:, "Location" ] = df_new .place
171+ # Prepare CSV output with original names
172+ df_csv_output = df_csv_raw .copy ()
173+
174+ # Map from the standardized data back to original
175+ mapping_dict = {}
176+ for idx , row in df_csv_raw .iterrows ():
177+ standardized_conf = re .sub (r"\b\s+(19|20)\d{2}\s*\b" , "" , row ["conference" ])
178+ if standardized_conf in known_mappings :
179+ standardized_conf = known_mappings [standardized_conf ]
180+ mapping_key = (standardized_conf , row ["year" ])
181+ mapping_dict [mapping_key ] = idx
182+
183+ # Update the CSV output with information from the merged data
184+ for _ , row in df_new .iterrows ():
185+ key = (row ["conference" ], row ["year" ])
186+ if key in mapping_dict :
187+ original_idx = mapping_dict [key ]
188+ # Update only fields that were potentially enriched during merge
189+ for col in ["start" , "end" , "cfp" , "link" , "cfp_link" , "sponsor" , "finaid" ]:
190+ if col in row and pd .notna (row [col ]):
191+ df_csv_output .at [original_idx , col ] = row [col ]
192+
193+ # Write the CSV with original names
194+ df_csv_output .loc [:, "Location" ] = df_csv_output .place
160195 try :
161- df_new .loc [:, "Country" ] = (
162- df_new .place .str .split ("," )
196+ df_csv_output .loc [:, "Country" ] = (
197+ df_csv_output .place .str .split ("," )
163198 .str [- 1 ]
164199 .str .strip ()
165200 .apply (lambda x : iso3166 .countries_by_name .get (x .upper (), iso3166 .Country ("" , "" , "" , "" , "" )).alpha3 )
166201 )
167202 except AttributeError as e :
168- df_csv .loc [:, "Country" ] = ""
169- print (f"Error: Country iso3 not found for { df_new .place } - { e } " )
203+ df_csv_output .loc [:, "Country" ] = ""
204+ print (f"Error: Country iso3 not found for { df_csv_output .place } - { e } " )
170205
171- print (f"Writing { len (df_new )} conferences to CSV file" )
172- write_csv (df_new , year , csv_location )
206+ write_csv (df_csv_output , year , csv_location )
173207
174- # Save the new ics dataframe to cache
208+ # Save the new dataframe to cache
175209 df_cache .to_csv (cache_file , index = False )
176210
177211
0 commit comments