|
| 1 | +"""Tests for CSV processing functionality and fixes.""" |
| 2 | + |
| 3 | +import sys |
| 4 | +import tempfile |
| 5 | +from pathlib import Path |
| 6 | +from unittest.mock import Mock |
| 7 | +from unittest.mock import patch |
| 8 | + |
| 9 | +import pandas as pd |
| 10 | + |
| 11 | +sys.path.append(str(Path(__file__).parent.parent / "utils")) |
| 12 | + |
| 13 | +import import_python_organizers |
| 14 | + |
| 15 | + |
class TestCSVSortingFixes:
    """Regression tests for row ordering, column layout, and subject values in written CSVs."""

    def test_csv_sorting_compliance_python_organizers(self):
        """Rows read back from the written CSV are ordered by start date, end date, then subject."""
        # Rows are supplied out of order on purpose. Internal column names are
        # combined with the CSV-facing ones ("Location" rather than "place",
        # plus "Country" and "Venue") that write_csv is given here.
        columns = {
            "conference": ["Conference A", "Conference B", "Conference C", "Conference D"],
            "start": ["2025-06-15", "2025-06-01", "2025-06-01", "2025-06-15"],
            "end": ["2025-06-17", "2025-06-03", "2025-06-05", "2025-06-17"],
            "Location": ["City A", "City B", "City C", "City D"],
            "tutorial_deadline": ["2025-03-01", "2025-02-01", "2025-02-15", "2025-03-15"],
            "cfp": ["2025-04-01", "2025-03-01", "2025-03-15", "2025-04-15"],
            "link": ["https://a.com", "https://b.com", "https://c.com", "https://d.com"],
            "cfp_link": ["https://cfp-a.com", "https://cfp-b.com", "https://cfp-c.com", "https://cfp-d.com"],
            "sponsor": [
                "https://sponsor-a.com",
                "https://sponsor-b.com",
                "https://sponsor-c.com",
                "https://sponsor-d.com",
            ],
            "year": [2025] * 4,
            "Country": ["USA", "UK", "Canada", "Germany"],
            "Venue": ["Convention Center A", "Convention Center B", "Convention Center C", "Convention Center D"],
        }
        unsorted_frame = pd.DataFrame(columns)

        # B and C share a start date and differ on end date; A and D tie on
        # both dates, which leaves the subject as the final tie-breaker.
        expected_subjects = ["Conference B", "Conference C", "Conference A", "Conference D"]

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)
            import_python_organizers.write_csv(unsorted_frame, 2025, out_dir)

            written = out_dir / "2025.csv"
            assert written.exists()

            # Materialise the column before the temporary directory is removed.
            actual_subjects = pd.read_csv(written)["Subject"].tolist()

        assert actual_subjects == expected_subjects, (
            f"CSV not sorted correctly. Expected: {expected_subjects}, Got: {actual_subjects}"
        )

    def test_csv_column_structure(self):
        """The written CSV exposes exactly the expected header names, in the expected order."""
        single_row = pd.DataFrame(
            {
                "conference": ["Test Conference"],
                "start": ["2025-06-01"],
                "end": ["2025-06-03"],
                "Location": ["Test City"],
                "Country": ["USA"],
                "Venue": ["Test Venue"],
                "tutorial_deadline": ["2025-03-01"],
                "cfp": ["2025-04-01"],
                "link": ["https://test.com"],
                "cfp_link": ["https://cfp-test.com"],
                "sponsor": ["https://sponsor-test.com"],
                "year": [2025],
            },
        )

        expected_columns = [
            "Subject",
            "Start Date",
            "End Date",
            "Location",
            "Country",
            "Venue",
            "Tutorial Deadline",
            "Talk Deadline",
            "Website URL",
            "Proposal URL",
            "Sponsorship URL",
        ]

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)
            import_python_organizers.write_csv(single_row, 2025, out_dir)
            header = pd.read_csv(out_dir / "2025.csv").columns.tolist()

        assert header == expected_columns

    def test_conference_name_validation_in_csv_processing(self):
        """Missing or blank conference names must not surface as NaN/empty subjects in the CSV."""
        # Rows two and three carry a None and an empty-string conference name.
        frame = pd.DataFrame(
            {
                "conference": ["Valid Conference", None, "", "Another Valid Conference"],
                "year": [2025] * 4,
                "start": ["2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04"],
                "end": ["2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06"],
                "Location": ["City A", "City B", "City C", "City D"],
                "Country": ["USA", "UK", "Canada", "Germany"],
                "Venue": ["Venue A", "Venue B", "Venue C", "Venue D"],
                "tutorial_deadline": [""] * 4,
                "cfp": [""] * 4,
                "link": ["https://a.com", "https://b.com", "https://c.com", "https://d.com"],
                "cfp_link": [""] * 4,
                "sponsor": [""] * 4,
            },
        )

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)

            # The call itself is expected to complete without raising.
            import_python_organizers.write_csv(frame, 2025, out_dir)

            written = out_dir / "2025.csv"
            assert written.exists()

            # Default NA parsing is kept on purpose: an empty cell would come
            # back as a float NaN and be rejected by the check below.
            subjects = pd.read_csv(written)["Subject"]

        assert all(isinstance(subject, str) and subject.strip() != "" for subject in subjects)
| 145 | + |
| 146 | + |
class TestDataValidation:
    """Checks on how deadline and country fields appear after a CSV round trip."""

    def test_cfp_deadline_processing(self):
        """Timestamped CFP values come back as bare dates and a blank deadline is present."""
        frame = pd.DataFrame(
            {
                "cfp": ["2025-02-15 23:59:00", "TBA", "None", "2025-03-15 23:59:00"],
                "tutorial_deadline": ["2025-02-01", "TBA", "", "2025-03-01"],
                "conference": ["Conf A", "Conf B", "Conf C", "Conf D"],
                "year": [2025] * 4,
                "start": ["2025-06-01", "2025-06-02", "2025-06-03", "2025-06-04"],
                "end": ["2025-06-03", "2025-06-04", "2025-06-05", "2025-06-06"],
                "Location": ["City A", "City B", "City C", "City D"],
                "Country": ["USA", "UK", "Canada", "Germany"],
                "Venue": ["Venue A", "Venue B", "Venue C", "Venue D"],
                "link": ["https://a.com", "https://b.com", "https://c.com", "https://d.com"],
                "cfp_link": [""] * 4,
                "sponsor": [""] * 4,
            },
        )

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)
            import_python_organizers.write_csv(frame, 2025, out_dir)

            # Disable NA inference so empty cells are read back as "" rather than NaN.
            loaded = pd.read_csv(out_dir / "2025.csv", keep_default_na=False, na_values=[])

        talk_deadlines = loaded["Talk Deadline"].tolist()

        # Two cleaned dates, plus the empty string that the "TBA"/"None"
        # placeholders are expected to turn into.
        for wanted in ("2025-02-15", "2025-03-15", ""):
            assert wanted in talk_deadlines

    def test_country_code_assignment(self):
        """Every row keeps a string-valued Country cell, whether or not one could be derived."""
        frame = pd.DataFrame(
            {
                "conference": ["Conf A", "Conf B", "Conf C"],
                "Location": ["New York, USA", "London, United Kingdom", "Invalid Location"],
                "year": [2025] * 3,
                "start": ["2025-06-01", "2025-06-02", "2025-06-03"],
                "end": ["2025-06-03", "2025-06-04", "2025-06-05"],
                "Country": [""] * 3,
                "Venue": ["Venue A", "Venue B", "Venue C"],
                "tutorial_deadline": [""] * 3,
                "cfp": [""] * 3,
                "link": ["https://a.com", "https://b.com", "https://c.com"],
                "cfp_link": [""] * 3,
                "sponsor": [""] * 3,
            },
        )

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)
            import_python_organizers.write_csv(frame, 2025, out_dir)
            loaded = pd.read_csv(out_dir / "2025.csv", keep_default_na=False, na_values=[])

        countries = loaded["Country"].tolist()

        # Only shape and type are pinned; the concrete country values are left
        # open so an unresolvable location may stay blank.
        assert len(countries) == 3
        assert all(isinstance(country, str) for country in countries)
| 220 | + |
| 221 | + |
class TestErrorHandling:
    """Checks that awkward inputs still lead to a CSV file being produced."""

    def test_empty_dataframe_handling(self):
        """A frame with the expected columns but no rows still produces the year file."""
        column_names = [
            "conference",
            "year",
            "start",
            "end",
            "Location",
            "Country",
            "Venue",
            "tutorial_deadline",
            "cfp",
            "link",
            "cfp_link",
            "sponsor",
        ]
        no_rows = pd.DataFrame(columns=column_names)

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)

            # The call is expected to complete without raising.
            import_python_organizers.write_csv(no_rows, 2025, out_dir)

            # A file is expected even when there is nothing to put in it.
            assert (out_dir / "2025.csv").exists()

    def test_malformed_data_handling(self):
        """Mixed types, bad dates, and missing values end up as string cells in the CSV."""
        messy_columns = {
            "conference": [123, None, "Valid Conference"],  # int / None / str
            "year": ["invalid", 2025, 2025],  # first entry is not a year
            "start": ["invalid-date", "2025-06-02", "2025-06-03"],
            "end": ["2025-06-03", "invalid-date", "2025-06-05"],
            "Location": [None, "", "Valid City"],
            "Country": [""] * 3,
            "Venue": [""] * 3,
            "tutorial_deadline": [""] * 3,
            "cfp": [""] * 3,
            "link": [""] * 3,
            "cfp_link": [""] * 3,
            "sponsor": [""] * 3,
        }
        malformed = pd.DataFrame(messy_columns)

        with tempfile.TemporaryDirectory() as workdir:
            out_dir = Path(workdir)

            # The call is expected to complete without raising.
            import_python_organizers.write_csv(malformed, 2025, out_dir)

            written = out_dir / "2025.csv"
            assert written.exists()

            # Disable NA inference so blank cells are read back as "" rather than NaN.
            loaded = pd.read_csv(written, keep_default_na=False, na_values=[])

        # Walk every cell of every column; each one has to be a string.
        assert all(isinstance(cell, str) for column in loaded.columns for cell in loaded[column])
| 287 | + |
| 288 | + |
class TestIntegrationWithLogging:
    """Checks that writing a CSV reports progress through the tqdm-aware logger."""

    @patch("logging_config.get_tqdm_logger")
    def test_logging_integration_in_csv_write(self, mock_logger):
        """write_csv obtains a logger from the patched factory and logs start and success at info level."""
        # Whatever logger the patched factory hands out is this stand-in.
        logger_double = mock_logger.return_value = Mock()

        single_row = pd.DataFrame(
            {
                "conference": ["Test Conference"],
                "year": [2025],
                "start": ["2025-06-01"],
                "end": ["2025-06-03"],
                "Location": ["Test City"],
                "Country": ["USA"],
                "Venue": ["Test Venue"],
                "tutorial_deadline": [""],
                "cfp": [""],
                "link": ["https://test.com"],
                "cfp_link": [""],
                "sponsor": [""],
            },
        )

        with tempfile.TemporaryDirectory() as workdir:
            import_python_organizers.write_csv(single_row, 2025, Path(workdir))

        # The factory was used, and at least one info-level record was emitted.
        assert mock_logger.called
        assert logger_double.info.called

        # Only the first positional argument of each info() call is inspected.
        info_messages = [recorded.args[0] for recorded in logger_double.info.call_args_list]
        assert any("Starting write_csv" in message for message in info_messages)
        assert any("Successfully wrote" in message for message in info_messages)
0 commit comments