|
1 | 1 | """Engie price scraper - Fetches tariffs from Engie market offers""" |
2 | 2 | from typing import List |
| 3 | +import re |
3 | 4 | import httpx |
4 | 5 | from io import BytesIO |
5 | 6 | from pdfminer.high_level import extract_text |
@@ -104,10 +105,265 @@ async def fetch_offers(self) -> List[OfferData]: |
104 | 105 | raise Exception("Échec du scraping Engie - raison inconnue") |
105 | 106 |
|
def _parse_pdf(self, text: str) -> List[OfferData]:
    """
    Build Engie offers from the text extracted from the tariff-sheet PDF.

    The validity date is read from the "Grille tarifaire - <month> <year>"
    heading (French month names, with or without accents) and is set to the
    first day of that month, at midnight UTC. If the heading is missing, the
    date falls back to 1 September 2025. If the heading is present but the
    month name is not recognised, the month falls back to September and a
    warning is logged, so the substitution is visible in the logs.

    Prices are obtained from ``_extract_base_prices`` (one BASE offer per
    power level) and ``_extract_hc_hp_prices`` (one HC_HP offer per power
    level); each returned entry becomes one ``OfferData``.

    Args:
        text: Text content of the Engie PDF.

    Returns:
        The parsed offers. An empty list is returned when no price could be
        extracted, or when any exception is raised while parsing (the
        exception is logged, not propagated).
    """
    offers = []

    try:
        # Extract validity date from "Grille tarifaire - MONTH YEAR"
        date_match = re.search(r'Grille tarifaire\s*-\s*(\w+)\s+(\d{4})', text, re.IGNORECASE)
        if date_match:
            month_str, year_str = date_match.groups()
            months_fr = {
                'janvier': 1, 'février': 2, 'fevrier': 2, 'mars': 3, 'avril': 4,
                'mai': 5, 'juin': 6, 'juillet': 7, 'août': 8, 'aout': 8,
                'septembre': 9, 'octobre': 10, 'novembre': 11, 'décembre': 12, 'decembre': 12
            }
            month = months_fr.get(month_str.lower())
            if month is None:
                # Keep the historical September default, but do not pretend
                # the month was parsed successfully.
                self.logger.warning(
                    f"Unrecognized month '{month_str}' in Engie PDF, defaulting to September"
                )
                month = 9
            valid_from = datetime(int(year_str), month, 1, 0, 0, 0, tzinfo=UTC)
            self.logger.info(f"Parsed validity date: {valid_from}")
        else:
            valid_from = datetime(2025, 9, 1, 0, 0, 0, tzinfo=UTC)
            self.logger.warning("Could not parse validity date, using default: September 2025")

        # BASE offers: one per power level returned by the BASE extractor
        base_prices = self._extract_base_prices(text)
        for power, prices in base_prices.items():
            offers.append(
                OfferData(
                    name=f"Elec Référence 1 an - Base {power} kVA",
                    offer_type="BASE",
                    description=f"Offre à prix fixe pendant 1 an - Électricité verte - Option Base - {power} kVA",
                    subscription_price=prices["subscription"],
                    base_price=prices["kwh"],
                    power_kva=power,
                    valid_from=valid_from,
                )
            )

        # HC/HP offers: one per power level returned by the HC/HP extractor
        hc_hp_prices = self._extract_hc_hp_prices(text)
        for power, prices in hc_hp_prices.items():
            offers.append(
                OfferData(
                    name=f"Elec Tranquillité 1 an - Heures Creuses {power} kVA",
                    offer_type="HC_HP",
                    description=f"Offre à prix fixe pendant 1 an - Électricité verte - Heures Creuses - {power} kVA",
                    subscription_price=prices["subscription"],
                    hp_price=prices["hp"],
                    hc_price=prices["hc"],
                    power_kva=power,
                    valid_from=valid_from,
                )
            )

        if offers:
            self.logger.info(f"Successfully parsed {len(offers)} offers from Engie PDF")
        else:
            self.logger.warning("No offers parsed from Engie PDF")

        return offers

    except Exception as e:
        # Best-effort parser: any failure is logged and reported to the
        # caller as "no offers" rather than raised.
        self.logger.error(f"Error parsing Engie PDF: {e}", exc_info=True)
        return []
| 179 | + |
def _extract_base_prices(self, text: str) -> dict:
    """
    Extract BASE (single-meter) subscription and kWh prices from PDF text.

    The text is processed line by line:

    1. The first line that is exactly ``36,61`` (after stripping) marks the
       start of the BASE block. NOTE(review): this literal ties the parser
       to one specific tariff sheet; if it is absent an empty dict is
       returned.
    2. From that line, at most 60 lines are scanned. Scanning stops at a
       line containing "Tranquillité" or "Acheminement", or once 27 numbers
       (9 power levels x 3 values) have been collected. Lines that are not
       numbers (decimal comma accepted) are ignored.
    3. The numbers are read as consecutive triples
       ``(abo_TTC, prix_HTT, prix_TTC)`` assigned in order to the power
       levels 3, 6, 9, 12, 15, 18, 24, 30 and 36 kVA. A triple is kept only
       when ``20 < abo_TTC < 50`` and ``0.10 < prix_TTC < 0.25``.

    Args:
        text: Text content of the Engie PDF.

    Returns:
        ``{power_kva: {"subscription": abo_TTC, "kwh": prix_TTC}}`` for every
        power level that passed validation; ``{}`` when the start marker is
        missing or an exception occurs (the exception is logged).
    """
    kva_levels = (3, 6, 9, 12, 15, 18, 24, 30, 36)
    column_markers = ('-', 'HTT', 'TTC*', 'TTC')
    wanted = 3 * len(kva_levels)  # 27 numbers: one triple per power level

    try:
        rows = [raw.strip() for raw in text.split('\n')]

        # Position of the first row equal to the anchor value, if any
        anchor = next((pos for pos, row in enumerate(rows) if row == '36,61'), None)
        if anchor is None:
            self.logger.warning("Could not find BASE section start (36,61)")
            return {}

        self.logger.debug(f"Found BASE section start at line {anchor}")

        # Collect numeric rows inside a 60-row window starting at the anchor
        numbers = []
        for row in rows[anchor:anchor + 60]:
            if 'Tranquillité' in row or 'Acheminement' in row:
                break  # reached the next section
            if not row or row in column_markers:
                continue
            try:
                numbers.append(float(row.replace(',', '.')))
            except ValueError:
                continue
            if len(numbers) >= wanted:
                break

        self.logger.debug(f"Extracted {len(numbers)} values for BASE: {numbers[:15]}...")

        # Walk the triples in step with the power levels
        result = {}
        for kva, offset in zip(kva_levels, range(0, wanted, 3)):
            if offset + 2 >= len(numbers):
                continue  # incomplete triple for this power level
            subscription = numbers[offset]
            kwh = numbers[offset + 2]  # numbers[offset + 1] is the HTT price
            if 20 < subscription < 50 and 0.10 < kwh < 0.25:
                result[kva] = {
                    "subscription": subscription,
                    "kwh": kwh
                }
                self.logger.debug(f"BASE {kva} kVA: subscription={subscription}, kwh={kwh}")

        self.logger.info(f"Extracted {len(result)} BASE prices from PDF")
        return result

    except Exception as e:
        self.logger.error(f"Error extracting BASE prices: {e}", exc_info=True)
        return {}
| 258 | + |
def _extract_hc_hp_prices(self, text: str) -> dict:
    """
    Extract HC/HP (peak / off-peak) prices from PDF text.

    Processing steps:

    1. Locate the first line containing "Tranquillité"; without it an empty
       dict is returned.
    2. Within the next 150 lines, skip everything until a number strictly
       between 35 and 40 is met (taken as the first subscription value),
       then collect every numeric line (decimal comma accepted) until a
       line containing "Acheminement" or "courte utilisation".
    3. Interpret the collected numbers with this fixed layout:
         [0-4]   6 kVA: abo_TTC, hp_HTT, hp_TTC, hc_HTT, hc_TTC
         [5-9]   9 kVA: abo_TTC, hp_HTT, hp_TTC, hc_HTT, hc_TTC
         [10-15] abo_TTC for 12, 15, 18, 24, 30, 36 kVA
       The 12-36 kVA entries reuse the hp_TTC / hc_TTC of the 6 kVA row.

    Every entry is kept only when its subscription lies in (25, 50) and both
    kWh prices lie in (0.10, 0.25), the same kWh range the BASE extractor
    uses. This prevents a shifted extraction from being stored as an offer
    (for instance a subscription amount landing in a price slot).

    Args:
        text: Text content of the Engie PDF.

    Returns:
        ``{power_kva: {"subscription": abo_TTC, "hp": hp_TTC, "hc": hc_TTC}}``
        for each power level that passed validation; ``{}`` when the section
        is not found or an exception occurs (the exception is logged).
    """
    prices = {}

    def to_float(token):
        # French decimal comma -> float; None for anything that is not a
        # number (blank lines, '-', 'HTT', 'TTC', 'TTC*', headings, ...).
        try:
            return float(token.replace(',', '.'))
        except ValueError:
            return None

    def plausible(abo_ttc, hp_ttc, hc_ttc):
        # Sanity ranges: monthly subscription and TTC kWh prices.
        return 25 < abo_ttc < 50 and 0.10 < hp_ttc < 0.25 and 0.10 < hc_ttc < 0.25

    try:
        lines = text.split('\n')

        # Find the Tranquillité section
        tranquillite_idx = next(
            (i for i, line in enumerate(lines) if 'Tranquillité' in line), None
        )
        if tranquillite_idx is None:
            self.logger.warning("Could not find Tranquillité section")
            return {}

        # Collect numeric values of the section
        values = []
        found_start = False
        for raw in lines[tranquillite_idx:tranquillite_idx + 150]:
            stripped = raw.strip()

            if not found_start:
                # Skip until the first subscription value (35 < value < 40)
                val = to_float(stripped)
                if val is not None and 35 < val < 40:
                    found_start = True
                    values.append(val)
                continue

            # Stop at the next section; only checked once collection started
            if 'Acheminement' in stripped or 'courte utilisation' in stripped:
                break

            val = to_float(stripped)
            if val is not None:
                values.append(val)

        self.logger.debug(f"Extracted {len(values)} values for HC/HP")

        if len(values) >= 10:
            # 6 and 9 kVA: complete rows of five values each
            for power, row_start in ((6, 0), (9, 5)):
                abo_ttc = values[row_start]
                hp_ttc = values[row_start + 2]
                hc_ttc = values[row_start + 4]
                if plausible(abo_ttc, hp_ttc, hc_ttc):
                    prices[power] = {
                        "subscription": abo_ttc,
                        "hp": hp_ttc,
                        "hc": hc_ttc
                    }
                else:
                    self.logger.warning(
                        f"Discarding implausible HC/HP values for {power} kVA: "
                        f"subscription={abo_ttc}, hp={hp_ttc}, hc={hc_ttc}"
                    )

            # 12-36 kVA: subscriptions at values[10..15], prices shared with
            # the 6 kVA row.
            if len(values) >= 16:
                hp_ttc = values[2]
                hc_ttc = values[4]
                for sub_idx, power in enumerate((12, 15, 18, 24, 30, 36), start=10):
                    abo_ttc = values[sub_idx]
                    if plausible(abo_ttc, hp_ttc, hc_ttc):
                        prices[power] = {
                            "subscription": abo_ttc,
                            "hp": hp_ttc,
                            "hc": hc_ttc
                        }

        # Log extracted prices
        for power, data in prices.items():
            self.logger.debug(f"HC/HP {power} kVA: subscription={data['subscription']}, hp={data['hp']}, hc={data['hc']}")

        self.logger.info(f"Extracted {len(prices)} HC/HP prices from PDF")
        return prices

    except Exception as e:
        self.logger.error(f"Error extracting HC/HP prices: {e}", exc_info=True)
        return {}
111 | 367 |
|
112 | 368 | def _get_fallback_offers(self) -> List[OfferData]: |
113 | 369 | """Generate offers from fallback pricing data""" |
|
0 commit comments