fix(scraper): implement Engie PDF parsing instead of fallback (#62)

m4dm4rtig4n · Clément VALENTIN · claude · web-flow · commit 3d5809f962eb · 2025-12-05T00:51:53.000+01:00
The Engie scraper was always using fallback data because _parse_pdf() returned an empty list. This commit implements proper PDF parsing: - Extract validity date from "Grille tarifaire - MONTH YEAR" pattern - Parse BASE offers (9 power levels, 3-36 kVA) from vertical columns - Parse HC/HP offers (8 power levels, 6-36 kVA) with grouped data - Handle complex PDF structure where pdfminer extracts values line by line The scraper now properly extracts 17 offers (9 BASE + 8 HC/HP) directly from the PDF instead of relying on hardcoded fallback values. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Clément VALENTIN <clement.valentin@blacktiger.tech> Co-authored-by: Claude <noreply@anthropic.com>
diff --git a/apps/api/pyproject.toml b/apps/api/pyproject.toml
@@ -68,6 +68,7 @@ disallow_untyped_defs = true
 
 [dependency-groups]
 dev = [
+    "ruff>=0.13.3",
     "pytest>=8.4.2",
     "pytest-asyncio>=1.2.0",
 ]
diff --git a/apps/api/src/services/price_scrapers/engie_scraper.py b/apps/api/src/services/price_scrapers/engie_scraper.py
@@ -1,5 +1,6 @@
 """Engie price scraper - Fetches tariffs from Engie market offers"""
 from typing import List
+import re
 import httpx
 from io import BytesIO
 from pdfminer.high_level import extract_text
@@ -104,10 +105,265 @@ async def fetch_offers(self) -> List[OfferData]:
         raise Exception("Échec du scraping Engie - raison inconnue")
 
     def _parse_pdf(self, text: str) -> List[OfferData]:
-        """Parse PDF text from Engie tariff sheet"""
-        # For now, return empty list to use fallback
-        # PDF parsing can be implemented later with proper regex patterns
-        return []
+        """
+        Parse PDF text from Engie tariff sheet.
+
+        The PDF has two main sections:
+        1. "Fourniture comptage simple (CS)" - BASE offers for 3-36 kVA
+        2. "Fourniture comptage Heures pleines/Heures creuses (HP/HC)" - HC/HP offers for 6-36 kVA
+
+        Format in PDF (pdfminer extracts numbers separated by spaces):
+        - BASE: "puissance abo_HTT abo_TTC kwh_HTT kwh_TTC"
+        - HC/HP: "puissance abo_HTT abo_TTC hp_HTT hp_TTC hc_HTT hc_TTC"
+        """
+        offers = []
+
+        try:
+            # Extract validity date from "Grille tarifaire - MONTH YEAR"
+            date_match = re.search(r'Grille tarifaire\s*-\s*(\w+)\s+(\d{4})', text, re.IGNORECASE)
+            if date_match:
+                month_str, year_str = date_match.groups()
+                months_fr = {
+                    'janvier': 1, 'février': 2, 'fevrier': 2, 'mars': 3, 'avril': 4,
+                    'mai': 5, 'juin': 6, 'juillet': 7, 'août': 8, 'aout': 8,
+                    'septembre': 9, 'octobre': 10, 'novembre': 11, 'décembre': 12, 'decembre': 12
+                }
+                month = months_fr.get(month_str.lower(), 9)  # Default to September
+                valid_from = datetime(int(year_str), month, 1, 0, 0, 0, tzinfo=UTC)
+                self.logger.info(f"Parsed validity date: {valid_from}")
+            else:
+                valid_from = datetime(2025, 9, 1, 0, 0, 0, tzinfo=UTC)
+                self.logger.warning("Could not parse validity date, using default: September 2025")
+
+            # Parse BASE offers (Comptage Simple)
+            base_prices = self._extract_base_prices(text)
+            for power, prices in base_prices.items():
+                offers.append(
+                    OfferData(
+                        name=f"Elec Référence 1 an - Base {power} kVA",
+                        offer_type="BASE",
+                        description=f"Offre à prix fixe pendant 1 an - Électricité verte - Option Base - {power} kVA",
+                        subscription_price=prices["subscription"],
+                        base_price=prices["kwh"],
+                        power_kva=power,
+                        valid_from=valid_from,
+                    )
+                )
+
+            # Parse HC/HP offers (Heures Pleines/Heures Creuses)
+            hc_hp_prices = self._extract_hc_hp_prices(text)
+            for power, prices in hc_hp_prices.items():
+                offers.append(
+                    OfferData(
+                        name=f"Elec Tranquillité 1 an - Heures Creuses {power} kVA",
+                        offer_type="HC_HP",
+                        description=f"Offre à prix fixe pendant 1 an - Électricité verte - Heures Creuses - {power} kVA",
+                        subscription_price=prices["subscription"],
+                        hp_price=prices["hp"],
+                        hc_price=prices["hc"],
+                        power_kva=power,
+                        valid_from=valid_from,
+                    )
+                )
+
+            if offers:
+                self.logger.info(f"Successfully parsed {len(offers)} offers from Engie PDF")
+            else:
+                self.logger.warning("No offers parsed from Engie PDF")
+
+            return offers
+
+        except Exception as e:
+            self.logger.error(f"Error parsing Engie PDF: {e}", exc_info=True)
+            return []
+
+    def _extract_base_prices(self, text: str) -> dict:
+        """
+        Extract BASE (Comptage Simple) prices from PDF text.
+
+        The PDF is structured in vertical columns, so pdfminer extracts values
+        on separate lines. The structure is:
+        - First, all subscription TTC values for 9 power levels (3-36 kVA)
+        - Then, alternating: abo_TTC, prix_HTT, prix_TTC for each power level
+
+        We look for the specific pattern where the first abo_TTC (36.61) appears,
+        then extract the sequence: abo_TTC, skip HTT, get TTC for each power.
+        """
+        prices = {}
+        powers = [3, 6, 9, 12, 15, 18, 24, 30, 36]
+
+        try:
+            lines = text.split('\n')
+
+            # Find the BASE section - look for first subscription TTC value around 36.61
+            # The pattern is: find "0,10334" (prix HTT) followed by "0,15998" (prix TTC)
+            base_start_idx = None
+            for i, line in enumerate(lines):
+                stripped = line.strip()
+                # Look for the first BASE subscription TTC (around 36-37)
+                if stripped == '36,61':
+                    base_start_idx = i
+                    break
+
+            if base_start_idx is None:
+                self.logger.warning("Could not find BASE section start (36,61)")
+                return {}
+
+            self.logger.debug(f"Found BASE section start at line {base_start_idx}")
+
+            # Extract values starting from base_start_idx
+            # Pattern for each power: abo_TTC, prix_HTT (skip), prix_TTC
+            # Example sequence: 36,61, 0,10334, 0,15998, 34,12, 0,10334, 0,15998, ...
+            # We need 27 values (9 powers × 3 values each)
+            values = []
+            for i in range(base_start_idx, min(base_start_idx + 60, len(lines))):
+                stripped = lines[i].strip()
+                # Stop when we have enough values OR hit the next section
+                if 'Tranquillité' in stripped or 'Acheminement' in stripped:
+                    break
+                if stripped and stripped not in ['-', 'HTT', 'TTC*', 'TTC']:
+                    try:
+                        val = float(stripped.replace(',', '.'))
+                        values.append(val)
+                        # Stop once we have 27 values (9 powers × 3)
+                        if len(values) >= 27:
+                            break
+                    except ValueError:
+                        pass
+
+            self.logger.debug(f"Extracted {len(values)} values for BASE: {values[:15]}...")
+
+            # Parse values: every 3 values = (abo_TTC, prix_HTT, prix_TTC)
+            for idx, power in enumerate(powers):
+                start = idx * 3
+                if start + 2 < len(values):
+                    abo_ttc = values[start]
+                    # values[start + 1] is prix_HTT (skip)
+                    prix_ttc = values[start + 2]
+
+                    # Validate the values
+                    if 20 < abo_ttc < 50 and 0.10 < prix_ttc < 0.25:
+                        prices[power] = {
+                            "subscription": abo_ttc,
+                            "kwh": prix_ttc
+                        }
+                        self.logger.debug(f"BASE {power} kVA: subscription={abo_ttc}, kwh={prix_ttc}")
+
+            self.logger.info(f"Extracted {len(prices)} BASE prices from PDF")
+            return prices
+
+        except Exception as e:
+            self.logger.error(f"Error extracting BASE prices: {e}", exc_info=True)
+            return {}
+
+    def _extract_hc_hp_prices(self, text: str) -> dict:
+        """
+        Extract HC/HP (Heures Pleines/Heures Creuses) prices from PDF text.
+
+        The PDF structure is complex for HC/HP:
+        - 6 and 9 kVA have complete data grouped: abo_TTC, hp_HTT, hp_TTC, hc_HTT, hc_TTC
+        - 12-36 kVA have subscriptions grouped first, then prices grouped
+
+        The prices (hp_TTC, hc_TTC) are the same for all power levels (0.16240 and 0.13704)
+        Only subscriptions vary by power level.
+        """
+        prices = {}
+
+        try:
+            lines = text.split('\n')
+
+            # Find the Tranquillité section
+            tranquillite_idx = None
+            for i, line in enumerate(lines):
+                if 'Tranquillité' in line:
+                    tranquillite_idx = i
+                    break
+
+            if tranquillite_idx is None:
+                self.logger.warning("Could not find Tranquillité section")
+                return {}
+
+            # Extract all numeric values from Tranquillité section until Acheminement
+            values = []
+            found_start = False
+            for i in range(tranquillite_idx, min(tranquillite_idx + 150, len(lines))):
+                stripped = lines[i].strip()
+
+                # Look for first subscription TTC value (around 37.43 for 6 kVA)
+                if not found_start:
+                    try:
+                        val = float(stripped.replace(',', '.'))
+                        if 35 < val < 40:  # First subscription TTC
+                            found_start = True
+                            values.append(val)
+                    except ValueError:
+                        pass
+                    continue
+
+                # Stop at Acheminement section
+                if 'Acheminement' in stripped or 'courte utilisation' in stripped:
+                    break
+
+                if stripped and stripped not in ['-', 'HTT', 'TTC*', 'TTC']:
+                    try:
+                        val = float(stripped.replace(',', '.'))
+                        values.append(val)
+                    except ValueError:
+                        pass
+
+            self.logger.debug(f"Extracted {len(values)} values for HC/HP")
+
+            # The structure observed in the PDF:
+            # [0-4]: 6 kVA: abo_TTC, hp_HTT, hp_TTC, hc_HTT, hc_TTC
+            # [5-9]: 9 kVA: abo_TTC, hp_HTT, hp_TTC, hc_HTT, hc_TTC
+            # [10-15]: Subscriptions TTC for 12, 15, 18, 24, 30, 36 kVA
+            # [16+]: Repeated price sets (hp_HTT, hp_TTC, hc_HTT, hc_TTC) for 12-36 kVA
+
+            if len(values) >= 10:
+                # 6 kVA: values[0]=abo_TTC, values[2]=hp_TTC, values[4]=hc_TTC
+                prices[6] = {
+                    "subscription": values[0],
+                    "hp": values[2],
+                    "hc": values[4]
+                }
+
+                # 9 kVA: values[5]=abo_TTC, values[7]=hp_TTC, values[9]=hc_TTC
+                prices[9] = {
+                    "subscription": values[5],
+                    "hp": values[7],
+                    "hc": values[9]
+                }
+
+                # 12-36 kVA: subscriptions at values[10-15], prices repeated
+                remaining_powers = [12, 15, 18, 24, 30, 36]
+                if len(values) >= 16:
+                    # The prices are the same for all (0.16240 and 0.13704)
+                    # We use the first hp_TTC and hc_TTC values extracted for 6 kVA
+                    hp_ttc = values[2]  # 0.16240
+                    hc_ttc = values[4]  # 0.13704
+
+                    for idx, power in enumerate(remaining_powers):
+                        sub_idx = 10 + idx
+                        if sub_idx < len(values):
+                            abo_ttc = values[sub_idx]
+                            # Validate subscription value
+                            if 25 < abo_ttc < 50:
+                                prices[power] = {
+                                    "subscription": abo_ttc,
+                                    "hp": hp_ttc,
+                                    "hc": hc_ttc
+                                }
+
+            # Log extracted prices
+            for power, data in prices.items():
+                self.logger.debug(f"HC/HP {power} kVA: subscription={data['subscription']}, hp={data['hp']}, hc={data['hc']}")
+
+            self.logger.info(f"Extracted {len(prices)} HC/HP prices from PDF")
+            return prices
+
+        except Exception as e:
+            self.logger.error(f"Error extracting HC/HP prices: {e}", exc_info=True)
+            return {}
 
     def _get_fallback_offers(self) -> List[OfferData]:
         """Generate offers from fallback pricing data"""
diff --git a/apps/api/uv.lock b/apps/api/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -68,6 +68,7 @@ disallow_untyped_defs = true`
`68`	`68`
`69`	`69`	`[dependency-groups]`
`70`	`70`	`dev = [`
	`71`	`+ "ruff>=0.13.3",`
`71`	`72`	`"pytest>=8.4.2",`
`72`	`73`	`"pytest-asyncio>=1.2.0",`
`73`	`74`	`]`