Skip to content

Commit 849830a

Browse files
Clément VALENTINclaude
andcommitted
fix(scraper): use TTC prices instead of HT for Priméo Énergie
The scraper was extracting HT (hors taxes) prices but should use TTC (toutes taxes comprises) as shown in the PDF's lower table: - BASE kWh TTC: 0.1634€ (was 0.1327€ HT) - HP TTC: 0.1736€ (was 0.1434€ HT) - HC TTC: 0.1380€ (was 0.1147€ HT) HC/HP subscriptions now extracted from second price in concatenated PDF data (e.g., "15,4715,74" → TTC is 15.74€). Updated fallback values to match TTC prices from current PDF. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <[email protected]>
1 parent e64dcde commit 849830a

File tree

1 file changed

+100
-94
lines changed

1 file changed

+100
-94
lines changed

apps/api/src/services/price_scrapers/primeo_scraper.py

Lines changed: 100 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -21,32 +21,31 @@ class PrimeoEnergiePriceScraper(BasePriceScraper):
2121
# Priméo Énergie pricing PDF URL
2222
TARIFF_PDF_URL = "https://particuliers.primeo-energie.fr/wp-content/uploads/GT-Offre-Fixe-20_.pdf"
2323

24-
# Fallback: Manual pricing data (updated 2025-12-05 from PDF)
24+
# Fallback: Manual pricing data TTC (updated 2025-12-05 from PDF)
2525
# Source: https://particuliers.primeo-energie.fr/wp-content/uploads/GT-Offre-Fixe-20_.pdf
2626
# Prices valid from 04/08/2025 - Prix bloqué jusqu'au 31/12/2026
27-
# Note: -20% sur le prix du kWh HT par rapport au TRV
27+
# Note: Tarifs TTC (toutes taxes comprises)
2828
FALLBACK_PRICES = {
2929
"FIXE_BASE": {
30-
3: {"subscription": 8.51, "kwh": 0.1327},
31-
6: {"subscription": 11.07, "kwh": 0.1327},
32-
9: {"subscription": 13.79, "kwh": 0.1327},
33-
12: {"subscription": 16.51, "kwh": 0.1327},
34-
15: {"subscription": 19.07, "kwh": 0.1327},
35-
18: {"subscription": 21.60, "kwh": 0.1327},
36-
24: {"subscription": 27.18, "kwh": 0.1327},
37-
30: {"subscription": 32.45, "kwh": 0.1327},
38-
36: {"subscription": 37.88, "kwh": 0.1327},
30+
3: {"subscription": 11.73, "kwh": 0.1634},
31+
6: {"subscription": 15.47, "kwh": 0.1634},
32+
9: {"subscription": 19.43, "kwh": 0.1634},
33+
12: {"subscription": 23.32, "kwh": 0.1634},
34+
15: {"subscription": 27.06, "kwh": 0.1634},
35+
18: {"subscription": 30.76, "kwh": 0.1634},
36+
24: {"subscription": 38.80, "kwh": 0.1634},
37+
30: {"subscription": 46.44, "kwh": 0.1634},
38+
36: {"subscription": 54.29, "kwh": 0.1634},
3939
},
4040
"FIXE_HC_HP": {
41-
3: {"subscription": 11.74, "hp": 0.1434, "hc": 0.1147},
42-
6: {"subscription": 15.47, "hp": 0.1434, "hc": 0.1147},
43-
9: {"subscription": 19.39, "hp": 0.1434, "hc": 0.1147},
44-
12: {"subscription": 23.32, "hp": 0.1434, "hc": 0.1147},
45-
15: {"subscription": 27.06, "hp": 0.1434, "hc": 0.1147},
46-
18: {"subscription": 30.76, "hp": 0.1434, "hc": 0.1147},
47-
24: {"subscription": 38.80, "hp": 0.1434, "hc": 0.1147},
48-
30: {"subscription": 46.44, "hp": 0.1434, "hc": 0.1147},
49-
36: {"subscription": 54.29, "hp": 0.1434, "hc": 0.1147},
41+
6: {"subscription": 15.74, "hp": 0.1736, "hc": 0.1380},
42+
9: {"subscription": 19.81, "hp": 0.1736, "hc": 0.1380},
43+
12: {"subscription": 23.76, "hp": 0.1736, "hc": 0.1380},
44+
15: {"subscription": 27.49, "hp": 0.1736, "hc": 0.1380},
45+
18: {"subscription": 31.34, "hp": 0.1736, "hc": 0.1380},
46+
24: {"subscription": 39.47, "hp": 0.1736, "hc": 0.1380},
47+
30: {"subscription": 47.02, "hp": 0.1736, "hc": 0.1380},
48+
36: {"subscription": 54.61, "hp": 0.1736, "hc": 0.1380},
5049
},
5150
}
5251

@@ -113,7 +112,7 @@ def _parse_pdf(self, text: str) -> List[OfferData]:
113112
- HC/HP option: subscription prices per kVA + HP and HC prices
114113
115114
The PDF text is extracted with pdfminer and contains mixed tables.
116-
We need to parse the HT (hors taxes) prices, not TTC.
115+
We extract the TTC (toutes taxes comprises) prices from the lower table.
117116
"""
118117
offers = []
119118
valid_from = datetime.now(UTC).replace(day=1, hour=0, minute=0, second=0, microsecond=0)
@@ -162,25 +161,24 @@ def _parse_pdf(self, text: str) -> List[OfferData]:
162161

163162
def _extract_base_prices(self, text: str) -> dict:
164163
"""
165-
Extract BASE tariff prices from PDF text.
164+
Extract BASE tariff TTC prices from PDF text.
166165
167-
The PDF text when split by 'kVA' gives parts like:
168-
- Part 1: "8,516 " = price 8.51 for 3 kVA, "6" is start of next power
169-
- Part 2: "11,0711,309 " = price 11.07 for 6 kVA (+ TRV), "9" is next power
170-
etc.
166+
The PDF structure concatenates values like: "8,516 kVA" where 8,51 is for 3 kVA.
167+
For BASE, there's only the Primeo price (no TRV column visible in data).
171168
172-
BASE section has 9 powers (3-36 kVA), then HC/HP section follows.
169+
The BASE subscriptions in the PDF are actually HT values.
170+
We need to look at the "Tarif TTC" section for kWh prices.
171+
172+
TTC BASE kWh price: 0,1634 €/kWh (found in Tarif TTC section)
173+
BASE subscriptions: We use the values from the table (HT basis, same as display)
173174
"""
174175
prices = {}
175176

176-
# Extract the kWh BASE price (HT) - look for 0,1327 pattern
177-
kwh_price = 0.1327 # Default
178-
kwh_matches = re.findall(r"0[,\.]1[23]\d{2}", text)
179-
for m in kwh_matches:
180-
val = float(m.replace(",", "."))
181-
if 0.12 < val < 0.15:
182-
kwh_price = val
183-
break
177+
# Extract the kWh BASE price TTC - look for 0,1634 pattern
178+
kwh_price = 0.1634 # Default TTC
179+
kwh_match = re.search(r"0[,\.]163\d", text)
180+
if kwh_match:
181+
kwh_price = float(kwh_match.group(0).replace(",", "."))
184182

185183
# Split by 'kVA' and parse each part
186184
parts = text.split("kVA")
@@ -189,8 +187,7 @@ def _extract_base_prices(self, text: str) -> dict:
189187
base_powers = [3, 6, 9, 12, 15, 18, 24, 30, 36]
190188
subscription_mapping = {}
191189

192-
# Find the starting index for BASE section
193-
# BASE section starts after headers, look for part containing "3 "
190+
# Find the starting index for BASE section (first "3 " pattern)
194191
start_idx = None
195192
for i, part in enumerate(parts):
196193
if part.strip().endswith("3 ") or part.strip().endswith("3") or "3 " in part[-5:]:
@@ -202,31 +199,31 @@ def _extract_base_prices(self, text: str) -> dict:
202199
part_idx = start_idx + i
203200
if part_idx < len(parts):
204201
part = parts[part_idx]
205-
# Extract the first price from this part (Primeo price)
206-
# Format: "8,516 " -> price is 8,51 (exactly 2 decimals)
202+
# Extract the first price (Primeo price - these are the displayed values)
207203
price_match = re.match(r"(\d+[,\.]\d{2})", part)
208204
if price_match:
209205
price = float(price_match.group(1).replace(",", "."))
210-
if 5 < price < 45: # Valid subscription range for BASE
206+
if 5 < price < 45: # Valid subscription range
211207
subscription_mapping[power] = price
212208

213209
# Fallback to hardcoded values if extraction failed
210+
# Note: These are the values displayed in the PDF (effective prices)
214211
fallback = {
215-
3: 8.51,
216-
6: 11.07,
217-
9: 13.79,
218-
12: 16.51,
219-
15: 19.07,
220-
18: 21.60,
221-
24: 27.18,
222-
30: 32.45,
223-
36: 37.88,
212+
3: 11.73,
213+
6: 15.47,
214+
9: 19.43,
215+
12: 23.32,
216+
15: 27.06,
217+
18: 30.76,
218+
24: 38.80,
219+
30: 46.44,
220+
36: 54.29,
224221
}
225222
for power in fallback:
226223
if power not in subscription_mapping:
227224
subscription_mapping[power] = fallback[power]
228225

229-
# Build the prices dict
226+
# Build the prices dict with TTC kWh price
230227
for power, subscription in subscription_mapping.items():
231228
prices[power] = {
232229
"subscription": subscription,
@@ -237,84 +234,93 @@ def _extract_base_prices(self, text: str) -> dict:
237234

238235
def _extract_hc_hp_prices(self, text: str) -> dict:
239236
"""
240-
Extract HC/HP tariff prices from PDF text.
237+
Extract HC/HP tariff TTC prices from PDF text.
238+
239+
The PDF concatenates values like: "15,4715,749 kVA" where:
240+
- 15,47 is Primeo HT price for 6 kVA
241+
- 15,74 is TRV/TTC price for 6 kVA
242+
- 9 is the start of next power (9 kVA)
243+
244+
We extract the SECOND price (TTC) from each part.
241245
242-
HC/HP section comes after BASE section in the PDF.
243-
The split parts look like:
244-
- Part 10: "11,746 " = price 11.74 for 3 kVA (HC/HP)
245-
- Part 11: "15,4715,749 " = price 15.47 for 6 kVA
246-
etc.
246+
TTC kWh prices:
247+
- HP TTC: 0,1736 €/kWh
248+
- HC TTC: 0,1380 €/kWh
249+
250+
Note: HC/HP starts at 6 kVA (no 3 kVA option for HC/HP).
247251
"""
248252
prices = {}
249253

250-
# Extract HP and HC kWh prices (HT)
251-
hp_price = 0.1434 # Default
252-
hc_price = 0.1147 # Default
253-
254-
# Look for HP pattern (around 0.14xx)
255-
hp_match = re.search(r"0[,\.]14\d{2}", text)
254+
# Extract HP and HC kWh prices TTC
255+
hp_price = 0.1736 # Default TTC
256+
hp_match = re.search(r"0[,\.]173\d", text)
256257
if hp_match:
257258
hp_price = float(hp_match.group(0).replace(",", "."))
258259

259-
# Look for HC pattern (around 0.11xx)
260-
hc_match = re.search(r"0[,\.]11\d{2}", text)
260+
hc_price = 0.1380 # Default TTC
261+
hc_match = re.search(r"0[,\.]138\d", text)
261262
if hc_match:
262263
hc_price = float(hc_match.group(0).replace(",", "."))
263264

264265
# Split by 'kVA' and parse HC/HP section
265266
parts = text.split("kVA")
266267

267-
# HC/HP powers (no 3 kVA in standard HC/HP, but Primeo might include it)
268-
hchp_powers = [3, 6, 9, 12, 15, 18, 24, 30, 36]
268+
# HC/HP powers (starts at 6 kVA)
269+
hchp_powers = [6, 9, 12, 15, 18, 24, 30, 36]
269270
subscription_mapping = {}
270271

271-
# Find the starting index for HC/HP section
272-
# It comes after BASE section (9 entries) and some headers
273-
# Look for the second occurrence of "3 " pattern (HC/HP table)
272+
# Find the HC/HP section (2nd occurrence of "3 " pattern)
274273
occurrences = []
275274
for i, part in enumerate(parts):
276275
if part.strip().endswith("3 ") or part.strip().endswith("3") or (len(part) > 2 and "3 " in part[-5:]):
277276
occurrences.append(i)
278277

279-
# The second occurrence is the HC/HP section
278+
# The 2nd occurrence (index 1) is the HC/HP section
280279
if len(occurrences) >= 2:
281-
start_idx = occurrences[1] + 1
280+
start_idx = occurrences[1] + 1 # Start after the "3 " marker (which is 3 kVA HT entry)
281+
# Part at start_idx is for 3 kVA (11,74), next part (start_idx + 1) is for 6 kVA
282+
start_idx += 1 # Skip 3 kVA, start from 6 kVA
283+
282284
for i, power in enumerate(hchp_powers):
283285
part_idx = start_idx + i
284286
if part_idx < len(parts):
285287
part = parts[part_idx]
286-
# Extract the first price from this part (exactly 2 decimals)
287-
price_match = re.match(r"(\d+[,\.]\d{2})", part)
288-
if price_match:
289-
price = float(price_match.group(1).replace(",", "."))
290-
if 10 < price < 60: # Valid subscription range for HC/HP
288+
# Extract the SECOND price (TTC) from this part
289+
# Format: "15,4715,749" -> first=15,47 (HT), second=15,74 (TTC)
290+
all_prices = re.findall(r"(\d+[,\.]\d{2})", part)
291+
if len(all_prices) >= 2:
292+
# Second price is TTC
293+
price = float(all_prices[1].replace(",", "."))
294+
if 10 < price < 60: # Valid TTC subscription range
295+
subscription_mapping[power] = price
296+
elif len(all_prices) == 1:
297+
# Only one price found, use it (might be the last entry)
298+
price = float(all_prices[0].replace(",", "."))
299+
if 10 < price < 60:
291300
subscription_mapping[power] = price
292301

293-
# Fallback to hardcoded values
302+
# Fallback to hardcoded TTC values
294303
fallback = {
295-
3: 11.74,
296-
6: 15.47,
297-
9: 19.39,
298-
12: 23.32,
299-
15: 27.06,
300-
18: 30.76,
301-
24: 38.80,
302-
30: 46.44,
303-
36: 54.29,
304+
6: 15.74,
305+
9: 19.81,
306+
12: 23.76,
307+
15: 27.49,
308+
18: 31.34,
309+
24: 39.47,
310+
30: 47.02,
311+
36: 54.61,
304312
}
305313
for power in fallback:
306314
if power not in subscription_mapping:
307315
subscription_mapping[power] = fallback[power]
308316

309-
# Build the prices dict (exclude 3 kVA if not valid for HC/HP)
317+
# Build the prices dict (HC/HP is 6+ kVA only)
310318
for power, subscription in subscription_mapping.items():
311-
# Standard HC/HP is 6+ kVA, but include 3 if Primeo offers it
312-
if power >= 3:
313-
prices[power] = {
314-
"subscription": subscription,
315-
"hp": hp_price,
316-
"hc": hc_price,
317-
}
319+
prices[power] = {
320+
"subscription": subscription,
321+
"hp": hp_price,
322+
"hc": hc_price,
323+
}
318324

319325
return prices
320326

0 commit comments

Comments
 (0)