-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
301 lines (253 loc) · 11.8 KB
/
script.py
File metadata and controls
301 lines (253 loc) · 11.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
import glob
import logging
import os
import re
import time
from datetime import datetime, timedelta

from dotenv import load_dotenv
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# --- Logging: write to a log file and mirror every record to the console ---
# The log file path is relative, so it is created in the current working
# directory (not necessarily next to this script).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("pv_webscraper.log"),
        logging.StreamHandler(),
    ],
)

# --- Settings ---
# Inclusive date range for which daily reports are fetched.
start_date = datetime(2024, 7, 6)
end_date = datetime.today()

# Absolute path of the directory that contains this script.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Download folder, located next to the script; created if it does not exist.
download_dir = os.path.join(BASE_DIR, "pv_data")
os.makedirs(download_dir, exist_ok=True)

# The Chrome download preferences (`prefs`) are built in the browser setup
# section right before they are used; the duplicate assignment that used to
# live here was dead code and has been removed.
def extract_date_from_filename(filename):
    """Return the date embedded in *filename*, or ``None`` if there is none.

    The first ``DD-MM-YYYY`` occurrence in the name is parsed.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        A ``datetime`` at midnight of the embedded date, or ``None`` when the
        name contains no ``DD-MM-YYYY`` pattern or the matched digits do not
        form a valid calendar date (e.g. ``31-02-2024``).
    """
    m = re.search(r'(\d{2}-\d{2}-\d{4})', filename)
    if m is None:
        return None
    try:
        return datetime.strptime(m.group(1), "%d-%m-%Y")
    except ValueError:
        # Digits matched the pattern but are not a real date; treat the file
        # as undated instead of crashing every caller that scans the folder.
        return None
# Check whether a report for the given date already exists in the download
# folder (matched by the date embedded in the file name).
def is_file_downloaded(download_dir, date):
    """Return ``True`` if *download_dir* holds a finished report for *date*.

    Args:
        download_dir: Directory to scan.
        date: ``datetime`` whose calendar day is looked for.

    Returns:
        ``True`` when at least one file name carries the same day, else
        ``False``. Chrome's in-progress ``.crdownload`` files are ignored so a
        partial download is never mistaken for a finished report.
    """
    wanted_day = date.date()
    for name in os.listdir(download_dir):
        if name.endswith(".crdownload"):
            continue  # still downloading (or an aborted leftover)
        filedate = extract_date_from_filename(name)
        if filedate is not None and filedate.date() == wanted_day:
            return True
    return False
# --- Login information ---
logging.info("Starte Login-Prozess …")
load_dotenv()  # load the .env file
EMAIL = os.getenv("PV_EMAIL")
PASSWORD = os.getenv("PV_PASSWORD")
LINK = os.getenv("PV_LINK")

# Fail fast with a clear message before a browser window is even opened;
# otherwise a missing variable only surfaces later as an obscure Selenium
# error from driver.get(None) / send_keys(None).
_missing_env = [
    name
    for name, value in (("PV_EMAIL", EMAIL), ("PV_PASSWORD", PASSWORD), ("PV_LINK", LINK))
    if not value
]
if _missing_env:
    raise RuntimeError("Fehlende Umgebungsvariablen: " + ", ".join(_missing_env))

# --- Browser setup ---
options = webdriver.ChromeOptions()
# Make Chrome save downloads into the script's download folder.
prefs = {'download.default_directory': download_dir}
options.add_experimental_option('prefs', prefs)
driver = webdriver.Chrome(options=options)
driver.get(LINK)
wait = WebDriverWait(driver, 20)  # shared explicit wait, 20 s timeout
logging.info("PV Webscraper startet.")

# Fill in the login form and submit it.
username_input = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input#username")))
username_input.clear()
username_input.send_keys(EMAIL)
password_input = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "input#value")))
password_input.clear()
password_input.send_keys(PASSWORD)
login_btn = wait.until(EC.element_to_be_clickable((By.ID, "btn_outerverify")))
login_btn.click()

# --- Go to the "Report Management" page ---
# The success message is logged only once the "Report Management" entry is
# clickable, instead of right after the click on the login button.
# NOTE(review): presumably this entry is only reachable after a successful
# login - confirm against the site.
report_management = wait.until(EC.element_to_be_clickable((By.XPATH, '//span[@title="Report Management"]')))
logging.info("Login erfolgreich.")
logging.info("Wechsle zur Seite Report Management …")
report_management.click()
# TODO: optionally wait here until the "Plant Report" view is visible.
logging.info("Tab Report Management aktiviert.")
def wait_for_download(directory, timeout=60, existing_files=None, poll_interval=1):
    """Wait until a finished download is present in *directory*.

    A download counts as finished when the directory contains at least one
    file that does not end in ``.crdownload`` and no ``.crdownload`` file is
    left.

    Without *existing_files* every finished file counts, so the function
    returns immediately when older reports are already in the folder - even if
    the browser has not started writing the new file yet. Pass the directory
    listing taken *before* triggering the download as *existing_files* to wait
    for a genuinely new file.

    Args:
        directory: Download folder to watch.
        timeout: Maximum time to wait, in seconds.
        existing_files: Optional iterable of file names/paths that were
            already present before the download started; these are not
            counted as the awaited file.
        poll_interval: Seconds to sleep between two directory scans.

    Returns:
        ``True`` as soon as the download is considered finished.

    Raises:
        TimeoutError: If no finished download shows up within *timeout*.
    """
    known_names = None
    if existing_files is not None:
        known_names = {os.path.basename(str(p)) for p in existing_files}

    # Monotonic deadline: the timeout is real elapsed time and is not
    # affected by system clock changes or by how long a scan takes.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        paths = glob.glob(os.path.join(directory, "*"))
        pending = [p for p in paths if p.endswith('.crdownload')]
        finished = [
            p for p in paths
            if not p.endswith('.crdownload')
            and (known_names is None or os.path.basename(p) not in known_names)
        ]
        if finished and not pending:
            return True
        time.sleep(poll_interval)
    raise TimeoutError("Download wurde nicht abgeschlossen.")
def select_date(driver, wait, date_obj):
    """Select *date_obj* in the page's date picker.

    Opens the calendar attached to the ``statisticTime`` input, steps month by
    month until the header shows the target month/year, then clicks the
    target day.

    Args:
        driver: Selenium WebDriver showing the report page.
        wait: ``WebDriverWait`` bound to *driver*.
        date_obj: ``datetime`` (anything with ``year``/``month``/``day``) to
            select.

    Raises:
        Exception: If the calendar header cannot be read as an English
            three-letter month name plus a numeric year.
        RuntimeError: If the target month is not reached within the step
            limit (e.g. a navigation button that has no effect).
    """
    # Wait until no modal is visible any more; a leftover modal would
    # intercept the clicks below. Best effort: carry on if it stays visible.
    try:
        wait.until_not(EC.visibility_of_element_located((By.CLASS_NAME, "ant-modal-root")))
    except TimeoutException:
        logging.debug("Modal ist weiterhin sichtbar – fahre trotzdem fort.")

    # 1. Click the date input (opens the calendar).
    date_input = wait.until(EC.element_to_be_clickable((By.ID, 'statisticTime')))
    date_input.click()

    # Month abbreviations as shown in the calendar header.
    months = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6,
              'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    target = (date_obj.year, date_obj.month)
    max_steps = 1200  # 100 years of month clicks; guards against an endless loop

    # 2./3. Read the displayed month/year and navigate towards the target.
    for _ in range(max_steps):
        month_btn = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.ant-picker-month-btn")))
        year_btn = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "button.ant-picker-year-btn")))

        # The header text may lag behind right after a click; poll up to ~2 s
        # until it is readable (format: month 'Aug', year '2025').
        for _attempt in range(20):
            visible_month = month_btn.text.strip()
            visible_year = year_btn.text.strip()
            if visible_month in months and visible_year.isdigit():
                break
            time.sleep(0.1)
        else:
            raise Exception(f"Monatsname konnte nicht gelesen werden: '{month_btn.text}'")

        shown = (int(visible_year), months[visible_month])
        if shown == target:
            break  # month/year match
        # Tuple comparison orders by year first, then month.
        if shown > target:
            nav_selector = "button.ant-picker-header-prev-btn"  # go back
        else:
            nav_selector = "button.ant-picker-header-next-btn"  # go forward
        driver.find_element(By.CSS_SELECTOR, nav_selector).click()
    else:
        raise RuntimeError(f"Kalender konnte nicht auf {date_obj:%Y-%m} navigiert werden.")

    # 4. Click the day. Only cells of the displayed month ("in-view") are
    # considered: the grid also renders trailing/leading days of the adjacent
    # months, and matching those would pick e.g. the previous month's 30th.
    day_cell = driver.find_element(
        By.XPATH,
        f'//td[contains(@class,"ant-picker-cell-in-view")]/div[text()="{date_obj.day}"]',
    )
    day_cell.click()
# 1. Switch the report interval to "Daily"
interval_dropdown = wait.until(
    EC.element_to_be_clickable((By.CSS_SELECTOR, "span.ant-select-selection-item"))
)
interval_dropdown.click()
daily_option = wait.until(
    EC.element_to_be_clickable(
        (By.XPATH, '//div[contains(@class,"ant-select-item-option") and @title="Daily"]')
    )
)
daily_option.click()


def _first_cell_text(drv):
    """Return the text of the first ``tbody tr td`` cell found on the page."""
    return drv.find_element(By.CSS_SELECTOR, "tbody tr td").text


# --- Loop over the date range, one report per day ---
modal_locator = (By.CLASS_NAME, "ant-modal-root")
current_date = start_date
while current_date <= end_date:
    day_label = current_date.strftime('%Y-%m-%d')
    logging.info("Verarbeite Datum: %s", day_label)

    if is_file_downloaded(download_dir, current_date):
        logging.info("Report für %s bereits vorhanden – überspringe Download.", day_label)
    else:
        try:
            # 2. Enter the date
            select_date(driver, wait, current_date)

            # 3. Click "Search"; remember the first table cell beforehand so
            #    the refresh of the table can be detected afterwards.
            old_row_text = _first_cell_text(driver)
            driver.find_element(By.XPATH, '//button[text()="Search"]').click()

            def table_changed(drv):
                # A stale element means the table is mid-refresh: keep waiting.
                try:
                    return _first_cell_text(drv) != old_row_text
                except StaleElementReferenceException:
                    return False

            wait.until(table_changed)

            # 4. Click "Export", then wait for the task modal to show up
            driver.find_element(
                By.XPATH, '//button[contains(@class,"ant-btn-default") and text()="Export"]'
            ).click()
            wait.until(EC.visibility_of_element_located(modal_locator))

            # 5. Download link inside the task modal (first match is used;
            #    the newest report is expected at the top of the list)
            logging.info("Starte Download für %s", day_label)
            download_link = wait.until(
                EC.element_to_be_clickable(
                    (By.XPATH, '//div[contains(@class, "ant-modal-root")]//a[@title="Download"]')
                )
            )
            download_link.click()
            wait_for_download(download_dir)

            # Close the modal again and wait until it is gone
            driver.find_element(By.CLASS_NAME, "ant-modal-close").click()
            wait.until_not(EC.visibility_of_element_located(modal_locator))
            logging.info("Export- und Download-Vorgang abgeschlossen.")
        except Exception as e:
            # Any failure only skips this day; the loop continues.
            logging.error("Fehler bei %s: %s", day_label, e, exc_info=True)
            logging.info("Springe zum nächsten Tag.")

    current_date += timedelta(days=1)

logging.info("PV Webscraper beendet.")
logging.info("Clean-up duplicates ...")
# Matches browser-style duplicate names such as "report (1).csv" or
# "report(2).xlsx": <stem>[ ](<number>)<extension(s)>.
_DUPLICATE_NAME_RE = re.compile(r'^(?P<stem>.+?) ?\((?P<index>\d+)\)(?P<ext>(?:\.\w+)*)$')


def cleanup_duplicates(download_dir):
    """Delete duplicate downloads (``name (1).ext``, ``name (2).ext``, ...).

    Only files whose name ends in a numeric ``(N)`` suffix before the
    extension are treated as duplicates, and a duplicate is only deleted when
    its original (``name.ext``) is present, so the single remaining copy of a
    report is never removed. Files that merely contain parentheses elsewhere
    in their name, and unfinished ``.crdownload`` files, are left alone.

    Args:
        download_dir: Directory to clean up.

    Returns:
        None. Deletions and failures are reported through ``logging``.
    """
    names = set(os.listdir(download_dir))
    for filename in sorted(names):
        match = None
        if not filename.endswith(".crdownload"):
            match = _DUPLICATE_NAME_RE.match(filename)
        if match is None:
            logging.debug("Kein Duplikat: %s", filename)
            continue

        original_name = match.group("stem") + match.group("ext")
        if original_name not in names:
            # Nothing to fall back on: keep the copy rather than lose the data.
            logging.info(f"Original zu {filename} fehlt – Datei wird behalten.")
            continue

        try:
            os.remove(os.path.join(download_dir, filename))
            logging.info(f"Datei mit Klammer gelöscht: {filename}")
        except OSError as e:
            logging.warning(f"Fehler beim Löschen von {filename}: {e}")
# Run the duplicate clean-up on the download folder once the download loop is done.
cleanup_duplicates(download_dir)
def all_dates(start_date, end_date):
    """Yield one value per day from *start_date* up to *end_date*, inclusive.

    Values are *start_date* plus 0, 1, 2, ... whole days, as long as they do
    not exceed *end_date*; nothing is yielded when *end_date* lies before
    *start_date*.
    """
    # Number of whole days between the bounds (floored, so it is negative
    # when end_date < start_date and the range below is empty).
    whole_days = (end_date - start_date).days
    for offset in range(whole_days + 1):
        yield start_date + timedelta(days=offset)
def find_missing_dates(download_dir, start_date, end_date):
    """Return all days in the range for which no finished report file exists.

    Args:
        download_dir: Directory containing the downloaded reports.
        start_date: First day of the range (``datetime``, inclusive).
        end_date: Last day of the range (``datetime``, inclusive).

    Returns:
        A chronologically ordered list of ``datetime`` objects (midnight), one
        per day without a matching file. Unfinished ``.crdownload`` files do
        not count as existing reports.
    """
    existing_dates = set()
    for name in os.listdir(download_dir):
        if name.endswith(".crdownload"):
            continue  # interrupted/in-progress download: the day is still missing
        filedate = extract_date_from_filename(name)
        if filedate:
            existing_dates.add(filedate.date())

    first_day = start_date.date()
    day_count = (end_date.date() - first_day).days + 1  # <= 0 for an empty range
    midnight = datetime.min.time()
    return [
        datetime.combine(day, midnight)
        for day in (first_day + timedelta(days=offset) for offset in range(day_count))
        if day not in existing_dates
    ]
def redownload_missing_days(driver, wait, download_dir, missing_dates, max_retries=3):
    """Try to download the report for every date in *missing_dates*.

    For each date the search/export/download sequence is attempted up to
    *max_retries* times; an attempt counts as successful once a file for that
    date is found in *download_dir*.

    Args:
        driver: Selenium WebDriver showing the report page.
        wait: ``WebDriverWait`` bound to *driver*.
        download_dir: Folder the browser downloads into.
        missing_dates: Iterable of ``datetime`` objects to fetch.
        max_retries: Maximum number of attempts per date.

    Returns:
        None. Progress and failures are reported through ``logging``.
    """
    modal_locator = (By.CLASS_NAME, "ant-modal-root")
    first_cell = (By.CSS_SELECTOR, "tbody tr td")

    for date in missing_dates:
        day_label = date.strftime('%Y-%m-%d')
        for attempt in range(1, max_retries + 1):
            logging.info(f"Versuch {attempt} für {day_label}")
            try:
                select_date(driver, wait, date)

                # Click "Search" and wait until the table has refreshed, so
                # the export below does not run on the previous day's data.
                old_row_text = driver.find_element(*first_cell).text
                search_btn = driver.find_element(By.XPATH, '//button[text()="Search"]')
                search_btn.click()

                def table_changed(drv, previous=old_row_text):
                    # A stale element means the table is mid-refresh: keep waiting.
                    try:
                        return drv.find_element(*first_cell).text != previous
                    except StaleElementReferenceException:
                        return False

                wait.until(table_changed)

                # Export, then download from the task modal.
                export_btn = driver.find_element(By.XPATH, '//button[contains(@class,"ant-btn-default") and text()="Export"]')
                export_btn.click()
                wait.until(EC.visibility_of_element_located(modal_locator))
                download_btn = wait.until(EC.element_to_be_clickable((By.XPATH, '//div[contains(@class, "ant-modal-root")]//a[@title="Download"]')))
                download_btn.click()
                wait_for_download(download_dir)

                # Close the modal and wait until it is gone.
                close_btn = driver.find_element(By.CLASS_NAME, "ant-modal-close")
                close_btn.click()
                wait.until_not(EC.visibility_of_element_located(modal_locator))

                if is_file_downloaded(download_dir, date):
                    logging.info(f"Download für {day_label} erfolgreich.")
                    break
                logging.warning(f"Keine Datei für {day_label} nach Versuch {attempt} gefunden.")
            except Exception as e:
                logging.error(f"Fehler beim Download für {day_label} Versuch {attempt}: {e}")
        else:
            # Reached only when no attempt ended with `break`, i.e. every
            # attempt failed - by exception or because no file appeared.
            logging.warning(f"Download für {day_label} nach {max_retries} Versuchen nicht möglich.")
# Shut down the browser session at the end of the script.
# NOTE(review): this line is only reached when nothing above raised an
# uncaught exception; there is no try/finally around the script body.
driver.quit()