Skip to content

Commit 7a353b5

Browse files
refactor: enhance competition scraping functions and unify data handling
1 parent b2f2942 commit 7a353b5

File tree

1 file changed

+90
-101
lines changed

1 file changed

+90
-101
lines changed

be-scraper-fastapi/app/services/repo_additional/bulk_competition_service.py

Lines changed: 90 additions & 101 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,15 @@
11
"""
2-
Service for bulk creating/updating competitions with multilingual data
2+
Service for bulk creating/updating competitions with multilingual info
33
"""
44
from typing import Optional
55
from app.repositories import d1_competition_crud as competition_crud
66
from app.services.scrape.competitions import links_service
77
from app.services.unify.function import find_original_sentence
8+
from app.services.scrape.competitions.scrape import (
9+
get_competition_name,
10+
get_competition_description,
11+
get_competition_image_link,
12+
get_competition_application_link)
813
from app.services.unify.lists import (
914
ar_names_list, tr_names_list, en_names_list,
1015
tr_links_list, en_links_list, min_members_list, max_members_list,
@@ -17,56 +22,12 @@
1722
from datetime import datetime
1823

1924

20-
def get_competition_name(soup):
21-
"""Extract competition name from BeautifulSoup object"""
25+
def scrape_competition_info(link: str):
26+
"""Scrape competition info from a single link"""
2227
try:
23-
competition_name = soup.find('div', class_='container').find('h1').text.strip()
24-
return competition_name
25-
except:
26-
return None
27-
28-
29-
def get_competition_description(soup):
    """Extract the competition description text from a parsed page.

    Args:
        soup: BeautifulSoup document for a competition detail page.

    Returns:
        The stripped text of the first tab content panel
        (``div#tabsNavigation1``), or ``None`` when the panel is
        missing or its text is empty.
    """
    try:
        # The description lives in the first tab content panel.
        description = soup.find('div', id='tabsNavigation1').text.strip()
    except AttributeError:
        # find() returned None (panel absent) or soup is not a parsed
        # document -- treat as "no description" instead of a bare
        # except that would also swallow unrelated errors.
        return None
    return description if description else None
38-
39-
40-
def get_competition_image_link(soup):
    """Extract the competition image URL from a parsed page.

    Args:
        soup: BeautifulSoup document for a competition detail page.

    Returns:
        An absolute image URL -- relative ``src`` paths are resolved
        against ``https://teknofest.org`` -- or ``None`` when the tab
        panel, the ``<img>`` element, or its ``src`` attribute is
        missing.
    """
    # Local import kept from the original; hoisted out of the try so an
    # import failure is never silently swallowed.
    from urllib.parse import unquote

    try:
        image_element = soup.find('div', id='tabsNavigation1').find('img')
    except AttributeError:
        # Panel absent (find() returned None) -- no image to report.
        return None
    if image_element is None:
        return None

    img_src = image_element.get('src')
    if not img_src:
        return None

    # src values are sometimes percent-encoded; decode before use.
    img_src = unquote(img_src)
    # Make site-relative paths absolute.
    return img_src if img_src.startswith('http') else f"https://teknofest.org{img_src}"
56-
57-
58-
def get_competition_application_link(soup):
    """Extract the application link from a parsed page.

    Args:
        soup: BeautifulSoup document for a competition detail page.

    Returns:
        The ``href`` of the first anchor inside ``div#tabsNavigation1``,
        or ``None`` when the panel or anchor is missing, or the anchor
        has no ``href`` attribute.
    """
    try:
        # First <a> inside the tab panel points at the application form.
        return soup.find('div', id='tabsNavigation1').find('a')['href']
    except (AttributeError, TypeError, KeyError):
        # AttributeError: panel missing; TypeError: no <a> found
        # (None['href']); KeyError: anchor without an href attribute.
        # Narrowed from the original bare except so unrelated errors
        # are no longer swallowed.
        return None
65-
28+
if __name__ == "__main__":
29+
print(f" Scraping competition info from {link}")
6630

67-
def scrape_competition_data(link: str):
68-
"""Scrape competition data from a single link"""
69-
try:
7031
response = requests.get(link, timeout=10)
7132
response.raise_for_status()
7233
soup = BeautifulSoup(response.text, 'html.parser')
@@ -77,6 +38,11 @@ def scrape_competition_data(link: str):
7738
application_link = get_competition_application_link(soup)
7839

7940
print(f" Scraped: {name or 'N/A'}")
41+
42+
if __name__ == "__main__":
43+
print(f" Description: {description[:100] + '...' if description else 'N/A'}")
44+
print(f" Image Link: {image_link or 'N/A'}")
45+
print(f" Application Link: {application_link or 'N/A'}")
8046

8147
return {
8248
'name': name,
@@ -214,9 +180,9 @@ def find_competition_in_db(tr_name: Optional[str] = None, en_name: Optional[str]
214180
return None
215181

216182

217-
def merge_competition_data(idx: int, tr_data: dict, en_data: dict):
183+
def merge_competition_info(idx: int, tr_info: dict, en_info: dict):
218184
"""
219-
Merge competition data from Turkish and English sources along with predefined list data
185+
Merge competition info from Turkish and English sources along with predefined list info
220186
into a single Competition object
221187
"""
222188
competition = Competition()
@@ -238,7 +204,7 @@ def merge_competition_data(idx: int, tr_data: dict, en_data: dict):
238204
if idx < len(t3kys_number_list) and t3kys_number_list[idx] is not None:
239205
competition.t3kys_number = str(t3kys_number_list[idx])
240206

241-
# Use predefined data from lists (these are already properly matched by index)
207+
# Use predefined info from lists (these are already properly matched by index)
242208
if idx < len(tr_names_list):
243209
competition.tr_name = tr_names_list[idx].strip()
244210
if idx < len(en_names_list):
@@ -247,9 +213,9 @@ def merge_competition_data(idx: int, tr_data: dict, en_data: dict):
247213
competition.ar_name = ar_names_list[idx].strip()
248214

249215
if idx < len(tr_links_list):
250-
competition.tr_link = f"https://teknofest.org/tr/yarismalar/{tr_links_list[idx].strip()}/"
216+
competition.tr_link = f"{tr_links_list[idx].strip()}/"
251217
if idx < len(en_links_list):
252-
competition.en_link = f"https://teknofest.org/en/competitions/{en_links_list[idx].strip()}/"
218+
competition.en_link = f"{en_links_list[idx].strip()}/"
253219

254220
# Set min and max members from CSV if available
255221
if idx < len(min_members_list) and min_members_list[idx] is not None:
@@ -258,22 +224,22 @@ def merge_competition_data(idx: int, tr_data: dict, en_data: dict):
258224
competition.max_member = max_members_list[idx]
259225

260226
# Add scraped descriptions if available
261-
if en_data.get('description'):
262-
competition.en_description = en_data['description']
263-
if tr_data.get('description'):
264-
competition.tr_description = tr_data['description']
227+
if en_info.get('description'):
228+
competition.en_description = en_info['description']
229+
if tr_info.get('description'):
230+
competition.tr_description = tr_info['description']
265231

266232
# Add scraped application links if available
267-
if en_data.get('application_link'):
268-
competition.application_link_en = en_data['application_link']
269-
if tr_data.get('application_link'):
270-
competition.application_link_tr = tr_data['application_link']
233+
if en_info.get('application_link'):
234+
competition.application_link_en = en_info['application_link']
235+
if tr_info.get('application_link'):
236+
competition.application_link_tr = tr_info['application_link']
271237

272238
# Use image from whichever source has it
273-
if en_data.get('image_link'):
274-
competition.image_path = en_data['image_link']
275-
elif tr_data.get('image_link'):
276-
competition.image_path = tr_data['image_link']
239+
if en_info.get('image_link'):
240+
competition.image_path = en_info['image_link']
241+
elif tr_info.get('image_link'):
242+
competition.image_path = tr_info['image_link']
277243

278244
return competition
279245

@@ -324,12 +290,12 @@ def bulk_create_update_competitions_from_remote(year: str = None):
324290
print(f"Found {len(en_links)} English competition links")
325291

326292
except Exception as e:
327-
print(f"Error fetching competition data: {str(e)}")
293+
print(f"Error fetching competition info: {str(e)}")
328294
return {
329295
'created': 0,
330296
'updated': 0,
331297
'failed': 1,
332-
'details': [{'error': f'Failed to fetch competition data: {str(e)}'}]
298+
'details': [{'error': f'Failed to fetch competition info: {str(e)}'}]
333299
}
334300

335301
# Match TR and EN competitions by position
@@ -353,41 +319,41 @@ def bulk_create_update_competitions_from_remote(year: str = None):
353319

354320
print(f"\n({idx+1}/{max_competitions}): {identifier}")
355321

356-
# Scrape data from each language version (with session cookie if available)
322+
# Scrape info from each language version (with session cookie if available)
357323
if tr_link:
358324
response = requests.get(tr_link, cookies=session_cookies, timeout=10) if session_cookies else requests.get(tr_link, timeout=10)
359325
tr_soup = BeautifulSoup(response.content, 'html.parser')
360-
tr_data = {
326+
tr_info = {
361327
'name': get_competition_name(tr_soup),
362328
'description': get_competition_description(tr_soup),
363329
'image_link': get_competition_image_link(tr_soup),
364330
'application_link': get_competition_application_link(tr_soup),
365331
'link': tr_link
366332
}
367333
else:
368-
tr_data = {}
334+
tr_info = {}
369335

370336
if en_link:
371337
response = requests.get(en_link, cookies=session_cookies, timeout=10) if session_cookies else requests.get(en_link, timeout=10)
372338
en_soup = BeautifulSoup(response.content, 'html.parser')
373-
en_data = {
339+
en_info = {
374340
'name': get_competition_name(en_soup),
375341
'description': get_competition_description(en_soup),
376342
'image_link': get_competition_image_link(en_soup),
377343
'application_link': get_competition_application_link(en_soup),
378344
'link': en_link
379345
}
380346
else:
381-
en_data = {}
347+
en_info = {}
382348

383349
# Create competition object
384350
competition = Competition()
385351

386-
# Set names from scraped data
387-
if tr_data.get('name'):
388-
competition.tr_name = tr_data['name']
389-
if en_data.get('name'):
390-
competition.en_name = en_data['name']
352+
# Set names from scraped info
353+
if tr_info.get('name'):
354+
competition.tr_name = tr_info['name']
355+
if en_info.get('name'):
356+
competition.en_name = en_info['name']
391357

392358
# Set links
393359
if tr_link:
@@ -396,22 +362,22 @@ def bulk_create_update_competitions_from_remote(year: str = None):
396362
competition.en_link = en_link
397363

398364
# Set descriptions
399-
if tr_data.get('description'):
400-
competition.tr_description = tr_data['description']
401-
if en_data.get('description'):
402-
competition.en_description = en_data['description']
365+
if tr_info.get('description'):
366+
competition.tr_description = tr_info['description']
367+
if en_info.get('description'):
368+
competition.en_description = en_info['description']
403369

404370
# Set application links
405-
if tr_data.get('application_link'):
406-
competition.application_link_tr = tr_data['application_link']
407-
if en_data.get('application_link'):
408-
competition.application_link_en = en_data['application_link']
371+
if tr_info.get('application_link'):
372+
competition.application_link_tr = tr_info['application_link']
373+
if en_info.get('application_link'):
374+
competition.application_link_en = en_info['application_link']
409375

410376
# Set image
411-
if en_data.get('image_link'):
412-
competition.image_path = en_data['image_link']
413-
elif tr_data.get('image_link'):
414-
competition.image_path = tr_data['image_link']
377+
if en_info.get('image_link'):
378+
competition.image_path = en_info['image_link']
379+
elif tr_info.get('image_link'):
380+
competition.image_path = tr_info['image_link']
415381

416382
# set id
417383

@@ -429,7 +395,7 @@ def bulk_create_update_competitions_from_remote(year: str = None):
429395
# Create or update
430396
if existing_competition:
431397
print(f" Found existing competition (ID: {existing_competition.id})")
432-
# Merge with existing data
398+
# Merge with existing info
433399
for field in ['tr_name', 'tr_description', 'tr_link', 'en_name', 'en_description', 'en_link',
434400
'image_path', 'application_link_tr', 'application_link_en']:
435401
new_value = getattr(competition, field)
@@ -485,14 +451,17 @@ def bulk_create_update_competitions_from_remote(year: str = None):
485451

486452
def bulk_create_update_competitions_multilingual(source: str = "lists", year: str = None):
487453
"""
488-
Create or update all competitions in the database with multilingual data.
454+
Create or update all competitions in the database with multilingual info.
489455
490456
Args:
491457
source: 'lists' for local CSV or 'remote' for website scraping
492458
year: Competition year (used for tracking which years competitions are held)
493459
494460
Returns a summary of the operation.
495461
"""
462+
if __name__ == "__main__":
463+
print(f"Running bulk_create_update_competitions_multilingual with source='{source}' and year='{year}'")
464+
496465
if source == "remote":
497466
return bulk_create_update_competitions_from_remote(year=year)
498467

@@ -523,28 +492,36 @@ def bulk_create_update_competitions_multilingual(source: str = "lists", year: st
523492

524493
for idx in range(max_competitions):
525494
try:
526-
# Get data from predefined lists
495+
# Get info from predefined lists
527496
tr_name = tr_names_list[idx].strip() if idx < len(tr_names_list) else None
528497
en_name = en_names_list[idx].strip() if idx < len(en_names_list) else None
529498
ar_name = ar_names_list[idx].strip() if idx < len(ar_names_list) else None
530499
tr_link = tr_links_list[idx].strip() if idx < len(tr_links_list) else None
531500
en_link = en_links_list[idx].strip() if idx < len(en_links_list) else None
532501

533502
# Build full URLs
534-
tr_url = f"https://teknofest.org/tr/yarismalar/{tr_link}/" if tr_link else None
535-
en_url = f"https://teknofest.org/en/competitions/{en_link}/" if en_link else None
503+
tr_url = f"{tr_link}/" if tr_link else None
504+
en_url = f"{en_link}/" if en_link else None
505+
506+
if __name__ == "__main__":
507+
print(f"\nProcessing competition index {idx}:")
508+
print(f" TR Name: {tr_name}")
509+
print(f" TR Link: {tr_url}")
510+
print(f" EN Name: {en_name}")
511+
print(f" EN Link: {en_url}")
512+
print(f" AR Name: {ar_name}")
536513

537514
# Generate identifier from available names
538515
identifier = en_name or tr_name or ar_name or f"competition_{idx}"
539516

540517
print(f"\n({idx+1}/{max_competitions}): {identifier}")
541518

542-
# Scrape data from each language version for descriptions and images
543-
tr_data = scrape_competition_data(tr_url) if tr_url else {}
544-
en_data = scrape_competition_data(en_url) if en_url else {}
519+
# Scrape info from each language version for descriptions and images
520+
tr_info = scrape_competition_info(tr_url) if tr_url else {}
521+
en_info = scrape_competition_info(en_url) if en_url else {}
545522

546-
# Merge the data (passing index to use list data)
547-
competition = merge_competition_data(idx, tr_data, en_data)
523+
# Merge the info (passing index to use list info)
524+
competition = merge_competition_info(idx, tr_info, en_info)
548525

549526
# Add year to years list
550527
competition.years = [year]
@@ -559,10 +536,18 @@ def bulk_create_update_competitions_multilingual(source: str = "lists", year: st
559536
en_link=en_url
560537
)
561538

539+
if __name__ == "__main__":
540+
if existing_competition:
541+
print(f" Found existing competition in DB (ID: {existing_competition.id})")
542+
continue
543+
else:
544+
print(f" No existing competition found in DB")
545+
continue
546+
562547
# Create or update
563548
if existing_competition:
564549
print(f" Found existing competition (ID: {existing_competition.id})")
565-
# Merge with existing data, preserving fields not set in new data
550+
# Merge with existing info, preserving fields not set in new info
566551
for field in ['tr_name', 'tr_description', 'tr_link', 'en_name', 'en_description', 'en_link',
567552
'ar_name', 'ar_description', 'ar_link', 'image_path', 'min_member', 'max_member',
568553
'application_link_tr', 'application_link_en']:
@@ -621,3 +606,7 @@ def bulk_create_update_competitions_multilingual(source: str = "lists", year: st
621606
print(f"{'='*60}\n")
622607

623608
return results
609+
610+
if __name__ == "__main__":
611+
# Example usage: bulk create/update competitions from lists for current year
612+
bulk_create_update_competitions_multilingual(source="lists", year="2026")

0 commit comments

Comments (0)