Skip to content

Commit 51bb41d

Browse files
feat: enhance competition data scraping to include timeline and awards
1 parent 7a353b5 commit 51bb41d

File tree

1 file changed

+106
-0
lines changed

1 file changed

+106
-0
lines changed

be-scraper-fastapi/app/services/repo_additional/bulk_competition_service.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
get_competition_description,
1111
get_competition_image_link,
1212
get_competition_application_link)
13+
from app.services.scrape.competitions.scrape import (
14+
get_competition_timeline,
15+
get_competition_awards,
16+
get_session_id_for_specific_year)
17+
from app.services.repo_additional.competition_crud_services import update_or_create_competition_data
1318
from app.services.unify.lists import (
1419
ar_names_list, tr_names_list, en_names_list,
1520
tr_links_list, en_links_list, min_members_list, max_members_list,
@@ -562,6 +567,53 @@ def bulk_create_update_competitions_multilingual(source: str = "lists", year: st
562567
if existing_competition.id is not None:
563568
competition_crud_class.update_competition(existing_competition.id, existing_competition)
564569
results['updated'] += 1
570+
571+
# Scrape timeline and awards data for competition_data
572+
competition_id = existing_competition.id
573+
try:
574+
timeline = None
575+
awards = None
576+
577+
# Try scraping from EN link first (usually more complete)
578+
if en_url:
579+
try:
580+
response = requests.get(en_url, timeout=10)
581+
response.raise_for_status()
582+
soup = BeautifulSoup(response.text, 'html.parser')
583+
timeline = get_competition_timeline(soup)
584+
awards = get_competition_awards(soup)
585+
except Exception as e:
586+
if __name__ == "__main__":
587+
print(f" Could not scrape from EN link: {e}")
588+
589+
# Fall back to the TR link when EN was unavailable, failed, or left timeline or awards empty; only the still-missing pieces are filled in
590+
if (not timeline or not awards) and tr_url:
591+
try:
592+
response = requests.get(tr_url, timeout=10)
593+
response.raise_for_status()
594+
soup = BeautifulSoup(response.text, 'html.parser')
595+
if not timeline:
596+
timeline = get_competition_timeline(soup)
597+
if not awards:
598+
awards = get_competition_awards(soup)
599+
except Exception as e:
600+
if __name__ == "__main__":
601+
print(f" Could not scrape from TR link: {e}")
602+
603+
# Update competition data if we have timeline or awards
604+
if timeline or awards:
605+
update_or_create_competition_data(
606+
competition_id=competition_id,
607+
year=year,
608+
timeline=timeline,
609+
awards=awards
610+
)
611+
if __name__ == "__main__":
612+
print(f" ✓ Updated competition data")
613+
except Exception as e:
614+
if __name__ == "__main__":
615+
print(f" Warning: Could not update competition data: {e}")
616+
565617
results['details'].append({
566618
'index': idx,
567619
'identifier': identifier,
@@ -579,6 +631,60 @@ def bulk_create_update_competitions_multilingual(source: str = "lists", year: st
579631
# Create new competition
580632
competition_crud_class.create_competition(competition)
581633
results['created'] += 1
634+
635+
# Get the created competition's ID and scrape timeline/awards data
636+
competition_id = None
637+
if competition.tk_number:
638+
# Retrieve the newly created competition to get its ID
639+
created_competition = competition_crud_class.get_competition_by_tk_number(competition.tk_number)
640+
if created_competition:
641+
competition_id = created_competition.id
642+
643+
if competition_id:
644+
try:
645+
timeline = None
646+
awards = None
647+
648+
# Try scraping from EN link first (usually more complete)
649+
if en_url:
650+
try:
651+
response = requests.get(en_url, timeout=10)
652+
response.raise_for_status()
653+
soup = BeautifulSoup(response.text, 'html.parser')
654+
timeline = get_competition_timeline(soup)
655+
awards = get_competition_awards(soup)
656+
except Exception as e:
657+
if __name__ == "__main__":
658+
print(f" Could not scrape from EN link: {e}")
659+
660+
# Fall back to the TR link when EN was unavailable, failed, or left timeline or awards empty; only the still-missing pieces are filled in
661+
if (not timeline or not awards) and tr_url:
662+
try:
663+
response = requests.get(tr_url, timeout=10)
664+
response.raise_for_status()
665+
soup = BeautifulSoup(response.text, 'html.parser')
666+
if not timeline:
667+
timeline = get_competition_timeline(soup)
668+
if not awards:
669+
awards = get_competition_awards(soup)
670+
except Exception as e:
671+
if __name__ == "__main__":
672+
print(f" Could not scrape from TR link: {e}")
673+
674+
# Create competition data if we have timeline or awards
675+
if timeline or awards:
676+
update_or_create_competition_data(
677+
competition_id=competition_id,
678+
year=year,
679+
timeline=timeline,
680+
awards=awards
681+
)
682+
if __name__ == "__main__":
683+
print(f" ✓ Created competition data")
684+
except Exception as e:
685+
if __name__ == "__main__":
686+
print(f" Warning: Could not create competition data: {e}")
687+
582688
results['details'].append({
583689
'index': idx,
584690
'identifier': identifier,

0 commit comments

Comments
 (0)