web_scraping.py
import re
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
from datetime import datetime, timezone
from upload_raw_to_bigquery import upload_to_bigquery

with sync_playwright() as p:
    # 1. Select a browser engine
    browser = p.chromium.launch(headless=True)  # headless=True is the default; set headless=False for debugging
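    # Playwright also ships Firefox and WebKit engines; as a sketch, either line
    # below could replace the Chromium launch (launch() takes the same arguments):
    # browser = p.firefox.launch(headless=True)
    # browser = p.webkit.launch(headless=True)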
    page = browser.new_page()

    # 2. Go to the page and wait for the table to appear
    page.goto("https://growthlist.co/india-startups")
    # Wait for the table wrapper to ensure the JS has rendered the table
    page.wait_for_selector('div[class*="ninja_table_wrapper"]')

    # 3. Interaction (do this BEFORE handing the HTML to BeautifulSoup)
    # Target the pagination dropdown and select '100' to load more rows into the DOM
    tfoot_locator = page.locator('div[class*="ninja_table_wrapper"]').locator("tfoot")
    dropdown = page.locator("select.nt_pager_selection")
    dropdown.select_option("100")
    # Wait a moment for the table to reload with 100 items
    page.wait_for_timeout(2000)
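    # A fixed timeout can be flaky on slow connections. As a sketch, the
    # commented-out wait below would instead poll the DOM until the tbody holds
    # more rows than an assumed initial page size (the threshold of 25 is an
    # assumption, not taken from the page):
    # page.wait_for_function("document.querySelectorAll('tbody tr').length > 25")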

    # 4. Parsing (capture the snapshot AFTER the interaction)
    page_content = page.content()
    soup = BeautifulSoup(page_content, 'html.parser')
    target_table = soup.select_one('table[data-unique_identifier^="ninja_table_unique_id_"]')

    scraped_data = []
    if target_table:
        # Record the scrape time in ISO 8601 format (UTC)
        timestamp = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
        # Get headers
        header_cells = target_table.find("thead").find_all("th")
        pattern = r'[^a-zA-Z0-9_\s]'  # strip invalid characters from the column names
        headers = [re.sub(pattern, "", th.get_text(strip=True)) for th in header_cells]
        # Get rows
        rows = target_table.find("tbody").find_all("tr")
        for row in rows:
            cells = row.find_all("td")
            # zip() pairs each header with its cell and stops at the shorter of the two
            item = {header: cell.get_text(strip=True) for header, cell in zip(headers, cells)}
            item["scraped_at_utc"] = timestamp
            scraped_data.append(item)

    # Close the page, then the browser, to end the connection
    page.close()
    browser.close()

if scraped_data:
    print(f'Found {len(scraped_data)} records. Uploading to BigQuery.')
    upload_to_bigquery(scraped_data)
    # Run the transformation immediately after the upload
    from transform_raw_data import run_transformations
    run_transformations()
else:
    print("No data found to upload via web scraping")