
Commit c73807e

committed
prompt: Based on the implementation in #file:test_visasq.py, add the logic to #file:visasq_scraper.py as a standalone, runnable script.
1 parent c6a8cc8 commit c73807e

File tree

2 files changed: +103 additions, -89 deletions

scripts/visasq_scraper.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
import asyncio
import csv
import datetime
import os

from playwright.async_api import Page, async_playwright


async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
    """Helper that writes the scraped entries to a CSV file."""
    # Create the assets directory if it does not already exist
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = [
            "id",
            "url",
            "title",
            "description",
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in entries:
            writer.writerow(entry)


async def retrieve_visasq_entries(page: Page, url: str):
    entries = []
    await page.goto(url)
    await page.wait_for_load_state("networkidle")

    # Anchor elements whose href follows the /issue/12345/ pattern
    for entry in await page.query_selector_all("a[href^='/issue/']"):
        url = await entry.get_attribute("href")

        # Title element inside the h3 tag
        h3_element = await entry.query_selector("h3")

        # Description element inside <p class="description-regular-14">
        p_element = await entry.query_selector("p.description-regular-14")

        # Only proceed when both h3_element and p_element exist
        if not h3_element or not p_element:
            continue

        # Append the entry to the list as a dict
        entries.append(
            {
                "id": url.split("/")[-2],  # Extract the ID from the URL
                "url": url,
                "title": await h3_element.inner_text() if h3_element else "",
                "description": await p_element.inner_text() if p_element else "",
            }
        )
    return entries


async def main():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        BASE_URL = "https://expert.visasq.com"
        all_entries = []
        max_page = 15

        try:
            for page_number in range(1, max_page + 1):
                print(f"Retrieving entries from page {page_number}...")
                entries = await retrieve_visasq_entries(
                    page=page,
                    url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page={page_number}",
                )

                # Convert each entry's url to an absolute URL
                for entry in entries:
                    entry["url"] = f"{BASE_URL}{entry['url']}"

                all_entries.extend(entries)
                print(f"Found {len(entries)} entries on page {page_number}")
                if len(entries) == 0:
                    print("No more entries found, stopping the scrape.")
                    break
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            await browser.close()

        # Include the current date and time in the output file name
        now = datetime.datetime.now()
        filepath = "assets/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"

        await dump_csv(
            entries=all_entries,
            filepath=filepath,
        )

        print(f"Scraping completed. Total entries: {len(all_entries)}")
        print(f"Results saved to: {filepath}")


if __name__ == "__main__":
    asyncio.run(main())

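One likely way to run the script (assuming Playwright is installed with "pip install playwright" and the Chromium build with "playwright install chromium") is "python scripts/visasq_scraper.py". As a quick sanity check of the output, the timestamped CSV can be read back with the standard library; the sketch below is illustrative only, and the file name shown is hypothetical since an actual run embeds the current timestamp.

import csv

# Hypothetical path: a real run embeds the current date and time in the file name
filepath = "assets/visasq_entries_20240101_120000.csv"

with open(filepath, newline="", encoding="utf-8") as csvfile:
    # Columns match the fieldnames written by dump_csv: id, url, title, description
    for row in csv.DictReader(csvfile):
        print(row["id"], row["url"], row["title"])
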
tests/test_visasq.py

Lines changed: 0 additions & 89 deletions
This file was deleted.
