
Commit 9403f17

add visasq issues scraper
1 parent 7e03e04 commit 9403f17

File tree

3 files changed: +90 −0 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -169,3 +169,4 @@ tests/test_codegen.py
 !.streamlit/secrets.toml.example
 playwright/.auth/*
 !playwright/.auth/.gitkeep
+assets/visasq_entries_*.csv

assets/.gitkeep

Whitespace-only changes.

tests/test_visasq.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import csv
import datetime

import pytest

from playwright.sync_api import Page
from tests import flags


def dump_csv(entries, filepath="assets/visasq_entries.csv"):
    """Helper function that writes the entries to a CSV file."""
    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = [
            "id",
            "url",
            "title",
            "description",
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in entries:
            writer.writerow(entry)


def retrieve_visasq_entries(page: Page, url: str):
    entries = []
    page.goto(url)
    page.wait_for_load_state("networkidle")

    # Anchor elements whose href follows a pattern like /issue/12345/
    for entry in page.query_selector_all("a[href^='/issue/']"):
        url = entry.get_attribute("href")

        # Get the element inside the h3 tag
        h3_element = entry.query_selector("h3")

        # Get the element with p class=description-regular-14
        p_element = entry.query_selector("p.description-regular-14")

        # Continue only when both h3_element and p_element exist
        if not h3_element or not p_element:
            continue

        # Append the entry to the list as a dict
        entries.append(
            {
                "id": url.split("/")[-2],  # Extract the ID from the URL
                "url": url,
                "title": h3_element.inner_text() if h3_element else "",
                "description": p_element.inner_text() if p_element else "",
            }
        )
    return entries


@pytest.mark.skipif(flags.SKIP, reason="This test is just a sample scraper and is skipped by default.")
def test_visasq_entries(page: Page):
    BASE_URL = "https://expert.visasq.com"
    all_entries = []
    max_page = 15
    try:
        for page_number in range(1, max_page + 1):
            print(f"Retrieving entries from page {page_number}...")
            entries = retrieve_visasq_entries(
                page=page,
                url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page={page_number}",
            )
            # Convert each entry's url to an absolute URL
            for entry in entries:
                entry["url"] = f"{BASE_URL}{entry['url']}"

            # print(
            #     json.dumps(
            #         entries,
            #         indent=2,
            #         ensure_ascii=False,
            #     )
            # )

            all_entries.extend(entries)
    except Exception as e:
        print(f"An error occurred at page {page_number}: {e}")

    now = datetime.datetime.now()
    filepath = "assets/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
    dump_csv(
        entries=all_entries,
        filepath=filepath,
    )
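
As a usage sketch (not part of this commit), the same helpers could be driven outside the pytest runner, assuming the tests package is importable from the project root and Playwright's Chromium browser is installed; the output filename used here is hypothetical:

# Minimal sketch, assuming tests.test_visasq is importable and Playwright
# browsers are installed; the output filename below is hypothetical.
from playwright.sync_api import sync_playwright

from tests.test_visasq import dump_csv, retrieve_visasq_entries

BASE_URL = "https://expert.visasq.com"

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    # Fetch only the first listing page of currently open issues
    entries = retrieve_visasq_entries(
        page=page,
        url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page=1",
    )
    # Convert relative issue URLs to absolute ones, as the test does
    for entry in entries:
        entry["url"] = f"{BASE_URL}{entry['url']}"
    dump_csv(entries, filepath="assets/visasq_entries_manual.csv")
    browser.close()

The timestamped CSV files that the test itself writes land under assets/, which the .gitignore change above excludes from version control.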
