
Commit b3dd4a6

Merge pull request #17 from ks6088ts-labs/feature/issue-16_scraping-visasq
scraping visasq
2 parents 9403f17 + f56e2b8 commit b3dd4a6

File tree

7 files changed: +1670 −1419 lines changed

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
name: scrape-visasq

on:
  push:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * 3" # Every Wednesday at 00:00 UTC

jobs:
  scrape-visasq:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Docker Login
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Run Docker container and generate output file
        run: |
          mkdir -p assets
          echo "test" > assets/test.txt
          docker run \
            --rm \
            -v $(pwd)/assets:/app/assets \
            ks6088ts/workshop-playwright-python:latest \
            python scripts/visasq.py scrape --max-page 20

      - name: Upload output file as artifact
        uses: actions/upload-artifact@v4
        with:
          name: assets
          path: ./assets/
          retention-days: 14

Dockerfile

Lines changed: 3 additions & 1 deletion
@@ -19,6 +19,8 @@ COPY --from=requirements-stage /tmp/requirements.txt /app/requirements.txt
 COPY . .

 # Install dependencies
-RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt && \
+    playwright install --with-deps

 CMD ["python", "workshop_playwright_python/core.py"]
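
Why this Dockerfile change is needed: installing the playwright package via requirements.txt does not download any browser binaries, so `playwright install --with-deps` is run at build time to fetch Chromium and its system dependencies. A minimal smoke check along these lines (an illustrative sketch, not part of this commit) would fail in the old image because `chromium.launch()` cannot find a browser executable, and passes once the browsers are baked into the image:

# smoke_check.py -- illustrative sketch, not part of this commit.
# Confirms that the Chromium binaries installed via `playwright install --with-deps`
# are usable inside the image.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()   # raises if browser binaries are missing
    page = browser.new_page()
    page.goto("https://example.com")
    print(page.title())             # "Example Domain"
    browser.close()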

Makefile

Lines changed: 8 additions & 1 deletion
@@ -57,7 +57,7 @@ update: ## update packages
 # ---
 DOCKER_REPO_NAME ?= ks6088ts
 DOCKER_IMAGE_NAME ?= workshop-playwright-python
-DOCKER_COMMAND ?=
+DOCKER_COMMAND ?= python scripts/visasq.py scrape --help

 # Tools
 TOOLS_DIR ?= /usr/local/bin
@@ -88,6 +88,13 @@ docker-scan: ## scan Docker image
 .PHONY: ci-test-docker
 ci-test-docker: docker-lint docker-build docker-scan docker-run ## run CI test for Docker

+.PHONY: docker-visasq-scrape
+docker-visasq-scrape: ## scrape visasq entries using Docker
+	docker run --rm \
+		-v $(PWD)/assets:/app/assets \
+		$(DOCKER_REPO_NAME)/$(DOCKER_IMAGE_NAME):$(GIT_TAG) \
+		python scripts/visasq.py scrape --max-page 20
+
 # ---
 # Docs
 # ---

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@ dependencies = [
     "pytest-playwright>=0.7.0",
     "streamlit>=1.45.0",
     "streamlit-authenticator>=0.4.2",
+    "typer>=0.16.0",
 ]

 [project.optional-dependencies]

scripts/visasq.py

Lines changed: 137 additions & 0 deletions
@@ -0,0 +1,137 @@
import asyncio
import csv
import datetime
import os

import typer

from playwright.async_api import Page, async_playwright

app = typer.Typer(help="VisaSQ scraper CLI tool")


async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
    """Helper function that writes the entries to a CSV file."""
    # Create the assets directory if it does not exist
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = [
            "id",
            "url",
            "title",
            "description",
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in entries:
            writer.writerow(entry)


async def retrieve_visasq_entries(page: Page, url: str):
    entries = []
    print(f"Retrieving entries from {url}...")
    await page.goto(url)
    await page.wait_for_load_state("networkidle")

    # Anchor elements whose href follows a pattern like /issue/12345/
    for entry in await page.query_selector_all("a[href^='/issue/']"):
        url = await entry.get_attribute("href")

        # Element inside the h3 tag
        h3_element = await entry.query_selector("h3")

        # Element inside p class=description-regular-14
        p_element = await entry.query_selector("p.description-regular-14")

        # Only continue when both h3_element and p_element exist
        if not h3_element or not p_element:
            continue

        # Append the entry to the list as a dict
        entries.append(
            {
                "id": url.split("/")[-2],  # Extract the ID from the URL
                "url": url,
                "title": await h3_element.inner_text() if h3_element else "",
                "description": await p_element.inner_text() if p_element else "",
            }
        )
    return entries


async def run_scraper(
    base_url: str,
    max_page: int,
    keyword: str = "",
    is_started_only: bool = True,
    output_dir: str = "assets",
):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        all_entries = []

        try:
            for page_number in range(1, max_page + 1):
                print(f"Retrieving entries from page {page_number}...")

                # Add the keyword and filter conditions to the URL
                url = f"{base_url}/issue/?keyword={keyword}&is_started_only={'true' if is_started_only else 'false'}&page={page_number}"  # noqa: E501

                entries = await retrieve_visasq_entries(page=page, url=url)

                # Convert entry URLs to absolute URLs
                for entry in entries:
                    entry["url"] = f"{base_url}{entry['url']}"

                all_entries.extend(entries)
                print(f"Found {len(entries)} entries on page {page_number}")
                if len(entries) == 0:
                    print("No more entries found, stopping the scrape.")
                    break
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            await browser.close()

        # Include the current date and time in the file name
        now = datetime.datetime.now()
        filepath = f"{output_dir}/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"

        await dump_csv(entries=all_entries, filepath=filepath)

        print(f"Scraping completed. Total entries: {len(all_entries)}")
        print(f"Results saved to: {filepath}")

        return all_entries


@app.command()
def scrape(
    max_page: int = typer.Option(15, "--max-page", "-m", help="Maximum number of pages to scrape"),
    keyword: str = typer.Option("", "--keyword", "-k", help="Search keyword"),
    is_started_only: bool = typer.Option(
        True, "--started-only/--not-started-only", help="Whether to show only issues that are still open"
    ),
    base_url: str = typer.Option("https://expert.visasq.com", "--base-url", "-u", help="Base URL of VisaSQ"),
    output_dir: str = typer.Option("assets", "--output-dir", "-o", help="Output directory"),
):
    """
    Scrape data from VisaSQ and save it to a CSV file
    """
    typer.echo(f"Starting the scrape. Maximum number of pages: {max_page}")
    asyncio.run(run_scraper(base_url, max_page, keyword, is_started_only, output_dir))


@app.callback()
def callback():
    """
    A tool that scrapes issue information from the VisaSQ website
    """
    pass


if __name__ == "__main__":
    app()
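
Usage sketch (not part of the commit): the scraper is normally driven through the Typer CLI, as the workflow and Makefile do with `python scripts/visasq.py scrape --max-page 20`, but `run_scraper` can also be called directly. The import path below is an assumption for illustration; it presumes the snippet runs from the repository root and adds `scripts/` to `sys.path`.

# run_visasq_sketch.py -- illustrative only; assumes execution from the repository
# root so that scripts/visasq.py is importable after the sys.path tweak below.
import asyncio
import sys

sys.path.insert(0, "scripts")  # hypothetical path adjustment, not used in the repo
from visasq import run_scraper

entries = asyncio.run(
    run_scraper(
        base_url="https://expert.visasq.com",
        max_page=3,            # keep the page count small for a quick local run
        keyword="",            # empty keyword means no search filter
        is_started_only=True,  # only issues that are still open
        output_dir="assets",   # CSV lands in assets/visasq_entries_<timestamp>.csv
    )
)
print(f"Scraped {len(entries)} entries")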

tests/test_visasq.py

Lines changed: 0 additions & 89 deletions
This file was deleted.
