
Commit 1f4a90e

make docker-visasq-scrape to get issues

1 parent c73807e

File tree

3 files changed: +57 −15 lines

Dockerfile (1 addition, 0 deletions)

@@ -20,5 +20,6 @@ COPY . .
 
 # Install dependencies
 RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN playwright install --with-deps
 
 CMD ["python", "workshop_playwright_python/core.py"]
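
The pip step installs only Playwright's Python package; the added RUN playwright install --with-deps layer is what pulls the browser binaries and their OS-level libraries into the image, which is why it has to come after the requirements install. Below is a minimal smoke test, not part of this commit, that would confirm the bundled Chromium actually launches headlessly inside the container:

    # smoke_test.py (hypothetical, not in this commit): verify the browsers
    # installed by `playwright install --with-deps` can launch in the image.
    import asyncio

    from playwright.async_api import async_playwright


    async def check_browser() -> None:
        async with async_playwright() as p:
            # Launching fails fast if binaries or system deps are missing.
            browser = await p.chromium.launch()
            await browser.close()
            print("Chromium launched OK")


    if __name__ == "__main__":
        asyncio.run(check_browser())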

Makefile (8 additions, 1 deletion)

@@ -57,7 +57,7 @@ update: ## update packages
 # ---
 DOCKER_REPO_NAME ?= ks6088ts
 DOCKER_IMAGE_NAME ?= workshop-playwright-python
-DOCKER_COMMAND ?=
+DOCKER_COMMAND ?= python scripts/visasq.py scrape --help
 
 # Tools
 TOOLS_DIR ?= /usr/local/bin
@@ -88,6 +88,13 @@ docker-scan: ## scan Docker image
 .PHONY: ci-test-docker
 ci-test-docker: docker-lint docker-build docker-scan docker-run ## run CI test for Docker
 
+.PHONY: docker-visasq-scrape
+docker-visasq-scrape: ## scrape visasq entries using Docker
+	docker run --rm \
+		-v $(PWD)/assets:/app/assets \
+		$(DOCKER_REPO_NAME)/$(DOCKER_IMAGE_NAME):$(GIT_TAG) \
+		python scripts/visasq.py scrape --max-page 20
+
 # ---
 # Docs
 # ---
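
Because the new target bind-mounts $(PWD)/assets to /app/assets, the timestamped CSV the scraper writes inside the container persists on the host after docker run --rm exits. The new DOCKER_COMMAND default only prints the CLI help, so the generic docker-run target (which presumably consumes that variable) stays free of side effects.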

scripts/visasq_scraper.py renamed to scripts/visasq.py (48 additions, 14 deletions)

@@ -2,9 +2,13 @@
 import csv
 import datetime
 import os
+from typing import Optional
 
+import typer
 from playwright.async_api import Page, async_playwright
 
+app = typer.Typer(help="VisaSQ scraper CLI tool")
+
 
 async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
     """Helper function that saves entries to a CSV file"""
@@ -26,6 +30,7 @@ async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
 
 async def retrieve_visasq_entries(page: Page, url: str):
     entries = []
+    print(f"Retrieving entries from {url}...")
     await page.goto(url)
     await page.wait_for_load_state("networkidle")
 
@@ -55,26 +60,31 @@
     return entries
 
 
-async def main():
+async def run_scraper(
+    base_url: str,
+    max_page: int,
+    keyword: str = "",
+    is_started_only: bool = True,
+    output_dir: str = "assets",
+):
     async with async_playwright() as p:
         browser = await p.chromium.launch()
         page = await browser.new_page()
 
-        BASE_URL = "https://expert.visasq.com"
         all_entries = []
-        max_page = 15
 
         try:
             for page_number in range(1, max_page + 1):
                 print(f"Retrieving entries from page {page_number}...")
-                entries = await retrieve_visasq_entries(
-                    page=page,
-                    url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page={page_number}",
-                )
+
+                # Add the keyword and filter conditions to the URL
+                url = f"{base_url}/issue/?keyword={keyword}&is_started_only={'true' if is_started_only else 'false'}&page={page_number}"
+
+                entries = await retrieve_visasq_entries(page=page, url=url)
 
                 # Convert each entry's url to an absolute URL
                 for entry in entries:
-                    entry["url"] = f"{BASE_URL}{entry['url']}"
+                    entry["url"] = f"{base_url}{entry['url']}"
 
                 all_entries.extend(entries)
                 print(f"Found {len(entries)} entries on page {page_number}")
@@ -88,16 +98,40 @@ async def main():
 
         # Include the current date and time in the file name
         now = datetime.datetime.now()
-        filepath = "assets/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
+        filepath = f"{output_dir}/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
 
-        await dump_csv(
-            entries=all_entries,
-            filepath=filepath,
-        )
+        await dump_csv(entries=all_entries, filepath=filepath)
 
         print(f"Scraping completed. Total entries: {len(all_entries)}")
         print(f"Results saved to: {filepath}")
 
+        return all_entries
+
+
+@app.command()
+def scrape(
+    max_page: int = typer.Option(15, "--max-page", "-m", help="maximum number of pages to scrape"),
+    keyword: str = typer.Option("", "--keyword", "-k", help="search keyword"),
+    is_started_only: bool = typer.Option(
+        True, "--started-only/--not-started-only", help="whether to list only in-progress issues"
+    ),
+    base_url: str = typer.Option("https://expert.visasq.com", "--base-url", "-u", help="base URL for VisaSQ"),
+    output_dir: str = typer.Option("assets", "--output-dir", "-o", help="output directory"),
+):
+    """
+    Scrape data from VisaSQ and save it to a CSV file
+    """
+    typer.echo(f"Starting scraping. Max pages: {max_page}")
+    asyncio.run(run_scraper(base_url, max_page, keyword, is_started_only, output_dir))
+
+
+@app.callback()
+def callback():
+    """
+    Tool for scraping issue information from the VisaSQ website
+    """
+    pass
+
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    app()
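
With the Typer wiring above, the scrape command can be exercised in-process through Typer's test runner, which checks option parsing without Docker or network access. A minimal sketch; the visasq import path assumes scripts/ has been put on sys.path, which this commit does not itself arrange:

    # Hypothetical check, not part of this commit: drive the new CLI in-process.
    from typer.testing import CliRunner

    from visasq import app  # assumes scripts/ is on sys.path

    runner = CliRunner()

    # --help exercises argument parsing only; no pages are fetched.
    result = runner.invoke(app, ["scrape", "--help"])
    print(result.output)

One caveat the new URL construction leaves open: keyword is interpolated into the query string verbatim, so a keyword containing spaces or multi-byte characters yields a malformed URL. Percent-encoding it with urllib.parse.quote_plus before formatting would be the usual fix.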
