34 changes: 34 additions & 0 deletions .github/workflows/scrape-visasq.yaml
@@ -0,0 +1,34 @@
name: scrape-visasq

on:
  push:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * 3" # Every Wednesday at 00:00 UTC

jobs:
  scrape-visasq:
    runs-on: ubuntu-latest
    timeout-minutes: 10
    steps:
      - name: Docker Login
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Run Docker container and generate output file
        run: |
          mkdir -p assets
          echo "test" > assets/test.txt
          docker run \
            --rm \
            -v $(pwd)/assets:/app/assets \
            ks6088ts/workshop-playwright-python:latest \
            python scripts/visasq.py scrape --max-page 20

      - name: Upload output file as artifact
        uses: actions/upload-artifact@v4
        with:
          name: assets
          path: ./assets/
          retention-days: 14
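The workflow mounts ./assets into the container, so whatever the scraper writes there ends up in the uploaded artifact. As a quick sanity check on a downloaded artifact, here is a minimal sketch, assuming the artifact has been extracted into ./assets and uses the timestamped filenames produced by scripts/visasq.py below:

import csv
import glob

# Timestamped filenames sort lexically, so the last one is the newest
latest = sorted(glob.glob("assets/visasq_entries_*.csv"))[-1]

with open(latest, newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        print(row["id"], row["title"])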
4 changes: 3 additions & 1 deletion Dockerfile
@@ -19,6 +19,8 @@ COPY --from=requirements-stage /tmp/requirements.txt /app/requirements.txt
COPY . .

# Install dependencies
-RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt && \
+    playwright install --with-deps


CMD ["python", "workshop_playwright_python/core.py"]
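Chaining playwright install --with-deps into the same RUN step downloads the browser binaries and their OS-level dependencies at build time, so the container can launch Chromium headlessly without any extra setup at run time.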
9 changes: 8 additions & 1 deletion Makefile
@@ -57,7 +57,7 @@ update: ## update packages
# ---
DOCKER_REPO_NAME ?= ks6088ts
DOCKER_IMAGE_NAME ?= workshop-playwright-python
-DOCKER_COMMAND ?=
+DOCKER_COMMAND ?= python scripts/visasq.py scrape --help

# Tools
TOOLS_DIR ?= /usr/local/bin
@@ -88,6 +88,13 @@ docker-scan: ## scan Docker image
.PHONY: ci-test-docker
ci-test-docker: docker-lint docker-build docker-scan docker-run ## run CI test for Docker

.PHONY: docker-visasq-scrape
docker-visasq-scrape: ## scrape visasq entries using Docker
	docker run --rm \
		-v $(PWD)/assets:/app/assets \
		$(DOCKER_REPO_NAME)/$(DOCKER_IMAGE_NAME):$(GIT_TAG) \
		python scripts/visasq.py scrape --max-page 20

# ---
# Docs
# ---
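The new target mirrors the docker run command used in the workflow above. Because the Docker variables are declared with ?=, each can be overridden per invocation, e.g. make docker-visasq-scrape DOCKER_REPO_NAME=myuser (GIT_TAG is presumably defined earlier in the Makefile, outside this hunk).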
1 change: 1 addition & 0 deletions pyproject.toml
@@ -11,6 +11,7 @@ dependencies = [
"pytest-playwright>=0.7.0",
"streamlit>=1.45.0",
"streamlit-authenticator>=0.4.2",
"typer>=0.16.0",
]

[project.optional-dependencies]
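The new typer dependency backs the CLI added in scripts/visasq.py below.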
137 changes: 137 additions & 0 deletions scripts/visasq.py
@@ -0,0 +1,137 @@
import asyncio
import csv
import datetime
import os

import typer
from playwright.async_api import Page, async_playwright

app = typer.Typer(help="VisaSQ scraper CLI tool")


async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
    """Helper function that saves entries to a CSV file."""
    # Create the output directory if it does not exist
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, "w", newline="", encoding="utf-8") as csvfile:
        fieldnames = [
            "id",
            "url",
            "title",
            "description",
        ]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for entry in entries:
            writer.writerow(entry)


async def retrieve_visasq_entries(page: Page, url: str):
    entries = []
    print(f"Retrieving entries from {url}...")
    await page.goto(url)
    await page.wait_for_load_state("networkidle")

    # Anchor elements whose href follows a pattern like /issue/12345/
    for entry in await page.query_selector_all("a[href^='/issue/']"):
        href = await entry.get_attribute("href")

        # Get the title element inside the h3 tag
        h3_element = await entry.query_selector("h3")

        # Get the description element inside p.description-regular-14
        p_element = await entry.query_selector("p.description-regular-14")

        # Process the entry only if both h3_element and p_element exist
        if not h3_element or not p_element:
            continue

        # Append the entry to the entries list as a dict
        entries.append(
            {
                "id": href.split("/")[-2],  # Extract the ID from the URL
                "url": href,
                "title": await h3_element.inner_text(),
                "description": await p_element.inner_text(),
            }
        )
    return entries


async def run_scraper(
    base_url: str,
    max_page: int,
    keyword: str = "",
    is_started_only: bool = True,
    output_dir: str = "assets",
):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        all_entries = []

        try:
            for page_number in range(1, max_page + 1):
                print(f"Retrieving entries from page {page_number}...")

                # Append the keyword and filter conditions to the URL
                url = f"{base_url}/issue/?keyword={keyword}&is_started_only={'true' if is_started_only else 'false'}&page={page_number}"  # noqa: E501

                entries = await retrieve_visasq_entries(page=page, url=url)

                # Convert the entry URLs to absolute URLs
                for entry in entries:
                    entry["url"] = f"{base_url}{entry['url']}"

                all_entries.extend(entries)
                print(f"Found {len(entries)} entries on page {page_number}")
                if len(entries) == 0:
                    print("No more entries found, stopping the scrape.")
                    break
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            await browser.close()

        # Include the current date and time in the filename
        now = datetime.datetime.now()
        filepath = f"{output_dir}/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"

        await dump_csv(entries=all_entries, filepath=filepath)

        print(f"Scraping completed. Total entries: {len(all_entries)}")
        print(f"Results saved to: {filepath}")

        return all_entries


@app.command()
def scrape(
    max_page: int = typer.Option(15, "--max-page", "-m", help="Maximum number of pages to scrape"),
    keyword: str = typer.Option("", "--keyword", "-k", help="Search keyword"),
    is_started_only: bool = typer.Option(
        True, "--started-only/--not-started-only", help="Whether to include only projects that are in progress"
    ),
    base_url: str = typer.Option("https://expert.visasq.com", "--base-url", "-u", help="Base URL for VisaSQ"),
    output_dir: str = typer.Option("assets", "--output-dir", "-o", help="Output directory"),
):
    """
    Scrapes data from VisaSQ and saves it to a CSV file
    """
    typer.echo(f"Starting scraping. Max pages: {max_page}")
    asyncio.run(run_scraper(base_url, max_page, keyword, is_started_only, output_dir))


@app.callback()
def callback():
    """
    A tool that scrapes project listings from the VisaSQ website
    """
    pass


if __name__ == "__main__":
    app()
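For completeness, the scraper can also be driven without the Typer CLI. A minimal programmatic sketch, assuming scripts/visasq.py is importable (the import path and argument values here are illustrative, not part of the PR):

import asyncio

from visasq import run_scraper  # hypothetical import path; adjust to your layout

entries = asyncio.run(
    run_scraper(
        base_url="https://expert.visasq.com",  # CLI default
        max_page=2,  # illustrative value
        keyword="",
        is_started_only=True,
        output_dir="assets",  # CLI default
    )
)
print(f"Scraped {len(entries)} entries")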
89 changes: 0 additions & 89 deletions tests/test_visasq.py

This file was deleted.
