 import csv
 import datetime
 import os
+from typing import Optional
+from urllib.parse import quote
 
+import typer
 from playwright.async_api import Page, async_playwright
 
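+# Typer application object; the CLI commands below register on it via decorators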
+app = typer.Typer(help="VisaSQ scraper CLI tool")
+
 
 async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
     """Helper function that saves entries to a CSV file."""
@@ -26,6 +30,7 @@ async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
 
 async def retrieve_visasq_entries(page: Page, url: str):
     entries = []
+    print(f"Retrieving entries from {url}...")
     await page.goto(url)
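+    # Wait until the network is idle so dynamically rendered entries are present in the DOM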
     await page.wait_for_load_state("networkidle")
 
@@ -55,26 +60,31 @@ async def retrieve_visasq_entries(page: Page, url: str):
     return entries
 
 
-async def main():
+async def run_scraper(
+    base_url: str,
+    max_page: int,
+    keyword: str = "",
+    is_started_only: bool = True,
+    output_dir: str = "assets",
+):
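+    """Crawl the VisaSQ issue list up to max_page pages and save all entries to a timestamped CSV."""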
     async with async_playwright() as p:
         browser = await p.chromium.launch()
         page = await browser.new_page()
 
-        BASE_URL = "https://expert.visasq.com"
         all_entries = []
-        max_page = 15
 
         try:
             for page_number in range(1, max_page + 1):
                 print(f"Retrieving entries from page {page_number}...")
-                entries = await retrieve_visasq_entries(
-                    page=page,
-                    url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page={page_number}",
-                )
+
+                # Build the listing URL; quote() URL-encodes the keyword so non-ASCII search terms survive
+                url = f"{base_url}/issue/?keyword={quote(keyword)}&is_started_only={'true' if is_started_only else 'false'}&page={page_number}"
+
+                entries = await retrieve_visasq_entries(page=page, url=url)
 
                 # Convert each entry's url to an absolute URL
                 for entry in entries:
-                    entry["url"] = f"{BASE_URL}{entry['url']}"
+                    entry["url"] = f"{base_url}{entry['url']}"
 
                 all_entries.extend(entries)
                 print(f"Found {len(entries)} entries on page {page_number}")
@@ -88,16 +98,40 @@ async def main():
 
             # Include the current datetime in the filename
             now = datetime.datetime.now()
-            filepath = "assets/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
+            filepath = f"{output_dir}/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
 
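+            # Defensive addition (assumption): ensure the output directory exists;
+            # the elided dump_csv body may already create it
+            os.makedirs(output_dir, exist_ok=True)
+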
-            await dump_csv(
-                entries=all_entries,
-                filepath=filepath,
-            )
+            await dump_csv(entries=all_entries, filepath=filepath)
 
             print(f"Scraping completed. Total entries: {len(all_entries)}")
             print(f"Results saved to: {filepath}")
 
+            return all_entries
+
+
+@app.command()
+def scrape(
+    max_page: int = typer.Option(15, "--max-page", "-m", help="Maximum number of pages to scrape"),
+    keyword: str = typer.Option("", "--keyword", "-k", help="Search keyword"),
+    is_started_only: bool = typer.Option(
+        True, "--started-only/--not-started-only", help="Whether to show only in-progress projects"
+    ),
+    base_url: str = typer.Option("https://expert.visasq.com", "--base-url", "-u", help="Base URL for VisaSQ"),
+    output_dir: str = typer.Option("assets", "--output-dir", "-o", help="Output directory"),
+):
+    """
+    Scrape data from VisaSQ and save it to a CSV file.
+    """
+    typer.echo(f"Starting scraping. Max pages: {max_page}")
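+    # Typer commands are synchronous, so hand off to the async scraper via asyncio.run
+    # (import asyncio is assumed to sit at the top of the file, outside this diff)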
+    asyncio.run(run_scraper(base_url, max_page, keyword, is_started_only, output_dir))
+
+
+@app.callback()
+def callback():
+    """
+    Tool that scrapes project listings from the VisaSQ website.
+    """
+    pass
+
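+# Example invocations (the script name "scraper.py" is illustrative):
+#   python scraper.py scrape --max-page 5 --keyword consulting
+#   python scraper.py scrape --not-started-only --output-dir ./out
+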
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    app()