@@ -3,8 +3,12 @@
 import datetime
 import os
 
+import typer
+
 from playwright.async_api import Page, async_playwright
 
+app = typer.Typer(help="VisaSQ scraper CLI tool")
+
 
 async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
     """Helper function that saves entries to a CSV file"""
@@ -26,6 +30,7 @@ async def dump_csv(entries, filepath="assets/visasq_entries.csv"):
 
 async def retrieve_visasq_entries(page: Page, url: str):
     entries = []
+    print(f"Retrieving entries from {url}...")
     await page.goto(url)
     await page.wait_for_load_state("networkidle")
 
@@ -55,26 +60,31 @@ async def retrieve_visasq_entries(page: Page, url: str):
     return entries
 
 
-async def main():
+async def run_scraper(
+    base_url: str,
+    max_page: int,
+    keyword: str = "",
+    is_started_only: bool = True,
+    output_dir: str = "assets",
+):
     async with async_playwright() as p:
         browser = await p.chromium.launch()
         page = await browser.new_page()
 
-        BASE_URL = "https://expert.visasq.com"
         all_entries = []
-        max_page = 15
 
         try:
             for page_number in range(1, max_page + 1):
                 print(f"Retrieving entries from page {page_number}...")
-                entries = await retrieve_visasq_entries(
-                    page=page,
-                    url=f"{BASE_URL}/issue/?keyword=&is_started_only=true&page={page_number}",
-                )
+
+                # Append the keyword and filter conditions to the URL
+                url = f"{base_url}/issue/?keyword={keyword}&is_started_only={'true' if is_started_only else 'false'}&page={page_number}"  # noqa: E501
+
+                entries = await retrieve_visasq_entries(page=page, url=url)
 
                 # Convert each entry's url to an absolute URL
                 for entry in entries:
-                    entry["url"] = f"{BASE_URL}{entry['url']}"
+                    entry["url"] = f"{base_url}{entry['url']}"
 
                 all_entries.extend(entries)
                 print(f"Found {len(entries)} entries on page {page_number}")
@@ -88,16 +98,40 @@ async def main():
 
         # Include the current date and time in the filename
         now = datetime.datetime.now()
-        filepath = "assets/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
+        filepath = f"{output_dir}/visasq_entries_" + now.strftime("%Y%m%d_%H%M%S") + ".csv"
 
-        await dump_csv(
-            entries=all_entries,
-            filepath=filepath,
-        )
+        await dump_csv(entries=all_entries, filepath=filepath)
 
         print(f"Scraping completed. Total entries: {len(all_entries)}")
         print(f"Results saved to: {filepath}")
 
+    return all_entries
+
+
+@app.command()
+def scrape(
+    max_page: int = typer.Option(15, "--max-page", "-m", help="Maximum number of pages to scrape"),
+    keyword: str = typer.Option("", "--keyword", "-k", help="Search keyword"),
+    is_started_only: bool = typer.Option(
+        True, "--started-only/--not-started-only", help="Whether to show only projects that are still open"
+    ),
+    base_url: str = typer.Option("https://expert.visasq.com", "--base-url", "-u", help="Base URL for VisaSQ"),
+    output_dir: str = typer.Option("assets", "--output-dir", "-o", help="Output directory"),
+):
+    """
+    Scrape data from VisaSQ and save it to a CSV file.
+    """
+    typer.echo(f"Starting scrape. Max pages: {max_page}")
+    asyncio.run(run_scraper(base_url, max_page, keyword, is_started_only, output_dir))
+
+
+@app.callback()
+def callback():
+    """
+    A tool that scrapes project listings from the VisaSQ website.
+    """
+    pass
+
 
 if __name__ == "__main__":
-    asyncio.run(main())
+    app()
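
For reference, a minimal usage sketch of the new Typer CLI. The module filename (`main.py` here) and the option values are assumptions for illustration; only the subcommand, flags, and defaults come from the diff:

    python main.py scrape --max-page 5 --keyword "marketing" --not-started-only --output-dir assets

Left at their defaults, the options reproduce what the old `main()` hard-coded: 15 pages, an empty keyword, started-only filtering, and `https://expert.visasq.com` as the base URL. One caveat: `keyword` is interpolated into the query string verbatim, so multi-word or non-ASCII keywords would need URL-encoding (e.g. with `urllib.parse.quote_plus`) to produce a valid request.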