Skip to content

Commit 5a2d99f

Browse files
committed
feat: 11_爬虫入门实战4_高效率的爬虫实现
1 parent 8520c9d commit 5a2d99f

File tree

8 files changed

+1196
-0
lines changed

8 files changed

+1196
-0
lines changed

docs/爬虫入门/11_爬虫入门实战4_高效率的爬虫实现.md

Lines changed: 640 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# -*- coding: utf-8 -*-
2+
import multiprocessing
3+
import time
4+
5+
6+
def worker(num):
    """Announce start, pause for two seconds, then announce completion.

    :param num: Identifier echoed in both progress messages.
    :return: None
    """
    label = f"Worker {num}"
    print(f"{label} started")
    time.sleep(2)  # stand-in for a unit of real work
    print(f"{label} finished")
10+
11+
12+
if __name__ == "__main__":
    # Build the five worker processes first, then launch them together.
    processes = [
        multiprocessing.Process(target=worker, args=(index,))
        for index in range(5)
    ]
    for proc in processes:
        proc.start()

    # Block until every child process has exited before reporting.
    for proc in processes:
        proc.join()

    print("All processes completed")
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
import threading
2+
import time
3+
4+
def worker(num):
    """Announce start, pause for two seconds, then announce completion.

    :param num: Identifier echoed in both progress messages.
    :return: None
    """
    label = f"Thread {num}"
    print(f"{label} started")
    time.sleep(2)  # stand-in for a unit of real work
    print(f"{label} finished")
8+
9+
# Build the five worker threads first, then launch them together.
threads = [threading.Thread(target=worker, args=(index,)) for index in range(5)]
for thread in threads:
    thread.start()

# Block until every thread has finished before reporting.
for thread in threads:
    thread.join()

print("All threads completed")
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# -*- coding: utf-8 -*-
2+
import asyncio
3+
4+
async def worker(num):
    """Announce start, yield to the event loop for two seconds, then announce completion.

    :param num: Identifier echoed in both progress messages.
    :return: None
    """
    label = f"Coroutine {num}"
    print(f"{label} started")
    await asyncio.sleep(2)  # non-blocking pause; other coroutines run meanwhile
    print(f"{label} finished")
8+
9+
async def main():
    """Schedule five ``worker`` coroutines as tasks and wait for all of them.

    :return: None
    """
    tasks = []
    for index in range(5):
        # create_task schedules the coroutine on the running loop right away.
        tasks.append(asyncio.create_task(worker(index)))
    await asyncio.gather(*tasks)
12+
13+
# Run the event loop until main() returns, then report completion.
asyncio.run(main())
print("All coroutines completed")
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author : [email protected]
3+
# @Name : 程序员阿江-Relakkes
4+
# @Time : 2024/4/7 20:54
5+
# @Desc : 存放一些公共的函数
6+
from typing import List
7+
8+
9+
class SymbolContent:
    """Plain record holding the display values extracted for one symbol.

    Every field is a class-level default (empty string) that callers
    overwrite on the instance after construction.
    """

    symbol: str = ""
    name: str = ""
    price: str = ""  # price (intraday)
    change_price: str = ""  # absolute price change
    change_percent: str = ""  # percentage price change
    market_price: str = ""  # market capitalisation

    @classmethod
    def get_fields(cls) -> List[str]:
        """Return the names of the data fields in declaration order.

        Only plain data attributes defined directly on the class are
        reported. Methods, classmethods, staticmethods and properties are
        skipped by inspecting the attribute value, so adding behaviour to the
        class later cannot leak extra names into the result.

        :return: Field names in the order they are declared on the class.
        """
        return [
            attr
            for attr, value in vars(cls).items()
            if not attr.startswith("__")
            and not callable(value)
            and not isinstance(value, (classmethod, staticmethod, property))
        ]

    def __str__(self):
        """Return a multi-line, human-readable dump of all fields."""
        return (
            "\n"
            f"Symbol: {self.symbol}\n"
            f"Name: {self.name}\n"
            f"Price: {self.price}\n"
            f"Change Price: {self.change_price}\n"
            f"Change Percent: {self.change_percent}\n"
            f"Market Price: {self.market_price}\n"
        )
30+
31+
32+
# Session values captured from a browser. The cookie and the crumb belong
# together: the crumb query parameter is bound to the cookie it was issued with.
_DEFAULT_COOKIE = 'GUC=AQEBCAFmDYVmOUIdcARM&s=AQAAANxlE2ny&g=Zgw0yA; A1=d=AQABBBB0fGQCEKnzzPnIHq8Lm4HEj-GCp50FEgEBCAGFDWY5Zliia3sB_eMBAAcIEHR8ZOGCp50&S=AQAAAgF-nCWw8AxSZ-gyIaeg4aI; A3=d=AQABBBB0fGQCEKnzzPnIHq8Lm4HEj-GCp50FEgEBCAGFDWY5Zliia3sB_eMBAAcIEHR8ZOGCp50&S=AQAAAgF-nCWw8AxSZ-gyIaeg4aI; axids=gam=y-lf5u4KlE2uJWDQYbXyUTkKMC2GVH7OUj~A&dv360=eS1XSElPM3l4RTJ1SHVVV3hNZVBDeG9aTDlDYXdaQ1dPNX5B&ydsp=y-_wiZU4RE2uIAxUbGalyjvJCoR6Le9iVT~A&tbla=y-gt2Wyc1E2uI4nvAYanhnPTMrhn4c3edZ~A; tbla_id=fde33964-c427-4b9c-b849-90a304938e21-tuctb84a272; cmp=t=1712472060&j=0&u=1YNN; gpp=DBABBg~BVoIgACA.QA; gpp_sid=8; A1S=d=AQABBBB0fGQCEKnzzPnIHq8Lm4HEj-GCp50FEgEBCAGFDWY5Zliia3sB_eMBAAcIEHR8ZOGCp50&S=AQAAAgF-nCWw8AxSZ-gyIaeg4aI&j=WORLD'
_DEFAULT_CRUMB = 'UllRf10isbP'


def make_req_params_and_headers(*, cookie: str = _DEFAULT_COOKIE, crumb: str = _DEFAULT_CRUMB):
    """Build the query parameters, headers and JSON payload for a screener request.

    Fresh dictionaries are created on every call, so callers may mutate the
    returned objects (for example to change the paging fields) without
    affecting later calls.

    :param cookie: Value for the ``cookie`` header. Defaults to the captured
        session cookie; pass a fresh one when the built-in session has expired.
    :param crumb: Value for the ``crumb`` query parameter. It must be the
        crumb that belongs to ``cookie``.
    :return: Tuple ``(common_params, headers, common_payload_data)``.
    """
    headers = {
        # The cookie is mandatory and is bound to the crumb in common_params.
        'cookie': cookie,
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
    }
    common_params = {
        'crumb': crumb,
        'lang': 'en-US',
        'region': 'US',
        'formatted': 'true',
        'corsDomain': 'finance.yahoo.com',
    }
    common_payload_data = {
        'offset': 0,  # paging start position
        'size': 25,  # page size
        'sortType': 'DESC',
        'sortField': 'intradaymarketcap',
        'quoteType': 'CRYPTOCURRENCY',
        'query': {
            'operator': 'and',
            'operands': [
                {
                    'operator': 'eq',
                    'operands': [
                        'currency',
                        'USD',
                    ],
                },
                {
                    'operator': 'eq',
                    'operands': [
                        'exchange',
                        'CCC',
                    ],
                },
            ],
        },
        'userId': '',
        'userIdType': 'guid',
    }
    return common_params, headers, common_payload_data
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
# -*- coding: utf-8 -*-
2+
import asyncio
3+
import csv
4+
import time
5+
from typing import Any, Dict, List
6+
7+
import aiofiles
8+
import httpx
9+
from common import SymbolContent, make_req_params_and_headers
10+
11+
HOST = "https://query1.finance.yahoo.com"
12+
SYMBOL_QUERY_API_URI = "/v1/finance/screener"
13+
PAGE_SIZE = 100 # 可选配置(25, 50, 100)
14+
15+
16+
def parse_symbol_content(quote_item: Dict) -> SymbolContent:
    """Extract the fields of interest from one quote entry.

    :param quote_item: One quote dictionary; it is indexed by ``symbol``,
        ``shortName`` and four entries that each carry a ``fmt`` value.
    :return: A populated SymbolContent.
    """

    def formatted(key: str):
        # These entries are nested objects; only their "fmt" value is kept.
        return quote_item[key]["fmt"]

    record = SymbolContent()
    record.symbol = quote_item["symbol"]
    record.name = quote_item["shortName"]
    record.price = formatted("regularMarketPrice")
    record.change_price = formatted("regularMarketChange")
    record.change_percent = formatted("regularMarketChangePercent")
    record.market_price = formatted("marketCap")
    return record
30+
31+
32+
async def send_request(page_start: int, page_size: int) -> Dict[str, Any]:
    """Send one screener request and return the decoded JSON body.

    :param page_start: Offset of the first record of the requested page.
    :param page_size: Number of records requested for the page.
    :return: The response body parsed from JSON.
    :raises Exception: If the server answers with a status other than 200.
        Errors raised while sending the request or decoding the JSON body
        propagate unchanged.
    """
    req_url = HOST + SYMBOL_QUERY_API_URI
    common_params, headers, common_payload_data = make_req_params_and_headers()
    # Only the paging fields vary from one request to the next.
    common_payload_data["offset"] = page_start
    common_payload_data["size"] = page_size

    async with httpx.AsyncClient() as client:
        response = await client.post(
            url=req_url,
            params=common_params,
            json=common_payload_data,
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            # Build a single message string so str(exc) is readable rather
            # than the repr of a two-element args tuple.
            raise Exception(f"发起请求时发生异常,请求发生错误,原因:{response.text}")
        return response.json()
56+
57+
58+
async def fetch_currency_data_single(page_start: int) -> List[SymbolContent]:
    """Fetch and parse one page of quotes.

    Any failure while requesting or parsing the page is reported on stdout
    and the page is treated as empty.

    :param page_start: Offset of the first record of the page.
    :return: Parsed SymbolContent items for the page, or an empty list on error.
    """
    try:
        payload: Dict = await send_request(page_start=page_start, page_size=PAGE_SIZE)
        quotes = payload["finance"]["result"][0]["quotes"]
        parsed: List[SymbolContent] = []
        for quote in quotes:
            parsed.append(parse_symbol_content(quote))
        return parsed
    except Exception as e:
        print(f"Error fetching data for page_start={page_start}: {e}")
        return []
72+
73+
74+
async def fetch_currency_data_list(max_total_count: int) -> List[SymbolContent]:
    """Fetch every page concurrently and merge the results.

    :param max_total_count: Total number of records to cover; one request is
        issued per PAGE_SIZE-sized step from 0 up to this count.
    :return: All parsed items, in page order.
    """
    offsets = range(0, max_total_count, PAGE_SIZE)
    print(f"总共发起: {len(offsets)} 次网络请求")

    # gather returns the per-page lists in the same order as the offsets.
    pages = await asyncio.gather(
        *(fetch_currency_data_single(offset) for offset in offsets)
    )

    # Flatten the list of pages into a single list.
    merged: List[SymbolContent] = []
    for page in pages:
        merged.extend(page)
    return merged
88+
89+
90+
async def get_max_total_count() -> int:
    """Ask the API how many records exist in total.

    The count is read from the ``total`` field of the first page's result.
    Any failure is reported on stdout and yields 0.

    :return: Total number of records, or 0 on error.
    """
    print("开始获取最大的币种数量")
    try:
        first_page: Dict = await send_request(page_start=0, page_size=PAGE_SIZE)
        result_entry = first_page["finance"]["result"][0]
        total: int = result_entry["total"]
        print(f"获取到 {total} 种币种")
        return total
    except Exception as e:
        print("错误信息:", e)
        return 0
104+
105+
106+
async def save_data_to_csv(save_file_name: str, currency_data_list: List[SymbolContent]) -> None:
    """Write the collected symbols to a CSV file.

    Rows are rendered with the ``csv`` module so that values containing
    commas, quotes or newlines (for example a formatted price such as
    "67,000.12") are quoted correctly instead of shifting the columns.

    :param save_file_name: Path of the CSV file to create or overwrite.
    :param currency_data_list: Items to write, one row each.
    :return: None
    """
    import io  # local import: only needed here for the in-memory buffer

    # csv.writer needs a synchronous file-like object, so render into memory
    # first and hand the finished text to aiofiles in a single write.
    buffer = io.StringIO()
    writer = csv.writer(buffer, lineterminator="\n")
    # Header row.
    writer.writerow(SymbolContent.get_fields())
    # One row per symbol, in the same column order as the header.
    writer.writerows(
        (
            symbol.symbol,
            symbol.name,
            symbol.price,
            symbol.change_price,
            symbol.change_percent,
            symbol.market_price,
        )
        for symbol in currency_data_list
    )

    async with aiofiles.open(save_file_name, mode='w', newline='', encoding='utf-8') as file:
        await file.write(buffer.getvalue())
120+
121+
122+
async def run_crawler_async(save_file_name: str) -> None:
    """Run the crawl end to end (asynchronous, concurrent version).

    :param save_file_name: Path of the CSV file to write the results to.
    :return: None
    """
    # Step 1: find out how many records exist.
    total_count: int = await get_max_total_count()
    # Step 2: fetch and parse every page.
    symbols: List[SymbolContent] = await fetch_currency_data_list(total_count)
    # Step 3: persist the parsed records as CSV.
    await save_data_to_csv(save_file_name, symbols)
134+
135+
async def main():
    """Run the crawler once and print how long it took.

    The output file name embeds the integer start timestamp.

    :return: None
    """
    started_at = time.time()
    await run_crawler_async(f"symbol_data_{int(started_at)}.csv")
    elapsed = time.time() - started_at
    print(f"asyncio调度协程执行程序耗时: {elapsed} 秒")
145+
146+
147+
if __name__ == '__main__':
    # Script entry point: run the async main() to completion.
    asyncio.run(main())
149+

0 commit comments

Comments
 (0)