Skip to content

Commit 374ae77

Browse files
♻️ Refactor FastCrawler core codebase and synced to FastAPI backend client (#53)
* 🐛 Fix: Controller injection to spiders from app * 🐛 Fix Rocketry proto impl * ♻️ Refactor interface to use list instead of set * 🎨 Refactor design of engine * ✅ Update new engine designs * 🎨 Refactor spider w.r.t new engine design * ✨ Add batching requests * ✅ Add tests for batching * ✅ Refactor existing test designs * ✨ Add max_depth to control recursion * ✅ Add tests for max_depth * ✨ Add Sleep interval between request (Optional) * ✅ Add test for interval between request * ✨ Add cycle_sleep_interval & max_request_count * ⚰️ Remove old aio engine design * ♻️ Refactor AioEngine to send data or json base on contents * ✅ Refactor aio engine tests for con limit * 🐛 Fix await a Coroutine * 🐛 Fix use ssl instead verify_ssl key in aiohttp * ✅ Update test approx time to pass * 🐛 Fix list of tasks return in the `RocketryApplication` * ⚡️ Replace if statement by an expression * 🚑 Fix toggle task new_status * ⚡️ Change task schedule returns task instead of None * ♻️ Refactor core's structure for typing improvements * ♻️ Refactor task manager structure for typing improvements * ♻️ Refactor engine to use dataclass and have better typings * 🐛 Fix minor issue with task registry * ♻️ Refactor core with new design * 🏷️ Improve explicit typing for URL * ♻️ Refactor tests to pass with new design * 🎨 Add customized exception for bad configuration * ♻️ Rename Proto to ABC * ♻️ Refactor task to be immutable * ♻️ Refactor class variable names to be PEP-8 friendly --------- Co-authored-by: Sadegh Yazdani <[email protected]>
1 parent 72139fd commit 374ae77

File tree

22 files changed

+547
-316
lines changed

22 files changed

+547
-316
lines changed

docs_src/processor/tutorial001.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ async def main():
3333
cond="every 1 second",
3434
controller=ProcessController(app=RocketryApplication()),
3535
)
36-
await process.add_spiders()
36+
await process.add_spiders_to_controller()
3737
assert len(await process.controller.app.get_all_tasks()) == 1
3838
await process.start(silent=False)
3939

fastcrawler/core/app.py

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,20 @@
11
import asyncio
2-
from typing import Callable
2+
from typing import Any, Coroutine
33

4-
from fastcrawler.exceptions import NoCrawlerFoundError
4+
from fastcrawler.exceptions import NoCrawlerFoundErrorError
55
from fastcrawler.schedule.adopter import ProcessController, RocketryApplication
6-
from fastcrawler.schedule.contracts import ControllerProto
6+
from fastcrawler.schedule.contracts import ControllerABC
77

88
from .process import Process
99

1010

11+
def list_process(crawlers: list[Process] | Process) -> list[Process]:
12+
if isinstance(crawlers, Process):
13+
return [crawlers]
14+
else:
15+
return crawlers
16+
17+
1118
class FastCrawler:
1219
"""The client interface to start all crawlers.
1320
Initialize all crawlers
@@ -22,47 +29,48 @@ class FastCrawler:
2229
2330
"""
2431

25-
controller: ControllerProto | None = None
32+
controller: ControllerABC | None = None
2633

2734
def __init__(
2835
self,
2936
crawlers: list[Process] | Process,
30-
controller: ControllerProto | None = None,
37+
controller: ControllerABC | None = None,
3138
):
3239
"""Initialize FastCrawler with defined crawlers"""
33-
...
34-
if isinstance(crawlers, Process):
35-
self.crawlers = [
36-
crawlers,
37-
]
38-
else:
39-
self.crawlers = crawlers
40-
40+
self.crawlers = list_process(crawlers)
4141
self.controller = controller or ProcessController(app=RocketryApplication())
4242
if not self.crawlers or len(self.crawlers) == 0:
43-
raise NoCrawlerFoundError
43+
raise NoCrawlerFoundErrorError
4444

4545
@property
46-
def get_all_serves(self) -> list[Callable]:
46+
def get_all_serves(self) -> list[Coroutine[Any, Any, None]]:
4747
"""get all application to be served"""
48+
assert self.controller is not None
4849
return [
4950
self.controller.app.serve(),
5051
]
5152

5253
async def serve(self) -> None:
53-
"""Serve protocol for uvicorn"""
54+
"""
55+
Serve protocol for uvicorn, useful with combination
56+
with other tools if get_all_serves is customized
57+
"""
5458
await asyncio.gather(*self.get_all_serves)
5559
return None
5660

5761
async def start(self, silent=True) -> None:
5862
"""Start all crawlers in background explicitly without schedule"""
63+
64+
# TODO: make here multi processing, for more than one process!
65+
# or use rocketry to trigger the tasks, if possible :)
5966
await asyncio.gather(*[crawler.start(silent) for crawler in self.crawlers])
6067
return None
6168

6269
async def run(self) -> None:
6370
"""Run all crawlers in background explicitly with schedule"""
6471
for crawler in self.crawlers:
65-
await crawler.add_spiders()
72+
crawler.controller = self.controller
73+
await crawler.add_spiders_to_controller()
6674
await self.serve()
6775
return None
6876

@@ -76,6 +84,7 @@ async def shutdown(self) -> None:
7684

7785
async def _shutdown(self) -> None:
7886
"""Safe shut down event for application crawler"""
87+
assert self.controller is not None
7988
await self.shutdown()
8089
await self.controller.shut_down()
8190
return None

fastcrawler/core/process.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from uuid import uuid4
22

33
from fastcrawler.core.spider import Spider
4-
from fastcrawler.schedule.contracts import ControllerProto
4+
from fastcrawler.schedule.contracts import ControllerABC
55
from fastcrawler.schedule.schema import Task
66

77

@@ -14,7 +14,7 @@ class Process:
1414
def __init__(
1515
self,
1616
spider: Spider,
17-
controller: ControllerProto | None = None,
17+
controller: ControllerABC | None = None,
1818
cond: str | Task | None = None,
1919
*args,
2020
**kwargs,
@@ -23,15 +23,15 @@ def __init__(
2323
2424
Args:
2525
spider (Spider): _description_
26-
controller (None | ControllerProto, optional): _description_. Defaults to None.
26+
controller (None | ControllerABC, optional): _description_. Defaults to None.
2727
cond (Task | None, optional): _description_. Defaults to None.
2828
"""
2929
if isinstance(cond, Task):
3030
self.task = cond
3131
else:
3232
self.task = Task(
3333
start_cond=cond or "every 1 second",
34-
name=spider.__class__.__name__ + str(uuid4()),
34+
name=f"{uuid4()}@{spider.__class__.__name__}",
3535
)
3636
self.args = args
3737
self.kwargs = kwargs
@@ -44,23 +44,26 @@ async def start(self, silent: bool = True) -> None:
4444
This method will disable scheduler temporarily to avoid duplicate running
4545
"""
4646
if self.controller:
47-
await self.controller.toggle_task(self.task.name, new_status=False)
47+
await self.controller.toggle_task(str(self.task.name), new_status=False)
4848
await self.spider.start(silent=silent)
4949
if self.controller:
50-
await self.controller.toggle_task(self.task.name, new_status=True)
50+
await self.controller.toggle_task(str(self.task.name), new_status=True)
5151
return None
5252

5353
async def stop(self) -> None:
5454
"""Stop the crawling process"""
55-
self.spider.is_stopped = True
55+
for instance in self.spider.instances:
56+
instance.is_stopped = True
57+
5658
if self.controller:
57-
self.controller.toggle_task(self.task.name, new_status=False)
59+
await self.controller.toggle_task(str(self.task.name), new_status=False)
5860
return None
5961

60-
async def add_spiders(self) -> None:
62+
async def add_spiders_to_controller(self) -> None:
6163
"""
6264
Run the crawling process
6365
"""
66+
assert self.controller is not None
6467
if self.task:
6568
await self.controller.add_task(self.spider.start, self.task)
6669
else:

0 commit comments

Comments (0)