Skip to content

Commit d8b9c08

Browse files
✨ Feature/scheduler (#5)
* feat: add basic of rocketry * feat: add rocketry * feat: pycharm conf * removed: CI pipeline(temp) * fixed: some typos * added: interface for task manager * removed: fastapi * added: Task prototype * implemented: rocketry proto * ✏️ fix proto/typo/lint * ✅ Add tests for schedule * test: add new tests for `RocketryApplication`, `RocketryManager` * fix: some problem for default value in schema * ci: add rocketry, redbird from Mani repo * ci: update pydantic to 2.0.2 * build: update setup.cfg due to #9 * 🐛 fix selectors base * ✏️ improve names & code quality * ✅ update: test code quality --------- Co-authored-by: ManiMozaffar <[email protected]> Co-authored-by: Sadegh Yazdani
1 parent 187f6b0 commit d8b9c08

File tree

18 files changed

+510
-30
lines changed

18 files changed

+510
-30
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,4 +163,4 @@ cython_debug/
163163
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
164164
# and can be added to the global gitignore or merged into this file. For a more nuclear
165165
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
166-
#.idea/
166+
.idea/

checks.sh

100755100644
File mode changed.

fastcrawler/exceptions.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,24 @@ def __init__(self, model):
2626
super().__init__(self.message)
2727

2828

29+
class TaskNotFound(BaseModelError):
30+
def __init__(self, task_name):
31+
super().__init__(
32+
f"The Task with name={task_name} has not been found",
33+
"\nPlease check your input and be sure the task name is correct",
34+
)
35+
36+
37+
class NoCrawlerFound(BaseModelError):
38+
def __init__(self):
39+
super().__init__(
40+
"No task has been registered in the application."
41+
"\nPlease make sure that you've assigned the crawlers to the application"
42+
"so the application is aware of the crawlers."
43+
"\nThis may also raise if you have overridden the library's startup built in method"
44+
)
45+
46+
2947
class ProcessorNotSupported(BaseModelError):
3048
def __init__(self, model):
3149
self.model = model

fastcrawler/parsers/base.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,9 @@
66
class ParserProtocol(Protocol):
77
def __init__(self, scraped_data: Any):
88
"""Initilize the parser with the given data (html/json/etc)"""
9-
...
109

1110
def parse(self, model: Any) -> Any:
1211
"""
1312
Parse the saved data, with given model, which should be a pydantic model
1413
imported from fastcrawler library
1514
"""
16-
...

fastcrawler/parsers/selectors/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def __init__(
3636

3737
def __repr__(self):
3838
"""Represents a selector for debugging purposes"""
39-
return f"Field(type={self.__class__.__name__} extract={self.extract}, " f"many={self.many}, query={self.query})"
39+
return f"Field(type={self.__class__.__name__} extract={self.extract}, many={self.many}, query={self.query})"
4040

4141
def resolve(self, scraped_data, model):
4242
"""Must be implemented by outer classes.

fastcrawler/parsers/selectors/css.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
from fastcrawler.parsers.utils import _UNSET
88

99
from ..processors.base import ProcessorInterface
10+
from .base import BaseSelector
1011

1112

12-
class _CSSField:
13+
class _CSSField(BaseSelector):
1314
"""
1415
CSSSelectorField represents a field that can be retrieved from a given HTML
1516
document using CSS selectors.

fastcrawler/parsers/selectors/regex.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,10 @@
66
from fastcrawler.parsers.pydantic import BaseModelType
77
from fastcrawler.parsers.utils import _UNSET
88

9+
from .base import BaseSelector
910

10-
class _RegexField:
11+
12+
class _RegexField(BaseSelector):
1113
"""
1214
RegexField represents a field that can be retrieved from a given HTML
1315
document using Regex.

fastcrawler/parsers/selectors/xpath.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,10 @@
77
from fastcrawler.parsers.utils import _UNSET
88

99
from ..processors.base import ProcessorInterface
10+
from .base import BaseSelector
1011

1112

12-
class _XPATHField:
13+
class _XPATHField(BaseSelector):
1314
"""
1415
XPATHField represents a field that can be retrieved from a given HTML
1516
document using XPath.

fastcrawler/schedule/adopter.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
from typing import Callable
2+
3+
from rocketry import Rocketry
4+
from rocketry.conditions.api import cron
5+
6+
from fastcrawler.exceptions import TaskNotFound
7+
8+
from .schema import Task
9+
10+
11+
class RocketryApplication:
12+
def __init__(self, *args, **kwargs):
13+
self.task_lib: Rocketry = Rocketry(*args, **kwargs)
14+
15+
async def serve(self, *args, **kwargs): # pragma: no cover
16+
"""Proto to serve with uvicorn"""
17+
await self.start_up()
18+
return await self.task_lib.serve(*args, **kwargs)
19+
20+
async def get_all_tasks(self) -> set[Task]:
21+
return self.task_lib.session.tasks
22+
23+
async def add_task(self, task_func: Callable, settings: Task) -> None:
24+
"""
25+
...
26+
"""
27+
self.task_lib.task(**dict(settings))(task_func)
28+
return None
29+
30+
async def start_up(self) -> None:
31+
"""
32+
Run Startup Event
33+
"""
34+
35+
async def shut_down(self) -> None:
36+
self.task_lib.session.shut_down()
37+
return None
38+
39+
40+
class RocketryManager:
41+
def __init__(self, app: RocketryApplication):
42+
self.app = app
43+
44+
async def all(self) -> set[Task]:
45+
"""
46+
Return all tasks from internal
47+
"""
48+
return await self.app.get_all_tasks()
49+
50+
async def add_task(self, task_func: Callable, settings: Task) -> None:
51+
"""
52+
Add tasks within internal python API
53+
"""
54+
await self.app.add_task(task_func, settings)
55+
return None
56+
57+
async def change_task_schedule(
58+
self,
59+
task_name: str,
60+
schedule: str,
61+
) -> None:
62+
"""
63+
Reschedule a task
64+
schedule:
65+
- can be string
66+
`every 2 seconds`
67+
- can be cron
68+
`*/2 * * * *`
69+
"""
70+
for task in await self.app.get_all_tasks():
71+
if task.name == task_name:
72+
if schedule.count(" ") == 4:
73+
task.start_cond = cron(schedule)
74+
else:
75+
task.start_cond = schedule
76+
return None
77+
raise TaskNotFound(task_name)
78+
79+
async def toggle_task(self, task_name: str) -> None:
80+
"""
81+
Disables or enable one task
82+
"""
83+
for task in await self.app.get_all_tasks():
84+
if task.name == task_name:
85+
if task.disabled:
86+
task.disabled = False
87+
else:
88+
task.disabled = True
89+
return None
90+
raise TaskNotFound(task_name)

fastcrawler/schedule/proto.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
from typing import Callable, Protocol
2+
3+
from .schema import Task
4+
5+
6+
class TaskApplicationProto(Protocol): # pragma: no cover
7+
task_lib: Callable
8+
9+
def __init__(self, *args, **kwargs):
10+
"""Initialize task application"""
11+
12+
async def serve(self, *args, **kwargs):
13+
"""Proto to serve with Uvicorn"""
14+
15+
async def get_all_tasks(self) -> set[Task]:
16+
"""Returns all tasks that exists in Fast Crawler"""
17+
18+
async def add_task(self, *args, **kwargs) -> None:
19+
"""Dynamically add a task to fast crawler"""
20+
21+
async def start_up(self) -> None:
22+
"""Manage start up actvity"""
23+
24+
async def shut_down(self) -> None:
25+
"""Manage shut down activity"""
26+
27+
28+
class TaskControllerProto(Protocol): # pragma: no cover
29+
app: TaskApplicationProto
30+
31+
def __init__(self, app: TaskApplicationProto):
32+
"""Initialize task application
33+
34+
Args:
35+
app (TaskProcessor): _description_
36+
"""
37+
38+
async def all(self) -> list[Task]:
39+
"""
40+
Return all tasks from internal
41+
"""
42+
43+
async def add_task(self, task_func: Callable, settings: Task) -> None:
44+
"""
45+
Add tasks within internal python API
46+
"""
47+
48+
async def change_task_schedule(self, task_name: str, schedule: str) -> None:
49+
"""
50+
Reschedule a task
51+
schedule:
52+
- can be string
53+
`every 2 seconds`
54+
- can be cron
55+
`*/2 * * * *`
56+
"""
57+
58+
async def toggle_task(self, task_name: str) -> None:
59+
"""
60+
Disables or enable one task
61+
"""

0 commit comments

Comments
 (0)