|
5 | 5 | from asyncio import Task |
6 | 6 | from typing import Any |
7 | 7 |
|
| 8 | +from functools import partial |
8 | 9 | from pydantic import BaseModel, Field |
9 | 10 |
|
10 | 11 | from crawlee import Request |
@@ -67,23 +68,34 @@ def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> lis |
67 | 68 | async def _create_requests_from_url( |
68 | 69 | remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient |
69 | 70 | ) -> list[Request]: |
| 71 | + """Crete list of requests from url. |
| 72 | +
|
| 73 | + Send a GET request to the URL defined in each requests_from_url of remote_url_requests_inputs. Run an extraction |
| 74 | + callback on each response body that uses the URL_NO_COMMAS_REGEX regexp to find all links. Create a list of |
| 75 | + Requests from the collected links and the additional inputs stored in the other attributes of each input. |
| 76 | + """ |
70 | 77 | created_requests: list[Request] = [] |
71 | 78 |
|
72 | | - def extract_requests_from_response(task: Task) -> list[Request]: |
| 79 | + def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: |
| 80 | + """Callback to scrape response body with regexp and create Requests from matches.""" |
73 | 81 | matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8')) |
74 | | - created_requests.extend([Request.from_url(match.group(0)) for match in matches]) |
| 82 | + created_requests.extend([Request.from_url( |
| 83 | + match.group(0), |
| 84 | + method=request_input.method, |
| 85 | + payload=request_input.payload.encode('utf-8'), |
| 86 | + headers=request_input.headers, |
| 87 | + user_data=request_input.user_data) for match in matches]) |
75 | 88 |
|
76 | 89 | remote_url_requests = [] |
77 | | - for request_input in remote_url_requests_inputs: |
| 90 | + for remote_url_requests_input in remote_url_requests_inputs: |
78 | 91 | task = asyncio.create_task( |
79 | 92 | http_client.send_request( |
80 | | - method=request_input.method, |
81 | | - url=request_input.requests_from_url, |
82 | | - headers=request_input.headers, |
83 | | - payload=request_input.payload.encode('utf-8'), |
| 93 | + method='GET', |
| 94 | + url=remote_url_requests_input.requests_from_url, |
84 | 95 | ) |
85 | 96 | ) |
86 | | - task.add_done_callback(extract_requests_from_response) |
| 97 | + |
| 98 | + task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) |
87 | 99 | remote_url_requests.append(task) |
88 | 100 |
|
89 | 101 | await asyncio.gather(*remote_url_requests) |
|
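The key change above is binding the per-input context to the task callback with `functools.partial`, since `asyncio`'s `add_done_callback` passes only the finished `Task` to the callback. Below is a minimal, self-contained sketch of that pattern under assumed stand-ins: a fake fetch and a crude URL regex take the place of `http_client.send_request` and `URL_NO_COMMAS_REGEX`, and `on_response`, `fake_fetch`, and `collected` are illustrative names, not part of the codebase.

```python
import asyncio
import re
from asyncio import Task
from functools import partial

# Crude stand-in for URL_NO_COMMAS_REGEX.
URL_RE = re.compile(r'https?://\S+')

collected: list[tuple[str, str]] = []


async def fake_fetch(url: str) -> bytes:
    """Stand-in for http_client.send_request; returns a response body as bytes."""
    await asyncio.sleep(0)
    return f'Links for {url}: https://example.com/a https://example.com/b'.encode('utf-8')


def on_response(source_url: str, task: Task) -> None:
    """Callback: `source_url` is bound via functools.partial; asyncio passes only the Task."""
    body = task.result().decode('utf-8')
    collected.extend((source_url, match.group(0)) for match in URL_RE.finditer(body))


async def main() -> None:
    tasks = []
    for url in ['https://input-1.example', 'https://input-2.example']:
        task = asyncio.create_task(fake_fetch(url))
        # Bind the per-input context first; the finished Task is appended as the last argument.
        task.add_done_callback(partial(on_response, url))
        tasks.append(task)
    await asyncio.gather(*tasks)
    print(collected)


asyncio.run(main())
```

This mirrors why the diff switches from a plain `extract_requests_from_response(task)` callback to `partial(create_requests_from_response, remote_url_requests_input)`: each task needs to know which input produced it so the extra fields (method, payload, headers, user_data) can be attached to the created requests.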