|
5 | 5 | from asyncio import Task
|
6 | 6 | from typing import Any
|
7 | 7 |
|
| 8 | +from functools import partial |
8 | 9 | from pydantic import BaseModel, Field
|
9 | 10 |
|
10 | 11 | from crawlee import Request
|
@@ -67,23 +68,34 @@ def _create_requests_from_input(simple_url_inputs: list[_SimpleUrlInput]) -> lis
|
67 | 68 | async def _create_requests_from_url(
|
68 | 69 | remote_url_requests_inputs: list[_RequestsFromUrlInput], http_client: BaseHttpClient
|
69 | 70 | ) -> list[Request]:
|
| 71 | + """Crete list of requests from url. |
| 72 | +
|
| 73 | + Send a GET request to each URL defined in the requests_from_url attribute of remote_url_requests_inputs. Run an |
| 74 | + extraction callback on each response body, using the URL_NO_COMMAS_REGEX regexp to find all links. Create a list |
| 75 | + of Requests from the collected links and the additional inputs stored in the other attributes of each input. |
| 76 | + """ |
70 | 77 | created_requests: list[Request] = []
|
71 | 78 |
|
72 |
| - def extract_requests_from_response(task: Task) -> list[Request]: |
| 79 | + def create_requests_from_response(request_input: _RequestsFromUrlInput, task: Task) -> None: |
| 80 | + """Callback to scrape response body with regexp and create Requests from macthes.""" |
73 | 81 | matches = re.finditer(URL_NO_COMMAS_REGEX, task.result().read().decode('utf-8'))
|
74 |
| - created_requests.extend([Request.from_url(match.group(0)) for match in matches]) |
| 82 | + created_requests.extend([Request.from_url( |
| 83 | + match.group(0), |
| 84 | + method=request_input.method, |
| 85 | + payload=request_input.payload.encode('utf-8'), |
| 86 | + headers=request_input.headers, |
| 87 | + user_data=request_input.user_data) for match in matches]) |
75 | 88 |
|
76 | 89 | remote_url_requests = []
|
77 |
| - for request_input in remote_url_requests_inputs: |
| 90 | + for remote_url_requests_input in remote_url_requests_inputs: |
78 | 91 | task = asyncio.create_task(
|
79 | 92 | http_client.send_request(
|
80 |
| - method=request_input.method, |
81 |
| - url=request_input.requests_from_url, |
82 |
| - headers=request_input.headers, |
83 |
| - payload=request_input.payload.encode('utf-8'), |
| 93 | + method='GET', |
| 94 | + url=remote_url_requests_input.requests_from_url, |
84 | 95 | )
|
85 | 96 | )
|
86 |
| - task.add_done_callback(extract_requests_from_response) |
| 97 | + |
| 98 | + task.add_done_callback(partial(create_requests_from_response, remote_url_requests_input)) |
87 | 99 | remote_url_requests.append(task)
|
88 | 100 |
|
89 | 101 | await asyncio.gather(*remote_url_requests)
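
For readers skimming the diff, the core change is that the done-callback now needs the per-URL input object as well as the finished task, and `Task.add_done_callback` only passes the task itself; `functools.partial` pre-binds the extra argument. Below is a minimal, self-contained sketch of that pattern under simplified assumptions: `RemoteUrlInput`, `fake_fetch`, and `URL_REGEX` are hypothetical stand-ins for the PR's `_RequestsFromUrlInput`, `BaseHttpClient.send_request`, and `URL_NO_COMMAS_REGEX`, not the real crawlee API.

```python
import asyncio
import re
from dataclasses import dataclass
from functools import partial

# Stand-in for URL_NO_COMMAS_REGEX: any http(s) URL without whitespace or commas.
URL_REGEX = r'https?://[^\s,]+'


@dataclass
class RemoteUrlInput:
    """Simplified stand-in for the PR's _RequestsFromUrlInput."""

    requests_from_url: str
    method: str = 'GET'


async def fake_fetch(url: str) -> str:
    """Pretend HTTP GET that returns a body containing further links."""
    await asyncio.sleep(0)
    return f'https://example.com/a, https://example.com/b, found via {url}'


async def main() -> None:
    inputs = [
        RemoteUrlInput('https://example.com/list-1'),
        RemoteUrlInput('https://example.com/list-2', method='POST'),
    ]
    collected: list[tuple[str, str]] = []

    def on_done(request_input: RemoteUrlInput, task: asyncio.Task) -> None:
        # add_done_callback passes only the finished task; the per-URL input
        # arrives because partial() pre-binds it in the loop below.
        for match in re.finditer(URL_REGEX, task.result()):
            collected.append((request_input.method, match.group(0)))

    tasks = []
    for request_input in inputs:
        task = asyncio.create_task(fake_fetch(request_input.requests_from_url))
        # Bind this iteration's input to the callback before registering it.
        task.add_done_callback(partial(on_done, request_input))
        tasks.append(task)

    # Each callback was registered before gather(), so they have all run by
    # the time gather() returns and `collected` is fully populated.
    await asyncio.gather(*tasks)
    print(collected)


asyncio.run(main())
```

Using `partial` keeps a single synchronous callback definition while still carrying per-iteration context into it, which is the same reason the PR swaps the zero-argument `extract_requests_from_response` for the bound `create_requests_from_response`.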
|
|