|
1 | 1 | import logging |
2 | 2 | from datetime import date, timedelta |
3 | 3 | from typing import Dict |
| 4 | +from urllib.parse import urljoin |
4 | 5 |
|
5 | 6 | import requests |
6 | 7 | from faker import Faker # skipcq: BAN-B410 |
7 | 8 | from lxml.etree import HTMLParser # skipcq: BAN-B410 |
8 | 9 | from lxml.html import document_fromstring |
9 | 10 | from requests import HTTPError, Response |
10 | | -from requests.exceptions import MissingSchema |
11 | 11 |
|
12 | 12 | from api_app.analyzers_manager.classes import FileAnalyzer |
13 | 13 | from api_app.models import PythonConfig |
@@ -137,32 +137,25 @@ def identify_text_input(self, input_name: str) -> str: |
137 | 137 | if input_name in names: |
138 | 138 | return fake_value |
139 | 139 |
|
140 | | - def extract_action_attribute(self, form) -> str: |
| 140 | + # also check the error logs |
| 141 | + |
| 142 | + @staticmethod |
| 143 | + def extract_action_attribute(base_site: str, form) -> str: |
| 144 | + # we always return a URL to prevent a MissingSchema error in requests |
141 | 145 | form_action: str = form.get("action", None) |
142 | 146 | if not form_action: |
143 | 147 | logger.info( |
144 | | - f"'action' attribute not found in form. Defaulting to {self.target_site=}" |
| 148 | + f"'action' attribute not found in form. Defaulting to {base_site=}" |
145 | 149 | ) |
146 | | - form_action = self.target_site |
147 | | - elif form_action.startswith("/"): # pure relative url |
148 | | - logger.info(f"Found relative url in {form_action=}") |
149 | | - form_action = form_action.replace("/", "", 1) |
150 | | - base_site = self.target_site |
151 | | - |
152 | | - if base_site.endswith("/"): |
153 | | - base_site = base_site[:-1] |
154 | | - form_action = base_site + "/" + form_action |
155 | | - elif ( |
156 | | - "." in form_action and "://" not in form_action |
157 | | - ): # found a domain (relative file names such as "login.php" should start with /) |
158 | | - logger.info(f"Found a domain in form action {form_action=}") |
159 | | - else: |
160 | | - base_site = self.target_site |
161 | | - |
162 | | - if base_site.endswith("/"): |
163 | | - base_site = base_site[:-1] |
164 | | - form_action = base_site + "/" + form_action |
165 | | - |
| 150 | + return base_site |
| 151 | + if "://" not in base_site: |
| 152 | + # if the target site is a bare domain, add a temporary default |
| 153 | + # scheme so we can use urljoin as if it were a URL |
| 154 | + base_site = "https://" + base_site |
| 155 | + |
| 156 | + form_action = urljoin(base_site, form_action) |
| 157 | + if "://" not in form_action: |
| 158 | + form_action = "https://" + form_action |
166 | 159 | logger.info(f"Extracted action to post data to: {form_action}") |
167 | 160 |
|
168 | 161 | return form_action |
@@ -203,34 +196,21 @@ def compile_form_field(self, form) -> dict: |
203 | 196 |
|
204 | 197 | def perform_request_to_form(self, form) -> Response: |
205 | 198 | params = self.compile_form_field(form) |
206 | | - dest_url = self.extract_action_attribute(form) |
| 199 | + dest_url = self.extract_action_attribute(self.target_site, form) |
207 | 200 | logger.info(f"Job #{self.job_id}: Sending {params=} to submit url {dest_url}") |
208 | 201 | headers = { |
209 | 202 | "User-Agent": self.user_agent, |
210 | 203 | } |
211 | | - try: |
212 | | - response = requests.post( |
213 | | - url=dest_url, |
214 | | - data=params, |
215 | | - headers=headers, |
216 | | - proxies=( |
217 | | - {"http": self.proxy_address, "https": self.proxy_address} |
218 | | - if self.proxy_address |
219 | | - else None |
220 | | - ), |
221 | | - ) |
222 | | - except MissingSchema: |
223 | | - logger.info(f"Adding default 'https://' schema to {dest_url}") |
224 | | - response = requests.post( |
225 | | - url="https://" + dest_url, |
226 | | - data=params, |
227 | | - headers=headers, |
228 | | - proxies=( |
229 | | - {"http": self.proxy_address, "https": self.proxy_address} |
230 | | - if self.proxy_address |
231 | | - else None |
232 | | - ), |
233 | | - ) |
| 204 | + response = requests.post( |
| 205 | + url=dest_url, |
| 206 | + data=params, |
| 207 | + headers=headers, |
| 208 | + proxies=( |
| 209 | + {"http": self.proxy_address, "https": self.proxy_address} |
| 210 | + if self.proxy_address |
| 211 | + else None |
| 212 | + ), |
| 213 | + ) |
234 | 214 | logger.info(f"Request headers: {response.request.headers}") |
235 | 215 | return response |
236 | 216 |
|
|
0 commit comments