|
1 | 1 | """Common functions and network utilities.""" |
2 | 2 |
|
| 3 | +import json |
3 | 4 | import time |
4 | 5 | from enum import StrEnum |
| 6 | +from io import BytesIO |
5 | 7 |
|
| 8 | +import certifi |
6 | 9 | import httpx |
7 | 10 | import loguru |
| 11 | +import pycurl |
| 12 | +from selenium import webdriver |
| 13 | +from selenium.common.exceptions import WebDriverException |
| 14 | +from selenium.webdriver.chrome.options import Options |
| 15 | +from selenium.webdriver.common.by import By |
| 16 | +from selenium.webdriver.support import expected_conditions as ec |
| 17 | +from selenium.webdriver.support.ui import WebDriverWait |
8 | 18 |
|
9 | 19 |
|
10 | 20 | class HttpMethod(StrEnum): |
@@ -148,3 +158,152 @@ def make_http_request_with_retries( |
148 | 158 | else: |
149 | 159 | logger.info("Retrying...") |
150 | 160 | return None |
| 161 | + |
| 162 | + |
def parse_response_headers(headers_bytes: bytes) -> dict[str, str]:
    """Parse HTTP response headers from bytes into a dictionary.

    Header lines are expected in the raw wire format, separated by
    CRLF, with each header written as ``Name: value``. Lines without
    a ``": "`` separator (status line, blank lines) are skipped.

    Returns
    -------
    dict
        A dictionary of HTTP response headers.
    """
    parsed: dict[str, str] = {}
    for raw_line in headers_bytes.decode("utf-8").split("\r\n"):
        # partition() splits on the first occurrence only, so header
        # values containing ": " are preserved intact.
        name, separator, value = raw_line.partition(": ")
        if separator:
            parsed[name] = value
    return parsed
| 178 | + |
| 179 | + |
def send_http_request_with_retries_pycurl(
    url: str,
    data: dict | None = None,
    delay_before_request: float = 1.0,
    logger: "loguru.Logger" = loguru.logger,
) -> dict:
    """Query the Figshare API and return the JSON response.

    NOTE(review): despite the name, this function performs a single
    request — no retry loop is implemented here.

    Parameters
    ----------
    url : str
        URL to send the request to.
    data : dict, optional
        Data to send in the request body (for POST requests).
    delay_before_request : float, optional
        Time to wait before sending the request, in seconds.
    logger : loguru.Logger, optional
        Logger used to report JSON decoding errors.

    Returns
    -------
    dict
        A dictionary with the following keys:
        - status_code: HTTP status code of the response.
        - elapsed_time: Time taken to perform the request.
        - headers: Dictionary of response headers.
        - response: JSON response from the API (None if decoding fails).
    """
    # First, we wait.
    # https://docs.figshare.com/#figshare_documentation_api_description_rate_limiting
    # "We recommend that clients use the API responsibly
    # and do not make more than one request per second."
    time.sleep(delay_before_request)
    request_headers = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/143.0.0.0 Safari/537.36"
        ),
        "Content-Type": "application/json",
    }
    results = {}
    # Buffers that capture the response body and headers.
    body_buffer = BytesIO()
    header_buffer = BytesIO()
    curl = pycurl.Curl()
    # Release the Curl handle even if setopt()/perform() raises;
    # the previous version leaked the handle on any pycurl error.
    try:
        # Set the URL to send the request to.
        curl.setopt(curl.URL, url)
        # Add headers as a list of "Name: value" strings.
        headers_lst = [f"{key}: {value}" for key, value in request_headers.items()]
        curl.setopt(curl.HTTPHEADER, headers_lst)
        # Handle SSL certificates.
        curl.setopt(curl.CAINFO, certifi.where())
        # Follow redirects.
        curl.setopt(curl.FOLLOWLOCATION, True)  # noqa: FBT003
        # If data is provided, set the request to POST and add the data.
        if data is not None:
            curl.setopt(curl.POST, True)  # noqa: FBT003
            curl.setopt(curl.POSTFIELDS, json.dumps(data))
        curl.setopt(curl.WRITEFUNCTION, body_buffer.write)
        curl.setopt(curl.HEADERFUNCTION, header_buffer.write)
        # Perform the request.
        curl.perform()
        # Collect status code and total request time.
        results["status_code"] = curl.getinfo(curl.RESPONSE_CODE)
        results["elapsed_time"] = curl.getinfo(curl.TOTAL_TIME)
    finally:
        curl.close()
    # Parse the captured response headers.
    results["headers"] = parse_response_headers(header_buffer.getvalue())
    # Decode the response body and parse it as JSON.
    response_text = body_buffer.getvalue().decode("utf-8")
    try:
        results["response"] = json.loads(response_text)
    except json.JSONDecodeError:
        logger.error("Error decoding JSON response:")
        logger.error(response_text[:100])
        results["response"] = None
    return results
| 267 | + |
| 268 | + |
def get_html_page_with_selenium(
    url: str, tag: str = "body", logger: "loguru.Logger" = loguru.logger
) -> str | None:
    """Get page content using Selenium.

    Parameters
    ----------
    url : str
        URL of the web page to retrieve.
    tag : str, optional
        CSS selector to wait for before retrieving the page content
        (default is "body").
    logger : loguru.Logger, optional
        Logger used to report retrieval errors.

    Returns
    -------
    str | None
        Visible text of the selected element (Selenium ``.text``, not
        raw HTML markup), or None if an error occurs or the content
        is empty.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--enable-javascript")
    page_content = ""
    logger.info("Retrieving page with Selenium:")
    logger.info(url)
    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        # Wait up to 10 s for the target element to become visible,
        # so JavaScript-rendered content has a chance to appear.
        page_content = (
            WebDriverWait(driver, 10)
            .until(ec.visibility_of_element_located((By.CSS_SELECTOR, tag)))
            .text
        )
    except WebDriverException as e:
        # TimeoutException from WebDriverWait is a WebDriverException
        # subclass, so timeouts are handled here as well.
        logger.error("Cannot retrieve page:")
        logger.error(url)
        logger.error(f"Selenium error: {e}")
        return None
    finally:
        # Always shut the browser down; the previous version only quit
        # on success, leaking a Chrome process on every failure.
        if driver is not None:
            driver.quit()
    if not page_content:
        logger.error("Retrieved page content is empty.")
        return None
    return page_content
0 commit comments