Skip to content

Commit 44941b5

Browse files
feat: add headers argument and a custom user-agents for http requests (#53)
* fix: added custom-headers with user-agents for requests Signed-off-by: Peter Staar <[email protected]> * updated the logic to ensure we always have headers Signed-off-by: Peter Staar <[email protected]> * use lower case headers and change to docling-core agent name Signed-off-by: Michele Dolfi <[email protected]> * change default Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Peter Staar <[email protected]> Signed-off-by: Michele Dolfi <[email protected]> Co-authored-by: Michele Dolfi <[email protected]>
1 parent 2c88e56 commit 44941b5

File tree

1 file changed

+16
-3
lines changed

1 file changed

+16
-3
lines changed

docling_core/utils/file.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,15 +5,18 @@
55

66
"""File-related utilities."""
77

8+
import importlib
89
import tempfile
910
from pathlib import Path
10-
from typing import Union
11+
from typing import Dict, Optional, Union
1112

1213
import requests
1314
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
1415

1516

16-
def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
17+
def resolve_file_source(
18+
source: Union[Path, AnyHttpUrl, str], headers: Optional[Dict[str, str]] = None
19+
) -> Path:
1720
"""Resolves the source (URL, path) of a file to a local file path.
1821
1922
If a URL is provided, the content is first downloaded to a temporary local file.
@@ -29,7 +32,17 @@ def resolve_file_source(source: Union[Path, AnyHttpUrl, str]) -> Path:
2932
"""
3033
try:
3134
http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
32-
res = requests.get(http_url, stream=True)
35+
36+
# make all header keys lower case
37+
_headers = headers or {}
38+
req_headers = {k.lower(): v for k, v in _headers.items()}
39+
# add user-agent if not set
40+
if "user-agent" not in req_headers:
41+
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
42+
req_headers["user-agent"] = agent_name
43+
44+
# fetch the page
45+
res = requests.get(http_url, stream=True, headers=req_headers)
3346
res.raise_for_status()
3447
fname = None
3548
# try to get filename from response header

0 commit comments

Comments
 (0)