Skip to content

Commit 3eeb259

Browse files
dolfim-ibm and vtempest authored
feat: Download Google docs and drive files via export url (#335)
* # Google Docs, Files, PDF URLs, Spreadsheets, Presentations: convert to export URL

  Signed-off-by: vtempest <[email protected]>

* standard way of exporting ?format=

  Signed-off-by: vtempest <[email protected]>

---------

Signed-off-by: vtempest <[email protected]>
Co-authored-by: vtempest <[email protected]>
1 parent 062124e commit 3eeb259

File tree

1 file changed

+27
-0
lines changed

1 file changed

+27
-0
lines changed

docling_core/utils/file.py

Lines changed: 27 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
"""File-related utilities."""
77

88
import importlib
9+
import re
910
import tempfile
1011
from io import BytesIO
1112
from pathlib import Path
@@ -76,6 +77,32 @@ def resolve_source_to_stream(
7677
agent_name = f"docling-core/{importlib.metadata.version('docling-core')}"
7778
req_headers["user-agent"] = agent_name
7879

80+
# Google Docs, Files, PDF URLs, Spreadsheets, Presentations: convert to export URL
81+
google_doc_id = re.search(
82+
r"google\.com\/(file|document|spreadsheets|presentation)\/d\/([\w-]+)",
83+
str(http_url),
84+
)
85+
if google_doc_id:
86+
doc_type = google_doc_id.group(1)
87+
doc_id = google_doc_id.group(2)
88+
89+
if doc_type == "file":
90+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
91+
f"https://drive.google.com/uc?export=download&id={doc_id}"
92+
)
93+
elif doc_type == "document":
94+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
95+
f"https://docs.google.com/document/d/{doc_id}/export?format=docx"
96+
)
97+
elif doc_type == "spreadsheets":
98+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
99+
f"https://docs.google.com/spreadsheets/d/{doc_id}/export?format=xlsx"
100+
)
101+
elif doc_type == "presentation":
102+
http_url = TypeAdapter(AnyHttpUrl).validate_python(
103+
f"https://docs.google.com/presentation/d/{doc_id}/export?format=pptx"
104+
)
105+
79106
# fetch the page
80107
res = requests.get(http_url, stream=True, headers=req_headers)
81108
res.raise_for_status()

0 commit comments

Comments
 (0)