Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions core/tool_tiers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ docs:
- get_doc_content
- create_doc
- modify_doc_text
- get_doc_markdown
extended:
- search_docs
- find_and_replace_doc
Expand Down
170 changes: 170 additions & 0 deletions gdocs/docs_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import logging
import asyncio
import io
import re

from googleapiclient.http import MediaIoBaseDownload

Expand Down Expand Up @@ -47,6 +48,71 @@

logger = logging.getLogger(__name__)

def _replace_base64_images_with_placeholders(markdown_content: str) -> str:
"""
Replace base64 encoded images with readable placeholders.

Converts:
[image1]: <...>
To:
[image1]: [PNG image, ~2.5MB]

Also handles inline images:
![alt text](...)
To:
![alt text]([JPEG image, ~1.2MB])
"""

def calculate_size(base64_str: str) -> str:
"""Calculate approximate size of base64 encoded data."""
# Base64 encoding increases size by ~33%, so reverse that
# Each base64 char represents 6 bits, so 4 chars = 3 bytes
num_chars = len(base64_str)
size_bytes = (num_chars * 3) // 4

if size_bytes < 1024:
return f"{size_bytes}B"
elif size_bytes < 1024 * 1024:
return f"{size_bytes / 1024:.1f}KB"
else:
return f"{size_bytes / (1024 * 1024):.1f}MB"

def replace_reference_image(match):
"""Replace reference-style image with placeholder."""
image_ref = match.group(1)
image_type = match.group(2)
base64_data = match.group(3)

size = calculate_size(base64_data)
return f"[{image_ref}]: [{image_type.upper()} image, ~{size}]"

def replace_inline_image(match):
"""Replace inline image with placeholder."""
alt_text = match.group(1)
image_type = match.group(2)
base64_data = match.group(3)

size = calculate_size(base64_data)
return f"![{alt_text}]([{image_type.upper()} image, ~{size}])"

# Pattern for reference-style images: [image1]: <data:image/png;base64,...>
reference_pattern = r'\[([^\]]+)\]:\s*<data:image/([^;]+);base64,([^>]+)>'
markdown_content = re.sub(reference_pattern, replace_reference_image, markdown_content)

# Pattern for inline images: ![alt text](data:image/jpeg;base64,...)
inline_pattern = r'!\[([^\]]*)\]\(data:image/([^;]+);base64,([^)]+)\)'
markdown_content = re.sub(inline_pattern, replace_inline_image, markdown_content)

# Also handle images without angle brackets (some variations)
reference_pattern_no_brackets = r'\[([^\]]+)\]:\s*data:image/([^;]+);base64,(\S+)'
markdown_content = re.sub(reference_pattern_no_brackets, replace_reference_image, markdown_content)
Comment on lines +98 to +108
Copy link

Copilot AI Aug 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function performs three separate regex substitutions on the same content. For large documents, this could be inefficient. Consider combining the patterns into a single regex with alternation groups or using re.compile() to pre-compile the patterns if this function will be called frequently.

Suggested change
# Pattern for reference-style images: [image1]: <data:image/png;base64,...>
reference_pattern = r'\[([^\]]+)\]:\s*<data:image/([^;]+);base64,([^>]+)>'
markdown_content = re.sub(reference_pattern, replace_reference_image, markdown_content)
# Pattern for inline images: ![alt text](data:image/jpeg;base64,...)
inline_pattern = r'!\[([^\]]*)\]\(data:image/([^;]+);base64,([^)]+)\)'
markdown_content = re.sub(inline_pattern, replace_inline_image, markdown_content)
# Also handle images without angle brackets (some variations)
reference_pattern_no_brackets = r'\[([^\]]+)\]:\s*data:image/([^;]+);base64,(\S+)'
markdown_content = re.sub(reference_pattern_no_brackets, replace_reference_image, markdown_content)
# Use pre-compiled patterns for efficiency
markdown_content = _REFERENCE_PATTERN.sub(replace_reference_image, markdown_content)
markdown_content = _INLINE_PATTERN.sub(replace_inline_image, markdown_content)
markdown_content = _REFERENCE_PATTERN_NO_BRACKETS.sub(replace_reference_image, markdown_content)

Copilot uses AI. Check for mistakes.

# Log if we found and replaced images
if 'data:image' in markdown_content:
logger.warning("Found remaining base64 images that couldn't be processed")

return markdown_content

@server.tool()
@handle_http_errors("search_docs", is_read_only=True, service_type="docs")
@require_google_service("drive", "drive_read")
Expand Down Expand Up @@ -243,6 +309,110 @@ def process_tab_hierarchy(tab, level=0):
)
return header + body_text

@server.tool()
@handle_http_errors("get_doc_markdown", is_read_only=True, service_type="docs")
@require_google_service("drive", "drive_read")
async def get_doc_markdown(
    service,
    user_google_email: str,
    document_id: str,
    include_metadata: bool = True,
    process_images: bool = True,
) -> str:
    """
    Retrieves a Google Doc in Markdown format, preserving formatting like headers,
    bold, italic, links, lists, and tables.

    Args:
        service: Drive API client used for the metadata and export requests
            (presumably supplied by the `require_google_service` decorator; confirm).
        user_google_email (str): The user's Google email address. Required.
        document_id (str): The ID of the Google Doc to export. Required.
        include_metadata (bool): Whether to include document metadata header. Defaults to True.
        process_images (bool): Whether to replace base64 images with placeholders. Defaults to True.
            If True, replaces large base64 encoded images with readable placeholders like [PNG image, ~2.5MB].
            If False, keeps original base64 encoded images (warning: may consume significant context).

    Returns:
        str: The document content in Markdown format, optionally preceded by a
        front-matter style metadata header. If the file is not a native Google
        Doc, or the Markdown export is rejected as an unsupported MIME type,
        a human-readable error message is returned instead.

    Raises:
        Exception: Any other failure from the Drive requests or the download is
        logged and re-raised unchanged.
    """
    logger.info(f"[get_doc_markdown] Exporting document '{document_id}' as Markdown for user '{user_google_email}'")

    # Bound before the try block so the error handler below can always
    # reference it, even when the metadata request itself is what failed.
    file_name = document_id

    try:
        # Get file metadata first
        file_metadata = await asyncio.to_thread(
            service.files().get(
                fileId=document_id,
                fields="id, name, mimeType, webViewLink, modifiedTime"
            ).execute
        )

        mime_type = file_metadata.get("mimeType", "")
        file_name = file_metadata.get("name", "Unknown Document")
        web_view_link = file_metadata.get("webViewLink", "#")
        modified_time = file_metadata.get("modifiedTime", "Unknown")

        # Check if it's a Google Doc
        if mime_type != "application/vnd.google-apps.document":
            return f"Error: File '{file_name}' is not a Google Doc (MIME type: {mime_type}). This tool only works with native Google Docs."

        # Export as Markdown using Drive API
        request = service.files().export_media(
            fileId=document_id,
            mimeType='text/markdown'  # Native Markdown export
        )

        # Download the exported content chunk by chunk. Each blocking
        # next_chunk() call runs in a worker thread, the same way the
        # metadata request above does, so the event loop stays responsive.
        fh = io.BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False

        while not done:
            status, done = await asyncio.to_thread(downloader.next_chunk)
            if status:
                logger.debug(f"Download progress: {int(status.progress() * 100)}%")

        # Get the Markdown content
        markdown_content = fh.getvalue().decode('utf-8')

        # Process base64 images and replace with placeholders (if requested)
        if process_images:
            original_size = len(markdown_content)
            markdown_content = _replace_base64_images_with_placeholders(markdown_content)
            new_size = len(markdown_content)
            if new_size < original_size:
                # Sizes are character counts of the decoded text, so this is an estimate.
                size_reduction_mb = (original_size - new_size) / (1024 * 1024)
                logger.info(f"[get_doc_markdown] Replaced base64 images with placeholders, reduced size by ~{size_reduction_mb:.1f}MB")

        if not include_metadata:
            return markdown_content

        # Prepare output: metadata header followed by a blank line and the content
        header = (
            "---\n"
            f"title: {file_name}\n"
            f"document_id: {document_id}\n"
            f"modified: {modified_time}\n"
            f"link: {web_view_link}\n"
            "---\n"
            "\n"
        )
        return header + markdown_content

    except Exception as e:
        # If Markdown export fails, provide helpful error message.
        # NOTE(review): matching on message text is fragile; consider inspecting
        # the API error type/status instead once the available exception
        # classes in this module are confirmed.
        if "Invalid mime type" in str(e) or "Export only supports" in str(e):
            error_msg = (
                f"Markdown export failed for document '{file_name}'. "
                "This might be because:\n"
                "1. The document has complex formatting not supported in Markdown\n"
                "2. Your Google Workspace version doesn't support Markdown export yet\n\n"
                "Alternative: Use 'get_doc_content' to get plain text, or try 'text/plain' export."
            )
            logger.error(f"Markdown export not supported: {e}")
            return error_msg
        else:
            logger.error(f"Failed to export document as Markdown: {e}")
            raise

@server.tool()
@handle_http_errors("list_docs_in_folder", is_read_only=True, service_type="docs")
@require_google_service("drive", "drive_read")
Expand Down