Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/backend/core/api/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Import endpoints for Outline (zip upload)."""

from __future__ import annotations

import io
import zipfile

import rest_framework as drf

from core.services.outline_import import OutlineImportError, process_outline_zip


# ---------- Outline (Zip Upload) ----------


class OutlineImportUploadView(drf.views.APIView):
    """Accept an Outline export uploaded as a multipart ``file`` field.

    The upload is checked to be a readable zip archive, then handed to
    ``process_outline_zip`` together with the requesting user.
    """

    parser_classes = [drf.parsers.MultiPartParser]
    permission_classes = [drf.permissions.IsAuthenticated]

    def post(self, request):
        """Import the uploaded archive and return the created document ids.

        Responds with HTTP 201 and ``{"created_document_ids": [...]}`` on success.

        Raises:
            ValidationError: when the ``file`` part is missing, is not named
                ``*.zip``, is not a valid zip archive, or when the import
                service raises ``OutlineImportError``.
        """
        uploaded = request.FILES.get("file")
        if not uploaded:
            raise drf.exceptions.ValidationError({"file": "File is required"})

        # The name may be missing or None; compare case-insensitively so an
        # upload named "EXPORT.ZIP" is accepted as well.
        name = getattr(uploaded, "name", "") or ""
        if not name.lower().endswith(".zip"):
            raise drf.exceptions.ValidationError({"file": "Must be a .zip file"})

        try:
            content = uploaded.read()
            # Fail fast if the upload is not a valid zip archive
            with zipfile.ZipFile(io.BytesIO(content)):
                pass
            created_ids = process_outline_zip(request.user, content)
        except zipfile.BadZipFile as exc:
            raise drf.exceptions.ValidationError({"file": "Invalid zip archive"}) from exc
        except OutlineImportError as exc:
            # Surface the service's own message to the client.
            raise drf.exceptions.ValidationError({"file": str(exc)}) from exc

        return drf.response.Response(
            {"created_document_ids": created_ids}, status=drf.status.HTTP_201_CREATED
        )
205 changes: 205 additions & 0 deletions src/backend/core/services/outline_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
"""Service to import an Outline export (.zip) into Docs documents."""

from __future__ import annotations

import io
import logging
import mimetypes
import posixpath
import re
import uuid
import zipfile
from typing import Iterable

from django.conf import settings
from django.core.files.storage import default_storage

from lasuite.malware_detection import malware_detection

from core import enums, models
from core.services.converter_services import YdocConverter


class OutlineImportError(Exception):
    """Signal that an Outline export archive cannot be imported.

    Raised for archives that are invalid or that contain unsafe entries.
    """


def _ensure_dir_documents(user, dir_path: str, dir_docs: dict[str, models.Document]) -> models.Document | None:
    """Walk ``dir_path`` and create every container document that is still missing.

    ``dir_docs`` maps slash-joined directory paths to their container document;
    it is consulted first and updated in place with each document created here.
    Each new container gets an OWNER access for ``user``.

    Returns the document for the last path segment, or None when ``dir_path``
    is empty or holds no non-empty segment.
    """
    if not dir_path:
        return None

    deepest: models.Document | None = None
    walked: list[str] = []
    for segment in dir_path.split("/"):
        if not segment:
            # Ignore empty segments produced by leading, trailing or doubled slashes.
            continue
        walked.append(segment)
        key = "/".join(walked)

        if key in dir_docs:
            deepest = dir_docs[key]
            continue

        if deepest is not None:
            container = deepest.add_child(creator=user, title=segment)
        else:
            # First missing segment at the top level becomes a new root document.
            container = models.Document.add_root(
                depth=1,
                creator=user,
                title=segment,
                link_reach=models.LinkReachChoices.RESTRICTED,
            )

        models.DocumentAccess.objects.update_or_create(
            document=container,
            user=user,
            defaults={"role": models.RoleChoices.OWNER},
        )
        dir_docs[key] = container
        deepest = container

    return deepest


def _attachment_extension(arcname: str) -> str:
    """Return a lowercase, storage-safe extension for ``arcname`` ("bin" when it has none)."""
    # Only look at the final path component so a dot in a directory name
    # (e.g. "dir.v2/image") can never leak a "/" into the extension.
    suffix = posixpath.splitext(posixpath.basename(arcname))[1]
    ext = suffix[1:].lower()
    return ext if ext.isascii() and ext.isalnum() else "bin"


def _upload_attachment(user, doc: models.Document, arcname: str, data: bytes) -> str:
    """Upload a binary asset into object storage and return its media URL.

    The object is stored under the document's attachments folder with a fresh
    UUID as file name; the key is appended to ``doc.attachments`` and the
    document is saved, then the file is submitted to malware detection.

    Args:
        user: owner recorded in the object's metadata.
        doc: document the attachment belongs to.
        arcname: name of the asset as referenced in the archive, used only to
            guess the content type and the file extension.
        data: raw bytes of the asset.

    Returns:
        ``settings.MEDIA_BASE_URL`` + ``settings.MEDIA_URL`` + the storage key.
    """
    content_type, _ = mimetypes.guess_type(arcname)
    ext = _attachment_extension(arcname)
    file_id = uuid.uuid4()
    key = f"{doc.key_base}/{enums.ATTACHMENTS_FOLDER:s}/{file_id!s}.{ext}"
    extra_args = {
        "Metadata": {
            "owner": str(user.id),
            "status": enums.DocumentAttachmentStatus.READY,
        },
    }
    if content_type:
        extra_args["ContentType"] = content_type

    default_storage.connection.meta.client.upload_fileobj(
        io.BytesIO(data), default_storage.bucket_name, key, ExtraArgs=extra_args
    )
    doc.attachments.append(key)
    doc.save(update_fields=["attachments", "updated_at"])
    malware_detection.analyse_file(key, document_id=doc.id)
    return f"{settings.MEDIA_BASE_URL}{settings.MEDIA_URL}{key}"


def process_outline_zip(user, zip_bytes: bytes) -> list[str]:
    """Process an Outline export zip and create Docs documents.

    Every markdown file in the archive becomes a document; directories become
    container documents (or reuse the document of a sibling ``<dir>.md`` file)
    so the hierarchy is preserved. Local images referenced from the markdown
    are uploaded as attachments and their links rewritten to the media URL.

    Args:
        user: creator and owner of every document created.
        zip_bytes: raw content of the uploaded archive.

    Returns:
        The list of created document IDs (stringified UUIDs) corresponding to
        markdown-backed documents. Container folders used to rebuild hierarchy
        are not listed.

    Raises:
        OutlineImportError: if an entry is absolute, contains a backslash or
            has a ``..`` segment.
        zipfile.BadZipFile: if ``zip_bytes`` is not a zip archive.
    """
    logger = logging.getLogger(__name__)
    created_ids: list[str] = []

    # The context manager closes the archive even when the import fails midway.
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        names = archive.namelist()

        # Basic Zip Slip protection: refuse absolute or parent-traversal entries
        for name in names:
            if name.startswith("/") or "\\" in name:
                raise OutlineImportError("Unsafe path in archive")
            if ".." in name.split("/"):
                raise OutlineImportError("Unsafe path in archive")

        # Sorting guarantees "Doc.md" is handled before anything under "Doc/"
        # because "." sorts before "/".
        md_files = sorted(
            n
            for n in names
            if n.lower().endswith(".md")
            and not n.startswith("__MACOSX/")
            and not any(part.startswith(".") for part in n.split("/"))
        )
        if not md_files:
            return created_ids

        # Collect every ancestor directory of every entry once, instead of
        # rescanning the whole name list for each markdown file.
        dir_prefixes: set[str] = set()
        for name in names:
            parts = name.split("/")
            for depth in range(1, len(parts)):
                dir_prefixes.add("/".join(parts[:depth]))

        # Markdown files that have a directory of the same name (Outline nested docs)
        # e.g., "Doc.md" and "Doc/" both exist -> "Doc" is a parent with nested children.
        # The suffix is sliced off by length because it was matched case-insensitively.
        md_with_dirs = {path[:-3] for path in md_files if path[:-3] in dir_prefixes}

        dir_docs: dict[str, models.Document] = {}
        img_pattern = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
        converter = YdocConverter()

        for md_path in md_files:
            dir_path, _, file_name = md_path.rpartition("/")
            parent_doc = _ensure_dir_documents(user, dir_path, dir_docs)

            try:
                raw_md = archive.read(md_path).decode("utf-8", errors="ignore")
            except Exception:  # noqa: BLE001
                # Best effort: keep importing, but leave a trace of the unreadable entry.
                logger.warning("Could not read %s from Outline archive", md_path, exc_info=True)
                raw_md = ""

            # Title is the first level-1 heading, falling back to the file name
            # when there is none or when it is blank.
            title_match = re.search(r"^#\s+(.+)$", raw_md, flags=re.MULTILINE)
            title = title_match.group(1).strip() if title_match else ""
            if not title:
                title = file_name.rsplit(".", 1)[0]

            if parent_doc is None:
                doc = models.Document.add_root(
                    depth=1,
                    creator=user,
                    title=title,
                    link_reach=models.LinkReachChoices.RESTRICTED,
                )
            else:
                doc = parent_doc.add_child(creator=user, title=title)

            # If this md file has a corresponding directory, register it as a container
            # so nested children will use this doc as parent instead of creating a duplicate
            base_path = md_path[:-3]
            if base_path in md_with_dirs:
                dir_docs[base_path] = doc

            models.DocumentAccess.objects.update_or_create(
                document=doc,
                user=user,
                defaults={"role": models.RoleChoices.OWNER},
            )

            # Bind the per-iteration values as defaults so the callback never
            # depends on the loop variables after this iteration.
            def replace_img_link(match: re.Match[str], doc=doc, dir_path=dir_path) -> str:
                """Upload the referenced local image and point the link at its media URL."""
                url = match.group(1)
                if url.startswith(("http://", "https://")):
                    return match.group(0)
                asset_rel = f"{dir_path}/{url}" if dir_path else url
                asset_rel = re.sub(r"/+", "/", asset_rel)
                # sanitize computed asset path
                if asset_rel.startswith("/") or ".." in asset_rel.split("/"):
                    return match.group(0)
                try:
                    data = archive.read(asset_rel)
                except KeyError:
                    # Not in the archive: leave the link untouched.
                    return match.group(0)
                media_url = _upload_attachment(user, doc, arcname=url, data=data)
                # Splice by position so an alt text repeating the path is left as is.
                whole = match.group(0)
                start = match.start(1) - match.start(0)
                end = match.end(1) - match.start(0)
                return f"{whole[:start]}{media_url}{whole[end:]}"

            rewritten_md = img_pattern.sub(replace_img_link, raw_md)

            try:
                doc.content = converter.convert(
                    rewritten_md.encode("utf-8"),
                    content_type="text/markdown",
                    accept="application/vnd.yjs.doc",
                )
                doc.save()
            except Exception:  # noqa: BLE001
                # Keep doc without content on conversion error but continue import
                logger.exception(
                    "Could not convert %s; document %s is left without content", md_path, doc.id
                )

            created_ids.append(str(doc.id))

    return created_ids
127 changes: 127 additions & 0 deletions src/backend/core/tests/imports/test_api_outline_import_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Tests for the Outline zip import API endpoint."""

import io
import zipfile
from unittest.mock import patch

from django.core.files.uploadedfile import SimpleUploadedFile

import pytest
from rest_framework.test import APIClient

from core import factories
from core.api.viewsets import malware_detection
from core.services.outline_import import OutlineImportError


# Apply the django_db marker to every test in this module.
pytestmark = pytest.mark.django_db


def make_zip_with_markdown_and_image(md_path: str, md_content: str, img_path: str, img_bytes: bytes) -> bytes:
    """Build an in-memory zip holding one markdown file and, optionally, one image.

    The image entry is skipped when ``img_path`` is empty so that callers who
    only need a markdown file do not end up with an entry that has no name.

    Returns the raw bytes of the archive.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, mode="w") as zf:
        zf.writestr(md_path, md_content)
        if img_path:
            zf.writestr(img_path, img_bytes)
    return buf.getvalue()


def test_outline_import_upload_anonymous_forbidden():
    """The import endpoint answers 401 to requests without credentials."""
    # Minimal empty zip: opening in write mode and closing yields a valid archive.
    empty_archive = io.BytesIO()
    zipfile.ZipFile(empty_archive, mode="w").close()

    payload = {
        "file": SimpleUploadedFile(
            name="export.zip",
            content=empty_archive.getvalue(),
            content_type="application/zip",
        )
    }

    response = APIClient().post("/api/v1.0/imports/outline/upload", payload, format="multipart")

    assert response.status_code == 401
    assert response.json()["detail"] == "Authentication credentials were not provided."


@patch("core.services.converter_services.YdocConverter.convert", return_value="YmFzZTY0Y29udGVudA==")
def test_outline_import_upload_authenticated_success(mock_convert):
    """A logged-in user uploads an Outline export and gets the created document ids back."""
    api_client = APIClient()
    api_client.force_login(factories.UserFactory())

    # Small PNG payload stored next to the markdown file that references it.
    png_bytes = (
        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00"
        b"\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\xf8\xff\xff?\x00\x05\xfe\x02\xfe"
        b"\xa7V\xbd\xfa\x00\x00\x00\x00IEND\xaeB`\x82"
    )
    archive = make_zip_with_markdown_and_image(
        "Folder1/page.md",
        "# Imported Title\n\nSome text.\n\n![Alt](image.png)\n",
        "Folder1/image.png",
        png_bytes,
    )
    payload = {
        "file": SimpleUploadedFile(name="export.zip", content=archive, content_type="application/zip")
    }

    with patch.object(malware_detection, "analyse_file") as mock_analyse_file:
        response = api_client.post("/api/v1.0/imports/outline/upload", payload, format="multipart")

    assert response.status_code == 201
    body = response.json()
    assert "created_document_ids" in body
    # Container folders are not reported, only the markdown-backed document.
    assert len(body["created_document_ids"]) == 1

    # One conversion per markdown file
    mock_convert.assert_called_once()
    # The uploaded image went through the antivirus scan
    assert mock_analyse_file.called


def test_outline_import_upload_invalid_zip_returns_validation_error():
    """A payload that is not a zip archive yields a 400 validation error, not a crash."""
    api_client = APIClient()
    api_client.force_login(factories.UserFactory())

    # Right file name and content type, but the bytes are not an archive.
    bogus = SimpleUploadedFile(name="export.zip", content=b"not-a-zip", content_type="application/zip")

    response = api_client.post("/api/v1.0/imports/outline/upload", {"file": bogus}, format="multipart")

    assert response.status_code == 400
    assert response.json() == {"file": ["Invalid zip archive"]}


@patch("core.api.imports.process_outline_zip", side_effect=OutlineImportError("boom"))
def test_outline_import_upload_outline_error_returns_validation_error(mock_process_outline):
    """An OutlineImportError raised by the service is reported as a 400 validation error."""
    api_client = APIClient()
    api_client.force_login(factories.UserFactory())

    # A valid archive is required so the request reaches the (patched) service.
    archive = make_zip_with_markdown_and_image("doc.md", "# Title", "", b"")
    payload = {
        "file": SimpleUploadedFile(name="export.zip", content=archive, content_type="application/zip")
    }

    response = api_client.post("/api/v1.0/imports/outline/upload", payload, format="multipart")

    assert response.status_code == 400
    assert response.json() == {"file": ["boom"]}
    mock_process_outline.assert_called_once()
mock_process_outline.assert_called_once()
Loading
Loading