diff --git a/CHANGES.md b/CHANGES.md index 5759551..d771194 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ ## Unreleased +- Use `pathlibfs` for scheme-agnostic source access ## 2023-10-07 0.1.0 - Add example data files in different formats diff --git a/hubspot_tech_writing/core.py b/hubspot_tech_writing/core.py index 108a9bb..bf1775d 100644 --- a/hubspot_tech_writing/core.py +++ b/hubspot_tech_writing/core.py @@ -14,7 +14,7 @@ from hubspot_tech_writing.hubspot_api import HubSpotAdapter, HubSpotBlogPost, HubSpotFile from hubspot_tech_writing.util.common import ContentTypeResolver from hubspot_tech_writing.util.html import HTMLImageTranslator -from hubspot_tech_writing.util.io import to_io +from hubspot_tech_writing.util.io import open_url, to_io logger = logging.getLogger(__name__) @@ -75,9 +75,14 @@ def upload( folder_id: t.Optional[str] = None, folder_path: t.Optional[str] = None, ): - source_path = Path(source) + source_path: Path + if isinstance(source, str): + source_path = open_url(source) + else: + source_path = source + logger.info(f"Source: {source_path}") - ctr = ContentTypeResolver(name=source_path) + ctr = ContentTypeResolver(filepath=source_path) logger.info(f"Uploading file: {source}") hsa = HubSpotAdapter(access_token=access_token) @@ -101,6 +106,7 @@ def upload( ) hit = HTMLImageTranslator(html=html, source_path=source_path, uploader=uploader) hit.discover().process() + logger.debug(hit) html = hit.html_out # Upload blog post. diff --git a/hubspot_tech_writing/hubspot_api.py b/hubspot_tech_writing/hubspot_api.py index d16b63c..8a35542 100644 --- a/hubspot_tech_writing/hubspot_api.py +++ b/hubspot_tech_writing/hubspot_api.py @@ -3,13 +3,14 @@ import os import typing as t from copy import deepcopy -from pathlib import Path +from tempfile import NamedTemporaryFile import hubspot from click import confirm from hubspot import HubSpot from hubspot.cms.blogs.blog_posts import BlogPost from hubspot.files.files import File +from pathlibfs import Path logger = logging.getLogger(__name__) @@ -130,12 +131,14 @@ def get_file_by_name(self, file: "HubSpotFile") -> File: logger.info(f"Found file: id={result.id}, path={result.path}, url={result.url}") return result - def save_file(self, file_id: str, source: str): + def save_file(self, file_id: str, source: Path): """ Save / overwrite existing file. """ + tmpfile = NamedTemporaryFile() + tmpfile.write(source.read_bytes()) return self.hs.files.files.files_api.replace( - file_id=file_id, file=source, options=json.dumps(self.FILE_OPTIONS) + file_id=file_id, file=tmpfile.name, options=json.dumps(self.FILE_OPTIONS) ) def delete_file_by_id(self, identifier: str) -> t.Optional[File]: @@ -254,7 +257,7 @@ class HubSpotFile: def __init__( self, hubspot_adapter: HubSpotAdapter, - source: t.Union[str, Path], + source: Path, identifier: t.Optional[str] = None, name: t.Optional[str] = None, folder_id: t.Optional[str] = None, @@ -286,7 +289,7 @@ def __init__( def __str__(self): return ( f"{self.__class__.__name__} identifier={self.identifier}, " - f"name={self.name}, folder={self.folder_id or self.folder_path}" + f"name={self.name}, folder={self.folder_id or self.folder_path}, source={self.source}" ) def load(self): @@ -310,7 +313,7 @@ def save(self): if not self.source: raise ValueError(f"Unable to save file without source: {self}") logger.info(f"Saving file: {self}") - return self.hsa.save_file(file_id=self.identifier, source=str(self.source)) + return self.hsa.save_file(file_id=self.identifier, source=self.source) def delete(self): """ diff --git a/hubspot_tech_writing/util/common.py b/hubspot_tech_writing/util/common.py index 0056f9a..21e01c5 100644 --- a/hubspot_tech_writing/util/common.py +++ b/hubspot_tech_writing/util/common.py @@ -1,9 +1,11 @@ import logging import typing as t -from pathlib import Path import colorlog from colorlog.escape_codes import escape_codes +from pathlibfs import Path + +from hubspot_tech_writing.util.io import path_without_scheme def setup_logging(level=logging.INFO, verbose: bool = False): @@ -23,9 +25,9 @@ class ContentTypeResolver: HTML_SUFFIXES = [".html", ".html5", ".htm"] TEXT_SUFFIXES = MARKUP_SUFFIXES + HTML_SUFFIXES + [".txt"] - def __init__(self, name: t.Union[str, Path]): - self.name = name - self.suffix = Path(name).suffix + def __init__(self, filepath: t.Union[str, Path]): + self.path = path_without_scheme(filepath) + self.suffix = self.path.suffix def is_markup(self): return self.suffix in self.MARKUP_SUFFIXES diff --git a/hubspot_tech_writing/util/html.py b/hubspot_tech_writing/util/html.py index f1353a1..f881dba 100644 --- a/hubspot_tech_writing/util/html.py +++ b/hubspot_tech_writing/util/html.py @@ -2,9 +2,10 @@ import logging import typing as t from copy import deepcopy -from pathlib import Path +from pprint import pformat from bs4 import BeautifulSoup +from pathlibfs import Path logger = logging.getLogger(__name__) @@ -12,7 +13,7 @@ @dataclasses.dataclass class HTMLImage: alt: str - src: str + src: Path class HTMLImageTranslator: @@ -21,19 +22,18 @@ class HTMLImageTranslator: After that, replace URLs in HTML document. """ - def __init__(self, html: str, source_path: t.Union[str, Path], uploader: t.Optional[t.Callable] = None): + def __init__(self, html: str, source_path: Path, uploader: t.Optional[t.Callable] = None): self.html_in: str = html self.html_out: t.Optional[str] = None - self.source_path = source_path + self.source = source_path self.uploader = uploader self.images_in: t.List[HTMLImage] = [] self.images_local: t.List[HTMLImage] = [] self.images_remote: t.List[HTMLImage] = [] def __str__(self): - return ( - f"HTMLImageTranslator:\nin: {self.images_in}\nlocal: {self.images_local}\nremote: {self.images_remote}" - ) + info = {"source": self.source, "in": self.images_in, "local": self.images_local, "remote": self.images_remote} + return f"HTMLImageTranslator:\n{pformat(info)}" def discover(self): self.scan().resolve() @@ -59,9 +59,10 @@ def resolve(self) -> "HTMLImageTranslator": """ Process discovered image elements, computing effective paths. """ - if self.source_path is None: + if self.source is None: + logger.warning("No resolving without source path") return self - parent_path = Path(self.source_path) + parent_path = self.source if parent_path.is_file(): parent_path = parent_path.parent self.images_local = [] @@ -74,7 +75,7 @@ def resolve(self) -> "HTMLImageTranslator": # Relative paths are relative to the original document. else: - image_new.src = str(Path(parent_path) / image.src) + image_new.src = parent_path / image.src self.images_local.append(image_new) return self @@ -86,10 +87,9 @@ def upload(self) -> "HTMLImageTranslator": logger.warning("No upload without uploader") return self for image_local in self.images_local: - hs_file = self.uploader(source=image_local.src, name=Path(image_local.src).name) - image_url = hs_file.url + hs_file = self.uploader(source=image_local.src, name=image_local.src.name) image_remote: HTMLImage = deepcopy(image_local) - image_remote.src = image_url + image_remote.src = hs_file.url self.images_remote.append(image_remote) return self diff --git a/hubspot_tech_writing/util/io.py b/hubspot_tech_writing/util/io.py index 0adf7b7..7762844 100644 --- a/hubspot_tech_writing/util/io.py +++ b/hubspot_tech_writing/util/io.py @@ -3,20 +3,73 @@ import typing as t from pathlib import Path -import requests +from pathlibfs import Path as PathPlus +from yarl import URL @contextlib.contextmanager def to_io(source: t.Union[str, Path, t.IO]) -> t.Generator[t.IO, None, None]: - if isinstance(source, (str, Path)): + """ + Main context manager for accessing resources. + Before accessing / opening, it converges a path string, object, or IO handle, to an IO handle. + """ + fp: t.IO + if isinstance(source, io.TextIOWrapper): + fp = source + elif isinstance(source, (str, Path, PathPlus)): source = str(source) - fp: t.IO - if source.startswith("http://") or source.startswith("https://"): - response = requests.get(source, timeout=10.0) - fp = io.StringIO(response.text) - else: - fp = open(source, "r") + path = open_url(source) + fp = path.open(mode="rt") else: - fp = source + raise TypeError(f"Unable to converge to IO handle. type={type(source)}, value={source}") yield fp fp.close() + + +def open_url(url: str) -> PathPlus: + """ + Access URL, with specific handling for GitHub URLs. + + When approached using a GitHub HTTP URL, converge it to a pathlibfs / fsspec URL, + and open it. + + Input URLs + ---------- + github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md + github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md + + Output Path + ----------- + fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino") + """ + uri = URL(url) + + if uri.scheme.startswith("github+https"): + path_fragments = uri.path.split("/")[1:] + path_kwargs = { + "username": uri.user, + "token": uri.password, + "org": path_fragments[0], + "repo": path_fragments[1], + } + + real_path_fragments = path_fragments[2:] + if path_fragments[2] in ["blob", "raw"]: + real_path_fragments = path_fragments[4:] + + downstream_url = "github://" + "/".join(real_path_fragments) + path = PathPlus(downstream_url, **path_kwargs) + + else: + path = PathPlus(url) + return path + + +def path_without_scheme(url_like: str) -> PathPlus: + """ + Return a pathlibfs Path, without the scheme. + """ + url = URL(str(url_like)) + if url.is_absolute(): + url = url.with_scheme("") + return PathPlus(str(url)) diff --git a/pyproject.toml b/pyproject.toml index afcb1ae..114528f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,9 @@ dependencies = [ "hubspot-api-client<9", "markdown<4", "mkdocs-linkcheck<2", + "pathlibfs<0.6", "requests<3", + "yarl<2", ] [project.optional-dependencies] diff --git a/tests/conftest.py b/tests/conftest.py index 55cb4eb..3341e70 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -15,6 +15,30 @@ def markdownfile() -> Path: return Path(__file__).parent / "data" / "hubspot-blog-post-original.md" +def get_markdownurl(infix: str = "", scheme: str = "https:") -> str: + return f"{scheme}//github.com/crate-workbench/hubspot-tech-writing/{infix}tests/data/hubspot-blog-post-original.md" + + +@pytest.fixture +def markdownurl_https_raw() -> str: + return get_markdownurl(infix="raw/main/") + + +@pytest.fixture +def markdownurl_github_https_bare() -> str: + return get_markdownurl(scheme="github+https:") + + +@pytest.fixture +def markdownurl_github_https_raw() -> str: + return get_markdownurl(infix="raw/main/", scheme="github+https:") + + +@pytest.fixture +def markdownurl_github_https_blob() -> str: + return get_markdownurl(infix="blob/main/", scheme="github+https:") + + @pytest.fixture def markdownfile_minimal_broken_links() -> Path: return Path(__file__).parent / "data" / "minimal-broken-links.md" diff --git a/tests/test_core.py b/tests/test_core.py index a370356..eaeb475 100644 --- a/tests/test_core.py +++ b/tests/test_core.py @@ -13,9 +13,8 @@ def test_convert_file(markdownfile): check_content(html) -def test_convert_url(): - url = "https://github.com/crate-workbench/hubspot-tech-writing/raw/main/tests/data/hubspot-blog-post-original.md" - html = convert(url) +def test_convert_url(markdownurl_https_raw): + html = convert(markdownurl_https_raw) check_content(html) diff --git a/tests/test_hubspot_blogpost.py b/tests/test_hubspot_blogpost.py index 7530538..c4c05f4 100644 --- a/tests/test_hubspot_blogpost.py +++ b/tests/test_hubspot_blogpost.py @@ -1,19 +1,25 @@ -import json import os import pytest -from hubspot.cms.blogs.blog_posts.rest import RESTResponse -from urllib3 import HTTPResponse from hubspot_tech_writing.core import delete_blogpost, upload +from tests.test_hubspot_file import response_simulator_upload +from tests.util import mkresponse -def mkresponse(data, status=200, reason="OK"): - body = json.dumps(data).encode("utf-8") - return RESTResponse(HTTPResponse(body=body, status=status, reason=reason)) +def response_simulator_create(self, method, url, **kwargs): + if method == "GET" and url == "https://api.hubapi.com/cms/v3/blogs/posts": + response = mkresponse({"total": 0, "results": []}) + elif method == "POST" and url == "https://api.hubapi.com/cms/v3/blogs/posts": + response = mkresponse({"id": "12345"}, status=201, reason="Created") + elif method == "PATCH" and url == "https://api.hubapi.com/cms/v3/blogs/posts/12345": + response = mkresponse({"id": "12345"}) + else: + raise ValueError(f"No HTTP conversation mock for: method={method}, url={url}") + return response -def response_simulator_create(self, method, url, **kwargs): +def response_simulator_create_with_image(self, method, url, **kwargs): if method == "GET" and url == "https://api.hubapi.com/cms/v3/blogs/posts": response = mkresponse({"total": 0, "results": []}) elif method == "POST" and url == "https://api.hubapi.com/cms/v3/blogs/posts": @@ -93,7 +99,7 @@ def test_upload_blogpost_create_from_markdown(hubspot_access_token, mocker, capl def test_upload_blogpost_update(hubspot_access_token, mocker, caplog, tmp_path): - tmpfile = tmp_path / "foo.html" + tmpfile = tmp_path / "foo.md" tmpfile.write_text("# Foobar\nFranz jagt im komplett verwahrlosten Taxi quer durch Bayern.") mocker.patch("hubspot.cms.blogs.blog_posts.rest.RESTClientObject.request", response_simulator_update) @@ -109,6 +115,35 @@ def test_upload_blogpost_update(hubspot_access_token, mocker, caplog, tmp_path): assert "Saving blog post: HubSpotBlogPost identifier=12345, name=hstw-test" in caplog.text +def test_upload_blogpost_with_image(hubspot_access_token, mocker, caplog, tmp_path): + mdfile = tmp_path / "foo.md" + pngfile = tmp_path / "images" / "bar.png" + pngfile.parent.mkdir() + mdfile.write_text("![bar](images/bar.png)") + pngfile.write_bytes(b"") + + mocker.patch("hubspot.cms.blogs.blog_posts.rest.RESTClientObject.request", response_simulator_create_with_image) + mocker.patch("hubspot.files.files.rest.RESTClientObject.request", response_simulator_upload) + upload( + source=mdfile, + name="hstw-test", + content_group_id="55844199082", + folder_path="/path/to/assets", + access_token=hubspot_access_token, + ) + + assert "Uploading file:" in caplog.text + + assert "Loading file: HubSpotFile identifier=None, name=bar.png, folder=/path/to/assets" in caplog.text + assert "Searching for 'bar.png' in folder path '/path/to/assets'" in caplog.text + assert "File does not exist: bar.png" in caplog.text + assert "Creating: HubSpotFile identifier=None, name=bar.png, folder=/path/to/assets" in caplog.text + assert "Saving file: HubSpotFile identifier=12345, name=bar.png, folder=/path/to/assets" in caplog.text + + assert "Loading blog post: HubSpotBlogPost identifier=None, name=hstw-test" in caplog.text + assert "Saving blog post: HubSpotBlogPost identifier=12345, name=hstw-test" in caplog.text + + def test_delete_by_identifier(hubspot_access_token, mocker, caplog): mocker.patch.dict(os.environ, {"CONFIRM": "yes"}) mocker.patch("hubspot.cms.blogs.blog_posts.rest.RESTClientObject.request", response_simulator_delete_id) diff --git a/tests/test_hubspot_file.py b/tests/test_hubspot_file.py index 8f1e6dc..b050551 100644 --- a/tests/test_hubspot_file.py +++ b/tests/test_hubspot_file.py @@ -5,16 +5,18 @@ from hubspot_tech_writing.core import delete_file, upload -from .test_hubspot_blogpost import mkresponse +from .util import mkresponse def response_simulator_upload(self, method, url, **kwargs): if method == "GET" and url == "https://api.hubapi.com/files/v3/files/search": response = mkresponse({"total": 0, "results": []}) elif method == "POST" and url == "https://api.hubapi.com/files/v3/files": - response = mkresponse({"id": "12345"}, status=201, reason="Created") + response = mkresponse( + {"id": "12345", "url": "https://site.example/hubfs/any.png"}, status=201, reason="Created" + ) elif method == "PUT" and url == "https://api.hubapi.com/files/v3/files/12345": - response = mkresponse({"id": "12345"}) + response = mkresponse({"id": "12345", "url": "https://site.example/hubfs/any.png"}) else: raise ValueError(f"No HTTP conversation mock for: method={method}, url={url}") return response diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..83e640b --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,80 @@ +from hubspot_tech_writing.util.common import ContentTypeResolver +from hubspot_tech_writing.util.io import open_url, path_without_scheme + + +def test_content_type_resolver_local_html(): + ctr = ContentTypeResolver("/path/to/document.html") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is False + assert ctr.is_html() is True + assert ctr.suffix == ".html" + + +def test_content_type_resolver_local_markdown(): + ctr = ContentTypeResolver("/path/to/document.md") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is True + assert ctr.is_html() is False + assert ctr.suffix == ".md" + + +def test_content_type_resolver_local_text(): + ctr = ContentTypeResolver("/path/to/document.txt") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is False + assert ctr.is_html() is False + assert ctr.suffix == ".txt" + + +def test_content_type_resolver_local_image(): + ctr = ContentTypeResolver("/path/to/document.png") + assert ctr.is_file() is True + assert ctr.is_text() is False + assert ctr.is_markup() is False + assert ctr.is_html() is False + assert ctr.suffix == ".png" + + +def test_content_type_resolver_remote_https(): + ctr = ContentTypeResolver("https://site.example/path/to/document.md") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is True + assert ctr.is_html() is False + assert ctr.suffix == ".md" + + +def test_content_type_resolver_remote_github(): + ctr = ContentTypeResolver("github://site.example/path/to/document.md") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is True + assert ctr.is_html() is False + assert ctr.suffix == ".md" + + +def test_content_type_resolver_remote_github_https(): + ctr = ContentTypeResolver("github+https://site.example/path/to/document.md") + assert ctr.is_file() is False + assert ctr.is_text() is True + assert ctr.is_markup() is True + assert ctr.is_html() is False + assert ctr.suffix == ".md" + + +def test_path_without_scheme_local(): + assert str(path_without_scheme("/path/to/document.md")) == "/path/to/document.md" + + +def test_path_without_scheme_url(): + assert str(path_without_scheme("https://site.example/path/to/document.md")) == "//site.example/path/to/document.md" + + +def test_path_from_url(markdownurl_github_https_bare, markdownurl_github_https_raw, markdownurl_github_https_blob): + reference = "github://tests/data/hubspot-blog-post-original.md" + assert str(open_url(markdownurl_github_https_bare)) == reference + assert str(open_url(markdownurl_github_https_raw)) == reference + assert str(open_url(markdownurl_github_https_blob)) == reference diff --git a/tests/util.py b/tests/util.py new file mode 100644 index 0000000..39d2a55 --- /dev/null +++ b/tests/util.py @@ -0,0 +1,9 @@ +import json + +from hubspot.cms.blogs.blog_posts.rest import RESTResponse +from urllib3 import HTTPResponse + + +def mkresponse(data, status=200, reason="OK"): + body = json.dumps(data).encode("utf-8") + return RESTResponse(HTTPResponse(body=body, status=status, reason=reason))