Skip to content

Commit fff0d10

Browse files
committed
Use pathlibfs for scheme-agnostic file access
1 parent 99153f1 commit fff0d10

File tree

7 files changed

+80
-17
lines changed

7 files changed

+80
-17
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33

44
## Unreleased
5+
- Use `pathlibfs` for scheme-agnostic file access
56

67
## 2023-10-07 0.1.0
78
- Add example data files in different formats

hubspot_tech_writing/core.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import logging
33
import typing as t
44
import warnings
5-
from pathlib import Path
5+
from pathlibfs import Path
66
from tempfile import NamedTemporaryFile
77

88
import hubspot
@@ -14,7 +14,7 @@
1414
from hubspot_tech_writing.hubspot_api import HubSpotAdapter, HubSpotBlogPost, HubSpotFile
1515
from hubspot_tech_writing.util.common import ContentTypeResolver
1616
from hubspot_tech_writing.util.html import HTMLImageTranslator
17-
from hubspot_tech_writing.util.io import to_io
17+
from hubspot_tech_writing.util.io import to_io, path_from_url
1818

1919
logger = logging.getLogger(__name__)
2020

@@ -75,9 +75,13 @@ def upload(
7575
folder_id: t.Optional[str] = None,
7676
folder_path: t.Optional[str] = None,
7777
):
78-
source_path = Path(source)
78+
source_path = source
79+
if isinstance(source_path, str):
80+
source_path = path_from_url(source)
81+
logger.info(f"Source: {source}")
82+
logger.info(f"Source path: {source_path} {type(source_path)}")
7983

80-
ctr = ContentTypeResolver(name=source_path)
84+
ctr = ContentTypeResolver(filepath=source_path)
8185

8286
logger.info(f"Uploading file: {source}")
8387
hsa = HubSpotAdapter(access_token=access_token)
@@ -100,6 +104,7 @@ def upload(
100104
upload, access_token=access_token, folder_id=folder_id, folder_path=folder_path
101105
)
102106
hit = HTMLImageTranslator(html=html, source_path=source_path, uploader=uploader)
107+
logger.info(hit)
103108
hit.discover().process()
104109
html = hit.html_out
105110

hubspot_tech_writing/hubspot_api.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import typing as t
55
from copy import deepcopy
66
from pathlib import Path
7+
from tempfile import NamedTemporaryFile
78

89
import hubspot
910
from click import confirm
@@ -134,8 +135,10 @@ def save_file(self, file_id: str, source: str):
134135
"""
135136
Save / overwrite existing file.
136137
"""
138+
tmpfile = NamedTemporaryFile()
139+
tmpfile.write(source.read_bytes())
137140
return self.hs.files.files.files_api.replace(
138-
file_id=file_id, file=source, options=json.dumps(self.FILE_OPTIONS)
141+
file_id=file_id, file=tmpfile.name, options=json.dumps(self.FILE_OPTIONS)
139142
)
140143

141144
def delete_file_by_id(self, identifier: str) -> t.Optional[File]:
@@ -286,7 +289,7 @@ def __init__(
286289
def __str__(self):
287290
return (
288291
f"{self.__class__.__name__} identifier={self.identifier}, "
289-
f"name={self.name}, folder={self.folder_id or self.folder_path}"
292+
f"name={self.name}, folder={self.folder_id or self.folder_path}, source={self.source}"
290293
)
291294

292295
def load(self):
@@ -310,7 +313,7 @@ def save(self):
310313
if not self.source:
311314
raise ValueError(f"Unable to save file without source: {self}")
312315
logger.info(f"Saving file: {self}")
313-
return self.hsa.save_file(file_id=self.identifier, source=str(self.source))
316+
return self.hsa.save_file(file_id=self.identifier, source=self.source)
314317

315318
def delete(self):
316319
"""

hubspot_tech_writing/util/common.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import logging
22
import typing as t
3-
from pathlib import Path
3+
from pathlibfs import Path
44

55
import colorlog
66
from colorlog.escape_codes import escape_codes
7+
from yarl import URL
78

89

910
def setup_logging(level=logging.INFO, verbose: bool = False):
@@ -23,9 +24,10 @@ class ContentTypeResolver:
2324
HTML_SUFFIXES = [".html", ".html5", ".htm"]
2425
TEXT_SUFFIXES = MARKUP_SUFFIXES + HTML_SUFFIXES + [".txt"]
2526

26-
def __init__(self, name: t.Union[str, Path]):
27-
self.name = name
28-
self.suffix = Path(name).suffix
27+
def __init__(self, filepath: t.Union[str, Path]):
28+
self.url = URL(str(filepath)).with_scheme("")
29+
self.path = Path(str(self.url))
30+
self.suffix = self.path.suffix
2931

3032
def is_markup(self):
3133
return self.suffix in self.MARKUP_SUFFIXES
@@ -38,3 +40,8 @@ def is_text(self):
3840

3941
def is_file(self):
4042
return not self.is_text()
43+
44+
45+
def url_to_path(filepath: str):
    """
    Drop the scheme from a URL-like string and wrap the remainder in a `Path`.

    The input is coerced with `str()` first, so path-like objects are accepted too.
    """
    schemeless = URL(str(filepath)).with_scheme("")
    return Path(str(schemeless))

hubspot_tech_writing/util/html.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import logging
33
import typing as t
44
from copy import deepcopy
5-
from pathlib import Path
5+
from pathlibfs import Path
66

77
from bs4 import BeautifulSoup
88

9+
from hubspot_tech_writing.util.common import url_to_path
10+
911
logger = logging.getLogger(__name__)
1012

1113

@@ -32,7 +34,7 @@ def __init__(self, html: str, source_path: t.Union[str, Path], uploader: t.Optio
3234

3335
def __str__(self):
3436
return (
35-
f"HTMLImageTranslator:\nin: {self.images_in}\nlocal: {self.images_local}\nremote: {self.images_remote}"
37+
f"HTMLImageTranslator:\nsource path: {self.source_path}\nin: {self.images_in}\nlocal: {self.images_local}\nremote: {self.images_remote}"
3638
)
3739

3840
def discover(self):
@@ -60,8 +62,9 @@ def resolve(self) -> "HTMLImageTranslator":
6062
Process discovered image elements, computing effective paths.
6163
"""
6264
if self.source_path is None:
65+
logger.warning("No resolving without source path")
6366
return self
64-
parent_path = Path(self.source_path)
67+
parent_path = self.source_path
6568
if parent_path.is_file():
6669
parent_path = parent_path.parent
6770
self.images_local = []
@@ -74,7 +77,7 @@ def resolve(self) -> "HTMLImageTranslator":
7477

7578
# Relative paths are relative to the original document.
7679
else:
77-
image_new.src = str(Path(parent_path) / image.src)
80+
image_new.src = parent_path / image.src
7881
self.images_local.append(image_new)
7982
return self
8083

@@ -86,7 +89,7 @@ def upload(self) -> "HTMLImageTranslator":
8689
logger.warning("No upload without uploader")
8790
return self
8891
for image_local in self.images_local:
89-
hs_file = self.uploader(source=image_local.src, name=Path(image_local.src).name)
92+
hs_file = self.uploader(source=image_local.src, name=image_local.src.name)
9093
image_url = hs_file.url
9194
image_remote: HTMLImage = deepcopy(image_local)
9295
image_remote.src = image_url

hubspot_tech_writing/util/io.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,64 @@
11
import contextlib
22
import io
33
import typing as t
4-
from pathlib import Path
4+
from pathlibfs import Path
55

66
import requests
7+
from yarl import URL
78

89

910
@contextlib.contextmanager
def to_io(source: t.Union[str, Path, t.IO]) -> t.Generator[t.IO, None, None]:
    """
    Context manager yielding a readable stream for `source`.

    - A `Path` is opened directly in text mode.
    - A string is converted with `path_from_url` and then opened in text mode.
    - Anything else is assumed to be an already-open stream and is yielded as-is.

    The stream is closed when the context exits, also when the body raises.
    Note that this includes streams handed in by the caller.
    """
    fp: t.IO
    if isinstance(source, Path):
        # Open the given object directly. Converting it to a string and parsing
        # it again would rebuild the path without the keyword arguments
        # (username, token, ...) that `path_from_url` attached to it.
        fp = source.open(mode="rt")
    elif isinstance(source, str):
        fp = path_from_url(source).open(mode="rt")
    else:
        fp = source
    try:
        yield fp
    finally:
        # Release the handle even if the consumer fails halfway through.
        fp.close()
28+
29+
30+
def path_from_url(url: str) -> Path:
    """
    Convert a URL into a pathlibfs / fsspec `Path`.

    URLs whose scheme starts with `github+https` are translated into a
    `github://` path. Credentials, organization and repository are taken
    from the URL and passed on as keyword arguments. Any other URL is
    handed to `Path` unchanged.

    Input URLs
    ----------
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/path/to/document.md
    github+https://foobar:ghp_lalala@github.com/acme/sweet-camino/blob/main/path/to/document.md

    Output Path
    -----------
    fs = Path("github://path/to/document.md", username="foobar", token="ghp_lalala", org="acme", repo="sweet-camino")

    Raises
    ------
    ValueError
        When a `github+https` URL does not name both an organization and a repository.
    """
    uri = URL(url)

    if not uri.scheme.startswith("github+https"):
        return Path(url)

    # Drop the empty fragment in front of the leading slash.
    fragments = uri.path.split("/")[1:]
    if len(fragments) < 2 or not fragments[0] or not fragments[1]:
        raise ValueError(f"GitHub URL needs organization and repository: {url}")

    org, repo, *resource = fragments

    # Web UI links carry a `blob/<ref>/` prefix in front of the file path.
    # Both segments are skipped, i.e. the ref itself is not forwarded.
    if resource[:1] == ["blob"]:
        resource = resource[2:]

    return Path(
        "github://" + "/".join(resource),
        username=uri.user,
        token=uri.password,
        org=org,
        repo=repo,
    )

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,9 @@ dependencies = [
7070
"hubspot-api-client<9",
7171
"markdown<4",
7272
"mkdocs-linkcheck<2",
73+
"pathlibfs<0.6",
7374
"requests<3",
75+
"yarl<2",
7476
]
7577

7678
[project.optional-dependencies]

0 commit comments

Comments
 (0)