Skip to content

Commit d2e3275

Browse files
committed
tests, docs, compatibility
1 parent 0bd6fbf commit d2e3275

File tree

9 files changed

+565
-202
lines changed

9 files changed

+565
-202
lines changed

cloudpathlib/__init__.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from .azure.azblobclient import AzureBlobClient
66
from .azure.azblobpath import AzureBlobPath
77
from .cloudpath import CloudPath, implementation_registry
8-
from .patches import patch_open, patch_os_functions, patch_glob
8+
from .patches import patch_open, patch_os_functions, patch_glob, patch_all_builtins
99
from .gs.gsclient import GSClient
1010
from .gs.gspath import GSPath
1111
from .http.httpclient import HttpClient, HttpsClient
@@ -38,6 +38,7 @@
3838
"patch_open",
3939
"patch_glob",
4040
"patch_os_functions",
41+
"patch_all_builtins",
4142
"S3Client",
4243
"S3Path",
4344
]
@@ -53,6 +54,4 @@
5354
patch_glob()
5455

5556
if bool(os.environ.get("CLOUDPATHLIB_PATCH_ALL", "")):
56-
patch_open()
57-
patch_os_functions()
58-
patch_glob()
57+
patch_all_builtins()

cloudpathlib/http/httpclient.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,12 @@ def _get_metadata(self, cloud_path: HttpPath) -> dict:
7979
"content_type": response.headers.get("Content-Type", None),
8080
}
8181

82+
def _is_file_or_dir(self, cloud_path: HttpPath) -> Optional[str]:
    """Classify ``cloud_path`` as ``"dir"`` or ``"file"`` based on its URL.

    The client's ``dir_matcher`` is called with the path's URL: a truthy
    result yields ``"dir"``, anything else yields ``"file"``. There is no
    branch that returns ``None`` here, so every path is classified as one
    or the other.
    """
    looks_like_dir = self.dir_matcher(cloud_path.as_url())
    return "dir" if looks_like_dir else "file"
87+
8288
def _download_file(self, cloud_path: HttpPath, local_path: Union[str, os.PathLike]) -> Path:
8389
local_path = Path(local_path)
8490
with self.opener.open(cloud_path.as_url()) as response:

cloudpathlib/local/localclient.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,14 @@ def _is_file(self, cloud_path: "LocalPath", follow_symlinks=True) -> bool:
118118

119119
return self._cloud_path_to_local(cloud_path).is_file(**kwargs)
120120

121+
def _is_file_or_dir(self, cloud_path: "LocalPath") -> Optional[str]:
    """Report whether ``cloud_path`` is a directory or a file.

    The directory check runs first, followed by the file check.

    Returns:
        ``"dir"`` or ``"file"``.

    Raises:
        FileNotFoundError: If the path is neither a directory nor a file.
    """
    if self._is_dir(cloud_path):
        return "dir"
    if self._is_file(cloud_path):
        return "file"
    raise FileNotFoundError(f"Path could not be identified as file or dir: {cloud_path}")
128+
121129
def _list_dir(
122130
self, cloud_path: "LocalPath", recursive=False
123131
) -> Iterable[Tuple["LocalPath", bool]]:

cloudpathlib/patches.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -378,3 +378,25 @@ def __exit__(self, exc_type, exc_value, traceback):
378378

379379
def patch_glob():
380380
return _GlobPatch()
381+
382+
383+
class _PatchAllBuiltins:
    """Bundle the ``open``, ``os`` function and ``glob`` patches into one object.

    The three patch objects are created in ``__init__`` by calling
    ``patch_open()``, ``patch_os_functions()`` and ``patch_glob()``. The
    instance can then be used as a context manager that enters and exits all
    of them together.
    """

    def __init__(self):
        self.patch_open = patch_open()
        self.patch_os_functions = patch_os_functions()
        self.patch_glob = patch_glob()

    def __enter__(self):
        """Enter each wrapped patch and return this object.

        Returning ``self`` (instead of ``None``) means that
        ``with patch_all_builtins() as patches:`` binds something usable.
        """
        self.patch_open.__enter__()
        self.patch_os_functions.__enter__()
        self.patch_glob.__enter__()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Exit every wrapped patch, last-entered first.

        The nested ``try``/``finally`` guarantees that an exception raised
        while undoing one patch cannot leave the remaining patches installed.
        Nothing is returned, so exceptions from the ``with`` body propagate.
        """
        try:
            self.patch_glob.__exit__(exc_type, exc_value, traceback)
        finally:
            try:
                self.patch_os_functions.__exit__(exc_type, exc_value, traceback)
            finally:
                self.patch_open.__exit__(exc_type, exc_value, traceback)


def patch_all_builtins():
    """Return a new ``_PatchAllBuiltins`` wrapping the open, os and glob patches."""
    return _PatchAllBuiltins()

docs/docs/patching_builtins.ipynb

Lines changed: 252 additions & 171 deletions
Large diffs are not rendered by default.
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
#!/usr/bin/env python
2+
# coding: utf-8
3+
4+
# # Compatibility
5+
6+
# ## Patching Python builtins (third-party library compatibility)
7+
#
8+
# Not every Python library in the broad universe of Python libraries is implemented to accept pathlib-compatible objects like those implemented by cloudpathlib. Many libraries will only accept strings as filepaths. These libraries internally use `open`, functions from `os` and `os.path`, or other core library modules like `glob` to navigate paths and manipulate them.
9+
#
10+
# This means that out-of-the-box you can't just pass a `CloudPath` object to any library. For those implemented with `pathlib`, this will work. For anything else the code will throw an exception at some point.
11+
#
12+
# The long-term solution is to ask developers to implement their library to support either (1) pathlib-compatible objects for files and directories, or (2) file-like objects passed directly (e.g., so you could call `CloudPath.open` in your code and pass the file-like object to the library).
13+
#
14+
# The near-term workaround that will be compatible with some libraries is to patch the builtins to make `open`, `os`, `os.path`, and `glob` work with `CloudPath` objects. Because this overrides default Python functionality, this is not on by default. When patched, these functions will use the `CloudPath` version if they are passed a `CloudPath` and will fall back to their normal implementations otherwise.
15+
#
16+
# There are three ways to enable these patches: environment variables, globally with a function call, or just in a specific context with a context manager.
17+
#
18+
# ## Differences in reading versus writing to `CloudPath`
19+
#
20+
# A major reason to patch these builtins is if you want to write to a `CloudPath` with a third party library. For scenarios where you are reading files, you may not need to do any patching. Many python libraries support using [`__fspath__`](https://docs.python.org/3/library/os.html#os.PathLike.__fspath__) to get the location of a file on disk.
21+
#
22+
# We implement `CloudPath.__fspath__`, which will cache the file to the local disk and provide that file path as a string to any library that uses `fspath`. This works well for reading files, but not for writing them. Because there is no callback to our code once that filepath gets written to, we can't see changes and then push those changes from the cache back to the cloud (see related discussions in [#73](https://github.com/drivendataorg/cloudpathlib/issues/73), [#128](https://github.com/drivendataorg/cloudpathlib/issues/128), [#140](https://github.com/drivendataorg/cloudpathlib/pull/140)). In many scenarios our code will never get called again.
23+
#
24+
# For this reason, it is better to patch the built-in functions to handle `CloudPath` objects rather than rely on `__fspath__`, especially if you are writing to these files.
25+
#
26+
#
27+
# ## Setting with environment variables
28+
#
29+
# These methods can be enabled by setting the following environment variables:
30+
# - `CLOUDPATHLIB_PATCH_ALL=1` - patch all the builtins we implement: `open`, `os` functions, and `glob`
31+
# - `CLOUDPATHLIB_PATCH_OPEN=1` - patch the builtin `open` method
32+
# - `CLOUDPATHLIB_PATCH_OS_FUNCTIONS=1` - patch the `os` functions
33+
# - `CLOUDPATHLIB_PATCH_GLOB=1` - patch the `glob` module
34+
#
35+
# You can set environment variables in many ways, but it is common to either pass them at the command line with something like `CLOUDPATHLIB_PATCH_ALL=1 python my_script.py` or to set them in your Python script with `os.environ['CLOUDPATHLIB_PATCH_ALL'] = "1"` (the value must be a string). Note, these _must_ be set before any `cloudpathlib` methods are imported.
36+
#
37+
# ## Setting with patch methods globally
38+
#
39+
# Instead of setting environment variables, you can call methods to patch the functions. For example, you may call these at import time in your application or script. This will use the patched methods throughout your application.
40+
#
41+
# ```python
42+
# from cloudpathlib import patch_all_builtins, patch_open, patch_os_functions, patch_glob
43+
#
44+
# # patch the builtins your code or a library that you call uses
45+
# patch_open()
46+
# patch_os_functions()
47+
# patch_glob()
48+
#
49+
# # or, if you want all of these at once
50+
# patch_all_builtins()
51+
# ```
52+
#
53+
# ## Setting with a context manager
54+
#
55+
# Finally, you can control the scope in which the patch is used with a context manager. For example, you may have just one call to an external library that is failing to accept `CloudPath`. You can limit the patch effect to that call by using a context manager, which will remove the patch at the end of the block. This is useful if you want to patch the functions for a specific block of code but not for the rest of the application.
56+
#
57+
# ```python
58+
# from cloudpathlib import patch_all_builtins
59+
#
60+
# with patch_all_builtins():
61+
# with open(cloud_path) as f:
62+
# data = f.read()
63+
# ```
64+
#
65+
# This is the narrowest, most targeted way to update the builtin Python methods that don't just work with `CloudPath` objects.
66+
#
67+
# Next, we'll walk through some examples of patching and using these methods.
68+
#
69+
70+
# We can see a similar result for patching the functions in the `os` module.
71+
72+
# ## Patching `open`
73+
#
74+
# Sometimes code uses the Python built-in `open` to open files and operate on them. In those cases, passing a `CloudPath` will fail. You can patch the built-in `open` so that when a `CloudPath` is provided it uses `CloudPath.open`, otherwise defers to the original behavior.
75+
#
76+
# Here's an example that would not work unless you patch the built-ins (for example, if you depend on a third-party library that calls `open`).
77+
#
78+
# It will fail with an `OverwriteNewerLocalError` because `read_text` tries to download from the cloud to a cache path that has been updated locally (but, crucially, not rewritten back to the cloud).
79+
#
80+
81+
# Imagine that deep in a third-party library a function is implemented like this
82+
def library_function(filepath: str):
83+
with open(filepath, "w") as f:
84+
f.write("hello!")
85+
86+
87+
from cloudpathlib import CloudPath
88+
89+
# create file to read
90+
cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/new_file.txt")
91+
92+
try:
93+
library_function(cp)
94+
95+
# read the text that was written
96+
assert cp.read_text() == "hello!"
97+
except Exception as e:
98+
print(type(e))
99+
print(e)
100+
101+
102+
# ### Patching `open` in Jupyter notebooks
103+
#
104+
# Since this documentation runs as a Jupyter notebook, there is an extra step to patch `open`. Jupyter notebooks inject their own `open` into the user namespace. After enabling the patch, ensure the notebook's `open` refers to the patched built-in:
105+
#
106+
# ```python
107+
# from cloudpathlib import patch_open
108+
#
109+
# open = patch_open().patched # rebind notebook's open to the patched version
110+
# ```
111+
112+
from cloudpathlib import CloudPath, patch_open
113+
114+
# enable patch and rebind notebook's open
115+
open = patch_open().patched
116+
117+
# create file to read
118+
cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/file.txt")
119+
120+
library_function(cp)
121+
assert cp.read_text() == "hello!"
122+
print("Succeeded!")
123+
124+
125+
# ## Examples: os.path functions with CloudPath
126+
#
127+
# The snippet below demonstrates common `os.path` functions when patched to accept `CloudPath` values. These calls work for `CloudPath` and still behave normally for string paths.
128+
#
129+
130+
import os
131+
132+
from cloudpathlib import patch_os_functions, CloudPath
133+
134+
cp = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/file.txt")
135+
folder = cp.parent
136+
137+
try:
138+
print(os.path.isdir(folder))
139+
except Exception as e:
140+
print("Unpatched version fails:")
141+
print(e)
142+
143+
144+
with patch_os_functions():
145+
result = os.path.isdir(folder)
146+
print("Patched version of `os.path.isdir` returns: ", result)
147+
148+
print("basename:", os.path.basename(cp))
149+
150+
print("dirname:", os.path.dirname(cp))
151+
152+
joined = os.path.join(folder, "dir", "sub", "name.txt")
153+
print("join:", joined)
154+
155+
156+
# ## Examples: glob with CloudPath
157+
#
158+
# The snippet below demonstrates `glob.glob` and `glob.iglob` working with `CloudPath` as the pattern or `root_dir` when patched.
159+
#
160+
161+
from glob import glob
162+
163+
from cloudpathlib import patch_glob, CloudPath
164+
165+
try:
166+
glob(CloudPath("s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**"))
167+
except Exception as e:
168+
print("Unpatched version fails:")
169+
print(e)
170+
171+
172+
with patch_glob():
173+
print("Patched succeeds:")
174+
print(glob(CloudPath("s3://cloudpathlib-test-bucket/manual-tests/**/*dir*/**/*")))
175+
176+
# or equivalently
177+
print(glob("**/*dir*/**/*", root_dir=CloudPath("s3://cloudpathlib-test-bucket/manual-tests/")))
178+
179+
180+
# # Examples with third party libraries
181+
#
182+
# Here we show that third-party libraries, like Pillow, don't work as expected without patching the built-ins.
183+
#
184+
# However, if we patch built-ins, we can see the functions work as expected.
185+
186+
# ## Pillow example
187+
188+
from cloudpathlib import CloudPath, patch_all_builtins
189+
from PIL import Image
190+
191+
192+
base = CloudPath("s3://cloudpathlib-test-bucket/patching_builtins/third_party/")
193+
194+
img_path = base / "pillow_demo.png"
195+
196+
# Unpatched: using CloudPath directly fails
197+
try:
198+
Image.new("RGB", (10, 10), color=(255, 0, 0)).save(img_path)
199+
except Exception as e:
200+
print("Pillow without patch: FAILED:", e)
201+
202+
203+
# Patched: success with patching builtins
204+
with patch_all_builtins():
205+
Image.new("RGB", (10, 10), color=(255, 0, 0)).save(img_path)
206+
207+
assert img_path.read_bytes()
208+
print("With patches, Pillow successfully writes to a CloudPath")
209+
210+
211+
# ## Caveat: Some libraries still do not work
212+
#
213+
# Even with patches, some libraries will not work. For example, writing directly to a `CloudPath` with `pandas` is not possible because `pandas` has a complex set of IO checks it does in its own codebase.
214+
#
215+
# For many of these libraries (including `pandas`) using `CloudPath.open` and then passing the buffer to the functions that can read and write to those buffers is usually the cleanest workaround.
216+
#
217+
# For example, here is the best way to write to a `CloudPath` with `pandas`:
218+
219+
import pandas as pd
220+
221+
df = pd.DataFrame([[0, 1], [2, 3]], columns=['a', 'b'])
222+
223+
cloud_path = base / "data.csv"
224+
225+
try:
226+
df.to_csv(cloud_path)
227+
except Exception as e:
228+
print("Could not write with `to_csv` because error: ", e)
229+
230+
231+
# instead, use .open
232+
with cloud_path.open("w") as f:
233+
df.to_csv(f)
234+
235+
assert cloud_path.exists()
236+
print("Successfully wrote to ", cloud_path)
237+
238+

docs/mkdocs.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,10 @@ nav:
1818
- Home: "index.md"
1919
- Why cloudpathlib?: "why_cloudpathlib.ipynb"
2020
- Authentication: "authentication.md"
21+
- AnyPath: "anypath-polymorphism.md"
2122
- HTTP URLs: "http.md"
2223
- Caching: "caching.ipynb"
23-
- AnyPath: "anypath-polymorphism.md"
24+
- Compatibility: "patching_builtins.ipynb"
2425
- Other Client settings: "other_client_settings.md"
2526
- Testing code that uses cloudpathlib: "testing_mocked_cloudpathlib.ipynb"
2627
- Integrations: "integrations.md"

test-open.py

Lines changed: 0 additions & 25 deletions
This file was deleted.

0 commit comments

Comments
 (0)