Skip to content

Commit 8121161

Browse files
Added file_content utilities to cwl_utils (#165)
Reading file content and enforcing the maximum byte limit are utility functions that can be useful for any CWL parser, not only cwltool. Therefore, they have been migrated into the `cwl_utils` codebase.
1 parent e315943 commit 8121161

File tree

4 files changed

+133
-3
lines changed

4 files changed

+133
-3
lines changed

cwl_utils/parser/cwl_v1_0_utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
import hashlib
3-
from typing import Any, List, Optional, Union
3+
from typing import Any, IO, List, Optional, Union
44

55
from ruamel import yaml
66
from schema_salad.exceptions import ValidationException
@@ -10,6 +10,27 @@
1010
from cwl_utils.errors import WorkflowException
1111

1212

13+
# Maximum number of bytes (64 KiB) read from a file by the helpers below.
CONTENT_LIMIT: int = 64 * 1024


def content_limit_respected_read_bytes(f: IO[bytes]) -> bytes:
    """
    Read file content up to 64 kB as a byte array.

    Truncate content for larger files.

    :param f: binary stream to read from, starting at its current position
    :return: at most ``CONTENT_LIMIT`` bytes read from ``f``
    """
    return f.read(CONTENT_LIMIT)


def content_limit_respected_read(f: IO[bytes]) -> str:
    """
    Read file content up to 64 kB as an utf-8 encoded string.

    Truncate content for larger files. When the truncation point falls in the
    middle of a multi-byte UTF-8 character, the incomplete trailing character
    is dropped instead of failing the whole read.

    :param f: binary stream to read from, starting at its current position
    :return: the decoded text of at most ``CONTENT_LIMIT`` bytes of ``f``
    :raises UnicodeDecodeError: if the content is not valid UTF-8
    """
    import codecs  # local import: only needed by this function

    raw = content_limit_respected_read_bytes(f)
    # A read that fills the whole limit may have been cut in the middle of a
    # multi-byte character. In that case decode in non-final mode so that an
    # incomplete trailing sequence is left out; invalid bytes anywhere else
    # still raise. Shorter reads hold the entire content and are decoded
    # strictly, exactly like ``bytes.decode("utf-8")``.
    possibly_truncated = len(raw) >= CONTENT_LIMIT
    decoder = codecs.getincrementaldecoder("utf-8")()
    return decoder.decode(raw, final=not possibly_truncated)
32+
33+
1334
def convert_stdstreams_to_files(clt: cwl.CommandLineTool) -> None:
1435
for out in clt.outputs:
1536
if out.type == 'stdout':

cwl_utils/parser/cwl_v1_1_utils.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
import hashlib
3-
from typing import Any, List, Optional, Union, cast
3+
from typing import Any, IO, List, Optional, Union, cast
44

55
from ruamel import yaml
66
from schema_salad.exceptions import ValidationException
@@ -10,6 +10,27 @@
1010
from cwl_utils.errors import WorkflowException
1111

1212

13+
# Maximum number of bytes (64 KiB) read from a file by the helpers below.
CONTENT_LIMIT: int = 64 * 1024


def content_limit_respected_read_bytes(f: IO[bytes]) -> bytes:
    """
    Read file content up to 64 kB as a byte array.

    Truncate content for larger files.

    :param f: binary stream to read from, starting at its current position
    :return: at most ``CONTENT_LIMIT`` bytes read from ``f``
    """
    return f.read(CONTENT_LIMIT)


def content_limit_respected_read(f: IO[bytes]) -> str:
    """
    Read file content up to 64 kB as an utf-8 encoded string.

    Truncate content for larger files. When the truncation point falls in the
    middle of a multi-byte UTF-8 character, the incomplete trailing character
    is dropped instead of failing the whole read.

    :param f: binary stream to read from, starting at its current position
    :return: the decoded text of at most ``CONTENT_LIMIT`` bytes of ``f``
    :raises UnicodeDecodeError: if the content is not valid UTF-8
    """
    import codecs  # local import: only needed by this function

    raw = content_limit_respected_read_bytes(f)
    # A read that fills the whole limit may have been cut in the middle of a
    # multi-byte character. In that case decode in non-final mode so that an
    # incomplete trailing sequence is left out; invalid bytes anywhere else
    # still raise. Shorter reads hold the entire content and are decoded
    # strictly, exactly like ``bytes.decode("utf-8")``.
    possibly_truncated = len(raw) >= CONTENT_LIMIT
    decoder = codecs.getincrementaldecoder("utf-8")()
    return decoder.decode(raw, final=not possibly_truncated)
32+
33+
1334
def convert_stdstreams_to_files(clt: cwl.CommandLineTool) -> None:
1435
for out in clt.outputs:
1536
if out.type == 'stdout':

cwl_utils/parser/cwl_v1_2_utils.py

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
import hashlib
3-
from typing import Any, List, Optional, Union, cast
3+
from typing import Any, IO, List, Optional, Union, cast
44

55
from ruamel import yaml
66
from schema_salad.exceptions import ValidationException
@@ -10,6 +10,32 @@
1010
from cwl_utils.errors import WorkflowException
1111

1212

13+
# Maximum number of bytes (64 KiB) accepted by the content_limit_respected_read* helpers.
CONTENT_LIMIT: int = 64 * 1024
14+
15+
16+
def content_limit_respected_read_bytes(f: IO[bytes]) -> bytes:
    """
    Return the content of a file of at most 64 kB as a byte array.

    Larger files are rejected with an exception
    (see https://www.commonwl.org/v1.2/Workflow.html#Changelog).

    :param f: binary stream to read from, starting at its current position
    :return: the bytes read from ``f``, never more than ``CONTENT_LIMIT``
    :raises WorkflowException: if more than ``CONTENT_LIMIT`` bytes are available
    """
    # Request one byte beyond the limit so that an oversized file is detectable.
    data = f.read(CONTENT_LIMIT + 1)
    if len(data) <= CONTENT_LIMIT:
        return data
    raise WorkflowException(
        "file is too large, loadContents limited to %d bytes" % CONTENT_LIMIT
    )
28+
29+
30+
def content_limit_respected_read(f: IO[bytes]) -> str:
    """
    Return the content of a file of at most 64 kB as a string decoded from utf-8.

    Larger files are rejected with an exception
    (see https://www.commonwl.org/v1.2/Workflow.html#Changelog).

    :param f: binary stream to read from, starting at its current position
    :return: the decoded text read from ``f``
    """
    raw = content_limit_respected_read_bytes(f)
    return raw.decode("utf-8")
37+
38+
1339
def convert_stdstreams_to_files(clt: cwl.CommandLineTool) -> None:
1440
for out in clt.outputs:
1541
if out.type == 'stdout':

tests/test_parser_utils.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# SPDX-License-Identifier: Apache-2.0
22
"""Test the CWL parsers utility functions."""
3+
import tempfile
34
from pathlib import Path
45

56
from pytest import raises
@@ -11,11 +12,32 @@
1112
import cwl_utils.parser.cwl_v1_1_utils
1213
import cwl_utils.parser.cwl_v1_2
1314
import cwl_utils.parser.cwl_v1_2_utils
15+
from cwl_utils.errors import WorkflowException
1416
from cwl_utils.parser import load_document_by_uri
1517

1618
HERE = Path(__file__).resolve().parent
1719

1820

21+
def test_v1_0_file_content_64_kB() -> None:
    """Check that CWL v1.0 returns a file of exactly 64kB in full."""
    utils = cwl_utils.parser.cwl_v1_0_utils
    expected = "a" * utils.CONTENT_LIMIT
    with tempfile.TemporaryFile() as handle:
        handle.write(expected.encode("utf-8"))
        # Rewind so the read starts at the beginning of the file.
        handle.seek(0)
        observed = utils.content_limit_respected_read(handle)
    assert observed == expected
29+
30+
31+
def test_v1_0_file_content_larger_than_64_kB() -> None:
    """Check that CWL v1.0 truncates the content of files larger than 64kB."""
    utils = cwl_utils.parser.cwl_v1_0_utils
    limit = utils.CONTENT_LIMIT
    oversized = "a" * (limit + 1)
    with tempfile.TemporaryFile() as handle:
        handle.write(oversized.encode("utf-8"))
        # Rewind so the read starts at the beginning of the file.
        handle.seek(0)
        observed = utils.content_limit_respected_read(handle)
    assert observed == oversized[:limit]
39+
40+
1941
def test_v1_0_stdout_to_file() -> None:
2042
"""Test that stdout shortcut is converted to stdout parameter with CWL v1.0."""
2143
clt = cwl_utils.parser.cwl_v1_0.CommandLineTool(
@@ -130,6 +152,26 @@ def test_v1_0_type_for_source_with_id() -> None:
130152
assert source_type == "File"
131153

132154

155+
def test_v1_1_file_content_64_kB() -> None:
    """Check that CWL v1.1 returns a file of exactly 64kB in full."""
    utils = cwl_utils.parser.cwl_v1_1_utils
    expected = "a" * utils.CONTENT_LIMIT
    with tempfile.TemporaryFile() as handle:
        handle.write(expected.encode("utf-8"))
        # Rewind so the read starts at the beginning of the file.
        handle.seek(0)
        observed = utils.content_limit_respected_read(handle)
    assert observed == expected
163+
164+
165+
def test_v1_1_file_content_larger_than_64_kB() -> None:
    """Check that CWL v1.1 truncates the content of files larger than 64kB."""
    utils = cwl_utils.parser.cwl_v1_1_utils
    limit = utils.CONTENT_LIMIT
    oversized = "a" * (limit + 1)
    with tempfile.TemporaryFile() as handle:
        handle.write(oversized.encode("utf-8"))
        # Rewind so the read starts at the beginning of the file.
        handle.seek(0)
        observed = utils.content_limit_respected_read(handle)
    assert observed == oversized[:limit]
173+
174+
133175
def test_v1_1_stdout_to_file() -> None:
134176
"""Test that stdout shortcut is converted to stdout parameter with CWL v1.1."""
135177
clt = cwl_utils.parser.cwl_v1_1.CommandLineTool(
@@ -287,6 +329,26 @@ def test_v1_1_type_for_source_with_id() -> None:
287329
assert source_type == "File"
288330

289331

332+
def test_v1_2_file_content_64_kB() -> None:
    """Check that CWL v1.2 returns a file of exactly 64kB in full."""
    utils = cwl_utils.parser.cwl_v1_2_utils
    expected = "a" * utils.CONTENT_LIMIT
    with tempfile.TemporaryFile() as handle:
        handle.write(expected.encode("utf-8"))
        # Rewind so the read starts at the beginning of the file.
        handle.seek(0)
        observed = utils.content_limit_respected_read(handle)
    assert observed == expected
340+
341+
342+
def test_v1_2_file_content_larger_than_64_kB() -> None:
    """Test that reading file content fails for files larger than 64kB in CWL v1.2."""
    text = "a" * (cwl_utils.parser.cwl_v1_2_utils.CONTENT_LIMIT + 1)
    with tempfile.TemporaryFile() as f:
        f.write(text.encode("utf-8"))
        f.seek(0)
        # Keep only the call under test inside ``raises`` so that an exception
        # raised while preparing the fixture cannot make the test pass.
        with raises(WorkflowException):
            cwl_utils.parser.cwl_v1_2_utils.content_limit_respected_read(f)
350+
351+
290352
def test_v1_2_stdout_to_file() -> None:
291353
"""Test that stdout shortcut is converted to stdout parameter with CWL v1.2."""
292354
clt = cwl_utils.parser.cwl_v1_2.CommandLineTool(

0 commit comments

Comments
 (0)