Skip to content

Commit a94d751

Browse files
bogdang989tetron
authored and committed
WIP Add support for optional secondary files in v1.1.0 (#804)
* Add support for optional secondary files in v1.1.0
* Refactor to assume schema salad applies secondaryFilesDSL
* Bump schema-salad version pin
* Update embedded v1.1 schema
1 parent 6d29984 commit a94d751

File tree

11 files changed

+204
-34
lines changed

11 files changed

+204
-34
lines changed

cwltool/builder.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
11
from __future__ import absolute_import
22

33
import copy
4+
import os
45
import logging
56
from typing import (Any, Callable, Dict, List, MutableMapping, MutableSequence,
67
Optional, Set, Tuple, Union)
78

9+
from typing_extensions import Text, Type, TYPE_CHECKING # pylint: disable=unused-import
10+
# move to a regular typing import when Python 3.3-3.6 is no longer supported
11+
812
from rdflib import Graph, URIRef # pylint: disable=unused-import
913
from rdflib.namespace import OWL, RDFS
1014
from ruamel.yaml.comments import CommentedMap
1115
from schema_salad import validate
1216
from schema_salad.schema import Names, convert_to_dict
1317
from schema_salad.avro.schema import make_avsc_object, Schema
1418
from schema_salad.sourceline import SourceLine
19+
from schema_salad.ref_resolver import uri_file_path
1520
from six import iteritems, string_types
1621
from typing_extensions import (TYPE_CHECKING, # pylint: disable=unused-import
1722
Text, Type)
@@ -131,6 +136,7 @@ def __init__(self,
131136
outdir, # type: Text
132137
tmpdir, # type: Text
133138
stagedir, # type: Text
139+
cwl_version, # type: Text
134140
): # type: (...) -> None
135141

136142
self.job = job
@@ -140,6 +146,7 @@ def __init__(self,
140146
self.names = names
141147
self.requirements = requirements
142148
self.hints = hints
149+
self.cwl_version = cwl_version
143150
self.resources = resources
144151
self.mutation_manager = mutation_manager
145152
self.formatgraph = formatgraph
@@ -284,26 +291,39 @@ def _capture_files(f):
284291
if "secondaryFiles" not in datum:
285292
datum["secondaryFiles"] = []
286293
for sf in aslist(schema["secondaryFiles"]):
287-
if isinstance(sf, MutableMapping) or "$(" in sf or "${" in sf:
288-
sfpath = self.do_eval(sf, context=datum)
294+
sf_required = True
295+
if isinstance(sf, MutableMapping) and "pattern" in sf and self.cwl_version in ['v1.1.0-dev1']:
296+
if 'required' in sf:
297+
sf_required = self.do_eval(sf['required'], context=datum)
298+
elif isinstance(sf, string_types):
299+
sf = {"pattern": sf}
289300
else:
290-
sfpath = substitute(datum["basename"], sf)
301+
raise validate.ValidationException("Not a secondary file definition: %s" % sf)
302+
303+
if "$(" in sf["pattern"] or "${" in sf["pattern"]:
304+
sfpath = self.do_eval(sf["pattern"], context=datum)
305+
else:
306+
sfpath = substitute(datum["basename"], sf["pattern"])
307+
291308
for sfname in aslist(sfpath):
309+
if not sfname:
310+
continue
292311
found = False
293312
for d in datum["secondaryFiles"]:
294313
if not d.get("basename"):
295314
d["basename"] = d["location"][d["location"].rindex("/")+1:]
296315
if d["basename"] == sfname:
297316
found = True
298317
if not found:
318+
sf_location = datum["location"][0:datum["location"].rindex("/")+1]+sfname
299319
if isinstance(sfname, MutableMapping):
300320
datum["secondaryFiles"].append(sfname)
301-
elif discover_secondaryFiles:
321+
elif discover_secondaryFiles and os.path.exists(uri_file_path(sf_location)):
302322
datum["secondaryFiles"].append({
303-
"location": datum["location"][0:datum["location"].rindex("/")+1]+sfname,
323+
"location": sf_location,
304324
"basename": sfname,
305325
"class": "File"})
306-
else:
326+
elif sf_required:
307327
raise WorkflowException("Missing required secondary file '%s' from file object: %s" % (
308328
sfname, json_dumps(datum, indent=4)))
309329

cwltool/command_line_tool.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
from typing import (Any, Callable, Dict, Generator, List, Mapping, MutableMapping,
1616
MutableSequence, Optional, Set, Union, cast)
1717

18+
from typing_extensions import Text, Type, TYPE_CHECKING # pylint: disable=unused-import
19+
# move to a regular typing import when Python 3.3-3.6 is no longer supported
20+
1821
import shellescape
1922
from schema_salad import validate
2023
from schema_salad.avro.schema import Schema
@@ -759,18 +762,37 @@ def collect_output(self,
759762
primary.setdefault("secondaryFiles", [])
760763
pathprefix = primary["path"][0:primary["path"].rindex("/")+1]
761764
for sf in aslist(schema["secondaryFiles"]):
765+
if isinstance(sf, MutableMapping) and 'pattern' in sf:
766+
if 'required' in sf:
767+
sf_required = sf['required']
768+
else:
769+
sf_required = False
770+
sf = sf['pattern']
771+
else:
772+
sf_required = False
773+
762774
if isinstance(sf, MutableMapping) or "$(" in sf or "${" in sf:
763775
sfpath = builder.do_eval(sf, context=primary)
764776
subst = False
765777
else:
778+
if sf.endswith('?') and \
779+
self.metadata['cwlVersion'] in ['v1.1.0-dev1']:
780+
sf_required = False
781+
sf = sf[:-1]
766782
sfpath = sf
767783
subst = True
768784
for sfitem in aslist(sfpath):
785+
if not sfitem:
786+
continue
769787
if isinstance(sfitem, string_types):
770788
if subst:
771789
sfitem = {"path": substitute(primary["path"], sfitem)}
772790
else:
773791
sfitem = {"path": pathprefix+sfitem}
792+
if not os.path.exists(sfitem['path']) and sf_required:
793+
raise WorkflowException(
794+
"Missing required secondary file '%s'" % (
795+
sfitem["path"]))
774796
if "path" in sfitem and "location" not in sfitem:
775797
revmap(sfitem)
776798
if fs_access.isfile(sfitem["location"]):

cwltool/process.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,7 @@ def inc(d): # type: (List[int]) -> None
673673
or tempfile.mkdtemp())
674674
stagedir = fs_access.realpath(runtime_context.stagedir
675675
or tempfile.mkdtemp())
676+
cwl_version = self.metadata["cwlVersion"]
676677

677678
builder = Builder(job,
678679
files,
@@ -694,7 +695,8 @@ def inc(d): # type: (List[int]) -> None
694695
load_listing,
695696
outdir,
696697
tmpdir,
697-
stagedir)
698+
stagedir,
699+
cwl_version)
698700

699701
bindings.extend(builder.bind_input(
700702
self.inputs_record_schema, job,

cwltool/schemas/v1.1.0-dev1/CommandLineTool.yml

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ $graph:
5050
5151
## Changelog
5252
53+
* Clarify documentation around `valueFrom` and `null` inputs.
5354
* Default values for some fields are now expressed in the schema.
5455
* When defining record types with `CommandInputRecordSchema`, fields of
5556
type `File` may now include `format`, `loadContents`,
@@ -69,9 +70,12 @@ $graph:
6970
* Clarify semantics of `shellQuote`
7071
* Expressions are now allowed to evaluate to `null` or `Dirent` in
7172
[InitialWorkDirRequirement.listing](#InitialWorkDirRequirement).
73+
* Clarify behavior of secondaryFiles on output.
7274
* [Addition](#Requirements_and_hints) of `cwl:requirements` field to
73-
input object documents
75+
input object documents.
7476
* Clarify behavior of `glob` for absolute paths and symlinks.
77+
* Clarify behavior of `glob` to include directories.
78+
* `secondaryFiles` can now be explicitly marked as `required` or not.
7579
7680
See also the [CWL Workflow Description, v1.1.0-dev1 changelog](Workflow.html#Changelog).
7781
@@ -202,6 +206,9 @@ $graph:
202206
`InputParameter.default` field) must be applied before evaluating the
203207
expression.
204208
209+
If the value of the associated input parameter is `null`, `valueFrom` is
210+
not evaluated and nothing is added to the command line.
211+
205212
When a binding is part of the `CommandLineTool.arguments` field,
206213
the `valueFrom` field is required.
207214
@@ -240,12 +247,12 @@ $graph:
240247
- type: array
241248
items: string
242249
doc: |
243-
Find files relative to the output directory, using POSIX glob(3)
244-
pathname matching. If an array is provided, find files that match any
245-
pattern in the array. If an expression is provided, the expression must
246-
return a string or an array of strings, which will then be evaluated as
247-
one or more glob patterns. Must only match and return files which
248-
actually exist.
250+
Find files or directories relative to the output directory, using POSIX
251+
glob(3) pathname matching. If an array is provided, find files or
252+
directories that match any pattern in the array. If an expression is
253+
provided, the expression must return a string or an array of strings,
254+
which will then be evaluated as one or more glob patterns. Must only
255+
match and return files/directories which actually exist.
249256
250257
If the value of glob is a relative path pattern (does not
251258
begin with a slash '/') then it is resolved relative to the
@@ -259,14 +266,14 @@ $graph:
259266
260267
A glob may match a path within the output directory which is
261268
actually a symlink to another file. In this case, the
262-
expected behavior is for the resulting File object to take the
269+
expected behavior is for the resulting File/Directory object to take the
263270
`basename` (and corresponding `nameroot` and `nameext`) of the
264-
symlink. The `location` of the File is implementation
265-
dependent, but logically the File should have the same content
266-
as the symlink target. Platforms may stage output files to
271+
symlink. The `location` of the File/Directory is implementation
272+
dependent, but logically the File/Directory should have the same content
273+
as the symlink target. Platforms may stage output files/directories to
267274
cloud storage that lack the concept of a symlink. In
268-
this case file content may be duplicated, or (to avoid
269-
duplication) the File `location` may refer to the symlink
275+
this case file content and directories may be duplicated, or (to avoid
276+
duplication) the File/Directory `location` may refer to the symlink
270277
target.
271278
272279
It is an error if a symlink in the output directory (or any

cwltool/schemas/v1.1.0-dev1/Process.yml

Lines changed: 71 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,9 @@ $graph:
237237
- "null"
238238
- type: array
239239
items: [File, Directory]
240-
jsonldPredicate: "cwl:secondaryFiles"
240+
jsonldPredicate:
241+
_id: "cwl:secondaryFiles"
242+
secondaryFilesDSL: true
241243
doc: |
242244
A list of additional files or directories that are associated with the
243245
primary file and must be transferred alongside the primary file.
@@ -458,18 +460,25 @@ $graph:
458460
secondaryFiles:
459461
type:
460462
- "null"
461-
- string
462-
- Expression
463+
- SecondaryFileSchema
463464
- type: array
464-
items: [string, Expression]
465-
jsonldPredicate: "cwl:secondaryFiles"
465+
items: SecondaryFileSchema
466+
jsonldPredicate:
467+
_id: "cwl:secondaryFiles"
468+
secondaryFilesDSL: true
466469
doc: |
467470
Only valid when `type: File` or is an array of `items: File`.
468471
469-
Provides a pattern or expression specifying files or directories that
470-
must be included alongside the primary file. All listed secondary
471-
files must be present. An implementation may fail workflow execution
472-
if an expected secondary file does not exist.
472+
Provides a pattern or expression specifying files or
473+
directories that should be included alongside the primary
474+
file. Secondary files may be required or optional. When not
475+
explicitly specified, secondary files specified for `inputs`
476+
are required and `outputs` are optional. An implementation
477+
must include matching Files and Directories in the
478+
`secondaryFiles` property of the primary file. These Files
479+
and Directories must be transferred and staged alongside the
480+
primary file. An implementation may fail workflow execution
481+
if a required secondary file does not exist.
473482
474483
If the value is an expression, the value of `self` in the expression
475484
must be the primary input or output File object to which this binding
@@ -480,6 +489,8 @@ $graph:
480489
`path` or `location` and `basename` fields set, or an array consisting
481490
of strings or File or Directory objects. It is legal to reference an
482491
unchanged File or Directory object taken from input as a secondaryFile.
492+
The expression may return "null" in which case there is no secondaryFile
493+
from that expression.
483494
484495
To work on non-filename-preserving storage systems, portable tool
485496
descriptions should avoid constructing new values from `location`, but
@@ -490,11 +501,13 @@ $graph:
490501
it specifies that the following pattern should be applied to the path
491502
of the primary file to yield a filename relative to the primary File:
492503
493-
1. If string begins with one or more caret `^` characters, for each
504+
1. If the string ends with a `?` character, remove the last `?` and mark
505+
the resulting secondary file as optional.
506+
2. If the string begins with one or more caret `^` characters, for each
494507
caret, remove the last file extension from the path (the last
495508
period `.` and all following characters). If there are no file
496509
extensions, the path is unchanged.
497-
2. Append the remainder of the string to the end of the file path.
510+
3. Append the remainder of the string to the end of the file path.
498511
499512
streamable:
500513
type: boolean?
@@ -850,3 +863,50 @@ $graph:
850863
type: array
851864
items: CommandInputSchema
852865
doc: The list of type definitions.
866+
867+
- name: SecondaryFileSchema
868+
type: record
869+
fields:
870+
- name: pattern
871+
type:
872+
- string
873+
- Expression
874+
doc: |
875+
Provides a pattern or expression specifying files or directories that
876+
should be included alongside the primary file.
877+
878+
If the value is an expression, the value of `self` in the expression
879+
must be the primary input or output File object to which this binding
880+
applies. The `basename`, `nameroot` and `nameext` fields must be
881+
present in `self`. For `CommandLineTool` outputs the `path` field must
882+
also be present. The expression must return a filename string relative
883+
to the path to the primary File, a File or Directory object with either
884+
`path` or `location` and `basename` fields set, or an array consisting
885+
of strings or File or Directory objects. It is legal to reference an
886+
unchanged File or Directory object taken from input as a secondaryFile.
887+
The expression may return "null" in which case there is no secondaryFile
888+
from that expression.
889+
890+
To work on non-filename-preserving storage systems, portable tool
891+
descriptions should avoid constructing new values from `location`, but
892+
should construct relative references using `basename` or `nameroot`
893+
instead.
894+
895+
If a value in `secondaryFiles` is a string that is not an expression,
896+
it specifies that the following pattern should be applied to the path
897+
of the primary file to yield a filename relative to the primary File:
898+
899+
1. If the string begins with one or more caret `^` characters, for each
900+
caret, remove the last file extension from the path (the last
901+
period `.` and all following characters). If there are no file
902+
extensions, the path is unchanged.
903+
2. If the string ends with a `?` character, remove the last `?` and mark
904+
the resulting secondary file as optional.
905+
3. Append the remainder of the string to the end of the file path.
906+
- name: required
907+
type: ["null", boolean, string, Expression]
908+
doc: |
909+
An implementation must not fail workflow execution if `required` is
910+
set to `false` and the expected secondary file does not exist.
911+
The default value of the `required` field is `true` for secondary files on
912+
input and `false` for secondary files on output.

cwltool/schemas/v1.1.0-dev1/Workflow.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ $graph:
6868
* `WorkflowStepInput` now has a `label` field
6969
* [Addition](#Requirements_and_hints) of `cwl:requirements` field to
7070
input object documents
71+
* `secondaryFiles` can now be explicitly marked as `required` or not.
7172
7273
See also the [CWL Command Line Tool Description, v1.1.0-dev1 changelog](CommandLineTool.html#Changelog).
7374
@@ -327,7 +328,7 @@ $graph:
327328
1. `null` if there is no `source` field
328329
2. the value of the parameter(s) specified in the `source` field when this
329330
workflow input parameter **is not** specified in this workflow step's `scatter` field.
330-
3. an element of the parameter specified in the `source` field when this workflow input
331+
3. an element of the parameter specified in the `source` field when this workflow input
331332
parameter **is** specified in this workflow step's `scatter` field.
332333
333334
The value of `inputs` in the parameter reference or expression must be

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ requests>=2.4.3
22
ruamel.yaml>=0.12.4,<=0.15.77
33
rdflib>=4.2.2,<4.3
44
shellescape>=3.4.1,<3.5
5-
schema-salad>=3.0,<3.1
5+
schema-salad>=3.1,<4
66
typing>=3.5.3; python_version<"3.6"
77
pathlib2==2.3.2; python_version<"3"
88
prov==1.5.1

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@
5353
'ruamel.yaml >= 0.12.4, <= 0.15.77',
5454
'rdflib >= 4.2.2, < 4.3.0',
5555
'shellescape >= 3.4.1, < 3.5',
56-
'schema-salad >= 3.0, < 3.1',
56+
'schema-salad >= 3.1, < 4',
5757
'mypy-extensions',
5858
'six >= 1.9.0', # >= 1.9.0 required by prov
5959
'psutil',

tests/secondary-files-job.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
fasta_path:
2+
class: File
3+
location: 2.fasta

0 commit comments

Comments
 (0)