Skip to content

Commit 000b7f2

Browse files
committed
Use git sparse-checkout
1 parent c56b73d commit 000b7f2

File tree

7 files changed

+109
-49
lines changed

7 files changed

+109
-49
lines changed

babs/bootstrap.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,11 @@ def _bootstrap_single_app_scripts(
410410
print('This bash script will be named as `participant_job.sh`')
411411
bash_path = op.join(self.analysis_path, 'code', 'participant_job.sh')
412412
container.generate_bash_participant_job(
413-
bash_path, self.input_datasets, self.processing_level, system
413+
bash_path,
414+
self.input_datasets,
415+
self.processing_level,
416+
system,
417+
project_root=op.dirname(self.analysis_path),
414418
)
415419

416420
# also, generate a bash script of a test job used by `babs check-setup`:
@@ -480,6 +484,7 @@ def _bootstrap_pipeline_scripts(self, container_ds, container_config, system):
480484
run_script_relpath='code/pipeline_zip.sh',
481485
container_images=container_images,
482486
datalad_run_message='pipeline',
487+
project_root=op.dirname(self.analysis_path),
483488
)
484489

485490
with open(bash_path, 'w') as f:

babs/container.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,9 @@ def generate_bash_run_bidsapp(self, bash_path, input_ds, processing_level):
145145
print('Below is the generated BIDS App run script:')
146146
print(script_content)
147147

148-
def generate_bash_participant_job(self, bash_path, input_ds, processing_level, system):
148+
def generate_bash_participant_job(
149+
self, bash_path, input_ds, processing_level, system, project_root=None
150+
):
149151
"""Generate bash script for participant job.
150152
151153
Parameters
@@ -158,6 +160,9 @@ def generate_bash_participant_job(self, bash_path, input_ds, processing_level, s
158160
whether processing is done on a subject-wise or session-wise basis
159161
system: class `System`
160162
information on cluster management system
163+
project_root : str, optional
164+
Absolute path to the BABS project root (parent of `analysis/`).
165+
Shown in the script error message when PROJECT_ROOT is unset.
161166
"""
162167

163168
script_content = generate_submit_script(
@@ -169,6 +174,7 @@ def generate_bash_participant_job(self, bash_path, input_ds, processing_level, s
169174
processing_level=processing_level,
170175
container_name=self.container_name,
171176
zip_foldernames=self.config['zip_foldernames'],
177+
project_root=project_root,
172178
)
173179

174180
with open(bash_path, 'w') as f:

babs/generate_submit_script.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def generate_submit_script(
2929
run_script_relpath=None,
3030
container_images=None,
3131
datalad_run_message=None,
32+
project_root=None,
3233
):
3334
"""
3435
Generate a bash script that runs the BIDS App singularity image.
@@ -57,6 +58,10 @@ def generate_submit_script(
5758
List of container image paths. None for single-app mode.
5859
datalad_run_message : str, optional
5960
Custom message for datalad run. None uses container name.
61+
project_root : str, optional
62+
Absolute path to the BABS project root (parent of `analysis/`).
63+
Passed to the template; used in the error message when PROJECT_ROOT
64+
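is unset. NOTE(review): intended to fall back to the ``{project_root}`` placeholder when None, but Jinja's ``default`` filter only replaces undefined values unless its second argument is true, so an explicit None likely renders as ``None`` — confirm.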
6065
6166
Returns
6267
-------
@@ -116,6 +121,7 @@ def generate_submit_script(
116121
run_script_relpath=run_script_relpath,
117122
container_images=container_images,
118123
datalad_run_message=datalad_run_message,
124+
project_root=project_root,
119125
)
120126

121127

babs/templates/bidsapp_pipeline_run.sh.jinja2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,14 @@ sesid="$2"
1515
filterfile="${PWD}/${sesid}_filter.json"
1616
{% raw %}{{% endraw %}
1717
echo "{"
18-
echo "'fmap': {'datatype': 'fmap'},"
18+
echo "'fmap': {'datatype': 'fmap', 'session': '$sesid'},"
1919
{% set first_filter_step = steps_with_filter[0] %}
2020
{% if 'fmriprep' in first_filter_step['container_name'].lower() %}
2121
echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'},"
2222
{% elif 'qsiprep' in first_filter_step['container_name'].lower() %}
2323
echo "'dwi': {'datatype': 'dwi', 'session': '$sesid', 'suffix': 'dwi'},"
24+
{% elif 'aslprep' in first_filter_step['container_name'].lower() %}
25+
echo "'asl': {'datatype': 'perf', 'session': '$sesid', 'suffix': 'asl'},"
2426
{% endif %}
2527
echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'},"
2628
echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'},"

babs/templates/bidsapp_run.sh.jinja2

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,13 @@ sesid="$2"
1414
filterfile="${PWD}/${sesid}_filter.json"
1515
{% raw %}{{% endraw %}
1616
echo "{"
17-
echo "'fmap': {'datatype': 'fmap'},"
17+
echo "'fmap': {'datatype': 'fmap', 'session': '$sesid'},"
1818
{% if 'fmriprep' in container_name.lower() %}
1919
echo "'bold': {'datatype': 'func', 'session': '$sesid', 'suffix': 'bold'},"
2020
{% elif 'qsiprep' in container_name.lower() %}
2121
echo "'dwi': {'datatype': 'dwi', 'session': '$sesid', 'suffix': 'dwi'},"
22+
{% elif 'aslprep' in container_name.lower() %}
23+
echo "'asl': {'datatype': 'perf', 'session': '$sesid', 'suffix': 'asl'},"
2224
{% endif %}
2325
echo "'sbref': {'datatype': 'func', 'session': '$sesid', 'suffix': 'sbref'},"
2426
echo "'flair': {'datatype': 'anat', 'session': '$sesid', 'suffix': 'FLAIR'},"

babs/templates/determine_zipfilename.sh.jinja2

Lines changed: 19 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,32 +1,31 @@
11
# shellcheck disable=SC1091
22
{% if has_a_zipped_input_dataset %}
3-
find_single_zipfile() {% raw %}{{% endraw %}
3+
find_single_zip_in_git_tree() {{ '{' }}
4+
local zip_search_path="$1"
5+
local name="$2"
6+
local hits count
47

5-
local path="$1"
6-
local name="$2"
7-
local pattern="${path}/${subid}{% if processing_level == 'session' %}_${sesid}{%endif%}_*${name}*.zip"
8-
local zipfile
8+
hits="$(
9+
git -C "${zip_search_path}" ls-tree -r --name-only HEAD \
10+
| grep -E "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}{% raw %}.*${sesid}{% endraw %}{% endif %}{% raw %}.*${name}.*\.zip${% endraw %}" \
11+
|| true
12+
)"
913

10-
# Use find instead of ls for better handling of non-alphanumeric filenames
11-
zipfile=$(find "$path" -maxdepth 1 -wholename "${pattern}" -print -quit || true)
12-
if [ -z "$zipfile" ]; then
13-
echo "ERROR: No zip file found matching pattern: ${pattern}" >&2
14-
exit 1
15-
fi
14+
count="$(printf "%s\n" "${hits}" | sed '/^$/d' | wc -l | tr -d ' ')"
1615

17-
# Check if there's a second match
18-
another_zipfile=$(find "$path" -maxdepth 1 -name "${pattern}" -print -quit -skip 1 || true)
19-
if [ -n "${another_zipfile}" ]; then
20-
echo "Multiple zip files found matching pattern: $pattern" >&2
21-
exit 98
22-
fi
16+
if [ "${count}" -ne 1 ]; then
17+
{% raw %} echo "ERROR: Expected exactly 1 matching ${name} zip in ${zip_search_path}, found ${count}" 1>&2
18+
printf "%s\n" "${hits}" 1>&2
19+
exit 1
20+
{% endraw %}
21+
fi
2322

24-
echo "$zipfile"
25-
{% raw %}}{% endraw %}
23+
printf "%s/%s\n" "${zip_search_path}" "${hits}"
24+
{{ '}' }}
2625

2726
{% for input_dataset in input_datasets %}
2827
{% if input_dataset['is_zipped'] %}
29-
{{ input_dataset['name'].upper() }}_ZIP="$(find_single_zipfile {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})"
28+
{{ input_dataset['name'].upper() }}_ZIP="$(find_single_zip_in_git_tree {{ input_dataset['path_in_babs'] }} {{ input_dataset['name'] }})"
3029
echo 'found {{ input_dataset['name'] }} zipfile:'
3130
echo "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}"
3231
{% endif %}

babs/templates/participant_job.sh.jinja2

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,24 @@ BRANCH="job-${%raw%}{{%endraw%}{{varname_jobid}}{%raw%}}{%endraw%}-${%raw%}{{%en
3232
BRANCH="job-${%raw%}{{%endraw%}{{varname_jobid}}{%raw%}}{%endraw%}-${%raw%}{{%endraw%}{{varname_taskid}}{%raw%}}{%endraw%}-${subid}"
3333
{% endif %}
3434

35+
cleanup() {
36+
set +e
37+
if [ -d "{{ job_scratch_directory }}/{% raw %}${BRANCH}{% endraw %}/ds" ]; then
38+
cd "{{ job_scratch_directory }}/{% raw %}${BRANCH}{% endraw %}/ds" 2>/dev/null || true
39+
datalad drop -r . --reckless availability --reckless modification >/dev/null 2>&1 || true
40+
git annex dead here >/dev/null 2>&1 || true
41+
fi
42+
cd "{{ job_scratch_directory }}" 2>/dev/null || true
43+
rm -rf "{{ job_scratch_directory }}/{% raw %}${BRANCH}{% endraw %}" >/dev/null 2>&1 || true
44+
}
45+
trap cleanup EXIT
46+
3547
mkdir "${BRANCH}"
3648
cd "${BRANCH}"
3749

3850
# datalad clone the input ria:
3951
echo '# Clone the data from input RIA:'
40-
datalad clone "${dssource}" ds
52+
datalad clone "${dssource}" ds -- --no-checkout
4153
cd ds
4254

4355
# set up the result deposition:
@@ -48,32 +60,69 @@ git remote add outputstore "${pushgitremote}"
4860
echo "# Create a new branch for this job's results:"
4961
git checkout -b "${BRANCH}"
5062

63+
# Always use sparse-checkout; abort the job with an error if it is unavailable or fails to initialize
64+
if ! git sparse-checkout init --cone; then
65+
echo "ERROR: git sparse-checkout is not available (or failed to initialize) on this system." 1>&2
66+
exit 1
67+
fi
68+
69+
git sparse-checkout set \
70+
code \
71+
containers \
72+
{% for input_dataset in input_datasets %}
73+
{{ input_dataset['path_in_babs'] }}{% if not loop.last %} \
74+
{% endif %}
75+
{% endfor %}
76+
77+
git checkout -f
78+
5179
# Start of the application-specific code: ------------------------------
5280

53-
# pull down input data (but don't retrieve the data content) and remove other sub's data:
54-
echo "# Pull down the input subject (or dataset) but don't retrieve data contents:"
81+
# pull down only the needed subject (or subject/session) path and explicit dataset-level metadata:
82+
echo "# Pull down the input session but don't retrieve data contents:"
5583
{% for input_dataset in input_datasets %}
5684
{% if not input_dataset['is_zipped'] %}
57-
datalad get -n "{{ input_dataset['path_in_babs'] }}/${subid}"
58-
(cd {{ input_dataset['path_in_babs'] }} && find . -type d -name 'sub*' | grep -v "$subid" | xargs rm -rf)
59-
{% if processing_level == 'session' %}
60-
(cd {{ input_dataset['path_in_babs'] }}/"${subid}" && find . -type d -name 'ses*' | grep -v "$sesid" | xargs rm -rf)
61-
{% endif %}
85+
datalad get -n "{{ input_dataset['path_in_babs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}"
86+
87+
datalad get -n \
88+
"{{ input_dataset['path_in_babs'] }}/dataset_description.json" \
89+
"{{ input_dataset['path_in_babs'] }}/participants.tsv" \
90+
"{{ input_dataset['path_in_babs'] }}/participants.json" || true
6291
{% else %}
6392
datalad get -n "{{ input_dataset['path_in_babs'] }}"
64-
(cd {{ input_dataset['path_in_babs'] }} && find . -type f -name 'sub*.zip' | grep -v "$subid" | xargs rm -f)
6593
{% endif %}
6694
{% endfor %}
6795

6896
{{ zip_locator_text }}
6997

98+
# Link to shared container image so each job does not re-clone the same image
99+
PROJECT_ROOT="${PROJECT_ROOT:?ERROR: PROJECT_ROOT env var must be set to {{ project_root | default('{project_root}') }}}"
100+
CONTAINER_SHARED="${PROJECT_ROOT}/analysis/containers/.datalad/environments/{{ container_name }}/image"
101+
CONTAINER_JOB="containers/.datalad/environments/{{ container_name }}/image"
102+
103+
if [ ! -e "${CONTAINER_SHARED}" ]; then
104+
echo "ERROR: shared container image not found at ${CONTAINER_SHARED}" 1>&2
105+
exit 1
106+
fi
107+
108+
mkdir -p "containers/.datalad/environments/{{ container_name }}"
109+
# Replace any existing path (e.g. sparse-checkout placeholder or annex pointer) with symlink to shared image
110+
ln -sf "${CONTAINER_SHARED}" "${CONTAINER_JOB}"
111+
112+
if [ ! -e "${CONTAINER_JOB}" ]; then
113+
echo "ERROR: failed to create symlink ${CONTAINER_JOB}" 1>&2
114+
exit 1
115+
fi
116+
70117
# datalad run:
71118
datalad run \
72119
-i "{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }}" \
73120
{% for input_dataset in input_datasets %}
74121
{% if not input_dataset['is_zipped'] %}
75-
-i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/${subid}{% if processing_level == 'session' %}/${sesid}{% endif %}" \
76-
-i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/*json" \
122+
-i "{{ input_dataset['unzipped_path_containing_subject_dirs'] }}/{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}/{% raw %}${sesid}{% endraw %}{% endif %}" \
123+
-i "{{ input_dataset['path_in_babs'] }}/dataset_description.json" \
124+
-i "{{ input_dataset['path_in_babs'] }}/participants.tsv" \
125+
-i "{{ input_dataset['path_in_babs'] }}/participants.json" \
77126
{% else %}
78127
-i "${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}" \
79128
{% endif %}
@@ -82,7 +131,7 @@ datalad run \
82131
{% for image_path in container_images %}
83132
-i "{{ image_path }}" \
84133
{% endfor %}
85-
{% elif not run_script_relpath %}
134+
{% else %}
86135
-i "containers/.datalad/environments/{{container_name}}/image" \
87136
{% endif %}
88137
{% if datalad_expand_inputs %}
@@ -91,11 +140,11 @@ datalad run \
91140
--explicit \
92141
{% if zip_foldernames is not none %}
93142
{% for key, value in zip_foldernames.items() %}
94-
-o "${subid}{% if processing_level == 'session' %}_${sesid}{% endif %}_{{ key }}-{{ value }}.zip" \
143+
-o "{% raw %}${subid}{% endraw %}{% if processing_level == 'session' %}_{% raw %}${sesid}{% endraw %}{% endif %}_{{ key }}-{{ value }}.zip" \
95144
{% endfor %}
96145
{% endif %}
97-
-m "{{ datalad_run_message if datalad_run_message is defined else container_name }} ${subid}{% if processing_level == 'session' %} ${sesid}{% endif %}" \
98-
"bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }} ${subid} {% if processing_level == 'session' %} ${sesid}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}"
146+
-m "{{ (datalad_run_message if datalad_run_message is defined and datalad_run_message else container_name) }} {% raw %}${subid}{% endraw %}{% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}" \
147+
"bash ./{{ run_script_relpath if run_script_relpath else 'code/' + container_name + '_zip.sh' }} {% raw %}${subid}{% endraw %} {% if processing_level == 'session' %} {% raw %}${sesid}{% endraw %}{% endif %}{% for input_dataset in input_datasets %}{% if input_dataset['is_zipped'] %} ${%raw%}{{%endraw%}{{ input_dataset['name'].upper() }}_ZIP{%raw%}}{%endraw%}{%endif%}{%endfor%}"
99148

100149
# Finish up:
101150
# push result file content to output RIA storage:
@@ -106,13 +155,4 @@ datalad push --to output-storage
106155
echo '# Push the branch with provenance records:'
107156
flock "${DSLOCKFILE}" git push outputstore
108157

109-
# Delete:
110-
datalad drop -r . --reckless availability --reckless modification
111-
112-
git annex dead here
113-
114-
# cd out of $BRANCH:
115-
cd ../..
116-
rm -rf "${BRANCH}"
117-
118-
echo SUCCESS
158+
echo SUCCESS

0 commit comments

Comments
 (0)