-
Notifications
You must be signed in to change notification settings - Fork 47
Expand file tree
/
Copy pathio_setup.sh
More file actions
282 lines (232 loc) · 10.6 KB
/
io_setup.sh
File metadata and controls
282 lines (232 loc) · 10.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Test inputs: files in Google's public genomics bucket plus their md5s,
# used to verify that dsub localized/delocalized them correctly.
readonly GENOMICS_PUBLIC_BUCKET="gs://genomics-public-data"
readonly POPULATION_FILE="ftp-trace.ncbi.nih.gov/1000genomes/ftp/20131219.superpopulations.tsv"
readonly POPULATION_FILE_FULL_PATH="${GENOMICS_PUBLIC_BUCKET}/${POPULATION_FILE}"
readonly POPULATION_MD5="68a73f849b82071afe11888bac1aa8a7"
readonly INPUT_BAM_FILE="ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/pilot3_exon_targetted_GRCh37_bams/data/NA06986/alignment/NA06986.chromY.ILLUMINA.bwa.CEU.exon_targetted.20100311.bam"
readonly INPUT_BAM_FULL_PATH="${GENOMICS_PUBLIC_BUCKET}/${INPUT_BAM_FILE}"
readonly INPUT_BAM_MD5="4afb9b8908959dbd4e2d5c54bf254c93"

# This bucket is requester-pays enabled.
# The same test files from above are assumed to be copied over and made public.
readonly REQUESTER_PAYS_INPUT_BAM_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${INPUT_BAM_FILE}"
readonly REQUESTER_PAYS_POPULATION_FILE_FULL_PATH="gs://${DSUB_BUCKET_REQUESTER_PAYS}/${POPULATION_FILE}"

# This is the image we use to test the PD mount feature.
# Inject the TEST_TOKEN into the name so that multiple tests can run
# concurrently. Since the image test can be run multiple times for one
# setting of the TEST_TOKEN (see run_tests.sh), also add the process id
# of the running test.
# Underscores are mapped to '-' via ${TEST_TOKEN//_/-}; this replaces the
# previous unquoted `$(echo ${TEST_TOKEN} | tr '_' '-')` pipeline, which
# was subject to word-splitting/globbing (SC2086) and spawned a subshell.
readonly TEST_IMAGE_NAME="dsub-e2e-test-image-${TEST_TOKEN//_/-}-$$"
readonly TEST_IMAGE_GCS_LOCATION="gs://dsub-test-e2e-bucket/dsub-test-image.tar.gz"
readonly TEST_IMAGE_URL="https://www.googleapis.com/compute/v1/projects/${PROJECT_ID}/global/images/${TEST_IMAGE_NAME}"

# This is the name and URL of a disk that we create in order to test the PD
# "mount existing disk" feature. Note that GCP supports regional disks,
# but we test zonal, as it is most likely to be used in practice.
# For the mount test, we create a disk from an image, so that the disk is
# already formatted. Note that the mount fails if the disk is not formatted.
readonly TEST_EXISTING_DISK_IMAGE_NAME="dsub-e2e-test-disk-image-${TEST_TOKEN//_/-}-$$"
readonly TEST_EXISTING_DISK_NAME="dsub-e2e-test-disk-${TEST_TOKEN//_/-}-$$"
readonly TEST_EXISTING_DISK_ZONE="us-central1-a"
readonly TEST_EXISTING_DISK_URL="https://www.googleapis.com/compute/v1/projects/${PROJECT_ID}/zones/${TEST_EXISTING_DISK_ZONE}/disks/${TEST_EXISTING_DISK_NAME}"

# This is the path we use to test local file:// mounts
readonly TEST_TMP_PATH="/tmp/dsub_test_files"
readonly TEST_LOCAL_MOUNT_PARAMETER="file://${TEST_TMP_PATH}"
# Stage the test input files into a local directory so they can be exposed
# to jobs via a file:// mount. Files are only downloaded when absent.
#
# Globals (read): TEST_TMP_PATH, POPULATION_FILE, POPULATION_FILE_FULL_PATH,
#                 INPUT_BAM_FILE, INPUT_BAM_FULL_PATH
#
# BUG FIX: the destination paths embed the slash-containing GCS object
# paths (e.g. "ftp-trace.ncbi.nih.gov/1000genomes/..."), but only
# ${TEST_TMP_PATH} itself was created; gsutil cp does not create missing
# local parent directories, so the copies failed on a fresh machine.
# Create each destination's parent directory before copying.
function io_setup::mount_local_path_setup() {
  mkdir -p "${TEST_TMP_PATH}"
  if [[ ! -f "${TEST_TMP_PATH}/${POPULATION_FILE}" ]]; then
    mkdir -p "$(dirname "${TEST_TMP_PATH}/${POPULATION_FILE}")"
    gsutil cp "${POPULATION_FILE_FULL_PATH}" "${TEST_TMP_PATH}/${POPULATION_FILE}"
  fi
  if [[ ! -f "${TEST_TMP_PATH}/${INPUT_BAM_FILE}" ]]; then
    mkdir -p "$(dirname "${TEST_TMP_PATH}/${INPUT_BAM_FILE}")"
    gsutil cp "${INPUT_BAM_FULL_PATH}" "${TEST_TMP_PATH}/${INPUT_BAM_FILE}"
  fi
}
readonly -f io_setup::mount_local_path_setup
# EXIT-trap handler installed by io_setup::image_setup: delete the test
# image, then propagate the exit status that was live when the trap fired.
function io_setup::exit_handler_image() {
  # Capture $? first; every command below would clobber it.
  local exit_status="${?}"

  echo "Deleting image ${TEST_IMAGE_NAME}..."
  gcloud --quiet compute images delete "${TEST_IMAGE_NAME}"
  echo "Image successfully deleted."

  return "${exit_status}"
}
readonly -f io_setup::exit_handler_image
# Create the image used by the PD-mount test from its GCS tarball, and
# arrange for io_setup::exit_handler_image to clean it up on script exit.
function io_setup::image_setup() {
  trap io_setup::exit_handler_image EXIT

  echo "Creating image ${TEST_IMAGE_NAME} from ${TEST_IMAGE_GCS_LOCATION}..."
  gcloud compute images create "${TEST_IMAGE_NAME}" --source-uri "${TEST_IMAGE_GCS_LOCATION}"
  echo "Image successfully created."
}
readonly -f io_setup::image_setup
# EXIT-trap handler installed by io_setup::existing_disk_setup: delete the
# temporary disk image, then the disk itself, and propagate the exit status
# that was live when the trap fired.
function io_setup::exit_handler_disk() {
  # Capture $? first; every command below would clobber it.
  local exit_status="${?}"

  echo "Deleting image ${TEST_EXISTING_DISK_IMAGE_NAME}..."
  gcloud --quiet compute images delete "${TEST_EXISTING_DISK_IMAGE_NAME}"
  echo "Image successfully deleted."

  # Delete the disk, but in a retry loop - if the VM has not yet gone away,
  # it'll be marked as "in use" and the delete fails
  echo "Deleting disk ${TEST_EXISTING_DISK_NAME}..."
  local -r deadline_seconds="$((60 * 2))"
  local -r poll_seconds=5
  local waited=0
  while ((waited <= deadline_seconds)); do
    if gcloud --quiet compute disks delete "${TEST_EXISTING_DISK_NAME}" \
        --zone="${TEST_EXISTING_DISK_ZONE}"; then
      break
    fi
    if ((waited >= deadline_seconds)); then
      1>&2 echo "Failed to delete disk after ${waited} seconds"
      exit 1
    fi
    echo "Sleeping ${poll_seconds}s"
    sleep "${poll_seconds}s"
    ((waited += poll_seconds))
  done
  echo "Disk successfully deleted."

  return "${exit_status}"
}
readonly -f io_setup::exit_handler_disk
# Create the temporary image and a zonal disk from it for the PD
# "mount existing disk" test, and install an EXIT trap that cleans both up.
# The disk is created from an (already formatted) image because mounting an
# unformatted disk fails.
#
# Globals (read): TEST_EXISTING_DISK_IMAGE_NAME, TEST_IMAGE_GCS_LOCATION,
#                 TEST_EXISTING_DISK_NAME, TEST_EXISTING_DISK_ZONE
function io_setup::existing_disk_setup() {
  trap "io_setup::exit_handler_disk" EXIT

  echo "Creating image ${TEST_EXISTING_DISK_IMAGE_NAME} from ${TEST_IMAGE_GCS_LOCATION}..."
  gcloud compute images create "${TEST_EXISTING_DISK_IMAGE_NAME}" \
    --source-uri "${TEST_IMAGE_GCS_LOCATION}"
  echo "Image successfully created."

  # BUG FIX: this message previously read "Creating disk from
  # ${TEST_IMAGE_GCS_LOCATION} from ${TEST_EXISTING_DISK_IMAGE_NAME}",
  # naming the tarball instead of the disk being created.
  echo "Creating disk ${TEST_EXISTING_DISK_NAME} from ${TEST_EXISTING_DISK_IMAGE_NAME}..."
  gcloud compute disks create "${TEST_EXISTING_DISK_NAME}" \
    --image="${TEST_EXISTING_DISK_IMAGE_NAME}" \
    --zone="${TEST_EXISTING_DISK_ZONE}"
  echo "Disk successfully created from image."
}
# BUG FIX: this previously re-marked io_setup::image_setup readonly
# (copy-paste error), leaving this function writable.
readonly -f io_setup::existing_disk_setup
# Run the I/O test script against the requester-pays copies of the inputs,
# billing bucket access to ${PROJECT_ID} via --user-project. Blocks until
# the job completes (--wait).
function io_setup::run_dsub_requester_pays() {
  local dsub_args=(--unique-job-id)
  if [[ -n "${IMAGE:-}" ]]; then
    dsub_args+=(--image "${IMAGE}")
  fi
  dsub_args+=(
    --user-project "${PROJECT_ID}"
    --script "${SCRIPT_DIR}/script_io_test.sh"
    --env TASK_ID="task"
    --input INPUT_PATH="${REQUESTER_PAYS_INPUT_BAM_FULL_PATH}"
    --output OUTPUT_PATH="${OUTPUTS}/task/*.md5"
    --env TEST_NAME="${TEST_NAME}"
    --input POPULATION_FILE_PATH="${REQUESTER_PAYS_POPULATION_FILE_FULL_PATH}"
    --output OUTPUT_POPULATION_FILE="${OUTPUTS}/*"
    --wait
  )
  run_dsub "${dsub_args[@]}"
}
readonly -f io_setup::run_dsub_requester_pays
# Run the I/O test script with the inputs provided via a --mount (GCS
# bucket, image, disk, or file:// path) instead of --input localization.
#
# Arguments:
#   $1 - value for the MOUNT_POINT --mount parameter
#   $2 - (optional) non-empty to bill access via --user-project
function io_setup::run_dsub_with_mount() {
  local mount_point="${1}"
  local requester_pays="${2:-}"

  local dsub_args=(--unique-job-id)
  if [[ -n "${IMAGE:-}" ]]; then
    dsub_args+=(--image "${IMAGE}")
  fi
  if [[ -n "${requester_pays}" ]]; then
    dsub_args+=(--user-project "${PROJECT_ID}")
  fi
  dsub_args+=(
    --script "${SCRIPT_DIR}/script_io_test.sh"
    --env TASK_ID="task"
    --output OUTPUT_PATH="${OUTPUTS}/task/*.md5"
    --env TEST_NAME="${TEST_NAME}"
    --env INPUT_BAM="${INPUT_BAM_FILE}"
    --env POPULATION_FILE="${POPULATION_FILE}"
    --mount MOUNT_POINT="${mount_point}"
    --output OUTPUT_POPULATION_FILE="${OUTPUTS}/*"
    --wait
  )
  run_dsub "${dsub_args[@]}"
}
readonly -f io_setup::run_dsub_with_mount
# Run the I/O test script with the standard public-bucket inputs localized
# via --input. Blocks until the job completes (--wait).
function io_setup::run_dsub() {
  local dsub_args=(--unique-job-id)
  if [[ -n "${IMAGE:-}" ]]; then
    dsub_args+=(--image "${IMAGE}")
  fi
  dsub_args+=(
    --script "${SCRIPT_DIR}/script_io_test.sh"
    --env TASK_ID="task"
    --input INPUT_PATH="${INPUT_BAM_FULL_PATH}"
    --output OUTPUT_PATH="${OUTPUTS}/task/*.md5"
    --env TEST_NAME="${TEST_NAME}"
    --input POPULATION_FILE_PATH="${POPULATION_FILE_FULL_PATH}"
    --output OUTPUT_POPULATION_FILE="${OUTPUTS}/*"
    --wait
  )
  run_dsub "${dsub_args[@]}"
}
readonly -f io_setup::run_dsub
# Compare the contents of a GCS output file against an expected string,
# exiting the test with status 1 on mismatch.
#
# Arguments:
#   $1 - GCS path of the output file to check
#   $2 - expected contents
function io_setup::_check_output() {
  local output_file="${1}"
  local result_expected="${2}"

  # FIX: declare separately from assignment so `local` does not mask the
  # gsutil exit status (SC2155).
  local result
  result=$(gsutil cat "${output_file}")

  if ! diff <(echo "${result_expected}") <(echo "${result}"); then
    echo "Output file does not match expected"
    exit 1
  fi

  echo
  echo "Output file matches expected:"
  echo "*****************************"
  echo "${result}"
  echo "*****************************"
}
readonly -f io_setup::_check_output
# Verify both job outputs: the per-task BAM md5 file and the population
# file md5. Exits (via io_setup::_check_output) on any mismatch.
function io_setup::check_output() {
  echo
  echo "Checking output..."

  local bam_md5_path="${OUTPUTS}/task/$(basename "${INPUT_BAM_FULL_PATH}").md5"
  io_setup::_check_output "${bam_md5_path}" "${INPUT_BAM_MD5}"
  io_setup::_check_output "${OUTPUTS}/task.md5" "${POPULATION_MD5}"

  echo "SUCCESS"
}
readonly -f io_setup::check_output
# Validate the dstat --full record of a completed I/O test job: identity,
# logging, status, script, datetime fields, envs, inputs, outputs and
# (optionally) mounts. Exits with status 1 on the first failed check.
#
# Arguments:
#   $1 - job-id to look up
#   $2 - "true" to verify the standard (public bucket) input paths
#   $3 - (optional) expected MOUNT_POINT value; skipped when empty
#   $4 - (optional) "true" to verify the requester-pays input paths
function io_setup::check_dstat() {
  local job_id="${1}"
  local check_inputs="${2}"
  local mount_point="${3:-}"
  local check_requester_pays_inputs="${4:-}"

  echo
  echo "Checking dstat output for job-id: ${job_id}..."

  # FIX: declare separately from assignment so `local` does not mask the
  # run_dstat exit status (SC2155).
  local dstat_output
  dstat_output=$(run_dstat --status '*' --jobs "${job_id}" --full)

  echo " Checking user-id"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].user-id" "${USER:-jupyter}"
  echo " Checking logging"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].logging" "${LOGGING}"
  echo " Checking status"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].status" "SUCCESS"

  echo " Checking script"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].script-name" "script_io_test.sh"
  # FIX: split declaration from assignment here too (SC2155).
  local expected_script
  expected_script=$(cat "${SCRIPT_DIR}/script_io_test.sh")
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].script" "${expected_script}"

  echo " Checking datetime fields..."
  # FIX: declare the loop variable local so it does not leak to the caller.
  local field
  for field in 'create-time' 'end-time' 'start-time' 'last-update'; do
    if ! util::dstat_yaml_job_has_valid_datetime_field "${dstat_output}" "[0].${field}"; then
      echo "dstat output for ${job_id} does not include a valid ${field}."
      echo "${dstat_output}"
      exit 1
    fi
  done

  echo " Checking envs..."
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].envs.TASK_ID" "task"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].envs.TEST_NAME" "${TEST_NAME}"

  if [[ "${check_inputs}" == "true" ]]; then
    echo " Checking inputs..."
    util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].inputs.INPUT_PATH" "${INPUT_BAM_FULL_PATH}"
    util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].inputs.POPULATION_FILE_PATH" "${POPULATION_FILE_FULL_PATH}"
  fi

  echo " Checking outputs..."
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].outputs.OUTPUT_PATH" "${OUTPUTS}/task/*.md5"
  util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].outputs.OUTPUT_POPULATION_FILE" "${OUTPUTS}/*"

  if [[ -n "${mount_point:-}" ]]; then
    echo " Checking mounts..."
    util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].mounts.MOUNT_POINT" "${mount_point}"
  fi

  if [[ "${check_requester_pays_inputs}" == "true" ]]; then
    echo " Checking inputs (for requester pays bucket)..."
    util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].inputs.INPUT_PATH" "${REQUESTER_PAYS_INPUT_BAM_FULL_PATH}"
    util::dstat_yaml_assert_field_equal "${dstat_output}" "[0].inputs.POPULATION_FILE_PATH" "${REQUESTER_PAYS_POPULATION_FILE_FULL_PATH}"
  fi

  echo "SUCCESS"
}
readonly -f io_setup::check_dstat