Skip to content

Commit 5d0fd60

Browse files
authored
feat: Dockerfile allows custom mcore install (#2005)
Signed-off-by: Dong Hyuk Chang <donghyukc@nvidia.com>
1 parent ea844b9 commit 5d0fd60

File tree

2 files changed

+38
-4
lines changed

2 files changed

+38
-4
lines changed

.github/workflows/cicd-main.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,7 @@ jobs:
283283
${{ env.container-registry }}/megatron-bridge:${{ github.sha }}
284284
secrets: |
285285
GH_TOKEN=${{ secrets.PAT }}
286+
target: mbridge_final
286287

287288
- name: Verify MCore commit in Docker image
288289
if: ${{ github.event.inputs.mcore_commit != '' }}

docker/Dockerfile.ci

Lines changed: 37 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@
1313
# limitations under the License.
1414

1515
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:25.11-py3
16-
FROM ${BASE_IMAGE} AS megatron_bridge
17-
WORKDIR /opt/Megatron-Bridge
16+
# Selects which MCore stage is aliased as mbridge_mcore (via FROM mbridge_${MCORE_LAYER}):
# "submodule_mcore" (default) or "custom_mcore".
ARG MCORE_LAYER=submodule_mcore
17+
FROM ${BASE_IMAGE} AS mbridge_base
1818
ENV PATH="/root/.local/bin:$PATH"
1919
ENV UV_PROJECT_ENVIRONMENT=/opt/venv
2020
ENV PATH="$UV_PROJECT_ENVIRONMENT/bin:$PATH"
@@ -28,12 +28,44 @@ RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
2828

2929
COPY pyproject.toml uv.lock /opt/Megatron-Bridge/
3030
COPY src/megatron/bridge/__init__.py src/megatron/bridge/package_info.py /opt/Megatron-Bridge/src/megatron/bridge/
31-
COPY 3rdparty/Megatron-LM/pyproject.toml /opt/Megatron-Bridge/3rdparty/Megatron-LM/
32-
COPY 3rdparty/Megatron-LM/megatron/core/__init__.py 3rdparty/Megatron-LM/megatron/core/package_info.py /opt/Megatron-Bridge/3rdparty/Megatron-LM/megatron/core/
3331

32+
# Stage: fetch Megatron-LM from a caller-supplied repository and commit instead
# of using the vendored submodule. Selected with MCORE_LAYER=custom_mcore.
FROM mbridge_base AS mbridge_custom_mcore

# MCORE_HOST   - URL prefix placed before ":<token>@" (assumed to be scheme plus
#                user name — TODO confirm the exact format with the CI caller).
# MCORE_REPO   - remainder of the clone URL placed after "<token>@".
# MCORE_COMMIT - ref or commit hash to fetch and check out.
ARG MCORE_REPO
ARG MCORE_COMMIT
ARG MCORE_HOST
WORKDIR /src/Megatron-LM

# Clone the custom MCore installation.
# - The token is read from a BuildKit secret mount; required=true makes a
#   missing secret fail the build up front with a clear error.
# - The clone URL is quoted because it embeds the token.
# - MCORE_COMMIT is deliberately left unquoted so an empty value still behaves
#   like a plain `git fetch origin`, as before.
# - .git is removed in the same layer, so the token-bearing remote URL stored in
#   .git/config is not kept in this stage's filesystem.
# - Finally the packaging metadata is staged under /tmp/Megatron-LM for the
#   final stage to copy.
RUN --mount=type=secret,id=CI_JOB_TOKEN,target=/run/secrets/CI_JOB_TOKEN,required=true \
    CI_JOB_TOKEN=$(cat /run/secrets/CI_JOB_TOKEN) && \
    git clone "${MCORE_HOST}:${CI_JOB_TOKEN}@${MCORE_REPO}" . && \
    git fetch origin $MCORE_COMMIT && \
    git checkout FETCH_HEAD && \
    echo "Container built with Megatron-LM commit hash: $MCORE_COMMIT" && \
    rm -rf .git && \
    rm -rf ~/.netrc && \
    mkdir -p /tmp/Megatron-LM/megatron/core/ && \
    cp pyproject.toml /tmp/Megatron-LM/ && \
    cp megatron/core/__init__.py megatron/core/package_info.py /tmp/Megatron-LM/megatron/core/
52+
53+
# Stage: use the Megatron-LM sources vendored under 3rdparty/ in the build
# context. This is the default MCore stage (MCORE_LAYER=submodule_mcore).
FROM mbridge_base AS mbridge_submodule_mcore

# NOTE(review): presumably creates /src so that a later
# `COPY --from=... /src/Megatron-LM*` has a directory to glob even though
# nothing is cloned in this stage — confirm.
WORKDIR /src

# Stage only the packaging metadata under /tmp/Megatron-LM, the same location
# the custom-MCore stage writes its copies to.
COPY 3rdparty/Megatron-LM/pyproject.toml /tmp/Megatron-LM/
COPY 3rdparty/Megatron-LM/megatron/core/__init__.py 3rdparty/Megatron-LM/megatron/core/package_info.py /tmp/Megatron-LM/megatron/core/
58+
59+
# Alias stage: resolves to mbridge_submodule_mcore or mbridge_custom_mcore
# depending on the MCORE_LAYER build arg, so later stages can refer to a single
# name (mbridge_mcore) in COPY --from.
FROM mbridge_${MCORE_LAYER} AS mbridge_mcore
60+
61+
# Final image stage: starts again from mbridge_base and pulls the MCore files
# it needs from the mbridge_mcore stage.
FROM mbridge_base AS mbridge_final
62+
63+
WORKDIR /opt/Megatron-Bridge
3464
# Build arg to skip --locked when testing with different MCore versions
3565
ARG MCORE_TRIGGERED_TESTING=false
3666

67+
# Bring in the Megatron-LM packaging metadata (pyproject.toml plus
# megatron/core/__init__.py and package_info.py) staged by the selected MCore stage.
# NOTE(review): presumably needed so the dependency-install RUN below can resolve
# the 3rdparty/Megatron-LM package before the full sources are copied — confirm.
COPY --from=mbridge_mcore /tmp/Megatron-LM /opt/Megatron-Bridge/3rdparty/Megatron-LM
68+
3769
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
3870
--mount=type=cache,target=/var/lib/apt,sharing=locked \
3971
--mount=type=cache,target=/root/.cache/uv \
@@ -48,3 +80,4 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
4880
uv cache prune
4981

5082
COPY . /opt/Megatron-Bridge
83+
# Overlay the Megatron-LM checkout from the selected MCore stage, if it produced
# one under /src; the source is a glob rather than a fixed path.
# NOTE(review): the glob is presumably there so the default submodule stage,
# which has no /src/Megatron-LM, does not fail the build — confirm.
# NOTE(review): Docker's COPY copies a matched directory's contents rather than
# the directory itself; verify the checkout lands in 3rdparty/Megatron-LM and
# not directly in 3rdparty/.
COPY --from=mbridge_mcore /src/Megatron-LM* /opt/Megatron-Bridge/3rdparty

0 commit comments

Comments
 (0)