Skip to content

Commit 2189bb4

Browse files
Merge branch 'main' into rust_dl_apis_serialization
2 parents 4a23c68 + 87f4cbf commit 2189bb4

File tree

7 files changed

+20
-59
lines changed

7 files changed

+20
-59
lines changed

.ci/jenkins/lib/build-container-matrix.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ timeout_minutes: 240
1111
# Infrastructure
1212
kubernetes:
1313
cloud: il-ipp-blossom-prod
14-
namespace: swx-media
14+
namespace: nbu-swx-nixl
1515
limits: "{memory: 16Gi, cpu: 8000m}"
1616
requests: "{memory: 8Gi, cpu: 4000m}"
1717

@@ -93,6 +93,7 @@ steps:
9393
9494
- name: Add Version Info
9595
run: |
96+
git config --global --add safe.directory '*'
9697
# Extract standardized 8-char commit hash for UCX version info:
9798
if [[ "$BUILD_TARGET" == "nixlbench" ]]; then
9899
CLEAN_UCX=$(cd "$WORKSPACE/ucx-src" && git rev-parse --short=8 HEAD)

.ci/jenkins/lib/build-matrix.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ timeout_minutes: 240
2929

3030
kubernetes:
3131
cloud: il-ipp-blossom-prod
32-
namespace: swx-media
32+
namespace: nbu-swx-nixl
3333
limits: "{memory: 8Gi, cpu: 8000m}"
3434
requests: "{memory: 8Gi, cpu: 8000m}"
3535

.ci/jenkins/pipeline/proj-jjb.yaml

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
- job-template:
1010
name: "{jjb_proj}-dispatcher" # Will be expanded to 'nixl-ci-dispatcher'
1111
project-type: pipeline
12-
folder: "{jjb_folder}"
1312
properties:
1413
# GitHub integration settings
1514
- github:
@@ -56,49 +55,39 @@
5655
githubHelper = GithubHelper.getInstance("${{GIT_PASSWORD}}", VARIABLE_FROM_POST)
5756
}}
5857
// Update GitHub commit status
59-
githubHelper.updateCommitStatus("$BUILD_URL", "CI Dispatcher started", GitHubCommitState.PENDING)
58+
def blueOceanUrl = "${{JENKINS_URL}}blue/organizations/jenkins/nixl-ci-dispatcher/detail/nixl-ci-dispatcher/${{BUILD_NUMBER}}/pipeline/"
59+
githubHelper.updateCommitStatus(blueOceanUrl, "NIXL CI Started", GitHubCommitState.PENDING)
6060
currentBuild.description = githubHelper.getBuildDescription()
6161
try {{
6262
// Trigger the actual build and test jobs in parallel
6363
parallel build: {{
6464
def buildJob = 'nixl-ci-build'
65-
def build = build job: buildJob, parameters: [
65+
build job: buildJob, parameters: [
6666
string(name: 'sha1', value: githubHelper.getMergedSHA()),
6767
string(name: 'githubData', value: VARIABLE_FROM_POST)
68-
], propagate: false
69-
currentBuild.description += "<br>Job: <a href='${{JENKINS_URL}}blue/organizations/jenkins/NIXL%2F${{buildJob}}/detail/${{buildJob}}/${{build.number}}/pipeline/'>${{buildJob}}</a> Result: <b style='color:${{build.result == 'SUCCESS' ? 'green' : 'red'}}'>${{build.result}}</b>"
70-
if (!build.resultIsBetterOrEqualTo('SUCCESS')) {{
71-
currentBuild.result = build.result
72-
error("Job ${{buildJob}} failed")
73-
}}
68+
]
7469
}}, test: {{
7570
def buildJob = 'nixl-ci-test'
76-
def build = build job: buildJob, parameters: [
71+
build job: buildJob, parameters: [
7772
string(name: 'sha1', value: githubHelper.getMergedSHA()),
7873
string(name: 'githubData', value: VARIABLE_FROM_POST)
79-
], propagate: false
80-
currentBuild.description += "<br>Job: <a href='${{JENKINS_URL}}blue/organizations/jenkins/NIXL%2F${{buildJob}}/detail/${{buildJob}}/${{build.number}}/pipeline/'>${{buildJob}}</a> Result: <b style='color:${{build.result == 'SUCCESS' ? 'green' : 'red'}}'>${{build.result}}</b>"
81-
if (!build.resultIsBetterOrEqualTo('SUCCESS')) {{
82-
currentBuild.result = build.result
83-
error("Job ${{buildJob}} failed")
84-
}}
74+
]
8575
}},
8676
failFast: false // Continue even if some parallel jobs fail
8777
88-
githubHelper.updateCommitStatus("$BUILD_URL", "CI Dispatcher successeded", GitHubCommitState.SUCCESS)
78+
githubHelper.updateCommitStatus(blueOceanUrl, "NIXL CI Succeeded", GitHubCommitState.SUCCESS)
8979
}} catch(Exception ex) {{
9080
// Handle build failures
9181
currentBuild.result = 'FAILURE'
9282
println ex
93-
githubHelper.updateCommitStatus("$BUILD_URL", "CI Dispatcher failed", GitHubCommitState.FAILURE)
83+
githubHelper.updateCommitStatus(blueOceanUrl, "NIXL CI Failed", GitHubCommitState.FAILURE)
9484
error("failed")
9585
}}
9686
9787
# Template for the main build job that performs the actual build process
9888
- job-template:
9989
name: "{jjb_proj}-build" # Will be expanded to 'nixl-ci-build'
10090
project-type: pipeline
101-
folder: "{jjb_folder}"
10291
disabled: false
10392
properties:
10493
# Similar properties as dispatcher job
@@ -161,7 +150,6 @@
161150
- job-template:
162151
name: "{jjb_proj}-test" # Will be expanded to 'nixl-ci-test'
163152
project-type: pipeline
164-
folder: "{jjb_folder}"
165153
disabled: false
166154
properties:
167155
# Similar properties as dispatcher job
@@ -226,7 +214,6 @@
226214
- job-template:
227215
name: "{jjb_proj}-build-container"
228216
project-type: pipeline
229-
folder: "{jjb_folder}"
230217
disabled: false
231218
properties:
232219
- build-discarder:
@@ -325,7 +312,6 @@
325312
jjb_proj: 'nixl-ci' # Project prefix for job names
326313
jjb_git: 'git@github.com:ai-dynamo/nixl.git' # Repository URL
327314
jjb_jenkinsfile: '.ci/jenkins/pipeline/Jenkinsfile' # Main pipeline definition
328-
jjb_folder: 'NIXL'
329315
jjb_branch: 'main' # Default branch
330316
jjb_gh_url: 'https://github.com/ai-dynamo/nixl' # GitHub web URL
331317
jobs:

benchmark/nixlbench/src/worker/nixl/nixl_worker.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,7 @@ xferBenchNixlWorker::allocateMemory(int num_threads) {
862862
iov_lists.push_back(iov_list);
863863

864864
/* Workaround for a GUSLI registration bug which resets memory to 0 */
865-
if (seg_type == DRAM_SEG) {
865+
if (XFERBENCH_BACKEND_GUSLI == xferBenchConfig::backend && seg_type == DRAM_SEG) {
866866
for (auto &iov : iov_list) {
867867
if (isInitiator()) {
868868
memset((void *)iov.addr, XFERBENCH_INITIATOR_BUFFER_ELEMENT, buffer_size);

docs/BackendGuide.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,16 +108,16 @@ Note that getNotif does not know which agent it should look for to receive the n
108108

109109
A key underlying abstraction of the NIXL library is a descriptor list, which is made of a memory space (host/GPU/block/File/Obj-Store) and a list of descriptors. There are 2 types of descriptors used for the SB API.
110110

111-
*For transfers: (addr, len, devID, metadata), where metadata is a pointer to an nixlBackendMD object relevant to the registered memory that this descriptor falls within.
112-
*For registration, (addr, len, devID, str) where str is an optional byte-array for extra information. The table below shows the meaning of devID for different memory spaces, as well as optional meaning for File and Object-Store.
111+
* For transfers: (addr, len, devID, metadata), where metadata is a pointer to a nixlBackendMD object relevant to the registered memory that this descriptor falls within.
112+
* For registration, (addr, len, devID, str) where str is an optional byte-array for extra information. The table below shows the meaning of devID for different memory spaces, as well as optional meaning for File and Object-Store.
113113

114114
| mem type | addr | len | devID | str (byte-array) |
115115
| -------- | ------ | ---- | ------------- | -------------------------- |
116116
| DRAM | | | 0 (or region) | - |
117117
| VRAM | | | GPU ID | - |
118118
| BLK | | | Vol ID | - |
119119
| FILE | offset | Or 0 | fd | Path + (access mode) |
120-
| DRAM | offset | Or 0 | key | Extended key (+ bucket ID) |
120+
| OBJ | offset | Or 0 | key | Extended key (+ bucket ID) |
121121

122122
## Plugin Manager API
123123

@@ -165,7 +165,7 @@ In this step, if the plugin supports talking to remote agents, the required conn
165165

166166
When a connection is requested to a remote agent, which is possible if the remote agent’s metadata is already loaded, the local Agent looks for common backend plugins between itself and the remote agent, and for each of them initiates a connection by using the **connect** API in the SB API of such backends.
167167

168-
### Register (Degister) memory with NIXL:
168+
### Register (Deregister) memory with NIXL:
169169

170170
The agent will receive a list of allocated memories and the desired backend from the user, and then will give only one element at a time to the specified backend. Note that backends usually need to register the memories they will access during transfers, and based on that registration keep some metadata for that memory region. For instance, in the case of UCX, for each contiguous region of memory, it will produce some local metadata for that region. The agent will give only a single contiguous region of memory to the **register** call in the SB API, and in return gets a key (a pointer) to the metadata that the backend created for this memory region. Later on, during transfer, the agent will give the same key back to the backend, so backends do not need to do any bookkeeping of such metadata.
171171

src/infra/nixl_descriptors.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,19 +112,19 @@ nixlBlobDesc::nixlBlobDesc(const nixlBasicDesc &desc,
112112
}
113113

114114
nixlBlobDesc::nixlBlobDesc(const nixl_blob_t &blob) {
115-
size_t meta_size = blob.size() - sizeof(nixlBasicDesc);
116-
if (meta_size > 0) {
115+
if (blob.size() > sizeof(nixlBasicDesc)) {
116+
const size_t meta_size = blob.size() - sizeof(nixlBasicDesc);
117117
metaInfo.resize(meta_size);
118118
blob.copy(reinterpret_cast<char*>(this), sizeof(nixlBasicDesc));
119119
blob.copy(reinterpret_cast<char*>(&metaInfo[0]),
120120
meta_size, sizeof(nixlBasicDesc));
121-
} else if (meta_size == 0) {
121+
} else if (blob.size() == sizeof(nixlBasicDesc)) {
122122
blob.copy(reinterpret_cast<char*>(this), sizeof(nixlBasicDesc));
123123
} else { // Error
124124
addr = 0;
125125
len = 0;
126126
devId = 0;
127-
metaInfo.resize(0);
127+
metaInfo.clear();
128128
}
129129
}
130130

test/unit/plugins/ucx/ucx_backend_test.cpp

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -274,24 +274,6 @@ void *releaseValidationPtr(nixl_mem_t mem_type, void *addr)
274274
return nullptr;
275275
}
276276

277-
void
278-
allocateWrongGPUTest(nixlUcxEngine *ucx, int dev_id) {
279-
nixlBlobDesc desc = {0};
280-
nixlBackendMD* md;
281-
void* buf;
282-
283-
allocateBuffer(VRAM_SEG, dev_id, desc.len, buf);
284-
285-
desc.devId = dev_id;
286-
desc.addr = (uint64_t) buf;
287-
288-
int ret = ucx->registerMem(desc, VRAM_SEG, md);
289-
290-
nixl_exit_on_failure((ret == NIXL_ERR_NOT_SUPPORTED), "Failed to register memory", "test");
291-
292-
releaseBuffer(VRAM_SEG, dev_id, buf);
293-
}
294-
295277
void
296278
allocateAndRegister(nixlUcxEngine *ucx,
297279
int dev_id,
@@ -749,14 +731,6 @@ int main()
749731
#endif
750732
}
751733

752-
#ifdef HAVE_CUDA
753-
if (n_vram_dev > 1) {
754-
//Test if registering on a different GPU fails correctly
755-
allocateWrongGPUTest(ucx[0][0], 1);
756-
std::cout << "Verified registration on wrong GPU fails correctly\n";
757-
}
758-
#endif
759-
760734
// Deallocate UCX engines
761735
for(int i = 0; i < 2; i++) {
762736
for(int j = 0; j < 2; j++) {

0 commit comments

Comments
 (0)