Skip to content

Commit 275bc90

Browse files
committed
add functional test
Signed-off-by: Alexandros Koumparoulis <akoumparouli@nvidia.com>
1 parent 5ad3ea4 commit 275bc90

File tree

1 file changed

+61
-0
lines changed

1 file changed

+61
-0
lines changed
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import os
16+
from pathlib import Path
17+
import json
18+
import pytest
19+
20+
from nemo_automodel.components.datasets.llm import retrieval_dataset_inline as rdi
21+
22+
def load_jsonl_one_line(path):
    """Return the first record of a JSON Lines file, wrapped in a list.

    Only the first line of the file is read and decoded; any further lines
    are ignored.

    Args:
        path: Location of the JSONL file to read.

    Returns:
        A one-element list holding the decoded first line, or an empty list
        when the file contains no lines at all.

    Raises:
        json.JSONDecodeError: If the first line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(path, "r", encoding="utf-8") as f:
        first_line = next(f, None)
    if first_line is None:
        return []
    return [json.loads(first_line)]
29+
30+
def _embedding_testdata_training_file() -> Path:
    """Return the path of the embedding training JSONL under ``TEST_DATA_DIR``.

    Raises:
        KeyError: If the ``TEST_DATA_DIR`` environment variable is not set.
    """
    data_root = Path(os.environ["TEST_DATA_DIR"])
    return data_root.joinpath("embedding_testdata", "training.jsonl")
32+
33+
34+
def test_retrieval_dataset_inline_embedding_testdata_smoke():
    """Smoke-test ``make_retrieval_dataset`` on the embedding test data.

    Builds a one-sample training dataset from ``training.jsonl`` with two
    passages per example, then checks that the first example has the expected
    field types and matches the first record of the raw JSONL file.

    The test is skipped when the data cannot be located: either the
    ``TEST_DATA_DIR`` environment variable is unset or the file is missing.
    """
    # An unset TEST_DATA_DIR means the test data is unavailable; skip just as
    # for a missing file instead of erroring out with a KeyError.
    try:
        data_file = _embedding_testdata_training_file()
    except KeyError:
        pytest.skip("TEST_DATA_DIR is not set; embedding test data unavailable")
    if not data_file.exists():
        pytest.skip(f"Missing embedding test data file: {data_file}")

    ds = rdi.make_retrieval_dataset(
        data_dir_list=str(data_file),
        data_type="train",
        train_n_passages=2,  # 1 positive + 1 negative
        do_shuffle=False,
        max_train_samples=1,
    )

    assert len(ds) >= 1

    ex = ds[0]
    assert isinstance(ex.get("question"), str) and ex["question"]
    assert isinstance(ex.get("doc_text"), list) and len(ex["doc_text"]) == 2
    assert isinstance(ex["doc_text"][0], str)
    assert isinstance(ex.get("doc_image"), list) and len(ex["doc_image"]) == 2
    assert isinstance(ex.get("query_instruction"), str)
    assert isinstance(ex.get("passage_instruction"), str)

    # Shuffling is disabled, so the first example is compared directly with
    # the first raw record of the file.
    payload = load_jsonl_one_line(data_file)
    assert ex["doc_text"][0] == payload[0]["pos_doc"]
    assert ex["doc_text"][1] == payload[0]["neg_doc"][0]
    assert ex["question"] == payload[0]["query"]
61+

0 commit comments

Comments
 (0)