Skip to content

Commit ccd4741

Browse files
authored
Adding custom dataset file (meta-llama#659)
1 parent 778e31e commit ccd4741

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
import importlib
2+
from pathlib import Path
3+
4+
def load_module_from_py_file(py_file: str) -> object:
    """
    Load and execute a Python source file that is not on the Python path.

    The module is named after the file's base name (including the ``.py``
    suffix) and is not registered in ``sys.modules``; the caller receives
    the only reference to it.

    Args:
        py_file: Path to the Python source file to load.

    Returns:
        The module object after its top-level code has been executed.

    Raises:
        Any exception raised while reading or executing the file is
        propagated unchanged.
    """
    # A bare ``import importlib`` does not guarantee that the ``machinery``
    # and ``util`` submodules are loaded; import them explicitly so the
    # attribute lookups below cannot fail with AttributeError.
    import importlib.machinery
    import importlib.util

    module_name = Path(py_file).name
    loader = importlib.machinery.SourceFileLoader(module_name, py_file)
    spec = importlib.util.spec_from_loader(module_name, loader)
    module = importlib.util.module_from_spec(spec)

    # Run the file's top-level code inside the freshly created module.
    loader.exec_module(module)

    return module
16+
17+
18+
def get_custom_dataset(dataset_config, tokenizer, split: str):
    """
    Build a dataset by calling a factory function from a user-supplied file.

    ``dataset_config.file`` is either ``"path/to/file.py"`` (the function
    ``get_custom_dataset`` inside that file is used) or
    ``"path/to/file.py:func_name"`` to select a different function.

    Args:
        dataset_config: Config object; its ``file`` attribute is read here
            and the whole object is forwarded to the factory function.
        tokenizer: Forwarded unchanged to the factory function.
        split: Forwarded unchanged to the factory function.

    Returns:
        Whatever the selected factory function returns.

    Raises:
        ValueError: If the path part does not end in ``.py``.
        FileNotFoundError: If the path does not point to an existing file.
        AttributeError: If the file does not define the requested function.
    """
    file_spec = dataset_config.file

    # A spec that already ends in ".py" carries no ":func" suffix, even if
    # the path itself contains a colon (e.g. a Windows drive letter).
    # Otherwise only the LAST colon separates the path from the function
    # name, so extra colons inside the path no longer break the unpacking.
    if file_spec.endswith(".py") or ":" not in file_spec:
        module_path, func_name = file_spec, "get_custom_dataset"
    else:
        module_path, func_name = file_spec.rsplit(":", 1)

    if not module_path.endswith(".py"):
        raise ValueError(f"Dataset file {module_path} is not a .py file.")

    module_path = Path(module_path)
    if not module_path.is_file():
        raise FileNotFoundError(f"Dataset py file {module_path.as_posix()} does not exist or is not a file.")

    module = load_module_from_py_file(module_path.as_posix())

    # Guard only the lookup: an AttributeError raised inside the user's
    # function must not be misreported as a missing function name.
    try:
        dataset_factory = getattr(module, func_name)
    except AttributeError:
        print(f"It seems like the given method name ({func_name}) is not present in the dataset .py file ({module_path.as_posix()}).")
        raise

    return dataset_factory(dataset_config, tokenizer, split)
37+

0 commit comments

Comments
 (0)