foundation-model-stack
diff --git a/‎.pylintrc‎
Lines changed: 1 addition & 1 deletion b/‎.pylintrc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎architecture_records/003-generic-tracker-framework.md‎
Lines changed: 1 addition & 1 deletion b/‎architecture_records/003-generic-tracker-framework.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎architecture_records/004-datapreprocessor.md‎
Lines changed: 422 additions & 0 deletions b/‎architecture_records/004-datapreprocessor.md‎
Lines changed: 422 additions & 0 deletions
diff --git a/‎tests/acceleration/test_acceleration_framework.py‎
Lines changed: 3 additions & 3 deletions b/‎tests/acceleration/test_acceleration_framework.py‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎tests/artifacts/predefined_data_configs/__init__.py‎
Lines changed: 30 additions & 0 deletions b/‎tests/artifacts/predefined_data_configs/__init__.py‎
Lines changed: 30 additions & 0 deletions
diff --git a/‎tests/artifacts/predefined_data_configs/apply_custom_template.yaml‎
Lines changed: 14 additions & 0 deletions b/‎tests/artifacts/predefined_data_configs/apply_custom_template.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎tests/artifacts/predefined_data_configs/pretokenized_json_data.yaml‎
Lines changed: 6 additions & 0 deletions b/‎tests/artifacts/predefined_data_configs/pretokenized_json_data.yaml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎tests/artifacts/predefined_data_configs/tokenize_and_apply_input_masking.yaml‎
Lines changed: 14 additions & 0 deletions b/‎tests/artifacts/predefined_data_configs/tokenize_and_apply_input_masking.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/tests/data/__init__.py b/tests/artifacts/testdata/__init__.py (tests/data/__init__.py renamed to tests/artifacts/testdata/__init__.py)
diff --git a/tests/data/empty_data.json b/tests/artifacts/testdata/empty_data.json (tests/data/empty_data.json renamed to tests/artifacts/testdata/empty_data.json)
@@ -638,7 +638,7 @@ callbacks=cb_,
 dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
 
 # Argument names that match this expression will be ignored.
-ignored-argument-names=_.*|^ignored_|^unused_
+ignored-argument-names=_.*|^ignored_|^unused_|kwargs
 
 # Tells whether we should check for unused import in __init__ files.
 init-import=no
 
@@ -54,13 +54,13 @@
 from tuning.utils.import_utils import is_fms_accelerate_available
 
 # for some reason the CI will raise an import error if we try to import
-# these from tests.data
+# these from tests.artifacts.testdata
 TWITTER_COMPLAINTS_JSON_FORMAT = os.path.join(
-    os.path.dirname(__file__), "../data/twitter_complaints_json.json"
+    os.path.dirname(__file__), "../artifacts/testdata/twitter_complaints_json.json"
 )
 TWITTER_COMPLAINTS_TOKENIZED = os.path.join(
     os.path.dirname(__file__),
-    "../data/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
+    "../artifacts/testdata/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
 )
 
 # pylint: disable=import-error
 
@@ -0,0 +1,30 @@
+# Copyright The FMS HF Tuning Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Paths to predefined data config YAML files for configuring individual unit tests.
+"""
+# Standard
+import os
+
+### Constants holding the paths of the predefined data config files
+PREDEFINED_DATA_CONFIGS = os.path.join(os.path.dirname(__file__))
+APPLY_CUSTOM_TEMPLATE_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "apply_custom_template.yaml"
+)
+PRETOKENIZE_JSON_DATA_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "pretokenized_json_data.yaml"
+)
+TOKENIZE_AND_APPLY_INPUT_MASKING_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "tokenize_and_apply_input_masking.yaml"
+)
@@ -0,0 +1,14 @@
+dataprocessor:
+    type: default
+datasets:
+  - name: apply_custom_data_template
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: apply_custom_data_formatting_template
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            dataset_text_field: "dataset_text_field"
+            dataset_template: "dataset_template"
@@ -0,0 +1,6 @@
+dataprocessor:
+    type: default
+datasets:
+  - name: pretokenized_dataset
+    data_paths:
+      - "FILE_PATH"
@@ -0,0 +1,14 @@
+dataprocessor:
+    type: default
+datasets:
+  - name: text_dataset_input_output_masking
+    data_paths:
+      - "FILE_PATH"
+    data_handlers:
+      - name: tokenize_and_apply_input_masking
+        arguments:
+          remove_columns: all
+          batched: false
+          fn_kwargs:
+            input_field: "INPUT"
+            output_field: "OUTPUT"