# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Example usage of the script to compute the hidden states for a conversation dataset.
# This script computes hidden states using a Hugging Face model and saves them to
# the specified output directory. It does so in a data-parallel manner across 8 GPUs, by splitting
# the input file into 8 parts and running 8 processes in parallel, one on each GPU.

# Note: depending on the write-throughput of the destination disk, this is not guaranteed
# to yield a speed improvement compared to running the model-parallel version. Consider
# benchmarking on a smaller dataset before launching a large run.

# Data-parallel driver: split INPUT_FILE into NUM_GPUS line-aligned shards,
# run one compute_hidden_states_hf.py process per shard (worker i sees only
# GPU i via CUDA_VISIBLE_DEVICES), wait for all of them, and exit non-zero if
# any worker failed. Requires GNU coreutils (split -n l/N, mktemp -d).

# Abort on any failing command or use of an unset variable, so that a failed
# split never leads to workers being launched on missing shards.
set -eu

# All three settings may be overridden from the environment; the defaults
# reproduce the example configuration.
INPUT_FILE=${INPUT_FILE:-synthetic_conversations/daring-anteater.jsonl}
OUTPUT_DIR=${OUTPUT_DIR:-/mnt/md0/eagle-hidden-states/llama1b/daring_anteater/}
NUM_GPUS=${NUM_GPUS:-8}

case "$NUM_GPUS" in
  '' | *[!0-9]* | 0)
    printf 'error: NUM_GPUS must be a positive integer, got "%s"\n' "$NUM_GPUS" >&2
    exit 2
    ;;
esac

if [ ! -f "$INPUT_FILE" ]; then
  printf 'error: input file not found: %s\n' "$INPUT_FILE" >&2
  exit 1
fi

# Space-separated PIDs of the background workers; read by cleanup().
WORKER_PIDS=

# Shards live in a private, unpredictable directory instead of fixed names
# directly under /tmp, so concurrent runs cannot clobber each other and the
# final removal cannot touch unrelated /tmp/part-* files.
TMP_DIR=$(mktemp -d "${TMPDIR:-/tmp}/hidden-states-parts.XXXXXX")

# Runs on every exit path: stops workers that are still alive (relevant when
# the script is interrupted) and removes the temporary shards.
cleanup() {
  if [ -n "$WORKER_PIDS" ]; then
    # Word splitting of the PID list is intentional here.
    # shellcheck disable=SC2086
    kill $WORKER_PIDS 2>/dev/null || true
  fi
  rm -rf -- "$TMP_DIR"
}
trap cleanup EXIT
# Turn signals into a normal exit so the EXIT trap above fires.
trap 'exit 130' INT
trap 'exit 143' TERM

# "l/N" splits into N chunks without breaking lines, which keeps every JSONL
# record intact. Output files are $TMP_DIR/part-00.jsonl, part-01.jsonl, ...
split -n "l/${NUM_GPUS}" --numeric-suffixes=0 --additional-suffix=.jsonl \
  -- "$INPUT_FILE" "$TMP_DIR/part-"

# Launch one worker per shard. The glob expands in lexical order, which for
# fixed-width numeric suffixes is shard order, so shard k runs on GPU k.
gpu=0
for part in "$TMP_DIR"/part-*.jsonl; do
  CUDA_VISIBLE_DEVICES=$gpu python3 collect_hidden_states/compute_hidden_states_hf.py \
    --model meta-llama/Llama-3.2-1B-Instruct \
    --input-file "$part" \
    --output-dir "$OUTPUT_DIR" &
  WORKER_PIDS="$WORKER_PIDS $!"
  gpu=$((gpu + 1))
done

# Wait on each PID individually: a bare `wait` discards the workers' exit
# codes, which would make the script report success after a failed run.
status=0
for pid in $WORKER_PIDS; do
  if ! wait "$pid"; then
    printf 'error: worker with pid %s failed\n' "$pid" >&2
    status=1
  fi
done
# Every worker has been reaped; nothing is left for cleanup() to kill.
WORKER_PIDS=

exit "$status"