# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example usage of the script to compute the hidden states for a conversation dataset.
# This script computes hidden states using a Hugging Face model and saves them to
# the specified output directory. It does so in a data-parallel manner across 8 GPUs, by
# splitting the input file into 8 parts and running 8 processes in parallel, one on each GPU.

# Note: depending on the write-throughput of the destination disk, this is not guaranteed
# to yield a speed improvement compared to running the model-parallel version. Consider
# benchmarking on a smaller dataset before launching a large run.
set -euo pipefail

# Conversations to process and destination for the per-GPU hidden-state dumps.
INPUT_FILE=synthetic_conversations/daring-anteater.jsonl
OUTPUT_DIR=/mnt/md0/eagle-hidden-states/llama1b/daring_anteater/

# Shard the input into 8 line-based parts, one per GPU. With the default
# two-character numeric suffix this produces /tmp/part-00.jsonl .. /tmp/part-07.jsonl.
split -n l/8 --numeric-suffixes=0 -d --additional-suffix=.jsonl "$INPUT_FILE" /tmp/part-

# Launch one worker per GPU; CUDA_VISIBLE_DEVICES pins each worker to its own device.
# NB: the shard filename must be a single argument — no space before ".jsonl".
pids=()
for i in {0..7}; do
  CUDA_VISIBLE_DEVICES=$i python3 collect_hidden_states/compute_hidden_states_hf.py \
    --model meta-llama/Llama-3.2-1B-Instruct \
    --input-file "/tmp/part-0${i}.jsonl" \
    --output-dir "$OUTPUT_DIR" &
  pids+=($!)
done

# Wait on each worker individually so a failing job aborts the script
# (a bare `wait` discards background-job exit codes, even under set -e).
for pid in "${pids[@]}"; do
  wait "$pid"
done

# Remove the temporary shards (suffix attached directly — no stray space).
rm -f /tmp/part-*.jsonl