
Commit e385a60

[Other] Add script for generating subgraph_dataset. (#465)
* Add script to generate subgraph datasets at different levels.
* Add deduplication step.
* Add unittest generation step.
* Update.
1 parent 6618995 commit e385a60

File tree

3 files changed: +235 additions, -113 deletions


graph_net/tools/deduplicated.py

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
"""Utility for removing duplicate sample models based on their graph hash."""

from __future__ import annotations

import argparse
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Dict, List


def collect_graph_hashs(samples_dir: Path) -> Dict[str, List[Path]]:
    """Map each graph hash to every model directory that carries it."""
    if not samples_dir.is_dir():
        raise FileNotFoundError(f"Samples directory not found: {samples_dir}")

    graph_hash2model_paths: Dict[str, List[Path]] = defaultdict(list)
    all_graph_hashs = sorted(samples_dir.rglob("graph_hash.txt"))
    for filepath in all_graph_hashs:
        model_path = filepath.parent
        graph_hash = filepath.read_text(encoding="utf-8").strip()
        graph_hash2model_paths[graph_hash].append(model_path)
    return graph_hash2model_paths


def main(args):
    # Work on a copy so the original samples stay untouched.
    print(f"Copy samples: {args.samples_dir} -> {args.target_dir}")
    shutil.copytree(args.samples_dir, args.target_dir)
    graph_hash2model_paths = collect_graph_hashs(args.target_dir)
    num_removed_samples = 0
    for graph_hash, model_paths in graph_hash2model_paths.items():
        # Keep the first sample and remove the rest.
        for idx in range(1, len(model_paths)):
            print(f"Remove {model_paths[idx]}")
            shutil.rmtree(model_paths[idx])
            num_removed_samples += 1
    print(
        f"Found {len(graph_hash2model_paths)} distinct graph hashes; "
        f"removed {num_removed_samples} duplicate samples."
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--samples-dir",
        type=Path,
        required=True,
        help="Root directory containing sample models.",
    )
    parser.add_argument(
        "--target-dir",
        type=Path,
        required=True,
        help="Directory that receives the deduplicated copy of the samples.",
    )
    args = parser.parse_args()
    main(args)
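
For reference, a minimal sketch of invoking this tool directly (the pipeline script below calls it the same way); both paths are placeholders, not values from the commit:

# Copy --samples-dir to --target-dir, then keep one model directory
# per distinct graph_hash.txt value and remove the rest.
python3 -m graph_net.tools.deduplicated \
    --samples-dir /path/to/renamed_samples \
    --target-dir /path/to/deduplicated_samples
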
Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
#!/bin/bash
set -x

OP_NUM=${1:-64}
GPU_ID=${2:-4}

export CUDA_VISIBLE_DEVICES="${GPU_ID}"
export PYTHONPATH=/work/GraphNet:/work/abstract_pass/Athena:$PYTHONPATH

GRAPH_NET_ROOT=$(python3 -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")

DECOMPOSE_WORKSPACE=/work/graphnet_test_workspace/subgraph_dataset_20251221
LEVEL_DECOMPOSE_WORKSPACE=$DECOMPOSE_WORKSPACE/decomposed_${OP_NUM}ops
OP_NAMES_OUTPUT_DIR=${DECOMPOSE_WORKSPACE}/sample_op_names
RANGE_DECOMPOSE_OUTPUT_DIR="${LEVEL_DECOMPOSE_WORKSPACE}/range_decompose"
GRAPH_VAR_RENAME_OUTPUT_DIR=$LEVEL_DECOMPOSE_WORKSPACE/graph_var_renamed
DEDUPLICATED_OUTPUT_DIR=$LEVEL_DECOMPOSE_WORKSPACE/deduplicated
UNITTESTS_OUTPUT_DIR=$LEVEL_DECOMPOSE_WORKSPACE/unittests

mkdir -p "$LEVEL_DECOMPOSE_WORKSPACE"

model_list="$GRAPH_NET_ROOT/graph_net/config/torch_samples_list.txt"
range_decomposed_subgraph_list=${LEVEL_DECOMPOSE_WORKSPACE}/range_decomposed_subgraph_sample_list.txt
deduplicated_subgraph_list=${LEVEL_DECOMPOSE_WORKSPACE}/deduplicated_subgraph_sample_list.txt

function generate_subgraph_list() {
    local target_dir="$1"
    local sample_list="$2"
    echo ">>> Generate subgraph_sample_list for samples under ${target_dir}."
    echo ">>>"
    cat $model_list \
        | grep -v '# ' \
        | xargs -I {} find ${target_dir}/{} -name "model.py" \
        | xargs dirname \
        | xargs realpath --relative-to=$target_dir \
        | tee $sample_list
}

function generate_op_names() {
    echo ">>> [1] Generate op_names.txt for samples in ${model_list}."
    echo ">>>"
    python3 -m graph_net.model_path_handler \
        --model-path-list $model_list \
        --handler-config=$(base64 -w 0 <<EOF
{
    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/typical_sequence_split_points.py",
    "handler_class_name": "OpNamesExtractor",
    "handler_config": {
        "resume": true,
        "model_path_prefix": "$GRAPH_NET_ROOT",
        "output_dir": "${OP_NAMES_OUTPUT_DIR}"
    }
}
EOF
)
}

function generate_split_point() {
    # MIN_SEQ_OPS, MAX_SEQ_OPS per level:
    #   level 1: 2, 4
    #   level 2: 4, 8
    #   level 3: 8, 16
    #   level 4: 16, 32
    #   level 5: 32, 64
    MIN_SEQ_OPS=$((${OP_NUM} / 2))
    MAX_SEQ_OPS=${OP_NUM}
    echo ">>> [2] Generate split points for samples in ${model_list}."
    echo ">>> OP_NUM: ${OP_NUM}, MIN_SEQ_OPS: ${MIN_SEQ_OPS}, MAX_SEQ_OPS: ${MAX_SEQ_OPS}"
    echo ">>>"
    python3 -m graph_net.torch.typical_sequence_split_points \
        --model-list "$model_list" \
        --op-names-path-prefix "${OP_NAMES_OUTPUT_DIR}" \
        --device "cuda" \
        --window-size ${OP_NUM} \
        --fold-policy default \
        --fold-times 16 \
        --min-seq-ops ${MIN_SEQ_OPS} \
        --max-seq-ops ${MAX_SEQ_OPS} \
        --subgraph-ranges-json "$LEVEL_DECOMPOSE_WORKSPACE/subgraph_ranges_${OP_NUM}.json" \
        --output-json "$LEVEL_DECOMPOSE_WORKSPACE/split_results_${OP_NUM}.json"
}

function range_decompose() {
    echo ">>> [3] Decompose according to split_results.json for samples in ${model_list}."
    echo ">>>"
    python3 -m graph_net.model_path_handler \
        --model-path-list "$model_list" \
        --handler-config=$(base64 -w 0 <<EOF
{
    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/graph_decomposer.py",
    "handler_class_name": "RangeDecomposerExtractor",
    "handler_config": {
        "resume": false,
        "model_path_prefix": "$GRAPH_NET_ROOT",
        "output_dir": "${RANGE_DECOMPOSE_OUTPUT_DIR}",
        "split_results_path": "$LEVEL_DECOMPOSE_WORKSPACE/split_results_${OP_NUM}.json",
        "subgraph_ranges_path": "$LEVEL_DECOMPOSE_WORKSPACE/subgraph_ranges_${OP_NUM}.json",
        "group_head_and_tail": true,
        "chain_style": false
    }
}
EOF
)
}

function rename_subgraph() {
    echo ">>> [4] Rename subgraph samples under ${RANGE_DECOMPOSE_OUTPUT_DIR}."
    echo ">>>"
    python3 -m graph_net.model_path_handler \
        --model-path-list ${range_decomposed_subgraph_list} \
        --handler-config=$(base64 -w 0 <<EOF
{
    "handler_path": "$GRAPH_NET_ROOT/graph_net/torch/graph_variable_renamer.py",
    "handler_class_name": "GraphVariableRenamer",
    "handler_config": {
        "device": "cuda",
        "resume": true,
        "model_path_prefix": "$RANGE_DECOMPOSE_OUTPUT_DIR",
        "data_input_predicator_filepath": "$GRAPH_NET_ROOT/graph_net/torch/constraint_util.py",
        "data_input_predicator_class_name": "NaiveDataInputPredicator",
        "model_runnable_predicator_filepath": "$GRAPH_NET_ROOT/graph_net/torch/constraint_util.py",
        "model_runnable_predicator_class_name": "ModelRunnablePredicator",
        "output_dir": "$GRAPH_VAR_RENAME_OUTPUT_DIR"
    }
}
EOF
)
}

function remove_duplicates() {
    echo ">>> [5] Remove duplicated subgraph samples under ${GRAPH_VAR_RENAME_OUTPUT_DIR}."
    echo ">>>"
    python3 -m graph_net.tools.deduplicated \
        --samples-dir ${GRAPH_VAR_RENAME_OUTPUT_DIR} \
        --target-dir ${DEDUPLICATED_OUTPUT_DIR}
}

function generate_unittests() {
    echo ">>> [6] Generate unittests for subgraph samples under ${DEDUPLICATED_OUTPUT_DIR}."
    echo ">>>"
    python3 -m graph_net.model_path_handler \
        --model-path-list ${deduplicated_subgraph_list} \
        --handler-config=$(base64 -w 0 <<EOF
{
    "handler_path": "$GRAPH_NET_ROOT/graph_net/sample_pass/agent_unittest_generator.py",
    "handler_class_name": "AgentUnittestGeneratorPass",
    "handler_config": {
        "framework": "torch",
        "model_path_prefix": "${DEDUPLICATED_OUTPUT_DIR}",
        "output_dir": "$UNITTESTS_OUTPUT_DIR",
        "device": "cuda",
        "generate_main": true,
        "try_run": true,
        "resume": true,
        "data_input_predicator_filepath": "$GRAPH_NET_ROOT/graph_net/torch/constraint_util.py",
        "data_input_predicator_class_name": "RenamedDataInputPredicator"
    }
}
EOF
)
}

main() {
    timestamp=$(date +%Y%m%d_%H%M)
    suffix="${OP_NUM}ops_${timestamp}"
    #generate_op_names 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_op_names_${suffix}.txt
    generate_split_point 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_split_point_${suffix}.txt
    range_decompose 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_range_decompose_${suffix}.txt

    generate_subgraph_list ${RANGE_DECOMPOSE_OUTPUT_DIR} ${range_decomposed_subgraph_list}
    rename_subgraph 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_rename_subgraph_${suffix}.txt
    remove_duplicates 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_remove_duplicates_${suffix}.txt

    generate_subgraph_list ${DEDUPLICATED_OUTPUT_DIR} ${deduplicated_subgraph_list}
    generate_unittests 2>&1 | tee ${LEVEL_DECOMPOSE_WORKSPACE}/log_generate_unittests_${suffix}.txt
}

main
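
A minimal sketch of driving the full pipeline, assuming the script is saved as generate_subgraph_dataset.sh (the actual filename is not captured on this page); the two positional arguments are OP_NUM and GPU_ID, which default to 64 and 4:

# Run the level-3 decomposition (windows of up to 16 ops) on GPU 0.
# "generate_subgraph_dataset.sh" is a hypothetical filename.
bash generate_subgraph_dataset.sh 16 0
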

tools/deduplicated.py

Lines changed: 0 additions & 113 deletions
This file was deleted.
