#!/usr/bin/env python3
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
22+ """
23+ Create pre-split binary chunks from ImageNet data for SystemDS LARS training.
24+
25+ This script reads existing CSV or binary data and splits it into manageable chunks
26+ for memory-efficient training with large datasets.
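
Usage (both arguments are optional; defaults are "imagenet_data" and 10000):
    python <this_script.py> [data_dir] [chunk_size]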
27+ """
28+
29+ import os
30+ import sys
31+ import numpy as np
32+ import pandas as pd
33+ from pathlib import Path
34+
def create_binary_chunks(data_dir="imagenet_data", chunk_size=10000):
    """
    Create binary chunk files from existing ImageNet data.

    Args:
        data_dir: Directory containing the ImageNet data
        chunk_size: Number of samples per chunk
    """
    data_path = Path(data_dir)

    print(f"Creating binary chunks from data in: {data_path}")
    print(f"Chunk size: {chunk_size}")

    # Check what data we have available
    csv_train = data_path / "imagenet_train.csv"
    csv_val = data_path / "imagenet_val.csv"

    if csv_train.exists() and csv_val.exists():
        print("Found CSV files, converting to binary chunks...")
        create_chunks_from_csv(data_path, chunk_size)
    else:
        print("CSV files not found, creating dummy chunks for testing...")
        create_dummy_chunks(data_path, chunk_size)

def create_chunks_from_csv(data_path, chunk_size):
    """Create chunks from CSV files."""

    # Read training data
    print("Reading training CSV...")
    train_df = pd.read_csv(data_path / "imagenet_train.csv", header=None)
    print(f"Training data shape: {train_df.shape}")

    # Read validation data
    print("Reading validation CSV...")
    val_df = pd.read_csv(data_path / "imagenet_val.csv", header=None)
    print(f"Validation data shape: {val_df.shape}")

    # Split training data into chunks: the first column holds the integer
    # class label, the remaining columns hold the pixel features
    train_labels = train_df.iloc[:, 0].values
    train_data = train_df.iloc[:, 1:].values

    # Convert to float and normalize pixel values to [0, 1]
    train_data = train_data.astype(np.float64) / 255.0

    # Number of classes (assumed 10 for now); defined once here so that both
    # the training and validation loops below can use it
    num_classes = 10

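    # Ceiling division: e.g. 25,000 samples with chunk_size 10,000 gives
    # (25000 + 9999) // 10000 = 3 chunks of 10,000, 10,000, and 5,000 samples.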
    num_train_chunks = (len(train_data) + chunk_size - 1) // chunk_size
    print(f"Creating {num_train_chunks} training chunks...")

    for i in range(num_train_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(train_data))

        chunk_data = train_data[start_idx:end_idx]
        chunk_labels = train_labels[start_idx:end_idx]

        # Convert integer labels to one-hot rows
        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]
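        # np.eye(num_classes)[labels] selects one identity-matrix row per
        # label, e.g. label 3 -> [0, 0, 0, 1, 0, 0, 0, 0, 0, 0].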

        # Save chunks in a format SystemDS can read (currently headerless CSV)
        chunk_num = f"{i + 1:03d}"

        # Save data chunk as CSV
        data_file = data_path / f"train_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        # Save labels chunk as CSV
        labels_file = data_path / f"train_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)

        print(f"  Chunk {chunk_num}: {chunk_data.shape[0]} samples")

    # Process validation data (typically smaller, so fewer chunks)
    val_labels = val_df.iloc[:, 0].values
    val_data = val_df.iloc[:, 1:].values
    val_data = val_data.astype(np.float64) / 255.0

    val_chunk_size = min(chunk_size, len(val_data))
    num_val_chunks = (len(val_data) + val_chunk_size - 1) // val_chunk_size
    print(f"Creating {num_val_chunks} validation chunks...")

    for i in range(num_val_chunks):
        start_idx = i * val_chunk_size
        end_idx = min((i + 1) * val_chunk_size, len(val_data))

        chunk_data = val_data[start_idx:end_idx]
        chunk_labels = val_labels[start_idx:end_idx]

        # Convert labels to one-hot
        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]

        chunk_num = f"{i + 1:03d}"

        # Save data chunk as CSV
        data_file = data_path / f"val_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        # Save labels chunk as CSV
        labels_file = data_path / f"val_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)

        print(f"  Val chunk {chunk_num}: {chunk_data.shape[0]} samples")

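# For reference, a SystemDS DML script could load one of these chunks with a
# plain CSV read. A minimal sketch (paths and variable names are illustrative,
# not part of this repo):
#
#     X = read("imagenet_data/train_chunk_001.csv", format="csv");
#     y = read("imagenet_data/train_labels_001.csv", format="csv");
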
def create_dummy_chunks(data_path, chunk_size):
    """Create dummy chunks for testing when real data isn't available."""
    print("Creating dummy data chunks for testing...")

    # ImageNet-like dimensions
    img_height, img_width, channels = 224, 224, 3
    num_features = img_height * img_width * channels
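    # 224 * 224 * 3 = 150,528 features per flattened image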
    num_classes = 10

    # Create training chunks
    num_train_samples = chunk_size * 2  # Create 2 chunks for demo
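    # Note: at these dimensions, float64 dummy data takes roughly 1.2 GB of
    # RAM per 1,000 samples (1000 * 150528 * 8 bytes), so the default
    # chunk_size of 10000 allocates about 24 GB here; pass a smaller
    # chunk_size for lightweight testing.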

    print(f"Generating {num_train_samples} dummy training samples...")
    train_data = np.random.rand(num_train_samples, num_features).astype(np.float64)
    train_labels = np.random.randint(0, num_classes, num_train_samples)
    train_labels_onehot = np.eye(num_classes)[train_labels]

    # Split into chunks
    for i in range(2):  # 2 training chunks
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size

        chunk_data = train_data[start_idx:end_idx]
        chunk_labels_onehot_chunk = train_labels_onehot[start_idx:end_idx]

        chunk_num = f"{i + 1:03d}"

        # Save chunks as CSV
        data_file = data_path / f"train_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        labels_file = data_path / f"train_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot_chunk).to_csv(labels_file, header=False, index=False)

        print(f"  Created train chunk {chunk_num}: {chunk_data.shape}")

    # Create validation chunk
    num_val_samples = min(chunk_size, 5000)  # Smaller validation set
    print(f"Generating {num_val_samples} dummy validation samples...")

    val_data = np.random.rand(num_val_samples, num_features).astype(np.float64)
    val_labels = np.random.randint(0, num_classes, num_val_samples)
    val_labels_onehot = np.eye(num_classes)[val_labels]

    # Save validation chunk as CSV
    data_file = data_path / "val_chunk_001.csv"
    pd.DataFrame(val_data).to_csv(data_file, header=False, index=False)

    labels_file = data_path / "val_labels_001.csv"
    pd.DataFrame(val_labels_onehot).to_csv(labels_file, header=False, index=False)

    print(f"  Created val chunk 001: {val_data.shape}")
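
# Dummy mode produces six files in data_dir: train_chunk_001.csv,
# train_chunk_002.csv, train_labels_001.csv, train_labels_002.csv,
# val_chunk_001.csv, and val_labels_001.csv.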

def main():
    """Main execution."""
    data_dir = "imagenet_data"
    chunk_size = 10000

    if len(sys.argv) > 1:
        data_dir = sys.argv[1]
    if len(sys.argv) > 2:
        chunk_size = int(sys.argv[2])

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    create_binary_chunks(data_dir, chunk_size)

    print("\n✅ Binary chunk creation completed!")
    print(f"Chunks saved in: {data_dir}/")
    print("Files created:")

    data_path = Path(data_dir)
    # List the generated chunk files (written as .csv, not .bin)
    for file in sorted(data_path.glob("*_chunk_*.csv")):
        size_mb = file.stat().st_size / (1024 * 1024)
        print(f"  {file.name} ({size_mb:.1f} MB)")


if __name__ == "__main__":
    main()