
Commit a80e3dc

JonahBalshai, Javiermateor, and noahschuetz authored and committed
[SYSTEMDS-2622] NN Lib: LARS Optimizer and Examples
Closes #2287. Co-authored-by: Javiermateor <[email protected]> Co-authored-by: Noah Schuetz <[email protected]>
1 parent 41d6c9d commit a80e3dc

22 files changed: +3921 −13 lines
Lines changed: 215 additions & 0 deletions
@@ -0,0 +1,215 @@
#!/usr/bin/env python3
#-------------------------------------------------------------
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
#-------------------------------------------------------------
"""
Create pre-split data chunks from ImageNet data for SystemDS LARS training.

This script reads existing CSV data and splits it into manageable chunks
for memory-efficient training with large datasets; if no CSV data is found,
it generates dummy chunks for testing.
"""

import os
import sys
import numpy as np
import pandas as pd
from pathlib import Path

def create_binary_chunks(data_dir="imagenet_data", chunk_size=10000):
    """
    Create chunk files from existing ImageNet data.

    Args:
        data_dir: Directory containing the ImageNet data
        chunk_size: Number of samples per chunk
    """
    data_path = Path(data_dir)

    print(f"Creating chunks from data in: {data_path}")
    print(f"Chunk size: {chunk_size}")

    # Check what data we have available
    csv_train = data_path / "imagenet_train.csv"
    csv_val = data_path / "imagenet_val.csv"

    if csv_train.exists() and csv_val.exists():
        print("Found CSV files, converting to chunks...")
        create_chunks_from_csv(data_path, chunk_size)
    else:
        print("CSV files not found, creating dummy chunks for testing...")
        create_dummy_chunks(data_path, chunk_size)

def create_chunks_from_csv(data_path, chunk_size):
    """Create chunks from CSV files."""

    # Read training data
    print("Reading training CSV...")
    train_df = pd.read_csv(data_path / "imagenet_train.csv", header=None)
    print(f"Training data shape: {train_df.shape}")

    # Read validation data
    print("Reading validation CSV...")
    val_df = pd.read_csv(data_path / "imagenet_val.csv", header=None)
    print(f"Validation data shape: {val_df.shape}")

    # Split training data into chunks: column 0 holds the label,
    # the remaining columns hold the flattened pixel values
    train_labels = train_df.iloc[:, 0].values
    train_data = train_df.iloc[:, 1:].values

    # Convert to float and normalize pixel values to [0, 1]
    train_data = train_data.astype(np.float64) / 255.0

    # Number of classes for one-hot encoding (assuming 10 classes for now);
    # defined once here so both the training and validation loops can use it
    num_classes = 10

    num_train_chunks = (len(train_data) + chunk_size - 1) // chunk_size
    print(f"Creating {num_train_chunks} training chunks...")

    for i in range(num_train_chunks):
        start_idx = i * chunk_size
        end_idx = min((i + 1) * chunk_size, len(train_data))

        chunk_data = train_data[start_idx:end_idx]
        chunk_labels = train_labels[start_idx:end_idx]

        # Convert labels to one-hot
        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]

        # Save as CSV files that SystemDS can read
        chunk_num = f"{i+1:03d}"

        # Save data chunk as CSV
        data_file = data_path / f"train_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        # Save labels chunk as CSV
        labels_file = data_path / f"train_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)

        print(f"  Chunk {chunk_num}: {chunk_data.shape[0]} samples")

    # Process validation data (typically smaller, so fewer chunks)
    val_labels = val_df.iloc[:, 0].values
    val_data = val_df.iloc[:, 1:].values
    val_data = val_data.astype(np.float64) / 255.0

    val_chunk_size = min(chunk_size, len(val_data))
    num_val_chunks = (len(val_data) + val_chunk_size - 1) // val_chunk_size
    print(f"Creating {num_val_chunks} validation chunks...")

    for i in range(num_val_chunks):
        start_idx = i * val_chunk_size
        end_idx = min((i + 1) * val_chunk_size, len(val_data))

        chunk_data = val_data[start_idx:end_idx]
        chunk_labels = val_labels[start_idx:end_idx]

        # Convert labels to one-hot
        chunk_labels_onehot = np.eye(num_classes)[chunk_labels]

        chunk_num = f"{i+1:03d}"

        # Save data chunk as CSV
        data_file = data_path / f"val_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        # Save labels chunk as CSV
        labels_file = data_path / f"val_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot).to_csv(labels_file, header=False, index=False)

        print(f"  Val chunk {chunk_num}: {chunk_data.shape[0]} samples")

def create_dummy_chunks(data_path, chunk_size):
    """Create dummy chunks for testing when real data isn't available."""
    print("Creating dummy data chunks for testing...")

    # ImageNet-like dimensions
    img_height, img_width, channels = 224, 224, 3
    num_features = img_height * img_width * channels
    num_classes = 10

    # Create training chunks
    num_train_samples = chunk_size * 2  # Create 2 chunks for demo

    print(f"Generating {num_train_samples} dummy training samples...")
    train_data = np.random.rand(num_train_samples, num_features).astype(np.float64)
    train_labels = np.random.randint(0, num_classes, num_train_samples)
    train_labels_onehot = np.eye(num_classes)[train_labels]

    # Split into chunks
    for i in range(2):  # 2 training chunks
        start_idx = i * chunk_size
        end_idx = (i + 1) * chunk_size

        chunk_data = train_data[start_idx:end_idx]
        chunk_labels_onehot_chunk = train_labels_onehot[start_idx:end_idx]

        chunk_num = f"{i+1:03d}"

        # Save chunks as CSV
        data_file = data_path / f"train_chunk_{chunk_num}.csv"
        pd.DataFrame(chunk_data).to_csv(data_file, header=False, index=False)

        labels_file = data_path / f"train_labels_{chunk_num}.csv"
        pd.DataFrame(chunk_labels_onehot_chunk).to_csv(labels_file, header=False, index=False)

        print(f"  Created train chunk {chunk_num}: {chunk_data.shape}")

    # Create validation chunk
    num_val_samples = min(chunk_size, 5000)  # Smaller validation set
    print(f"Generating {num_val_samples} dummy validation samples...")

    val_data = np.random.rand(num_val_samples, num_features).astype(np.float64)
    val_labels = np.random.randint(0, num_classes, num_val_samples)
    val_labels_onehot = np.eye(num_classes)[val_labels]

    # Save validation chunk as CSV
    data_file = data_path / "val_chunk_001.csv"
    pd.DataFrame(val_data).to_csv(data_file, header=False, index=False)

    labels_file = data_path / "val_labels_001.csv"
    pd.DataFrame(val_labels_onehot).to_csv(labels_file, header=False, index=False)

    print(f"  Created val chunk 001: {val_data.shape}")

def main():
    """Main execution."""
    data_dir = "imagenet_data"
    chunk_size = 10000

    if len(sys.argv) > 1:
        data_dir = sys.argv[1]
    if len(sys.argv) > 2:
        chunk_size = int(sys.argv[2])

    # Create data directory if it doesn't exist
    os.makedirs(data_dir, exist_ok=True)

    create_binary_chunks(data_dir, chunk_size)

    print("\n✅ Chunk creation completed!")
    print(f"Chunks saved in: {data_dir}/")
    print("Files created:")

    data_path = Path(data_dir)
    # List the generated chunk files (the script writes .csv, not .bin)
    for file in sorted(data_path.glob("*_chunk_*.csv")):
        size_mb = file.stat().st_size / (1024 * 1024)
        print(f"  {file.name} ({size_mb:.1f} MB)")

if __name__ == "__main__":
    main()
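
For reference, a minimal sanity check of the generated chunks (not part of this commit; it assumes the script above has already been run with its default data_dir "imagenet_data", passing data_dir and chunk_size as the optional CLI arguments): load the first training chunk back and verify that the features are normalized and the labels are valid one-hot rows, matching what create_chunks_from_csv writes.

    # Hypothetical sanity check; file names match what the script writes,
    # but it must have been run first so the chunk files exist.
    import numpy as np
    import pandas as pd

    X = pd.read_csv("imagenet_data/train_chunk_001.csv", header=None).values
    Y = pd.read_csv("imagenet_data/train_labels_001.csv", header=None).values

    assert X.shape[0] == Y.shape[0]            # one label row per sample
    assert Y.shape[1] == 10                    # one-hot over 10 classes
    assert np.allclose(Y.sum(axis=1), 1.0)     # each label row is a valid one-hot vector
    assert 0.0 <= X.min() and X.max() <= 1.0   # pixel values normalized to [0, 1]
    print(f"Chunk OK: {X.shape[0]} samples, {X.shape[1]} features")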
