23
23
import torch
24
24
25
25
from lightning .data .processing .readers import BaseReader
26
- from lightning .data .streaming .constants import _TORCH_GREATER_EQUAL_2_1_0
26
+ from lightning .data .streaming .constants import _IS_IN_STUDIO , _TORCH_GREATER_EQUAL_2_1_0
27
27
from lightning .data .streaming .data_processor import DataChunkRecipe , DataProcessor , DataTransformRecipe
28
28
from lightning .data .streaming .resolver import (
29
29
Dir ,
@@ -169,8 +169,8 @@ def map(
169
169
output_dir: The folder where the processed data should be stored.
170
170
num_workers: The number of workers to use during processing
171
171
fast_dev_run: Whether to process only a subset of the inputs
172
- num_nodes: When doing remote execution, the number of nodes to use.
173
- machine: When doing remote execution, the machine to use.
172
+ num_nodes: When doing remote execution, the number of nodes to use. Only supported on https://lightning.ai/.
173
+ machine: When doing remote execution, the machine to use. Only supported on https://lightning.ai/.
174
174
num_downloaders: The number of downloaders per worker.
175
175
reorder_files: By default, reorders the files by file size to distribute work equally among all workers.
176
176
Set this to ``False`` if the order in which samples are processed should be preserved.
@@ -183,6 +183,17 @@ def map(
183
183
if len (inputs ) == 0 :
184
184
raise ValueError (f"The provided inputs should be non empty. Found { inputs } ." )
185
185
186
+ if not _IS_IN_STUDIO and (machine is not None or num_nodes is not None ):
187
+ raise ValueError (
188
+ "Only https://lightning.ai/ supports multiple nodes or selecting a machine."
189
+ " Create an account to try it out." )
190
+
191
+ if not _IS_IN_STUDIO :
192
+ print (
193
+ "Create an account on https://lightning.ai/ to transform your data faster using "
194
+ "multiple nodes and large machines."
195
+ )
196
+
186
197
if num_nodes is None or int (os .getenv ("DATA_OPTIMIZER_NUM_NODES" , 0 )) > 0 :
187
198
_output_dir : Dir = _resolve_dir (output_dir )
188
199
@@ -242,8 +253,8 @@ def optimize(
242
253
compression: The compression algorithm to use over the chunks.
243
254
num_workers: The number of workers to use during processing
244
255
fast_dev_run: Whether to process only a subset of the inputs
245
- num_nodes: When doing remote execution, the number of nodes to use.
246
- machine: When doing remote execution, the machine to use.
256
+ num_nodes: When doing remote execution, the number of nodes to use. Only supported on https://lightning.ai/.
257
+ machine: When doing remote execution, the machine to use. Only supported on https://lightning.ai/.
247
258
num_downloaders: The number of downloaders per worker.
248
259
reorder_files: By default, reorders the files by file size to distribute work equally among all workers.
249
260
Set this to ``False`` if the order in which samples are processed should be preserved.
@@ -258,6 +269,18 @@ def optimize(
258
269
if chunk_size is None and chunk_bytes is None :
259
270
raise ValueError ("Either `chunk_size` or `chunk_bytes` needs to be defined." )
260
271
272
+ if not _IS_IN_STUDIO and (machine is not None or num_nodes is not None ):
273
+ raise ValueError (
274
+ "Only https://lightning.ai/ supports multiple nodes or selecting a machine."
275
+ " Create an account to try it out."
276
+ )
277
+
278
+ if not _IS_IN_STUDIO :
279
+ print (
280
+ "Create an account on https://lightning.ai/ to optimize your data faster "
281
+ "using multiple nodes and large machines."
282
+ )
283
+
261
284
if num_nodes is None or int (os .getenv ("DATA_OPTIMIZER_NUM_NODES" , 0 )) > 0 :
262
285
_output_dir : Dir = _resolve_dir (output_dir )
263
286
@@ -312,6 +335,9 @@ def __init__(self, folder: str, max_workers: Optional[int] = os.cpu_count()) ->
312
335
self .max_workers = max_workers or 1
313
336
self .futures : List [concurrent .futures .Future ] = []
314
337
338
+ if not _IS_IN_STUDIO :
339
+ print ("This method is optimized to run on https://lightning.ai/. Don't use it otherwise." )
340
+
315
341
def __iter__ (self ) -> Any :
316
342
"""This function queues the folders to perform listdir across multiple workers."""
317
343
with concurrent .futures .ThreadPoolExecutor (max_workers = self .max_workers ) as executor :
0 commit comments