|
| 1 | +#!/usr/bin/env python |
| 2 | +""" |
| 3 | +File Generator Script using DataChain Delta |
| 4 | +
|
| 5 | +This script demonstrates: |
| 6 | +1. Creating numbered text files in a 'test' directory |
| 7 | +2. Using DataChain's delta flag for incremental dataset processing |
| 8 | +
|
| 9 | +Each execution: |
| 10 | +- Creates a new numbered file in the 'test' directory |
| 11 | +- Updates a DataChain dataset to track these files incrementally |
| 12 | +""" |
| 13 | + |
| 14 | +import re |
| 15 | +import time |
| 16 | + |
| 17 | +from utils import generate_next_file |
| 18 | + |
| 19 | +import datachain as dc |
| 20 | +from datachain import C, File |
| 21 | + |
| 22 | + |
def extract_file_number(file: File) -> int:
    """Return the number embedded in a ``file-<N>.txt`` name.

    The pattern is searched anywhere inside ``file.name``. When it does not
    occur, the sentinel ``-1`` is returned instead of raising.
    """
    found = re.search(r"file-(\d+)\.txt", file.name)
    return -1 if found is None else int(found.group(1))
| 29 | + |
| 30 | + |
def process_files_with_delta():
    """
    Run the delta-enabled DataChain pipeline over the 'test' directory.

    Reads "test/" with ``delta=True`` keyed on ``file.path``, keeps the
    ``*.txt`` entries, adds the ``file_number``, ``content`` and
    ``processed_at`` signals, and saves the result as the "test_files"
    dataset. It then prints the record count, the dataset versions, and the
    three records with the highest ``file_number``.
    """
    # Stage 1: delta-aware listing of the storage, restricted to text files.
    source = dc.read_storage(
        "test/", update=True, delta=True, delta_on="file.path"
    )
    text_files = source.filter(C("file.path").glob("*.txt"))

    # Stage 2: derive the extra signals, then persist as a dataset.
    enriched = (
        text_files.map(file_number=extract_file_number)
        .map(content=lambda file: file.read_text())
        .map(processed_at=lambda: time.strftime("%Y-%m-%d %H:%M:%S"))
    )
    saved = enriched.save(name="test_files")

    # Report the size of the saved dataset and every version recorded for it.
    total = saved.count()
    print(f"\nProcessed files. Total records: {total}")
    print("\nDataset versions:")
    versions = dc.datasets().filter(C("name") == "test_files").collect("version")
    for ver in versions:
        print(f"- Version: {ver}")

    # Display the three highest-numbered records.
    print("\nLatest files processed:")
    newest = saved.order_by("file_number", descending=True).limit(3)
    newest.show()
| 56 | + |
| 57 | + |
if __name__ == "__main__":
    # Create the next file via the utils helper and report what it returned.
    print(f"Created new file: {generate_next_file()}")

    # Run the delta pipeline over the test directory.
    process_files_with_delta()
0 commit comments