@@ -66,7 +66,24 @@ use super::{
66
66
LogStream , ARROW_FILE_EXTENSION ,
67
67
} ;
68
68
69
- // NOTE: this requires that custom partition values should not have special characters in their name/value
69
+ /// Regex pattern for parsing arrow file names.
70
+ ///
71
+ /// # Format
72
+ /// The expected format is: `<schema_key>.<front_part>.<file_id>.data.arrows`
73
+ /// where:
74
+ /// - schema_key: A key that is associated with the timestamp at ingestion, the hash of the arrow schema, and the key-value
75
+ /// pairs joined by '&' and '=' (e.g., "20200201T1830f8a5fc1edc567d56&key1=value1&key2=value2")
76
+ /// - front_part: Captured for parquet file naming, contains the timestamp associated with current/time-partition
77
+ /// as well as the custom partitioning key=value pairs (e.g., "date=2020-01-21.hour=10.minute=30.key1=value1.key2=value2.ee529ffc8e76")
78
+ /// - file_id: Numeric id for individual arrows files
79
+ ///
80
+ /// # Limitations
81
+ /// - Partition keys and values must only contain alphanumeric characters
82
+ /// - Special characters in partition values will cause the pattern to fail in capturing
83
+ ///
84
+ /// # Examples
85
+ /// Valid: "key1=value1&key2=value2"
86
+ /// Invalid: "key1=special!value&key2=special#value"
70
87
static ARROWS_NAME_STRUCTURE : Lazy < Regex > = Lazy :: new ( || {
71
88
// Pattern, anchored at both ends (`^...$`):
//   [a-zA-Z0-9&=]+   leading segment: only alphanumerics, '&' and '='
//   \.               literal dot separator
//   (?P<front>\S+)   named group `front`: any run of non-whitespace; greedy, so it may itself contain dots
//   \.\d+            literal dot followed by a run of digits
//   \.data\.arrows   literal ".data.arrows" suffix
// `expect` panics with "Validated regex" if the pattern ever fails to compile.
Regex :: new ( r"^[a-zA-Z0-9&=]+\.(?P<front>\S+)\.\d+\.data\.arrows$" ) . expect ( "Validated regex" )
72
89
} ) ;
@@ -97,6 +114,7 @@ pub struct Stream {
97
114
pub metadata : RwLock < LogStreamMetadata > ,
98
115
pub data_path : PathBuf ,
99
116
pub options : Arc < Options > ,
117
+ /// Writer with a 16KB buffer size for optimal I/O performance.
100
118
pub writer : Mutex < Writer < 16384 > > ,
101
119
pub ingestor_id : Option < String > ,
102
120
}
0 commit comments