5151 dataset . columns . find_by ( name : target ) . update ( is_target : true )
5252 dataset . columns . where ( name : hidden_cols ) . update_all ( hidden : true )
5353 dataset . columns . find_by ( name : "SibSp" ) . update ( preprocessing_steps : {
54- training : { method : :median } ,
55- } )
54+ training : { method : :median } ,
55+ } )
5656 dataset . columns . find_by ( name : "Parch" ) . update ( preprocessing_steps : {
57- training : { method : :median } ,
58- } )
57+ training : { method : :median } ,
58+ } )
5959 dataset . columns . find_by ( name : "Sex" ) . update ( preprocessing_steps : {
60- training : {
61- method : :most_frequent ,
62- encoding : :one_hot ,
63- params : {
64- categorical_min : 2 ,
65- } ,
66- } ,
67- } )
60+ training : {
61+ method : :most_frequent ,
62+ encoding : :one_hot ,
63+ params : {
64+ categorical_min : 2 ,
65+ } ,
66+ } ,
67+ } )
6868 dataset . columns . find_by ( name : "Embarked" ) . update ( preprocessing_steps : {
69- training : {
70- method : :most_frequent ,
71- encoding : :one_hot ,
72- params : {
73- categorical_min : 2 ,
74- } ,
75- } ,
76- } )
69+ training : {
70+ method : :most_frequent ,
71+ encoding : :one_hot ,
72+ params : {
73+ categorical_min : 2 ,
74+ } ,
75+ } ,
76+ } )
7777 dataset . columns . find_by ( name : "Age" ) . update ( preprocessing_steps : {
78- training : {
79- method : :median ,
80- } ,
81- } )
78+ training : {
79+ method : :median ,
80+ } ,
81+ } )
8282 dataset . refresh
8383 end
8484 end
@@ -131,15 +131,97 @@ def incr_time
131131 @time += 1 . second
132132 end
133133
# Returns +dir+ with the leading Rails.root portion removed, so path
# expectations in this spec do not depend on where the app is checked out.
#
# Only a leading occurrence of the root is stripped. A path that merely
# contains the root string further along, or that does not live under the
# root at all, is returned unchanged rather than being cut at the last match.
#
# @param dir [#to_s] absolute directory path
# @return [String] the path relative to Rails.root (leading "/" kept);
#   an empty string when +dir+ is the root itself
def relative_dir(dir)
  dir.to_s.delete_prefix(Rails.root.to_s)
end
137+
134138 describe "#deploy" do
135- it "uses deployed version for prediction" do
# Walks a model through two train/deploy cycles with the clock frozen on
# three consecutive days, and checks that each deployed version keeps its own
# timestamped dataset directory while the live dataset moves on to a new one.
it "maintains dataset directory structure and versioning" do
  @t1 = EasyML::Support::EST.parse("2025-01-01").beginning_of_day
  Timecop.freeze(@t1)

  mock_s3_upload
  model.save
  model.unlock!

  @t2 = EasyML::Support::EST.parse("2025-01-02").beginning_of_day
  Timecop.freeze(@t2)

  model.train(async: false)
  model.deploy(async: false)
  model_v1 = model.current_version

  # Verify initial dataset structure: the deployed snapshot lives under the
  # day-1 timestamp (when the model was saved), not the day-2 deploy time.
  expect(Dir.exist?(model_v1.dataset.raw.dir)).to be true
  expect(relative_dir(model_v1.dataset.raw.dir)).to eq("/easy_ml/datasets/titanic_dataset/2025_01_01_00_00_00/files/splits/raw")

  expect(Dir.exist?(File.join(model_v1.dataset.dir, "features"))).to be true

  feature_files = model_v1.dataset.features.find_by(name: "Family Size").files
  expect(feature_files.count).to be > 0
  feature_files.each do |feature_file|
    dir = File.dirname(feature_file)
    expect(relative_dir(dir)).to eq("/easy_ml/datasets/titanic_dataset/2025_01_01_00_00_00/features/family_size/compacted")
  end

  # The live dataset has been bumped to a new (day-2) version, so it doesn't
  # conflict with the deployed version's files.
  expect(relative_dir(model.dataset.raw.dir)).to match(%r{/easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/files/splits/raw})
  feature_files = model.dataset.features.find_by(name: "Family Size").files
  expect(feature_files.count).to be > 0
  feature_files.each do |feature_file|
    dir = File.dirname(feature_file)
    expect(relative_dir(dir)).to match(%r{/easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/features/family_size/compacted})
  end

  # Make changes that require a new version
  model.dataset.columns.where(name: "Age").update_all(hidden: true)
  model.dataset.refresh

  @t3 = EasyML::Support::EST.parse("2025-01-03").beginning_of_day
  Timecop.freeze(@t3)

  model.train(async: false)
  model.deploy(async: false)
  model_v2 = model.current_version

  # Verify new version structure
  expect(Dir.exist?(model_v2.dataset.raw.dir)).to be true
  expect(Dir.exist?(File.join(model_v2.dataset.dir, "features"))).to be true

  # Verify old version files were copied to new version
  old_files = Dir.glob(File.join(model_v1.dataset.raw.dir, "**/*")).select { |f| File.file?(f) }
  new_files = Dir.glob(File.join(model_v2.dataset.raw.dir, "**/*")).select { |f| File.file?(f) }
  expect(old_files.count).to be > 0
  expect(new_files.count).to be >= old_files.count

  # Test which files are queried: each dataset version must read feature
  # files from its own timestamped directory.
  #
  # When using original feature (from v1 model)
  feature = model_v1.dataset.features.find_by(name: "Family Size")
  file_pattern = %r{easy_ml/datasets/titanic_dataset/2025_01_01_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
  expect(Polars).to receive(:scan_parquet).with(file_pattern).at_least(:once)
  feature.query(limit: 1)

  # The second deployed version is expected to read from the day-2 directory.
  feature_v2 = model_v2.dataset.features.find_by(name: "Family Size")
  file_pattern_v2 = %r{easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
  expect(Polars).to receive(:scan_parquet).with(file_pattern_v2).at_least(:once)
  feature_v2.query(limit: 1)

  # The live (undeployed) dataset is expected to read from the day-3 directory.
  feature_v3 = model.dataset.features.find_by(name: "Family Size")
  file_pattern_v3 = %r{easy_ml/datasets/titanic_dataset/2025_01_03_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
  expect(Polars).to receive(:scan_parquet).with(file_pattern_v3).at_least(:once)
  feature_v3.query(limit: 1)
ensure
  # Always unfreeze the clock, even when an expectation above fails;
  # otherwise the frozen time leaks into every example that runs afterwards.
  Timecop.return
end
218+
219+ it "uses deployed version for prediction" do
220+ mock_s3_upload
139221 @time = EasyML ::Support ::EST . now
140222 Timecop . freeze ( @time )
141223
142- model . save
224+ model . save!
143225 model . unlock!
144226 model . train ( async : false )
145227 model . deploy ( async : false )
@@ -293,7 +375,7 @@ def incr_time
293375
294376 mock_s3_upload
295377
296- @time = EasyML ::Support ::EST . parse ( "2024-01-01" )
378+ @time = EasyML ::Support ::EST . parse ( "2024-01-01" ) . beginning_of_day
297379 Timecop . freeze ( @time )
298380
299381 model . save
@@ -305,7 +387,7 @@ def incr_time
305387 model_v1 = model . current_version
306388
# Parses the dataset version timestamp embedded in a directory path.
#
# The version directory is assumed to be named YYYY_MM_DD_HH_MM_SS (the
# format seen in the path expectations of this spec). That segment is
# isolated first so that digits elsewhere in the path (e.g. in a home or CI
# workspace directory name) cannot corrupt the parsed value. When no such
# segment is present, every digit in the path is used, as before.
#
# @param dir [String] directory path containing a version timestamp
# @return whatever EasyML::Support::EST.parse returns for the digit string
def extract_timestamp(dir)
  stamp = dir[/\d{4}(?:_\d{2}){5}/] || dir
  EasyML::Support::EST.parse(stamp.gsub(/\D/, ""))
end
310392
311393 expect ( extract_timestamp ( model_v1 . dataset . raw . dir ) ) . to eq ( EasyML ::Support ::EST . parse ( "2024-01-01" ) )
0 commit comments