Skip to content

Commit fca03d6

Browse files
Merge pull request #62 from brettshollenberger/fix-deploy-dirs
Fix deploy dirs
2 parents 259dc78 + 8c070f6 commit fca03d6

File tree

9 files changed

+147
-55
lines changed

9 files changed

+147
-55
lines changed

Gemfile.lock

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
PATH
22
remote: .
33
specs:
4-
easy_ml (0.2.0.pre.rc95)
4+
easy_ml (0.2.0.pre.rc96)
55
activerecord
66
activerecord-import (~> 1.8.1)
77
activesupport

app/models/easy_ml/concerns/versionable.rb

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,23 @@ module Versionable
44
extend ActiveSupport::Concern
55

66
included do
7+
STRING_FORMAT = "%Y_%m_%d_%H_%M_%S".freeze
8+
79
def bump_version(force: false)
810
return version if version.present? && !force
911

10-
prev_version = version
11-
timestamp = Time.now.utc.strftime("%Y%m%d%H%M%S")
12-
timestamp = (prev_version.to_i + 1).to_s if timestamp.to_i <= prev_version.to_i
12+
tz = ActiveSupport::TimeZone.new(EasyML::Configuration.timezone)
13+
orig_version = version
14+
prev_version = tz.parse(version.gsub(/_/, "")) if version
15+
timestamp = Time.current.in_time_zone(EasyML::Configuration.timezone)
16+
timestamp = (prev_version + 1.second) if prev_version && compare_versions(timestamp, prev_version)
17+
18+
self.version = timestamp.strftime(STRING_FORMAT)
19+
end
1320

14-
self.version = timestamp
21+
def compare_versions(version1, version2)
22+
tz = ActiveSupport::TimeZone.new(EasyML::Configuration.timezone)
23+
tz.parse(version1.strftime(STRING_FORMAT).gsub(/_/, "")) <= tz.parse(version2.strftime(STRING_FORMAT).gsub(/_/, ""))
1524
end
1625
end
1726
end

app/models/easy_ml/dataset.rb

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,10 @@ def root_dir=(value)
114114
write_attribute(:root_dir, value)
115115
end
116116

117+
def dir
118+
root_dir
119+
end
120+
117121
def set_root_dir
118122
bump_version
119123
write_attribute(:root_dir, default_root_dir)
@@ -211,12 +215,13 @@ def processed
211215
end
212216

213217
def bump_versions(version)
218+
original_version = self.version
214219
self.version = version
215220

216-
@raw = raw.cp(version)
217-
@processed = processed.cp(version)
221+
@raw = raw.cp(dir.gsub(original_version, version))
222+
@processed = processed.cp(dir.gsub(original_version, version))
218223
save.tap do
219-
features.each(&:bump_version)
224+
features.each { |feature| feature.bump_version(original_version, version) }
220225
EasyML::Feature.import(features.to_a, on_duplicate_key_update: [:version])
221226
end
222227
end
@@ -750,7 +755,7 @@ def initialize_split(type)
750755
split_type.new(**args)
751756
when EasyML::Data::Splits::FileSplit.to_s
752757
split_type.new(**args.merge(
753-
dir: Pathname.new(root_dir).append("files/splits/#{type}").to_s,
758+
dir: Pathname.new(root_dir).join("files/splits/#{type}").to_s,
754759
))
755760
end
756761
end

app/models/easy_ml/feature.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -433,9 +433,9 @@ def insert_after(feature_class)
433433
end
434434
end
435435

436-
def bump_version
437-
feature_store.bump_version(version)
438-
write_attribute(:version, version + 1)
436+
def bump_version(original_version, version)
437+
feature_store.bump_version(original_version, version)
438+
write_attribute(:version, self.version + 1)
439439
self
440440
end
441441

app/models/easy_ml/model.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ def actually_train(&progress_block)
203203

204204
def unlock!
205205
Support::Lockable.unlock!(lock_key)
206+
EasyML::Deploy.new(model: self).unlock!
206207
end
207208

208209
def lock_model
@@ -558,6 +559,7 @@ def actually_deploy
558559
dataset.upload_remote_files
559560
model_snapshot = snapshot
560561

562+
Thread.current[:deploying] = true
561563
# Prepare the model to be retrained (reset values so they don't conflict with our snapshotted version)
562564
bump_version(force: true)
563565
dataset.bump_versions(version)

easy_ml-0.2.0.pre.rc96.gem

932 KB
Binary file not shown.

lib/easy_ml/feature_store.rb

Lines changed: 8 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
module EasyML
22
class FeatureStore < EasyML::Data::DatasetManager
3-
attr_reader :feature
3+
attr_reader :feature, :dataset
44

55
def initialize(feature)
66
@feature = feature
7+
@dataset = feature&.dataset
78

89
datasource_config = feature&.dataset&.datasource&.configuration
910
if datasource_config
@@ -27,11 +28,11 @@ def synced?
2728
files.any?
2829
end
2930

30-
def bump_version(version)
31+
def bump_version(original_version, version)
3132
compact
3233
cp(
33-
feature_dir_for_version(version),
34-
feature_dir_for_version(version + 1),
34+
feature_dir.gsub(version, original_version),
35+
feature_dir,
3536
)
3637
end
3738

@@ -41,21 +42,14 @@ def batch_size
4142
@batch_size ||= feature.batch_size || 10_000
4243
end
4344

44-
def feature_dir_for_version(version)
45+
def feature_dir
4546
File.join(
46-
Rails.root,
47-
"easy_ml/datasets",
48-
feature&.dataset&.name&.parameterize&.gsub("-", "_"),
47+
dataset.dir,
4948
"features",
50-
feature&.name&.parameterize&.gsub("-", "_"),
51-
version.to_s
49+
feature&.name&.parameterize&.gsub("-", "_")
5250
)
5351
end
5452

55-
def feature_dir
56-
feature_dir_for_version(feature.version)
57-
end
58-
5953
def s3_prefix
6054
File.join("datasets", feature_dir.split("datasets").last)
6155
end

lib/easy_ml/version.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
# frozen_string_literal: true
22

33
module EasyML
4-
VERSION = "0.2.0-rc95"
4+
VERSION = "0.2.0-rc96"
55

66
module Version
77
end

spec/app/models/easy_ml/deploy_spec.rb

Lines changed: 110 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -51,34 +51,34 @@
5151
dataset.columns.find_by(name: target).update(is_target: true)
5252
dataset.columns.where(name: hidden_cols).update_all(hidden: true)
5353
dataset.columns.find_by(name: "SibSp").update(preprocessing_steps: {
54-
training: { method: :median },
55-
})
54+
training: { method: :median },
55+
})
5656
dataset.columns.find_by(name: "Parch").update(preprocessing_steps: {
57-
training: { method: :median },
58-
})
57+
training: { method: :median },
58+
})
5959
dataset.columns.find_by(name: "Sex").update(preprocessing_steps: {
60-
training: {
61-
method: :most_frequent,
62-
encoding: :one_hot,
63-
params: {
64-
categorical_min: 2,
65-
},
66-
},
67-
})
60+
training: {
61+
method: :most_frequent,
62+
encoding: :one_hot,
63+
params: {
64+
categorical_min: 2,
65+
},
66+
},
67+
})
6868
dataset.columns.find_by(name: "Embarked").update(preprocessing_steps: {
69-
training: {
70-
method: :most_frequent,
71-
encoding: :one_hot,
72-
params: {
73-
categorical_min: 2,
74-
},
75-
},
76-
})
69+
training: {
70+
method: :most_frequent,
71+
encoding: :one_hot,
72+
params: {
73+
categorical_min: 2,
74+
},
75+
},
76+
})
7777
dataset.columns.find_by(name: "Age").update(preprocessing_steps: {
78-
training: {
79-
method: :median,
80-
},
81-
})
78+
training: {
79+
method: :median,
80+
},
81+
})
8282
dataset.refresh
8383
end
8484
end
@@ -131,15 +131,97 @@ def incr_time
131131
@time += 1.second
132132
end
133133

134+
def relative_dir(dir)
135+
dir.split(Rails.root.to_s).last
136+
end
137+
134138
describe "#deploy" do
135-
it "uses deployed version for prediction" do
139+
it "maintains dataset directory structure and versioning" do
140+
@t1 = EasyML::Support::EST.parse("2025-01-01").beginning_of_day
141+
Timecop.freeze(@t1)
142+
136143
mock_s3_upload
144+
model.save
137145
model.unlock!
138146

147+
@t2 = EasyML::Support::EST.parse("2025-01-02").beginning_of_day
148+
Timecop.freeze(@t2)
149+
150+
model.train(async: false)
151+
model.deploy(async: false)
152+
model_v1 = model.current_version
153+
154+
# Verify initial dataset structure
155+
expect(Dir.exist?(model_v1.dataset.raw.dir)).to be true
156+
expect(relative_dir(model_v1.dataset.raw.dir)).to eq("/easy_ml/datasets/titanic_dataset/2025_01_01_00_00_00/files/splits/raw")
157+
158+
expect(Dir.exist?(File.join(model_v1.dataset.dir, "features"))).to be true
159+
160+
feature_files = model_v1.dataset.features.find_by(name: "Family Size").files
161+
expect(feature_files.count).to be > 0
162+
feature_files.each do |feature_file|
163+
dir = File.dirname(feature_file)
164+
expect(relative_dir(dir)).to eq("/easy_ml/datasets/titanic_dataset/2025_01_01_00_00_00/features/family_size/compacted")
165+
end
166+
167+
# New dataset version has been shipped, so it doesn't conflict with the deployed version
168+
expect(relative_dir(model.dataset.raw.dir)).to match(%r{/easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/files/splits/raw})
169+
feature_files = model.dataset.features.find_by(name: "Family Size").files
170+
expect(feature_files.count).to be > 0
171+
feature_files.each do |feature_file|
172+
dir = File.dirname(feature_file)
173+
expect(relative_dir(dir)).to match(%r{/easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/features/family_size/compacted})
174+
end
175+
176+
# Make changes that require a new version
177+
model.dataset.columns.where(name: "Age").update_all(hidden: true)
178+
model.dataset.refresh
179+
180+
@t3 = EasyML::Support::EST.parse("2025-01-03").beginning_of_day
181+
Timecop.freeze(@t3)
182+
183+
model.train(async: false)
184+
model.deploy(async: false)
185+
model_v2 = model.current_version
186+
187+
# Verify new version structure
188+
expect(Dir.exist?(model_v2.dataset.raw.dir)).to be true
189+
expect(Dir.exist?(File.join(model_v2.dataset.dir, "features"))).to be true
190+
expect(Dir.exist?(File.join(model_v2.dataset.dir, "features"))).to be true
191+
192+
# Verify old version files were copied to new version
193+
old_files = Dir.glob(File.join(model_v1.dataset.raw.dir, "**/*")).select { |f| File.file?(f) }
194+
new_files = Dir.glob(File.join(model_v2.dataset.raw.dir, "**/*")).select { |f| File.file?(f) }
195+
expect(old_files.count).to be > 0
196+
expect(new_files.count).to be >= old_files.count
197+
198+
# Test which files are queried
199+
#
200+
# When using original feature (from v1 model)
201+
feature = model_v1.dataset.features.find_by(name: "Family Size")
202+
file_pattern = %r{easy_ml/datasets/titanic_dataset/2025_01_01_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
203+
expect(Polars).to receive(:scan_parquet).with(file_pattern).at_least(:once)
204+
feature.query(limit: 1)
205+
206+
feature_v2 = model_v2.dataset.features.find_by(name: "Family Size")
207+
file_pattern_v2 = %r{easy_ml/datasets/titanic_dataset/2025_01_02_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
208+
expect(Polars).to receive(:scan_parquet).with(file_pattern_v2).at_least(:once)
209+
feature_v2.query(limit: 1)
210+
211+
feature_v3 = model.dataset.features.find_by(name: "Family Size")
212+
file_pattern_v3 = %r{easy_ml/datasets/titanic_dataset/2025_01_03_00_00_\d{2}/features/family_size/compacted/feature.\d.parquet}
213+
expect(Polars).to receive(:scan_parquet).with(file_pattern_v3).at_least(:once)
214+
feature_v3.query(limit: 1)
215+
216+
Timecop.return
217+
end
218+
219+
it "uses deployed version for prediction" do
220+
mock_s3_upload
139221
@time = EasyML::Support::EST.now
140222
Timecop.freeze(@time)
141223

142-
model.save
224+
model.save!
143225
model.unlock!
144226
model.train(async: false)
145227
model.deploy(async: false)
@@ -293,7 +375,7 @@ def incr_time
293375

294376
mock_s3_upload
295377

296-
@time = EasyML::Support::EST.parse("2024-01-01")
378+
@time = EasyML::Support::EST.parse("2024-01-01").beginning_of_day
297379
Timecop.freeze(@time)
298380

299381
model.save
@@ -305,7 +387,7 @@ def incr_time
305387
model_v1 = model.current_version
306388

307389
def extract_timestamp(dir)
308-
EasyML::Support::UTC.parse(dir.gsub(/\D/, "")).in_time_zone(EST)
390+
EasyML::Support::EST.parse(dir.gsub(/\D/, ""))
309391
end
310392

311393
expect(extract_timestamp(model_v1.dataset.raw.dir)).to eq(EasyML::Support::EST.parse("2024-01-01"))

0 commit comments

Comments
 (0)