Skip to content

Commit 69c10b4

Browse files
Sync parquet files, refresh datasets every morning
1 parent 829af34 commit 69c10b4

File tree

5 files changed

+44
-24
lines changed

5 files changed

+44
-24
lines changed

config/initializers/zhong.rb

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,12 @@
88
EasyML::CleanJob.perform_later
99
end
1010

11+
every 1.day, "refresh datasets" do
12+
EasyML::Dataset.all.each do |dataset|
13+
dataset.refresh_async
14+
end
15+
end
16+
1117
every 1.hour, "schedule retraining" do
1218
EasyML::ScheduleRetrainingJob.perform_later
1319
end

easy_ml-0.2.0.pre.rc101.gem

932 KB
Binary file not shown.

lib/easy_ml/data/polars_column.rb

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@ class PolarsColumn
1616
array: Polars::List,
1717
}
1818
POLARS_MAP = {
19+
Polars::Decimal => :float,
1920
Polars::Float64 => :float,
2021
Polars::Int64 => :integer,
2122
Polars::Float32 => :float,

lib/easy_ml/data/synced_directory.rb

Lines changed: 36 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -126,7 +126,7 @@ def download_file(object)
126126
)
127127

128128
Rails.logger.info("Downloaded #{object.key} to #{local_file_path}")
129-
if object.key.end_with?(".gz")
129+
if object.key.end_with?(".gz") && !object.key.end_with?(".parquet.gz")
130130
ungzipped_file_path = ungzip_file(local_file_path)
131131
Rails.logger.info("Ungzipped to #{ungzipped_file_path}")
132132
end
@@ -284,48 +284,61 @@ def upload_file(file_path)
284284
relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
285285
s3_key = s3_prefix.present? ? File.join(s3_prefix, File.basename(relative_path)) : relative_path
286286

287-
# Create a temporary gzipped version of the file
288-
gzipped_file_path = "#{file_path}.gz"
289-
290287
begin
291-
Rails.logger.info("Compressing and uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
288+
Rails.logger.info("Uploading #{file_path} to s3://#{s3_bucket}/#{s3_key}")
292289

293-
# Compress the file
294-
Zlib::GzipWriter.open(gzipped_file_path) do |gz|
290+
if file_path.end_with?(".parquet")
291+
# Upload parquet files directly without compression
295292
File.open(file_path, "rb") do |file|
296-
gz.write(file.read)
293+
s3.put_object(
294+
bucket: s3_bucket,
295+
key: s3_key,
296+
body: file
297+
)
298+
end
299+
Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}")
300+
else
301+
# Create a temporary gzipped version of the file
302+
gzipped_file_path = "#{file_path}.gz"
303+
304+
# Compress the file
305+
Zlib::GzipWriter.open(gzipped_file_path) do |gz|
306+
File.open(file_path, "rb") do |file|
307+
gz.write(file.read)
308+
end
297309
end
298-
end
299310

300-
# Upload the gzipped file
301-
File.open(gzipped_file_path, "rb") do |file|
302-
s3.put_object(
303-
bucket: s3_bucket,
304-
key: "#{s3_key}.gz",
305-
body: file,
306-
content_encoding: "gzip",
307-
)
308-
end
311+
# Upload the gzipped file
312+
File.open(gzipped_file_path, "rb") do |file|
313+
s3.put_object(
314+
bucket: s3_bucket,
315+
key: "#{s3_key}.gz",
316+
body: file,
317+
content_encoding: "gzip",
318+
)
319+
end
320+
321+
Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
309322

310-
Rails.logger.info("Successfully uploaded #{file_path} to s3://#{s3_bucket}/#{s3_key}.gz")
323+
# Clean up temporary gzipped file
324+
File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
325+
end
311326
rescue Aws::S3::Errors::ServiceError, StandardError => e
312327
Rails.logger.error("Failed to upload #{file_path}: #{e.message}")
313328
raise e
314-
ensure
315-
# Clean up temporary gzipped file
316-
File.delete(gzipped_file_path) if File.exist?(gzipped_file_path)
317329
end
318330
end
319331

320332
def should_upload?(file_path)
321333
relative_path = Pathname.new(file_path).relative_path_from(Pathname.new(root_dir)).to_s
322334
s3_key = s3_prefix.present? ? File.join(s3_prefix, relative_path) : relative_path
335+
s3_key = "#{s3_key}.gz" unless file_path.end_with?(".parquet")
323336

324337
begin
325338
# Check if file exists in S3
326339
response = s3.head_object(
327340
bucket: s3_bucket,
328-
key: "#{s3_key}.gz",
341+
key: s3_key,
329342
)
330343

331344
# Compare modification times

lib/easy_ml/version.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
# frozen_string_literal: true
22

33
module EasyML
4-
VERSION = "0.2.0-rc100"
4+
VERSION = "0.2.0-rc101"
55

66
module Version
77
end

0 commit comments

Comments
 (0)