|
1 | 1 | # frozen_string_literal: true |
2 | 2 |
|
3 | | -require "numo/narray" |
| 3 | +# Pure Ruby implementation of preprocessing functions |
4 | 4 |
|
module AnnEmbed
  # Data preprocessing utilities, implemented in pure Ruby on nested Arrays
  # (one inner Array per row/sample).
  module Preprocessing
    class << self
      # Normalize data using the specified method.
      #
      # :standard and :minmax compute statistics per column and therefore
      # require every row to have the same length; :l2 scales each row
      # independently.
      #
      # @param data [Array<Array<Numeric>>] Input data (2D array)
      # @param method [Symbol] Normalization method (:standard, :minmax, :l2)
      # @return [Array<Array<Float>>] Normalized data (empty input is returned as empty)
      # @raise [ArgumentError] if data is not an Array, if method is unknown, or
      #   (for :standard / :minmax) if the rows are not Arrays of equal length
      def normalize(data, method: :standard)
        raise ArgumentError, "Unsupported data type: #{data.class}" unless data.is_a?(Array)

        case method
        when :standard then standard_normalize(data)
        when :minmax then minmax_normalize(data)
        when :l2 then l2_normalize(data)
        else
          raise ArgumentError, "Unknown normalization method: #{method}"
        end
      end

      # Reduce dimensionality using PCA before embedding.
      #
      # Not available in the pure Ruby implementation: this method always
      # raises. Note that NotImplementedError descends from ScriptError, so a
      # bare `rescue` (StandardError) will not catch it.
      #
      # @param data [Array] Input data (unused)
      # @param n_components [Integer] Number of PCA components (unused)
      # @raise [NotImplementedError] always
      def pca_reduce(data, n_components)
        # Note: This would require SVD implementation in pure Ruby
        # For now, raise an error suggesting to use the Rust-based SVD module
        raise NotImplementedError, "PCA reduction requires the SVD module which needs to be called directly"
      end

      private

      # Returns the number of columns of a non-empty 2D array, raising
      # ArgumentError unless every row is an Array of that same length.
      def column_count(data)
        first_row = data.first
        unless first_row.is_a?(Array)
          raise ArgumentError, "Expected a 2D array, got a row of type #{first_row.class}"
        end

        width = first_row.size
        data.each_with_index do |row, i|
          next if row.is_a?(Array) && row.size == width

          raise ArgumentError, "Row #{i} is not an Array of length #{width}"
        end
        width
      end

      # Column-wise standardization: subtract the column mean and divide by
      # the column's population standard deviation (divisor is the row count).
      # Columns with zero deviation are divided by 1.0 instead.
      def standard_normalize(data)
        return data if data.empty?

        n_cols = column_count(data)
        n_rows = data.size

        # Float accumulators keep all later arithmetic in floating point,
        # even when the input values are Integers.
        sums = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| sums[j] += val }
        end
        means = sums.map { |total| total / n_rows }

        squared_devs = Array.new(n_cols, 0.0)
        data.each do |row|
          row.each_with_index { |val, j| squared_devs[j] += (val - means[j])**2 }
        end
        stds = squared_devs.map do |total|
          std = Math.sqrt(total / n_rows)
          std.zero? ? 1.0 : std # Avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - means[j]) / stds[j] }
        end
      end

      # Column-wise min-max scaling: maps each column's minimum to 0.0 and its
      # maximum to 1.0. Constant columns are divided by 1.0 (all values -> 0.0).
      def minmax_normalize(data)
        return data if data.empty?

        n_cols = column_count(data)
        mins = Array.new(n_cols) { Float::INFINITY }
        maxs = Array.new(n_cols) { -Float::INFINITY }

        data.each do |row|
          row.each_with_index do |val, j|
            mins[j] = val if val < mins[j]
            maxs[j] = val if val > maxs[j]
          end
        end

        # Force the range to Float: with Integer input, min and max are
        # Integers and an Integer range would make the division below truncate.
        ranges = mins.zip(maxs).map do |lo, hi|
          span = (hi - lo).to_f
          span.zero? ? 1.0 : span # Avoid division by zero
        end

        data.map do |row|
          row.each_with_index.map { |val, j| (val - mins[j]) / ranges[j] }
        end
      end

      # Row-wise L2 normalization: divides every row by its Euclidean norm.
      # Rows whose norm is zero are divided by 1.0 and so stay all-zero.
      def l2_normalize(data)
        data.map do |row|
          norm = Math.sqrt(row.sum { |val| val**2 })
          norm = 1.0 if norm.zero? # Avoid division by zero
          row.map { |val| val / norm }
        end
      end
    end
  end
end
|
0 commit comments