Skip to content

Commit 925fe64

Browse files
authored
Merge pull request #7 from jmitrevs/initialRecurr-simpleRNN
Fix pytorch simple RNN for oneAPI; add initial state version for Quartus and oneAPI
2 parents cf9c726 + c8029dd commit 925fe64

File tree

5 files changed

+159
-14
lines changed

5 files changed

+159
-14
lines changed

hls4ml/backends/oneapi/passes/recurrent_templates.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,6 +333,7 @@ def format(self, node):
333333
typedef {weight_t.name} weight_t;
334334
typedef {bias_t.name} bias_t;
335335
typedef {recurrent_weight_t.name} recurrent_weight_t;
336+
typedef {recurrent_bias_t.name} recurrent_bias_t;
336337
337338
typedef {act_t} ACT_CONFIG_T;
338339
template<class x_T, class y_T, class config_T>
@@ -350,6 +351,9 @@ def format(self, node):
350351
simple_rnn_pytorch_function_template = (
351352
'nnet::simple_rnn_pytorch<{input_t}, {output_t}, {config}>({input}, {output}, {weights});'
352353
)
354+
simple_rnn_pytorch_function_initial_state_template = (
355+
'nnet::simple_rnn_pytorch_init_state<{input_t}, {h_t}, {output_t}, {config}>({input}, {init_state}, {output}, {weights});'
356+
)
353357

354358

355359
class SimpleRNNConfigTemplate(LayerConfigTemplate):
@@ -395,10 +399,17 @@ def __init__(self):
395399

396400
def format(self, node):
397401
params = self._default_function_params(node)
402+
if params['pass_initial_states'] == 'true':
403+
params['h_t'] = node.get_input_variable(node.inputs[1]).type.name
404+
params['init_state'] = node.get_input_variable(node.inputs[1]).name
405+
398406
if node.get_attr('pytorch', False):
399-
self.template = simple_rnn_pytorch_function_template
407+
if params['pass_initial_states'] == 'true':
408+
template = simple_rnn_pytorch_function_initial_state_template
409+
else:
410+
template = simple_rnn_pytorch_function_template
400411
params['weights'] = 'w{0}, wr{0}, b{0}, br{0}'.format(str(node.index))
401412
else:
402-
self.template = simple_rnn_function_template
413+
template = simple_rnn_function_template
403414
params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index))
404-
return self.template.format(**params)
415+
return template.format(**params)

hls4ml/backends/quartus/passes/recurrent_templates.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,9 @@ def format(self, node):
285285
simple_rnn_pytorch_function_template = (
286286
'nnet::simple_rnn_pytorch<{input_t}, {output_t}, {config}>({input}, {output}, {weights});'
287287
)
288+
simple_rnn_pytorch_function_initial_state_template = (
289+
'nnet::simple_rnn_pytorch<{input_t}, {input2_t}, {output_t}, {config}>({input}, {input2}, {output}, {weights});'
290+
)
288291

289292

290293
class SimpleRNNConfigTemplate(LayerConfigTemplate):
@@ -326,13 +329,20 @@ def format(self, node):
326329
class SimpleRNNFunctionTemplate(FunctionCallTemplate):
327330
def __init__(self):
328331
super().__init__(SimpleRNN, include_header=recurrent_include_list)
329-
self.template = simple_rnn_function_template
330332

331333
def format(self, node):
332334
params = self._default_function_params(node)
335+
if params['pass_initial_states'] == 'true':
336+
params['input2_t'] = node.get_input_variable(node.inputs[1]).type.name
337+
params['input2'] = node.get_input_variable(node.inputs[1]).name
338+
333339
if node.get_attr('pytorch', False):
334-
self.template = simple_rnn_pytorch_function_template
340+
if params['pass_initial_states'] == 'true':
341+
template = simple_rnn_pytorch_function_initial_state_template
342+
else:
343+
template = simple_rnn_pytorch_function_template
335344
params['weights'] = 'w{0}, wr{0}, b{0}, br{0}'.format(str(node.index))
336345
else:
346+
template = simple_rnn_function_template
337347
params['weights'] = 'w{0}, wr{0}, b{0}'.format(str(node.index))
338-
return self.template.format(**params)
348+
return template.format(**params)

hls4ml/templates/oneapi/firmware/nnet_utils/nnet_recurrent.h

Lines changed: 69 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -404,8 +404,8 @@ void simple_rnn_pytorch_cell(const in_T &inputs, h_T &hidden_state, h_T &hidden_
404404

405405
// Hidden state
406406
[[intel::fpga_register]] accum_array_T hiddenCand;
407-
multiply_U<in_T, accum_array_T, typename CONFIG_T::recurrent_weight_t, CONFIG_T::n_out>(hidden_state, hiddenCand,
408-
rec_kernel);
407+
multiply_U<h_T, accum_array_T, typename CONFIG_T::recurrent_weight_t, CONFIG_T::n_out>(hidden_state, hiddenCand,
408+
rec_kernel);
409409

410410
// Hidden state bias addition
411411
[[intel::fpga_register]] accum_array_T hiddenBias;
@@ -437,7 +437,69 @@ void simple_rnn_pytorch(const data_T &data, res_T &res, const typename CONFIG_T:
437437
INIT_LOOP:
438438
#pragma unroll
439439
for (int x = 0; x < CONFIG_T::n_out; x++) {
440-
hidden_state[x][0] = 0;
440+
hidden_state[0][x] = 0;
441+
}
442+
443+
[[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
444+
445+
// Data at current time step
446+
#pragma unroll
447+
for (int x = 0; x < CONFIG_T::n_in; x++) {
448+
in[x] = data[x + i * CONFIG_T::n_in];
449+
}
450+
451+
// Hidden state at current time step
452+
#pragma unroll
453+
for (int x = 0; x < CONFIG_T::n_out; x++) {
454+
hidden_state_temp[x] = hidden_state[i][x];
455+
}
456+
457+
// Do SimpleRNN
458+
simple_rnn_pytorch_cell<data_T, res_T, CONFIG_T>(in, hidden_state_temp, h, kernel, rec_kernel, bias, rec_bias);
459+
460+
// Write result
461+
#pragma unroll
462+
for (int x = 0; x < CONFIG_T::n_out; x++) {
463+
hidden_state[i + 1][x] = h[x];
464+
}
465+
}
466+
467+
if (CONFIG_T::return_sequences == 0) {
468+
// Output when return_sequences is false
469+
#pragma unroll
470+
for (int x = 0; x < CONFIG_T::n_out; x++) {
471+
res[x] = hidden_state[CONFIG_T::n_timesteps][x];
472+
}
473+
} else {
474+
// Output when return_sequences is true
475+
#pragma unroll
476+
for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
477+
#pragma unroll
478+
for (int h = 0; h < CONFIG_T::n_out; h++) {
479+
res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
480+
}
481+
}
482+
}
483+
}
484+
485+
template <class data_T, class h_T, class res_T, typename CONFIG_T>
486+
void simple_rnn_pytorch_init_state(const data_T &data, const h_T &hin, res_T &res, const typename CONFIG_T::weight_t &kernel,
487+
const typename CONFIG_T::recurrent_weight_t &rec_kernel,
488+
const typename CONFIG_T::bias_t &bias,
489+
const typename CONFIG_T::recurrent_bias_t &rec_bias) {
490+
491+
using in_T = array<typename data_T::value_type, CONFIG_T::n_in>;
492+
493+
[[intel::fpga_register]] h_T hidden_state[CONFIG_T::n_timesteps + 1];
494+
[[intel::fpga_register]] h_T hidden_state_temp;
495+
[[intel::fpga_register]] h_T h;
496+
[[intel::fpga_register]] in_T in;
497+
498+
// Set initial hidden state (output) from the provided initial state input
499+
INIT_LOOP:
500+
#pragma unroll
501+
for (int x = 0; x < CONFIG_T::n_out; x++) {
502+
hidden_state[0][x] = hin[x];
441503
}
442504

443505
[[intel::disable_loop_pipelining]] for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
@@ -451,7 +513,7 @@ void simple_rnn_pytorch(const data_T &data, res_T &res, const typename CONFIG_T:
451513
// Hidden state at current time step
452514
#pragma unroll
453515
for (int x = 0; x < CONFIG_T::n_out; x++) {
454-
hidden_state_temp[x] = hidden_state[x][i];
516+
hidden_state_temp[x] = hidden_state[i][x];
455517
}
456518

457519
// Do SimpleRNN
@@ -460,23 +522,23 @@ void simple_rnn_pytorch(const data_T &data, res_T &res, const typename CONFIG_T:
460522
// Write result
461523
#pragma unroll
462524
for (int x = 0; x < CONFIG_T::n_out; x++) {
463-
hidden_state[x][i + 1] = h[x];
525+
hidden_state[i + 1][x] = h[x];
464526
}
465527
}
466528

467529
if (CONFIG_T::return_sequences == 0) {
468530
// Output when return_sequences is false
469531
#pragma unroll
470532
for (int x = 0; x < CONFIG_T::n_out; x++) {
471-
res[x] = hidden_state[x][CONFIG_T::n_timesteps];
533+
res[x] = hidden_state[CONFIG_T::n_timesteps][x];
472534
}
473535
} else {
474536
// Output when return_sequences is true
475537
#pragma unroll
476538
for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
477539
#pragma unroll
478540
for (int h = 0; h < CONFIG_T::n_out; h++) {
479-
res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1];
541+
res[x * CONFIG_T::n_out + h] = hidden_state[x + 1][h];
480542
}
481543
}
482544
}

hls4ml/templates/quartus/firmware/nnet_utils/nnet_recurrent.h

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,68 @@ void simple_rnn_pytorch(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in],
490490
}
491491
}
492492

493+
template <class data_T, class data2_T, class res_T, typename CONFIG_T>
494+
void simple_rnn_pytorch(data_T data[CONFIG_T::n_timesteps * CONFIG_T::n_in], data2_T hin[CONFIG_T::n_out],
495+
res_T res[CONFIG_T::n_outputs * CONFIG_T::n_out],
496+
const typename CONFIG_T::weight_t kernel[CONFIG_T::n_in * CONFIG_T::n_out],
497+
const typename CONFIG_T::weight_t rec_kernel[CONFIG_T::n_out * CONFIG_T::n_out],
498+
const typename CONFIG_T::bias_t bias[CONFIG_T::n_out],
499+
const typename CONFIG_T::bias_t rec_bias[CONFIG_T::n_out]) {
500+
data2_T hidden_state[CONFIG_T::n_out][CONFIG_T::n_timesteps + 1] hls_register;
501+
data2_T hidden_state_temp[CONFIG_T::n_out] hls_register;
502+
data2_T h[CONFIG_T::n_out] hls_register;
503+
data_T in[CONFIG_T::n_in] hls_register;
504+
505+
// Set initial hidden state (output) from the provided initial state input
506+
INIT_LOOP:
507+
#pragma unroll
508+
for (int x = 0; x < CONFIG_T::n_out; x++) {
509+
hidden_state[x][0] = hin[x];
510+
}
511+
512+
#pragma disable_loop_pipelining
513+
for (int i = 0; i < CONFIG_T::n_timesteps; i++) {
514+
515+
// Data at current time step
516+
#pragma unroll
517+
for (int x = 0; x < CONFIG_T::n_in; x++) {
518+
in[x] = data[x + i * CONFIG_T::n_in];
519+
}
520+
521+
// Hidden state at current time step
522+
#pragma unroll
523+
for (int x = 0; x < CONFIG_T::n_out; x++) {
524+
hidden_state_temp[x] = hidden_state[x][i];
525+
}
526+
527+
// Do SimpleRNN
528+
simple_rnn_pytorch_cell<data_T, res_T, CONFIG_T>(in, hidden_state_temp, h, kernel, rec_kernel, bias, rec_bias);
529+
530+
// Write result
531+
#pragma unroll
532+
for (int x = 0; x < CONFIG_T::n_out; x++) {
533+
hidden_state[x][i + 1] = h[x];
534+
}
535+
}
536+
537+
if (CONFIG_T::return_sequences == 0) {
538+
// Output when return_sequences is false
539+
#pragma unroll
540+
for (int x = 0; x < CONFIG_T::n_out; x++) {
541+
res[x] = hidden_state[x][CONFIG_T::n_timesteps];
542+
}
543+
} else {
544+
// Output when return_sequences is true
545+
#pragma unroll
546+
for (int x = 0; x < CONFIG_T::n_timesteps; x++) {
547+
#pragma unroll
548+
for (int h = 0; h < CONFIG_T::n_out; h++) {
549+
res[x * CONFIG_T::n_out + h] = hidden_state[h][x + 1];
550+
}
551+
}
552+
}
553+
}
554+
493555
//----------------------
494556
// LSTM
495557
//----------------------

test/pytest/test_recurrent_pytorch.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ def forward(self, x, h0):
171171
return output
172172

173173

174-
@pytest.mark.parametrize('backend', ['Quartus'])
174+
@pytest.mark.parametrize('backend', ['Quartus', 'oneAPI'])
175175
@pytest.mark.parametrize('io_type', ['io_parallel'])
176176
def test_rnn(backend, io_type):
177177
if not (backend in ('Quartus', 'oneAPI') and io_type == "io_stream"):

0 commit comments

Comments
 (0)