diff --git a/gpu/tensor/README.md b/gpu/tensor/README.md
new file mode 100644
index 0000000000..15f1dd7e2c
--- /dev/null
+++ b/gpu/tensor/README.md
@@ -0,0 +1,75 @@
+# GPU & FPGA - A Great Heterogeneous Acceleration Engine for Federated Learning
+
+This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented two different heterogeneous acceleration solutions using GPU and FPGA, respectively, that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability.  
+
+### How to test GPU engine
+- Requirements / Recommendations:
+    - At least one capable NVIDIA GPU device is required.
+        - We recommend a device with the Volta GPU microarchitecture or later, such as Tesla V100 or Tesla V100S, to fully utilize the features our CUDA code relies on.
+    - CentOS with version >= 7.8.2003
+        - We haven't tested whether our engine works well on other Linux distributions, such as Ubuntu and Debian. However, it should work with at most some slight modifications.
+    - Python with version >= 3.6.8
+        - The latest version of NumPy (1.19.4 as of now) is recommended.
+        - You may need to install other essential Python packages.
+    - If you would like to compile the CUDA code:
+        - gcc version 4.8.5 would suffice. Don't use a gcc version later than 7, since nvcc 10.0 doesn't support newer versions.
+        - nvcc version 10.0.130 would suffice.
+- To test GPU engine functionality
+    ```bash
+    python3 -m paillier_gpu.tests.test_gpu_engine
+    ```
+- To test GPU engine performance (profiling)
+    ```bash
+    python3 -m paillier_gpu.tests.test_gpu_performance
+    ```
+- You may switch the RAND_TYPE variable between INT64_TYPE and FLOAT_TYPE in the test file, which is recommended to make sure that both float64 (double) and int64 (long long) types can pass all assertions.
+
+### How to test FPGA engine
+- Requirements / Recommendations:
+    - At least one capable Xilinx FPGA device, such as Alveo U250, is required.
+    - CentOS with version >= 7.8.2003
+        - We haven't tested whether our engine works well on other Linux distributions, such as Ubuntu and Debian. However, it should work with at most some slight modifications.
+    - Python with version >= 3.6.8
+        - The latest version of NumPy (1.19.4 as of now) is recommended.
+        - You may need to install other essential Python packages.
+    - GCC with version >= 4.8.5 if you would like to compile the C source code.
+    - Superuser privileges are required as we need to access sensitive directories.
+        - Note that the Python path with and without sudo may differ.
+- To test FPGA engine functionality
+    ```bash
+    sudo python3 -m paillier_fpga.tests.test_fpga_engine
+    ```
+- To test FPGA engine performance (profiling)
+    ```bash
+    sudo python3 -m paillier_fpga.tests.test_fpga_performance
+    ```
+- You may switch the RAND_TYPE variable between INT64_TYPE and FLOAT_TYPE in the test file, which is recommended to make sure that both float64 (double) and int64 (long long) types can pass all assertions.
+
+### Profiling Information
+The profiling result was obtained from a server with the following configuration.  
+|Hardware Type|Model|Quantity|Remark|
+|-|-|-|-|
+|CPU|Intel Xeon Silver 4114 CPU @ 2.20GHz|2|only 1 core is used in profiling|
+|GPU|NVIDIA Tesla V100 PCIe 32GB|4|only 1 GPU card is used in profiling|
+|FPGA|Xilinx Alveo U250|1||
+|Memory|Samsung 16GiB DIMM DDR4 Synchronous 2666 MHz (0.4 ns)|12|192 GB in total|
+|Hard Disk|2TB WDC WD20SPZX-60U|1||  
+
+The table below is an overview of the profiling information of our GPU and FPGA engines compared to a CPU implementation under a unified shape of 666*666, where the throughput means the number of operations (instances, either fixed-point numbers or Paillier-encrypted numbers) a device can compute within a second. For matrix multiplication, we consider the number of operations as the number of modular exponentiations we have to compute under a naive O(n^3) algorithm.  
+We don't count the memory allocation time as it could take a significant amount of time for I/O-bound operators like those involving modular multiplication instead of modular exponentiation. As a result, we recommend that users reuse the already-allocated CPU memory space as much as possible in a way similar to register renaming.
+
+|Operator|CPU Throughput|GPU Throughput|GPU Speedup|FPGA Throughput|FPGA Speedup|
+|-|-|-|-|-|-|
+|fp_encode|62303.97|33611720.05|539.48|7215836.85|115.82|
+|fp_decode|567913.21|25958708.28|45.71|583509.90|1.03|
+|pi_encrypt|205864.74|24814051.60|120.54|687947.44|3.34|
+|pi_gen_obf_seed|444.05|86766.80|195.40|33653.43|75.79|
+|pi_obfuscate|60236.27|11101085.43|184.29|2035691.96|33.80|
+|pi_decrypt|1590.48|299298.46|188.18|69354.57|43.61|
+|fp_mul|228424.79|11480248.47|50.26|1695313.95|7.42|
+|pi_add|29759.90|1203071.88|40.43|423378.92|14.23|
+|pi_mul|6175.70|1068244.51|172.98|359942.47|58.28|
+|pi_matmul|4178.43|620310.10|148.46|150362.36|35.99|
+|pi_sum(axis=0)|12865.10|1675271.14|130.22|844531.30|65.65|
+|pi_sum(axis=1)|15919.62|4651463.65|292.18|947461.90|59.52|
+|pi_sum(axis=None)|10277.66|4677684.56|455.13|877720.61|85.40|
\ No newline at end of file
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/__init__.py b/gpu/tensor/paillier_fpga/paillier_fpga/__init__.py
new file mode 100644
index 0000000000..47fa86a8b9
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/__init__.py
@@ -0,0 +1,19 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+from .fpga_tensor import keygen, SK, PK, Cipherblock
+
+__version__ = '0.1.0'  # package version string
+__all__ = ['keygen', "SK", "PK", "Cipherblock"]  # names imported from .fpga_tensor
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py
new file mode 100644
index 0000000000..abf4edf65b
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_engine.py
@@ -0,0 +1,4434 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+# from ctypes.util import find_library
+import os
+import random
+import math
+import numpy as np
+
+from ctypes import cdll, c_buffer, cast
+from ctypes import c_char_p, c_void_p
+from ctypes import (
+    c_int32,
+    c_uint8,
+    c_bool,
+    c_uint32,
+    c_double,
+    c_int64,
+    c_uint64,
+    c_size_t,
+)
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+)
+
+from concurrent.futures import ProcessPoolExecutor as Executor
+
+# define memory types
+MEM_HOST = 1
+MEM_DEVICE = 2
+# the extended memory types, correspond with the device type defined below
+MEM_FPGA_NUM_0 = 20
+MEM_FPGA_NUM_1 = 21
+
+# aliases defined by WeBank
+PaillierPublicKeyStorage = PaillierPublicKey
+PaillierPrivateKeyStorage = PaillierPrivateKey
+
+'''##############import ctypes to implement py2c and c2py#################'''
+'''############## load the .so library written in C     ##################'''
+
+# load the native library that sits next to this module (FPGA_LIB.so);
+# NOTE(review): earlier notes mention several builds, one per CIPHER_BITS
+# length (e.g. FPGA_1024) - confirm which build FPGA_LIB.so actually is
+FPGA_LIB = cdll.LoadLibrary(os.path.dirname(__file__) + "/FPGA_LIB.so")
+# FPGA_4096 = cdll.LoadLibrary("FPGA_4096.so")
+
+# set the CIPHER_BITS according to the library chosen.
+CIPHER_BITS = 2048
+PLAIN_BITS = 2048
+BYTE_LEN = 8
+CIPHER_BYTE = CIPHER_BITS // BYTE_LEN
+PLAIN_BYTE = PLAIN_BITS // BYTE_LEN
+
+# ### DEFINE THE BYTE_LENGTHS OF DATA TYPES ####
+CHAR_BYTE = 1
+U_INT32_BYTE = 4
+DOUBLE_BYTE = 8
+INT64_BYTE = 8
+
+# c_malloc returns a pointer: declare it, ctypes would assume a C int otherwise
+FPGA_LIB.c_malloc.restype = c_void_p
+
+# DEFINE TWO DIFFERENT TYPE OF DATA####
+INT64_TYPE = 1  # datatype flag for int32 and int64
+FLOAT_TYPE = 2  # datatype flag for float and double
+
+# define base for Paillier encrypted numbers
+PEN_BASE = 16
+# as there's no BASE defined in Python PaillierEncryptedNumber, we define
+# PEN_BASE as 16 here (NOTE(review): original note says "needed in CUDA")
+
+
+# ############################################################################
+# ######################Useful independent functions##########################
+# ###################Reconstruct ndarray from C memory type###################
+# ############################################################################
+''' Device Initializer '''
+
+
+def initialize_device():  # runs the native library's init() entry point
+    FPGA_LIB.init()
+    # FPGA_LIB.print_example_banner()
+
+
+'''reset FPGA functions'''
+
+
+def reset_device(dev_num):  # dev_num: device number, handed to C as uint8
+    FPGA_LIB.reset_device(c_uint8(dev_num))
+
+
+def init_dev_reg(dev_num):  # dev_num: device number, handed to C as uint8
+    FPGA_LIB.init_regs(c_uint8(dev_num))
+
+
+def check_FPGA_status(dev_num):  # result of status_check is not returned
+    FPGA_LIB.status_check(c_uint8(dev_num))
+
+
+def __get_C_fpn(fpn_space, size):
+    '''
+    Read FixedPointNumber (FPN) encodings back from a C memory space
+    and hand them to the python level as a ndarray
+    --------------------
+    Para:
+    fpn_space: int, start address of the C memory space
+    size: int, how many PLAIN_BITS-wide bigints to read
+    Return:
+    A ndarray, each element decoded from little-endian bytes
+    '''
+    encodings = []
+    scratch = c_buffer(PLAIN_BYTE)
+    for idx in range(size):
+        FPGA_LIB.bigint_get(
+            cast(scratch, c_void_p),
+            c_void_p(fpn_space),
+            c_size_t(PLAIN_BITS),
+            c_size_t(idx),
+        )
+        encodings.append(int.from_bytes(scratch.raw, 'little'))
+    return np.asarray(encodings)
+
+
+def __get_C_pen(pen_space, index, size):
+    '''
+    Read PaillierEncryptedNumber (PEN) ciphertexts back from a C memory
+    space and hand them to the python level as a ndarray
+    ------------------
+    Para:
+    pen_space: int, start address of a continuous C memory space
+    index: int, number of PEN to skip from the start address
+    size: int, how many PEN to read after that offset
+    Return:
+    A ndarray, each element decoded from little-endian bytes
+    '''
+    ciphers = []
+    scratch = c_buffer(CIPHER_BYTE)
+    for idx in range(size):
+        FPGA_LIB.bigint_get(
+            cast(scratch, c_void_p),
+            c_void_p(pen_space + index * CIPHER_BYTE),
+            c_size_t(CIPHER_BITS),
+            c_size_t(idx),
+        )
+        ciphers.append(int.from_bytes(scratch.raw, 'little'))
+    return np.asarray(ciphers)
+
+
+bi_c2p = __get_C_pen  # second name for __get_C_pen, in the bi_* naming
+
+
+def __get_C_uint32(uint32_space, size):
+    '''
+    Read uint32 values back from a C memory space into a ndarray;
+    the values are fetched by one unsigned_get call into a ctypes array
+    ------------------------
+    Para:
+    uint32_space: int, start address of a continuous C memory space
+    size: int, how many uint32 values to read
+    Return: ndarray holding the values
+    '''
+    # ctypes arrays start out zero-filled, same as seeding them with zeros
+    out_buf = (c_uint32 * size)()
+    FPGA_LIB.unsigned_get(
+        out_buf, c_void_p(uint32_space), c_size_t(size), c_bool(False)
+    )
+    values = list(out_buf)
+    return np.asarray(values)
+
+
+def __get_C_double(double_space, size):
+    '''read `size` doubles from C memory at double_space into a ndarray'''
+    # ctypes arrays start out zero-filled, same as seeding them with zeros
+    out_buf = (c_double * size)()
+    FPGA_LIB.double_get(
+        out_buf, c_void_p(double_space), c_size_t(size), c_bool(False)
+    )
+    # TODO: convert all the data in one step, no loop
+    values = list(out_buf)
+    return np.asarray(values)
+
+
+def __get_C_int64(int64_space, size):
+    '''read `size` int64 values from C memory at int64_space into a ndarray'''
+    # ctypes arrays start out zero-filled, same as seeding them with zeros
+    out_buf = (c_int64 * size)()
+    FPGA_LIB.int64_get(
+        out_buf,
+        c_void_p(int64_space),
+        c_size_t(size),
+        c_bool(False))
+    # TODO: convert all the data in one step, no loop
+    values = list(out_buf)
+    return np.asarray(values)
+
+
+def __get_c_fpn_storage(fpn, base, exp, vec_size, n, max_int):
+    '''
+    Rebuild FixedPointNumber objects from the given C memory spaces
+    -------------------
+    Para:
+    fpn:  int, start address of a C memory space,
+               holding the FPN encodings (bigint, PLAIN_BITS long)
+    base: int, start address of a C memory space,
+               holding the FPN bases (uint32); not read here
+    exp:  int, start address of a C memory space,
+               holding the FPN exponents (uint32)
+    vec_size:   int, the number of bigint
+    n, max_int: int, forwarded to every FixedPointNumber
+
+    Return:
+    A list, each element is a FixedPointNumber
+    '''
+    encodings = __get_C_fpn(fpn, vec_size)
+    # res_base = __get_C_uint32(base,size)
+    exponents = __get_C_uint32(exp, vec_size)
+    # pair every encoding with its exponent (passed on as a float)
+    fixed_points = [
+        FixedPointNumber(encodings[i], float(exponents[i]), n, max_int)
+        for i in range(vec_size)
+    ]
+    return fixed_points
+
+
+def __get_c_pen_storage_raw(pen, base, exp, vec_size, n):
+    # ciphertexts, bases and exponents as three plain ndarrays; n is not used
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    return ciphers, __get_C_uint32(base, vec_size), __get_C_uint32(
+        exp, vec_size)
+
+
+def __get_c_pen_storage_mp(pen, base, exp, vec_size, n, thread_num=4):
+    '''
+    Use multi-process to accelerate __get_C_pen process.
+
+    Since on Linux, python use fork to create sub-process,
+    thus the C memory space is shared between father and child processes.
+    And the whole process concerns no CUDA and cuda-context,
+    even the return result is in python object form.
+    So we can use multi-process for acceleration here safely
+    ---------------------------------
+    Para:
+        pen, base, exp: int, start addresses of the C memory spaces
+        vec_size:   int, the number of PEN to read; n is not used
+        thread_num: number of processes used in multi-processing
+    Return:
+        tuple, (ndarray, ndarray, ndarray)
+    '''
+    # spread vec_size over thread_num chunks whose sizes differ by at most
+    # one; the previous round()-based split could yield a negative last
+    # chunk (e.g. vec_size=5, thread_num=8) and read past the buffer
+    quota, extra = divmod(vec_size, thread_num)
+    job_cnt_list = [quota + (i < extra) for i in range(thread_num)]
+    job_idx_list = [sum(job_cnt_list[:i]) for i in range(thread_num)]
+    # for __get_C_pen, use multiprocess to accelerate; the with-block
+    # shuts the worker pool down once every chunk has been collected
+    with Executor() as executor:
+        futures = [
+            executor.submit(__get_C_pen, pen, job_idx_list[i], job_cnt_list[i])
+            for i in range(thread_num)
+        ]
+        res_list = [r.result() for r in futures]
+    res_pen = []
+    for res in res_list:
+        res_pen.extend(res)
+    # for uint32, no special demand for multiprocess
+    res_base = __get_C_uint32(base, vec_size)
+    res_exp = __get_C_uint32(exp, vec_size)
+    return np.asarray(res_pen), res_base, res_exp
+
+
+def __get_c_pen_storage(pen, base, exp, vec_size, n):
+    '''
+    Rebuild PaillierEncryptedNumber objects from the given memory spaces
+    ------------------
+    pen:  int, start address of a C memory space,
+               holding the PEN ciphertexts (bigint, CIPHER_BITS long)
+    base: int, start address of a C memory space,
+               holding the PEN bases (uint32); not read here
+    exp:  int, start address of a C memory space,
+               holding the PEN exponents (uint32)
+    vec_size:   int, the number of bigint
+    n:          int, passed to PaillierPublicKey to build the shared key
+
+    Return:
+    A ndarray, each element is a PaillierEncryptedNumber (PEN)
+    '''
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+
+    # one public key object is shared by every rebuilt number
+    shared_key = PaillierPublicKey(n)
+    encrypted = [
+        PaillierEncryptedNumber(
+            shared_key, ciphers[i], int(round(exponents[i]))
+        )
+        for i in range(vec_size)
+    ]
+
+    return np.asarray(encrypted)
+
+
+#######################################################################
+# #########################DEFINITION OF CLASSES#######################
+#######################################################################
+'''#############  the definition of functions and classes #################'''
+
+'''
+    TensorStorage.data Containing the address pointing to a double type
+    All the int32/int64 have been transformed to int64_t type
+    All the float32/float64 have been transformed to double type
+    We assume that TensorStorage has 2 types:
+    1. data is ndarray, calculation can be performed directly by ndarray.
+    2. data is C memory pointer, used for performing further encoding for
+       the lower bound
+'''
+
+
+class TensorStorage:
+    '''
+    Holder for plaintext tensors.
+    Supported element types
+    1. int32, int64 (kept as int64_t on the C side)
+    2. float32, float64 (kept as double on the C side)
+
+    Attributes:
+        data: ndarray or int,
+            1. ndarray: the values live in a python object
+            2. int: the values live in C memory and the int is the
+               start address of that memory
+        vec_size: int, the number of values held by this object,
+                       kept here because a raw C pointer carries no length
+        mem_type: int, memory type flag telling where the data is stored
+                       (one of the MEM_* constants)
+        data_type: int, INT64_TYPE or FLOAT_TYPE, the plaintext data type,
+                        kept here because a raw C pointer carries no dtype
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, data_type: int):
+        # ndarray views may be non-contiguous: store a contiguous array instead
+        if isinstance(data, np.ndarray):
+            data = np.ascontiguousarray(data)
+        self.data = data
+        self.vec_size = vec_size
+        self.mem_type = mem_type
+        self.data_type = data_type
+
+    def __str__(self):
+        return f"{self.__class__}:{self.data}"
+
+    def __del__(self):
+        te_free(self)
+
+
+class BigIntStorage:
+    '''
+    Used for store bigint objects; len() of an instance is its vec_size
+
+    Attributes:
+        bigint_storage: int, the start address of the C memory storing bigint
+        elem_size:      int, the size of the bigint,
+                            useless since we unified into CIPHER_BITS
+        vec_size:       int, the number of bigint stored in this class
+        mem_type:       int, memory type flag (MEM_* constant), where data is stored
+
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, elem_size: int):
+        # 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # the payload is kept as bigint_storage; there is no self.data
+        self.bigint_storage = data
+        self.elem_size = elem_size
+        self.vec_size = vec_size
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        bi_free(self)
+
+
+class FixedPointStorage:
+    '''
+    Bundle of three C memory start addresses describing a fpn array;
+    the addresses are meant to be passed straight to the C functions in FPGA_LIB
+    ------------------
+    Attributes:
+        bigint_storage: int, start address of C memory,
+                                holding the mantissas of the fpn array
+        base_storage:   int, start address of C memory,
+                                holding the bases of the fpn array
+        exp_storage:    int, start address of C memory,
+                                holding the exponents of the fpn array
+        vec_size:       int, the number of fpn held by this object,
+                                kept here because a C pointer carries no length
+        mem_type:       int, memory type flag telling where the data is stored
+                                (one of the MEM_* constants)
+        data_type:      int, INT64_TYPE or FLOAT_TYPE, the plaintext data type,
+                                kept here because a C pointer carries no dtype
+        encode_n, max_int: python int, the para used to encode the plaintext
+    '''
+
+    def __init__(
+            self,
+            bigint_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            n,
+            max_int,
+            mem_type: int,
+            data_type,
+    ):
+        # where the data lives -- 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # C memory addresses of the fpn payload, and its element count
+        self.bigint_storage = bigint_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        self.vec_size = vec_size
+        # plaintext data type, as tracked by TensorStorage
+        self.data_type = data_type
+        # encode/decode parameters:
+        # plain python ints, neither BigIntStorage nor ctypes objects
+        self.encode_n = n
+        self.max_int = max_int
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        fp_free(self)
+
+
+class PaillierEncryptedStorage:
+    '''
+    Bundle of three C memory start addresses describing a pen array;
+    the addresses are meant to be passed straight to the C functions in FPGA_LIB
+    --------------------
+    Attributes:
+        pen_storage:    int, start address of C memory,
+                                holding the mantissas of the pen array
+        base_storage:   int, start address of C memory,
+                                holding the bases of the pen array
+        exp_storage:    int, start address of C memory,
+                                holding the exponents of the pen array
+        vec_size:       int, the number of pen held by this object,
+                                kept here because a C pointer carries no length
+        mem_type:       int, memory type flag telling where the data is stored
+                                (one of the MEM_* constants)
+        data_type:      int, INT64_TYPE or FLOAT_TYPE, the plaintext data type,
+                                kept here because a C pointer carries no dtype
+        encode_n, encode_max_int: bigint, the para used to encode the plaintext
+    '''
+
+    def __init__(
+            self,
+            pen_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            mem_type: int,
+            data_type,
+            fpn_encode_n,
+            fpn_encode_max_int,
+    ):
+        # where the data lives -- 1:cpu/host  2:FPGA/device
+        self.mem_type = mem_type
+        # C memory addresses of the pen payload, and its element count
+        self.pen_storage = pen_storage
+        self.base_storage = base_storage
+        self.exp_storage = exp_storage
+        self.vec_size = vec_size
+        # plaintext data type, as tracked by TensorStorage
+        self.data_type = data_type
+        # encode/decode parameters of the underlying fixed-point numbers
+        self.encode_n = fpn_encode_n
+        self.encode_max_int = fpn_encode_max_int
+        # (no public-key parameters are stored on this object)
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        pi_free(self)
+
+
+class TensorShapeStorage:
+    '''
+    Shape descriptor for tensors of up to 2 dimensions;
+    to_tuple() yields the numpy-style shape tuple
+    -------------------
+    Attributes:
+        dim1: the 1st dim, aka the row (None when absent)
+        dim2: the 2nd dim, aka the col (None when absent)
+    '''
+
+    def __init__(self, dim1=None, dim2=None):
+        # every given dimension has to be an int; None means "absent"
+        for dim in (dim1, dim2):
+            if not (dim is None or isinstance(dim, int)):
+                raise TypeError("invalid dimension")
+        self.dim1 = dim1
+        self.dim2 = dim2
+
+    def size(self):
+        rows = self.dim1 if self.dim1 is not None else 1
+        cols = self.dim2 if self.dim2 is not None else 1
+        return rows * cols
+
+    def __getitem__(self, item):
+        return self.to_tuple()[item]
+
+    def __len__(self):
+        return len(self.to_tuple())
+
+    def to_tuple(self):
+        # a missing first dimension means "no shape at all"
+        if self.dim1 is None:
+            return ()
+        # a missing second dimension leaves a 1-dim shape
+        if self.dim2 is None:
+            return (self.dim1,)
+        return (self.dim1, self.dim2)
+
+    def from_tuple(self, v):
+        # overwrite both dimensions in place from a tuple-like v
+        rank = len(v)
+        if rank == 1:
+            self.dim1, self.dim2 = v[0], None
+        elif rank == 2:
+            self.dim1, self.dim2 = v[0], v[1]
+        else:
+            # any other length resets the shape to "empty"
+            self.dim1, self.dim2 = None, None
+        return self
+
+    def transpose(self):
+        return TensorShapeStorage(self.dim2, self.dim1)
+
+    def matmul(self, other):
+        return TensorShapeStorage(self.dim1, other.dim2)
+
+
+class PubKeyStorage:
+    '''
+    PaillierPublicKey fields converted to C-acceptable values
+    -------------
+    Attributes:
+       n, g, nsquare, max_int:
+            c_char_p wrapping the CIPHER_BYTE-long little-endian bytes
+            of the python int passed in under the same name
+    '''
+
+    def __init__(self, n, g, nsquare, max_int):
+        parts = (('n', n), ('g', g), ('nsquare', nsquare), ('max_int', max_int))
+        for attr, value in parts:
+            raw = value.to_bytes(CIPHER_BYTE, 'little')
+            setattr(self, attr, c_char_p(raw))
+
+
+class PrivKeyStorage:
+    '''
+    PaillierPrivateKey fields converted to C-acceptable values
+    ------------
+    Each attribute is a c_char_p over CIPHER_BYTE little-endian bytes
+    '''
+
+    def __init__(self, p, q, psquare, qsquare, q_inverse, hp, hq):
+        parts = (
+            ('p', p), ('q', q), ('psquare', psquare), ('qsquare', qsquare),
+            ('q_inverse', q_inverse), ('hp', hp), ('hq', hq),
+        )
+        for attr, value in parts:
+            raw = value.to_bytes(CIPHER_BYTE, 'little')
+            setattr(self, attr, c_char_p(raw))
+
+
+##########################################################################
+# ###############FUNCTION DEFINITION START################################
+##########################################################################
+
+
+def te_p2c_shape(shape, res):
+    '''
+    Store a shape tuple in a TensorShapeStorage object
+    -------------
+    Para:
+        shape:   tuple, with no more than 2 elements
+        res:     storage to fill in, or None to create a new one
+    Return:
+        the filled-in TensorShapeStorage
+    '''
+    # reuse the caller's storage when one is supplied
+    target = TensorShapeStorage() if res is None else res
+    target.from_tuple(shape)
+    return target
+
+
+def te_c2p_shape(shape):
+    '''
+    Give back the shape tuple held by a TensorShapeStorage
+    --------------
+    Para:   shape:   TensorShapeStorage
+    Return: tuple, the result of shape.to_tuple()
+    '''
+    return shape.to_tuple()
+
+
+def te_free(tes):
+    '''
+    Release the C memory held by a TensorStorage, if there is any
+    ------------
+    Para:   tes: TensorStorage
+    Return: None
+    '''
+    if not isinstance(tes.data, int):
+        return  # python-side data (list or ndarray): nothing to free
+    # an int is a C memory pointer: free it and drop the stale address
+    FPGA_LIB.c_free(c_void_p(tes.data))
+    tes.data = None
+
+
+def te_p2c(data, res=None):
+    '''
+    transmit the data storage form from Python to C
+    we assume data's structure has already been preserved by the upper layer
+    using the TensorShapeStorage class
+    ------------------
+    Args:
+        data, list or ndarray, the original data array
+              (int32, int64, float32 or float64 elements)
+        res,  optional TensorStorage whose C memory is reused as target
+    Return:
+        TensorStorage, and data is a C pointer
+    Raise:
+        TypeError:       data is neither a list nor a ndarray
+        PermissionError: the element type is not supported
+    '''
+    # flatten the current ndarray for get the actual vec_size
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    vec_size = data.size
+
+    # switch the different data types
+    if data.dtype in ('int32', 'int64'):
+        # int32 is widened to int64
+        data_type = INT64_TYPE
+        c_dtype = np.int64
+        setter = FPGA_LIB.int64_set
+    elif data.dtype in ('float32', 'float64'):
+        # float32 is widened to float64
+        data_type = FLOAT_TYPE
+        c_dtype = np.float64
+        setter = FPGA_LIB.float64_set
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # the C setter reads vec_size elements linearly from one address, so
+    # it must get a C-contiguous buffer: views such as a.T or a[::2] are
+    # copied here instead of exposing their strided base memory.
+    # new_data stays referenced until the setter call has returned
+    new_data = np.ascontiguousarray(data, dtype=c_dtype)
+
+    # malloc c memory space only after validation, so an unsupported
+    # dtype can no longer leak a freshly allocated buffer
+    if res is None:
+        storage_pointer = FPGA_LIB.c_malloc(c_size_t(vec_size * DOUBLE_BYTE))
+    else:
+        storage_pointer = res.data
+
+    setter(
+        c_void_p(storage_pointer),
+        new_data.ctypes.data_as(c_void_p),
+        c_size_t(vec_size))
+
+    return _te_init_store(
+        res,
+        storage_pointer,
+        vec_size,
+        MEM_FPGA_NUM_0,
+        data_type)
+
+
+def te_c2p(store):
+    '''
+    Copy the content of a TensorStorage from C memory back to Python;
+    the dtype of the returned array follows store.data_type
+    -----------
+    Para:
+        store: TensorStorage whose data is a C memory address
+    Return:
+        np.ndarray, float64 for FLOAT_TYPE and int64 for INT64_TYPE
+    '''
+    kind = store.data_type
+    if kind == FLOAT_TYPE:
+        reader, dtype = __get_C_double, np.float64
+    elif kind == INT64_TYPE:
+        reader, dtype = __get_C_int64, np.int64
+    else:
+        raise PermissionError("Invalid Data Type")
+    # read the raw values out of C memory first,
+    # then pin the numpy dtype of the result
+    raw_values = reader(store.data, store.vec_size)
+    return raw_values.astype(dtype)
+
+def te_c2bytes(data, res):
+    '''
+    transmit TensorStorage form from C to bytes stream.
+    Used for communication between sites, since C memory is not shared
+    --------------------
+    Para:
+        data: TensorStorage, data is a C memory ptr
+        res:  not used by this function; the result is returned instead
+    Return:
+        bytes, U_INT32_BYTE + DOUBLE_BYTE * vec_size long
+    '''
+    data_type = data.data_type
+    bytes_result = c_buffer(DOUBLE_BYTE * data.vec_size + U_INT32_BYTE)
+    # first 4 bytes: contains the data_type info
+    # remain bytes:  contains the data
+    FPGA_LIB.get_bytes(
+        cast(bytes_result, c_void_p),
+        c_char_p(data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_void_p(data.data),
+        c_size_t(data.vec_size),
+    )
+    return bytes_result.raw
+
+
+def fp_c2bytes(store, res):
+    '''
+    Serialize a FixedPointStorage into a bytes string.
+    Used for communication between sites, since C memory is not shared.
+    Besides the three C buffers, data_type, mem_type, encode_n and
+    max_int are handed to the C routine as well.
+    -----------------
+    Para:
+        store: FixedPointStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the buffer filled by FPGA_LIB.fp_get_bytes
+    '''
+    count = store.vec_size
+    # header: two uint32 (data_type, mem_type) and two bigint (encode_n, max_int)
+    header_size = U_INT32_BYTE * 2 + PLAIN_BYTE * 2
+    # every element: one bigint plus two uint32 (base, exp)
+    element_size = PLAIN_BYTE + U_INT32_BYTE * 2
+    out_buffer = c_buffer(element_size * count + header_size)
+    FPGA_LIB.fp_get_bytes(
+        cast(out_buffer, c_void_p),
+        c_char_p(store.data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+        c_void_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buffer.raw
+
+
+def pi_c2bytes(store, res):
+    '''
+    Serialize a PaillierEncryptedStorage into a bytes string.
+    Used for communication between sites, since C memory is not shared.
+    ----------------
+    Para:
+        store: PaillierEncryptedStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the buffer filled by FPGA_LIB.pi_get_bytes
+    '''
+    count = store.vec_size
+    # header: two uint32 (data_type, mem_type) and two bigint
+    # (encode_n, encode_max_int), each bigint written with CIPHER_BYTE bytes
+    header_size = U_INT32_BYTE * 2 + CIPHER_BYTE * 2
+    # every element: one ciphertext plus two uint32 (base, exp)
+    element_size = CIPHER_BYTE + U_INT32_BYTE * 2
+    out_buffer = c_buffer(element_size * count + header_size)
+    FPGA_LIB.pi_get_bytes(
+        cast(out_buffer, c_void_p),
+        c_char_p(store.data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.encode_n.to_bytes(CIPHER_BYTE, 'little')),
+        c_char_p(store.encode_max_int.to_bytes(CIPHER_BYTE, 'little')),
+        c_void_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buffer.raw
+
+
+def _te_init_store(store, data, vec_size, mem_type, data_type):
+    '''
+    Create a TensorStorage, or refill the one that is passed in.
+    -----------
+    Para:
+        store: TensorStorage or None, the return value; when None a new
+               object is created, otherwise the given one is updated in place
+        Other paras' definitions are equal to the ones in TensorStorage
+    Return:
+        TensorStorage, the new or the updated object
+    '''
+    if store is None:
+        return TensorStorage(data, vec_size, mem_type, data_type)
+    store.data, store.vec_size = data, vec_size
+    # a None mem_type keeps the mem_type the store already carries
+    if mem_type is not None:
+        store.mem_type = mem_type
+    store.data_type = data_type
+    return store
+
+
+def te_bytes2c(data, res):
+    '''
+    Rebuild a TensorStorage from a bytes string.
+    TensorStorage.data is a pointer to the restored C memory space.
+    -------------
+    Para:
+        data: bytes, the serialized form; its length minus U_INT32_BYTE
+              is treated as the size of the element data
+        res:  TensorStorage or None, the return value
+    Return:
+        TensorStorage, the struct restored from para.data
+    '''
+    payload_size = len(data) - U_INT32_BYTE
+    type_buffer = c_buffer(U_INT32_BYTE)
+    # reuse the memory of res when it is given, otherwise get a new space
+    # NOTE(review): res.data is assumed to be large enough for the
+    # payload, nothing in this function checks it
+    if res is not None:
+        target_pointer = res.data
+    else:
+        target_pointer = FPGA_LIB.c_malloc(c_size_t(payload_size))
+    FPGA_LIB.from_bytes_get_c(
+        cast(type_buffer, c_void_p),
+        c_void_p(target_pointer),
+        c_char_p(data),
+        c_size_t(payload_size),
+    )
+    # TODO: change according to different data_types,
+    # for now INT64 and DOUBLE are the only ones and both take 8 bytes
+    # (equal to DOUBLE_BYTE), so a single divisor is enough
+    element_count = payload_size // DOUBLE_BYTE
+    restored_type = int.from_bytes(type_buffer, 'little')
+    return _te_init_store(
+        res, target_pointer, element_count, MEM_FPGA_NUM_0, restored_type)
+
+
+def fp_bytes2c(data, res):
+    '''
+    Rebuild a FixedPointStorage from a bytes string.
+    ---------------
+    Para:
+        data: bytes, the serialized form
+        res:  FixedPointStorage or None, the return value
+    Return:
+        FixedPointStorage, the struct restored from para.data
+    NOTE(review): three new buffers are requested from c_malloc even when
+    res is given; the buffers res pointed to before are not freed here.
+    '''
+    # header: two uint32 and two bigint; every element: two uint32 and one bigint
+    header_size = 2 * (U_INT32_BYTE + PLAIN_BYTE)
+    element_size = U_INT32_BYTE * 2 + PLAIN_BYTE
+    count = (len(data) - header_size) // element_size
+    # receivers of the uint32 header fields
+    type_buffer = c_buffer(U_INT32_BYTE)
+    mem_buffer = c_buffer(U_INT32_BYTE)
+    # receivers of the bigint header fields
+    n_buffer = c_buffer(PLAIN_BYTE)
+    max_int_buffer = c_buffer(PLAIN_BYTE)
+    # C memory receiving the element data
+    fpn_pointer = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * count))
+    base_pointer = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+    exp_pointer = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+
+    FPGA_LIB.fp_from_bytes_get_c(
+        cast(type_buffer, c_void_p),
+        cast(mem_buffer, c_void_p),
+        cast(n_buffer, c_void_p),
+        cast(max_int_buffer, c_void_p),
+        cast(fpn_pointer, c_void_p),
+        cast(base_pointer, c_void_p),
+        cast(exp_pointer, c_void_p),
+        c_char_p(data),
+        c_size_t(count),
+    )
+    return _fp_init_store(
+        res,
+        fpn_pointer,
+        base_pointer,
+        exp_pointer,
+        count,
+        int.from_bytes(n_buffer, 'little'),
+        int.from_bytes(max_int_buffer, 'little'),
+        int.from_bytes(mem_buffer, 'little'),
+        int.from_bytes(type_buffer, 'little'),
+    )
+
+
+def pi_bytes2c(data, res):
+    '''
+    Rebuild a PaillierEncryptedStorage from a bytes string.
+    --------------
+    Para:
+        data: bytes, the serialized form
+        res:  PaillierEncryptedStorage or None, the return value
+    Return:
+        PaillierEncryptedStorage, the struct restored from para.data
+    NOTE(review): three new buffers are requested from c_malloc even when
+    res is given; the buffers res pointed to before are not freed here.
+    '''
+    # header: two uint32 and two bigint; every element: two uint32 and one ciphertext
+    header_size = 2 * (U_INT32_BYTE + CIPHER_BYTE)
+    element_size = U_INT32_BYTE * 2 + CIPHER_BYTE
+    count = (len(data) - header_size) // element_size
+    # receivers of the uint32 header fields
+    type_buffer = c_buffer(U_INT32_BYTE)
+    mem_buffer = c_buffer(U_INT32_BYTE)
+    # receivers of the bigint header fields
+    n_buffer = c_buffer(CIPHER_BYTE)
+    max_int_buffer = c_buffer(CIPHER_BYTE)
+    # C memory receiving the element data
+    pen_pointer = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * count))
+    base_pointer = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+    exp_pointer = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * count))
+
+    FPGA_LIB.pi_from_bytes_get_c(
+        cast(type_buffer, c_void_p),
+        cast(mem_buffer, c_void_p),
+        cast(n_buffer, c_void_p),
+        cast(max_int_buffer, c_void_p),
+        cast(pen_pointer, c_void_p),
+        cast(base_pointer, c_void_p),
+        cast(exp_pointer, c_void_p),
+        c_char_p(data),
+        c_size_t(count),
+    )
+    return _pi_init_store(
+        res,
+        pen_pointer,
+        base_pointer,
+        exp_pointer,
+        count,
+        int.from_bytes(mem_buffer, 'little'),
+        int.from_bytes(type_buffer, 'little'),
+        int.from_bytes(n_buffer, 'little'),
+        int.from_bytes(max_int_buffer, 'little'),
+    )
+
+
+def _te_init_shape(shape_store, shape_tuple):
+    '''
+    Fill a TensorShapeStorage from a tuple
+    ----------
+    Para:
+        shape_store: TensorShapeStorage or None, return value, default None;
+                     a new TensorShapeStorage is created when it is None
+        shape_tuple: tuple, at most 2 dim, source data of TensorShapeStorage
+    Return:
+        TensorShapeStorage, after from_tuple(shape_tuple) was called on it
+    '''
+    target = TensorShapeStorage() if shape_store is None else shape_store
+    target.from_tuple(shape_tuple)
+    return target
+
+
+def _te_init_ss(
+        res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
+):
+    '''
+    Initialize a TensorStorage and its TensorShapeStorage in one call
+    ------------
+    Para:
+        res_store:   TensorStorage or None, the return value, default None
+        res_data:    int or ndarray
+        vec_size:    int
+        res_shape:   TensorShapeStorage or None, the return value, default None
+        shape_tuple: tuple, at most 2 dim
+        mem_type:    int
+        data_type:   int
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    # the store is initialized first, then the shape
+    store = _te_init_store(res_store, res_data, vec_size, mem_type, data_type)
+    shape = _te_init_shape(res_shape, shape_tuple)
+    return store, shape
+
+
+'''''' '''
+The following calculators operate on TensorStorage.
+Their definitions and outputs are the same as numpy's.
+TensorStorage.data must always be an ndarray in order to support numpy.
+
+NOT USED IN OUR FATE IMPLEMENTATION,
+but Webank's implementation seems to have used them.
+''' ''''''
+
+
+def te_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    Slice store.data along one axis with the range start:stop.
+    ------------
+    Para:
+        store:     TensorStorage, store.data is indexed like an ndarray
+        shape:     not used
+        start, stop: the bounds of the slice
+        axis:      0 slices the first dimension, 1 slices the second one
+        res_store: TensorStorage or None, the return value
+        res_shape: TensorShapeStorage or None, the return value
+        stream:    not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    Raise:
+        NotImplementedError, for any other axis
+    '''
+    if axis == 0:
+        piece = store.data[start:stop]
+    elif axis == 1:
+        piece = store.data[:, start:stop]
+    else:
+        raise NotImplementedError()
+    return _te_init_ss(
+        res_store, piece, piece.size, res_shape, piece.shape,
+        store.mem_type, store.data_type,
+    )
+
+
+def te_cat(stores, axis, res_store, res_shape):
+    '''
+    Concatenate the data of several TensorStorage.
+    ------------
+    Para:
+        stores:    list of TensorStorage
+        axis:      0 stacks with np.vstack, 1 stacks with np.hstack
+        res_store: TensorStorage or None, the return value
+        res_shape: TensorShapeStorage or None, the return value
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from the first element of stores
+    Raise:
+        NotImplementedError, for any other axis
+    '''
+    if axis == 0:
+        stack = np.vstack
+    elif axis == 1:
+        stack = np.hstack
+    else:
+        raise NotImplementedError()
+    merged = stack([item.data for item in stores])
+    return _te_init_ss(
+        res_store, merged, merged.size, res_shape, merged.shape,
+        stores[0].mem_type, stores[0].data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_pow(left_store, right, left_shape, res_store, res_shape, stream):
+    '''
+    Raise left_store.data to the power right with the ** operator.
+    ------------
+    Para:
+        left_store: TensorStorage, the base
+        right:      the exponent, used as it is
+        left_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    powered = left_store.data ** right
+    return _te_init_ss(
+        res_store, powered, powered.size, res_shape, powered.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_add(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Add the data of two TensorStorage with the + operator.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, the two operands
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    total = left_store.data + right_store.data
+    return _te_init_ss(
+        res_store, total, total.size, res_shape, total.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_mul(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Multiply the data of two TensorStorage with the * operator.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, the two operands
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    product = left_store.data * right_store.data
+    return _te_init_ss(
+        res_store, product, product.size, res_shape, product.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_truediv(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Divide the data of two TensorStorage with the / operator.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, dividend and divisor
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type is taken from
+        left_store, data_type is always FLOAT_TYPE
+    '''
+    quotient = left_store.data / right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size, res_shape, quotient.shape,
+        left_store.mem_type, FLOAT_TYPE,
+    )
+
+
+def te_floordiv(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Floor-divide the data of two TensorStorage with the // operator.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, dividend and divisor
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type is taken from
+        left_store, data_type is always INT64_TYPE
+    NOTE(review): the result is tagged INT64_TYPE whatever the dtype of
+    the operands is - confirm this is wanted for float inputs.
+    '''
+    quotient = left_store.data // right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size, res_shape, quotient.shape,
+        left_store.mem_type, INT64_TYPE,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_sub(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Subtract the data of right_store from the data of left_store.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, the two operands
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    difference = left_store.data - right_store.data
+    return _te_init_ss(
+        res_store, difference, difference.size, res_shape, difference.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_matmul(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Matrix-multiply the data of two TensorStorage with the @ operator.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, the two operands
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    product = left_store.data @ right_store.data
+    return _te_init_ss(
+        res_store, product, product.size, res_shape, product.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_abs(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Apply abs() to the data of a TensorStorage.
+    ------------
+    Para:
+        left_store: TensorStorage, the operand
+        left_shape: TensorShapeStorage, its tuple form becomes the result shape
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); vec_size, mem_type and
+        data_type are taken from left_store
+    '''
+    magnitude = abs(left_store.data)
+    return _te_init_ss(
+        res_store, magnitude, left_store.vec_size, res_shape,
+        left_shape.to_tuple(), left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_neg(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Negate the data of a TensorStorage with the unary - operator.
+    ------------
+    Para:
+        left_store: TensorStorage, the operand
+        left_shape: TensorShapeStorage, its tuple form becomes the result shape
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); vec_size, mem_type and
+        data_type are taken from left_store
+    '''
+    negated = -left_store.data
+    return _te_init_ss(
+        res_store, negated, left_store.vec_size, res_shape,
+        left_shape.to_tuple(), left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose the data of a TensorStorage by calling its transpose().
+    ------------
+    Para:
+        left_store: TensorStorage, the operand
+        left_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    flipped = left_store.data.transpose()
+    return _te_init_ss(
+        res_store, flipped, flipped.size, res_shape, flipped.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_sum(left_store, left_shape, axis, res_store, res_shape, stream):
+    '''
+    Sum the data of a TensorStorage by calling its sum(axis=axis).
+    ------------
+    Para:
+        left_store: TensorStorage, the operand
+        left_shape: not used
+        axis:       forwarded to sum() as it is
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    reduced = left_store.data.sum(axis=axis)
+    return _te_init_ss(
+        res_store, reduced, reduced.size, res_shape, reduced.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''
+    Reshape the data of a TensorStorage.
+    ------------
+    Para:
+        store:     TensorStorage, store.data is reshaped
+        shape:     not used
+        new_shape: TensorShapeStorage, the wanted shape
+        res_store: TensorStorage or None, the return value
+        res_shape: TensorShapeStorage or None, the return value
+        stream:    not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); vec_size, mem_type and
+        data_type are taken from store
+    '''
+    # reshape() needs the dimensions themselves, not the shape object,
+    # so convert once and use the same tuple for data and shape
+    target_dims = new_shape.to_tuple()
+    return _te_init_ss(
+        res_store,
+        store.data.reshape(target_dims),
+        store.vec_size,
+        res_shape,
+        target_dims,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def te_exp(store, shape, res_store, res_shape, stream):
+    '''
+    Apply np.exp to the data of a TensorStorage.
+    ------------
+    Para:
+        store:     TensorStorage, the operand
+        shape:     TensorShapeStorage, its tuple form becomes the result shape
+        res_store: TensorStorage or None, the return value
+        res_shape: TensorShapeStorage or None, the return value
+        stream:    not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); vec_size and mem_type
+        are taken from store, data_type is always FLOAT_TYPE
+    '''
+    exponential = np.exp(store.data)
+    return _te_init_ss(
+        res_store, exponential, store.vec_size, res_shape,
+        shape.to_tuple(), store.mem_type, FLOAT_TYPE,
+    )
+
+
+def te_hstack(
+        left_store, right_store, left_shape, right_shape,
+        res_store, res_shape, stream):
+    '''
+    Stack two TensorStorage horizontally, through te_cat with axis 1.
+    ------------
+    Para:
+        left_store, right_store: TensorStorage, the two operands
+        left_shape, right_shape: not used
+        res_store:  TensorStorage or None, the return value
+        res_shape:  TensorShapeStorage or None, the return value
+        stream:     not used
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage); mem_type and data_type
+        are taken from left_store
+    '''
+    # local names differ from the parameters to avoid a naming collision
+    joined_store, joined_shape = te_cat(
+        [left_store, right_store], 1, res_store, res_shape)
+    return _te_init_ss(
+        res_store,
+        joined_store.data,
+        joined_store.vec_size,
+        joined_shape,
+        joined_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_c2p_first(store):
+    '''
+    Return the first element of a TensorStorage kept in C memory.
+    The whole storage is read back first, then element 0 is taken.
+    ---------------
+    Para:
+        store: TensorStorage, store.data must be a pointer to C memory
+    Return:
+        np.float64 for FLOAT_TYPE, np.int64 for INT64_TYPE
+    Raise:
+        PermissionError, when store.data_type is neither of the two
+    '''
+    kind = store.data_type
+    if kind == FLOAT_TYPE:
+        values = __get_C_double(store.data, store.vec_size)
+        return values.astype(np.float64)[0]
+    if kind == INT64_TYPE:
+        values = __get_C_int64(store.data, store.vec_size)
+        return values.astype(np.int64)[0]
+    raise PermissionError("Invalid Data Type")
+
+
+def bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Get a C buffer of vec_size * elem_size bytes from FPGA_LIB.c_malloc
+    and wrap it as a BigIntStorage.
+    ------------
+    Para:
+        res:       BigIntStorage or None, the return value
+        vec_size:  int, the number of elements
+        elem_size: int, the size of one element in bytes
+        mem_type:  int, stored on the result as it is
+    Return:
+        BigIntStorage
+    '''
+    buffer_pointer = FPGA_LIB.c_malloc(c_size_t(vec_size * elem_size))
+    return _bi_init_store(res, buffer_pointer, vec_size, elem_size, mem_type)
+
+
+'''################ malloc a space with size elements ############### '''
+'''
+    function: allocate space and form a new PaillierEncryptedStorage object
+    res:    PaillierEncryptedStorage or None, the return value; it is made up
+            of the 3 parts that are needed for a PaillierEncryptedStorage
+    size:   the number of elements that need to be allocated
+    return: a PaillierEncryptedStorage object wrapping the 3 allocated parts
+'''
+
+
+def pi_alloc(res, size, mem_type):
+    '''
+    Get the three C buffers of a PaillierEncryptedStorage with room for
+    size elements and wrap them.
+    ------------
+    Para:
+        res:      PaillierEncryptedStorage or None, the return value
+        size:     int, the number of elements
+        mem_type: int, stored on the result as it is
+    Return:
+        PaillierEncryptedStorage, its data_type, encode_n and
+        encode_max_int are all set to 0
+    '''
+    pen_pointer = FPGA_LIB.c_malloc(c_size_t(size * CIPHER_BYTE))
+    base_pointer = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    exp_pointer = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    # the three zeros are data_type, encode_n and encode_max_int
+    return _pi_init_store(
+        res, pen_pointer, base_pointer, exp_pointer, size, mem_type, 0, 0, 0)
+
+
+def fp_alloc(res, size, mem_type):
+    '''
+    Get the three C buffers of a FixedPointStorage with room for
+    size elements and wrap them.
+    ------------
+    Para:
+        res:      FixedPointStorage or None, the return value
+        size:     int, the number of elements
+        mem_type: int, stored on the result as it is
+    Return:
+        FixedPointStorage, its encode_n, max_int and data_type are all
+        set to 0
+    '''
+    fpn_pointer = FPGA_LIB.c_malloc(c_size_t(size * PLAIN_BYTE))
+    base_pointer = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    exp_pointer = FPGA_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    # the zeros are n and max_int (before mem_type) and data_type (after it)
+    return _fp_init_store(
+        res, fpn_pointer, base_pointer, exp_pointer, size, 0, 0, mem_type, 0)
+
+
+def te_alloc(res, size, mem_type):
+    '''
+    Get a C buffer of size * DOUBLE_BYTE bytes from FPGA_LIB.c_malloc
+    and wrap it as a TensorStorage.
+    ------------
+    Para:
+        res:      TensorStorage or None, the return value
+        size:     int, the number of elements
+        mem_type: int, stored on the result unless it is None
+    Return:
+        TensorStorage, its data_type is set to 0
+    '''
+    buffer_pointer = FPGA_LIB.c_malloc(c_size_t(size * DOUBLE_BYTE))
+    return _te_init_store(res, buffer_pointer, size, mem_type, 0)
+
+
+def pi_free(ptr):
+    '''
+    The delete function of PaillierEncryptedStorage.
+    The three C buffers are passed to FPGA_LIB.c_free, then the three
+    attributes are reset to None. mem_type is not looked at here.
+    --------------
+    Para:
+        ptr: PaillierEncryptedStorage
+    '''
+    for pointer in (ptr.pen_storage, ptr.base_storage, ptr.exp_storage):
+        FPGA_LIB.c_free(c_void_p(pointer))
+    ptr.pen_storage = None
+    ptr.base_storage = None
+    ptr.exp_storage = None
+
+
+# Host2Device and Device2Host calculators are not implemented on FPGA currently
+def pi_d2h(target, src, size, stream):
+    '''
+    Device2Host placeholder: nothing is copied, src is returned as it is.
+    target, size and stream are not used.
+    '''
+    return src
+
+
+def pi_h2d(target, src, size, stream):
+    '''
+    Host2Device placeholder: nothing is copied, src is returned as it is.
+    target, size and stream are not used.
+    '''
+    return src
+
+
+def pi_h2d_pub_key(target, src):
+    '''
+    Host2Device placeholder for the public key: src is returned as it is.
+    target is not used.
+    '''
+    return src
+
+
+def pi_h2d_priv_key(target, src):
+    '''
+    Host2Device placeholder for the private key: src is returned as it is.
+    target is not used.
+    '''
+    return src
+
+
+def pi_p2c_pub_key(target, src):
+    '''
+    Transfer Python form PaillierPublicKey to C form PubKeyStorage,
+    the latter can be used for C/FPGA computing
+    ------------
+    Para:
+        target: ignored, a new PubKeyStorage is always created
+        src:    the Python key; its n, g, nsquare and max_int are read
+    Return:
+        PubKeyStorage
+    '''
+    return PubKeyStorage(src.n, src.g, src.nsquare, src.max_int)
+
+
+def pi_p2c_priv_key(target, src):
+    '''
+    Transfer Python form PaillierPrivateKey to C form PrivKeyStorage,
+    the latter can be used for C/FPGA computing
+    ------------
+    Para:
+        target: ignored, a new PrivKeyStorage is always created
+        src:    the Python key; its p, q, psquare, qsquare, q_inverse,
+                hp and hq are read
+    Return:
+        PrivKeyStorage
+    '''
+    return PrivKeyStorage(
+        src.p,
+        src.q,
+        src.psquare,
+        src.qsquare,
+        src.q_inverse,
+        src.hp,
+        src.hq,
+    )
+
+
+# ###########PaillierEncrypted STORAGE INITIALIZE#################
+def _pi_init_store(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        mem_type, data_type, encode_n, encode_max_int):
+    '''
+    Create a PaillierEncryptedStorage, or refill the one that is passed in
+    ---------------
+    Para:
+        res_store: PaillierEncryptedStorage or None, return value, default
+                   None; when None a new object is created, otherwise the
+                   given one is updated in place
+        The other paras are identical to the ones described in
+        PaillierEncryptedStorage
+    Return:
+        PaillierEncryptedStorage, the new or the updated object
+    '''
+    if res_store is None:
+        return PaillierEncryptedStorage(
+            pen_storage, base_storage, exp_storage, vec_size,
+            mem_type, data_type, encode_n, encode_max_int,
+        )
+    # the three C buffers, their element count and where they live
+    res_store.pen_storage = pen_storage
+    res_store.base_storage = base_storage
+    res_store.exp_storage = exp_storage
+    res_store.vec_size = vec_size
+    res_store.mem_type = mem_type
+    # needed by TensorStorage
+    res_store.data_type = data_type
+    # needed by FixedPointNumber
+    res_store.encode_n = encode_n
+    res_store.encode_max_int = encode_max_int
+    return res_store
+
+
+_pi_init_shape = _te_init_shape
+
+
+def _pi_init_ss(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        res_shape, res_shape_tuple,
+        mem_type, data_type, encode_n, encode_max_int):
+    '''
+    Initialize a PaillierEncryptedStorage and the corresponding
+    TensorShapeStorage in one call.
+    Paras are identical to _pi_init_store & _pi_init_shape
+    ---------------
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    # the store is initialized first, then the shape
+    store = _pi_init_store(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        mem_type, data_type, encode_n, encode_max_int,
+    )
+    shape = _pi_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+''' transfer PEN tensor from Python memory to C memory '''
+
+
+def pi_p2c(target, src, data_type=FLOAT_TYPE):
+    '''
+    Copy a list of PaillierEncryptedNumber into a
+    C-memory style PaillierEncryptedStorage
+    --------------------
+    Para:
+        target:     PaillierEncryptedStorage or None, return value; when
+                    given, its three buffers receive the data
+        src:        list or ndarray, each element is a PaillierEncryptedNumber;
+                    the public key of the first element supplies n and max_int
+        data_type:  int, src's original datatype, default FLOAT_TYPE
+    Return:
+        PaillierEncryptedStorage, with mem_type MEM_FPGA_NUM_0
+    Raise:
+        TypeError, when src is neither a list nor an ndarray
+    '''
+    if isinstance(src, list):
+        src = np.array(src)
+    if not isinstance(src, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    flat = src.flatten()
+    count = flat.size
+    # reuse the buffers of target when it is given, otherwise get new ones
+    if target is not None:
+        pen_pointer = target.pen_storage
+        base_pointer = target.base_storage
+        exp_pointer = target.exp_storage
+    else:
+        pen_pointer = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        base_pointer = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        exp_pointer = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+    # the two encoding parameters come from the first element
+    first_key = flat[0].public_key
+    n = first_key.n
+    max_int = first_key.max_int
+    # The bigints in the ndarray are Python objects, the array only holds
+    # references to them, so they are not stored continuously.
+    # Each ciphertext is therefore converted and copied one at a time.
+    exponents = []
+    for index, number in enumerate(flat):
+        cipher_bytes = number.ciphertext(False).to_bytes(CIPHER_BYTE, 'little')
+        FPGA_LIB.bigint_set(
+            c_char_p(pen_pointer),
+            c_char_p(cipher_bytes),
+            c_size_t(CIPHER_BITS),
+            c_size_t(index))
+        exponents.append(number.exponent)
+    bases = [PEN_BASE] * count
+    # base and exp are copied into the C buffers in order to prevent a
+    # potential double free
+    base_array = np.asarray(bases)
+    exp_array = np.asarray(exponents)
+    FPGA_LIB.unsigned_set(
+        c_void_p(base_pointer),
+        base_array.ctypes.data_as(c_void_p),
+        c_size_t(count))
+    FPGA_LIB.unsigned_set(
+        c_void_p(exp_pointer),
+        exp_array.ctypes.data_as(c_void_p),
+        c_size_t(count))
+    return _pi_init_store(
+        target,
+        pen_pointer,
+        base_pointer,
+        exp_pointer,
+        count,
+        MEM_FPGA_NUM_0,
+        data_type,
+        n,
+        max_int,
+    )
+
+
+def _bi_init_store(res_store, data, count, elem_size, mem_type):
+    '''
+    Create a BigIntStorage, or refill the one that is passed in
+    ---------------
+    Para:
+        res_store: BigIntStorage or None, the return value
+        data:      stored as bigint_storage
+        count:     stored as vec_size
+        elem_size: stored as elem_size
+        mem_type:  stored as mem_type
+    Return:
+        BigIntStorage, the new or the updated object
+    '''
+    if res_store is None:
+        # the constructor takes mem_type before elem_size
+        return BigIntStorage(data, count, mem_type, elem_size)
+    res_store.bigint_storage = data
+    res_store.vec_size = count
+    res_store.mem_type = mem_type
+    res_store.elem_size = elem_size
+    return res_store
+
+
+_bi_init_shape = _te_init_shape
+
+
+def _bi_init_ss(
+        res_store, res_data, vec_size, res_shape, res_shape_tuple,
+        elem_size, mem_type):
+    '''
+    Initialize a BigIntStorage and the corresponding TensorShapeStorage
+    in one call.
+    ---------------
+    Return:
+        tuple, (BigIntStorage, TensorShapeStorage)
+    '''
+    # the store is initialized first, then the shape
+    store = _bi_init_store(res_store, res_data, vec_size, elem_size, mem_type)
+    shape = _bi_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+def _fp_init_store(
+        res_store, fpn_storage, base_storage, exp_storage, vec_size,
+        n, max_int, mem_type, data_type):
+    '''
+    Create a FixedPointStorage, or refill the one that is passed in.
+    ---------------
+    Para:
+        res_store: FixedPointStorage or None, the return value; when None
+                   a new object is created, otherwise the given one is
+                   updated in place
+        The other paras are identical to the elements in FixedPointStorage
+    Return:
+        FixedPointStorage, the new or the updated object
+    '''
+    if res_store is None:
+        return FixedPointStorage(
+            fpn_storage, base_storage, exp_storage, vec_size,
+            n, max_int, mem_type, data_type,
+        )
+    # the three C buffers, their element count and where they live
+    res_store.bigint_storage = fpn_storage
+    res_store.base_storage = base_storage
+    res_store.exp_storage = exp_storage
+    res_store.vec_size = vec_size
+    res_store.mem_type = mem_type
+    # para needed by TensorStorage
+    res_store.data_type = data_type
+    # paras needed by En/Decode
+    res_store.encode_n = n
+    res_store.max_int = max_int
+    return res_store
+
+
+def _fp_init_ss(
+        res_store, fpn_storage, base_storage, exp_storage, vec_size,
+        n, max_int, res_shape, res_shape_tuple, mem_type, data_type):
+    '''
+    Initialize a FixedPointStorage and the corresponding
+    TensorShapeStorage in one call.
+    ---------------
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    '''
+    # the store is initialized first, then the shape
+    store = _fp_init_store(
+        res_store, fpn_storage, base_storage, exp_storage, vec_size,
+        n, max_int, mem_type, data_type,
+    )
+    shape = _te_init_shape(res_shape, res_shape_tuple)
+    return store, shape
+
+
+def __get_FPGA_device_num(device_type: int):
+    '''
+    Give the physical number of the FPGA device for a mem_type.
+    ----------------
+    Para:
+        device_type: int, the mem_type stored in a Storage object; this
+                     value is shared with GPU and CPU types
+    Return:
+        int, always 0 for now
+
+    The intended mapping is currently disabled. It took device_type % 10
+    when MIN_FPGA <= device_type <= MAX_FPGA, and raised
+    PermissionError("DEVICE TYPE IS NOT FPGA!") otherwise.
+    '''
+    return 0
+
+
+def pi_encrypt(pub_key, fps, res=None, stream=None):
+    '''
+    Perform paillier encryption for a FixedPointStorage.
+    The C routine used is encrypt_without_obf, i.e. raw encrypt
+    with no obfuscation.
+    ----------------
+    Para:
+        pub_key: public key storage; its n, g, nsquare and max_int are
+                 passed to the C routine as they are
+        fps:     FixedPointStorage, fpn value waiting to be encrypted
+        res:     None or PaillierEncryptedStorage, return value, default
+                 None; when given, its three buffers receive the output
+        stream:  None, currently not used
+    Return:
+        PaillierEncryptedStorage, the encrypted value; mem_type, data_type,
+        encode_n and max_int are copied from fps
+    '''
+    count = fps.vec_size
+
+    # reuse the buffers of res when it is given, otherwise get new ones
+    if res is not None:
+        out_pen = res.pen_storage
+        out_base = res.base_storage
+        out_exp = res.exp_storage
+    else:
+        out_pen = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(count * U_INT32_BYTE))
+
+    # the C-level function wants the actual FPGA device number
+    device_number = __get_FPGA_device_num(fps.mem_type)
+    FPGA_LIB.encrypt_without_obf(
+        c_char_p(fps.bigint_storage),
+        c_void_p(fps.base_storage),
+        c_void_p(fps.exp_storage),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_size_t(device_number),
+    )
+    return _pi_init_store(
+        res,
+        out_pen,
+        out_base,
+        out_exp,
+        count,
+        fps.mem_type,
+        fps.data_type,
+        fps.encode_n,
+        fps.max_int,
+    )
+
+
+def pi_decrypt(pub_key, priv_key, pes, res=None, stream=None):
+    '''
+    perform decryption and decode as a whole
+    ---------------------
+    Para:
+        pub_key:   public key storage; its n, g, nsquare and max_int
+                   are passed to FPGA_LIB.decrypt
+        priv_key:  private key storage; its p, q, psquare, qsquare,
+                   q_inverse, hp and hq are passed to FPGA_LIB.decrypt
+        pes:       PaillierEncryptedStorage, pens waiting to be decrypted
+        res:       TensorStorage or None, the return value;
+                   it is handed to fp_decode
+        stream:    None, only forwarded to fp_decode
+    Return:
+        TensorStorage, the decrypted then decoded value,
+        as returned by fp_decode
+    NOTE(review): the three intermediate buffers obtained from c_malloc
+    below are not freed in this function; presumably FixedPointStorage or
+    fp_decode takes care of them - confirm.
+    '''
+    src_pen = pes.pen_storage
+    src_base = pes.base_storage
+    src_exp = pes.exp_storage
+    vec_size = pes.vec_size
+    '''malloc space for the return FixedPointStorage'''
+    res_fpn = FPGA_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+    res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    '''call the decrypt function'''
+    FPGA_dev_num = __get_FPGA_device_num(pes.mem_type)
+    # argument order: source buffers, result buffers, public key fields,
+    # private key fields, bit widths, element count, device number
+    FPGA_LIB.decrypt(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        priv_key.p,
+        priv_key.q,
+        priv_key.psquare,
+        priv_key.qsquare,
+        priv_key.q_inverse,
+        priv_key.hp,
+        priv_key.hq,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_size_t(FPGA_dev_num),
+    )
+    '''call the decode function'''
+    # the intermediate FixedPointStorage: the middle memory space used
+    # after decrypt and before decode; the encoding parameters, mem_type
+    # and data_type are copied from pes
+    decrypt_store = FixedPointStorage(
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.encode_n,
+        pes.encode_max_int,
+        pes.mem_type,
+        pes.data_type,
+    )
+    return fp_decode(decrypt_store, res, stream)
+
+
+def pi_obfuscate(pub_key, pes, obf_seeds, res, stream):
+    '''
+    apply obfuscation to a PaillierEncryptedStorage using the
+    obfuscation seed given, actually a mulmod
+    ----------------------
+    Para:
+        pub_key:   public key storage; its n, g, nsquare and max_int
+                   are passed to the C routine
+        pes:       PaillierEncryptedStorage, raw pen that has not been
+                   obfuscated yet
+        obf_seeds: BigIntStorage, random bigint generated by pi_gen_obf_seed
+        res:       PaillierEncryptedStorage or None, the obfuscated return
+                   value; when given, its three buffers receive the output
+        stream:    not used
+    Return:
+        PaillierEncryptedStorage, the same as res when res is given;
+        mem_type, data_type, encode_n and encode_max_int are copied from pes
+    '''
+    # get the pen storage data
+    src_pen = pes.pen_storage
+    src_base = pes.base_storage
+    src_exp = pes.exp_storage
+    vec_size = pes.vec_size
+    # get the bigint random ptr
+    obf_rand = obf_seeds.bigint_storage
+    '''initialize the result space'''
+    if res is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res.pen_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    '''run the modular mul function'''
+    # we will do the obfs on the device same as pes's device
+    # Although the obfs_seed may be generated on another device
+    # But since all datas are stored in CPU memory, this won't be a serious
+    # problem
+    FPGA_dev_num = __get_FPGA_device_num(pes.mem_type)
+    # argument order: source buffers, obfuscation seeds, result buffers,
+    # public key fields, two bit widths (CIPHER_BITS is passed for both),
+    # element count, device number
+    FPGA_LIB.obf_modular_multiplication(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(obf_rand),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(CIPHER_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_size_t(FPGA_dev_num),
+    )
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.mem_type,
+        pes.data_type,
+        pes.encode_n,
+        pes.encode_max_int,
+    )
+
+
+def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
+    '''
+    Produce obfuscation seeds for later encryption: random big integers
+    come from bi_gen_rand and are then passed through the library routine
+    obf_modular_exponentiation together with the public key.
+    --------------
+    Para:
+        res_store:   BigIntStorage or None, destination of the seeds; when
+                     None a buffer of count * CIPHER_BYTE bytes is malloc'ed
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        count:       int, number of seeds to generate
+        elem_size:   int, element length given to bi_gen_rand and recorded
+                     in the returned storage
+        rand_seed:   seed handed to bi_gen_rand
+        stream:      only forwarded to bi_gen_rand
+    Return:
+        BigIntStorage, the value _bi_init_store returns for res_store
+    '''
+    # random operands; the storage object stays referenced until we return
+    random_store = bi_gen_rand(elem_size, count, None, rand_seed, stream)
+    random_buf = random_store.bigint_storage
+    # destination: reuse the caller's buffer or allocate a new one
+    if res_store is not None:
+        seed_buf = res_store.bigint_storage
+    else:
+        seed_buf = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    # the device is derived from the memory type of the random storage
+    device_id = __get_FPGA_device_num(random_store.mem_type)
+    # NOTE(review): the second argument is hard-coded to 1024 and does not
+    # follow elem_size (presumably the bit width of each random operand) -
+    # confirm against the C library
+    FPGA_LIB.obf_modular_exponentiation(
+        c_char_p(random_buf),
+        c_size_t(1024),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_char_p(seed_buf),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_size_t(device_id),
+    )
+    return _bi_init_store(res_store, seed_buf, count, MEM_DEVICE, elem_size)
+
+
+def pi_gen_obf_seed_gmp(res_store, pub_key, count, elem_size, stream):
+    '''
+    Produce obfuscation seeds like pi_gen_obf_seed, but the random big
+    integers are drawn by the library routine gmp_random instead of
+    bi_gen_rand. The randoms are then passed through
+    obf_modular_exponentiation together with the public key.
+    --------------
+    Para:
+        res_store:   BigIntStorage or None, destination of the seeds; when
+                     None a buffer of count * CIPHER_BYTE bytes is malloc'ed
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        count:       int, number of seeds to generate
+        elem_size:   accepted but not used; the returned storage is always
+                     initialised with an element size of 2048 // 8
+        stream:      accepted but not used
+    Return:
+        BigIntStorage, the value _bi_init_store returns for res_store
+    '''
+    rand_bits = 1024
+    # scratch buffer that receives the raw random numbers
+    # NOTE(review): this buffer is not released anywhere in this function -
+    # confirm whether it is freed elsewhere or leaks on every call
+    random_buf = FPGA_LIB.c_malloc(c_size_t(count * rand_bits // 8))
+    FPGA_LIB.gmp_random(
+        c_char_p(random_buf),
+        c_size_t(rand_bits),
+        c_size_t(rand_bits),
+        c_size_t(rand_bits),
+        c_size_t(count),
+        pub_key.n,
+    )
+    # destination: reuse the caller's buffer or allocate a new one
+    if res_store is not None:
+        seed_buf = res_store.bigint_storage
+    else:
+        seed_buf = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    # this variant always runs on device number 0
+    device_id = 0
+    FPGA_LIB.obf_modular_exponentiation(
+        c_char_p(random_buf),
+        c_size_t(rand_bits),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_char_p(seed_buf),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_size_t(device_id),
+    )
+    return _bi_init_store(res_store, seed_buf, count, MEM_DEVICE, 2048 // 8)
+
+
+def __shape_decompose(shape):
+    '''
+    Turn a TensorShapeStorage into a (rows, cols) pair for the FPGA calls.
+
+    A 0-D shape becomes (1, 1), a 1-D shape (n,) becomes (1, n) and a 2-D
+    shape is returned unchanged.
+
+    WARNING:
+    the pair is not the numpy shape; callers have to convert the result
+    back to the numpy-style shape after computing
+
+    Raise:
+        PermissionError, if the shape has more than two dimensions
+    '''
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank > 2:
+        raise PermissionError("Invalid Shape")
+    if rank == 2:
+        return dims[0], dims[1]
+    if rank == 1:
+        return 1, dims[0]
+    return 1, 1
+
+
+def __shape_resolve(shape_1, shape_2):
+    '''
+    Check whether shape_1 and shape_2 can be aligned (with broadcast) and
+    work out the shape of the element-wise result.
+
+    Return:
+        tuple (P, Q, R, S, res_shape_tuple): (P, Q) and (R, S) are the
+        (rows, cols) pairs of shape_1 and shape_2 from __shape_decompose,
+        res_shape_tuple is the broadcast result shape with as many
+        dimensions as the higher-ranked input
+    Raise:
+        PermissionError, if a dimension pair is neither equal nor contains
+                         a 1, or if a shape has more than two dimensions
+    '''
+    rows_1, cols_1 = __shape_decompose(shape_1)
+    rows_2, cols_2 = __shape_decompose(shape_2)
+    rank = max(len(shape_1.to_tuple()), len(shape_2.to_tuple()))
+
+    def broadcastable(dim_a, dim_b):
+        # two extents align when they match or one of them is 1
+        return dim_a == dim_b or dim_a == 1 or dim_b == 1
+
+    if not (broadcastable(rows_1, rows_2) and broadcastable(cols_1, cols_2)):
+        raise PermissionError("shape cannot align", shape_1, shape_2)
+
+    if rank == 0:
+        merged = ()
+    elif rank == 1:
+        merged = (max(cols_1, cols_2),)
+    elif rank == 2:
+        merged = (max(rows_1, rows_2), max(cols_1, cols_2))
+    else:
+        raise PermissionError(f"Invalid shape, {shape_1}, {shape_2}")
+    return rows_1, cols_1, rows_2, cols_2, merged
+
+
+def pi_add(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Element-wise addition of two encrypted operands; a row or column
+    extent of 1 is broadcast against the other operand.
+    ---------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, left operand
+        right_store: PaillierEncryptedStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:      accepted but not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, raised by __shape_resolve when the operands
+                         cannot be aligned, even with broadcast
+    '''
+    # (rows, cols) of both operands and the numpy-style result shape
+    l_rows, l_cols, r_rows, r_cols, out_shape_tuple = __shape_resolve(
+        left_shape, right_shape)
+    out_size = max(l_rows, r_rows) * max(l_cols, r_cols)
+    # raw buffers of the two operands
+    lhs_pen, lhs_base, lhs_exp = (
+        left_store.pen_storage, left_store.base_storage, left_store.exp_storage)
+    rhs_pen, rhs_base, rhs_exp = (
+        right_store.pen_storage, right_store.base_storage, right_store.exp_storage)
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = FPGA_LIB.c_malloc(c_size_t(out_size * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+    # the device is derived from the left operand's memory type
+    device_id = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.pen_matrix_add_pen_matrix(
+        c_char_p(lhs_pen),
+        c_void_p(lhs_base),
+        c_void_p(lhs_exp),
+        c_char_p(rhs_pen),
+        c_void_p(rhs_base),
+        c_void_p(rhs_exp),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(l_rows),
+        c_size_t(l_cols),
+        c_size_t(r_rows),
+        c_size_t(r_cols),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_id),
+    )
+    # the result is INT64 only when both operands are INT64
+    both_int = (left_store.data_type == INT64_TYPE
+                and right_store.data_type == INT64_TYPE)
+    out_data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    # memory type and encoding parameters are taken from the left operand
+    return _pi_init_ss(
+        res_store,
+        out_pen,
+        out_base,
+        out_exp,
+        out_size,
+        res_shape,
+        out_shape_tuple,
+        left_store.mem_type,
+        out_data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_mul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Element-wise multiplication of an encrypted operand by a fixed-point
+    operand; a row or column extent of 1 is broadcast.
+    --------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, left operand
+        right_store: FixedPointStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:      accepted but not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, raised by __shape_resolve when the operands
+                         cannot be aligned, even with broadcast
+    '''
+    # (pen_rows, pen_cols) describe left_store, (fpn_rows, fpn_cols) the
+    # fixed-point right_store
+    pen_rows, pen_cols, fpn_rows, fpn_cols, out_shape_tuple = __shape_resolve(
+        left_shape, right_shape)
+    out_size = max(pen_rows, fpn_rows) * max(pen_cols, fpn_cols)
+    # raw buffers of the two operands
+    pen_data, pen_base, pen_exp = (
+        left_store.pen_storage, left_store.base_storage, left_store.exp_storage)
+    fpn_data, fpn_base, fpn_exp = (
+        right_store.bigint_storage, right_store.base_storage, right_store.exp_storage)
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = FPGA_LIB.c_malloc(c_size_t(out_size * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+    # the device is derived from the encrypted operand's memory type
+    device_id = __get_FPGA_device_num(left_store.mem_type)
+    # the library takes the fixed-point operand first, then the encrypted one
+    FPGA_LIB.fpn_matrix_elementwise_multiply_pen_matrix(
+        c_char_p(fpn_data),
+        c_void_p(fpn_base),
+        c_void_p(fpn_exp),
+        c_char_p(pen_data),
+        c_void_p(pen_base),
+        c_void_p(pen_exp),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(fpn_rows),
+        c_size_t(fpn_cols),
+        c_size_t(pen_rows),
+        c_size_t(pen_cols),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_id),
+    )
+    # the result is INT64 only when both operands are INT64
+    both_int = (left_store.data_type == INT64_TYPE
+                and right_store.data_type == INT64_TYPE)
+    out_data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    # memory type and encoding parameters are taken from the left operand
+    return _pi_init_ss(
+        res_store,
+        out_pen,
+        out_base,
+        out_exp,
+        out_size,
+        res_shape,
+        out_shape_tuple,
+        left_store.mem_type,
+        out_data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose the C-memory matrix held by a FixedPointStorage; at most
+    2-D input is supported. 0-D and 1-D inputs are copied unchanged, as
+    numpy's transpose leaves them as they are.
+    -----------------
+    Para:
+        left_store:  FixedPointStorage, the operand to transpose
+        left_shape:  TensorShapeStorage, shape of the operand
+        res_store:   FixedPointStorage or None, destination; new buffers
+                     are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _fp_init_ss
+                     for the result shape
+        stream:      accepted but not used
+    Return:
+        tuple: (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if the operand has more than two dimensions
+    '''
+    # no device number is looked up here, so mem_type is not inspected
+    left_shape_tuple = left_shape.to_tuple()
+    # reject unsupported ranks before anything is allocated, so the error
+    # path cannot leave freshly malloc'ed buffers behind
+    if len(left_shape_tuple) > 2:
+        raise PermissionError("Unsupported shape")
+    # source buffers
+    src_fpn = left_store.bigint_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    if len(left_shape_tuple) < 2:
+        # 0-D or 1-D: the output equals the input; the data is copied
+        # rather than shared to prevent a potential double free
+        res_shape_tuple = left_shape_tuple
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_fpn),
+            c_void_p(src_fpn),
+            c_size_t(vec_size * PLAIN_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        # 2-D: swap the extents and let the library move the data; the two
+        # size arguments are the source extents (rows, cols)
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        FPGA_LIB.transpose(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+            c_size_t(PLAIN_BITS),
+        )
+    # res_shape (not left_shape) is forwarded in both cases, so a
+    # caller-supplied result shape is honoured and the input shape object
+    # is never handed out as the result shape
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        left_store.encode_n,
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def pi_matmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Matrix product of an encrypted left operand and a fixed-point right
+    operand; both operands must be 1-D or 2-D.
+    ------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, left operand
+        right_store: FixedPointStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:      accepted but not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if an operand is neither 1-D nor 2-D
+        ValueError, if the inner dimensions of the operands differ
+    '''
+    left_tuple = left_shape.to_tuple()
+    right_tuple = right_shape.to_tuple()
+    left_rank, right_rank = len(left_tuple), len(right_tuple)
+    # only 1-D and 2-D operands are accepted
+    if left_rank not in (1, 2) or right_rank not in (1, 2):
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if right_rank == 1:
+        # __shape_decompose yields (1, n) for 1-D input; as a right operand
+        # the vector is used as an (n, 1) column
+        R, S = S, R
+    if Q != R:
+        raise ValueError("shape not aligned")
+    # numpy-style result shape for every accepted rank combination
+    shape_by_rank = {
+        (1, 1): (),
+        (1, 2): (S,),
+        (2, 1): (P,),
+        (2, 2): (P, S),
+    }
+    rank_pair = (left_rank, right_rank)
+    if rank_pair not in shape_by_rank:
+        raise RuntimeError(
+            "Default error, won't occur unless something VERY STRANGE happens"
+        )
+    res_shape_tuple = shape_by_rank[rank_pair]
+    res_size = P * S
+    # raw buffers of the two operands
+    pen_data, pen_base, pen_exp = (
+        left_store.pen_storage, left_store.base_storage, left_store.exp_storage)
+    fpn_data, fpn_base, fpn_exp = (
+        right_store.bigint_storage, right_store.base_storage, right_store.exp_storage)
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is not None:
+        out_pen = res_store.pen_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_pen = FPGA_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    # the device is derived from the encrypted operand's memory type
+    device_id = __get_FPGA_device_num(left_store.mem_type)
+    # (P x Q) encrypted matrix times (Q x S) fixed-point matrix
+    FPGA_LIB.pen_matrix_multiply_fpn_matrix(
+        c_char_p(pen_data),
+        c_void_p(pen_base),
+        c_void_p(pen_exp),
+        c_char_p(fpn_data),
+        c_void_p(fpn_base),
+        c_void_p(fpn_exp),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_id),
+    )
+    # the result is INT64 only when both operands are INT64
+    both_int = (left_store.data_type == INT64_TYPE
+                and right_store.data_type == INT64_TYPE)
+    out_data_type = INT64_TYPE if both_int else FLOAT_TYPE
+    # memory type and encoding parameters are taken from the left operand
+    return _pi_init_ss(
+        res_store,
+        out_pen,
+        out_base,
+        out_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        out_data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_rmatmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Matrix product with the operands the other way round from pi_matmul:
+    the left operand is fixed-point and the right operand is encrypted.
+    Both operands must be 1-D or 2-D. The operands are handed to the
+    library as they are; no transpose is performed here.
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        left_store:  FixedPointStorage, left operand
+        right_store: PaillierEncryptedStorage, right operand
+        left_shape:  TensorShapeStorage, shape of the left operand
+        right_shape: TensorShapeStorage, shape of the right operand
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:      accepted but not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if an operand is neither 1-D nor 2-D
+        ValueError, if the inner dimensions of the operands differ
+        RuntimeError, fallback of the result-shape evaluation
+    '''
+    left_tuple = left_shape.to_tuple()
+    right_tuple = right_shape.to_tuple()
+    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if len(right_tuple) == 1:
+        # __shape_decompose yields (1, n) for 1-D input; as a right operand
+        # the vector is used as an (n, 1) column
+        R, S = S, R
+    if Q != R:
+        raise ValueError("shape not aligned")
+    # numpy-style result shape for every accepted rank combination
+    if len(left_tuple) == 1 and len(right_tuple) == 1:
+        res_shape_tuple = ()
+    elif len(left_tuple) == 1 and len(right_tuple) == 2:
+        res_shape_tuple = (S,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 1:
+        res_shape_tuple = (P,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 2:
+        res_shape_tuple = (P, S)
+    else:
+        raise RuntimeError(
+            "You should never ever see this error unless something VERY STRANGE occurs"
+        )
+    res_size = P * S
+    # the left_store data (fixed-point)
+    l_fpn = left_store.bigint_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data (encrypted)
+    r_pen = right_store.pen_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # NOTE(review): the device comes from the fixed-point operand's
+    # mem_type while the result below is labelled with right_store.mem_type
+    # - confirm that both operands always share the same memory type
+    FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+    # (P x Q) fixed-point matrix times (Q x S) encrypted matrix.
+    # The device number is wrapped in c_size_t like in every other library
+    # call of this module; a narrower c_uint32 wrapper only fills four
+    # bytes of the argument slot.
+    FPGA_LIB.fpn_matrix_multiply_pen_matrix(
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(FPGA_dev_num),
+    )
+    # the result is INT64 only when both operands are INT64
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+
+    # memory type and encoding parameters are taken from the encrypted
+    # (right) operand
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        right_store.mem_type,
+        data_type,
+        right_store.encode_n,
+        right_store.encode_max_int,
+    )
+
+
+def pi_transpose(left_store, left_shape, res_store, res_shape, stream):
+    '''
+    Transpose the C-memory matrix held by a PaillierEncryptedStorage; at
+    most 2-D input is supported. 0-D and 1-D inputs are copied unchanged.
+    -----------------
+    Para:
+        left_store:  PaillierEncryptedStorage, the operand to transpose
+        left_shape:  TensorShapeStorage, shape of the operand
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+                     for the result shape
+        stream:      accepted but not used
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if the operand has more than two dimensions
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # reject unsupported ranks before anything is allocated, so the error
+    # path cannot leave freshly malloc'ed buffers behind
+    if len(left_shape_tuple) > 2:
+        raise PermissionError("Invalid Shape")
+    # source buffers
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    if len(left_shape_tuple) < 2:
+        # 0-D or 1-D: nothing to transpose, just copy the raw buffers
+        res_shape_tuple = left_shape_tuple
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        # 2-D: swap the extents and let the library move the data; the two
+        # size arguments are the source extents (rows, cols)
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        FPGA_LIB.transpose(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+            c_size_t(CIPHER_BITS),
+        )
+    # res_shape (not left_shape) is forwarded in both cases, so a
+    # caller-supplied result shape is honoured and the input shape object
+    # is never handed out as the result shape
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum(
+        pub_key,
+        left_store,
+        left_shape,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Sum an encrypted tensor along the given axis.
+
+    The library routine pen_sum only sums each row of a (rows, cols)
+    matrix, i.e. over contiguous memory. Summing over axis 0 therefore
+    transposes the operand first, and a full sum treats the data as a
+    single row. A 0-D operand is copied unchanged.
+    ----------------------
+    Para:
+        pub_key:     Dev_PubKeyStorage, supplies n, g, nsquare and max_int
+        left_store:  PaillierEncryptedStorage, the operand
+        left_shape:  TensorShapeStorage, shape of the operand
+        axis:        int or None, the dimension the sum runs over
+                        None: sum over all elements
+                        0:    sum vertically, over the 1st dimension
+                        1:    sum horizontally, over the 2nd dimension
+        res_store:   PaillierEncryptedStorage or None, destination; new
+                     buffers are malloc'ed when None
+        res_shape:   TensorShapeStorage or None, forwarded to _pi_init_ss
+        stream:      only forwarded to pi_transpose
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, when axis does not fit the shape of the operand
+    '''
+    # source buffers
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    left_shape_tuple = left_shape.to_tuple()
+    rank = len(left_shape_tuple)
+
+    # Work out how the data is handed to pen_sum: result_size rows of
+    # row_len elements each, and the numpy-style shape of the result.
+    plain_copy = rank == 0
+    row_len = 0
+    if plain_copy:
+        # shape (): a single element, the "sum" is the element itself
+        if axis is not None and axis != 0:
+            raise PermissionError(
+                "Cannot set axis other than 0 or None for dimension 0"
+            )
+        result_size = vec_size
+        res_shape_tuple = left_shape_tuple
+    elif axis is None or rank == 1:
+        # shape (n,) or a sum over everything: one row holding all elements
+        if rank == 1 and axis is not None and axis >= 1:
+            raise PermissionError(
+                "axis is out of bounds for array of dimension 1")
+        result_size = 1
+        row_len = vec_size
+        res_shape_tuple = ()
+    elif axis == 0:
+        # vertical sum: transpose so that every column becomes a row.
+        # transpose_store stays referenced until this function returns
+        # (presumably the storage owns its buffers - TODO confirm).
+        transpose_store, transpose_shape = pi_transpose(
+            left_store, left_shape, None, None, stream
+        )
+        src_pen = transpose_store.pen_storage
+        src_base = transpose_store.base_storage
+        src_exp = transpose_store.exp_storage
+        transpose_tuple = transpose_shape.to_tuple()
+        result_size = transpose_tuple[0]
+        row_len = transpose_tuple[1]
+        res_shape_tuple = (result_size,)
+    elif axis == 1:
+        # horizontal sum: the rows are already contiguous
+        result_size = left_shape_tuple[0]
+        row_len = left_shape_tuple[1]
+        res_shape_tuple = (result_size,)
+    else:
+        raise PermissionError("Invalid Axis or Shape")
+
+    # destination buffers: reuse the caller's storage or allocate new ones
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(result_size * CIPHER_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(result_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(result_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    if plain_copy:
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        FPGA_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+        FPGA_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(result_size),
+            c_size_t(row_len),
+            pub_key.n,
+            pub_key.g,
+            pub_key.nsquare,
+            pub_key.max_int,
+            c_size_t(CIPHER_BITS),
+            c_size_t(FPGA_dev_num),
+        )
+
+    # Every case, including the 0-D copy, reports through res_store and
+    # res_shape. The copied buffers must never be attached to left_store,
+    # because the input storage would then be bound to the result buffers.
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        result_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_encode(
+        store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
+):
+    '''
+    Perform encode to a TensorStorage
+    -----------------
+    Paras:
+        store:        TensorStorage, raw data to be encoded
+        n:            big int, the same n in pubkey used for encryption
+        max_int:      big int, same max_int in pubkey.
+        precision:    int, the precision of encoding, default None
+        max_exponent: None or int, currently not used
+        res:          FixedPointStorage, the return value
+    Return:
+        FixedPointStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type or encoding style
+    '''
+    if max_exponent is not None:
+        raise PermissionError("max_exponent not supported")
+    if precision is None:
+        precision = -1
+    data_storage = store.data
+    vec_size = store.vec_size
+    # malloc the return memory space
+    if res is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        res_fpn = res.bigint_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    # Due to the different nature of encoding float/int
+    # Handle the two different data type seperately
+    FPGA_dev_num = __get_FPGA_device_num(store.mem_type)
+    if store.data_type == FLOAT_TYPE:
+        FPGA_LIB.encode_double(
+            c_void_p(data_storage),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_size_t(FPGA_dev_num),
+        )
+    elif store.data_type == INT64_TYPE:
+        FPGA_LIB.encode_int(
+            c_void_p(data_storage),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_size_t(FPGA_dev_num),
+        )
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    '''get the three elements, store it in a FPNStorage'''
+
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def __fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage in CPU, using fp_c2p to implement
+    Currently not used, as a GPU version has been done
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, the decoded result
+    Return:
+        TensorStorage, same as res
+    '''
+    vec_size = store.vec_size
+    fpn_array = __get_c_fpn_storage(
+        store.bigint_storage,
+        store.base_storage,
+        store.exp_storage,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+    )
+
+    CPU_decode = []
+    if store.data_type == INT64_TYPE:
+        for i in range(vec_size):
+            CPU_decode.append(int(fpn_array[i].decode()))
+    elif store.data_type == FLOAT_TYPE:
+        for i in range(vec_size):
+            CPU_decode.append(fpn_array[i].decode())
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # reform the value to TensorStorage
+    decode_data = te_p2c(CPU_decode, None)
+    res_data = decode_data.data
+    decode_data.data = None
+    return _te_init_store(
+        res,
+        res_data,
+        vec_size,
+        store.mem_type,
+        store.data_type)
+
+
+def fp_decode(store, res, stream):
+    '''
+    Decode a FixedPointStorage in GPU
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, the decoded result
+    Return:
+        TensorStorage, same as res
+    '''
+    if store.data_type == FLOAT_TYPE:
+        res_store = (
+            FPGA_LIB.c_malloc(c_size_t(store.vec_size * DOUBLE_BYTE))
+            if res is None
+            else res.data
+        )
+        FPGA_LIB.decode_double(
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
+    elif store.data_type == INT64_TYPE:
+        res_store = (
+            FPGA_LIB.c_malloc(c_size_t(store.vec_size * INT64_BYTE))
+            if res is None
+            else res.data
+        )
+        FPGA_LIB.decode_int(
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
+    else:
+        raise PermissionError("Invalid Data Type")
+    return _te_init_store(
+        res, res_store, store.vec_size, store.mem_type, store.data_type
+    )
+
+
+def fp_d2h(target, src, stream):
+    '''
+    Pass-through hook: returns src unchanged.
+    target and stream are accepted but not used.
+    '''
+    return src
+
+
+def bi_free(src):
+    FPGA_LIB.c_free(c_void_p(src.bigint_storage))
+    src.bigint_storage = None
+
+
+def fp_free(src):
+    FPGA_LIB.c_free(c_void_p(src.bigint_storage))
+    FPGA_LIB.c_free(c_void_p(src.base_storage))
+    FPGA_LIB.c_free(c_void_p(src.exp_storage))
+    src.bigint_storage, src.base_storage, src.exp_storage = None, None, None
+
+
+'''
+    function: change the FixedPointStorage's data back into a C type
+    As there is no shape involved in the function,
+    we cannot know the return shape of the function
+    input:
+            src: FixedPointStorage, containing the data that need to be changed
+    output:
+            return value: containing 3 ndarray:
+                            fpn_array,base_array,exp_array
+'''
+
+
+def fp_c2p(src):
+    return __get_c_fpn_storage(
+        src.bigint_storage,
+        src.base_storage,
+        src.exp_storage,
+        src.vec_size,
+        src.encode_n,
+        src.max_int,
+    )
+
+
+def pi_c2p_mp(src):
+    '''
+    convert PaillierEncryptedStorage from C mem type to Python one
+    this one use multiprocess to accelerate
+    --------------
+    Para:    src, PaillierEncryptedStorage
+    Return:  tuple, each element is a ndarray,
+                    identical to sequence of encoding, base, exponent
+    '''
+    return __get_c_pen_storage_mp(
+        src.pen_storage,
+        src.base_storage,
+        src.exp_storage,
+        src.vec_size,
+        src.encode_n)
+
+
+def pi_c2p(src):
+    '''convert PaillierEncryptedStorage from C mem type to Python one'''
+    return __get_c_pen_storage_raw(
+        src.pen_storage,
+        src.base_storage,
+        src.exp_storage,
+        src.vec_size,
+        src.encode_n)
+
+
+def fp_mul(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store,
+        res_shape,
+        stream):
+    '''
+    Perform element-wise multiplication between two FixedPointStorage.
+    This is a plaintext computation rather than an encrypted one.
+    ------------------
+    Paras:
+        left_store, right_store: FixedPointStorage
+        left_shape, right_shape: TensorShapeStorage
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    '''
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    # P,Q is the dim of the left_store(pen)
+    # R,S is the dim of the right_store(fpn)
+    res_size = max(P, R) * max(Q, S)
+    # the left_store data
+    l_fpn = left_store.bigint_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data
+    r_fpn = right_store.bigint_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # malloc space for the return value
+    if res_store is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(res_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    FPGA_dev_num = __get_FPGA_device_num(left_store.mem_type)
+    FPGA_LIB.fpn_mul(
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(R),
+        c_size_t(S),
+        c_char_p(left_store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(PLAIN_BITS),
+        c_size_t(FPGA_dev_num),
+    )
+    # handle data_type
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        res_size,
+        left_store.encode_n,
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+    )
+
+
+def fp_p2c(target, src, data_type=FLOAT_TYPE):
+    '''change a FixedPointNumber ndarray into a FixedPointStorage Class'''
+    if isinstance(src, list):
+        vec_size = len(src)
+    elif isinstance(src, np.ndarray):
+        vec_size = src.size
+        src = src.flat
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # malloc the space for the type
+    if target is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = target.bigint_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # the temp ndarray buffer
+    base_temp, exp_temp = [], []
+    # get the two encoding parameters
+    n = src[0].n
+    max_int = src[0].max_int
+    for i in range(vec_size):
+        src_number = src[i].encoding.to_bytes(PLAIN_BYTE, 'little')
+        FPGA_LIB.bigint_set(
+            c_char_p(res_fpn),
+            c_char_p(src_number),
+            c_size_t(PLAIN_BITS),
+            c_size_t(i))
+        base_temp.append(src[i].BASE)
+        exp_temp.append(src[i].exponent)
+
+    base_arr_ptr = np.asarray(base_temp).ctypes.data_as(c_void_p)
+    exp_arr_ptr = np.asarray(exp_temp).ctypes.data_as(c_void_p)
+    FPGA_LIB.unsigned_set(c_void_p(res_base), base_arr_ptr, c_size_t(vec_size))
+    FPGA_LIB.unsigned_set(c_void_p(res_exp), exp_arr_ptr, c_size_t(vec_size))
+
+    return _fp_init_store(
+        target,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        MEM_FPGA_NUM_0,
+        data_type,
+    )
+
+
+def fp_h2d(target, src):
+    '''
+    Pass-through hook: returns src unchanged.
+    target is accepted but not used.
+    '''
+    return src
+
+
+def _index_reset(index, dim_size):
+    if index < 0:
+        res_index = index + dim_size
+        res_index = max(0, res_index)
+    elif index > dim_size:
+        res_index = dim_size
+    else:
+        res_index = index
+    return res_index
+
+
+def fp_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    slice a contiguous memory space, now support two directions.
+    -----------------------------
+    Para:
+    store: FixedPointStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    end:   int, the end index of the slice(not included),
+           if larger than the last index, concatencate it into the dim size
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, FixedPointStorage, TensorShapeStorage
+    Raise:
+        PermissionError: if the input start/stop/axis is not valid
+    '''
+    src_fpn = store.bigint_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    fpn_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    '''handle shape and index'''
+    if len(fpn_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(fpn_shape_tuple) == 1:
+        dim0, dim1 = 1, fpn_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(fpn_shape_tuple) == 2:
+        dim0, dim1 = fpn_shape_tuple[0], fpn_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+    # handle condition that a[k: l] k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+        )
+    # handle condition that a[:,k:l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(fpn_shape_tuple) == 2 else (0,)
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.encode_max_int,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+        )
+        # handle the normal slice
+    res_shape_tuple, vec_size = (), 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        if res_store is None:
+            res_fpn = FPGA_LIB.c_malloc(
+                c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # call the raw function
+        FPGA_LIB.slice_vertical(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(0),
+        )
+        if len(fpn_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally'
+        if res_store is None:
+            res_fpn = FPGA_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        FPGA_LIB.slice_horizontal(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(0),
+        )
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError("Only support 2 dimensional slice")
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_slice(store, shape, start, stop, axis, res_store, res_shape, stream):
+    '''
+    slice a contiguous memory space, now support two directions.
+    -----------------------------
+    Para:
+    store: PaillierEncryptedStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    end:   int, the end index of the slice(not included),
+           if it is larger than the last index, then it concatencate into the dim size
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, PaillierEncryptedStorage, TensorShapeStorage
+    '''
+    src_pen = store.pen_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    # get the two dims and check for illegal status
+    pen_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    if len(pen_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(pen_shape_tuple) == 1:
+        dim0, dim1 = 1, pen_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(pen_shape_tuple) == 2:
+        dim0, dim1 = pen_shape_tuple[0], pen_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+
+    # handle condition that a[k, l], k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        return _pi_init_ss(
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
+    # handle condition that a[:, k, l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(pen_shape_tuple) == 2 else (0,)
+        return _pi_init_ss(
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
+    # handle the normal slice
+    res_shape_tuple = ()
+    vec_size = 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        # malloc space for result
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(
+                c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_pen = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # call the raw function
+        FPGA_LIB.slice_vertical(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(0),
+        )
+        if len(pen_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally'
+        if res_store is None:
+            res_pen = FPGA_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = FPGA_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_pen = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        FPGA_LIB.slice_horizontal(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(0),
+        )
+        # since 1-dim shape will not occur here, result shape is always 2-D
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError()
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def bi_p2c(data, res, bytelen=CIPHER_BYTE):
+    '''
+    copy data to the C memory pointed to by res
+    -------------------
+    Para:
+        data: List[object], each object is a bigint CIPHER_BIT long
+        res:  int, actually a pointer pointing to C memory
+    Return:
+        None, but the contents in c_void_p(res) has been changed
+    '''
+    for i in range(len(data)):
+        src_number = data[i].to_bytes(bytelen, 'little')
+        FPGA_LIB.bigint_set(
+            c_char_p(res), c_char_p(src_number), c_size_t(bytelen), c_size_t(i)
+        )
+
+
+def bi_gen_rand(elem_size, count, res, rand_seed, stream):
+    '''
+    generate random bigint for pi_obfuscation
+    ------------------
+    Para:
+        elem_size: int, length of random bigint, upper bound is CIPHER_BYTE
+        count:     int, number of random bigint to be generated
+        res:       BigintStorage, the return value
+        rand_seed: seed used for generating random data
+    Return:
+        BigintStorage, same as res
+    '''
+    # Didn't use vectorize since that we need to_bytes()
+    # But ndarray_float65 has no to_bytes method
+    random.seed(rand_seed)
+    rands = np.array([random.randrange(1, 8 ** elem_size) for i in range(count)])
+    if res is None:
+        data_storage = FPGA_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        data_storage = res.bigint_storage
+    # CIPHER_BYTE is the upper bound of the length of the rand number
+    '''
+    We assume that the store of random bigint is on FPGA device_0
+    TODO: Add configuration for choosing divice
+    '''
+    bi_p2c(rands, data_storage)
+    return _bi_init_store(
+        res,
+        data_storage,
+        count,
+        mem_type=MEM_FPGA_NUM_0,
+        elem_size=CIPHER_BYTE)
+
+
+def __get_shape_size(shape_tuple):
+    shape_size = 1
+    if len(shape_tuple) == 1:
+        shape_size = shape_tuple[0]
+    elif len(shape_tuple) == 2:
+        shape_size = shape_tuple[0] * shape_tuple[1]
+    else:
+        raise PermissionError("Invalid Shape Tuple")
+
+    return shape_size
+
+
+def pi_reshape(store, shape, new_shape, res_store, res_shape, stream):
+    '''
+    Change a PaillierEcnryptedStorage's shape.
+    No need for change the continuous storage, only change the shape.
+    -------------------
+    Paras:
+        store, shape:  PaillierEncryptedStorage, TensorShapeStorage
+        new_shape:     TensorShapeStorage, the new shape for the pi_storage
+    Returns:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        ValueError:    If shape and new_shape's size is unequal
+    '''
+    res_shape_tuple = new_shape.to_tuple()
+    old_shape_tuple = shape.to_tuple()
+    res_shape_size = __get_shape_size(res_shape_tuple)
+    old_shape_size = __get_shape_size(old_shape_tuple)
+    res_vec_size = store.vec_size
+    if res_shape_size != old_shape_size:
+        raise ValueError("total size of new array must be unchanged!")
+    # Still, we do a malloc and memcpy in order to avoid double free in python
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    FPGA_LIB.c_memcpy(
+        c_void_p(res_pen),
+        c_void_p(store.pen_storage),
+        c_size_t(CIPHER_BYTE * res_vec_size),
+    )
+    FPGA_LIB.c_memcpy(
+        c_void_p(res_base),
+        c_void_p(store.base_storage),
+        c_size_t(U_INT32_BYTE * res_vec_size),
+    )
+    FPGA_LIB.c_memcpy(
+        c_void_p(res_exp),
+        c_void_p(store.exp_storage),
+        c_size_t(U_INT32_BYTE * res_vec_size),
+    )
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        store.vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def fp_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    concat several FixedPointStorage according to axis
+    --------------------
+    Para:
+        stores: List or ndarray, elements are FixedPointStorage
+        shapes: List or ndarray, elements are TensorShapeStorage
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: FixedPointStorage, the stacked result
+        res_shape: TensorShapeStorage, the result's shape
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: Invalid input data or invalid shape
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    res_vec_size = np.sum([v.vec_size for v in stores])
+    # Abnormal checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    for v in stores:
+        if v.data_type != stores[0].data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != stores[0].encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.max_int != stores[0].max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != stores[0].mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # num_rows, num_cols is the data demanded by C functions
+    # res_rows, res_cols are return values that should be same as numpy's output
+    # distinguish them such that upper and lower level won't bother each other
+    if axis == 0:
+        first_shape_decomposed = __shape_decompose(shapes[0])
+        num_rows, num_cols = 0, first_shape_decomposed[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows = num_rows
+        res_cols = num_cols
+    elif axis == 1:
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                if len(v.to_tuple()) == 0:
+                    num_cols += 1
+                if len(v.to_tuple()) == 1:
+                    num_cols += v.to_tuple()[0]
+                if len(v.to_tuple()) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+            res_rows = num_cols
+            res_cols = None
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows = num_rows
+            res_cols = num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    fpn_pointers = [c_void_p(v.bigint_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+
+    if res_store is None:
+        res_fpn = FPGA_LIB.c_malloc(c_size_t(PLAIN_BYTE * res_vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    fpn_arr = (c_void_p * num_stores)(*fpn_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_uint32 * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        '''means that we should cat stores vertically'''
+        FPGA_LIB.vstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(res_cols),
+            c_size_t(PLAIN_BITS),
+        )
+    elif axis == 1:
+        '''means that we should cat stores horizontally'''
+        FPGA_LIB.hstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(res_rows),
+            c_size_t(PLAIN_BITS),
+        )
+        # raise NotImplementedError()
+    else:
+        raise NotImplementedError()
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        int(round(res_vec_size)),
+        stores[0].encode_n,
+        stores[0].max_int,
+        res_shape,
+        res_shape.to_tuple(),
+        stores[0].mem_type,
+        stores[0].data_type,
+    )
+
+
+def pi_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    Concatenate several PaillierEncryptedStorage objects along one axis.
+    --------------------
+    Para:
+        stores: List or ndarray, elements are PaillierEncryptedStorage
+        shapes: List or ndarray, elements are TensorShapeStorage
+        axis:   int, how stores will be stacked
+                    0 means a vertical stack, stack along 1st dim
+                    1 means a horizontal stack, stack along 2nd dim
+        res_store: PaillierEncryptedStorage or None; when None, fresh C
+                   buffers are allocated for the result, otherwise the
+                   buffers held by res_store are written to
+        res_shape: not read; the result shape is always rebuilt from
+                   `shapes` (parameter kept for interface compatibility)
+    Return:
+        the value of _pi_init_ss for the stacked buffers, documented by the
+        original author as (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: Invalid input data or invalid shape
+        NotImplementedError: Current only support at most 2-D matrix
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+
+    # Anomaly checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    first_store = stores[0]
+    for v in stores:
+        if v.data_type != first_store.data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != first_store.encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.encode_max_int != first_store.encode_max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != first_store.mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # The builtin sum keeps the element count a plain Python int, which is
+    # what the ctypes size arguments below expect.
+    res_vec_size = sum(v.vec_size for v in stores)
+
+    # num_rows, num_cols is the data demanded by C functions
+    # res_rows, res_cols are return values that should be same as numpy's output
+    # distinguish them so upper and lower level won't bother each other
+    if axis == 0:
+        num_rows, num_cols = 0, __shape_decompose(shapes[0])[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows, res_cols = num_rows, num_cols
+    elif axis == 1:
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            # scalars and 1-D vectors are joined into one longer 1-D vector
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+                # an empty shape tuple contributes a single element
+                num_cols += v_shape[0] if v_shape else 1
+            res_rows, res_cols = num_cols, None
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows, res_cols = num_rows, num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    pen_pointers = [c_void_p(v.pen_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+    if res_store is None:
+        res_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    pen_arr = (c_void_p * num_stores)(*pen_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_uint32 * num_stores)(*[v.vec_size for v in stores])
+
+    # axis was validated above, so only 0 (vstack) and 1 (hstack) remain
+    if axis == 0:
+        FPGA_LIB.vstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_cols),
+            c_size_t(CIPHER_BITS),
+        )
+    else:
+        FPGA_LIB.hstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_uint64(num_stores),
+            vec_sizes,
+            c_uint64(num_rows),
+            c_size_t(CIPHER_BITS),
+        )
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        int(res_vec_size),
+        res_shape,
+        res_shape.to_tuple(),
+        first_store.mem_type,
+        first_store.data_type,
+        first_store.encode_n,
+        first_store.encode_max_int,
+    )
+
+
+def random_p2c(rands, bitlen, size):
+    '''
+    Copy Python random numbers into a newly allocated C buffer.
+    ------------------
+    Paras:
+        rands:  the values handed to bi_p2c
+        bitlen: int, requested bit length; rounded by bit_change(bitlen, 1)
+        size:   int, number of elements the buffer is sized for
+    Return:
+        the value of _bi_init_store for the new buffer
+    '''
+    elem_bytes = bit_change(bitlen, 1) // 8
+    buffer_ptr = FPGA_LIB.c_malloc(c_size_t(size * elem_bytes))
+    bi_p2c(rands, buffer_ptr)
+    return _bi_init_store(
+        None,
+        buffer_ptr,
+        size,
+        mem_type=MEM_FPGA_NUM_0,
+        elem_size=elem_bytes,
+    )
+
+
+def random_c2p(random_store: BigIntStorage, size):
+    '''
+    Read `size` elements of a BigIntStorage back as Python ints.
+
+    Each element is random_store.elem_size bytes wide and is decoded as a
+    little-endian unsigned integer.
+    ------------------
+    Return:
+        list of int
+    '''
+    elem_bytes = random_store.elem_size
+    scratch = c_buffer(elem_bytes)
+    values = []
+    for idx in range(size):
+        offset = idx * elem_bytes
+        # pull one element into the scratch buffer, then decode it
+        FPGA_LIB.c_memcpy(
+            cast(scratch, c_void_p),
+            c_void_p(random_store.bigint_storage + offset),
+            c_size_t(elem_bytes),
+        )
+        values.append(int.from_bytes(scratch.raw, 'little'))
+    return values
+
+
+class Hash_key_storage:
+    '''
+    Owner of a C buffer that is released when the object is finalized.
+
+    parameters:
+    hash_storage: int, address of C memory storing the big integer
+    '''
+
+    def __init__(self, hash_storage):
+        # keep the raw address; it is passed to hash_free in __del__
+        self.hash_storage = hash_storage
+
+    def __del__(self):
+        # release the buffer, then drop the now-stale address
+        hash_free(self.hash_storage)
+        self.hash_storage = None
+
+
+def hash_free(hash_key):
+    '''Release the C buffer at address hash_key through FPGA_LIB.c_free.'''
+    buffer_ptr = c_void_p(hash_key)
+    FPGA_LIB.c_free(buffer_ptr)
+
+
+def _hash_elem_to_bytes(value, bytelen):
+    '''Return value as bytelen little-endian bytes; AttributeError if not an integer.'''
+    from operator import index
+
+    try:
+        # operator.index accepts Python ints and NumPy integer scalars, and
+        # rejects floats/strings instead of silently truncating them
+        number = index(value)
+    except TypeError as err:
+        raise AttributeError("Only support int type!!!") from err
+    return number.to_bytes(bytelen, "little")
+
+
+def hash_p2c(data, bitlen):
+    '''
+    Convert the data into Hash_key_storage.
+    Since all data is identically bitlen, no value/index is needed.
+    ------------------
+    Paras:
+        data:   list or ndarray of integers
+        bitlen: int, bit width of every element (bitlen // 8 bytes are
+                written per element, little-endian)
+    Return:
+        Hash_key_storage owning the newly allocated buffer
+    Raise:
+        TypeError:      data is neither a list nor an ndarray
+        AttributeError: an element is not an integer
+        RuntimeError:   any other failure while converting or copying
+    '''
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    bytelen = bitlen // 8
+    elem_cnt = data.size
+    data_ptr = FPGA_LIB.c_malloc(c_size_t(bytelen * elem_cnt))
+    try:
+        for i in range(elem_cnt):
+            FPGA_LIB.c_memcpy(
+                c_void_p(data_ptr + i * bytelen),
+                c_char_p(_hash_elem_to_bytes(data[i], bytelen)),
+                c_size_t(bytelen),
+            )
+    except AttributeError:
+        # no Hash_key_storage will own the buffer, so release it here
+        FPGA_LIB.c_free(c_void_p(data_ptr))
+        raise
+    except Exception as err:
+        # Exception (not BaseException) lets KeyboardInterrupt/SystemExit
+        # through; the original error is chained for diagnosis
+        FPGA_LIB.c_free(c_void_p(data_ptr))
+        raise RuntimeError("Running c memory copy failed!!") from err
+
+    return Hash_key_storage(data_ptr)
+
+
+def hash_c2p(store: Hash_key_storage, size, bitlength):
+    '''
+    Read `size` elements of a Hash_key_storage back as a list of Python ints.
+
+    Each element is bitlength // 8 bytes wide and is decoded little-endian.
+    Kept separate from the sha_c2p path because of the big-endian versus
+    little-endian difference, especially when there is zero.
+    '''
+    elem_bytes = bitlength // 8
+    scratch = c_buffer(elem_bytes)
+    values = []
+    for idx in range(size):
+        offset = idx * elem_bytes
+        # pull one element into the scratch buffer, then decode it
+        FPGA_LIB.c_memcpy(
+            cast(scratch, c_void_p),
+            c_void_p(store.hash_storage + offset),
+            c_size_t(elem_bytes),
+        )
+        values.append(int.from_bytes(scratch.raw, 'little'))
+    return values
+
+
+def rsa_c2bytes(storage: Hash_key_storage, size, bitlength):
+    '''
+    Copy the whole buffer of a Hash_key_storage into a Python bytes object.
+
+    The number of bytes copied is bitlength // 8 * size.
+    '''
+    total_bytes = bitlength // 8 * size
+    host_buffer = c_buffer(total_bytes)
+    dst_ptr = cast(host_buffer, c_void_p)
+    src_ptr = c_void_p(storage.hash_storage)
+    FPGA_LIB.c_memcpy(dst_ptr, src_ptr, c_size_t(total_bytes))
+    return host_buffer.raw
+
+
+def rsa_bytes2c(bytes, size, bitlength):
+    '''
+    Copy a Python bytes object into a new C buffer wrapped as Hash_key_storage.
+    ------------------
+    Paras:
+        bytes:     the raw data; bitlength // 8 * size bytes are copied
+        size:      int, number of elements
+        bitlength: int, bit width of every element
+    Return:
+        Hash_key_storage owning the newly allocated buffer
+    Raise:
+        ValueError: `bytes` is shorter than the number of bytes to copy
+    '''
+    store_size = bitlength // 8 * size
+    # Validate before allocating: a short input would otherwise make
+    # c_memcpy read past the end of the Python object.
+    try:
+        available = len(bytes)
+    except TypeError:
+        available = None  # unsized input cannot be checked; behave as before
+    if available is not None and available < store_size:
+        raise ValueError(
+            f"Expected at least {store_size} bytes, got {available}")
+    hash_key = FPGA_LIB.c_malloc(c_size_t(store_size))
+    FPGA_LIB.c_memcpy(
+        c_void_p(hash_key),
+        c_char_p(bytes),
+        c_size_t(store_size))
+    return Hash_key_storage(hash_key)
+
+
+def hash_bit_inquiry(hash_method):
+    '''
+    Look up the bit length associated with a hash method name.
+
+    Returns 512 for "sha384" and "sha512", and 256 for "md5", "sha1",
+    "sha224", "sha256", "sm3" and "none". Any other name raises KeyError.
+    '''
+    bit_table = dict.fromkeys(
+        ("md5", "sha1", "sha224", "sha256", "sm3", "none"), 256)
+    bit_table.update(dict.fromkeys(("sha384", "sha512"), 512))
+    return bit_table[hash_method]
+
+
+def gmp_gen_rand(bit_len, vec_size, n):
+    '''
+    Fill a new C buffer with vec_size random numbers via FPGA_LIB.gmp_random.
+    ------------------
+    Paras:
+        bit_len:  int, requested bit length of each random number
+        vec_size: int, how many numbers to generate
+        n:        int, passed to the library as little-endian bytes
+    Return:
+        the value of _bi_init_store for the new buffer
+    Raise:
+        PermissionError: the rounded bit_len exceeds the rounded size of n
+    '''
+    RSA_bitlength = bit_change(n, 0)
+    padded_bits = bit_change(bit_len, 1)
+    if padded_bits > RSA_bitlength:
+        raise PermissionError(
+            f"bitlength should be smaller than the given size {RSA_bitlength}"
+        )
+    elem_bytes = padded_bits // 8
+
+    rand_ptr = FPGA_LIB.c_malloc(c_size_t(vec_size * elem_bytes))
+    n_bytes = n.to_bytes(RSA_bitlength // 8, 'little')
+    FPGA_LIB.gmp_random(
+        c_char_p(rand_ptr),
+        c_size_t(bit_len),
+        c_size_t(padded_bits),
+        c_size_t(RSA_bitlength),
+        c_size_t(vec_size),
+        c_char_p(n_bytes),
+    )
+
+    return _bi_init_store(
+        None,
+        rand_ptr,
+        vec_size,
+        mem_type=MEM_FPGA_NUM_0,
+        elem_size=elem_bytes,
+    )
+
+
+def compute_hash(key_number, hash_method, hash_bitlength, size, salt):
+    '''
+    Hash `size` keys with FPGA_LIB.computeSHA256_index.
+    ------------------
+    Paras:
+        key_number:     Hash_key_storage, or data accepted by keyid_p2c
+        hash_method:    not read by this function
+        hash_bitlength: int, bit width of each result element
+        size:           int, number of keys
+        salt:           passed to keyid_p2c in the non-Hash_key_storage case
+    Return:
+        Hash_key_storage wrapping the result buffer
+    '''
+    hash_bytelength = hash_bitlength // 8
+    hash_storage = FPGA_LIB.c_malloc(c_size_t(hash_bytelength * size))
+    if isinstance(key_number, Hash_key_storage):
+        # NOTE(review): this branch never assigns key_storage, so the
+        # computeSHA256_index call below fails with UnboundLocalError.
+        # hex_to_int is also handed hash_storage as its fourth argument;
+        # confirm the intended buffer with the C library before changing it.
+        key_length_storage = FPGA_LIB.c_malloc(c_size_t(INT64_BYTE * size))
+        FPGA_LIB.hex_to_int(
+            c_void_p(key_number.hash_storage),
+            c_uint32(hash_bytelength),
+            c_size_t(size),
+            c_void_p(hash_storage),
+            c_void_p(key_length_storage),
+        )
+    else:
+        key_storage, key_length_storage = keyid_p2c(key_number, salt)
+
+    # NOTE(review): key_storage and key_length_storage are not released in
+    # this function - confirm who owns them after the call.
+    FPGA_LIB.computeSHA256_index(
+        c_void_p(key_storage),
+        c_void_p(key_length_storage),
+        c_size_t(size),
+        c_void_p(hash_storage),
+    )
+    return Hash_key_storage(hash_storage)
+
+
+def keyid_p2c(data, salt):
+    '''
+    Pack a batch of ids (each followed by the salt) into C memory.
+
+    Parameters:
+    ------------------
+    data, list or ndarray, contains a batch of ids
+        each id is converted with str(), so it should be a string or
+        something whose str() form is the intended id
+    salt, str, appended to every id before it is encoded as UTF-8
+
+    Return:
+    ------------------
+    tuple (hash_key, length_storage_ptr):
+        hash_key, address of a C buffer holding all encoded ids back to back
+        length_storage_ptr, address of a C buffer holding the byte length
+            of every encoded id as int64
+
+    Raise:
+    ------------------
+    TypeError: data is neither a list nor an ndarray
+    '''
+    # preprocess
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+    # Encode id + salt once, so the recorded lengths and the copied bytes
+    # always describe the same data (byte counts, not character counts).
+    encoded = [(str(x) + salt).encode("utf-8") for x in data]
+    payload = b"".join(encoded)
+    total_bytes = len(payload)
+    # malloc the space and feed all the strings into it with one copy
+    hash_key = FPGA_LIB.c_malloc(c_size_t(total_bytes))
+    FPGA_LIB.c_memcpy(
+        c_void_p(hash_key),
+        c_char_p(payload),
+        c_size_t(total_bytes),
+    )
+    # the per-id lengths are handed over as an int64 array
+    vec_len = np.asarray([len(item) for item in encoded], dtype=np.int64)
+    vec_size = vec_len.size
+    length_storage_ptr = FPGA_LIB.c_malloc(c_size_t(vec_size * INT64_BYTE))
+    FPGA_LIB.c_memcpy(
+        c_void_p(length_storage_ptr),
+        vec_len.ctypes.data_as(c_void_p),
+        c_size_t(vec_size * INT64_BYTE),
+    )
+
+    return hash_key, length_storage_ptr
+
+
+def bit_change(raw_number, type):
+    '''
+    Round a bit length up to the next supported width.
+    ------------------
+    Paras:
+        raw_number: when type == 0, the number whose bit length is wanted;
+                    otherwise the bit length itself
+        type:       0 to measure raw_number, anything else to use it as is
+    Return:
+        int, one of 256, 512, 1024, 2048, 4096
+    Raise:
+        PermissionError: the bit length exceeds 4096
+        ValueError:      type == 0 and raw_number is not positive
+    '''
+    if type == 0:
+        if isinstance(raw_number, int):
+            # math.log is inexact for big integers and reports k for 2**k,
+            # which needs k + 1 bits; bit_length() is exact.
+            if raw_number <= 0:
+                raise ValueError("math domain error")
+            bitlength = raw_number.bit_length()
+        else:
+            # non-int inputs keep the logarithm-based estimate
+            bitlength = math.log(raw_number, 2)
+    else:
+        bitlength = raw_number
+    if bitlength > 4096:
+        raise PermissionError("Invalid Data range for FPGA")
+    if bitlength > 2048:
+        return 4096
+    if bitlength > 1024:
+        return 2048
+    if bitlength > 512:
+        return 1024
+    if bitlength > 256:
+        return 512
+    return 256
+
+
+def rsa_pubkey_id_process(
+        random: BigIntStorage,
+        exponent,
+        modulus,
+        hash: Hash_key_storage,
+        hash_length,
+        size):
+    '''
+    Call FPGA_LIB.rsa_pubkey_id_process on a random vector and a hash vector.
+    ------------------
+    Paras:
+        random:      BigIntStorage, must hold exactly `size` elements
+        exponent:    int, passed as little-endian bytes
+        modulus:     int, passed as little-endian bytes
+        hash:        Hash_key_storage
+        hash_length: int, forwarded to the library
+        size:        int, number of elements
+    Return:
+        Hash_key_storage wrapping the newly allocated result buffer
+    Raise:
+        PermissionError: size differs from random.vec_size
+    '''
+    if random.vec_size != size:
+        raise PermissionError(
+            f"The size of random vector {random.vec_size} does not equal to size of hash {size}"
+        )
+    exp_bits = bit_change(exponent, 0)
+    mod_bits = bit_change(modulus, 0)
+    result_ptr = FPGA_LIB.c_malloc(c_size_t(size * mod_bits // 8))
+    exp_bytes = exponent.to_bytes(exp_bits // 8, 'little')
+    mod_bytes = modulus.to_bytes(mod_bits // 8, 'little')
+
+    FPGA_LIB.rsa_pubkey_id_process(
+        c_char_p(random.bigint_storage),
+        c_char_p(hash.hash_storage),
+        c_char_p(mod_bytes),
+        c_char_p(exp_bytes),
+        c_char_p(result_ptr),
+        c_size_t(size),
+        c_size_t(mod_bits),
+        c_size_t(exp_bits),
+        c_size_t(hash_length),
+        c_size_t(8 * random.elem_size),
+        c_size_t(0),
+    )
+
+    return Hash_key_storage(result_ptr)
+
+
+def rsa_powmod(hash: Hash_key_storage, exponent, modulus, hash_length, size):
+    '''
+    Call FPGA_LIB.rsa_powmod on a hash vector.
+    ------------------
+    Paras:
+        hash:        Hash_key_storage
+        exponent:    int, passed as little-endian bytes
+        modulus:     int, passed as little-endian bytes
+        hash_length: int, forwarded to the library
+        size:        int, number of elements
+    Return:
+        Hash_key_storage wrapping the newly allocated result buffer
+    '''
+    exp_bits = bit_change(exponent, 0)
+    mod_bits = bit_change(modulus, 0)
+    result_ptr = FPGA_LIB.c_malloc(c_size_t(size * mod_bits // 8))
+    exp_bytes = exponent.to_bytes(exp_bits // 8, 'little')
+    mod_bytes = modulus.to_bytes(mod_bits // 8, 'little')
+
+    FPGA_LIB.rsa_powmod(
+        c_char_p(hash.hash_storage),
+        c_char_p(exp_bytes),
+        c_char_p(mod_bytes),
+        c_char_p(result_ptr),
+        c_size_t(size),
+        c_size_t(hash_length),
+        c_size_t(exp_bits),
+        c_size_t(mod_bits),
+        c_size_t(0),
+    )
+
+    return Hash_key_storage(result_ptr)
+
+
+def rsa_divm(
+        hash: Hash_key_storage,
+        random: BigIntStorage,
+        rsa_n,
+        hash_bit,
+        size):
+    '''
+    Call FPGA_LIB.RSA_divmod on a hash vector and a random vector.
+    ------------------
+    Paras:
+        hash:     Hash_key_storage
+        random:   BigIntStorage, must hold exactly `size` elements
+        rsa_n:    int, passed as little-endian bytes
+        hash_bit: int, must equal the rounded bit length of rsa_n
+        size:     int, number of elements
+    Return:
+        Hash_key_storage wrapping the newly allocated result buffer
+    Raise:
+        PermissionError: size or hash_bit does not match
+    '''
+    if random.vec_size != size:
+        raise PermissionError(
+            f"The size of random vector {random.vec_size} does not equal to that of hash {size}"
+        )
+
+    modulus_length = bit_change(rsa_n, 0)
+    if hash_bit != modulus_length:
+        raise PermissionError(
+            f"The biglength of hash value from host {hash_bit} does not equal to that of key {modulus_length}"
+        )
+
+    n_bytes = rsa_n.to_bytes(modulus_length // 8, 'little')
+    result_ptr = FPGA_LIB.c_malloc(c_size_t(size * modulus_length // 8))
+    FPGA_LIB.RSA_divmod(
+        c_char_p(hash.hash_storage),
+        c_char_p(random.bigint_storage),
+        c_char_p(n_bytes),
+        c_char_p(result_ptr),
+        c_size_t(size),
+        c_size_t(hash_bit),
+        c_size_t(8 * random.elem_size),
+    )
+
+    return Hash_key_storage(result_ptr)
+
+
+def fp_align(store, res_store, stream):
+    '''
+    Align the elements of a FixedPointStorage through FPGA_LIB.fpn_align.
+    ------------------
+    Paras:
+        store:     FixedPointStorage, the source data
+        res_store: FixedPointStorage or None; None allocates new buffers
+        stream:    not read by this function
+    Return:
+        the value of _fp_init_store for the result buffers
+    '''
+    elem_cnt = store.vec_size
+    # pick the destination: caller-provided buffers or fresh allocations
+    if res_store is not None:
+        out_fpn = res_store.bigint_storage
+        out_base = res_store.base_storage
+        out_exp = res_store.exp_storage
+    else:
+        out_fpn = FPGA_LIB.c_malloc(c_size_t(elem_cnt * PLAIN_BYTE))
+        out_base = FPGA_LIB.c_malloc(c_size_t(elem_cnt * U_INT32_BYTE))
+        out_exp = FPGA_LIB.c_malloc(c_size_t(elem_cnt * U_INT32_BYTE))
+    device_num = __get_FPGA_device_num(store.mem_type)
+    n_bytes = store.encode_n.to_bytes(PLAIN_BYTE, 'little')
+    FPGA_LIB.fpn_align(
+        c_char_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_char_p(out_fpn),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(elem_cnt),
+        c_char_p(n_bytes),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_num),
+    )
+    return _fp_init_store(
+        res_store,
+        out_fpn,
+        out_base,
+        out_exp,
+        elem_cnt,
+        store.encode_n,
+        store.max_int,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_sum_multi_index(
+        pub_key, store, valid_index, node_id, node_num, min_value=0, max_value=None
+):
+    '''
+    Sum the encrypted values that share an index, separately for every node.
+    ------------
+    Parameters:
+        pub_key: PubKeyStorage
+        store:   PaillierEncryptedStorage, the original PEN_storage class
+        valid_index:  ndarray, contains indices like [-1, 1, 2, 1, 3, 3, 2, -1] for each instance,
+                        -1 means that this value will not be calculated if min_value >= 0
+                        1,2,3 means the different groups that it belongs to
+        node_id:      ndarray, contains node_id like [3, 1, 0, 2, 3, 1] for each instance.
+                        0,1,2 represent the node that current instance locates in
+        node_num:     int, number of nodes
+        min_value:    int, The min valid value of the valid index, default 0,
+                           in the above example, if min_value == 1, then -1 will be invalid
+                           if min_value == -1, -1 is also valid
+        max_value:    int, The max valid value of the valid index;
+                           max(valid_index) is used when it is None
+    Return:
+        the value of _pi_init_ss for a result holding
+        node_num * (max_value - min_value + 1) elements, with shape tuple
+        (node_num, max_value - min_value + 1)
+    '''
+    elem_cnt = store.vec_size
+    index_store = te_p2c(valid_index, None)
+    node_store = te_p2c(node_id, None)
+    # fall back to the largest index present when no maximum is designated
+    if max_value is None:
+        max_value = max(valid_index)
+    index_num = max_value - min_value + 1
+    out_size = index_num * node_num
+
+    out_pen = FPGA_LIB.c_malloc(c_size_t(out_size * CIPHER_BYTE))
+    out_base = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+    out_exp = FPGA_LIB.c_malloc(c_size_t(out_size * U_INT32_BYTE))
+    device_num = __get_FPGA_device_num(store.mem_type)
+    FPGA_LIB.pen_sum_with_multi_index(
+        c_char_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_char_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(index_num),
+        c_size_t(node_num),
+        c_int64(min_value),
+        c_void_p(index_store.data),
+        c_void_p(node_store.data),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(elem_cnt),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_num),
+    )
+    return _pi_init_ss(
+        None,
+        out_pen,
+        out_base,
+        out_exp,
+        out_size,
+        None,
+        (node_num, index_num),
+        MEM_HOST,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def pi_accumulate(pub_key, store, shape):
+    '''
+    Run FPGA_LIB.gmp_accumulate over a PaillierEncryptedStorage.
+    ----------------
+    Paras:
+        pub_key:  PubKeyStorage
+        store:    PaillierEncryptedStorage, the source data
+        shape:    TensorShapeStorage of the source; a 1-D shape (n,) is
+                  handled as (1, n)
+    Return:
+        the value of _pi_init_ss for the result buffers, which hold as many
+        elements as the source
+    '''
+    elem_cnt = store.vec_size
+
+    out_pen = FPGA_LIB.c_malloc(c_size_t(CIPHER_BYTE * elem_cnt))
+    out_base = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * elem_cnt))
+    out_exp = FPGA_LIB.c_malloc(c_size_t(U_INT32_BYTE * elem_cnt))
+    dims = shape.to_tuple()
+    # promote a vector to a single-row matrix for the library call
+    if len(dims) == 1:
+        dims = (1, dims[0])
+
+    FPGA_LIB.gmp_accumulate(
+        c_char_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_void_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(dims[0]),
+        c_size_t(dims[1]),
+        pub_key.n,
+        pub_key.g,
+        pub_key.nsquare,
+        pub_key.max_int,
+        c_size_t(CIPHER_BITS),
+        0,
+    )
+
+    return _pi_init_ss(
+        None,
+        out_pen,
+        out_base,
+        out_exp,
+        elem_cnt,
+        None,
+        dims,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py
new file mode 100644
index 0000000000..b672a512e7
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/fpga_tensor.py
@@ -0,0 +1,511 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import typing
+
+import numpy as np
+
+from .fpga_engine import (
+    PaillierEncryptedStorage,
+    TensorShapeStorage,
+    pi_add,
+    te_p2c,
+    fp_encode,
+    pi_encrypt,
+    pi_mul,
+    pi_matmul,
+    pi_rmatmul,
+    pi_sum,
+    pi_p2c_pub_key,
+    pi_decrypt,
+    pi_p2c_priv_key,
+    te_c2p,
+)
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierKeypair,
+)
+
+
+class Cipherblock:
+    def __init__(
+            self,
+            store: PaillierEncryptedStorage,
+            shape: TensorShapeStorage,
+            pk: "PK"):
+        self.store = store
+        self.shape = shape
+        self.pk = pk
+
+    def get_shape(self):
+        return self.shape.to_tuple()
+
+    def get_size(self):
+        return self.shape.size()
+
+    @staticmethod
+    def gen_shape(other):
+        return TensorShapeStorage().from_tuple(other.shape)
+
+    def _add_plaintext(self, other) -> "Cipherblock":
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        pi_store = pi_encrypt(self.pk.cpu_pub_key, fp_store)
+        res_store, res_shape = pi_add(
+            self.pk.cpu_pub_key, self.store, pi_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _mul_plaintext(self, other) -> "Cipherblock":
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_mul(
+            self.pk.cpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _matmul_plaintext(self, other) -> "Cipherblock":
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_matmul(
+            self.pk.cpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _rmatmul_plaintext(self, other) -> "Cipherblock":
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_rmatmul(
+            self.pk.cpu_pub_key, fp_store, self.store, self.gen_shape(other), self.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def add_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        res_store, res_shape = pi_add(
+            self.pk.cpu_pub_key, self.store, other.store, self.shape, other.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def add_plaintext_f64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_f32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_scalar_f64(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_f32(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i64(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i32(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._add_plaintext(other_array)
+
+    def sub_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other.mul_plaintext_scalar_i32(-1))
+
+    def sub_plaintext_f64(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other * -1)
+
+    def sub_plaintext_f32(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other * -1)
+
+    def sub_plaintext_i64(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other * -1)
+
+    def sub_plaintext_i32(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other * -1)
+
+    def sub_plaintext_scalar_f64(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other * -1)
+
+    def sub_plaintext_scalar_f32(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other * -1)
+
+    def sub_plaintext_scalar_i64(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other * -1)
+
+    def sub_plaintext_scalar_i32(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other * -1)
+
+    def mul_plaintext_f64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_f32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_scalar_f64(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_f32(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i64(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i32(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._mul_plaintext(other_array)
+
+    def matmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def sum(self) -> "Cipherblock":
+        res_store, res_shape = pi_sum(
+            self.pk.cpu_pub_key, self.store, self.shape, axis=None
+        )
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def sum_axis(self, axis=None):
+        res_store, res_shape = pi_sum(
+            self.pk.cpu_pub_key, self.store, self.shape, axis)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def mean(self) -> "Cipherblock":
+        return self.sum().mul_plaintext_scalar_f64(float(1 / self.get_size()))
+
+    """parallel"""
+
+    def add_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other)
+
+    def add_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other)
+
+    def add_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other)
+
+    def add_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other)
+
+    def add_plaintext_scalar_f64_par(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other)
+
+    def add_plaintext_scalar_f32_par(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other)
+
+    def add_plaintext_scalar_i64_par(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other)
+
+    def add_plaintext_scalar_i32_par(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other)
+
+    def add_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other)
+
+    def sub_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.sub_cipherblock(other)
+
+    def sub_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f64(other)
+
+    def sub_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f32(other)
+
+    def sub_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i64(other)
+
+    def sub_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i32(other)
+
+    def sub_plaintext_scalar_f64_par(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f64(other)
+
+    def sub_plaintext_scalar_f32_par(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f32(other)
+
+    def sub_plaintext_scalar_i64_par(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i64(other)
+
+    def sub_plaintext_scalar_i32_par(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i32(other)
+
+    def mul_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f64(other)
+
+    def mul_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f32(other)
+
+    def mul_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i64(other)
+
+    def mul_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i32(other)
+
+    def mul_plaintext_scalar_f64_par(
+            self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f64(other)
+
+    def mul_plaintext_scalar_f32_par(
+            self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f32(other)
+
+    def mul_plaintext_scalar_i64_par(
+            self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i64(other)
+
+    def mul_plaintext_scalar_i32_par(
+            self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i32(other)
+
+    def matmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f64(other)
+
+    def matmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f32(other)
+
+    def matmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i64(other)
+
+    def matmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i32(other)
+
+    def matmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f64(other)
+
+    def matmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f32(other)
+
+    def matmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i64(other)
+
+    def matmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i32(other)
+
+    def rmatmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f64(other)
+
+    def rmatmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f32(other)
+
+    def rmatmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i64(other)
+
+    def rmatmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i32(other)
+
+    def rmatmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f64(other)
+
+    def rmatmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f32(other)
+
+    def rmatmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i64(other)
+
+    def rmatmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i32(other)
+
+    def sum_par(self) -> "Cipherblock":
+        return self.sum()
+
+    def mean_par(self) -> "Cipherblock":
+        return self.mean()
+
+
+class PK:
+    def __init__(self, pub_key: PaillierPublicKey):
+        self.pub_key = pub_key
+        self.cpu_pub_key = pi_p2c_pub_key(None, self.pub_key)
+
+    def _encrypt(self, a) -> Cipherblock:
+        shape = TensorShapeStorage().from_tuple(a.shape)
+        fp_store = fp_encode(te_p2c(a), self.pub_key.n, self.pub_key.max_int)
+        pi_store = pi_encrypt(self.cpu_pub_key, fp_store)
+        return Cipherblock(pi_store, shape, self)
+
+    def encrypt_f64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f64_par(self, a) -> Cipherblock:
+        return self.encrypt_f64(a)
+
+    def encrypt_f32_par(self, a) -> Cipherblock:
+        return self.encrypt_f32(a)
+
+    def encrypt_i64_par(self, a) -> Cipherblock:
+        return self.encrypt_i64(a)
+
+    def encrypt_i32_par(self, a) -> Cipherblock:
+        return self.encrypt_i32(a)
+
+
+class SK:
+    def __init__(self, priv_key: PaillierPrivateKey, pk: PK):
+        self.priv_key = priv_key
+        self.cpu_priv_key = pi_p2c_priv_key(None, priv_key)
+        self.pk = pk
+
+    def _decrypt(self, a: Cipherblock):
+        if a.store.vec_size == 0:
+            return np.asarray([])
+        te_res = pi_decrypt(a.pk.cpu_pub_key, self.cpu_priv_key, a.store)
+        return te_c2p(te_res).reshape(a.get_shape())
+
+    def decrypt_f64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float64)
+
+    def decrypt_f32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float32)
+
+    def decrypt_i64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int64)
+
+    def decrypt_i32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int32)
+
+    def decrypt_f64_par(self, a: Cipherblock):
+        return self.decrypt_f64(a)
+
+    def decrypt_f32_par(self, a: Cipherblock):
+        return self.decrypt_f32(a)
+
+    def decrypt_i64_par(self, a: Cipherblock):
+        return self.decrypt_i64(a)
+
+    def decrypt_i32_par(self, a: Cipherblock):
+        return self.decrypt_i32(a)
+
+
+def keygen(bit_size) -> typing.Tuple[PK, SK]:
+    pub_key, priv_key = PaillierKeypair.generate_keypair(n_length=bit_size)
+    pk = PK(pub_key)
+    sk = SK(priv_key, pk)
+    return pk, sk
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/tests/__init__.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py
new file mode 100755
index 0000000000..48c243c806
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_engine.py
@@ -0,0 +1,1133 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import numpy
+import unittest
+import random
+import functools
+import operator
+import time
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+)
+
+from ..fpga_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free, te_slice, initialize_device, fp_p2c, pi_p2c, bi_gen_rand, bi_c2p, pi_transpose, pi_matmul, fp_transpose,
+    CIPHER_BYTE, te_c2bytes, te_bytes2c, fp_c2bytes, fp_bytes2c, pi_c2bytes, pi_bytes2c, pi_slice, pi_reshape,
+    te_c2p_first, TensorStorage, te_c2p_shape, te_cat, te_pow, te_add, te_mul, te_truediv, te_floordiv, te_sub,
+    te_matmul, te_abs, te_transpose, te_reshape, te_exp, te_hstack, pi_cat, te_sum, fp_slice, te_p2c_shape, fp_cat,
+    te_neg,
+)
+
+# SWITCH DATA TYPE HERE
+# EITHER INT64_TYPE OR FLOAT_TYPE
+# RAND_TYPE picks the branch taken by generate_rand() and is also handed to
+# fp_p2c / pi_p2c when CPU-side objects are converted in the tests below.
+RAND_TYPE = INT64_TYPE
+
+# Number of random values generated per operand. Several tests view these values
+# as a 2 x 3 matrix (hard-coded shapes), so changing this needs matching shape edits.
+TEST_SIZE = 6
+# Upper bound on the relative error accepted by assert_diff().
+ERROR_TOLERANCE = 1e-10
+
+
+def generate_rand(test_size):
+    if RAND_TYPE == FLOAT_TYPE:
+        return numpy.random.normal(0, 10, test_size)
+    elif RAND_TYPE == INT64_TYPE:
+        return numpy.random.randint(-2 ** 30, 2 ** 30, test_size)
+    else:
+        raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    if res == 0 or ref == 0:
+        assert res == 0
+        assert ref == 0
+    else:
+        diff = res - ref
+        assert abs(diff / res) < ERROR_TOLERANCE
+        assert abs(diff / ref) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    assert res.shape == ref.shape
+    res, ref = res.flatten(), ref.flatten()
+    assert res.shape == ref.shape
+    for i in range(res.size):
+        assert_diff(res[i], ref[i])
+
+
+class TestOperators(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        # sys.stdout = open("stdout.log", 'a')  # uncomment this to redirect stdout
+        # random.seed(time.time())  # no need to set random.seed as we're using numpy.random
+        initialize_device()
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int
+        cls._cpu_pub_key = pi_p2c_pub_key(None, cls._pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(None, cls._priv_key)
+        cls._fpga_pub_key = pi_h2d_pub_key(None, cls._cpu_pub_key)
+        cls._fpga_priv_key = pi_h2d_priv_key(None, cls._cpu_priv_key)
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # test encode and decode
+    # using operators: te_p2c, fp_encode, fp_c2p, fp_decode, te_c2p
+    def test_encode_and_decode(self):
+        # Encodes one random vector twice -- with fp_encode and with
+        # FixedPointNumber.encode -- and requires identical encoding/BASE/exponent.
+        # Then decodes every encoder/decoder pairing and compares each against the
+        # pure FixedPointNumber round trip.
+        print("\n\n", "*" * 100, "\n\nTest Encode and Decode Begins")
+
+        raw = generate_rand(TEST_SIZE)
+        raw[TEST_SIZE // 2] = 0  # test encode zero
+        store = te_p2c(raw, None)
+        # an explicit precision is only supplied for float data
+        precision = 10000 if RAND_TYPE == FLOAT_TYPE else None
+
+        # check encoded numbers (fixed-point numbers)
+        fpga_encoded_store = fp_encode(store, self.n, self.max_int, precision, None)
+        fpga_encoded = fp_c2p(fpga_encoded_store)
+        cpu_encoded = [FixedPointNumber.encode(v, self.n, self.max_int, precision) for v in raw]
+        assert len(fpga_encoded) == TEST_SIZE
+        assert len(cpu_encoded) == TEST_SIZE
+        # dump everything first so a later assertion failure still has full context
+        for i in range(TEST_SIZE):
+            print("i:", i, ", raw data:", raw[i])
+            print("FPGA encoding:", fpga_encoded[i].encoding, ", base:", fpga_encoded[i].BASE, ", exp:",
+                  fpga_encoded[i].exponent)
+            print("CPU encoding:", cpu_encoded[i].encoding, ", base:", cpu_encoded[i].BASE, ", exp:",
+                  cpu_encoded[i].exponent)
+        for i in range(TEST_SIZE):
+            assert fpga_encoded[i].encoding == cpu_encoded[i].encoding
+            assert fpga_encoded[i].BASE == cpu_encoded[i].BASE
+            assert fpga_encoded[i].exponent == cpu_encoded[i].exponent
+
+        # check decoded numbers
+        # four pairings: {CPU, FPGA} encoder x {CPU, FPGA} decoder
+        cpu_encoded_cpu_decoded = [v.decode() for v in cpu_encoded]
+        cpu_encoded_fpga_decoded = te_c2p(fp_decode(fp_p2c(None, cpu_encoded, RAND_TYPE), None, None))
+        fpga_encoded_cpu_decoded = [v.decode() for v in fpga_encoded]
+        fpga_encoded_fpga_decoded = te_c2p(fp_decode(fpga_encoded_store, None, None))
+        assert len(cpu_encoded_cpu_decoded) == TEST_SIZE
+        assert len(cpu_encoded_fpga_decoded) == TEST_SIZE
+        assert len(fpga_encoded_cpu_decoded) == TEST_SIZE
+        assert len(fpga_encoded_fpga_decoded) == TEST_SIZE
+        for i in range(TEST_SIZE):
+            print("decoded compare: i:", i, cpu_encoded_cpu_decoded[i], cpu_encoded_fpga_decoded[i],
+                  fpga_encoded_cpu_decoded[i], fpga_encoded_fpga_decoded[i])
+            # the CPU-encode/CPU-decode value is the reference for the other three
+            assert_diff(cpu_encoded_fpga_decoded[i], cpu_encoded_cpu_decoded[i])
+            assert_diff(fpga_encoded_cpu_decoded[i], cpu_encoded_cpu_decoded[i])
+            assert_diff(fpga_encoded_fpga_decoded[i], cpu_encoded_cpu_decoded[i])
+
+        print("test passed")
+
+    # test encrypt and decrypt
+    # using operators: fp_encode, pi_encrypt, pi_decrypt, te_p2c, te_c2p, pi_c2p
+    def test_encrypt_and_decrypt(self):
+        # Round-trips the same random vector through three encrypt/decrypt pairings:
+        # engine/engine, engine encrypt + Python decrypt, Python encrypt + engine decrypt.
+        # Each decrypted value is compared with the raw input via assert_diff.
+        print("\n\n", "*" * 100, "\n\nTest Encrypt And Decrypt Begins")
+
+        print("\nPart 1: FPGA encrypt, FPGA decrypt")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, None, None)
+        ref1 = te_c2p(decrypted)
+        # every intermediate store must keep the data type tag of the input
+        assert store.data_type == RAND_TYPE
+        assert encoded.data_type == RAND_TYPE
+        assert encrypted.data_type == RAND_TYPE
+        assert decrypted.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref1[i])
+            assert_diff(raw[i], ref1[i])
+
+        print("\nPart 2: FPGA encrypt, CPU decrypt")
+        # rebuild Python PaillierEncryptedNumber objects from the engine's ciphertexts
+        tmp_enc, _, tmp_exp = pi_c2p(encrypted)
+        pen_recv = [PaillierEncryptedNumber(self._pub_key, tmp_enc[i], int(round(tmp_exp[i]))) for i in
+                    range(TEST_SIZE)]
+        ref2 = [self._priv_key.decrypt(v) for v in pen_recv]
+        for i in range(TEST_SIZE):
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref2[i])
+            assert_diff(raw[i], ref2[i])
+
+        print("\nPart 3: CPU encrypt, FPGA decrypt")
+        # print("FPGA decrypting a CPU encrypted number currently unavailable, needs pi_p2c support")
+        # NOTE(review): the third encrypt() argument 0 presumably disables random
+        # obfuscation so ciphertexts are comparable -- confirm against PaillierPublicKey.
+        cpu_encrypted = [self._pub_key.encrypt(raw[i], None, 0) for i in range(TEST_SIZE)]
+        for i in range(TEST_SIZE):
+            print("FPGA: i:", i, ", cipher text:", pen_recv[i].ciphertext(False), ", exp:", pen_recv[i].exponent)
+            print("CPU: i:", i, ", cipher text:", cpu_encrypted[i].ciphertext(False), ", exp:",
+                  cpu_encrypted[i].exponent)
+            assert pen_recv[i].exponent == cpu_encrypted[i].exponent
+            # a ciphertext mismatch is reported but deliberately does not fail the test;
+            # the decrypt comparison further down is the binding check
+            try:
+                assert pen_recv[i].ciphertext(False) == cpu_encrypted[i].ciphertext(False)
+            except AssertionError:
+                # Note that there's an approx 1/1000 probability that these ciphers don't match
+                # However, this shouldn't affect the final result
+                print("\n>>>>>> The following cipher texts didn't match:")
+                print("raw number:", raw[i])
+                print("FPGA encoding:", fp_c2p(encoded)[i].encoding)
+                print("CPU encoding:", FixedPointNumber.encode(raw[i], self.n, self.max_int).encoding)
+                print("FPGA cipher:", pen_recv[i].ciphertext(False))
+                print("CPU cipher:", cpu_encrypted[i].ciphertext(False))
+                print("pub_key.n:", self._pub_key.n)
+                print("pub_key.nsquare:", self._pub_key.nsquare)
+                print("priv_key.p:", self._priv_key.p)
+                print("priv_key.q:", self._priv_key.q)
+                print(">>>>>> End Dumping\n")
+        # convert the Python ciphertexts into an engine store and decrypt them there
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        ref3 = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref3[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref3[i])
+
+        print("test passed")
+
+    def test_pi_add(self):
+        print("\n\n", "*" * 100, "\n\nTest Paillier Encrypted Number Add Begins")
+        raw_1, raw_2 = generate_rand(2), generate_rand(TEST_SIZE)
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encrypted_1, encrypted_2 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None), pi_encrypt(self._fpga_pub_key,
+                                                                                                     encoded_2, None,
+                                                                                                     None)
+        shape_1, shape_2 = TensorShapeStorage(2, 1), TensorShapeStorage(2, 3)  # passed different shapes
+        res_store, res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_2, shape_1, shape_2, None, None, None)
+        assert res_shape.to_tuple() == (2, 3)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None)
+        received = te_c2p(decrypted)
+        for i in range(TEST_SIZE):
+            print("i:", i, ", raw result:", raw_1[i // 3] + raw_2[i], ", FPGA result:", received[i])
+            assert_diff(raw_1[i // 3] + raw_2[i], received[i])
+        print("test passed")
+
+    def test_pi_mul(self):
+        print("\n\n", "*" * 100, "\n\nTest PEN Multiplies FPN Begins")
+        raw_1, raw_2 = generate_rand(3), generate_rand(TEST_SIZE)
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        shape_1, shape_2 = TensorShapeStorage(3), TensorShapeStorage(2, 3)  # passed different shapes
+        res_store, res_shape = pi_mul(self._fpga_pub_key, encrypted, encoded_2, shape_1, shape_2, None, None, None)
+        assert res_shape.to_tuple() == (2, 3)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None)
+        received = te_c2p(decrypted)
+        for i in range(TEST_SIZE):
+            print("i:", i, ", raw result:", raw_1[i % 3] * raw_2[i], ", FPGA result:", received[i])
+            assert_diff(raw_1[i % 3] * raw_2[i], received[i])
+        print("test passed")
+
+    def test_gen_obf_seed(self):
+        print("\n\n", "*" * 100, "\n\nTest Generate Obfscator Begins")
+        # why divided by 6, see pi_gen_obf_seed implementation
+        bi_store = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        obfuscators = bi_c2p(bi_store.bigint_storage, 0, TEST_SIZE)
+        for i in range(TEST_SIZE):
+            print("i:", i, "obfuscator:", obfuscators[i])
+            assert CIPHER_BITS * 0.9 <= obfuscators[i].bit_length()
+            assert obfuscators[i].bit_length() <= CIPHER_BITS
+        print("test passed")
+
+    def test_obfuscate(self):
+        # Same three encrypt/decrypt pairings as test_encrypt_and_decrypt, but the engine
+        # ciphertexts are passed through pi_obfuscate and the Python ciphertexts are
+        # created with explicit random values.
+        print("\n\n", "*" * 100, "\n\nTest Obfuscate Begins")
+
+        # generate big random values
+        # NOTE(review): both generators get the same size arguments and the same 0
+        # argument, presumably a seed that keeps the Python random values and the
+        # engine obfuscation seeds in step -- confirm against the engine code.
+        bi_rand_store = bi_gen_rand(CIPHER_BITS // 6, TEST_SIZE, None, 0, None)
+        bi_rand_vals = bi_c2p(bi_rand_store.bigint_storage, 0, TEST_SIZE)
+        obf_rand_store = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+
+        print("\nPart 1: FPGA encrypt, FPGA decrypt")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        raw_encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        encrypted = pi_obfuscate(self._fpga_pub_key, raw_encrypted, obf_rand_store, None, None)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, None, None)
+        ref1 = te_c2p(decrypted)
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref1[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref1[i])
+
+        print("\nPart 2: FPGA encrypt, CPU decrypt")
+        # rebuild Python PaillierEncryptedNumber objects from the obfuscated ciphertexts
+        tmp_enc, _, tmp_exp = pi_c2p(encrypted)
+        pen_recv = [PaillierEncryptedNumber(self._pub_key, tmp_enc[i], int(round(tmp_exp[i]))) for i in
+                    range(TEST_SIZE)]
+        ref2 = [self._priv_key.decrypt(v) for v in pen_recv]
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref2[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref2[i])
+
+        print("\nPart 3: CPU encrypt, FPGA decrypt")
+        cpu_encrypted = [self._pub_key.encrypt(raw[i], None, bi_rand_vals[i]) for i in range(TEST_SIZE)]
+        for i in range(TEST_SIZE):
+            print("FPGA: i:", i, ", encoding:", pen_recv[i].ciphertext(False), ", exp:", pen_recv[i].exponent)
+            print("CPU: i:", i, ", encoding:", cpu_encrypted[i].ciphertext(False), ", exp:", cpu_encrypted[i].exponent)
+            # unlike test_encrypt_and_decrypt, a ciphertext mismatch here fails the test
+            assert pen_recv[i].ciphertext(False) == cpu_encrypted[i].ciphertext(False)
+            assert pen_recv[i].exponent == cpu_encrypted[i].exponent
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        ref3 = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], ref3[i])
+            print("i:", i, ", original:", raw[i], ", decrypted:", ref3[i])
+
+        print("test passed")
+
+    # tests both PEN and FPN transpose
+    def test_transpose(self):
+        print("\n\n", "*" * 100, "\n\nTest transpose of both FPN and PEN matrices Begins")
+        raw = generate_rand(TEST_SIZE)
+        # generate test PaillierEncryptedStorage and its shape
+        te_store = te_p2c(raw, None)
+        encoded = fp_encode(te_store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        rows, cols = 2, 3
+        shape = TensorShapeStorage(rows, cols)
+        pi_transpose_store, pi_transpose_shape = pi_transpose(encrypted, shape, None, None, None)
+        fp_transpose_store, fp_transpose_shape = fp_transpose(encoded, shape, None, None, None)
+        print("original shape:", shape.to_tuple(), ", transposed FPN shape:", fp_transpose_shape.to_tuple(),
+              ", transposed PEN shape", pi_transpose_shape.to_tuple())
+        assert pi_transpose_shape.to_tuple() == (cols, rows)
+        assert fp_transpose_shape.to_tuple() == (cols, rows)
+        fp_original = fp_c2p(encoded)
+        fp_transposed = fp_c2p(fp_transpose_store)
+        pi_original_cipher, pi_original_base, pi_original_exp = pi_c2p(encrypted)
+        pi_transposed_cipher, pi_transposed_base, pi_transposed_exp = pi_c2p(pi_transpose_store)
+        for i in range(rows):
+            for j in range(cols):
+                print("testing index (", i, ", ", j, ")")
+                assert fp_original[i * cols + j].encoding == fp_transposed[j * rows + i].encoding
+                assert fp_original[i * cols + j].BASE == fp_transposed[j * rows + i].BASE
+                assert fp_original[i * cols + j].exponent == fp_transposed[j * rows + i].exponent
+                assert pi_original_cipher[i * cols + j] == pi_transposed_cipher[j * rows + i]
+                assert pi_original_base[i * cols + j] == pi_transposed_base[j * rows + i]
+                assert pi_original_exp[i * cols + j] == pi_transposed_exp[j * rows + i]
+        print("test passed")
+
+    def test_pi_sum(self):
+        print("\n\n", "*" * 100, "\n\nTest Sum Begins")
+        # generate raw data
+        raw = generate_rand(TEST_SIZE)
+        # generate test PaillierEncryptedStorage and its shape
+        te_store = te_p2c(raw, None)
+        encoded = fp_encode(te_store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        rows, cols = 2, 3
+        shape = TensorShapeStorage(rows, cols)
+
+        print("raw matrix:\n", numpy.asarray(raw).reshape(rows, cols))
+
+        print("TEST AXIS = 0")
+        res_sum_axis0, res_shape_axis0 = pi_sum(self._fpga_pub_key, encrypted, shape, 0, None, None, None)
+        res_axis0_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis0, None, None))
+        res_axis0_cpu = numpy.asarray(raw).reshape(rows, cols).sum(axis=0)
+        print("result shape:", res_shape_axis0.to_tuple())
+        for i in range(cols):
+            print("column:", i, ", CPU result:", res_axis0_cpu[i], ", FPGA result:", res_axis0_fpga[i])
+            assert_diff(res_axis0_cpu[i], res_axis0_fpga[i])
+
+        print("TEST AXIS = 1")
+        res_sum_axis1, res_shape_axis1 = pi_sum(self._fpga_pub_key, encrypted, shape, 1, None, None, None)
+        res_axis1_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis1, None, None))
+        res_axis1_cpu = numpy.asarray(raw).reshape(rows, cols).sum(axis=1)
+        print("result shape:", res_shape_axis1.to_tuple())
+        for i in range(rows):
+            print("column:", i, ", CPU result:", res_axis1_cpu[i], ", FPGA result:", res_axis1_fpga[i])
+            assert_diff(res_axis1_cpu[i], res_axis1_fpga[i])
+
+        print("TEST AXIS = None")
+        res_sum_axis, res_shape_axis = pi_sum(self._fpga_pub_key, encrypted, shape, None, None, None, None)
+        res_axis_fpga = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_sum_axis, None, None))
+        res_axis_cpu = [numpy.asarray(raw).reshape(rows, cols).sum()]
+        print("result shape:", res_shape_axis.to_tuple())
+        for i in range(pow(CIPHER_BYTE, 0, PLAIN_BYTE)):
+            print("result:", i, ", CPU result:", res_axis_cpu[i], ", FPGA result:", res_axis_fpga[i])
+            assert_diff(res_axis_cpu[i], res_axis_fpga[i])
+        print("test passed")
+
+    def test_pi_matmul(self):
+        print("\n\n", "*" * 100, "\n\nTest PEN Matrix_Multiplies FPN Begins")
+        raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        # generate the 2 operands
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        P, Q, R, S = 2, 3, 3, 2
+        shape_1, shape_2 = TensorShapeStorage(P, Q), TensorShapeStorage(R, S)
+        # then perform the matmul
+        res_store, res_shape = pi_matmul(self._fpga_pub_key, encrypted, encoded_2, shape_1, shape_2, None, None, None)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, res_store, None, None))
+        res = numpy.asarray(decrypted).reshape(res_shape.to_tuple())
+        ref = numpy.asarray(raw_1).reshape(P, Q) @ numpy.asarray(raw_2).reshape(R, S)
+        print("FPGA result shape:", res_shape.to_tuple(), ", CPU result shape:", ref.shape)
+        assert res_shape.to_tuple() == ref.shape
+        print("CPU result:\n", ref, "\nFPGA result:\n", res)
+        assert_ndarray_diff(res, ref)
+        print("test passed")
+
+    def test_combination(self):
+        print("\n\n", "*" * 100, "\n\nTest Combination Begins")
+
+        # generate operands
+        raw_1, raw_3 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        if RAND_TYPE == INT64_TYPE:
+            raw_2, raw_4 = [i % 16384 for i in generate_rand(TEST_SIZE)], [i % 16384 for i in generate_rand(TEST_SIZE)]
+        elif RAND_TYPE == FLOAT_TYPE:
+            raw_2, raw_4 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        else:
+            raise PermissionError("Invalid Data Type")
+        print('Raw data:\nraw_1:', raw_1, '\nraw_2:', raw_2, '\nraw_3:', raw_3, '\nraw_4:', raw_4)
+
+        # generate shapes and NumPy arrays
+        rows, cols = 2, 3
+        array_1, array_2 = numpy.asarray(raw_1).reshape(rows, cols), numpy.asarray(raw_2).reshape(cols, rows)
+        array_3, array_4 = numpy.asarray(raw_3).reshape(rows, cols), numpy.asarray(raw_4).reshape(rows, cols)
+        shape_1, shape_2 = TensorShapeStorage(rows, cols), TensorShapeStorage(cols, rows)
+        shape_3, shape_4 = TensorShapeStorage(rows, cols), TensorShapeStorage(rows, cols)
+
+        # transfer and encode
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        te_store_3, te_store_4 = te_p2c(raw_3, None), te_p2c(raw_4, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encoded_3, encoded_4 = fp_encode(te_store_3, self.n, self.max_int), fp_encode(te_store_4, self.n, self.max_int)
+
+        # perform encrypt and obfs
+        encrypted_old_1 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        encrypted_old_3 = pi_encrypt(self._fpga_pub_key, encoded_3, None, None)
+        rand_store_1 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        rand_store_3 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        encrypted_1 = pi_obfuscate(self._fpga_pub_key, encrypted_old_1, rand_store_1, None, None)
+        encrypted_3 = pi_obfuscate(self._fpga_pub_key, encrypted_old_3, rand_store_3, None, None)
+
+        print("Perform Add")
+        add_res_store, add_res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_3, shape_1, shape_3, None,
+                                              None, None)
+        add_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, add_res_store, None, None)).reshape(
+            add_res_shape.to_tuple())
+        add_ref = array_1 + array_3
+        print("FPGA intermediate result:", add_res)
+        print("NumPy intermediate result:", add_ref)
+        assert_ndarray_diff(add_res, add_ref)
+
+        print("Perform Mul")
+        mul_res_store, mul_res_shape = pi_mul(self._fpga_pub_key, add_res_store, encoded_4, add_res_shape, shape_4,
+                                              None, None, None)
+        mul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, mul_res_store, None, None)).reshape(
+            mul_res_shape.to_tuple())
+        mul_ref = (array_1 + array_3) * array_4
+        print("FPGA intermediate result:", mul_res)
+        print("NumPy intermediate result:", mul_ref)
+        assert_ndarray_diff(mul_res, mul_ref)
+
+        print("Perform Matmul")
+        matmul_res_store, matmul_res_shape = pi_matmul(self._fpga_pub_key, mul_res_store, encoded_2, mul_res_shape,
+                                                       shape_2, None, None, None)
+        matmul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, matmul_res_store, None, None)).reshape(
+            matmul_res_shape.to_tuple())
+        matmul_ref = ((array_1 + array_3) * array_4) @ array_2
+        print("FPGA result shape:", matmul_res_shape.to_tuple(), ", CPU result shape:", matmul_ref.shape)
+        print("CPU result:\n", matmul_ref)
+        print("FPGA result:\n", matmul_res)
+        assert_ndarray_diff(matmul_res, matmul_ref)
+
+        print("test passed")
+
+    def test_c2bytes_and_bytes2c(self):
+        print("\n\n", "*" * 100, "\n\nTest bytes and c transformation begins")
+
+        raw = generate_rand(TEST_SIZE)
+        print("Raw Data:", raw)
+
+        print("\nPart 1: test te_c2bytes and te_bytes2c")
+        te_store = te_p2c(raw, None)
+        te_bytes = te_c2bytes(te_store, None)
+        te_store_recv = te_bytes2c(te_bytes, te_store)
+        te_ref = list(te_c2p(te_store_recv))
+        print("Bytes Representation:", te_bytes)
+        print("Received data:", te_ref)
+        assert te_store.data_type == RAND_TYPE
+        assert te_store_recv.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], te_ref[i])
+
+        print("\nPart 2: test fp_c2bytes and fp_bytes2c")
+        fp_store = fp_encode(te_store, self.n, self.max_int)
+        fp_bytes = fp_c2bytes(fp_store, None)
+        fp_store_recv = fp_bytes2c(fp_bytes, fp_store)
+        fp_ref = list(te_c2p(fp_decode(fp_store_recv, None, None)))
+        print("Bytes Representation (excerpt):", fp_bytes[1888:1999])
+        print("Received data:", fp_ref)
+        assert fp_store.data_type == RAND_TYPE
+        assert fp_store_recv.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], fp_ref[i])
+
+        print("\nPart 3: test pi_c2bytes and pi_bytes2c")
+        pi_store = pi_encrypt(self._fpga_pub_key, fp_store, None, None)
+        pi_bytes = pi_c2bytes(pi_store, None)
+        pi_store_recv = pi_bytes2c(pi_bytes, pi_store)
+        pi_ref = list(te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store_recv, None, None)))
+        print("Bytes Representation (excerpt):", pi_bytes[1888:1999])
+        print("Received data:", pi_ref)
+        assert pi_store.data_type == RAND_TYPE
+        assert pi_store_recv.data_type == RAND_TYPE
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], pi_ref[i])
+
+        print("test passed")
+
+    def test_fp_slice(self):
+        print("\n\n", "*" * 100, "\n\nTest fp_slice begins")
+        rows, cols = 3, 4
+        shape = [rows, cols]
+        begin_h, end_h = 2, 3
+        begin_v, end_v = 1, 3
+        raw = numpy.asarray(generate_rand(functools.reduce(operator.mul, [*shape], 1))).reshape(shape)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        slice_h_store, slice_h_shape = fp_slice(encoded, TensorShapeStorage(*shape), begin_h, end_h, 0, None, None,
+                                                None)
+        slice_v_store, slice_v_shape = fp_slice(encoded, TensorShapeStorage(*shape), begin_v, end_v, 1, None, None,
+                                                None)
+        recv_h = numpy.asarray(te_c2p(fp_decode(slice_h_store, None, None))).reshape(slice_h_shape)
+        recv_v = numpy.asarray(te_c2p(fp_decode(slice_v_store, None, None))).reshape(slice_v_shape)
+        print("raw array:\n", raw)
+        print("horizontal slice:\n", recv_h)
+        print("vertical slice:\n", recv_v)
+        for i in range(end_h - begin_h):
+            for j in range(cols):
+                assert_diff(raw[begin_h + i][j], recv_h[i][j])
+        for i in range(rows):
+            for j in range(end_v - begin_v):
+                assert_diff(raw[i][begin_v + j], recv_v[i][j])
+        assert slice_h_store.data_type == RAND_TYPE
+        assert slice_v_store.data_type == RAND_TYPE
+        print("test passed")
+
+    def test_pi_slice(self):
+        print("\n\n", "*" * 100, "\n\nTest pi_slice begins")
+        rows, cols = 3, 4
+        shape = [rows, cols]
+        begin_h, end_h = 2, 3
+        begin_v, end_v = 1, 3
+        raw = numpy.asarray(generate_rand(functools.reduce(operator.mul, [*shape], 1))).reshape(shape)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        slice_h_store, slice_h_shape = pi_slice(encrypted, TensorShapeStorage(*shape), begin_h, end_h, 0, None, None,
+                                                None)
+        slice_v_store, slice_v_shape = pi_slice(encrypted, TensorShapeStorage(*shape), begin_v, end_v, 1, None, None,
+                                                None)
+        recv_h = numpy.asarray(
+            te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, slice_h_store, None, None))).reshape(
+            slice_h_shape)
+        recv_v = numpy.asarray(
+            te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, slice_v_store, None, None))).reshape(
+            slice_v_shape)
+        print("raw array:\n", raw)
+        print("horizontal slice:\n", recv_h)
+        print("vertical slice:\n", recv_v)
+        for i in range(end_h - begin_h):
+            for j in range(cols):
+                assert_diff(raw[begin_h + i][j], recv_h[i][j])
+        for i in range(rows):
+            for j in range(end_v - begin_v):
+                assert_diff(raw[i][begin_v + j], recv_v[i][j])
+        assert slice_h_store.data_type == RAND_TYPE
+        assert slice_v_store.data_type == RAND_TYPE
+        print("test passed")
+
+    def test_pi_reshape(self):
+        """Reshape an encrypted store from (2, 3) to (3, 2) and check object identity, shape and values.
+
+        The encrypted store is handed to ``pi_reshape`` both as the input and as the 4th argument,
+        and the test requires the returned store to be that very same Python object.
+        """
+        print("\n\n", "*" * 100, "\n\nTest pi_reshape begins")
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        encoded = fp_encode(store, self.n, self.max_int)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, None, None)
+        old_shape, new_shape = TensorShapeStorage(2, 3), TensorShapeStorage(3, 2)
+        # `encrypted` appears twice on purpose: once as the operand, once as the 4th argument
+        new_store_res, new_shape_res = pi_reshape(encrypted, old_shape, new_shape, encrypted, None,
+                                                  None)  # PREVENT DOUBLE FREE: option 1
+        print("PyObject ids before and after reshape:", id(new_store_res), id(encrypted))
+        # identity, not equality: the result must alias the input object
+        assert id(new_store_res) == id(encrypted)
+        # Alternative kept for reference (not active):
+        # encrypted.exp_storage, encrypted.pen_storage, encrypted.base_storage =
+        # None, None, None  # PREVENT DOUBLE FREE: option 2
+
+        print("original shape:", old_shape.to_tuple(), ", returned shape:", new_shape_res.to_tuple(),
+              ", expected new shape:", new_shape.to_tuple())
+        assert new_shape.to_tuple() == new_shape_res.to_tuple()
+        # values are compared in flat order, so the reshape must not change the element order
+        recv = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, new_store_res, None, None))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], recv[i])
+        assert encoded.data_type == RAND_TYPE
+        assert encrypted.data_type == RAND_TYPE
+        assert new_store_res.data_type == RAND_TYPE
+        print("raw tensor:\n", numpy.asarray(raw).reshape(old_shape.to_tuple()))
+        print("reshaped tensor:\n", numpy.asarray(recv).reshape(new_shape_res.to_tuple()))
+
+        print("test passed")
+
+    def test_fp_mul(self):
+        print("\n\n", "*" * 100, "\n\nTest fp_mul begins")
+        raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        store_1, store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        encoded_1, encoded_2 = fp_encode(store_1, self.n, self.max_int), fp_encode(store_2, self.n, self.max_int)
+        res_store, res_shape = fp_mul(encoded_1, encoded_2, TensorShapeStorage(2, 3), TensorShapeStorage(2, 3), None,
+                                      None, None)
+        decoded = fp_decode(res_store, None, None)
+        recv = te_c2p(decoded)
+        assert res_shape.to_tuple() == (2, 3)
+        assert encoded_1.data_type == RAND_TYPE
+        assert encoded_2.data_type == RAND_TYPE
+        assert res_store.data_type == RAND_TYPE
+        assert decoded.data_type == RAND_TYPE
+
+        cpu_encoded_1 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw_1]
+        cpu_encoded_2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw_2]
+        cpu_res = [FixedPointNumber((cpu_encoded_1[i].encoding * cpu_encoded_2[i].encoding) % self.n,
+                                    cpu_encoded_1[i].exponent + cpu_encoded_2[i].exponent, self.n, self.max_int) for i
+                   in range(TEST_SIZE)]
+        cpu_ref = [v.decode() for v in cpu_res]
+
+        print("FPGA result:", list(recv))
+        print("CPU result:", list(cpu_ref))
+
+        res_fp = fp_c2p(res_store)
+        for i in range(TEST_SIZE):
+            assert_diff(recv[i], cpu_ref[i])
+            assert_diff(res_fp[i].encoding, cpu_res[i].encoding)
+            assert res_fp[i].BASE == cpu_res[i].BASE
+            assert res_fp[i].exponent == cpu_res[i].exponent
+
+        print("test passed")
+
+    def test_te_c2p_first(self):
+        print("\n\n", "*" * 100, "\n\nTest te_c2p_first begins")
+
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, None)
+        print(raw[0], te_c2p_first(store))
+        assert raw[0] == te_c2p_first(store)
+
+        print("test passed")
+
+    def test_malloc(self):
+        print("\n\n", "*" * 100, "\n\nTest malloc begins")
+
+        bi_store = bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        te_store = te_alloc(None, TEST_SIZE, MEM_HOST)
+        fp_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        pi_store = pi_alloc(None, TEST_SIZE, MEM_HOST)
+
+        raw = generate_rand(TEST_SIZE)
+        store = te_p2c(raw, te_store)
+        encoded = fp_encode(store, self.n, self.max_int, None, None, fp_store, None)
+        print("PyObject ids before and after encode:", id(encoded), id(fp_store))
+        assert id(encoded) == id(fp_store)
+        encrypted = pi_encrypt(self._fpga_pub_key, encoded, pi_store, None)
+        obf_seeds = pi_gen_obf_seed(bi_store, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        encrypted = pi_obfuscate(self._fpga_pub_key, pi_store, obf_seeds, pi_store, None)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, encrypted, te_store, None)
+        received = te_c2p(decrypted)
+        print("raw data:", raw, "\nreceived data:", list(received))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], received[i])
+
+        bi_free(bi_store)
+        te_free(te_store)
+        fp_free(fp_store)
+        pi_free(pi_store)
+        # fp_store.base_storage, fp_store.bigint_storage, fp_store.exp_storage = None, None, None
+
+        print("test passed")
+
+    def test_p2c(self):
+        print("\n\n", "*" * 100, "\n\nTest fp_p2c & pi_p2c Begins")
+
+        print("Part 1.1: test te_p2c for list")
+        raw = generate_rand(TEST_SIZE)
+        te_store = te_p2c(raw, None)
+        received = te_c2p(te_store)
+        print("raw data:", raw, "\nreceived data:", list(received))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], received[i])
+
+        print("Part 1.2: test te_p2c for ndarray")
+        np_raw = numpy.asarray(raw).reshape(2, 3)
+        te_store = te_p2c(np_raw, None)
+        received = te_c2p(te_store)
+        print("raw data:", raw, "\nreceived data:", list(received))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], received[i])
+
+        print("Part 2.1: test fp_p2c for list")
+        cpu_encoded = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw]
+        fp_store = fp_p2c(None, cpu_encoded, RAND_TYPE)
+        decoded = te_c2p(fp_decode(fp_store, None, None))
+        print("raw data:", raw, "\ndecoded data:", list(decoded))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], decoded[i])
+
+        print("Part 2.2: test fp_p2c for ndarray")
+        np_cpu_encoded = numpy.asarray(cpu_encoded).reshape(2, 3)
+        fp_store = fp_p2c(None, np_cpu_encoded, RAND_TYPE)
+        decoded = te_c2p(fp_decode(fp_store, None, None))
+        print("raw data:", raw, "\ndecoded data:", list(decoded))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], decoded[i])
+
+        print("Part 3.1: test pi_p2c for list")
+        cpu_encrypted = [self._pub_key.encrypt(raw[i], None, 0) for i in range(TEST_SIZE)]
+        pi_store = pi_p2c(None, cpu_encrypted, RAND_TYPE)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        print("raw data:", raw, "\ndecrypted data:", list(decrypted))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], decrypted[i])
+
+        print("Part 3.2: test pi_p2c for ndarray")
+        np_cpu_encrypted = numpy.asarray(cpu_encrypted).reshape(2, 3)
+        pi_store = pi_p2c(None, np_cpu_encrypted, RAND_TYPE)
+        decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, pi_store, None, None))
+        print("raw data:", raw, "\ndecrypted data:", list(decrypted))
+        for i in range(TEST_SIZE):
+            assert_diff(raw[i], decrypted[i])
+
+        print("test passed")
+
+    def test_tensor(self):
+        print("\n\n", "*" * 100, "\n\nTest Tensor begins")
+        if RAND_TYPE == INT64_TYPE:
+            raw_1, raw_2 = [i % 128 + 1 for i in generate_rand(TEST_SIZE)], [i % 128 + 1 for i in
+                                                                             generate_rand(TEST_SIZE)]
+        elif RAND_TYPE == FLOAT_TYPE:
+            raw_1, raw_2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        else:
+            raise PermissionError("Invalid Data Type")
+        rows, cols = 2, 3
+        shape = TensorShapeStorage(rows, cols)
+        transposed_shape = TensorShapeStorage(cols, rows)
+        array_1 = numpy.asarray(raw_1).reshape(shape.to_tuple())
+        array_2 = numpy.asarray(raw_2).reshape(shape.to_tuple())
+        array_3 = array_2.transpose()
+        store_1 = TensorStorage(array_1, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        store_2 = TensorStorage(array_2, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        store_3 = TensorStorage(array_3, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        print("raw data:\n", array_1, "\n", array_2)
+
+        print("Part 1: test shape")
+
+        def __run_test_shape(dims):
+            shape = tuple(dims)
+            c_shape = te_p2c_shape(shape, None)
+            py_shape = te_c2p_shape(c_shape)
+            print("compare shapes:", shape, c_shape.to_tuple(), py_shape)
+            assert shape == c_shape.to_tuple()
+            assert shape == py_shape
+
+        __run_test_shape([])
+        __run_test_shape([1])
+        __run_test_shape([1, 2])
+
+        print("Part 2: test te_slice")
+        res_store, res_shape = te_slice(store_1, shape, 1, 2, 0, None, None, None)
+        assert (res_store.data == array_1[1:2]).all()
+        assert res_shape.to_tuple() == (1, cols)
+        res_store, res_shape = te_slice(store_1, shape, 0, 2, 1, None, None, None)
+        assert (res_store.data == array_1[:, 0:2]).all()
+        assert res_shape.to_tuple() == (rows, 2)
+
+        print("Part 3: test te_cat")
+        res_store, res_shape = te_cat([store_1, store_2], 0, None, None)
+        assert (res_store.data == numpy.vstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows * 2, cols)
+        res_store, res_shape = te_cat([store_1, store_2], 1, None, None)
+        assert (res_store.data == numpy.hstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows, cols * 2)
+
+        print("Part 4: test te_pow")
+        res_store, res_shape = te_pow(store_1, 9, shape, None, None, None)
+        assert (res_store.data == array_1 ** 9).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 5: test te_add")
+        res_store, res_shape = te_add(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 + array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 6: test te_mul")
+        res_store, res_shape = te_mul(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 * array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 7: test te_truediv")
+        res_store, res_shape = te_truediv(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 / array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 8: test te_floordiv")
+        res_store, res_shape = te_floordiv(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 // array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 9: test te_sub")
+        res_store, res_shape = te_sub(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == array_1 - array_2).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 10: test te_matmul")
+        res_store, res_shape = te_matmul(store_1, store_3, shape, transposed_shape, None, None, None)
+        print(res_store.data)
+        assert_ndarray_diff(res_store.data, array_1 @ array_2.transpose())
+        assert res_shape.to_tuple() == (rows, rows)
+
+        print("Part 11: test te_abs")
+        res_store, res_shape = te_abs(store_1, shape, None, None, None)
+        assert (res_store.data == abs(array_1)).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 12: test te_neg")
+        res_store, res_shape = te_neg(store_1, shape, None, None, None)
+        assert (res_store.data == -array_1).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 13: test te_transpose")
+        res_store, res_shape = te_transpose(store_1, shape, None, None, None)
+        assert (res_store.data == array_1.transpose()).all()
+        assert res_shape.to_tuple() == transposed_shape.to_tuple()
+
+        print("Part 14: test te_sum")
+        res_store, res_shape = te_sum(store_1, shape, None, None, None, None)
+        assert (res_store.data == array_1.sum()).all()
+        assert res_shape.to_tuple() == ()
+        res_store, res_shape = te_sum(store_1, shape, 0, None, None, None)
+        assert (res_store.data == array_1.sum(axis=0)).all()
+        assert res_shape.to_tuple() == (cols,)
+        res_store, res_shape = te_sum(store_1, shape, 1, None, None, None)
+        assert (res_store.data == array_1.sum(axis=1)).all()
+        assert res_shape.to_tuple() == (rows,)
+
+        print("Part 15: test te_reshape")
+        res_store, res_shape = te_reshape(store_1, shape, transposed_shape, None, None, None)
+        assert (res_store.data == array_1.reshape(transposed_shape.to_tuple())).all()
+        assert res_shape.to_tuple() == transposed_shape.to_tuple()
+
+        print("Part 16: test te_exp")
+        res_store, res_shape = te_exp(store_1, shape, None, None, None)
+        assert (res_store.data == numpy.exp(array_1)).all()
+        assert res_shape.to_tuple() == shape.to_tuple()
+
+        print("Part 17: test te_hstack")
+        res_store, res_shape = te_hstack(store_1, store_2, shape, shape, None, None, None)
+        assert (res_store.data == numpy.hstack((array_1, array_2))).all()
+        assert res_shape.to_tuple() == (rows, cols * 2)
+
+        print("Test passed")
+
+    def test_matmul_fix(self):
+        """Regression test for a previous overflow bug in matmul, using fixed float operands.
+
+        Runs the same add -> mul -> matmul chain as the random combination test, i.e.
+        ``((a1 + a3) * a4) @ a2``, and compares every intermediate result with NumPy.
+        """
+        print("\n\n", "*" * 100, "\n\nTest matmul_fix Begins")
+        print("This test is to test whether the previous overflow bug in matmul has been fixed")
+
+        # use specific operands
+        raw_1 = [-6.328172916615867, -2.8424299647675904, 5.161324580891171, -0.23598534366587853, 0.8092957262188305,
+                 19.50497470592641]
+        raw_2 = [-0.048743928478232584, 6.191889562038381, 2.7177577835259017, 17.09697900858307, 11.31935499510339,
+                 -4.881758293445916]
+        raw_3 = [14.051643909583548, 5.246105161671397, 6.764067053406746, 4.727717881071932, -6.361020843266641,
+                 -12.94175161066905]
+        raw_4 = [-0.003912522017777569, 14.519125724575714, -5.401608455748054, 13.918193685722846, 5.97460357170185,
+                 -3.960383753671568]
+
+        print('Raw data:\n', raw_1, '\n', raw_2, '\n', raw_3, '\n', raw_4)
+
+        # generate shapes and NumPy arrays; operand 2 is (cols, rows) so it can be the matmul right-hand side
+        rows, cols = 2, 3
+        array_1, array_2 = numpy.asarray(raw_1).reshape(rows, cols), numpy.asarray(raw_2).reshape(cols, rows)
+        array_3, array_4 = numpy.asarray(raw_3).reshape(rows, cols), numpy.asarray(raw_4).reshape(rows, cols)
+        shape_1, shape_2 = TensorShapeStorage(rows, cols), TensorShapeStorage(cols, rows)
+        shape_3, shape_4 = TensorShapeStorage(rows, cols), TensorShapeStorage(rows, cols)
+
+        # transfer and encode
+        te_store_1, te_store_2 = te_p2c(raw_1, None), te_p2c(raw_2, None)
+        te_store_3, te_store_4 = te_p2c(raw_3, None), te_p2c(raw_4, None)
+        encoded_1, encoded_2 = fp_encode(te_store_1, self.n, self.max_int), fp_encode(te_store_2, self.n, self.max_int)
+        encoded_3, encoded_4 = fp_encode(te_store_3, self.n, self.max_int), fp_encode(te_store_4, self.n, self.max_int)
+
+        # perform encrypt and obfs (only operands 1 and 3 are encrypted)
+        # NOTE(review): the seed count below is TEST_SIZE while the operands above have 6 hard-coded
+        # elements - presumably TEST_SIZE == 6; confirm before changing either.
+        encrypted_old_1 = pi_encrypt(self._fpga_pub_key, encoded_1, None, None)
+        encrypted_old_3 = pi_encrypt(self._fpga_pub_key, encoded_3, None, None)
+        rand_store_1 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        rand_store_3 = pi_gen_obf_seed(None, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, time.time(), None)
+        encrypted_1 = pi_obfuscate(self._fpga_pub_key, encrypted_old_1, rand_store_1, None, None)
+        encrypted_3 = pi_obfuscate(self._fpga_pub_key, encrypted_old_3, rand_store_3, None, None)
+
+        print("Perform Add")
+        add_res_store, add_res_shape = pi_add(self._fpga_pub_key, encrypted_1, encrypted_3, shape_1, shape_3, None,
+                                              None, None)
+        add_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, add_res_store, None, None)).reshape(
+            add_res_shape.to_tuple())
+        add_ref = array_1 + array_3
+        print("FPGA intermediate result:", add_res)
+        print("NumPy intermediate result:", add_ref)
+        assert_ndarray_diff(add_res, add_ref)
+
+        # the still-encrypted sum feeds the element-wise multiplication
+        print("Perform Mul")
+        mul_res_store, mul_res_shape = pi_mul(self._fpga_pub_key, add_res_store, encoded_4, add_res_shape, shape_4,
+                                              None, None, None)
+        mul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, mul_res_store, None, None)).reshape(
+            mul_res_shape.to_tuple())
+        mul_ref = (array_1 + array_3) * array_4
+        print("FPGA intermediate result:", mul_res)
+        print("NumPy intermediate result:", mul_ref)
+        assert_ndarray_diff(mul_res, mul_ref)
+
+        # Debugging aid (disabled): dump the PEN and FPN storages to stdout
+        # The following code is to dump PEN and FPN storages into stdout
+        # print("n (big endian bytes):", self._pub_key.n.to_bytes(CIPHER_BYTE, 'big').hex())
+        # print("nsquare (big endian bytes):", self._pub_key.nsquare.to_bytes(CIPHER_BYTE, 'big').hex())
+        # fp_list = fp_c2p(encoded_2)
+        # pi_cipher, pi_base, pi_exp = pi_c2p(mul_res_store)
+        # print("\n\n>>>>>>>>>>>>>> dumping pen storage\n")
+        # for i in range(TEST_SIZE):
+        #     print("=====================id:", i)
+        #     print("PEN cipher (big endian bytes):", pi_cipher[i].to_bytes(CIPHER_BYTE, 'big').hex())
+        #     print("PEN base (decimal):", pi_base[i])
+        #     print("PEN exponent (decimal):", pi_exp[i])
+        # print("\n\n>>>>>>>>>>>>>> dumping fpn storage\n")
+        # for i in range(TEST_SIZE):
+        #     print("=====================id:", i)
+        #     print("FPN encoding (big endian bytes):", fp_list[i].encoding.to_bytes(CIPHER_BYTE, 'big').hex())
+        #     print("FPN base (decimal):", fp_list[i].BASE)
+        #     print("FPN exponent (decimal):", fp_list[i].exponent)
+
+        # Workaround (disabled): re-encrypting the intermediate result before the matmul
+        # The following code is essentially to decrypt and encrypt again.
+        # However, the numbers might be truncated so that the overflow could be mitigated
+        # tmp_te_store = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, mul_res_store, None, None)
+        # mul_res_store = pi_encrypt(self._fpga_pub_key, fp_encode(tmp_te_store, self.n, self.max_int), None, None)
+        # mul_res_store = pi_obfuscate(self._fpga_pub_key, mul_res_store, rand_store_1, None, None)
+
+        # the matmul consumes the un-refreshed product directly; this is the step the test guards
+        print("Perform Matmul: PEN shape (2, 3), FPN shape (3, 2)")
+        matmul_res_store, matmul_res_shape = pi_matmul(self._fpga_pub_key, mul_res_store, encoded_2, mul_res_shape,
+                                                       shape_2, None, None, None)
+        matmul_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, matmul_res_store, None, None)).reshape(
+            matmul_res_shape.to_tuple())
+        matmul_ref = ((array_1 + array_3) * array_4) @ array_2
+        print("FPGA result shape:", matmul_res_shape.to_tuple(), ", CPU result shape:", matmul_ref.shape)
+        print("CPU result:\n", matmul_ref)
+        print("FPGA result:\n", matmul_res)
+        assert_ndarray_diff(matmul_res, matmul_ref)
+
+        print("test passed")
+
+    def test_te_vertical_slice(self):
+        print("\n\n", "*" * 100, "\n\nTest Tensor Vertical Slice Begins")
+        shape = (2, 3)
+        np_raw = numpy.asarray(generate_rand(TEST_SIZE)).reshape(shape)
+        print("raw data:\n", np_raw)
+        np_raw_store = TensorStorage(np_raw, TEST_SIZE, MEM_HOST, RAND_TYPE)
+        np_slice_store, np_slice_shape = te_slice(np_raw_store, TensorShapeStorage(*shape), 2, 3, 1, None, None, None)
+        print("numpy slice data:\n", np_slice_store.data)
+        c_slice_store = te_p2c(np_slice_store.data, None)
+        slice_recv = te_c2p(c_slice_store).reshape(np_slice_shape)
+        print("received slice data:\n", slice_recv)
+        assert_ndarray_diff(np_slice_store.data, slice_recv)
+        print("Test Passed")
+
+    def test_encode_precision_1(self):
+        print("\n\n", "*" * 100, "\n\nTesting encode with precision 1")
+        raw = [19.12634]
+        store = te_p2c(raw, None)
+        fp_store = fp_encode(store, self.n, self.max_int, 1)
+        recv = fp_decode(fp_store, None, None)
+        recv_scalar = te_c2p(recv)
+        print("result:", recv_scalar)
+        assert recv_scalar[0] == 19
+        print("Test passed")
+
+    def test_matmul_limits(self):
+        print("\n\n", "*" * 100, "\n\nTest after how many matmul would cause our internal data structure overflow")
+        shape_tuple = (TEST_SIZE // 2, TEST_SIZE // 2)
+        shape_store = TensorShapeStorage(*shape_tuple)
+        shape_size = functools.reduce(operator.mul, [*shape_tuple], 1)
+        raw_1, raw_2 = [random.gauss(0, 1) for _ in range(shape_size)], [random.gauss(0, 1) for _ in range(shape_size)]
+        left_array, right_array = numpy.asarray(raw_1).reshape(shape_tuple), numpy.asarray(raw_2).reshape(shape_tuple)
+
+        # FPGA encode & encrypt
+        left_store = pi_encrypt(self._fpga_pub_key, fp_encode(te_p2c(raw_1, None), self.n, self.max_int), None, None)
+        obf_seeds = pi_gen_obf_seed(None, self._fpga_pub_key, shape_size, CIPHER_BITS // 6, time.time(), None)
+        left_store = pi_obfuscate(self._fpga_pub_key, left_store, obf_seeds, left_store, None)
+        right_store = fp_encode(te_p2c(raw_2, None), self.n, self.max_int)
+
+        for i in range(1, 100):
+            # Dumping useful data
+            print("\n>>>>>>>>>>>>>>> iteration:", i)
+            _, base, exp = pi_c2p(left_store)
+            fp_py_store = fp_c2p(right_store)
+            all_exponents = [*exp, *[v.exponent for v in fp_py_store]]
+            max_exponent = max(*all_exponents)
+            if i == 1:
+                initial_max_exp = max_exponent
+            print("all exponents:", all_exponents)
+            print("max base:", max(*base, *[v.BASE for v in fp_py_store]), ", max exponent:", max_exponent)
+
+            # Running Numpy and FPGA matmul, storing the result to the left operand
+            left_array = left_array @ right_array
+            left_store, tmp_shape = pi_matmul(self._fpga_pub_key, left_store, right_store, shape_store, shape_store,
+                                              left_store, None, None)
+
+            # Get matmul result of the current iteration and compare
+            tmp_res = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, left_store, None, None)).reshape(
+                tmp_shape.to_tuple())
+            print("FPGA result:\n", tmp_res, "\nCPU result:\n", left_array)
+            try:
+                assert_ndarray_diff(tmp_res, left_array)
+            except AssertionError:
+                final_exponents = [*pi_c2p(left_store)[2], *[v.exponent for v in fp_c2p(right_store)]]
+                final_max_exp = max(*final_exponents)
+                print("final exponents:", final_exponents)
+                print("initial max exponent:", initial_max_exp, ", final max exponent", final_max_exp)
+                print(">>>>> FPGA and CPU results didn't match at iteration #{}.".format(i))
+                # The following assertions are deprecated as we treat max_exp for FPN and PEN separately
+                # assert 256 <= final_max_exp < 512
+                # assert initial_max_exp * int(round(2 ** i)) == final_max_exp
+                # assert i == math.ceil(8 - math.log2(initial_max_exp))
+                break
+
+        print("Test passed")
+
+    def test_fp_cat(self):
+        print("\n\n", "*" * 100, "\n\nTest fp_cat begins")
+        shape_tuple = (2, 3)
+        shape = TensorShapeStorage(*shape_tuple)
+        shape_size = int(round(numpy.prod(shape_tuple)))
+        tmp_1, tmp_2 = generate_rand(shape_size), generate_rand(shape_size)
+        array_1, array_2 = numpy.asarray(tmp_1).reshape(shape_tuple), numpy.asarray(tmp_2).reshape(shape_tuple)
+        print("array_1:\n", array_1, "\narray_2:\n", array_2)
+
+        fp_store_1 = fp_encode(te_p2c(array_1, None), self.n, self.max_int)
+        fp_store_2 = fp_encode(te_p2c(array_2, None), self.n, self.max_int)
+
+        # test vertical cat
+        print("Part 1: test vertical cat")
+        cat_store, cat_shape = fp_cat([fp_store_1, fp_store_2], [shape, shape], 0, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0] * 2, shape_tuple[1])
+        decoded = fp_decode(cat_store, None, None)
+        res = te_c2p(decoded).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2), 0)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        # test horizontal cat
+        print("Part 2: test horizontal cat")
+        cat_store, cat_shape = fp_cat([fp_store_1, fp_store_2], [shape, shape], 1, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0], shape_tuple[1] * 2)
+        decoded = fp_decode(cat_store, None, None)
+        res = te_c2p(decoded).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2), 1)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        print("test passed")
+
+    def test_pi_cat(self):
+        print("\n\n", "*" * 100, "\n\nTest pi_cat begins")
+        shape_tuple = (2, 3)
+        shape = TensorShapeStorage(*shape_tuple)
+        shape_size = int(round(numpy.prod(shape_tuple)))
+        array_1 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        array_2 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        array_3 = numpy.asarray(generate_rand(shape_size)).reshape(shape_tuple)
+        print("array_1:\n", array_1, "\narray_2:\n", array_2, "\narray_3:\n", array_3)
+
+        fp_store_1 = fp_encode(te_p2c(array_1, None), self.n, self.max_int)
+        fp_store_2 = fp_encode(te_p2c(array_2, None), self.n, self.max_int)
+        fp_store_3 = fp_encode(te_p2c(array_3, None), self.n, self.max_int)
+        pi_store_1 = pi_encrypt(self._fpga_pub_key, fp_store_1, None, None)
+        pi_store_2 = pi_encrypt(self._fpga_pub_key, fp_store_2, None, None)
+        pi_store_3 = pi_encrypt(self._fpga_pub_key, fp_store_3, None, None)
+
+        # test horizontal cat
+        print("Part 1: test horizontal cat")
+        cat_store, cat_shape = pi_cat([pi_store_1, pi_store_2, pi_store_3], [shape, shape, shape], 1, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0], shape_tuple[1] * 3)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2, array_3), 1)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        # test vertical cat
+        print("Part 2: test vertical cat")
+        cat_store, cat_shape = pi_cat([pi_store_1, pi_store_2, pi_store_3], [shape, shape, shape], 0, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0] * 3, shape_tuple[1])
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate((array_1, array_2, array_3), 0)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        # test concat combination
+        print("Part 3: test combined cat")
+        vector_size = cat_shape.to_tuple()[0]
+        vector_shape = TensorShapeStorage(vector_size, 1)
+        vector = numpy.asarray(generate_rand(vector_size)).reshape(vector_size, 1)
+        print("vector:\n", vector)
+        vector_te_store = te_p2c(vector, None)
+        vector_fp_store = fp_encode(vector_te_store, self.n, self.max_int)
+        vector_pi_store = pi_encrypt(self._fpga_pub_key, vector_fp_store, None, None)
+        cat_store, cat_shape = pi_cat([cat_store, vector_pi_store], [cat_shape, vector_shape], 1, None, None)
+        print("result shape:", cat_shape.to_tuple())
+        assert cat_shape.to_tuple() == (shape_tuple[0] * 3, shape_tuple[1] + 1)
+        decrypted = pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, cat_store, None, None)
+        res = te_c2p(decrypted).reshape(cat_shape.to_tuple())
+        ref = numpy.concatenate([numpy.concatenate((array_1, array_2, array_3), 0), vector], 1)
+        print("result tensor:\n", res)
+        print("reference tensor:\n", ref)
+        assert_ndarray_diff(res, ref)
+
+        print("test passed")
+
+
+# Run every test case in this module when it is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py
new file mode 100755
index 0000000000..88013399e0
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/paillier_fpga/tests/test_fpga_performance.py
@@ -0,0 +1,324 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import numpy
+import unittest
+import random
+import functools
+import time
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+    gmpy_math,
+)
+
+from ..fpga_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free, initialize_device, pi_matmul,
+)
+
+# Element type produced by generate_rand(); switch between INT64_TYPE and FLOAT_TYPE here.
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+# Dimensions of the test matrices; the matmul step uses (NUM_ROWS, NUM_COLS) @ (NUM_COLS, NUM_ROWS).
+NUM_ROWS = 666
+NUM_COLS = 666
+# Number of elements in each flat test vector.
+TEST_SIZE = NUM_ROWS * NUM_COLS
+# Maximum relative difference tolerated by assert_diff().
+ERROR_TOLERANCE = 1e-10
+
+
+def generate_rand(test_size):
+    if RAND_TYPE == FLOAT_TYPE:
+        return numpy.random.normal(0, 5, test_size)
+    elif RAND_TYPE == INT64_TYPE:
+        return numpy.random.randint(-2 ** 10, 2 ** 10, test_size)
+    else:
+        raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    if res == 0 or ref == 0:
+        assert res == 0
+        assert ref == 0
+    else:
+        diff = res - ref
+        assert abs(diff / res) < ERROR_TOLERANCE
+        assert abs(diff / ref) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    assert res.shape == ref.shape
+    res, ref = res.flatten(), ref.flatten()
+    assert res.shape == ref.shape
+    for i in range(res.size):
+        try:
+            assert_diff(res[i], ref[i])
+        except AssertionError:
+            print("Assertion Error at location", i, ", FPGA result:",
+                  res[i], ", reference result:", ref[i])
+
+
+def profile(func):
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        res = func(*args, **kwargs)
+        end_time = time.time()
+        return res, end_time - start_time
+
+    return wrapper
+
+
+def compare_time(fpga_time, cpu_time, num_instances=TEST_SIZE):
+    print("FPGA time:", fpga_time, "second(s)")
+    print("CPU time:", cpu_time, "second(s)")
+    print("FPGA throughput:", num_instances / fpga_time, "instance(s) per second")
+    print("CPU throughput:", num_instances / cpu_time, "instance(s) per second")
+    print("Speedup:", cpu_time / fpga_time)
+
+
+def cpu_pi_gen_obf_seed(res_store, public_key, count, elem_size, rand_seed, stream):
+    random.seed(rand_seed)
+    rand_vals = [random.randrange(1, 8 ** elem_size) for _ in range(count)]
+    return [gmpy_math.powmod(v, public_key.n, public_key.nsquare) for v in rand_vals]
+
+
+def cpu_pi_obfuscate(public_key, encrypted_numbers, obf_seeds, exponents, res_store, stream):
+    return [PaillierEncryptedNumber(public_key, (encrypted_numbers[i] * obf_seeds[i]) % public_key.nsquare,
+                                    exponents[i]) for i in range(len(encrypted_numbers))]
+
+
+def cpu_fp_mul(left, right):
+    return [FixedPointNumber((left[i].encoding * right[i].encoding) % left[i].n,
+                             left[i].exponent + right[i].exponent, left[i].n, left[i].max_int) for i in
+            range(len(left))]
+
+
+class TestOperators(unittest.TestCase):
+    """Profile each FPGA engine operator against its pure-Python counterpart.
+
+    Every step runs the FPGA function and a CPU reference through profile(),
+    prints the timings with compare_time(), and, where a check exists,
+    compares the results with assert_diff / assert_ndarray_diff.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        """Initialise the device and prepare one key pair in all three forms.
+
+        The Python key pair is converted with pi_p2c_* and then pi_h2d_*;
+        the results are stored as the `_cpu_*` and `_fpga_*` class attributes.
+        """
+        initialize_device()
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int
+        cls._cpu_pub_key = pi_p2c_pub_key(None, cls._pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(None, cls._priv_key)
+        cls._fpga_pub_key = pi_h2d_pub_key(None, cls._cpu_pub_key)
+        cls._fpga_priv_key = pi_h2d_priv_key(None, cls._cpu_priv_key)
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # test performance
+    def test_performance(self):
+        """Run the full operator pipeline once, timing FPGA vs. CPU at each step.
+
+        Steps, in order: encode, decode, encrypt, obfuscation-seed generation,
+        obfuscate, decrypt, fp_mul, pi_add, pi_mul, pi_matmul and pi_sum.
+        Later steps consume the outputs of earlier ones, and several FPGA
+        result buffers are reused, so the statement order matters.
+        """
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, shape_store_T = TensorShapeStorage(*shape_tuple), TensorShapeStorage(*shape_tuple_T)
+        # Pre-allocate one result buffer of each storage kind per operand.
+        fpga_bi_store, fpga_bi_store2 = bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST), bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        fpga_te_store, fpga_te_store2 = te_alloc(None, TEST_SIZE, MEM_HOST), te_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_fp_store, fpga_fp_store2 = fp_alloc(None, TEST_SIZE, MEM_HOST), fp_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_pi_store, fpga_pi_store2 = pi_alloc(None, TEST_SIZE, MEM_HOST), pi_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_te_store, fpga_te_store2 = te_p2c(raw, fpga_te_store), te_p2c(raw2, fpga_te_store2)
+
+        print("\n>>>>> fp_encode profiling begins")
+        fpga_encoded, fpga_encode_time = profile(fp_encode)(fpga_te_store, self.n, self.max_int, res=fpga_fp_store)
+        cpu_encoded, cpu_encode_time = profile(
+            lambda l: [
+                FixedPointNumber.encode(
+                    v, self.n, self.max_int) for v in l])(raw)
+        compare_time(fpga_encode_time, cpu_encode_time)
+
+        print("\n>>>>> fp_decode profiling begins")
+        # fpga_te_store is passed as the result buffer for the decoded values.
+        fpga_decoded, fpga_decode_time = profile(fp_decode)(fpga_encoded, fpga_te_store, None)
+        cpu_decoded, cpu_decode_time = profile(lambda l: [v.decode() for v in l])(cpu_encoded)
+        compare_time(fpga_decode_time, cpu_decode_time)
+
+        # check decoded results
+        assert_ndarray_diff(te_c2p(fpga_decoded), numpy.asarray(cpu_decoded))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        fpga_encrypted, fpga_encrypt_time = profile(pi_encrypt)(self._fpga_pub_key, fpga_encoded, fpga_pi_store, None)
+        cpu_encrypted, cpu_encrypt_time = profile(
+            lambda l: [
+                self._pub_key.raw_encrypt(
+                    v.encoding, 1) for v in l])(cpu_encoded)
+        compare_time(fpga_encrypt_time, cpu_encrypt_time)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        # Both sides are given rand seed 0; the obfuscated ciphertexts are compared below.
+        fpga_obf_seeds, fpga_gen_obf_seeds_time = profile(pi_gen_obf_seed)(
+            fpga_bi_store, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        cpu_obf_seeds, cpu_gen_obf_seefs_time = profile(cpu_pi_gen_obf_seed)(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        compare_time(fpga_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print("\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively")
+        # fpga_pi_store (the encrypt result buffer) is reused as the output buffer here.
+        fpga_obfuscated, fpga_obfuscate_time = profile(pi_obfuscate)(
+            self._fpga_pub_key, fpga_encrypted, fpga_obf_seeds, fpga_pi_store, None)
+        cpu_obfuscated, cpu_obfuscate_time = profile(cpu_pi_obfuscate)(
+            self._pub_key, cpu_encrypted, cpu_obf_seeds, [
+                v.exponent for v in cpu_encoded], None, None)
+        compare_time(fpga_obfuscate_time, cpu_obfuscate_time)
+
+        # check intermediate result
+        assert_ndarray_diff(numpy.asarray(pi_c2p(fpga_obfuscated)[0]), numpy.asarray(
+            [v.ciphertext(False) for v in cpu_obfuscated]))
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print("This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n")
+        print("fp_decode is by default included in pi_decrypt")
+        # Extra fixed-point buffer handed to pi_decrypt as its last argument.
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        fpga_decrypted, fpga_decrypt_time = profile(pi_decrypt)(
+            self._fpga_pub_key, self._fpga_priv_key, fpga_obfuscated, fpga_te_store, fps_buffer)
+        cpu_decrypted, cpu_decrypt_time = profile(lambda l: [self._priv_key.decrypt(v) for v in l])(cpu_obfuscated)
+        compare_time(fpga_decrypt_time, cpu_decrypt_time)
+
+        # check decrypted results
+        assert_ndarray_diff(te_c2p(fpga_decrypted), numpy.asarray(cpu_decrypted))
+
+        print("\n>>>>> generating the other array")
+        # encode the other array
+        fpga_encoded2 = fp_encode(fpga_te_store2, self.n, self.max_int, res=fpga_fp_store2)
+        cpu_encoded2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw2]
+        # encrypt the other array
+        fpga_encrypted2 = pi_encrypt(self._fpga_pub_key, fpga_encoded2, fpga_pi_store2, None)
+        cpu_encrypted2 = [self._pub_key.raw_encrypt(v.encoding, 1) for v in cpu_encoded2]
+        # generate obfuscation seeds (obfuscators) for the other array using a different random seed
+        fpga_obf_seeds2 = pi_gen_obf_seed(fpga_bi_store2, self._fpga_pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        cpu_obf_seeds2 = cpu_pi_gen_obf_seed(None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        # obfuscate the other array
+        fpga_obfuscated2 = pi_obfuscate(self._fpga_pub_key, fpga_encrypted2, fpga_obf_seeds2, fpga_pi_store2, None)
+        cpu_obfuscated2 = cpu_pi_obfuscate(
+            self._pub_key, cpu_encrypted2, cpu_obf_seeds2, [
+                v.exponent for v in cpu_encoded2], None, None)
+        # check intermediate result
+        assert_ndarray_diff(numpy.asarray(pi_c2p(fpga_obfuscated2)[0]), numpy.asarray(
+            [v.ciphertext(False) for v in cpu_obfuscated2]))
+
+        print("\n>>>>> fp_mul profiling begins")
+        fpga_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (fpga_fp_mul_res, _), fpga_fp_mul_time = profile(fp_mul)(fpga_encoded,
+                                                                 fpga_encoded2, shape_store, shape_store,
+                                                                 fpga_fp_mul_store, shape_store, None)
+        cpu_fp_mul_res, cpu_fp_mul_time = profile(cpu_fp_mul)(cpu_encoded, cpu_encoded2)
+        compare_time(fpga_fp_mul_time, cpu_fp_mul_time)
+
+        # Compare results
+        # These checks use assert_diff and bare asserts directly, so a mismatch here raises.
+        received_fp_mul_res = fp_c2p(fpga_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(received_fp_mul_res[i].encoding, cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        # The sum is written into fpga_pi_store; no direct result check is made for this step.
+        (fpga_add_res, _), fpga_add_time = profile(pi_add)(self._fpga_pub_key, fpga_obfuscated,
+                                                           fpga_obfuscated2, shape_store, shape_store, fpga_pi_store,
+                                                           shape_store, None)
+        cpu_add_res, cpu_add_time = profile(lambda a, b: [a[i] + b[i]
+                                                          for i in range(TEST_SIZE)])(cpu_obfuscated, cpu_obfuscated2)
+        compare_time(fpga_add_time, cpu_add_time)
+
+        print("\n>>>>> pi_mul profiling begins")
+        # Multiplies the encrypted sum by the plaintext-encoded second array.
+        (fpga_mul_res, _), fpga_mul_time = profile(pi_mul)(self._fpga_pub_key, fpga_add_res,
+                                                           fpga_encoded2, shape_store, shape_store, fpga_pi_store,
+                                                           shape_store, None)
+        cpu_mul_res, cpu_mul_time = profile(lambda a, b: [a[i] * b[i]
+                                                          for i in range(TEST_SIZE)])(cpu_add_res, cpu_encoded2)
+        compare_time(fpga_mul_time, cpu_mul_time)
+
+        print("\n>>>>> pi_matmul profiling begins")
+        print("sizes are", shape_tuple, "and", shape_tuple_T)
+        # The (NUM_ROWS, NUM_COLS) @ (NUM_COLS, NUM_ROWS) product has NUM_ROWS * NUM_ROWS elements.
+        fpga_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        (fpga_matmul_res, fpga_matmul_shape), fpga_matmul_time = profile(pi_matmul)(self._fpga_pub_key,
+                                                                                    fpga_mul_res, fpga_encoded2,
+                                                                                    shape_store, shape_store_T,
+                                                                                    fpga_pi_matmul_store, None, None)
+        cpu_matmul_res, cpu_matmul_time = profile(
+            lambda a, b: a @ b)(numpy.asarray(cpu_mul_res).reshape(shape_tuple),
+                                numpy.asarray(cpu_encoded2).reshape(shape_tuple_T))
+        compare_time(fpga_matmul_time, cpu_matmul_time, NUM_ROWS * TEST_SIZE)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", fpga_matmul_shape.to_tuple())
+        # One buffer sized for the longest axis is reused for all three reductions.
+        fpga_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (fpga_sum_res, _), fpga_sum_time = profile(pi_sum)(self._fpga_pub_key,
+                                                               fpga_matmul_res, fpga_matmul_shape, axis,
+                                                               fpga_pi_sum_store, None, None)
+            cpu_sum_res, cpu_sum_time = profile(lambda a: numpy.sum(a, axis))(cpu_matmul_res)
+            compare_time(fpga_sum_time, cpu_sum_time)
+
+            # check result
+            fpga_decrypted = te_c2p(pi_decrypt(self._fpga_pub_key, self._fpga_priv_key, fpga_sum_res, None, None))
+            # With axis=None the CPU sum is a single encrypted number, so wrap it in a list.
+            cpu_decrypted = numpy.asarray([self._priv_key.decrypt(v) for v in cpu_sum_res.flat]
+                                          if axis is not None else [self._priv_key.decrypt(cpu_sum_res)])
+            assert_ndarray_diff(fpga_decrypted, cpu_decrypted)
+
+        print("\n>>>>> free all allocated spaces")
+        # NOTE(review): these frees are skipped if any step above raises.
+        bi_free(fpga_bi_store)
+        bi_free(fpga_bi_store2)
+        te_free(fpga_te_store)
+        te_free(fpga_te_store2)
+        fp_free(fpga_fp_store)
+        fp_free(fpga_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(fpga_fp_mul_store)
+        pi_free(fpga_pi_store)
+        pi_free(fpga_pi_store2)
+        pi_free(fpga_pi_matmul_store)
+        pi_free(fpga_pi_sum_store)
+
+        print("test passed")
+
+
+# Run the performance test when this module is executed directly.
+if __name__ == "__main__":
+    unittest.main()
diff --git a/gpu/tensor/paillier_fpga/pyproject.toml b/gpu/tensor/paillier_fpga/pyproject.toml
new file mode 100644
index 0000000000..6db3fdecf9
--- /dev/null
+++ b/gpu/tensor/paillier_fpga/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "paillier-fpga"
+version = "0.1.0"
+description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solution using FPGA that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
+authors = ["Xiaolong.Gao <1506957902@qq.com>"]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+numpy = "~1.18.4"
+gmpy2 = "^2.0.8"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/__init__.py b/gpu/tensor/paillier_gpu/paillier_gpu/__init__.py
new file mode 100644
index 0000000000..5d9d7241b5
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/__init__.py
@@ -0,0 +1,21 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+
+from .gpu_tensor import keygen, SK, PK, Cipherblock
+
+__version__ = '0.1.0'
+__all__ = ['keygen', "SK", "PK", "Cipherblock"]
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py
new file mode 100644
index 0000000000..8f492aea9b
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_engine.py
@@ -0,0 +1,4956 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import os
+import random
+import numpy as np
+
+from ctypes import cdll, sizeof, c_buffer, cast, c_int32
+from ctypes import (
+    c_char,
+    c_char_p,
+    c_void_p,
+    c_uint32,
+    c_double,
+    c_int64,
+    c_int,
+    c_size_t,
+)
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierEncryptedNumber,
+    FixedPointNumber
+)
+
+from concurrent.futures import ProcessPoolExecutor as Executor
+
+# Memory-type flags (interpreted by _MEM_ON_HOST below).
+MEM_HOST = 1
+MEM_DEVICE = 2
+
+# Device type selector.
+# TODO: make these parameters actually take effect
+# device_type = 00: CPU
+# device_type = 10: GPU
+# device_type = 20: FPGA_num_0
+# device_type = 21: FPGA_num_1
+# NOTE(review): the value 1 below matches none of the codes listed above - confirm intent.
+device_type = 1
+
+# aliases defined by WeBank
+PaillierPublicKeyStorage = PaillierPublicKey
+PaillierPrivateKeyStorage = PaillierPrivateKey
+
+'''##############import ctypes to implement py2c and c2py#################'''
+'''############## load the .so library written in C     ##################'''
+
+# Three libraries were built, one per CIPHER_BITS length.
+# The shared library is loaded from the directory that contains this module;
+# this happens at import time, so importing fails if GPU_2048.so is missing.
+GPU_LIB = cdll.LoadLibrary(os.path.dirname(__file__) + "/GPU_2048.so")
+# GPU_LIB = cdll.LoadLibrary("../../../Documents/GPU_2048.so")
+
+# Bit widths must match the library chosen above.
+CIPHER_BITS = 2048
+PLAIN_BITS = 2048
+BYTE_LEN = 8
+# Size in bytes of one ciphertext / plaintext big integer.
+CIPHER_BYTE = CIPHER_BITS // BYTE_LEN
+PLAIN_BYTE = PLAIN_BITS // BYTE_LEN
+
+# Byte lengths of the C data types used below.
+CHAR_BYTE = sizeof(c_char)
+U_INT32_BYTE = sizeof(c_uint32)
+DOUBLE_BYTE = sizeof(c_double)
+INT64_BYTE = sizeof(c_int64)
+
+# Declare pointer-returning functions as c_void_p; ctypes otherwise assumes
+# a C int return value.
+GPU_LIB.c_malloc.restype = c_void_p
+GPU_LIB.c_direct_malloc.restype = c_void_p
+GPU_LIB.cuda_malloc.restype = c_void_p
+
+GPU_LIB.init_pub_key.restype = c_void_p
+GPU_LIB.init_priv_key.restype = c_void_p
+
+GPU_LIB.get_cur_device.restype = c_int
+GPU_LIB.get_cur_context.restype = c_void_p
+GPU_LIB.create_cuda_context.restype = c_void_p
+
+# The two data-type flags used in TensorStorage.
+INT64_TYPE = 1  # datatype flag for int32 and int64
+FLOAT_TYPE = 2  # datatype flag for float and double
+
+# BASE for Paillier encrypted numbers.
+PEN_BASE = 16
+# The Python PaillierEncryptedNumber defines no BASE, but the CUDA side
+# needs one, so PEN_BASE is fixed at 16 here.
+
+''' Device Initializer '''
+
+
+def initialize_device():
+    '''Call GPU_LIB.gpu_init() and then GPU_LIB.print_example_banner().'''
+    GPU_LIB.gpu_init()
+    GPU_LIB.print_example_banner()
+
+
+def _MEM_ON_HOST(mem_type):
+    if mem_type == MEM_HOST:
+        return True
+    elif mem_type == MEM_DEVICE:
+        return False
+
+
+# ############################################################################
+'''
+    functions for getting nvidia GPU info
+    And set the cuda context for the new data
+'''
+
+
+def GPU_driver_init():
+    '''Call GPU_LIB.cuda_driver_init(); its return value is discarded.'''
+    GPU_LIB.cuda_driver_init()
+
+
+def set_GPU_device(device_num):
+    '''Pass device_num (as a C int) to GPU_LIB.set_gpu_device.'''
+    GPU_LIB.set_gpu_device(c_int(device_num))
+
+
+def get_GPU_device():
+    '''Return the integer reported by GPU_LIB.get_cur_device().'''
+    return GPU_LIB.get_cur_device()
+
+
+def get_GPU_context():
+    '''Return the pointer value reported by GPU_LIB.get_cur_context().'''
+    return GPU_LIB.get_cur_context()
+
+
+def create_GPU_context(device_num):
+    '''
+    Call GPU_LIB.create_cuda_context with device_num (as a C int)
+    and return the pointer value it gives back.
+    '''
+    context_pointer = GPU_LIB.create_cuda_context(c_int(device_num))
+    return context_pointer
+
+
+def bind_GPU_context(context_ptr):
+    '''Pass context_ptr (as a void pointer) to GPU_LIB.bind_cuda_context.'''
+    GPU_LIB.bind_cuda_context(c_void_p(context_ptr))
+
+
+def free_GPU_context(context_pointer):
+    '''Pass context_pointer (as a void pointer) to GPU_LIB.free_cuda_context.'''
+    GPU_LIB.free_cuda_context(c_void_p(context_pointer))
+
+
+# ############################################################################
+# ######################Useful independent functions##########################
+# ##################Reconstruct ndarray from C memory type####################
+# ############################################################################
+
+
+def __get_C_fpn(fpn_space, size):
+    '''
+    copy FixedPointNumber (FPN) object out from C memory space,
+    reform a ndarray, return it to upper python level
+    --------------------
+    Para:
+    res_fpn_space: int, indicating the start address of a c_memory space
+    size: int, the number of FPN in the C memory space
+    Return:
+    A ndarray, each element is a bigint
+    '''
+    res_fpn = []
+    get_res = c_buffer(PLAIN_BYTE)
+    for i in range(size):
+        GPU_LIB.c_memcpy(
+            cast(get_res, c_void_p),
+            c_void_p(fpn_space + i * PLAIN_BYTE),
+            c_size_t(PLAIN_BYTE),
+        )
+        res_fpn.append(int.from_bytes(get_res.raw, 'little'))
+    return np.asarray(res_fpn)
+
+
+def __get_C_pen(pen_space, index, size):
+    '''
+    copy PaillierEncryptedNumber(PEN) object out from C memory space,
+    reform a ndarray, return it to upper python level
+    ------------------
+    Para:
+    pen:   int, indicating the start address of a continuous C memory space
+    index: int, the offset from start address that we start to get PEN
+    size:  int, the number of PEN ought to get
+    Return:
+    A ndarray, each element is a bigint
+    '''
+    res_pen = []
+    get_res = c_buffer(CIPHER_BYTE)
+    for i in range(size):
+        GPU_LIB.c_memcpy(
+            cast(get_res, c_void_p),
+            c_void_p(pen_space + (index + i) * CIPHER_BYTE),
+            c_size_t(CIPHER_BYTE),
+        )
+        res_pen.append(int.from_bytes(get_res.raw, 'little'))
+    return np.asarray(res_pen)
+
+
+bi_c2p = __get_C_pen
+
+
+def __get_C_uint32(uint32_space, size):
+    '''
+    copy uint32 out from C memory space, form a ndarraay
+    since numpy has a very good support for basic C numeric objects,
+    A single memcpy will be sufficient
+    ------------------------
+    Para:
+    res_uint32_space: int, indicating the start address of a continuous C memory space
+    size: int, the number of uint32 ought to get
+    '''
+    uint32_list = (c_uint32 * size)(*[0 for _ in range(size)])
+    GPU_LIB.c_memcpy(
+        uint32_list,
+        c_void_p(uint32_space),
+        c_size_t(size * U_INT32_BYTE))
+    return np.asarray(uint32_list)
+
+
+def __get_C_double(double_space, size):
+    '''copy double out from C memory space, form a ndarray'''
+    double_list = (c_double * size)(*[0 for _ in range(size)])
+    GPU_LIB.c_memcpy(
+        double_list,
+        c_void_p(double_space),
+        c_size_t(size * DOUBLE_BYTE))
+    # convert all the data in one step, no loop
+    return np.asarray(double_list)
+
+
+def __get_C_int64(int64_space, size):
+    '''copy int64 out from C memory space, form a ndarray'''
+    int64_list = (c_int64 * size)(*[0 for _ in range(size)])
+    GPU_LIB.c_memcpy(
+        int64_list,
+        c_void_p(int64_space),
+        c_size_t(size * INT64_BYTE))
+    # convert all the data in one step, no loop
+    return np.asarray(int64_list)
+
+
+def __get_c_fpn_storage(fpn, base, exp, vec_size, n, max_int):
+    '''
+    Build a ndarray of FixedPointNumber from the given C memory spaces
+    -------------------
+    Para:
+    fpn:  int, start address of the C memory holding the FPN encodings
+    base: int, start address of the C memory holding the FPN bases;
+               accepted for signature symmetry but not read here
+    exp:  int, start address of the C memory holding the FPN exponents (uint32)
+    vec_size: int, the number of elements to read
+    n, max_int: int, handed unchanged to every FixedPointNumber
+
+    Return:
+    A ndarray, each element is a FixedPointNumber
+    '''
+    encodings = __get_C_fpn(fpn, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+    # the exponent is a numpy scalar; turn it into a plain Python int
+    return np.asarray([
+        FixedPointNumber(encodings[idx], int(round(exponents[idx])), n, max_int)
+        for idx in range(vec_size)
+    ])
+
+
+def __get_c_pen_storage_raw(pen, base, exp, vec_size, n):
+    '''
+    Read the raw cipher, base and exponent arrays out of C memory
+    -------------------
+    Para:
+    pen:  int, start address of the C memory holding the ciphertexts
+    base: int, start address of the C memory holding the bases (uint32)
+    exp:  int, start address of the C memory holding the exponents (uint32)
+    vec_size: int, the number of elements to read
+    n:    not used by this function
+
+    Return:
+    tuple, (ciphers, bases, exponents), no PaillierEncryptedNumber is built
+    '''
+    return (
+        __get_C_pen(pen, 0, vec_size),
+        __get_C_uint32(base, vec_size),
+        __get_C_uint32(exp, vec_size),
+    )
+
+
+def __get_c_pen_storage_mp(pen, base, exp, vec_size, n, thread_num=4):
+    '''
+    Use multi-process to accelerate __get_C_pen process.
+
+    Since on Linux, python use fork to create sub-process,
+    thus the C memory space is shared between father and child processes.
+    And the whole process concerns no CUDA and cuda-context,
+    even the return result is in python object form.
+    So we can use multi-process for acceleration here safely
+    ---------------------------------
+    Para:
+        pen:  int, start address of the C memory holding the ciphertexts
+        base: int, start address of the C memory holding the bases (uint32)
+        exp:  int, start address of the C memory holding the exponents (uint32)
+        vec_size: int, the number of elements to read
+        n:    not used by this function
+        thread_num: number of processes used in multi-processing, must be >= 1
+    Return:
+        tuple, (ndarray, ndarray, ndarray)
+    Raise:
+        ValueError, if thread_num is smaller than 1
+    '''
+    if thread_num < 1:
+        raise ValueError("thread_num must be a positive integer")
+    # Split [0, vec_size) into at most thread_num consecutive chunks.
+    # divmod keeps every count non-negative and every start index inside the
+    # vector; round(vec_size / thread_num) could over-allocate
+    # (e.g. vec_size=5, thread_num=8 left -2 elements for the last job).
+    base_cnt, extra = divmod(vec_size, thread_num)
+    # for __get_C_pen, use multiprocess to accelerate
+    # NOTE(review): the executor is never shut down here; if Executor is a
+    # concurrent.futures pool, consider using it as a context manager.
+    executor = Executor()
+    futures = []
+    job_idx = 0
+    for i in range(thread_num):
+        job_cnt = base_cnt + (1 if i < extra else 0)
+        if job_cnt == 0:
+            # only happens when vec_size < thread_num; nothing left to read
+            break
+        futures.append(executor.submit(__get_C_pen, pen, job_idx, job_cnt))
+        job_idx += job_cnt
+    # results are gathered in submission order, so element order is preserved
+    res_pen = []
+    for future in futures:
+        res_pen.extend(future.result())
+    # for uint32, no special demand for multiprocess
+    res_base = __get_C_uint32(base, vec_size)
+    res_exp = __get_C_uint32(exp, vec_size)
+    return np.asarray(res_pen), res_base, res_exp
+
+
+def __get_c_pen_storage(pen, base, exp, vec_size, n):
+    '''
+    Build a ndarray of PaillierEncryptedNumber from the given C memory spaces
+    ------------------
+    Para:
+    pen:  int, start address of the C memory holding the PEN ciphertexts
+    base: int, start address of the C memory holding the PEN bases;
+               accepted for signature symmetry but not read here
+    exp:  int, start address of the C memory holding the PEN exponents (uint32)
+    vec_size: int, the number of elements to read
+    n:    int, used to construct the PaillierPublicKey shared by all elements
+
+    Return:
+    A ndarray, each element is a PaillierEncryptedNumber (PEN)
+    '''
+    ciphers = __get_C_pen(pen, 0, vec_size)
+    exponents = __get_C_uint32(exp, vec_size)
+    # a single public key object is shared by every element
+    public_key = PaillierPublicKey(n)
+    return np.asarray([
+        PaillierEncryptedNumber(
+            public_key, ciphers[idx], int(round(exponents[idx])))
+        for idx in range(vec_size)
+    ])
+
+
+#######################################################################
+# #########################DEFINITION OF CLASSES#######################
+#######################################################################
+'''#############  the definition of functions and classes #################'''
+
+'''
+    TensorStorage.data contains the address pointing to a double type.
+    All int32/int64 values have been transformed to the int64_t type.
+    All float32/float64 values have been transformed to the double type.
+    We assume that TensorStorage comes in 2 forms:
+    1. data is a ndarray; calculation can be performed directly on the ndarray.
+    2. data is a C memory pointer, used for performing further encoding for
+       the lower bound
+'''
+
+
+class TensorStorage(object):
+    '''
+    Container for plaintexts.
+    Currently supported (per the conversion done in te_p2c):
+    1. int32, int64 (all transformed to int64_t type)
+    2. float32, float64 (all transformed to double type)
+
+    Attributes:
+        data: ndarray or int,
+            1. ndarray means data is a python object
+            2. int means data is a C memory object, the int being the start
+               address of that memory
+        vec_size: int, the number of elements stored,
+                       kept here since it is lost once data lives in C memory
+        mem_type: int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type: int, the data type tag of the plaintext,
+                        kept here since it is lost once data lives in C memory
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, data_type: int):
+        # ndarray views may be non-contiguous in memory, so force a
+        # contiguous layout; anything else is stored untouched
+        if isinstance(data, np.ndarray):
+            data = np.ascontiguousarray(data)
+        self.data = data
+        self.vec_size, self.mem_type, self.data_type = (
+            vec_size, mem_type, data_type)
+
+    def __str__(self):
+        return f"{self.__class__}:{self.data}"
+
+    def __del__(self):
+        # te_free releases the C memory when data is a pointer
+        te_free(self)
+
+
+class BigIntStorage(object):
+    '''
+    Used for store bigint objects:
+
+    Attributes:
+        bigint_storage: int, the start address of the C memory storing bigint
+        elem_size:      int, the size of the bigint,
+                            useless since we unified into CIPHER_BITS
+        vec_size:       int, the number of bigint stored in this class
+        mem_type:       int, MEM_HOST or MEM_DEVICE, where data is stored
+
+    '''
+
+    def __init__(self, data, vec_size, mem_type: int, elem_size: int):
+        # 1:cpu/host  2:gpu/device
+        self.mem_type = mem_type
+        self.bigint_storage = data
+        self.elem_size = elem_size
+        self.vec_size = vec_size
+
+    def __len__(self):
+        # The instance never had a `data` attribute (only `bigint_storage`,
+        # which is a raw address), so len(self.data) always raised
+        # AttributeError. Report the stored element count instead, like
+        # the other storage classes do.
+        return self.vec_size
+
+    def __del__(self):
+        # bi_free is expected to release the C memory behind bigint_storage
+        bi_free(self)
+
+
+class FixedPointStorage:
+    '''
+    Holds the 3 pointers marking the start addresses of C memory,
+    which can be passed directly to the C functions in GPU_LIB
+    ------------------
+    Attributes:
+        bigint_storage: int, start address of C memory,
+                                storing the mantissa of a fpn array
+        base_storage:   int, start address of C memory,
+                                storing the base array of the fpn array
+        exp_storage:    int, start address of C memory,
+                                storing the exponent array of the fpn array
+        vec_size:       int, the number of elements stored,
+                                kept here since it is lost once data lives in C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type:      int, the data type tag of the plaintext,
+                                kept here since it is lost once data lives in C memory
+        encode_n, max_int: bigint, the paras used for encoding the plaintext
+    '''
+
+    def __init__(
+            self,
+            bigint_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            n,
+            max_int,
+            mem_type: int,
+            data_type,
+    ):
+        # where the data lives, 1:cpu/host  2:gpu/device
+        self.mem_type = mem_type
+        # actual data and length of the fpn array
+        self.bigint_storage, self.base_storage, self.exp_storage = (
+            bigint_storage, base_storage, exp_storage)
+        self.vec_size = vec_size
+        # para needed to get back to a TensorStorage
+        self.data_type = data_type
+        # paras needed for en/decoding;
+        # these 2 are plain python ints, neither BigIntStorage nor C types
+        self.encode_n, self.max_int = n, max_int
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        # fp_free is expected to release the C memory behind the 3 pointers
+        fp_free(self)
+
+
+class PaillierEncryptedStorage:
+    '''
+    Holds the 3 pointers marking the start addresses of C memory,
+    which can be passed directly to the C functions in GPU_LIB
+    --------------------
+    Attributes:
+        pen_storage:    int, start address of C memory,
+                                storing the mantissa of the pen array
+        base_storage:   int, start address of C memory,
+                                storing the bases of the pen array
+        exp_storage:    int, start address of C memory,
+                                storing the exponents of the pen array
+        vec_size:       int, the number of elements stored,
+                                kept here since it is lost once data lives in C memory
+        mem_type:       int, value is MEM_HOST or MEM_DEVICE, where the data is stored
+        data_type:      int, the data type tag of the plaintext,
+                                kept here since it is lost once data lives in C memory
+        encode_n, encode_max_int: bigint, the paras used for encoding the plaintext
+    '''
+
+    def __init__(
+            self,
+            pen_storage,
+            base_storage,
+            exp_storage,
+            vec_size,
+            mem_type: int,
+            data_type,
+            fpn_encode_n,
+            fpn_encode_max_int,
+    ):
+        self.mem_type = mem_type
+        # actual data and length of the pen array
+        self.pen_storage, self.base_storage, self.exp_storage = (
+            pen_storage, base_storage, exp_storage)
+        self.vec_size = vec_size
+        # para needed to get back to a TensorStorage
+        self.data_type = data_type
+        # paras needed for en/decoding
+        self.encode_n, self.encode_max_int = fpn_encode_n, fpn_encode_max_int
+
+    def __len__(self):
+        return self.vec_size
+
+    def __del__(self):
+        # pi_free is expected to release the C memory behind the 3 pointers
+        pi_free(self)
+
+
+class TensorShapeStorage:
+    '''
+    Stores a shape of at most 2 dimensions, meant to mirror numpy shapes
+    -------------------
+    Attributes:
+        dim1: int or None, the 1st dim, aka the row
+        dim2: int or None, the 2nd dim, aka the col
+    '''
+
+    def __init__(self, dim1=None, dim2=None):
+        # every dimension is either unset (None) or a python int
+        for dim in (dim1, dim2):
+            if not (dim is None or isinstance(dim, int)):
+                raise TypeError("invalid dimension")
+        self.dim1, self.dim2 = dim1, dim2
+
+    def size(self):
+        '''Number of elements; an unset dimension counts as 1'''
+        rows = self.dim1 if self.dim1 is not None else 1
+        cols = self.dim2 if self.dim2 is not None else 1
+        return rows * cols
+
+    def __getitem__(self, item):
+        return self.to_tuple()[item]
+
+    def __len__(self):
+        return len(self.to_tuple())
+
+    def to_tuple(self):
+        '''Return (), (dim1,) or (dim1, dim2); dim2 is ignored if dim1 is None'''
+        if self.dim1 is None:
+            return ()
+        if self.dim2 is None:
+            return (self.dim1,)
+        return (self.dim1, self.dim2)
+
+    def from_tuple(self, v):
+        '''Load dims from a 1- or 2-elem sequence, any other length clears both'''
+        rank = len(v)
+        if rank == 1:
+            self.dim1, self.dim2 = v[0], None
+        elif rank == 2:
+            self.dim1, self.dim2 = v[0], v[1]
+        else:
+            self.dim1 = self.dim2 = None
+        return self
+
+    def transpose(self):
+        # NOTE(review): for a 1-dim shape this yields dim1=None, so
+        # to_tuple() becomes () while size() is kept - confirm with callers
+        return TensorShapeStorage(self.dim2, self.dim1)
+
+    def matmul(self, other):
+        # NOTE(review): only the outer dims are combined, the inner dims
+        # are not checked for compatibility
+        return TensorShapeStorage(self.dim1, other.dim2)
+
+
+class PubKeyStorage:
+    '''
+    Keeps the PaillierPublicKey values in a C-acceptable data type
+    -------------
+    Attributes:
+       n, g, nsquare, max_int:
+            c_char_p, wrapping the CIPHER_BYTE-long little-endian bytes
+            of the corresponding python int
+    '''
+
+    def __init__(self, n, g, nsquare, max_int):
+        def as_c_bytes(value):
+            # python int -> fixed-width little-endian bytes -> c_char_p
+            return c_char_p(value.to_bytes(CIPHER_BYTE, 'little'))
+
+        self.n = as_c_bytes(n)
+        self.g = as_c_bytes(g)
+        self.nsquare = as_c_bytes(nsquare)
+        self.max_int = as_c_bytes(max_int)
+
+
+class PrivKeyStorage:
+    '''
+    Keeps the PaillierPrivateKey values in a C-acceptable data type
+    ------------
+    Attributes:
+       p, q, psquare, qsquare, q_inverse, hp, hq:
+            c_char_p, wrapping the CIPHER_BYTE-long little-endian bytes
+            of the corresponding python int
+    '''
+
+    def __init__(self, p, q, psquare, qsquare, q_inverse, hp, hq):
+        def as_c_bytes(value):
+            # python int -> fixed-width little-endian bytes -> c_char_p
+            return c_char_p(value.to_bytes(CIPHER_BYTE, 'little'))
+
+        self.p = as_c_bytes(p)
+        self.q = as_c_bytes(q)
+        self.psquare = as_c_bytes(psquare)
+        self.qsquare = as_c_bytes(qsquare)
+        self.q_inverse = as_c_bytes(q_inverse)
+        self.hp = as_c_bytes(hp)
+        self.hq = as_c_bytes(hq)
+
+
+class Dev_PubKeyStorage:
+    '''
+    Keeps the handle of a PaillierPublicKey initialised through GPU_LIB
+    -----------------
+    Attributes:
+        pub_key_ptr:
+            the value returned by GPU_LIB.init_pub_key; per the original
+            author it is a pointer to where the pubkey is stored on GPU
+    '''
+
+    def __init__(self, pubkey_storage):
+        # the 4 fields are forwarded in the order n, g, nsquare, max_int
+        key_fields = (
+            pubkey_storage.n,
+            pubkey_storage.g,
+            pubkey_storage.nsquare,
+            pubkey_storage.max_int,
+        )
+        self.pub_key_ptr = GPU_LIB.init_pub_key(*key_fields)
+
+    def __del__(self):
+        # hand the pointer back so the key can be released
+        pi_free_d_pub_key(self.pub_key_ptr)
+
+
+class Dev_PrivKeyStorage:
+    '''
+    Keeps the handle of a PaillierPrivateKey initialised through GPU_LIB
+    ------------------
+    Attributes:
+       priv_key_ptr:
+            the value returned by GPU_LIB.init_priv_key; per the original
+            author it is a pointer to where the privkey is stored on GPU
+    '''
+
+    def __init__(self, privkey_storage):
+        # the 7 fields are forwarded in the order
+        # p, q, psquare, qsquare, q_inverse, hp, hq
+        key_fields = (
+            privkey_storage.p,
+            privkey_storage.q,
+            privkey_storage.psquare,
+            privkey_storage.qsquare,
+            privkey_storage.q_inverse,
+            privkey_storage.hp,
+            privkey_storage.hq,
+        )
+        self.priv_key_ptr = GPU_LIB.init_priv_key(*key_fields)
+
+    def __del__(self):
+        # hand the pointer back so the key can be released
+        pi_free_d_priv_key(self.priv_key_ptr)
+
+
+##########################################################################
+# ####################FUNCTION DEFINITION ################################
+##########################################################################
+def te_p2c_shape(shape, res):
+    '''
+    Turn a shape tuple into a TensorShapeStorage object
+    -------------
+    Para:
+        shape:   tuple, with no more than 2 elements
+        res:     TensorShapeStorage to fill, or None to create a new one
+    Return:
+        the filled TensorShapeStorage (res itself when it was given)
+    '''
+    target = TensorShapeStorage() if res is None else res
+    target.from_tuple(shape)
+    return target
+
+
+def te_c2p_shape(shape):
+    '''
+    Get the shape tuple back from a TensorShapeStorage
+    --------------
+    Para:
+        shape:   TensorShapeStorage
+    Return:
+        tuple, whatever shape.to_tuple() yields
+    '''
+    shape_tuple = shape.to_tuple()
+    return shape_tuple
+
+
+def te_free(tes):
+    '''
+    Release the C memory space held by a TensorStorage
+    --------------
+    Para:
+        tes:    TensorStorage,
+                only an int tes.data is treated as a C memory pointer,
+                any other kind of data is left untouched
+    Return:
+        None
+    '''
+    if not isinstance(tes.data, int):
+        return
+    GPU_LIB.c_free(c_void_p(tes.data))
+    # drop the stale address so it cannot be freed twice
+    tes.data = None
+
+
+def te_p2c(data, res=None):
+    '''
+    transmit the data storage form from Python to C
+    we assume data's structure has already been preserved by the upper layer
+    using the TensorShapeStorage class
+    ------------------
+    Args:
+        data, list or ndarray, the original data array;
+              int32/int64 are stored as int64, float32/float64 as double
+        res,  TensorStorage or None; when given, its existing C memory
+              (res.data) is reused as the destination instead of a new malloc
+    Return:
+        TensorStorage, and data is a C pointer
+    Raise:
+        TypeError, if data is neither a list nor a ndarray
+        PermissionError, if the dtype is not one of the 4 supported ones
+    '''
+    if isinstance(data, list):
+        data = np.asarray(data)
+    if not isinstance(data, np.ndarray):
+        raise TypeError("Unsupported Data Structure")
+
+    # Resolve the target C type before touching any C memory, so that an
+    # unsupported dtype cannot leak a freshly malloc'ed buffer.
+    if data.dtype == 'int32' or data.dtype == 'int64':
+        target_dtype, data_type, elem_byte = np.int64, INT64_TYPE, INT64_BYTE
+    elif data.dtype == 'float32' or data.dtype == 'float64':
+        target_dtype, data_type, elem_byte = np.float64, FLOAT_TYPE, DOUBLE_BYTE
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # memcpy reads raw bytes starting at the array's base address, which is
+    # only the row-major element sequence for a C-contiguous array. Sliced or
+    # transposed inputs (and their astype() copies, which keep the layout)
+    # are not, so force a C-contiguous buffer of the target dtype first.
+    # No copy is made when data already qualifies.
+    c_data = np.ascontiguousarray(data, dtype=target_dtype)
+    vec_size = c_data.size
+
+    # malloc the space
+    if res is None:
+        storage_pointer = GPU_LIB.c_malloc(c_size_t(vec_size * elem_byte))
+    else:
+        storage_pointer = res.data
+
+    # c_data stays referenced until after the copy, keeping its buffer alive
+    GPU_LIB.c_memcpy(
+        c_void_p(storage_pointer),
+        c_data.ctypes.data_as(c_void_p),
+        c_size_t(vec_size * elem_byte))
+    return _te_init_store(res, storage_pointer, vec_size, MEM_HOST, data_type)
+
+
+def te_c2p(store):
+    '''
+    Bring the content of a TensorStorage from C back to Python
+    the dtype of the returned array follows store.data_type
+    -----------
+    Para:
+        store: TensorStorage, store.data is read as a C memory address
+    Return:
+        res_array: np.ndarray, float64 for FLOAT_TYPE, int64 for INT64_TYPE
+    Raise:
+        PermissionError, for any other data_type
+    '''
+    kind = store.data_type
+    if kind == FLOAT_TYPE:
+        return __get_C_double(store.data, store.vec_size).astype(np.float64)
+    if kind == INT64_TYPE:
+        return __get_C_int64(store.data, store.vec_size).astype(np.int64)
+    raise PermissionError("Invalid Data Type")
+
+
+def te_c2bytes(data, res=None):
+    '''
+    Serialize a TensorStorage living in C memory into a bytes stream.
+    Used for communication between sites, since C memory is not shared
+    --------------------
+    Para:
+        data: TensorStorage, data.data is a C memory ptr
+        res:  not used by this function
+    Return:
+        bytes, U_INT32_BYTE + DOUBLE_BYTE * vec_size long; per the original
+        author the leading part carries data_type and the rest the data
+    '''
+    count = data.vec_size
+    out_buf = c_buffer(DOUBLE_BYTE * count + U_INT32_BYTE)
+    type_tag = c_char_p(data.data_type.to_bytes(U_INT32_BYTE, 'little'))
+    GPU_LIB.te_get_bytes(
+        cast(out_buf, c_void_p),
+        type_tag,
+        c_void_p(data.data),
+        c_size_t(count),
+    )
+    return out_buf.raw
+
+
+def fp_c2bytes(store, res=None):
+    '''
+    Serialize a FixedPointStorage into a bytes stream;
+    Used for communication between sites, since C memory is not shared
+    Besides the 3 C memory spaces, data_type, mem_type, encode_n and max_int
+    are handed to GPU_LIB.fp_get_bytes as well
+    -----------------
+    Para:
+        store: FixedPointStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the filled buffer
+    '''
+    count = store.vec_size
+    # 2 uint32 + 2 bigint of meta info, then bigint + 2 uint32 per element
+    meta_len = U_INT32_BYTE * 2 + PLAIN_BYTE * 2
+    elem_len = PLAIN_BYTE + U_INT32_BYTE * 2
+    out_buf = c_buffer(elem_len * count + meta_len)
+    GPU_LIB.fp_get_bytes(
+        cast(out_buf, c_void_p),
+        # uint32 fields
+        c_char_p(store.data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        # bigint fields
+        c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+        # C memory storage
+        c_void_p(store.bigint_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buf.raw
+
+
+def pi_c2bytes(store, res=None):
+    '''
+    Serialize a PaillierEncryptedStorage into a bytes stream
+    Used for communication between sites, since C memory is not shared
+    Besides the 3 C memory spaces, data_type, mem_type, encode_n and
+    encode_max_int are handed to GPU_LIB.pi_get_bytes as well
+    ----------------
+    Para:
+        store: PaillierEncryptedStorage
+        res:   not used by this function
+    Return:
+        bytes, the raw content of the filled buffer
+    '''
+    count = store.vec_size
+    # 2 uint32 + 2 bigint of meta info, then bigint + 2 uint32 per element
+    meta_len = U_INT32_BYTE * 2 + CIPHER_BYTE * 2
+    elem_len = CIPHER_BYTE + U_INT32_BYTE * 2
+    out_buf = c_buffer(elem_len * count + meta_len)
+    GPU_LIB.pi_get_bytes(
+        cast(out_buf, c_void_p),
+        # uint32 fields
+        c_char_p(store.data_type.to_bytes(U_INT32_BYTE, 'little')),
+        c_char_p(store.mem_type.to_bytes(U_INT32_BYTE, 'little')),
+        # bigint fields
+        c_char_p(store.encode_n.to_bytes(CIPHER_BYTE, 'little')),
+        c_char_p(store.encode_max_int.to_bytes(CIPHER_BYTE, 'little')),
+        # C memory storage
+        c_void_p(store.pen_storage),
+        c_void_p(store.base_storage),
+        c_void_p(store.exp_storage),
+        c_size_t(count),
+    )
+    return out_buf.raw
+
+
+def _te_init_store(store, data, vec_size, mem_type, data_type):
+    '''
+    Create a TensorStorage, or refill the one handed in
+    -----------
+    Para:
+        store: TensorStorage to refill, or None to create a new one
+        data, vec_size, mem_type, data_type: same meaning as in TensorStorage;
+            when refilling, a None mem_type keeps the store's current mem_type
+    Return:
+        TensorStorage, store itself when it was given
+    '''
+    if store is None:
+        return TensorStorage(data, vec_size, mem_type, data_type)
+    # refill in place; data is assigned as-is, without going through __init__
+    store.data, store.vec_size, store.data_type = data, vec_size, data_type
+    if mem_type is not None:
+        store.mem_type = mem_type
+    return store
+
+
+def te_bytes2c(data, res=None):
+    '''
+    Rebuild a TensorStorage from a bytes buffer,
+    TensorStorage.data is a ptr pointing to the restored C memory space.
+    -------------
+    Para:
+        data: the bytes string, U_INT32_BYTE of data_type followed by the data
+        res:  TensorStorage or None; when given, res.data is reused as the
+              destination memory instead of a new malloc
+    Return:
+        res:  TensorStorage, the restored struct from para.data
+    '''
+    payload_len = len(data) - U_INT32_BYTE
+    type_buf = c_buffer(U_INT32_BYTE)
+    storage_pointer = (
+        GPU_LIB.c_malloc(c_size_t(payload_len)) if res is None else res.data
+    )
+    GPU_LIB.te_from_bytes_get_c(
+        cast(type_buf, c_void_p),
+        c_void_p(storage_pointer),
+        c_char_p(data),
+        c_size_t(payload_len),
+    )
+    # TODO: change according to different data_types' length,
+    # now just use DOUBLE BYTE because we have only INT64 and DOUBLE,
+    # all of them are 8 bytes(Equal to DOUBLE_BYTE)
+    return _te_init_store(
+        res,
+        storage_pointer,
+        payload_len // DOUBLE_BYTE,
+        MEM_HOST,
+        int.from_bytes(type_buf, 'little'),
+    )
+
+
+def fp_bytes2c(data, res=None):
+    '''
+    Rebuild a FixedPointStorage from a bytes buffer.
+    ---------------
+    Para:
+        data: the bytes string
+        res:  FixedPointStorage or None; when given, its 3 C memory spaces
+              are reused as destination instead of new mallocs
+    Return:
+        res:  FixedPointStorage, the restored struct from para.data.
+    '''
+    # calculate vec_size: strip the meta info, divide by the per-element size
+    meta_len = 2 * (U_INT32_BYTE + PLAIN_BYTE)
+    elem_len = U_INT32_BYTE * 2 + PLAIN_BYTE
+    vec_size = (len(data) - meta_len) // elem_len
+    # receivers for the uint32 fields
+    type_buf = c_buffer(U_INT32_BYTE)
+    mem_buf = c_buffer(U_INT32_BYTE)
+    # receivers for the bigint fields
+    n_buf = c_buffer(PLAIN_BYTE)
+    max_int_buf = c_buffer(PLAIN_BYTE)
+    # storage
+    if res is not None:
+        fpn, base, exp = res.bigint_storage, res.base_storage, res.exp_storage
+    else:
+        fpn, base, exp = (
+            GPU_LIB.c_malloc(c_size_t(width * vec_size))
+            for width in (PLAIN_BYTE, U_INT32_BYTE, U_INT32_BYTE)
+        )
+
+    GPU_LIB.fp_from_bytes_get_c(
+        cast(type_buf, c_void_p),
+        cast(mem_buf, c_void_p),
+        cast(n_buf, c_void_p),
+        cast(max_int_buf, c_void_p),
+        cast(fpn, c_void_p),
+        cast(base, c_void_p),
+        cast(exp, c_void_p),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+    return _fp_init_store(
+        res,
+        fpn,
+        base,
+        exp,
+        vec_size,
+        int.from_bytes(n_buf, 'little'),
+        int.from_bytes(max_int_buf, 'little'),
+        int.from_bytes(mem_buf, 'little'),
+        int.from_bytes(type_buf, 'little'),
+    )
+
+
+def pi_bytes2c(data, res=None):
+    '''
+    Rebuild a PaillierEncryptedStorage from a bytes buffer
+    --------------
+    Para:
+        data: the bytes string
+        res:  PaillierEncryptedStorage or None; when given, its 3 C memory
+              spaces are reused as destination instead of new mallocs
+    Return:
+        res:  PaillierEncryptedStorage, the restored struct from para.data
+    '''
+    # calculate vec_size: strip the meta info, divide by the per-element size
+    meta_len = 2 * (U_INT32_BYTE + CIPHER_BYTE)
+    elem_len = U_INT32_BYTE * 2 + CIPHER_BYTE
+    vec_size = (len(data) - meta_len) // elem_len
+    # receivers for the uint32 fields
+    type_buf = c_buffer(U_INT32_BYTE)
+    mem_buf = c_buffer(U_INT32_BYTE)
+    # receivers for the bigint fields
+    n_buf = c_buffer(CIPHER_BYTE)
+    max_int_buf = c_buffer(CIPHER_BYTE)
+    # storage
+    if res is not None:
+        pen, base, exp = res.pen_storage, res.base_storage, res.exp_storage
+    else:
+        pen, base, exp = (
+            GPU_LIB.c_malloc(c_size_t(width * vec_size))
+            for width in (CIPHER_BYTE, U_INT32_BYTE, U_INT32_BYTE)
+        )
+
+    # NOTE(review): this calls the fp_ (FixedPoint) parser although every
+    # buffer here is CIPHER_BYTE wide and pi_c2bytes writes via pi_get_bytes;
+    # confirm whether a pi_ counterpart of this C function should be used.
+    GPU_LIB.fp_from_bytes_get_c(
+        cast(type_buf, c_void_p),
+        cast(mem_buf, c_void_p),
+        cast(n_buf, c_void_p),
+        cast(max_int_buf, c_void_p),
+        cast(pen, c_void_p),
+        cast(base, c_void_p),
+        cast(exp, c_void_p),
+        c_char_p(data),
+        c_size_t(vec_size),
+    )
+    return _pi_init_store(
+        res,
+        pen,
+        base,
+        exp,
+        vec_size,
+        int.from_bytes(mem_buf, 'little'),
+        int.from_bytes(type_buf, 'little'),
+        int.from_bytes(n_buf, 'little'),
+        int.from_bytes(max_int_buf, 'little'),
+    )
+
+
+def _te_init_shape(shape_store, shape_tuple):
+    '''
+    Create a TensorShapeStorage, or refill the one handed in
+    ----------
+    Para:
+        shape_store: TensorShapeStorage to refill, or None to create a new one
+        shape_tuple: tuple, at most 2 dim, source data of TensorShapeStorage
+    Return:
+        TensorShapeStorage, shape_store itself when it was given
+    '''
+    target = TensorShapeStorage() if shape_store is None else shape_store
+    target.from_tuple(shape_tuple)
+    return target
+
+
+def _te_init_ss(
+        res_store, res_data, vec_size, res_shape, shape_tuple, mem_type, data_type
+):
+    '''
+    Set up a TensorStorage and its TensorShapeStorage in one call
+    ------------
+    Para:
+        res_store: TensorStorage to refill, or None to create a new one
+        res_data:  int or ndarray
+        vec_size:  int
+        res_shape: TensorShapeStorage to refill, or None to create a new one
+        shape_tuple, tuple, at most 2 dim
+        mem_type:  int
+        data_type: int
+    Return:
+        tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    # the storage is set up first, then the shape
+    store = _te_init_store(res_store, res_data, vec_size, mem_type, data_type)
+    shape = _te_init_shape(res_shape, shape_tuple)
+    return store, shape
+
+
+'''''' '''
+The following calculators are done on TensorStorage
+Definition are the same with numpy
+TensorStorage.data should all be ndarray datatype in order to support numpy
+
+NOT USED IN OUR FATE IMPLEMENTATION,
+but Webank's implementation seems to have used them
+''' ''''''
+
+
+def te_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Slice store.data (a ndarray) in [start, stop) along axis 0 or 1
+    shape and stream are not used; the result shape is taken from the slice
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    Raise:  NotImplementedError, for any other axis
+    '''
+    if axis == 1:
+        piece = store.data[:, start:stop]
+    elif axis == 0:
+        piece = store.data[start:stop]
+    else:
+        raise NotImplementedError()
+    return _te_init_ss(
+        res_store, piece, piece.size,
+        res_shape, piece.shape,
+        store.mem_type, store.data_type,
+    )
+
+
+def te_cat(stores, axis, res_store=None, res_shape=None):
+    '''
+    Concatenate the ndarray data of several stores:
+    np.vstack for axis 0, np.hstack for axis 1
+    mem_type and data_type are taken from the first store
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    Raise:  NotImplementedError, for any other axis
+    '''
+    if axis == 0:
+        stack = np.vstack
+    elif axis == 1:
+        stack = np.hstack
+    else:
+        raise NotImplementedError()
+    joined = stack([x.data for x in stores])
+    first = stores[0]
+    return _te_init_ss(
+        res_store, joined, joined.size,
+        res_shape, joined.shape,
+        first.mem_type, first.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_pow(
+        left_store,
+        right,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Compute left_store.data ** right
+    left_shape and stream are not used; the result keeps left_store's
+    mem_type and data_type
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    powered = left_store.data ** right
+    return _te_init_ss(
+        res_store, powered, powered.size,
+        res_shape, powered.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_add(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data + right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and data_type
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    total = left_store.data + right_store.data
+    return _te_init_ss(
+        res_store, total, total.size,
+        res_shape, total.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_mul(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data * right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and data_type
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    product = left_store.data * right_store.data
+    return _te_init_ss(
+        res_store, product, product.size,
+        res_shape, product.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type
+
+
+def te_truediv(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data / right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and is always tagged FLOAT_TYPE
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    quotient = left_store.data / right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size,
+        res_shape, quotient.shape,
+        left_store.mem_type, FLOAT_TYPE,
+    )
+
+
+def te_floordiv(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data // right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and is always tagged INT64_TYPE, whatever the operand types
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    quotient = left_store.data // right_store.data
+    return _te_init_ss(
+        res_store, quotient, quotient.size,
+        res_shape, quotient.shape,
+        left_store.mem_type, INT64_TYPE,
+    )
+
+
+def te_sub(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data - right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and data_type
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    difference = left_store.data - right_store.data
+    return _te_init_ss(
+        res_store, difference, difference.size,
+        res_shape, difference.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+# TODO: precise data_type, currently only inherent from left
+
+
+def te_matmul(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Compute left_store.data @ right_store.data
+    The shapes and stream are not used; the result keeps left_store's
+    mem_type and data_type
+    ------------
+    Return: tuple, (TensorStorage, TensorShapeStorage)
+    '''
+    product = left_store.data @ right_store.data
+    return _te_init_ss(
+        res_store, product, product.size,
+        res_shape, product.shape,
+        left_store.mem_type, left_store.data_type,
+    )
+
+
+def te_abs(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Apply abs() to left_store.data
+    ---------------
+    Para:
+        left_store: store whose .data is passed to abs()
+        left_shape: shape object; its to_tuple() gives the result shape tuple
+        res_store, res_shape: forwarded to _te_init_ss, default None
+                    (defaults added so the signature matches te_neg and
+                    the other unary te_* helpers)
+        stream:     accepted but not read here
+    Return:
+        the result of _te_init_ss; vec_size, mem_type and data_type are
+        copied from left_store
+    '''
+    return _te_init_ss(
+        res_store,
+        abs(left_store.data),
+        left_store.vec_size,
+        res_shape,
+        left_shape.to_tuple(),
+        left_store.mem_type,
+        left_store.data_type,
+    )
+
+
+def te_neg(left_store, left_shape, res_store=None, res_shape=None,
+           stream=None):
+    '''
+    Apply unary minus to left_store.data
+    ---------------
+    Para:
+        left_store: store whose .data is negated
+        left_shape: shape object; its to_tuple() gives the result shape tuple
+        res_store, res_shape: forwarded to _te_init_ss, default None
+        stream:     accepted but not read here
+    Return:
+        the result of _te_init_ss; vec_size, mem_type and data_type are
+        copied from left_store
+    '''
+    negated = -left_store.data
+    return _te_init_ss(
+        res_store, negated, left_store.vec_size, res_shape,
+        left_shape.to_tuple(), left_store.mem_type, left_store.data_type)
+
+
+def te_transpose(left_store, left_shape, res_store=None, res_shape=None,
+                 stream=None):
+    '''
+    Call .transpose() on left_store.data
+    ---------------
+    Para:
+        left_store: store whose .data is transposed
+        left_shape: accepted but not read here
+        res_store, res_shape: forwarded to _te_init_ss, default None
+        stream:     accepted but not read here
+    Return:
+        the result of _te_init_ss; size and shape are read from the
+        transposed data, mem_type and data_type from left_store
+    '''
+    flipped = left_store.data.transpose()
+    return _te_init_ss(
+        res_store, flipped, flipped.size, res_shape, flipped.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+def te_sum(left_store, left_shape, axis, res_store=None, res_shape=None,
+           stream=None):
+    '''
+    Call .sum(axis=axis) on left_store.data
+    ---------------
+    Para:
+        left_store: store whose .data is summed
+        left_shape: accepted but not read here
+        axis:       passed through unchanged as the axis keyword of .sum()
+        res_store, res_shape: forwarded to _te_init_ss, default None
+        stream:     accepted but not read here
+    Return:
+        the result of _te_init_ss; size and shape are read from the
+        summed data, mem_type and data_type from left_store
+    '''
+    total = left_store.data.sum(axis=axis)
+    return _te_init_ss(
+        res_store, total, total.size, res_shape, total.shape,
+        left_store.mem_type, left_store.data_type)
+
+
+def te_reshape(
+        store,
+        shape,
+        new_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Reshape store.data to the dimensions described by new_shape
+    ---------------
+    Para:
+        store:     store whose .data is reshaped
+        shape:     accepted but not read here
+        new_shape: shape object exposing to_tuple(); the tuple is used both
+                   for the reshape call and as the result shape tuple
+        res_store, res_shape: forwarded to _te_init_ss, default None
+        stream:    accepted but not read here
+    Return:
+        the result of _te_init_ss; vec_size, mem_type and data_type are
+        copied from store
+    '''
+    # Hand reshape() the plain tuple rather than the shape object itself:
+    # the shape object is only known to expose to_tuple().
+    target_dims = new_shape.to_tuple()
+    return _te_init_ss(
+        res_store,
+        store.data.reshape(target_dims),
+        store.vec_size,
+        res_shape,
+        target_dims,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def te_exp(store, shape, res_store=None, res_shape=None, stream=None):
+    '''
+    Apply np.exp to store.data
+    ---------------
+    Para:
+        store:  store whose .data is passed to np.exp
+        shape:  shape object; its to_tuple() gives the result shape tuple
+        res_store, res_shape: forwarded to _te_init_ss, default None
+        stream: accepted but not read here
+    Return:
+        the result of _te_init_ss; the data_type is always FLOAT_TYPE,
+        vec_size and mem_type are copied from store
+    '''
+    exponentiated = np.exp(store.data)
+    return _te_init_ss(
+        res_store, exponentiated, store.vec_size, res_shape,
+        shape.to_tuple(), store.mem_type, FLOAT_TYPE)
+
+
+def te_hstack(left_store, right_store, left_shape, right_shape,
+              res_store=None, res_shape=None, stream=None):
+    '''
+    Concatenate two stores by calling te_cat with axis argument 1
+    ---------------
+    Para:
+        left_store, right_store: the stores handed to te_cat as a list
+        left_shape, right_shape: accepted but not read here
+        res_store, res_shape:    forwarded to te_cat; res_store is also
+                                 forwarded to _te_init_ss
+        stream:                  accepted but not read here
+    Return:
+        the result of _te_init_ss built from te_cat's output; mem_type and
+        data_type come from left_store
+    '''
+    # local names differ from the parameters to avoid a naming collision
+    stacked_store, stacked_shape = te_cat(
+        [left_store, right_store], 1, res_store, res_shape)
+    return _te_init_ss(
+        res_store, stacked_store.data, stacked_store.vec_size,
+        stacked_shape, stacked_shape.to_tuple(),
+        left_store.mem_type, left_store.data_type)
+
+
+def te_c2p_first(store):
+    '''
+    Read the first element behind store.data
+    ---------------
+    Para:
+        store: TensorStorage, store.data must be a pointer to C memory
+    Return:
+        np.float64 when store.data_type is FLOAT_TYPE,
+        np.int64 when store.data_type is INT64_TYPE
+    Raise:
+        PermissionError for any other data_type
+    '''
+    kind = store.data_type
+    if kind == FLOAT_TYPE:
+        # fetch a one-element array and return its only entry
+        return __get_C_double(store.data, 1).astype(np.float64)[0]
+    if kind == INT64_TYPE:
+        return __get_C_int64(store.data, 1).astype(np.int64)[0]
+    raise PermissionError("Invalid Data Type")
+
+
+'''################malloc a space with size elements############### '''
+'''
+    function: allocate space and form a new PaillierEncryptedStorage Class
+    res:    split into 3 different parts, corresponding to the 3 buffers
+            that are needed for the PaillierEncryptedStorage
+    size:   the number of elements that need to be allocated
+    return: A PaillierEncryptedStorage class, wrapping res as a class
+'''
+
+
+def direct_bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Allocate vec_size * elem_size bytes with GPU_LIB.c_direct_malloc and
+    wrap the pointer through _bi_init_store
+    ---------------
+    Para:
+        res:       existing store to refill, or None to create a new one
+        vec_size:  int, number of elements
+        elem_size: int, bytes per element
+        mem_type:  recorded on the store; does not influence the allocation
+    Return:
+        the store returned by _bi_init_store
+    '''
+    byte_count = c_size_t(vec_size * elem_size)
+    raw_buffer = GPU_LIB.c_direct_malloc(byte_count)
+    return _bi_init_store(res, raw_buffer, vec_size, elem_size, mem_type)
+
+
+def direct_pi_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a PaillierEncryptedStorage with
+    GPU_LIB.c_direct_malloc (cuda_malloc for a device ciphertext buffer)
+    ---------------
+    Para:
+        res:      existing PaillierEncryptedStorage to refill, or None
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE selects where the ciphertext buffer
+                  lives; any other value leaves that buffer as None
+    Return:
+        the store returned by _pi_init_store, with data_type, encode_n and
+        encode_max_int all set to 0
+    '''
+    pen_bytes = c_size_t(size * CIPHER_BYTE)
+    if mem_type == MEM_HOST:
+        res_pen = GPU_LIB.c_direct_malloc(pen_bytes)
+    elif mem_type == MEM_DEVICE:
+        # cuda_malloc takes a byte count (pi_h2d passes
+        # vec_size * CIPHER_BYTE), so size alone would under-allocate
+        res_pen = GPU_LIB.cuda_malloc(pen_bytes)
+    else:
+        res_pen = None
+    # base and exponent buffers are always host allocations
+    res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    # data_type, encode_n and encode_max_int all set to 0
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        size,
+        mem_type,
+        0,
+        0,
+        0)
+
+
+def direct_fp_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a FixedPointStorage with
+    GPU_LIB.c_direct_malloc (cuda_malloc for a device bigint buffer)
+    ---------------
+    Para:
+        res:      existing FixedPointStorage to refill, or None
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE selects where the bigint buffer
+                  lives; any other value leaves that buffer as None
+    Return:
+        the store returned by _fp_init_store, with n, max_int and
+        data_type all set to 0
+    '''
+    fpn_bytes = c_size_t(size * PLAIN_BYTE)
+    if mem_type == MEM_HOST:
+        res_fpn = GPU_LIB.c_direct_malloc(fpn_bytes)
+    elif mem_type == MEM_DEVICE:
+        # cuda_malloc takes a byte count (see pi_h2d), so size alone
+        # would under-allocate the plaintext buffer
+        res_fpn = GPU_LIB.cuda_malloc(fpn_bytes)
+    else:
+        res_fpn = None
+    # base and exponent buffers are always host allocations
+    res_base = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_direct_malloc(c_size_t(size * U_INT32_BYTE))
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        size,
+        0,
+        0,
+        mem_type,
+        0)
+
+
+def direct_te_alloc(res, size, mem_type):
+    '''
+    Allocate size * DOUBLE_BYTE bytes with GPU_LIB.c_direct_malloc and wrap
+    the pointer through _te_init_store with data_type 0
+    '''
+    byte_count = c_size_t(size * DOUBLE_BYTE)
+    return _te_init_store(
+        res, GPU_LIB.c_direct_malloc(byte_count), size, mem_type, 0)
+
+
+def bi_alloc(res, vec_size, elem_size, mem_type):
+    '''
+    Allocate vec_size * elem_size bytes with GPU_LIB.c_malloc and wrap the
+    pointer through _bi_init_store
+    ---------------
+    Para:
+        res:       existing store to refill, or None to create a new one
+        vec_size:  int, number of elements
+        elem_size: int, bytes per element
+        mem_type:  recorded on the store; does not influence the allocation
+    Return:
+        the store returned by _bi_init_store
+    '''
+    byte_count = c_size_t(vec_size * elem_size)
+    raw_buffer = GPU_LIB.c_malloc(byte_count)
+    return _bi_init_store(res, raw_buffer, vec_size, elem_size, mem_type)
+
+
+def pi_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a PaillierEncryptedStorage with
+    GPU_LIB.c_malloc (cuda_malloc for a device ciphertext buffer)
+    ---------------
+    Para:
+        res:      existing PaillierEncryptedStorage to refill, or None
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE selects where the ciphertext buffer
+                  lives; any other value leaves that buffer as None
+    Return:
+        the store returned by _pi_init_store, with data_type, encode_n and
+        encode_max_int all set to 0
+    '''
+    pen_bytes = c_size_t(size * CIPHER_BYTE)
+    if mem_type == MEM_HOST:
+        res_pen = GPU_LIB.c_malloc(pen_bytes)
+    elif mem_type == MEM_DEVICE:
+        # cuda_malloc takes a byte count (pi_h2d passes
+        # vec_size * CIPHER_BYTE), so size alone would under-allocate
+        res_pen = GPU_LIB.cuda_malloc(pen_bytes)
+    else:
+        res_pen = None
+    # base and exponent buffers are always host allocations
+    res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    # data_type, encode_n and encode_max_int all set to 0
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        size,
+        mem_type,
+        0,
+        0,
+        0)
+
+
+def fp_alloc(res, size, mem_type):
+    '''
+    Allocate the three buffers of a FixedPointStorage with
+    GPU_LIB.c_malloc (cuda_malloc for a device bigint buffer)
+    ---------------
+    Para:
+        res:      existing FixedPointStorage to refill, or None
+        size:     int, number of elements
+        mem_type: MEM_HOST or MEM_DEVICE selects where the bigint buffer
+                  lives; any other value leaves that buffer as None
+    Return:
+        the store returned by _fp_init_store, with n, max_int and
+        data_type all set to 0
+    '''
+    fpn_bytes = c_size_t(size * PLAIN_BYTE)
+    if mem_type == MEM_HOST:
+        res_fpn = GPU_LIB.c_malloc(fpn_bytes)
+    elif mem_type == MEM_DEVICE:
+        # cuda_malloc takes a byte count (see pi_h2d), so size alone
+        # would under-allocate the plaintext buffer
+        res_fpn = GPU_LIB.cuda_malloc(fpn_bytes)
+    else:
+        res_fpn = None
+    # base and exponent buffers are always host allocations
+    res_base = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(size * U_INT32_BYTE))
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        size,
+        0,
+        0,
+        mem_type,
+        0)
+
+
+def te_alloc(res, size, mem_type):
+    '''
+    Allocate size * DOUBLE_BYTE bytes with GPU_LIB.c_malloc and wrap the
+    pointer through _te_init_store with data_type 0
+    '''
+    byte_count = c_size_t(size * DOUBLE_BYTE)
+    return _te_init_store(
+        res, GPU_LIB.c_malloc(byte_count), size, mem_type, 0)
+
+
+def pi_free(ptr):
+    '''
+    Release the three buffers of a PaillierEncryptedStorage and clear its
+    pointer attributes
+    --------------
+    Para:
+        ptr: PaillierEncryptedStorage; pen_storage is released with c_free
+             when _MEM_ON_HOST(ptr.mem_type) is true and with cuda_free
+             otherwise, base_storage and exp_storage always with c_free
+    '''
+    on_host = _MEM_ON_HOST(ptr.mem_type)
+    if not on_host:
+        print("free space on gpu")
+        GPU_LIB.cuda_free(c_void_p(ptr.pen_storage))
+    else:
+        GPU_LIB.c_free(c_void_p(ptr.pen_storage))
+    for host_buffer in (ptr.base_storage, ptr.exp_storage):
+        GPU_LIB.c_free(c_void_p(host_buffer))
+    # drop the stale pointers so they cannot be reused by accident
+    ptr.pen_storage = None
+    ptr.base_storage = None
+    ptr.exp_storage = None
+
+
+def fp_h2d(target, src=None, stream=None):
+    '''
+    Placeholder for a host-to-device FixedPointStorage transfer.
+    TODO: currently not implemented because it is not used; src is handed
+    back unchanged and target / stream are ignored.
+    '''
+    return src
+
+
+def fp_d2h(target, src=None, stream=None):
+    '''
+    Placeholder for a device-to-host FixedPointStorage transfer.
+    TODO: currently not implemented because it is not used; src is handed
+    back unchanged and target / stream are ignored.
+    '''
+    return src
+
+
+def pi_h2d(pub_key, target, src=None, stream=None):
+    '''
+    Transfer a C-memory stored PaillierEncryptedStorage into a GPU-memory
+    stored one through GPU_LIB.pen_host2device_exp_align (by its name the
+    C routine also aligns the internal exponents).
+    ---------------
+    Para:
+        pub_key: Dev_PubKeyStorage, its pub_key_ptr is passed to the C call
+        target:  PaillierEncryptedStorage or None, return value;
+                 when None new buffers are allocated, otherwise target's
+                 three buffers are reused as destination
+        src:     PaillierEncryptedStorage, source data;
+                 src.pen_storage is a pointer pointing to CPU C-memory.
+                 Although the default is None, src.vec_size is read
+                 unconditionally, so a real storage must be supplied
+        stream:  accepted but not read here
+    Return:
+        PaillierEncryptedStorage tagged MEM_DEVICE, carrying src's
+        data_type, encode_n and encode_max_int
+    '''
+    vec_size = src.vec_size
+    # pen_storage is a pointer pointing to GPU-memory
+    # base_storage & exp_storage are pointers pointing to CPU C-memory,
+    # since those two are rarely used in computation and cost less time to
+    # copy from host to device than the encrypted bigints.
+    if target is None:
+        pen_storage = GPU_LIB.cuda_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        base_storage = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        exp_storage = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        # reuse the caller's buffers; their capacity is not checked here
+        pen_storage = target.pen_storage
+        base_storage = target.base_storage
+        exp_storage = target.exp_storage
+
+    # argument order: three source pointers, three destination pointers,
+    # element count, public key
+    GPU_LIB.pen_host2device_exp_align(
+        c_char_p(src.pen_storage),
+        c_void_p(src.base_storage),
+        c_void_p(src.exp_storage),
+        c_void_p(pen_storage),
+        c_void_p(base_storage),
+        c_void_p(exp_storage),
+        c_size_t(vec_size),
+        c_void_p(pub_key.pub_key_ptr),
+    )
+    mem_type = MEM_DEVICE
+    return _pi_init_store(
+        target,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        src.data_type,
+        src.encode_n,
+        src.encode_max_int,
+    )
+
+
+def pi_d2h(target, src=None, stream=None):
+    '''
+    Transfer GPU-memory stored PaillierEncryptedStorage into C-memory stored ones.
+    --------------
+    Para:
+        target: PaillierEncryptedStorage or None, return value;
+                when None new host buffers are allocated, otherwise
+                target's three buffers are reused as destination
+        src:    PaillierEncryptedStorage, source value;
+                src.pen_storage is a pointer pointing to GPU-memory.
+                Although the default is None, src.vec_size is read
+                unconditionally, so a real storage must be supplied
+        stream: accepted but not read here
+    Return
+        PaillierEncryptedStorage tagged MEM_HOST, carrying src's
+        data_type, encode_n and encode_max_int
+    '''
+    vec_size = src.vec_size
+    meta_bytes = vec_size * U_INT32_BYTE
+    if target is None:
+        # The destination holds ciphertexts, so it is sized with
+        # CIPHER_BYTE like every other pen buffer in this module
+        # (the previous PLAIN_BYTE sizing could under-allocate).
+        pen_storage = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        base_storage = GPU_LIB.c_malloc(c_size_t(meta_bytes))
+        exp_storage = GPU_LIB.c_malloc(c_size_t(meta_bytes))
+    else:
+        pen_storage = target.pen_storage
+        base_storage = target.base_storage
+        exp_storage = target.exp_storage
+
+    # base and exponent arrays are copied with a plain memcpy
+    GPU_LIB.c_memcpy(
+        c_void_p(base_storage),
+        c_void_p(src.base_storage),
+        c_size_t(meta_bytes),
+    )
+    GPU_LIB.c_memcpy(
+        c_void_p(exp_storage),
+        c_void_p(src.exp_storage),
+        c_size_t(meta_bytes),
+    )
+
+    # the ciphertexts themselves go through the device-to-host routine
+    GPU_LIB.pen_device2host(
+        c_void_p(src.pen_storage),
+        c_char_p(pen_storage),
+        c_size_t(vec_size),
+    )
+    return _pi_init_store(
+        target,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        MEM_HOST,
+        src.data_type,
+        src.encode_n,
+        src.encode_max_int,
+    )
+
+
+def pi_h2d_pub_key(src):
+    '''
+    Wrap a PubKeyStorage in a Dev_PubKeyStorage
+    ----------------
+    src:     PubKeyStorage, the source value to be transferred
+    return:  Dev_PubKeyStorage constructed from src
+    '''
+    return Dev_PubKeyStorage(src)
+
+
+def pi_h2d_priv_key(src):
+    '''
+    Wrap a PrivKeyStorage in a Dev_PrivKeyStorage
+    ----------------
+    src:     PrivKeyStorage, the source value to be transferred
+    return:  Dev_PrivKeyStorage constructed from src
+    '''
+    return Dev_PrivKeyStorage(src)
+
+
+def pi_free_d_pub_key(target):
+    '''
+    Release the memory behind a device public key via GPU_LIB.cuda_free
+    -----------------
+    target:  a pointer pointing to a continuous cuda memory
+    '''
+    device_ptr = c_void_p(target)
+    GPU_LIB.cuda_free(device_ptr)
+
+
+def pi_free_d_priv_key(target):
+    '''
+    Release the memory behind a device private key via GPU_LIB.cuda_free
+    ------------------
+    target:  a pointer pointing to a continuous cuda memory
+    '''
+    device_ptr = c_void_p(target)
+    GPU_LIB.cuda_free(device_ptr)
+
+
+def pi_p2c_pub_key(src):
+    '''
+    Build a C form PubKeyStorage from a Python form PaillierPublicKey;
+    the n, g, nsquare and max_int attributes of src are passed on in
+    that order
+    '''
+    return PubKeyStorage(src.n, src.g, src.nsquare, src.max_int)
+
+
+def pi_p2c_priv_key(src):
+    '''
+    Build a C form PrivKeyStorage from a Python form PaillierPrivateKey;
+    p, q, psquare, qsquare, q_inverse, hp and hq of src are passed on in
+    that order
+    '''
+    return PrivKeyStorage(
+        src.p,
+        src.q,
+        src.psquare,
+        src.qsquare,
+        src.q_inverse,
+        src.hp,
+        src.hq,
+    )
+
+
+# ###########PaillierEncrypted STORAGE INITIALIZE#################
+def _pi_init_store(
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
+):
+    '''
+    Create a PaillierEncryptedStorage, or refill an existing one in place
+    ---------------
+    Para:
+        res_store: PaillierEncryptedStorage or None; None builds a new
+                   object, anything else has its attributes overwritten
+        The remaining paras are identical to the ones described in
+        PaillierEncryptedStorage
+    Return:
+        the new or updated store
+    '''
+    if res_store is None:
+        return PaillierEncryptedStorage(
+            pen_storage, base_storage, exp_storage, vec_size,
+            mem_type, data_type, encode_n, encode_max_int)
+    field_values = (
+        ("pen_storage", pen_storage),
+        ("base_storage", base_storage),
+        ("exp_storage", exp_storage),
+        ("vec_size", vec_size),
+        ("mem_type", mem_type),
+        # para needed by TensorStorage
+        ("data_type", data_type),
+        # para needed by FixedPointNumber
+        ("encode_n", encode_n),
+        ("encode_max_int", encode_max_int),
+    )
+    for field_name, field_value in field_values:
+        setattr(res_store, field_name, field_value)
+    return res_store
+
+
+# Shapes of PaillierEncryptedStorage are built by the same helper that
+# builds tensor shapes; this name is only an alias.
+_pi_init_shape = _te_init_shape
+
+
+def _pi_init_ss(
+        res_store,
+        pen_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
+        encode_n,
+        encode_max_int,
+):
+    '''
+    Build a PaillierEncryptedStorage and its shape object in one call
+    Paras are identical to _pi_init_store & _te_init_shape
+    Return:
+        tuple: (result of _pi_init_store, result of _pi_init_shape)
+    '''
+    filled_store = _pi_init_store(
+        res_store, pen_storage, base_storage, exp_storage, vec_size,
+        mem_type, data_type, encode_n, encode_max_int)
+    filled_shape = _pi_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+''' transform PEN tensor from Python memory to C memory '''
+
+
+def pi_p2c(target, src, data_type=FLOAT_TYPE):
+    '''
+    Transform list of PaillierEncryptedNumber to C-memory style PaillierEncryptedStorage
+    --------------------
+    Para:
+        target:     PaillierEncryptedStorage or None, return value; when
+                    None new buffers are allocated with c_malloc,
+                    otherwise target's three buffers are written into
+        src:        List or ndarray, each element is a PaillierEncryptedNumber;
+                    src[0] is read unconditionally, so it must be non-empty
+        data_type:  int, src's original datatype, default double
+    Return:
+        PaillierEncryptedStorage tagged MEM_HOST; encode_n and
+        encode_max_int are taken from the public key of src[0]
+    Raise:
+        TypeError when src is neither a list nor an ndarray
+    '''
+    if isinstance(src, list):
+        vec_size = len(src)
+    elif isinstance(src, np.ndarray):
+        vec_size = src.size
+        # .flat allows one-dimensional indexing whatever the array's shape
+        src = src.flat
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # malloc the space for the type
+    if target is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = target.pen_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # get the two encoding parameters
+    n = src[0].public_key.n
+    max_int = src[0].public_key.max_int
+    base_temp = []
+    exp_temp = []
+    # Due to the special condition that big_int in ndarray are not continuously stored,
+    # they are actually object type rather than int type.
+    # So we should use a for loop to handle each bigint and memcpy it
+    for i in range(vec_size):
+        # each ciphertext is serialised as CIPHER_BYTE little-endian bytes
+        src_number = src[i].ciphertext(False).to_bytes(CIPHER_BYTE, 'little')
+        # destination address of the i-th slot is computed by hand
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen + i * CIPHER_BYTE),
+            c_char_p(src_number),
+            c_size_t(CIPHER_BYTE),
+        )
+        base_temp.append(PEN_BASE)
+        exp_temp.append(src[i].exponent)
+    # base and exp are deep-copied into the C buffers in order to prevent
+    # a potential double free here
+    base_array_pointer = np.asarray(
+        base_temp, np.uint32).ctypes.data_as(c_void_p)
+    exp_array_pointer = np.asarray(
+        exp_temp, np.uint32).ctypes.data_as(c_void_p)
+    GPU_LIB.c_memcpy(
+        c_void_p(res_base),
+        base_array_pointer,
+        c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
+    )
+    return _pi_init_store(
+        target,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        MEM_HOST,
+        data_type,
+        n,
+        max_int)
+
+
+def _bi_init_store(res_store, data, count, elem_size, mem_type):
+    '''
+    Create a BigIntStorage, or refill an existing one in place
+    ---------------
+    Para:
+        res_store: BigIntStorage or None; None builds a new object,
+                   anything else has its attributes overwritten
+        data:      stored as bigint_storage
+        count:     stored as vec_size
+        elem_size: stored as elem_size
+        mem_type:  stored as mem_type
+    Return:
+        the new or updated store
+    '''
+    if res_store is None:
+        return BigIntStorage(data, count, mem_type, elem_size)
+    field_values = (
+        ("bigint_storage", data),
+        ("vec_size", count),
+        ("elem_size", elem_size),
+        ("mem_type", mem_type),
+    )
+    for field_name, field_value in field_values:
+        setattr(res_store, field_name, field_value)
+    return res_store
+
+
+# Shapes of BigIntStorage are built by the same helper that builds tensor
+# shapes; this name is only an alias.
+_bi_init_shape = _te_init_shape
+
+
+def _bi_init_ss(
+        res_store,
+        res_data,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        elem_size,
+        mem_type):
+    '''
+    Build a BigIntStorage and its shape object in one call
+    Return:
+        tuple: (result of _bi_init_store, result of _bi_init_shape)
+    '''
+    filled_store = _bi_init_store(
+        res_store, res_data, vec_size, elem_size, mem_type)
+    filled_shape = _bi_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+def _fp_init_store(
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        mem_type,
+        data_type,
+):
+    '''
+    Create a FixedPointStorage, or refill an existing one in place
+    ---------------
+    Para:
+        res_store: FixedPointStorage or None; None builds a new object,
+                   anything else has its attributes overwritten
+        The remaining paras are identical to the elements in
+        FixedPointStorage; on the refill path fpn_storage is stored as
+        bigint_storage and n as encode_n
+    Return:
+        the new or updated store
+    '''
+    if res_store is None:
+        return FixedPointStorage(
+            fpn_storage, base_storage, exp_storage, vec_size,
+            n, max_int, mem_type, data_type)
+    field_values = (
+        ("bigint_storage", fpn_storage),
+        ("base_storage", base_storage),
+        ("exp_storage", exp_storage),
+        ("vec_size", vec_size),
+        ("mem_type", mem_type),
+        # TensorStorage needed paras
+        ("data_type", data_type),
+        # En/Decode needed paras
+        ("encode_n", n),
+        ("max_int", max_int),
+    )
+    for field_name, field_value in field_values:
+        setattr(res_store, field_name, field_value)
+    return res_store
+
+
+def _fp_init_ss(
+        res_store,
+        fpn_storage,
+        base_storage,
+        exp_storage,
+        vec_size,
+        n,
+        max_int,
+        res_shape,
+        res_shape_tuple,
+        mem_type,
+        data_type,
+):
+    '''
+    Build a FixedPointStorage and its shape object in one call
+    Return:
+        tuple: (result of _fp_init_store, result of _te_init_shape)
+    '''
+    filled_store = _fp_init_store(
+        res_store, fpn_storage, base_storage, exp_storage, vec_size,
+        n, max_int, mem_type, data_type)
+    filled_shape = _te_init_shape(res_shape, res_shape_tuple)
+    return filled_store, filled_shape
+
+
+def get_add_mul_size(
+        left_shape: TensorShapeStorage,
+        right_shape: TensorShapeStorage):
+    '''
+    Get the result size of pi_add, pi_mul, fp_mul calculators
+    --------------------
+    Para:
+        left_shape, right_shape: TensorShapeStorage, the two operands' shapes
+    Return:
+        res_size: int, max(rows) * max(cols) of the two resolved shapes
+    Raise:
+        RuntimeError when either argument is not a TensorShapeStorage;
+        whatever __shape_resolve raises for shapes it rejects
+    '''
+    # left operand is validated first, then the right one
+    for operand in (left_shape, right_shape):
+        if not isinstance(operand, TensorShapeStorage):
+            raise RuntimeError(
+                f"Illegal shape type : {type(operand)}, params need type: {TensorShapeStorage}"
+            )
+
+    P, Q, R, S, _ = __shape_resolve(left_shape, right_shape)
+    return max(P, R) * max(Q, S)
+
+
+def get_matmul_rmatmul_size(
+        left_shape: TensorShapeStorage, right_shape: TensorShapeStorage
+):
+    '''
+    Get the result size of matmul, rmatmul calculators
+    ----------------------
+    Para:
+        left_shape, right_shape: TensorShapeStorage, the two operands' shapes
+    Return:
+        res_size: int, first decomposed dim of left_shape times second
+                  decomposed dim of right_shape
+    Raise:
+        RuntimeError when either argument is not a TensorShapeStorage
+    '''
+    # left operand is validated first, then the right one
+    for operand in (left_shape, right_shape):
+        if not isinstance(operand, TensorShapeStorage):
+            raise RuntimeError(
+                f"Illegal shape type : {type(operand)}, params need type: {TensorShapeStorage}"
+            )
+    left_rows, _ = __shape_decompose(left_shape)
+    _, right_cols = __shape_decompose(right_shape)
+    return left_rows * right_cols
+
+
+def get_sum_size(shape: TensorShapeStorage, axis):
+    '''
+    Get the result size of pi_sum, whose result depends on axis
+    ----------------------
+    Para:
+        shape: TensorShapeStorage, the input store's size
+        axis:  int or None, the dim which sum is performed,
+               0 means vertical sum, 1 means horizontal sum, None means sum all data
+    Return:
+        int: 1 when axis is None; shape.size() for shapes with fewer than
+        two dims; otherwise the length of the dimension that is kept
+    Raise:
+        RuntimeError for a non-TensorShapeStorage shape, or for any other
+        shape / axis combination
+    '''
+    if not isinstance(shape, TensorShapeStorage):
+        raise RuntimeError(
+            f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}"
+        )
+    if axis is None:
+        return 1
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank < 2:
+        return shape.size()
+    if rank == 2:
+        if axis == 0:
+            return dims[1]
+        if axis == 1:
+            return dims[0]
+    raise RuntimeError("illegal shape or axis!")
+
+
+def get_slice_size(shape: TensorShapeStorage, start: int, stop: int, axis):
+    '''
+    Get the result size of fp_slice, pi_slice, whose result depends on axis
+    ------------------------
+    Para:
+        shape: TensorShapeStorage, the input store's size
+        start, stop: int, slice bounds; only stop - start is used
+        axis:  int or None, the dim along which the slice is performed,
+               0 means slice horizontally
+               1 means slice vertically
+    Return:
+        int, (stop - start) times the first dim for axis 0 or the second
+        dim for axis 1; None for any other axis. A 1-D shape is treated as
+        (1, n); shapes that are neither 1-D nor 2-D use 0 for both dims.
+    Raise:
+        RuntimeError when shape, start or stop has the wrong type
+    '''
+    # NOTE(review): axis 0 multiplies by the first dim, not the second;
+    # confirm this matches what fp_slice / pi_slice actually produce.
+    if not isinstance(shape, TensorShapeStorage):
+        raise RuntimeError(
+            f"Illegal shape type : {type(shape)}, params need type: {TensorShapeStorage}"
+        )
+    if not isinstance(start, int):
+        raise RuntimeError(
+            f"Illegal start type : {type(start)}, params need type : {int}"
+        )
+    if not isinstance(stop, int):
+        raise RuntimeError(
+            f"Illegal stop type : {type(stop)}, params need type : {int}"
+        )
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank == 1:
+        first_dim, second_dim = 1, dims[0]
+    elif rank == 2:
+        first_dim, second_dim = dims[0], dims[1]
+    else:
+        first_dim, second_dim = 0, 0
+    span = stop - start
+    if axis == 0:
+        # axis == 0 means that we need to cut the matrix horizontally
+        return first_dim * span
+    if axis == 1:
+        # axis == 1 means that we need to cut the matrix vertically
+        return second_dim * span
+    return None
+
+
+def get_cat_size(shapes: list):
+    '''
+    Get the result size of fp_cat, pi_cat
+    -------------------
+    Para:
+        shapes: List[TensorShapeStorage], the to-be-concatenated stores' shapes
+    Return:
+        int, the sum of size() over all shapes; 0 for an empty list
+    Raise:
+        RuntimeError when shapes is not a list
+    '''
+    if not isinstance(shapes, list):
+        raise RuntimeError(
+            f"Illegal shapes type : {type(shapes)}, params need type : {list}"
+        )
+    # Built-in sum plus int() yields a plain Python int; np.sum returned a
+    # numpy scalar, and a float 0.0 for an empty list.
+    return int(sum(v.size() for v in shapes))
+
+
+def pi_encrypt(pub_key, fps, res=None, stream=None):
+    '''
+    perform paillier encryption for FixedPointStorage,
+    use raw encrypt with no obfuscation
+    ----------------
+    Para:
+        pub_key: Dev_PubKeyStorage, its pub_key_ptr is passed to the C call
+        fps:     FixedPointStorage, fpn value waiting to be encrypted
+        res:     None or PaillierEncryptedStorage, return value, default None;
+                 when given, its three buffers receive the output
+        stream:  None, currently not used
+    Return:
+        PaillierEncryptedStorage, the encrypted value; mem_type, data_type,
+        encode_n and max_int are carried over from fps
+    '''
+    src_fpn = fps.bigint_storage
+    src_base = fps.base_storage
+    src_exp = fps.exp_storage
+    vec_size = fps.vec_size
+
+    # allocate output buffers unless the caller supplied a result store
+    if res is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res.pen_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    '''call the encrypt function'''
+    # argument order: three source pointers, three destination pointers,
+    # public key, plain / cipher bit widths, element count, device type
+    # (device_type is a module-level name defined outside this function)
+    GPU_LIB.encrypt_paillier(
+        c_char_p(src_fpn),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        fps.mem_type,
+        fps.data_type,
+        fps.encode_n,
+        fps.max_int,
+    )
+
+
+def pi_decrypt(pub_key, priv_key, pes, res=None, stream=None, fps=None):
+    '''
+    perform decryption and decode as a whole
+    ---------------------
+    Para:
+        pub_key:   Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        priv_key:  Dev_PrivKeyStorage, PaillierPrivateKey stored in GPU mem
+        pes:       PaillierEncryptedStorage, pens waiting to be decrypted
+        res:       TensorStorage, the return value; forwarded to fp_decode
+        stream:    None, currently not used here; forwarded to fp_decode
+        fps:       FixedPointStorage, the middle memory space used
+                   after decrypt and before decode; when None the three
+                   intermediate buffers are allocated with c_malloc
+    Return:
+        the result of fp_decode on the decrypted fixed-point values
+    '''
+    src_pen = pes.pen_storage
+    src_base = pes.base_storage
+    src_exp = pes.exp_storage
+    vec_size = pes.vec_size
+    '''malloc space for middle FixedPointStorage'''
+    if fps is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = fps.bigint_storage
+        res_base = fps.base_storage
+        res_exp = fps.exp_storage
+    '''call the decrypt function'''
+    # argument order: three source pointers, three destination pointers,
+    # public key, private key, plain / cipher bit widths, element count,
+    # device type (device_type is a module-level name defined elsewhere)
+    GPU_LIB.decrypt_paillier(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_void_p(priv_key.priv_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
+
+    # A fresh FixedPointStorage wrapper is always created, even when fps
+    # was supplied; in that case it refers to the same buffers as fps.
+    decrypt_store = FixedPointStorage(
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.encode_n,
+        pes.encode_max_int,
+        pes.mem_type,
+        pes.data_type,
+    )
+    return fp_decode(decrypt_store, res, stream)
+
+
+def pi_obfuscate(pub_key, pes, obf_seeds, res=None, stream=None):
+    '''
+    apply obfuscation to a PaillierEncryptedStorage using the
+    obfuscation seed given, actually a mulmod
+    ----------------------
+    Para:
+        pub_key:   Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        pes:       PaillierEncryptedStorage, raw pen that has not been obfuscated
+        obf_seeds: BigIntStorage, random bigint generated by pi_gen_obf_seed
+        res:       PaillierEncryptedStorage or None, the obfuscated return
+                   value; when None new buffers are allocated with c_malloc
+        stream:    accepted but not read here
+    Return:
+        PaillierEncryptedStorage, the same object as res when res is given;
+        mem_type, data_type, encode_n and encode_max_int come from pes
+    '''
+    # get the pen storage ptr
+    src_pen = pes.pen_storage
+    src_base = pes.base_storage
+    src_exp = pes.exp_storage
+    vec_size = pes.vec_size
+    # get the bigint random ptr
+    obf_rand = obf_seeds.bigint_storage
+    '''initialize the res space'''
+    if res is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res.pen_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    '''run the modular mul function'''
+    # argument order: three source pointers, the seed pointer, three
+    # destination pointers, public key, CIPHER_BITS twice, element count,
+    # device type (device_type is a module-level name defined elsewhere)
+    GPU_LIB.obf_modular_multiplication(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(obf_rand),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_size_t(CIPHER_BITS),
+        c_size_t(vec_size),
+        c_uint32(device_type),
+    )
+    return _pi_init_store(
+        res,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        pes.mem_type,
+        pes.data_type,
+        pes.encode_n,
+        pes.encode_max_int,
+    )
+
+
+def pi_gen_obf_seed(res_store, pub_key, count, elem_size, rand_seed, stream):
+    '''
+    generate random bigint and perform expmod based on the given public key.
+    The calculation result is then used as obfuscation seed for further encrypt.
+    --------------
+    Para:
+        res_store:   BigIntStorage or None, the return value; it is also
+                     handed to bi_gen_rand as that call's result store
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        count:       int, the number of random numbers need to be generated
+        elem_size:   int, the length of the random bigint
+        rand_seed:   the seed used for generating random number
+        stream:      forwarded to bi_gen_rand
+    Return:
+        BigIntStorage, same as res_store when it is given; the store is
+        tagged MEM_DEVICE
+    '''
+    rand_storage = bi_gen_rand(elem_size, count, res_store, rand_seed, stream)
+    rand_data = rand_storage.bigint_storage
+    if res_store is None:
+        # separate output buffer; the random input stays in rand_storage
+        res_data = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        # NOTE(review): res_store was also passed to bi_gen_rand, so input
+        # and output presumably share one buffer here - confirm that the
+        # C routine supports in-place operation
+        res_data = res_store.bigint_storage
+    # argument order: input pointer, input bit width, public key, output
+    # pointer, output bit width, element count, device type
+    # (device_type is a module-level name defined outside this function)
+    GPU_LIB.obf_modular_exponentiation(
+        c_char_p(rand_data),
+        c_size_t(CIPHER_BITS),
+        c_void_p(pub_key.pub_key_ptr),
+        c_char_p(res_data),
+        c_size_t(CIPHER_BITS),
+        c_size_t(count),
+        c_uint32(device_type),
+    )
+    return _bi_init_store(res_store, res_data, count, elem_size, MEM_DEVICE)
+
+
+def __shape_decompose(shape):
+    '''
+    Decompose TensorShapeStorage to 2-D tuple
+    satisfying cuda computation demand
+    ---------------
+    Return:
+        (1, 1) for an empty shape tuple, (1, n) for a 1-D shape (n,),
+        (r, c) for a 2-D shape (r, c)
+    Raise:
+        PermissionError when the shape has more than two dims
+    '''
+    dims = shape.to_tuple()
+    rank = len(dims)
+    if rank > 2:
+        raise PermissionError("Invalid Shape")
+    if rank == 0:
+        return 1, 1
+    if rank == 1:
+        return 1, dims[0]
+    return dims[0], dims[1]
+
+
+def __shape_resolve(shape_1, shape_2):
+    '''
+    Verify that shape_1 and shape_2 can be broadcast against each other and
+    return both 2-D decompositions together with the numpy-style result shape.
+    '''
+    rows_1, cols_1 = __shape_decompose(shape_1)
+    rows_2, cols_2 = __shape_decompose(shape_2)
+    rank = max(len(shape_1.to_tuple()), len(shape_2.to_tuple()))
+
+    # two extents are compatible when they are equal or either one is 1
+    rows_ok = rows_1 == rows_2 or rows_1 == 1 or rows_2 == 1
+    cols_ok = cols_1 == cols_2 or cols_1 == 1 or cols_2 == 1
+    if not (rows_ok and cols_ok):
+        raise PermissionError("shape cannot align", shape_1, shape_2)
+    if rank > 2:
+        raise PermissionError("Invalid shape", shape_1, shape_2)
+
+    # the result keeps as many dimensions as the higher-rank operand
+    out_rows, out_cols = max(rows_1, rows_2), max(cols_1, cols_2)
+    out_shape = ((), (out_cols,), (out_rows, out_cols))[rank]
+    return rows_1, cols_1, rows_2, cols_2, out_shape
+
+
+def pi_add(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform element-wise encrypted add, support broadcast over cols or rows
+    ---------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: PaillierEncryptedStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   PaillierEncryptedStorage or None (buffers malloc'ed if None)
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      accepted but not referenced in this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if the operands cannot be aligned even with broadcast
+    '''
+    # check for alignment capability of shapes (raises PermissionError)
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)
+    # the left_store data
+    l_pen = left_store.pen_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data
+    r_pen = right_store.pen_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # malloc space for the return value unless the caller supplied res_store
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # perform calculation; (P, Q) is the left 2-D shape, (R, S) the right one
+    GPU_LIB.pen_matrix_add_pen_matrix(
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(R),
+        c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # result is INT64_TYPE only when both operands are, otherwise FLOAT_TYPE
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,  # mem_type and encode params come from the left operand
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_mul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform element-wise encrypted multiply, support broadcast for cols and rows
+    --------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: FixedPointStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   PaillierEncryptedStorage or None (buffers malloc'ed if None)
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      accepted but not referenced in this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if the operands cannot be aligned even with broadcast
+    '''
+    # check for alignment capability of shapes (raises PermissionError)
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)
+    # the left_store data
+    l_pen = left_store.pen_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data
+    r_fpn = right_store.bigint_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # malloc space for the return value unless the caller supplied res_store
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # the plaintext (right) operand is passed first, hence R, S precede P, Q
+    GPU_LIB.fpn_matrix_elementwise_multiply_pen_matrix(
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(R),
+        c_size_t(S),
+        c_size_t(P),
+        c_size_t(Q),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # result is INT64_TYPE only when both operands are, otherwise FLOAT_TYPE
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,  # mem_type and encode params come from the encrypted operand
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_transpose(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    transpose the C-memory stored matrix of FixedPointStorage (at most 2-D)
+    -----------------
+    Para:
+        left_store:  FixedPointStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        res_store:   presumably FixedPointStorage (its bigint_storage is read) or None
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      accepted but not referenced in this function
+    Return:
+        tuple: (FixedPointStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # get the left_store parameters
+    src_fpn = left_store.bigint_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # malloc space for the res value unless the caller supplied res_store
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    #  Handling different shapes
+    if len(left_shape_tuple) < 2:
+        # the tuple is 0-D or 1-D: numpy's transpose returns the input as is,
+        # so the three buffers are copied verbatim instead of transposed
+        # NOTE(review): left_shape (not res_shape) is passed below - confirm
+        GPU_LIB.c_memcpy(
+            c_void_p(res_fpn),
+            c_void_p(src_fpn),
+            c_size_t(vec_size * PLAIN_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _fp_init_ss(
+            res_store,
+            res_fpn,
+            res_base,
+            res_exp,
+            left_store.vec_size,
+            left_store.encode_n,
+            left_store.max_int,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+        )
+    elif len(left_shape_tuple) == 2:
+        # the tuple is 2-D: do a normal transpose
+        # the kernel receives left_shape_tuple[0] first, then left_shape_tuple[1]
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        GPU_LIB.transpose(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+        )
+        return _fp_init_ss(
+            res_store,
+            res_fpn,
+            res_base,
+            res_exp,
+            vec_size,
+            left_store.encode_n,
+            left_store.max_int,
+            res_shape,
+            res_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+        )
+    else:
+        raise PermissionError("Unsupported shape")
+
+
+'''
+    In the cuda code the right matrix is flattened vertically (column by column).
+    For instance,
+    [[1,2,3],[4,5,6]]
+    should be flattened to [1,4,2,5,3,6] rather than [1,2,3,4,5,6].
+    This aims to make better use of spatial locality.
+
+    So we need a transpose to make the memory look like [1,4,2,5,3,6]:
+    the horizontal flattening of [[1,4],[2,5],[3,6]] is exactly [1,4,2,5,3,6],
+    and we know that [[1,2,3],[4,5,6]]^T = [[1,4],[2,5],[3,6]].
+    So the result is: we do a transpose while keeping the shape unchanged,
+    and then we get the vertically flattened matrix.
+'''
+
+
+def pi_matmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform matrix multiply under encryption (right_store is transposed first)
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator
+        right_store: FixedPointStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   PaillierEncryptedStorage or None, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      only forwarded to fp_transpose
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if shape is invalid for 1-D or 2-D matrix mul
+        ValueError, if left/right operators' shape can't align for matmul
+        RuntimeError, fallback of the shape dispatch (not reachable after the checks)
+    '''
+    # pre-process shapes: only 1-D and 2-D operands are accepted
+    left_tuple = left_shape.to_tuple()
+    right_tuple = right_shape.to_tuple()
+    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if len(right_tuple) == 1:
+        R, S = S, R  # a 1-D right operand decomposes to (1, n); use it as (n, 1)
+    if Q != R:
+        raise ValueError("shape not aligned")
+    if len(left_tuple) == 1 and len(right_tuple) == 1:
+        res_shape_tuple = ()
+    elif len(left_tuple) == 1 and len(right_tuple) == 2:
+        res_shape_tuple = (S,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 1:
+        res_shape_tuple = (P,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 2:
+        res_shape_tuple = (P, S)
+    else:
+        raise RuntimeError(
+            "You should never ever see this error unless something VERY STRANGE occurs"
+        )
+    res_size = P * S
+    '''A transpose is need to make the right matrix vertically flattened'''
+    transpose_right_store, _ = fp_transpose(
+        right_store, right_shape, None, None, stream
+    )
+    # the left_store data
+    l_pen = left_store.pen_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data, taken from the transposed temporary
+    r_fpn = transpose_right_store.bigint_storage
+    r_base = transpose_right_store.base_storage
+    r_exp = transpose_right_store.exp_storage
+    # malloc space for the return value unless the caller supplied res_store
+    if res_store is None:
+        res_cipher = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_cipher = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    '''call the batch_mul function'''
+    GPU_LIB.pen_matrix_multiply_fpn_matrix(
+        c_char_p(l_pen),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_cipher),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # result is INT64_TYPE only when both operands are, otherwise FLOAT_TYPE
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+
+    del transpose_right_store  # drop the reference to the transposed temporary
+
+    return _pi_init_ss(
+        res_store,
+        res_cipher,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,  # mem_type and encode params come from the encrypted operand
+        data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_rmatmul(
+        pub_key,
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform matrix multiply under encryption; unlike pi_matmul the right
+    operand is the PaillierEncryptedStorage. right_store is transposed first.
+    -------------------------
+    Paras:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  FixedPointStorage, left_operator
+        right_store: PaillierEncryptedStorage, right_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        right_shape: TensorShapeStorage, right_operator's shape
+        res_store:   PaillierEncryptedStorage or None, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      only forwarded to pi_transpose
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if shape is invalid for 1-D or 2-D matrix mul
+        ValueError, if left/right operators' shape can't align for matmul
+        RuntimeError,  default error for shape evaluation
+    '''
+    # pre-process of shapes: only 1-D and 2-D operands are accepted
+    left_tuple = left_shape.to_tuple()
+    right_tuple = right_shape.to_tuple()
+    if len(left_tuple) == 0 or len(right_tuple) == 0 or len(left_tuple) > 2 or len(right_tuple) > 2:
+        raise PermissionError("Invalid shape")
+    P, Q = __shape_decompose(left_shape)
+    R, S = __shape_decompose(right_shape)
+    if len(right_tuple) == 1:
+        R, S = S, R  # a 1-D right operand decomposes to (1, n); use it as (n, 1)
+    if Q != R:
+        raise ValueError("shape not aligned")
+    if len(left_tuple) == 1 and len(right_tuple) == 1:
+        res_shape_tuple = ()
+    elif len(left_tuple) == 1 and len(right_tuple) == 2:
+        res_shape_tuple = (S,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 1:
+        res_shape_tuple = (P,)
+    elif len(left_tuple) == 2 and len(right_tuple) == 2:
+        res_shape_tuple = (P, S)
+    else:
+        raise RuntimeError(
+            "You should never ever see this error unless something VERY STRANGE occurs"
+        )
+    res_size = P * S
+    '''A transpose is needed to make the right matrix vertically flattened'''
+    transpose_right_store, _ = pi_transpose(
+        right_store, right_shape, None, None, stream
+    )
+    # the left_store data
+    l_fpn = left_store.bigint_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data, taken from the transposed temporary
+    r_pen = transpose_right_store.pen_storage
+    r_base = transpose_right_store.base_storage
+    r_exp = transpose_right_store.exp_storage
+    # malloc space for the return value unless the caller supplied res_store
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # plaintext (left) operand first, then the transposed encrypted operand
+    GPU_LIB.fpn_matrix_multiply_pen_matrix(
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_pen),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(S),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(PLAIN_BITS),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # result is INT64_TYPE only when both operands are, otherwise FLOAT_TYPE
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+
+    del transpose_right_store  # drop the reference to the transposed temporary
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        right_store.mem_type,  # here the encrypted operand is the right one
+        data_type,
+        right_store.encode_n,
+        right_store.encode_max_int,
+    )
+
+
+def pi_transpose(
+        left_store,
+        left_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    transpose the C-memory stored matrix of PaillierEncryptedStorage (at most 2-D)
+    -----------------
+    Para:
+        left_store:  PaillierEncryptedStorage, left_operator
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        res_store:   PaillierEncryptedStorage or None, return value's data
+        res_shape:   TensorShapeStorage, return value's shape
+        stream:      accepted but not referenced in this function
+    Return:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError, if dimension is higher than 2-D, not supported
+    '''
+    left_shape_tuple = left_shape.to_tuple()
+    # get the left_store parameters
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # malloc space for the res value unless the caller supplied res_store
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    '''Start handling different type of data '''
+    if len(left_shape_tuple) < 2:
+        # 0-D / 1-D: just a raw memcpy; NOTE(review): left_shape, not res_shape, is passed on - confirm
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            res_store,
+            res_pen,
+            res_base,
+            res_exp,
+            left_store.vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+    elif len(left_shape_tuple) == 2:
+        res_shape_tuple = (left_shape_tuple[1], left_shape_tuple[0])
+        # call the C transpose function with left_shape_tuple[0], then left_shape_tuple[1]
+        GPU_LIB.transpose(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(res_shape_tuple[1]),
+            c_size_t(res_shape_tuple[0]),
+        )
+        return _pi_init_ss(
+            res_store,
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            res_shape,
+            res_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+    else:
+        raise PermissionError("Invalid Shape")
+
+
+# WARNING:  NOW ALMOST ABANDONED DUE TO NOT IDEAL PERFORMANCE!
+def pi_sum_multi_stream(
+        pub_key,
+        left_store,
+        left_shape,
+        axis=None,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Doing pi_sum using multi cuda stream; left_shape is indexed as 2-D and the
+    result has shape (left_shape[0],). axis and stream are accepted but never read.
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    if res_store is None:  # NOTE(review): sized by vec_size although res_size is shape_tuple[0]
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    shape_tuple = left_shape.to_tuple()
+    GPU_LIB.pen_sum_multi_stream(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(shape_tuple[0]),
+        c_size_t(shape_tuple[1]),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # one result element per entry of the first dimension
+    res_size = shape_tuple[0]
+    res_shape_tuple = (res_size,)
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum(
+        pub_key,
+        left_store,
+        left_shape,
+        axis=None,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform sum according to the axis
+    ----------------------
+    Para:
+        pub_key:     Dev_PubKeyStorage, PaillierPublicKey stored in GPU mem
+        left_store:  PaillierEncryptedStorage, left_operator (never modified)
+        left_shape:  TensorShapeStorage, left_operator's  shape
+        axis:        int or None, the dimension which sum is performed
+                        None: sum over all elements
+                        0:    sum vertically, over the 1st dimension
+                        1:    sum horizontally, over the 2nd dimension
+        res_store:   PaillierEncryptedStorage or None, return value's data
+        res_shape:   TensorShapeStorage or None, return value's shape
+        stream:      only forwarded to pi_transpose (2-D input with axis == 0)
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        PermissionError: when the input axis is not aligned to input shape
+    '''
+    # return shapes are tuned to be the same as numpy's output
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    res_pen, res_base, res_exp, res_size = 0, 0, 0, 0
+    res_shape_tuple = ()
+    left_shape_tuple = left_shape.to_tuple()
+
+    if len(left_shape_tuple) == 0:
+        # shape (): only one element in left_store, so the "sum" is a plain copy
+        if axis is not None and axis != 0:
+            raise PermissionError(
+                "Cannot set axis other than 0 or None for dimension 0"
+            )
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            res_store,  # was left_store: the destination must be the result, not the input
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            res_shape,  # was left_shape: honour the caller-supplied result shape
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+    elif axis is None or len(left_shape_tuple) == 1:
+        # handling shape (n,) or axis == None
+        # both mean sum for all elements, giving a single value of shape ()
+        if len(left_shape_tuple) == 1 and axis is not None and axis >= 1:
+            raise PermissionError(
+                "axis is out of bounds for array of dimension 1")
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        res_size = 1
+        res_shape_tuple = ()
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(1),
+            c_size_t(vec_size),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
+    elif axis == 0:
+        # handling 2-D matrix, axis == 0 means sum vertically
+        # since our gpu sum support only horizontal sum
+        # aka batch sum over continuous memory space
+        transpose_store, transpose_shape = pi_transpose(
+            left_store, left_shape, None, None, stream
+        )
+        src_pen = transpose_store.pen_storage
+        src_base = transpose_store.base_storage
+        src_exp = transpose_store.exp_storage
+        transpose_tuple = transpose_shape.to_tuple()
+        # perform sum on the transposed matrix
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(transpose_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        res_size = transpose_tuple[0]
+        res_shape_tuple = (transpose_tuple[0],)
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(transpose_tuple[0]),
+            c_size_t(transpose_tuple[1]),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
+    elif axis == 1:
+        # handling 2-D matrix, axis == 1 means sum horizontally
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * CIPHER_BYTE))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(left_shape_tuple[0] * U_INT32_BYTE))
+        else:
+            res_pen = res_store.pen_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # one result element per entry of the first dimension
+        res_size = left_shape_tuple[0]
+        res_shape_tuple = (left_shape_tuple[0],)
+        GPU_LIB.pen_sum(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(left_shape_tuple[0]),
+            c_size_t(left_shape_tuple[1]),
+            c_void_p(pub_key.pub_key_ptr),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
+    else:
+        raise PermissionError("Invalid Axis or Shape")
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+# WARNING: ABANDONED BECAUSE OF NOT IDEAL PERFORMANCE
+def pi_sum_with_index_v2(pub_key, left_store, left_shape, valid_index):
+    '''
+    A different version of the C implementation of pen_sum_with_index;
+    per the original note it builds a concrete new vector by traversing
+    all elements in left_store. A 0-D input is only copied, not summed.
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    left_shape_tuple = left_shape.to_tuple()
+    valid_store = te_p2c(valid_index, None)
+    # number of selected entries, handed to the kernel next to the index data
+    valid_size = np.asarray(valid_index).sum()
+    if len(left_shape_tuple) == 0:
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            None,  # was left_store: the copy goes into a fresh result, the input stays untouched
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            None,  # was left_shape: no destination shape, same as the final return below
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+    # every other rank: the whole vector is reduced to one value of shape ()
+    res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_size = 1
+    res_shape_tuple = ()
+
+    GPU_LIB.pen_sum_with_index_v2(
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(1),
+        c_size_t(vec_size),
+        c_size_t(valid_size),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    # NOTE(review): MEM_HOST here, while the 0-D branch uses left_store.mem_type - confirm
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum_with_index(pub_key, left_store, left_shape, valid_index):
+    '''
+    Sum the entries of left_store that are selected by a 0/1 index list.
+    Only a sum over the whole list is supported; there is no axis argument.
+    ----------------
+    Paras:
+        pub_key: dev_pubkey_storage class
+        left_store: PaillierEncryptedStorage
+        left_shape: TensorShapeStorage class
+        valid_index: list, contents like [0,1,1,1,0,1,0,1],
+                        valid_index[i] == 1 means the ith value in left_store
+                        is added to the sum result
+                        valid_index[i] == 0 means the ith value in left_store
+                        is left out of the sum result
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+
+    res_pen, res_base, res_exp, res_size = 0, 0, 0, 0  # placeholders, reassigned below
+    res_shape_tuple = ()
+    left_shape_tuple = left_shape.to_tuple()
+
+    # TODO: check for the result of shape (), with only one element
+    # TODO: check for the result of shape (0,) with no elements
+    if len(left_shape_tuple) == 0:  # shape (): copy the buffers, no sum
+        res_pen = GPU_LIB.c_malloc(c_size_t(vec_size * CIPHER_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_pen),
+            c_void_p(src_pen),
+            c_size_t(vec_size * CIPHER_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_base),
+            c_void_p(src_base),
+            c_size_t(vec_size * U_INT32_BYTE))
+        GPU_LIB.c_memcpy(
+            c_void_p(res_exp),
+            c_void_p(src_exp),
+            c_size_t(vec_size * U_INT32_BYTE))
+        return _pi_init_ss(
+            left_store,  # NOTE(review): input store passed as result store - confirm
+            res_pen,
+            res_base,
+            res_exp,
+            vec_size,
+            left_shape,
+            left_shape_tuple,
+            left_store.mem_type,
+            left_store.data_type,
+            left_store.encode_n,
+            left_store.encode_max_int,
+        )
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(1 * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(1 * U_INT32_BYTE))
+    # the sum always produces exactly one value, with shape ()
+    res_size = 1
+    res_shape_tuple = ()
+    GPU_LIB.pen_sum_with_index(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(1),
+        c_size_t(vec_size),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,  # result is tagged MEM_HOST, not left_store.mem_type
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_sum_multi_index(
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):  # left_shape is accepted but never read in this function
+    '''
+    Sum the entries of left_store group by group, as labelled by valid_index
+    The result holds max_value - min_value + 1 encrypted sums, shape (res_size,)
+    ------------
+    Parameters:
+        left_store:   PaillierEncryptedStorage, the original PEN_storage class
+        valid_index:  list, contains indices like [-1, 1, 2, 1, 3, 3, 2, -1],
+                        -1 means that this value will not be calculated if min_value >= 0
+                        1,2,3 means the different groups that it belongs to
+        min_value:    int, The min valid value of the valid index, default 0,
+                           in the above example, if min_value == 1, then -1 will be invalid
+                           if min_value == -1, -1 is also valid
+        max_value:    int, max valid index value; None means max(valid_index)
+    Return:
+        tuple   (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+    # default max_value to the largest entry of valid_index
+    max_value = max(valid_index) if max_value is None else max_value
+    res_size = max_value - min_value + 1
+    # one result slot per index value in [min_value, max_value]
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (res_size,)
+    GPU_LIB.pen_sum_with_multi_index_v2(
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(res_size),
+        c_size_t(vec_size),
+        c_int64(min_value),  # signed, so a negative min_value is representable
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_uint32(device_type),
+    )
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,  # result is tagged MEM_HOST, not left_store.mem_type
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+# WARNING: CURRENTLY NOT IN USE BECAUSE THERE IS NO APPARENT IMPROVEMENT WHEN left_store.vec_size is very large
+# TODO: apply this to store with small size
+def pi_sum_batch_multi_index(
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):
+    '''
+    Run a grouped sum like pi_sum_multi_index for several index rows at once.
+    Each row of valid_index is one index list over the same
+    PaillierEncryptedStorage ("batch"), so the rows may be processed in
+    parallel. When max_value is None it defaults to the largest entry of
+    valid_index, taken over all rows.
+    -------------------
+    Paras:
+        valid_index: 2-D ndarray (batch_num, vec_size), else PermissionError
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    pen_storage = left_store.pen_storage
+    base_storage = left_store.base_storage
+    exp_storage = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # builtin max() on a 2-D ndarray compares whole rows and raises ValueError
+    max_value = int(valid_index.max()) if max_value is None else max_value
+    valid_index_num = max_value - min_value + 1
+    batch_num = valid_index.shape[0]
+    if valid_index.shape[1] != vec_size:
+        raise PermissionError(
+            "valid index shape and raw data shape cannot align!!!")
+
+    res_size = batch_num * valid_index_num
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (batch_num, valid_index_num)
+
+    valid_store = te_p2c(valid_index, None)
+
+    GPU_LIB.batch_pen_sum_with_multi_index(
+        c_void_p(pen_storage),
+        c_void_p(base_storage),
+        c_void_p(exp_storage),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(valid_index_num),
+        c_size_t(vec_size),
+        c_size_t(min_value),  # NOTE(review): unsigned here, c_int64 in pi_sum_multi_index
+        c_size_t(batch_num),
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_type),
+    )
+
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+# WARNING: ABANDONED FOR THE SAME REASON AS pi_sum_batch_multi_index
+def pi_sum_batch_multi_index_v2(
+        pub_key, left_store, left_shape, valid_index, min_value=0, max_value=None
+):
+    '''
+    Almost the same as pi_sum_batch_multi_index, but backed by a different
+    C implementation (batch_pen_sum_with_multi_index_v2).
+    Per the original note, that implementation builds a concrete C buffer
+    with a for loop before the actual computation. valid_index is 2-D.
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # builtin max() on a 2-D ndarray compares whole rows and raises ValueError
+    max_value = int(valid_index.max()) if max_value is None else max_value
+    valid_index_num = max_value - min_value + 1
+    batch_num = valid_index.shape[0] // valid_index_num
+
+    res_size = batch_num * valid_index_num
+    res_pen = GPU_LIB.c_malloc(c_size_t(res_size * CIPHER_BYTE))
+    res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    res_shape_tuple = (batch_num, valid_index_num)
+
+    valid_store = te_p2c(valid_index, None)
+
+    GPU_LIB.batch_pen_sum_with_multi_index_v2(
+        c_void_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_char_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(valid_index_num),
+        c_size_t(vec_size),
+        c_size_t(valid_index.shape[1]),
+        c_size_t(batch_num),
+        c_size_t(min_value),  # NOTE(review): unsigned here, c_int64 in pi_sum_multi_index
+        c_void_p(valid_store.data),
+        c_void_p(pub_key.pub_key_ptr),
+        c_size_t(CIPHER_BITS),
+        c_size_t(device_type),
+    )
+
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        res_size,
+        None,
+        res_shape_tuple,
+        MEM_HOST,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def fp_encode(
+        store, n, max_int, precision=None, max_exponent=None, res=None, stream=None
+):
+    '''
+    Encode a TensorStorage into fixed-point form (stream is not used here)
+    -----------------
+    Paras:
+        store:        TensorStorage, raw data to be encoded
+        n:            big int, the same n in pubkey used for encryption
+        max_int:      big int, same max_int in pubkey.
+        precision:    int or None; None is forwarded to the C side as -1
+        max_exponent: must be None; any other value raises PermissionError
+        res:          FixedPointStorage to reuse; buffers are allocated if None
+    Return:
+        FixedPointStorage, same as res
+    Raise:
+        PermissionError: For unsupported data type or encoding style
+    '''
+    if max_exponent is not None:
+        raise PermissionError("max_exponent not supported")
+    if precision is None:
+        precision = -1
+    src_data = store.data
+    vec_size = store.vec_size
+    # allocate the result buffers unless the caller supplied a store to reuse
+    if res is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(PLAIN_BYTE * vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    else:
+        res_fpn = res.bigint_storage
+        res_base = res.base_storage
+        res_exp = res.exp_storage
+    # Due to the different nature of encoding float/int
+    # Handle the two different data types separately
+    if store.data_type == FLOAT_TYPE:
+        GPU_LIB.encode_double(
+            c_void_p(src_data),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),  # n as little-endian bytes
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_uint32(device_type),
+        )
+    elif store.data_type == INT64_TYPE:
+        GPU_LIB.encode_int(
+            c_void_p(src_data),
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_int32(precision),
+            c_char_p(n.to_bytes(PLAIN_BYTE, 'little')),  # n as little-endian bytes
+            c_char_p(max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_size_t(vec_size),
+            c_uint32(device_type),
+        )
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    '''get the three elements, store it in a FPNStorage'''
+
+    return _fp_init_store(
+        res,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def __fp_decode(store, res=None, stream=None):
+    '''
+    CPU-side decode of a FixedPointStorage, built on the objects returned
+    by __get_c_fpn_storage. Currently not used: fp_decode replaces it
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage, optional storage that receives the result
+    Return:
+        TensorStorage, same as res
+    '''
+    count = store.vec_size
+    numbers = __get_c_fpn_storage(
+        store.bigint_storage,
+        store.base_storage,
+        store.exp_storage,
+        count,
+        store.encode_n,
+        store.max_int,
+    )
+
+    # INT64 results are cast to int; FLOAT results are kept as decoded
+    if store.data_type == INT64_TYPE:
+        decoded = [int(numbers[i].decode()) for i in range(count)]
+    elif store.data_type == FLOAT_TYPE:
+        decoded = [numbers[i].decode() for i in range(count)]
+    else:
+        raise PermissionError("Invalid Data Type")
+
+    # wrap the values in a TensorStorage, then detach its buffer from it
+    wrapper = te_p2c(decoded, None)
+    buffer = wrapper.data
+    wrapper.data = None
+    return _te_init_store(
+        res,
+        buffer,
+        count,
+        store.mem_type,
+        store.data_type)
+
+
+def fp_decode(store, res=None, stream=None):
+    '''
+    Decode a FixedPointStorage through GPU_LIB.decode_double / decode_int
+    ------------------
+    Paras:
+        store:   FixedPointStorage, the raw data to be decoded
+        res:     TensorStorage to write into; a buffer is allocated if None
+    Return:
+        TensorStorage, same as res (stream is not used here)
+    '''
+    if store.data_type == FLOAT_TYPE:
+        if res is None:
+            res_store = GPU_LIB.c_malloc(
+                c_size_t(store.vec_size * DOUBLE_BYTE))
+        else:
+            res_store = res.data
+        GPU_LIB.decode_double(
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
+    elif store.data_type == INT64_TYPE:
+        res_store = (
+            GPU_LIB.c_malloc(c_size_t(store.vec_size * INT64_BYTE))
+            if res is None
+            else res.data
+        )
+        GPU_LIB.decode_int(
+            c_void_p(store.bigint_storage),
+            c_void_p(store.base_storage),
+            c_void_p(store.exp_storage),
+            c_char_p(store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+            c_char_p(store.max_int.to_bytes(PLAIN_BYTE, 'little')),
+            c_size_t(PLAIN_BITS),
+            c_void_p(res_store),
+            c_size_t(store.vec_size),
+        )
+    else:
+        raise PermissionError("Invalid Data Type")  # neither FLOAT_TYPE nor INT64_TYPE
+    return _te_init_store(
+        res, res_store, store.vec_size, store.mem_type, store.data_type
+    )
+
+
+def bi_free(src):  # release the big-integer buffer held by src
+    GPU_LIB.c_free(c_void_p(src.bigint_storage))
+    src.bigint_storage = None  # drop the stale address after freeing it
+
+
+def fp_free(src):
+    '''Free the three buffers held by src and reset the fields to None'''
+    for field in ("bigint_storage", "base_storage", "exp_storage"):
+        GPU_LIB.c_free(c_void_p(getattr(src, field)))
+    src.bigint_storage = src.base_storage = src.exp_storage = None
+
+
+'''
+    function: convert the FixedPointStorage's data from C memory into Python values
+    As there is no shape involved in the function,
+    we cannot know the return shape of the function
+    input:
+            src: FixedPointStorage, containing the data that need to be changed
+    output:
+            return value: containing 3 ndarray:
+                            fpn_array,base_array,exp_array
+'''
+
+
+def fp_c2p(src):
+    '''Read a FixedPointStorage back through __get_c_fpn_storage'''
+    return __get_c_fpn_storage(
+        src.bigint_storage,
+        src.base_storage,
+        src.exp_storage,
+        src.vec_size, src.encode_n, src.max_int,
+    )
+
+
+def pi_c2p_mp(src):
+    '''
+    Multiprocess variant of pi_c2p: read a PaillierEncryptedStorage
+    held in C memory back into Python via __get_c_pen_storage_mp
+    --------------
+    Para:    src, PaillierEncryptedStorage
+    Return:  tuple, each element is a ndarray,
+                    identical to sequence of encoding, base, exponent
+    '''
+    pen_ptr, base_ptr, exp_ptr = (
+        src.pen_storage, src.base_storage, src.exp_storage)
+    count = src.vec_size
+    return __get_c_pen_storage_mp(
+        pen_ptr,
+        base_ptr,
+        exp_ptr,
+        count,
+        src.encode_n,
+    )
+
+
+def pi_c2p(src):
+    '''Read a PaillierEncryptedStorage from C memory back into Python'''
+    return __get_c_pen_storage_raw(
+        src.pen_storage,
+        src.base_storage,
+        src.exp_storage,
+        src.vec_size,
+        src.encode_n)
+
+
+def fp_mul(
+        left_store,
+        right_store,
+        left_shape,
+        right_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None,
+):
+    '''
+    Perform element-wise multiplication between two FixedPointStorage.
+    This is a plaintext computation rather than an encrypted one.
+    ------------------
+    Paras:
+        left_store, right_store: FixedPointStorage (stream is not used here)
+        left_shape, right_shape: TensorShapeStorage
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage)
+    '''
+    # P, Q are the dims of left_store; R, S are the dims of right_store
+    # (both operands are FixedPointStorage in this function)
+    P, Q, R, S, res_shape_tuple = __shape_resolve(left_shape, right_shape)
+    res_size = max(P, R) * max(Q, S)  # presumably the broadcast result size
+    # the left_store data
+    l_fpn = left_store.bigint_storage
+    l_base = left_store.base_storage
+    l_exp = left_store.exp_storage
+    # the right_store data
+    r_fpn = right_store.bigint_storage
+    r_base = right_store.base_storage
+    r_exp = right_store.exp_storage
+    # allocate the result buffers unless res_store supplies them
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(res_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(res_size * U_INT32_BYTE))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    GPU_LIB.fpn_mul(
+        c_char_p(l_fpn),
+        c_void_p(l_base),
+        c_void_p(l_exp),
+        c_char_p(r_fpn),
+        c_void_p(r_base),
+        c_void_p(r_exp),
+        c_char_p(res_fpn),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(P),
+        c_size_t(Q),
+        c_size_t(R),
+        c_size_t(S),
+        c_char_p(left_store.encode_n.to_bytes(PLAIN_BYTE, 'little')),
+        c_size_t(PLAIN_BITS),
+        c_uint32(device_type),
+    )
+    # the result is INT64_TYPE only when both operands are; otherwise FLOAT_TYPE
+    data_type = 0
+    if left_store.data_type == INT64_TYPE and right_store.data_type == INT64_TYPE:
+        data_type = INT64_TYPE
+    else:
+        data_type = FLOAT_TYPE
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        res_size,
+        left_store.encode_n,  # encoding parameters are taken from the left operand
+        left_store.max_int,
+        res_shape,
+        res_shape_tuple,
+        left_store.mem_type,
+        data_type,
+    )
+
+
+def fp_p2c(target, src, data_type=FLOAT_TYPE):
+    '''Copy a list/ndarray of FixedPointNumber into a FixedPointStorage'''
+    if isinstance(src, list):
+        vec_size = len(src)
+    elif isinstance(src, np.ndarray):
+        vec_size = src.size
+        src = src.flat  # index the ndarray element by element, whatever its shape
+    else:
+        raise TypeError("Unsupported Data Structure")
+    # allocate the three buffers unless target supplies them
+    if target is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(vec_size * PLAIN_BYTE))
+        res_base = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+        res_exp = GPU_LIB.c_malloc(c_size_t(vec_size * U_INT32_BYTE))
+    else:
+        res_fpn = target.bigint_storage
+        res_base = target.base_storage
+        res_exp = target.exp_storage
+    # temporary Python lists, converted to uint32 ndarrays below
+    base_temp = []
+    exp_temp = []
+    # n and max_int come from the first element (IndexError if src is empty)
+    n = src[0].n
+    max_int = src[0].max_int
+    for i in range(vec_size):
+        src_number = src[i].encoding.to_bytes(PLAIN_BYTE, 'little')
+        GPU_LIB.c_memcpy(
+            c_void_p(res_fpn + i * PLAIN_BYTE),  # slot i of the big-int buffer
+            c_char_p(src_number),
+            c_size_t(PLAIN_BYTE),
+        )
+        base_temp.append(src[i].BASE)
+        exp_temp.append(src[i].exponent)
+    # copy the collected bases and exponents in one memcpy each
+    base_array_pointer = np.asarray(
+        base_temp, np.uint32).ctypes.data_as(c_void_p)
+    exp_array_pointer = np.asarray(
+        exp_temp, np.uint32).ctypes.data_as(c_void_p)
+    GPU_LIB.c_memcpy(
+        c_void_p(res_base),
+        base_array_pointer,
+        c_size_t(vec_size * U_INT32_BYTE))
+    GPU_LIB.c_memcpy(
+        c_void_p(res_exp), exp_array_pointer, c_size_t(vec_size * U_INT32_BYTE)
+    )
+
+    return _fp_init_store(
+        target,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        n,
+        max_int,
+        MEM_HOST,  # the result is always tagged MEM_HOST
+        data_type)
+
+
+def _index_reset(index, dim_size):
+    '''
+    Normalise a slice bound: a negative index is offset by dim_size and
+    floored at 0; any other index is capped at dim_size
+    '''
+    if index < 0:
+        return max(0, index + dim_size)
+    # index >= 0 here: keep it unless it runs past the dimension
+    return dim_size if index > dim_size else index
+
+
+def fp_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Slice a FixedPointStorage along one axis (rows or columns).
+    -----------------------------
+    Para:
+    store: FixedPointStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    stop:  int, the end index of the slice (not included); like start, it
+           is normalised by _index_reset (negative wraps, clamped to the dim)
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, FixedPointStorage, TensorShapeStorage
+    Raise:
+        PermissionError (bad shape/axis); NotImplementedError (axis not 0/1)
+    '''
+    src_fpn = store.bigint_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    fpn_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    '''handle shape and index'''
+    if len(fpn_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(fpn_shape_tuple) == 1:
+        dim0, dim1 = 1, fpn_shape_tuple[0]  # a vector is handled as one row
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(fpn_shape_tuple) == 2:
+        dim0, dim1 = fpn_shape_tuple[0], fpn_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+    # handle condition that a[k: l] k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.max_int,  # same attribute the normal return path reads
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+        )
+    # handle condition that a[:,k:l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_fpn, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(fpn_shape_tuple) == 2 else (0,)
+        return _fp_init_ss(
+            None,
+            res_fpn,
+            res_base,
+            res_exp,
+            0,
+            store.encode_n,
+            store.max_int,  # same attribute the normal return path reads
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+        )
+        # handle the normal slice
+    res_shape_tuple, vec_size = (), 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start normal slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        if res_store is None:
+            res_fpn = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_vertical(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(device_type),
+        )
+        if len(fpn_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally'
+        if res_store is None:
+            res_fpn = GPU_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_fpn = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_horizontal(
+            c_char_p(src_fpn),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(PLAIN_BITS),
+            c_uint32(device_type),
+        )
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError("Only support 2 dimensional slice")
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        vec_size,
+        store.encode_n,
+        store.max_int,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+    )
+
+
+def pi_slice(
+        store,
+        shape,
+        start,
+        stop,
+        axis,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    slice a contiguous memory space, now support two directions.
+    -----------------------------
+    Para:
+    store: PaillierEncryptedStorage, the data to be sliced
+    shape: TensorShapeStorage, the original shape of the storage
+    start: int, the start index of the slice (included)
+    end:   int, the end index of the slice(not included),
+           if it is larger than the last index, then it concatencate into the dim size
+    axis:  0 or 1, 0 means cut it horizontally, 1 means cut it vertically
+    stream: the current stream of the task, not used now
+    -----------------------------
+    Return:
+    res_store, res_shape, PaillierEncryptedStorage, TensorShapeStorage
+    '''
+    src_pen = store.pen_storage
+    src_base = store.base_storage
+    src_exp = store.exp_storage
+    # get the two dims and check for illegal status
+    pen_shape_tuple = shape.to_tuple()
+    dim0, dim1 = 0, 0
+    if len(pen_shape_tuple) == 0:
+        raise PermissionError("Cannot slice 0 dim!")
+    elif len(pen_shape_tuple) == 1:
+        dim0, dim1 = 1, pen_shape_tuple[0]
+        if axis == 0:
+            raise PermissionError("Cannot slice 1 dim horizontally!")
+        start = _index_reset(start, dim1)
+        stop = _index_reset(stop, dim1)
+    elif len(pen_shape_tuple) == 2:
+        dim0, dim1 = pen_shape_tuple[0], pen_shape_tuple[1]
+        if axis == 0:
+            start = _index_reset(start, dim0)
+            stop = _index_reset(stop, dim0)
+        if axis == 1:
+            start = _index_reset(start, dim1)
+            stop = _index_reset(stop, dim1)
+    else:
+        raise PermissionError("Invalid shape")
+
+    # handle condition that a[k, l], k>=l for 2-d array
+    # will cause the result shape to be (0, dim1)
+    if axis == 0 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        return _pi_init_ss(
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            (0, dim1),
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
+    # handle condition that a[:, k, l] k>=l for 2-d array
+    # will cause the result shape to be (dim0, 0)
+    if axis == 1 and start >= stop:
+        res_pen, res_base, res_exp = None, None, None
+        res_shape_tuple = (dim0, 0) if len(pen_shape_tuple) == 2 else (0,)
+        return _pi_init_ss(
+            None,
+            res_pen,
+            res_base,
+            res_exp,
+            0,
+            None,
+            res_shape_tuple,
+            store.mem_type,
+            store.data_type,
+            store.encode_n,
+            store.encode_max_int,
+        )
+    # handle the normal slice
+    res_shape_tuple = ()
+    vec_size = 0
+    '''useful paras'''
+    bigint_row_bytelen = dim1 * PLAIN_BYTE
+    uint32_row_bytelen = dim1 * U_INT32_BYTE
+    gap_length = stop - start
+    # start slice
+    if axis == 1:
+        'axis == 1 means that we need to cut the matrix vertically'
+        res_bigint_row_bytelen = gap_length * PLAIN_BYTE
+        res_uint32_row_bytelen = gap_length * U_INT32_BYTE
+        # malloc space for result
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(c_size_t(res_bigint_row_bytelen * dim0))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(res_uint32_row_bytelen * dim0))
+            res_exp = GPU_LIB.c_malloc(c_size_t(res_uint32_row_bytelen * dim0))
+        else:
+            res_pen = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        # call the raw function
+        GPU_LIB.slice_vertical(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
+        if len(pen_shape_tuple) == 1:
+            res_shape_tuple = (gap_length,)
+            vec_size = res_shape_tuple[0]
+        else:
+            res_shape_tuple = (dim0, gap_length)
+            vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    elif axis == 0:
+        'axis == 0 means that we nned to cut the matrix horizontally'
+        if res_store is None:
+            res_pen = GPU_LIB.c_malloc(
+                c_size_t(bigint_row_bytelen * gap_length))
+            res_base = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+            res_exp = GPU_LIB.c_malloc(
+                c_size_t(uint32_row_bytelen * gap_length))
+        else:
+            res_pen = res_store.bigint_storage
+            res_base = res_store.base_storage
+            res_exp = res_store.exp_storage
+        GPU_LIB.slice_horizontal(
+            c_char_p(src_pen),
+            c_void_p(src_base),
+            c_void_p(src_exp),
+            c_char_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(dim0),
+            c_size_t(dim1),
+            c_size_t(start),
+            c_size_t(stop),
+            c_size_t(CIPHER_BITS),
+            c_uint32(device_type),
+        )
+        # since 1-dim shape will not occur here, result shape is always 2-D
+        res_shape_tuple = (gap_length, dim1)
+        vec_size = res_shape_tuple[0] * res_shape_tuple[1]
+    else:
+        raise NotImplementedError()
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        res_shape,
+        res_shape_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def fp_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    Concatenate several FixedPointStorage objects along the given axis
+    --------------------
+    Para:
+        stores: List or ndarray of FixedPointStorage (at least 2); they must
+                share data_type, encode_n, max_int and mem_type
+        shapes: List or ndarray of TensorShapeStorage, one per store
+        axis:   int, 0 means a vertical stack (along the 1st dim),
+                     1 means a horizontal stack (along the 2nd dim)
+        res_store: FixedPointStorage or None; None makes new buffers get malloc'ed
+        res_shape: ignored on input, rebuilt here from the computed dimensions
+    Return:
+        tuple, (FixedPointStorage, TensorShapeStorage), as built by _fp_init_ss
+    Raise:
+        PermissionError: invalid input data, invalid axis or misaligned shapes
+        NotImplementedError: axis 1 and the first shape has more than 2 dims
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    res_vec_size = np.sum([v.vec_size for v in stores])
+    # Anomaly checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    for v in stores:
+        if v.data_type != stores[0].data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != stores[0].encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.max_int != stores[0].max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != stores[0].mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # num_rows / num_cols are the dimensions handed to the C functions;
+    # res_rows / res_cols describe the returned shape, meant to match numpy.
+    # They are kept apart so the two levels do not interfere with each other.
+    if axis == 0:
+        first_shape_decomposed = __shape_decompose(shapes[0])
+        num_rows, num_cols = 0, first_shape_decomposed[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows = num_rows
+        res_cols = num_cols
+    elif axis == 1:
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            num_rows, num_cols = 1, 0
+            for v in shapes:
+                if len(v.to_tuple()) == 0:
+                    num_cols += 1  # a 0-D operand contributes one element
+                if len(v.to_tuple()) == 1:
+                    num_cols += v.to_tuple()[0]
+                if len(v.to_tuple()) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+            res_rows = num_cols
+            res_cols = None  # only one dimension is passed on for this case
+        elif len(first_shape) == 2:
+            num_rows, num_cols = first_shape[0], 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                num_cols += v_shape[1]
+            res_rows = num_rows
+            res_cols = num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    fpn_pointers = [c_void_p(v.bigint_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+
+    if res_store is None:
+        res_fpn = GPU_LIB.c_malloc(c_size_t(PLAIN_BYTE * res_vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_fpn = res_store.bigint_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+
+    fpn_arr = (c_void_p * num_stores)(*fpn_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_size_t * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        '''means that we should cat stores vertically'''
+        GPU_LIB.vstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_cols),
+            c_size_t(PLAIN_BITS),
+        )
+    elif axis == 1:
+        '''means that we should cat stores horizontally'''
+        GPU_LIB.hstack(
+            fpn_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_fpn),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_rows),
+            c_size_t(PLAIN_BITS),
+        )
+    else:
+        raise NotImplementedError()  # not reached: invalid axis raised above
+
+    return _fp_init_ss(
+        res_store,
+        res_fpn,
+        res_base,
+        res_exp,
+        int(round(res_vec_size)),  # np.sum result converted to a plain int
+        stores[0].encode_n,
+        stores[0].max_int,
+        res_shape,
+        res_shape.to_tuple(),
+        stores[0].mem_type,
+        stores[0].data_type,
+    )
+
+
+def pi_cat(stores, shapes, axis, res_store, res_shape):
+    '''
+    Concatenate several PaillierEncryptedStorage objects along the given axis
+    --------------------
+    Para:
+        stores: List or ndarray of PaillierEncryptedStorage (at least 2); they
+                must share data_type, encode_n, encode_max_int and mem_type
+        shapes: List or ndarray of TensorShapeStorage, one per store
+        axis:   int, 0 means a vertical stack (along the 1st dim),
+                     1 means a horizontal stack (along the 2nd dim)
+        res_store: PaillierEncryptedStorage or None; None mallocs new buffers
+        res_shape: ignored on input, rebuilt here from the computed dimensions
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage), from _pi_init_ss
+    Raise:
+        PermissionError: invalid input data, invalid axis or misaligned shapes
+        NotImplementedError: axis 1 and the first shape has more than 2 dims
+    '''
+    stores = list(stores)
+    shapes = list(shapes)
+    num_stores = len(stores)
+    res_vec_size = np.sum([v.vec_size for v in stores])
+
+    # Anomaly checks
+    if num_stores < 2:
+        raise PermissionError("At least 2 Storages required for concatenation")
+    if len(shapes) != num_stores:
+        raise PermissionError(
+            "The number of storages and that of shapes didn't match")
+    for v in stores:
+        if v.data_type != stores[0].data_type:
+            raise PermissionError(
+                "All storages should have the same data type")
+        if v.encode_n != stores[0].encode_n:
+            raise PermissionError("All storages should have the same n")
+        if v.encode_max_int != stores[0].encode_max_int:
+            raise PermissionError("All storages should have the same max_int")
+        if v.mem_type != stores[0].mem_type:
+            raise PermissionError(
+                "All storages should have the same memory type")
+    # num_rows / num_cols are the dimensions handed to the C functions;
+    # res_rows / res_cols describe the returned shape, meant to match numpy.
+    # They are kept apart so the two levels do not interfere with each other.
+    if axis == 0:
+        first_shape_decomposed = __shape_decompose(shapes[0])
+        num_rows, num_cols = 0, first_shape_decomposed[1]
+        for v in shapes:
+            shape_tuple = __shape_decompose(v)
+            if shape_tuple[1] != num_cols:
+                raise PermissionError("Shapes didn't align")
+            num_rows += shape_tuple[0]
+        res_rows = num_rows
+        res_cols = num_cols
+    elif axis == 1:
+        '''the horizontal cat'''
+        first_shape = shapes[0].to_tuple()
+        if len(first_shape) <= 1:
+            num_rows = 1
+            num_cols = 0
+            for v in shapes:
+                if len(v.to_tuple()) == 0:
+                    num_cols += 1  # a 0-D operand contributes one element
+                if len(v.to_tuple()) == 1:
+                    num_cols += v.to_tuple()[0]
+                if len(v.to_tuple()) >= 2:
+                    raise PermissionError("Shape cannot align!!!")
+            res_rows = num_cols
+            res_cols = None  # only one dimension is passed on for this case
+        elif len(first_shape) == 2:
+            num_rows = first_shape[0]
+            num_cols = 0
+            for v in shapes:
+                v_shape = v.to_tuple()
+                if len(v_shape) != 2 or num_rows != v_shape[0]:
+                    raise PermissionError("Shape cannot align!")
+                # every operand shares num_rows, so only the columns grow
+                num_cols += v_shape[1]
+            res_rows = num_rows
+            res_cols = num_cols
+        else:
+            raise NotImplementedError("Now only support up to 2-D array")
+    else:
+        raise PermissionError("Invalid Axis")
+    res_shape = TensorShapeStorage(res_rows, res_cols)
+
+    pen_pointers = [c_void_p(v.pen_storage) for v in stores]
+    base_pointers = [c_void_p(v.base_storage) for v in stores]
+    exp_pointers = [c_void_p(v.exp_storage) for v in stores]
+
+    if res_store is None:
+        res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * res_vec_size))
+        res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+        res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * res_vec_size))
+    else:
+        res_pen = res_store.pen_storage
+        res_base = res_store.base_storage
+        res_exp = res_store.exp_storage
+    # pack the per-store pointers and sizes into C arrays for the stack call
+    pen_arr = (c_void_p * num_stores)(*pen_pointers)
+    base_arr = (c_void_p * num_stores)(*base_pointers)
+    exp_arr = (c_void_p * num_stores)(*exp_pointers)
+    vec_sizes = (c_size_t * num_stores)(*[v.vec_size for v in stores])
+
+    if axis == 0:
+        GPU_LIB.vstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_cols),
+            c_size_t(CIPHER_BITS),
+        )
+    elif axis == 1:
+        GPU_LIB.hstack(
+            pen_arr,
+            base_arr,
+            exp_arr,
+            c_void_p(res_pen),
+            c_void_p(res_base),
+            c_void_p(res_exp),
+            c_size_t(num_stores),
+            vec_sizes,
+            c_size_t(num_rows),
+            c_size_t(CIPHER_BITS),
+        )
+    else:
+        raise NotImplementedError()  # not reached: invalid axis raised above
+
+    return _pi_init_ss(
+        res_store,
+        res_pen,
+        res_base,
+        res_exp,
+        int(round(res_vec_size)),  # np.sum result converted to a plain int
+        res_shape,
+        res_shape.to_tuple(),
+        stores[0].mem_type,
+        stores[0].data_type,
+        stores[0].encode_n,
+        stores[0].encode_max_int,
+    )
+
+
+def bi_p2c(data, res):
+    '''
+    Serialize big integers into the C memory whose address is res
+    -------------------
+    Para:
+        data: ndarray, every element must provide int.to_bytes
+        res:  int, address of the destination C memory
+    Return:
+        None; element i is stored little-endian at res + i * CIPHER_BYTE
+    '''
+    count = data.size
+    for idx in range(count):
+        # each element fills one fixed-width slot of CIPHER_BYTE bytes
+        slot = c_void_p(res + idx * CIPHER_BYTE)
+        payload = c_char_p(data[idx].to_bytes(CIPHER_BYTE, 'little'))
+        width = c_size_t(CIPHER_BYTE)
+        GPU_LIB.c_memcpy(slot, payload, width)
+
+
+def bi_gen_rand(elem_size, count, res, rand_seed, stream=None):
+    '''
+    Generate seeded random big integers, used for pi_obfuscation
+    ------------------
+    Para:
+        elem_size: int, each value is drawn from [1, 8 ** elem_size)
+        count:     int, number of random bigint to be generated
+        res:       BigintStorage or None; None allocates a new C buffer
+        rand_seed: seeds the global `random` generator; stream is unused
+    Return:
+        the result of _bi_init_store(res, ...), presumably a BigintStorage
+    '''
+    # dtype=object keeps every element a Python int: without it numpy may
+    # pick int64/uint64/float64, whose scalars have no to_bytes() for bi_p2c
+    random.seed(rand_seed)
+    rands = np.asarray([random.randrange(1, 8 ** elem_size)
+                        for _ in range(count)], dtype=object)
+    if res is None:
+        data_storage = GPU_LIB.c_malloc(c_size_t(count * CIPHER_BYTE))
+    else:
+        data_storage = res.bigint_storage
+    bi_p2c(rands, data_storage)
+    # CIPHER_BYTE is the upper bound of the length of the rand number
+    return _bi_init_store(res, data_storage, count, CIPHER_BYTE, MEM_DEVICE)
+
+
+def __get_shape_size(shape_tuple):
+    '''Return the element count of a 0-D, 1-D or 2-D shape tuple'''
+    rank = len(shape_tuple)
+    if rank > 2:
+        raise PermissionError("Invalid Shape Tuple")
+    if rank == 2:
+        return shape_tuple[0] * shape_tuple[1]
+    if rank == 1:
+        return shape_tuple[0]
+    # an empty tuple describes a scalar, which holds exactly one element
+    return 1
+
+
+def pi_reshape(
+        store,
+        shape,
+        new_shape,
+        res_store=None,
+        res_shape=None,
+        stream=None):
+    '''
+    Give a PaillierEncryptedStorage a new shape with the same element count.
+    The three buffers are copied unchanged, only the shape differs.
+    -------------------
+    Paras:
+        store, shape:  PaillierEncryptedStorage, TensorShapeStorage
+        new_shape:     TensorShapeStorage, the shape to apply
+        res_store:     destination storage; None allocates new buffers
+        res_shape:     forwarded to _pi_init_ss; stream is unused
+    Returns:
+        tuple: (PaillierEncryptedStorage, TensorShapeStorage)
+    Raise:
+        ValueError:    If shape and new_shape's size is unequal
+    '''
+    target_tuple = new_shape.to_tuple()
+    source_tuple = shape.to_tuple()
+    target_size = __get_shape_size(target_tuple)
+    source_size = __get_shape_size(source_tuple)
+    elem_cnt = store.vec_size
+    if target_size != source_size:
+        raise ValueError("total size of new array must be unchanged!")
+
+    # byte lengths of the ciphertext buffer and of the two uint32 buffers
+    cipher_bytes = CIPHER_BYTE * elem_cnt
+    uint32_bytes = U_INT32_BYTE * elem_cnt
+    # copy into separate buffers rather than aliasing the source ones,
+    # so that python never frees the same C memory twice
+    if res_store is not None:
+        dst_pen = res_store.pen_storage
+        dst_base = res_store.base_storage
+        dst_exp = res_store.exp_storage
+    else:
+        dst_pen = GPU_LIB.c_malloc(c_size_t(cipher_bytes))
+        dst_base = GPU_LIB.c_malloc(c_size_t(uint32_bytes))
+        dst_exp = GPU_LIB.c_malloc(c_size_t(uint32_bytes))
+
+    # (destination, source, byte length), copied in pen/base/exp order
+    copy_plan = (
+        (dst_pen, store.pen_storage, cipher_bytes),
+        (dst_base, store.base_storage, uint32_bytes),
+        (dst_exp, store.exp_storage, uint32_bytes),
+    )
+    for dst, src, byte_len in copy_plan:
+        GPU_LIB.c_memcpy(c_void_p(dst), c_void_p(src), c_size_t(byte_len))
+
+    return _pi_init_ss(
+        res_store,
+        dst_pen,
+        dst_base,
+        dst_exp,
+        store.vec_size,
+        res_shape,
+        target_tuple,
+        store.mem_type,
+        store.data_type,
+        store.encode_n,
+        store.encode_max_int,
+    )
+
+
+def pi_accumulate(gpu_pubkey, pubkey_n, left_store, left_shape):
+    '''
+    Perform accumulate add for a vector through GPU_LIB.gmp_accumulate
+    ----------------
+    Paras:
+        gpu_pubkey:  Dev_PubKeyStorage,
+        pubkey_n:    big int, n in PaillierPublicKey
+        left_store:  PaillierEncryptedStorage
+        left_shape:  TensorShapeStorage, its tuple is reused for the result
+    Return:
+        tuple:       (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    src_pen = left_store.pen_storage
+    src_base = left_store.base_storage
+    src_exp = left_store.exp_storage
+    vec_size = left_store.vec_size
+    # convert inputs before allocating so a failing conversion cannot leak
+    res_shape_tuple = left_shape.to_tuple()
+    c_pubkey_n = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
+
+    res_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * vec_size))
+    res_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+    res_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * vec_size))
+
+    GPU_LIB.gmp_accumulate(
+        c_char_p(src_pen),
+        c_void_p(src_base),
+        c_void_p(src_exp),
+        c_void_p(res_pen),
+        c_void_p(res_base),
+        c_void_p(res_exp),
+        c_size_t(vec_size),
+        c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr),
+        c_pubkey_n,
+    )
+
+    return _pi_init_ss(
+        None,
+        res_pen,
+        res_base,
+        res_exp,
+        vec_size,
+        None,
+        res_shape_tuple,
+        left_store.mem_type,
+        left_store.data_type,
+        left_store.encode_n,
+        left_store.encode_max_int,
+    )
+
+
+def pi_add_with_index(
+        gpu_pubkey, pubkey_n, l_store, l_shape, r_store, r_shape, valid_index
+):
+    '''
+    Add one PaillierEncryptedNumber onto a chosen position of a vector
+    ----------------------------
+    Para:
+        gpu_pubkey:  Dev_PubKeyStorage,
+        pubkey_n:    big int, n in PaillierPublicKey
+        l_store:     PaillierEncryptedStorage, the vector
+        l_shape:     TensorShapeStorage, reused as the result's shape
+        r_store:     PaillierEncryptedStorage, must have vec_size == 1
+        r_shape:     TensorShapeStorage, not read by this function
+        valid_index: int, the index offset in l_store
+                          that r_store should be added to.
+    Return:
+        tuple, (PaillierEncryptedStorage, TensorShapeStorage)
+    '''
+    # only a single right-hand element is supported
+    if r_store.vec_size != 1:
+        raise NotImplementedError(
+            "Now only support r_store with only one vector size")
+
+    elem_cnt = l_store.vec_size
+    n_bytes = c_char_p(pubkey_n.to_bytes(CIPHER_BYTE, "little"))
+    out_shape_tuple = l_shape.to_tuple()
+    # operand buffers: (pen, base, exp) for the vector and for the number
+    lhs_pen, lhs_base, lhs_exp = (
+        l_store.pen_storage, l_store.base_storage, l_store.exp_storage)
+    rhs_pen, rhs_base, rhs_exp = (
+        r_store.pen_storage, r_store.base_storage, r_store.exp_storage)
+    # result buffers are sized like the left operand
+    out_pen = GPU_LIB.c_malloc(c_size_t(CIPHER_BYTE * elem_cnt))
+    out_base = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * elem_cnt))
+    out_exp = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * elem_cnt))
+    # marshal the arguments, then hand them to the C routine in one call
+    c_args = (
+        c_char_p(lhs_pen),
+        c_void_p(lhs_base),
+        c_void_p(lhs_exp),
+        c_char_p(rhs_pen),
+        c_void_p(rhs_base),
+        c_void_p(rhs_exp),
+        c_void_p(out_pen),
+        c_void_p(out_base),
+        c_void_p(out_exp),
+        c_size_t(elem_cnt),
+        c_size_t(valid_index),
+        c_size_t(CIPHER_BITS),
+        c_void_p(gpu_pubkey.pub_key_ptr),
+        n_bytes,
+    )
+    GPU_LIB.pen_add_with_index(*c_args)
+
+    return _pi_init_ss(
+        None,
+        out_pen,
+        out_base,
+        out_exp,
+        elem_cnt,
+        None,
+        out_shape_tuple,
+        l_store.mem_type,
+        l_store.data_type,
+        l_store.encode_n,
+        l_store.encode_max_int,
+    )
+
+
+def pi_partition_by_index(l_store, valid_index, valid_cnt=None):
+    '''
+    Rearrange the store to a number of stores according to valid_index
+    For instance, l_store with value [A,B,C,D,E] and valid_index [0,1,2,-1,2]
+    will become [[A],[B],[C,E]]; an index of -1 is counted into no bin
+    -----------------
+    Para:
+        l_store:     PaillierEncryptedStorage
+        valid_index: List[int], indicating the data in l_store belongs to which bin
+        valid_cnt:   List[int] or None, each bin's length; counted here if None
+    Return:
+        List with one _pi_init_ss result per bin (None buffers for empty bins)
+    '''
+    src_pen = l_store.pen_storage
+    src_base = l_store.base_storage
+    src_exp = l_store.exp_storage
+    vec_size = l_store.vec_size
+    valid_store = te_p2c(valid_index, None)
+    # if not pre-counted, then calculate valid_cnt here
+    if valid_cnt is None:
+        # default=-1 keeps max() from raising on an empty valid_index
+        bin_cnt = max(valid_index, default=-1) + 1
+        valid_cnt = [0] * bin_cnt
+        for bin_idx in valid_index:
+            # -1 marks an element that is counted into no bin
+            if bin_idx != -1:
+                valid_cnt[bin_idx] += 1
+    bin_cnt = len(valid_cnt)
+    # allocate one output buffer triple per non-empty bin
+    res_pen_list, res_base_list, res_exp_list = [], [], []
+    for i in range(bin_cnt):
+        if valid_cnt[i] > 0:
+            res_pen_list.append(
+                GPU_LIB.cuda_malloc(c_size_t(CIPHER_BYTE * valid_cnt[i]))
+            )
+            # every bin reuses the leading base/exp entries of the source;
+            # this assumes h2d already aligned all elements to max_exp
+            base_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            exp_ptr = GPU_LIB.c_malloc(c_size_t(U_INT32_BYTE * valid_cnt[i]))
+            GPU_LIB.c_memcpy(
+                c_void_p(base_ptr),
+                c_void_p(src_base),
+                c_size_t(U_INT32_BYTE * valid_cnt[i]),
+            )
+            GPU_LIB.c_memcpy(
+                c_void_p(exp_ptr),
+                c_void_p(src_exp),
+                c_size_t(U_INT32_BYTE * valid_cnt[i]),
+            )
+            res_base_list.append(base_ptr)
+            res_exp_list.append(exp_ptr)
+        else:
+            res_pen_list.append(None)
+            res_base_list.append(None)
+            res_exp_list.append(None)
+    pen_ptr_list = [c_void_p(x) for x in res_pen_list]
+    cipher_arr = (c_void_p * bin_cnt)(*pen_ptr_list)
+    GPU_LIB.partition_by_index(
+        c_char_p(src_pen),
+        cipher_arr,
+        c_void_p(valid_store.data),
+        c_uint32(vec_size),
+        c_uint32(bin_cnt),
+    )
+    # wrap each bin's buffers; all bins inherit the metadata of l_store
+    res_list = []
+    for i in range(bin_cnt):
+        res_list.append(
+            _pi_init_ss(
+                None,
+                res_pen_list[i],
+                res_base_list[i],
+                res_exp_list[i],
+                valid_cnt[i],
+                None,
+                (valid_cnt[i],),
+                l_store.mem_type,
+                l_store.data_type,
+                l_store.encode_n,
+                l_store.encode_max_int,
+            )
+        )
+    return res_list
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py
new file mode 100644
index 0000000000..6bd5049364
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/gpu_tensor.py
@@ -0,0 +1,511 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import typing
+
+import numpy as np
+
+from .gpu_engine import (
+    PaillierEncryptedStorage,
+    TensorShapeStorage,
+    pi_add,
+    te_p2c,
+    fp_encode,
+    pi_encrypt,
+    pi_mul,
+    pi_matmul,
+    pi_rmatmul,
+    pi_sum,
+    pi_h2d_pub_key,
+    pi_p2c_pub_key,
+    pi_decrypt,
+    te_c2p,
+    pi_h2d_priv_key,
+    pi_p2c_priv_key,
+)
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierPublicKey,
+    PaillierPrivateKey,
+    PaillierKeypair,
+)
+
+
+class Cipherblock:
+    """Encrypted tensor: a Paillier ciphertext store plus its shape and key.
+
+    Arithmetic methods delegate (directly or through a sibling method) to
+    the ``gpu_engine`` functions imported by this module and wrap the
+    resulting ``(store, shape)`` pair in a new ``Cipherblock`` that reuses
+    this block's ``pk``.
+
+    Method names carry a dtype suffix (``f64``/``f32``/``i64``/``i32``), but
+    inside this class every suffix of one operation forwards to the same
+    private helper.  Only the ``*_scalar_*`` variants use the suffix: it
+    selects the dtype of the one-element NumPy array that wraps the scalar.
+    The ``*_par`` variants simply forward to the method of the same name
+    without the ``_par`` suffix.
+    """
+    def __init__(
+            self,
+            store: PaillierEncryptedStorage,
+            shape: TensorShapeStorage,
+            pk: "PK"):
+        # store: encrypted payload handed to the pi_* engine functions
+        # shape: shape descriptor handed to the pi_* engine functions
+        # pk:    key wrapper; ``pk.pub_key`` and ``pk.gpu_pub_key`` are read below
+        self.store = store
+        self.shape = shape
+        self.pk = pk
+
+    def get_shape(self):
+        """Return ``self.shape.to_tuple()``."""
+        return self.shape.to_tuple()
+
+    def get_size(self):
+        """Return ``self.shape.size()``."""
+        return self.shape.size()
+
+    @staticmethod
+    def gen_shape(other):
+        """Build a ``TensorShapeStorage`` from ``other.shape``."""
+        return TensorShapeStorage().from_tuple(other.shape)
+
+    def _add_plaintext(self, other) -> "Cipherblock":
+        """Add plaintext ``other`` to this block.
+
+        ``other`` is converted with ``te_p2c``, encoded with ``fp_encode``
+        (using the public key's ``n`` and ``max_int``), encrypted with
+        ``pi_encrypt`` and then combined with this block's store by
+        ``pi_add``.  ``other`` must expose ``.shape`` (see ``gen_shape``).
+        """
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        pi_store = pi_encrypt(self.pk.gpu_pub_key, fp_store)
+        res_store, res_shape = pi_add(
+            self.pk.gpu_pub_key, self.store, pi_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _mul_plaintext(self, other) -> "Cipherblock":
+        """Multiply this block by plaintext ``other`` via ``pi_mul``.
+
+        Unlike ``_add_plaintext`` the operand is only encoded
+        (``fp_encode``), not encrypted, before being passed on.
+        """
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_mul(
+            self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _matmul_plaintext(self, other) -> "Cipherblock":
+        """Call ``pi_matmul`` with this block first and encoded ``other`` second."""
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_matmul(
+            self.pk.gpu_pub_key, self.store, fp_store, self.shape, self.gen_shape(other))
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def _rmatmul_plaintext(self, other) -> "Cipherblock":
+        """Call ``pi_rmatmul`` with encoded ``other`` first and this block second.
+
+        Note the swapped argument order compared with ``_matmul_plaintext``:
+        the plaintext store/shape precede this block's store/shape
+        (presumably computing ``other @ self`` -- confirm against
+        ``pi_rmatmul``).
+        """
+        fp_store = fp_encode(
+            te_p2c(other),
+            self.pk.pub_key.n,
+            self.pk.pub_key.max_int)
+        res_store, res_shape = pi_rmatmul(
+            self.pk.gpu_pub_key, fp_store, self.store, self.gen_shape(other), self.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    # ---- addition ---------------------------------------------------------
+
+    def add_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        """Add another ``Cipherblock`` with ``pi_add``, using this block's key."""
+        res_store, res_shape = pi_add(
+            self.pk.gpu_pub_key, self.store, other.store, self.shape, other.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    # Array operands: all four dtype variants forward to _add_plaintext.
+    def add_plaintext_f64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_f32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i64(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    def add_plaintext_i32(self, other) -> "Cipherblock":
+        return self._add_plaintext(other)
+
+    # Scalar operands: wrapped in a one-element array of the named dtype.
+    def add_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._add_plaintext(other_array)
+
+    def add_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._add_plaintext(other_array)
+
+    # ---- subtraction: negate the operand, then add -------------------------
+
+    def sub_cipherblock(self, other: "Cipherblock") -> "Cipherblock":
+        """Subtract ``other`` by adding ``other`` multiplied by the scalar -1."""
+        return self.add_cipherblock(other.mul_plaintext_scalar_i32(-1))
+
+    def sub_plaintext_f64(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other * -1)
+
+    def sub_plaintext_f32(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other * -1)
+
+    def sub_plaintext_i64(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other * -1)
+
+    def sub_plaintext_i32(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other * -1)
+
+    def sub_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other * -1)
+
+    def sub_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other * -1)
+
+    def sub_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other * -1)
+
+    def sub_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other * -1)
+
+    # ---- multiplication by plaintext: forwards to _mul_plaintext ------------
+
+    def mul_plaintext_f64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_f32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i64(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    def mul_plaintext_i32(self, other) -> "Cipherblock":
+        return self._mul_plaintext(other)
+
+    # Scalar operands: wrapped in a one-element array of the named dtype.
+    def mul_plaintext_scalar_f64(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float64)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_f32(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.float32)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i64(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int64)
+        return self._mul_plaintext(other_array)
+
+    def mul_plaintext_scalar_i32(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        other_array = np.asarray([other], dtype=np.int32)
+        return self._mul_plaintext(other_array)
+
+    # ---- matmul: ix2 and ix1 variants all forward to _matmul_plaintext -----
+
+    def matmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    def matmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._matmul_plaintext(other)
+
+    # ---- rmatmul: ix2 and ix1 variants all forward to _rmatmul_plaintext ---
+
+    def rmatmul_plaintext_ix2_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix2_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_f32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i64(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    def rmatmul_plaintext_ix1_i32(self, other) -> "Cipherblock":
+        return self._rmatmul_plaintext(other)
+
+    # ---- reductions ----------------------------------------------------------
+
+    def sum(self) -> "Cipherblock":
+        """Reduce with ``pi_sum`` without passing an axis argument."""
+        res_store, res_shape = pi_sum(
+            self.pk.gpu_pub_key, self.store, self.shape)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def sum_axis(self, axis=None):
+        """Reduce with ``pi_sum``, forwarding ``axis`` as its fourth argument."""
+        res_store, res_shape = pi_sum(
+            self.pk.gpu_pub_key, self.store, self.shape, axis)
+        return Cipherblock(res_store, res_shape, self.pk)
+
+    def mean(self) -> "Cipherblock":
+        """Return ``sum()`` scaled by ``1 / get_size()``.
+
+        Raises ``ZeroDivisionError`` when ``get_size()`` returns 0.
+        """
+        return self.sum().mul_plaintext_scalar_f64(float(1 / self.get_size()))
+
+    # The bare string below is an expression statement kept from the original
+    # source; it marks the start of the ``*_par`` section.  Each ``*_par``
+    # method forwards to the method of the same name without the suffix.
+    """parallel"""
+
+    def add_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.add_cipherblock(other)
+
+    def add_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f64(other)
+
+    def add_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_f32(other)
+
+    def add_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i64(other)
+
+    def add_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f64(other)
+
+    def add_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_f32(other)
+
+    def add_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i64(other)
+
+    def add_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.add_plaintext_scalar_i32(other)
+
+    def add_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.add_plaintext_i32(other)
+
+    def sub_cipherblock_par(self, other: "Cipherblock") -> "Cipherblock":
+        return self.sub_cipherblock(other)
+
+    def sub_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f64(other)
+
+    def sub_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_f32(other)
+
+    def sub_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i64(other)
+
+    def sub_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.sub_plaintext_i32(other)
+
+    def sub_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f64(other)
+
+    def sub_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_f32(other)
+
+    def sub_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i64(other)
+
+    def sub_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.sub_plaintext_scalar_i32(other)
+
+    def mul_plaintext_f64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f64(other)
+
+    def mul_plaintext_f32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_f32(other)
+
+    def mul_plaintext_i64_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i64(other)
+
+    def mul_plaintext_i32_par(self, other) -> "Cipherblock":
+        return self.mul_plaintext_i32(other)
+
+    def mul_plaintext_scalar_f64_par(
+        self, other: typing.Union[float, np.float64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f64(other)
+
+    def mul_plaintext_scalar_f32_par(
+        self, other: typing.Union[float, np.float32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_f32(other)
+
+    def mul_plaintext_scalar_i64_par(
+        self, other: typing.Union[int, np.int64]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i64(other)
+
+    def mul_plaintext_scalar_i32_par(
+        self, other: typing.Union[int, np.int32]
+    ) -> "Cipherblock":
+        return self.mul_plaintext_scalar_i32(other)
+
+    def matmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f64(other)
+
+    def matmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_f32(other)
+
+    def matmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i64(other)
+
+    def matmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix2_i32(other)
+
+    def matmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f64(other)
+
+    def matmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_f32(other)
+
+    def matmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i64(other)
+
+    def matmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.matmul_plaintext_ix1_i32(other)
+
+    def rmatmul_plaintext_ix2_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f64(other)
+
+    def rmatmul_plaintext_ix2_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_f32(other)
+
+    def rmatmul_plaintext_ix2_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i64(other)
+
+    def rmatmul_plaintext_ix2_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix2_i32(other)
+
+    def rmatmul_plaintext_ix1_f64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f64(other)
+
+    def rmatmul_plaintext_ix1_f32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_f32(other)
+
+    def rmatmul_plaintext_ix1_i64_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i64(other)
+
+    def rmatmul_plaintext_ix1_i32_par(self, other) -> "Cipherblock":
+        return self.rmatmul_plaintext_ix1_i32(other)
+
+    def sum_par(self) -> "Cipherblock":
+        return self.sum()
+
+    def mean_par(self) -> "Cipherblock":
+        return self.mean()
+
+
+class PK:
+    """Public-key wrapper that turns NumPy arrays into ``Cipherblock`` objects.
+
+    Keeps the original ``PaillierPublicKey`` in ``pub_key`` and the result of
+    ``pi_h2d_pub_key(pi_p2c_pub_key(...))`` in ``gpu_pub_key`` (presumably the
+    device-side copy of the key -- confirm against ``gpu_engine``).
+    """
+
+    def __init__(self, pub_key: PaillierPublicKey):
+        self.pub_key = pub_key
+        # Two-step conversion: p2c first, then h2d on its result.
+        converted_key = pi_p2c_pub_key(self.pub_key)
+        self.gpu_pub_key = pi_h2d_pub_key(converted_key)
+
+    def _encrypt(self, a) -> Cipherblock:
+        """Encode and encrypt ``a``; shared by every ``encrypt_*`` variant.
+
+        ``a`` must expose ``.shape``; it is passed through ``te_p2c``,
+        ``fp_encode`` (with the key's ``n`` and ``max_int``) and
+        ``pi_encrypt`` in that order.
+        """
+        tensor_shape = TensorShapeStorage().from_tuple(a.shape)
+        encoded = fp_encode(
+            te_p2c(a),
+            self.pub_key.n,
+            self.pub_key.max_int,
+        )
+        encrypted = pi_encrypt(self.gpu_pub_key, encoded)
+        return Cipherblock(encrypted, tensor_shape, self)
+
+    # The dtype suffix is not used for dispatch: all four forward to _encrypt.
+    def encrypt_f64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_f32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i64(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    def encrypt_i32(self, a) -> Cipherblock:
+        return self._encrypt(a)
+
+    # ``*_par`` variants forward to the method without the suffix.
+    def encrypt_f64_par(self, a) -> Cipherblock:
+        return self.encrypt_f64(a)
+
+    def encrypt_f32_par(self, a) -> Cipherblock:
+        return self.encrypt_f32(a)
+
+    def encrypt_i64_par(self, a) -> Cipherblock:
+        return self.encrypt_i64(a)
+
+    def encrypt_i32_par(self, a) -> Cipherblock:
+        return self.encrypt_i32(a)
+
+
+class SK:
+    """Private-key wrapper that decrypts ``Cipherblock`` objects to NumPy arrays.
+
+    Stores the original ``PaillierPrivateKey``, the result of
+    ``pi_h2d_priv_key(pi_p2c_priv_key(...))`` and the matching ``PK``.
+    """
+
+    def __init__(self, priv_key: PaillierPrivateKey, pk: PK):
+        self.priv_key = priv_key
+        # Two-step conversion: p2c first, then h2d on its result.
+        converted_key = pi_p2c_priv_key(priv_key)
+        self.gpu_priv_key = pi_h2d_priv_key(converted_key)
+        self.pk = pk
+
+    def _decrypt(self, a: Cipherblock):
+        """Decrypt ``a`` and reshape the result to ``a.get_shape()``.
+
+        A block whose store reports ``vec_size == 0`` short-circuits to an
+        empty one-dimensional array without calling the engine.  The public
+        key used for decryption is the one attached to ``a``, not ``self.pk``.
+        """
+        store = a.store
+        if store.vec_size == 0:
+            return np.asarray([])
+        decrypted = pi_decrypt(a.pk.gpu_pub_key, self.gpu_priv_key, store)
+        plain = te_c2p(decrypted)
+        return plain.reshape(a.get_shape())
+
+    # Each variant casts the decrypted array to the dtype named in its suffix.
+    def decrypt_f64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float64)
+
+    def decrypt_f32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.float32)
+
+    def decrypt_i64(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int64)
+
+    def decrypt_i32(self, a: Cipherblock):
+        return self._decrypt(a).astype(np.int32)
+
+    # ``*_par`` variants forward to the method without the suffix.
+    def decrypt_f64_par(self, a: Cipherblock):
+        return self.decrypt_f64(a)
+
+    def decrypt_f32_par(self, a: Cipherblock):
+        return self.decrypt_f32(a)
+
+    def decrypt_i64_par(self, a: Cipherblock):
+        return self.decrypt_i64(a)
+
+    def decrypt_i32_par(self, a: Cipherblock):
+        return self.decrypt_i32(a)
+
+
+def keygen(bit_size) -> typing.Tuple[PK, SK]:
+    """Generate a Paillier key pair and wrap it as ``(PK, SK)``.
+
+    ``bit_size`` is forwarded to ``PaillierKeypair.generate_keypair`` as
+    ``n_length``.  The returned ``SK`` holds a reference to the returned ``PK``.
+    """
+    raw_pub, raw_priv = PaillierKeypair.generate_keypair(n_length=bit_size)
+    public = PK(raw_pub)
+    return public, SK(raw_priv, public)
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/tests/__init__.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py
new file mode 100755
index 0000000000..c0a3d0d7c5
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_engine.py
@@ -0,0 +1,707 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import random
+
+import numpy as np
+import unittest
+import functools
+import time
+
+from fate_arch.tensor.impl.blocks.python_paillier_block import (
+    PaillierKeypair,
+    PaillierEncryptedNumber,
+    FixedPointNumber,
+    gmpy_math,
+)
+
+from ..gpu_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_c2p,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free,
+)
+
+# Data type of the random test inputs produced by generate_rand().
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+# Test tensors are NUM_ROWS x NUM_COLS, i.e. TEST_SIZE elements in total.
+NUM_ROWS = 200
+NUM_COLS = 200
+TEST_SIZE = NUM_ROWS * NUM_COLS
+# Bit length shown in the performance reports and used to derive DATA_SIZE.
+# NOTE(review): setUpClass calls generate_keypair() without n_length, so this
+# value is assumed to match the generator's default -- confirm.
+KEY_LEN = 1024
+# Byte count reported as "Data Size": TEST_SIZE values of 2 * KEY_LEN bits
+# each (presumably the ciphertext width modulo n^2 -- TODO confirm).
+DATA_SIZE = TEST_SIZE * KEY_LEN * 2 // 8
+# Upper bound on the relative difference accepted by assert_diff().
+ERROR_TOLERANCE = 1e-10
+
+
+class TestCaseReport:
+    """Collect timings for one benchmarked operation and render a text table.
+
+    The table is ``width`` characters wide and has up to ``len(column)``
+    cells per row; ``column`` holds the width of each cell including its
+    leading ``|``.  ``cpu_throughput`` and ``gpu_throughput`` are filled in
+    by the caller and feed the speed-up line of the summary.
+    """
+
+    def __init__(self, name, batch_size, bit_len, data_size):
+        self.name = name
+        self.batch_size = batch_size
+        self.bit_len = bit_len
+        self.data_size = int(data_size)
+        # report name -> item name -> {'time': ..., 'ops': ..., 'bw': ...}
+        self.content = {}
+        self.width = 100
+        self.column = [30, 20, 25, 24]
+        self.cpu_throughput = 0.0
+        self.gpu_throughput = 0.0
+        # GPU/CPU speed-up; recomputed by dump_summary().
+        self.ratio = 0.0
+
+    def add_perf_report(self, name):
+        """Start (or reset) the sub-report called ``name``."""
+        self.content[name] = {}
+
+    def add_item(self, report_name, item_name, time, ops, bw):
+        """Record one measurement row under an existing sub-report.
+
+        ``bw`` is stored as given; dump_item() divides it by 2**20 before
+        printing it in the "Bandwidth (MB/s)" column.
+        """
+        self.content[report_name][item_name] = {
+            'time': time,
+            'ops': ops,
+            'bw': bw,
+        }
+
+    def gen_line(self, *args):
+        """Format ``args`` as one table row exactly ``self.width`` wide.
+
+        Each value is left-aligned in its column; a value longer than its
+        column is not truncated.  Passing more values than there are
+        columns raises ``IndexError``.
+        """
+        cells = []
+        used = 0
+        for index, value in enumerate(args):
+            col_width = self.column[index]
+            cells.append('|' + str(value).ljust(col_width - 1))
+            used += col_width
+        # Pad up to the closing bar for any number of cells.  The previous
+        # ``i < 3`` test skipped the padding for exactly three cells, which
+        # produced a short row.
+        filler = ' ' * (self.width - used - 1)
+        return ''.join(cells) + filler + '|'
+
+    def dump_header(self):
+        """Return the title banner and the "Data Information" rows."""
+        # Split the free space around the title; the right side takes the
+        # extra blank when the amount is odd.
+        free = self.width - len(self.name) - 2
+        title = '|' + ' ' * (free // 2) + self.name + ' ' * ((free + 1) // 2) + '|'
+        rule = '=' * self.width
+        lines = [
+            rule,
+            title,
+            rule,
+            self.gen_line("Data Information"),
+            '-' * self.width,
+            self.gen_line("Batch Size", self.batch_size),
+            self.gen_line("Bit Length", self.bit_len),
+            self.gen_line("Data Size (Bytes)", self.data_size),
+        ]
+        return "\n".join(lines)
+
+    def dump_item(self, report_name, item_name):
+        """Return the formatted row for one recorded measurement."""
+        item = self.content[report_name][item_name]
+        elapsed = "{0:.4f}".format(item['time'])
+        ops = "{0:.4f}".format(item['ops'])
+        # Stored value is scaled down by 2**20 for the MB/s column.
+        bandwidth = "{0:.4f}".format(item['bw'] / (2 ** 20))
+        return self.gen_line(item_name, elapsed, ops, bandwidth)
+
+    def dump_perf_report(self, report_name):
+        """Return the table section (heading plus rows) of one sub-report."""
+        lines = [
+            "=" * self.width,
+            self.gen_line(report_name),
+            "-" * self.width,
+            self.gen_line(
+                "Item",
+                "Time Elapsed(s)",
+                "Operations Per Second",
+                "Bandwidth (MB/s)"),
+            "-" * self.width,
+        ]
+        lines.extend(
+            self.dump_item(report_name, item_name)
+            for item_name in self.content[report_name]
+        )
+        return "\n".join(lines)
+
+    def dump_summary(self):
+        """Return the GPU/CPU speed-up section and store it in ``self.ratio``.
+
+        When ``cpu_throughput`` is zero (its initial value) the ratio is
+        reported as ``inf`` (or ``nan`` if ``gpu_throughput`` is zero too)
+        instead of raising ``ZeroDivisionError``.
+        """
+        if self.cpu_throughput:
+            self.ratio = self.gpu_throughput / self.cpu_throughput
+        elif self.gpu_throughput:
+            self.ratio = float("inf")
+        else:
+            self.ratio = float("nan")
+        lines = [
+            "=" * self.width,
+            self.gen_line("Performance of GPU/CPU"),
+            '-' * self.width,
+            self.gen_line(
+                "GPU/CPU Ratio (Speedup)",
+                "{0:.4f}".format(self.ratio)),
+            "=" * self.width,
+            # Trailing element yields two newlines after the table.
+            '\n',
+        ]
+        return "\n".join(lines)
+
+    def dump_result(self):
+        """Print the header, every sub-report and the summary to stdout."""
+        sections = [self.dump_header()]
+        sections.extend(self.dump_perf_report(name) for name in self.content)
+        sections.append(self.dump_summary())
+        print("\n".join(sections))
+
+
+def generate_rand(test_size):
+    """Return ``test_size`` random samples of the type selected by RAND_TYPE.
+
+    FLOAT_TYPE draws from ``np.random.normal(0, 5, ...)``; INT64_TYPE draws
+    integers from ``[-2**10, 2**10)``.  Any other RAND_TYPE raises
+    ``TypeError``.
+    """
+    if RAND_TYPE == FLOAT_TYPE:
+        return np.random.normal(0, 5, test_size)
+    if RAND_TYPE == INT64_TYPE:
+        bound = 2 ** 10
+        return np.random.randint(-bound, bound, test_size)
+    raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    """Assert that ``res`` and ``ref`` agree within ERROR_TOLERANCE.
+
+    If either value is zero, both must be exactly zero.  Otherwise the
+    difference relative to each of the two values must stay below
+    ERROR_TOLERANCE.
+    """
+    if res == 0 or ref == 0:
+        # No relative error can be formed against zero: require an exact match.
+        assert res == 0
+        assert ref == 0
+        return
+    delta = res - ref
+    for denominator in (res, ref):
+        assert abs(delta / denominator) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref, strict=False):
+    """Compare two arrays element by element with ``assert_diff``.
+
+    The shapes must match (``AssertionError`` otherwise).  Every element
+    that fails ``assert_diff`` is reported on stdout with its flat index.
+
+    Args:
+        res: array under test (labelled "GPU result" in the report).
+        ref: reference array.
+        strict: when true, raise ``AssertionError`` after reporting if any
+            element differed.  The default keeps the original report-only
+            behaviour, which never fails the calling test on element
+            mismatches.
+
+    Returns:
+        The number of mismatching elements.
+    """
+    assert res.shape == ref.shape
+    res, ref = res.flatten(), ref.flatten()
+    mismatches = 0
+    for i, (got, expected) in enumerate(zip(res, ref)):
+        try:
+            assert_diff(got, expected)
+        except AssertionError:
+            mismatches += 1
+            print(
+                "Assertion Error at location",
+                i,
+                ", GPU result:",
+                got,
+                ", reference result:",
+                expected,
+            )
+    if strict and mismatches:
+        raise AssertionError(
+            "{} of {} element(s) differ from the reference".format(
+                mismatches, res.size))
+    return mismatches
+
+
+def profile(func):
+    """Decorator that makes ``func`` return ``(result, elapsed_seconds)``.
+
+    The elapsed time is measured with ``time.perf_counter()``, a monotonic
+    high-resolution clock, so the reading is not affected by system clock
+    adjustments the way ``time.time()`` is.
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.perf_counter()
+        res = func(*args, **kwargs)
+        elapsed = time.perf_counter() - start_time
+        return res, elapsed
+
+    return wrapper
+
+
+def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):
+    """Print GPU and CPU timings, their throughputs and the speed-up.
+
+    Throughput is ``num_instances`` divided by the elapsed time; the
+    speed-up is ``cpu_time / gpu_time``.
+    """
+    for label, elapsed in (("GPU time:", gpu_time), ("CPU time:", cpu_time)):
+        print(label, elapsed, "second(s)")
+    for label, elapsed in (
+        ("GPU throughput:", gpu_time),
+        ("CPU throughput:", cpu_time),
+    ):
+        print(label, num_instances / elapsed, "instance(s) per second")
+    speedup = cpu_time / gpu_time
+    print("Speedup:", speedup)
+
+
+def cpu_pi_gen_obf_seed(
+        res_store,
+        public_key,
+        count,
+        elem_size,
+        rand_seed,
+        stream):
+    """CPU reference for ``pi_gen_obf_seed``: ``count`` values of r^n mod n^2.
+
+    Seeds Python's ``random`` with ``rand_seed``, draws every ``r`` from
+    ``[1, 8 ** elem_size)`` and only then raises each one to
+    ``public_key.n`` modulo ``public_key.nsquare``.  ``res_store`` and
+    ``stream`` are not used in the body.
+    """
+    random.seed(rand_seed)
+    draws = []
+    for _ in range(count):
+        draws.append(random.randrange(1, 8 ** elem_size))
+    seeds = []
+    for draw in draws:
+        seeds.append(
+            gmpy_math.powmod(draw, public_key.n, public_key.nsquare))
+    return seeds
+
+
+def cpu_pi_obfuscate(
+        public_key, encrypted_numbers, obf_seeds, exponents, res_store, stream
+):
+    """CPU reference for ``pi_obfuscate``.
+
+    For every index of ``encrypted_numbers``, multiplies the ciphertext by
+    the obfuscation seed at the same index modulo ``public_key.nsquare``
+    and wraps the product with the matching exponent in a
+    ``PaillierEncryptedNumber``.  ``res_store`` and ``stream`` are not used
+    in the body.
+    """
+    obfuscated = []
+    for i, cipher in enumerate(encrypted_numbers):
+        blinded = (cipher * obf_seeds[i]) % public_key.nsquare
+        obfuscated.append(
+            PaillierEncryptedNumber(public_key, blinded, exponents[i]))
+    return obfuscated
+
+
+def cpu_fp_mul(left, right):
+    """CPU reference for ``fp_mul``: element-wise product of two sequences.
+
+    For each pair the encodings are multiplied modulo the left operand's
+    ``n`` and the exponents are added; ``n`` and ``max_int`` of the result
+    are taken from the left operand.
+    """
+    products = []
+    for i, lhs in enumerate(left):
+        rhs = right[i]
+        encoding = (lhs.encoding * rhs.encoding) % lhs.n
+        exponent = lhs.exponent + rhs.exponent
+        products.append(
+            FixedPointNumber(encoding, exponent, lhs.n, lhs.max_int))
+    return products
+
+
+def add_to_perf_reports(_perf_reports, name, gpu_time, cpu_time, data_size):
+    """Build a ``TestCaseReport`` for one operation and append it to the list.
+
+    Throughput is TEST_SIZE divided by the elapsed time and bandwidth is
+    ``data_size`` divided by the elapsed time, for GPU and CPU alike.  The
+    report is appended only after both sections have been filled in.
+    """
+    report = TestCaseReport(name, TEST_SIZE, KEY_LEN, data_size)
+
+    gpu_ops = TEST_SIZE / gpu_time
+    report.gpu_throughput = gpu_ops
+    report.add_perf_report("GPU Performance")
+    report.add_item(
+        "GPU Performance",
+        "Computation on GPU",
+        gpu_time,
+        gpu_ops,
+        data_size / gpu_time,
+    )
+
+    cpu_ops = TEST_SIZE / cpu_time
+    report.cpu_throughput = cpu_ops
+    report.add_perf_report("CPU Performance")
+    report.add_item(
+        "CPU Performance",
+        "Computation on CPU",
+        cpu_time,
+        cpu_ops,
+        data_size / cpu_time,
+    )
+
+    _perf_reports.append(report)
+
+
+class TestOperators(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Create one key pair for the whole test class and convert it.
+
+        The Python keys go through ``pi_p2c_*`` and the results through
+        ``pi_h2d_*``; all forms are kept on the class, together with an
+        empty list that collects performance reports.
+        """
+        keypair = PaillierKeypair.generate_keypair()
+        cls._pub_key, cls._priv_key = keypair
+        cls.n = cls._pub_key.n
+        cls.max_int = cls._pub_key.max_int
+        # Keep each pi_p2c_* result: it is the input of the matching pi_h2d_*.
+        cls._cpu_pub_key = pi_p2c_pub_key(cls._pub_key)
+        cls._cpu_priv_key = pi_p2c_priv_key(cls._priv_key)
+        cls._gpu_pub_key = pi_h2d_pub_key(cls._cpu_pub_key)
+        cls._gpu_priv_key = pi_h2d_priv_key(cls._cpu_priv_key)
+        cls._perf_reports = []
+        separator = "*" * 100
+        print(
+            "\n\n",
+            separator,
+            "\n\nInitialization complete\nTest Size:",
+            TEST_SIZE)
+
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, _ = TensorShapeStorage(*shape_tuple), TensorShapeStorage(
+            *shape_tuple_T
+        )
+        gpu_bi_store, gpu_bi_store2 = bi_alloc(
+            None, TEST_SIZE, PLAIN_BYTE, MEM_HOST
+        ), bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_alloc(
+            None, TEST_SIZE, MEM_HOST), te_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_fp_store, gpu_fp_store2 = fp_alloc(
+            None, TEST_SIZE, MEM_HOST), fp_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_pi_store, gpu_pi_store2 = pi_alloc(
+            None, TEST_SIZE, MEM_HOST), pi_alloc(
+            None, TEST_SIZE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_p2c(raw, gpu_te_store), te_p2c(
+            raw2, gpu_te_store2
+        )
+
+        print("\n>>>>> fp_encode profiling begins")
+        gpu_encoded, gpu_encode_time = profile(fp_encode)(
+            gpu_te_store, self.n, self.max_int, res=gpu_fp_store
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encode_time
+            )
+        )
+        cpu_encoded, cpu_encode_time = profile(
+            lambda l: [
+                FixedPointNumber.encode(
+                    v, self.n, self.max_int) for v in l])(raw)
+        compare_time(gpu_encode_time, cpu_encode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encode",
+            gpu_encode_time,
+            cpu_encode_time,
+            DATA_SIZE)
+
+        print("\n>>>>> fp_decode profiling begins")
+        gpu_decoded, gpu_decode_time = profile(fp_decode)(
+            gpu_encoded, gpu_te_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decode_time
+            )
+        )
+        cpu_decoded, cpu_decode_time = profile(
+            lambda l: [v.decode() for v in l])(cpu_encoded)
+        compare_time(gpu_decode_time, cpu_decode_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decode",
+            gpu_decode_time,
+            cpu_decode_time,
+            DATA_SIZE)
+
+        # check decoded results
+        assert_ndarray_diff(te_c2p(gpu_decoded), np.asarray(cpu_decoded))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        gpu_encrypted, gpu_encrypt_time = profile(pi_encrypt)(
+            self._gpu_pub_key, gpu_encoded, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_encrypt_time
+            )
+        )
+        cpu_encrypted, cpu_encrypt_time = profile(
+            lambda l: [self._pub_key.raw_encrypt(v.encoding, 1) for v in l]
+        )(cpu_encoded)
+        compare_time(gpu_encrypt_time, cpu_encrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Encrypt",
+            gpu_encrypt_time,
+            cpu_encrypt_time,
+            DATA_SIZE)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        gpu_obf_seeds, gpu_gen_obf_seeds_time = profile(pi_gen_obf_seed)(
+            gpu_bi_store, self._gpu_pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None)
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_gen_obf_seeds_time
+            )
+        )
+        cpu_obf_seeds, cpu_gen_obf_seefs_time = profile(cpu_pi_gen_obf_seed)(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 0, None
+        )
+        compare_time(gpu_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Generate Obfuscators",
+            gpu_gen_obf_seeds_time,
+            cpu_gen_obf_seefs_time,
+            DATA_SIZE,
+        )
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print(
+            "\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively"
+        )
+        gpu_obfuscated, gpu_obfuscate_time = profile(pi_obfuscate)(
+            self._gpu_pub_key, gpu_encrypted, gpu_obf_seeds, gpu_pi_store, None
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_obfuscate_time
+            )
+        )
+        cpu_obfuscated, cpu_obfuscate_time = profile(cpu_pi_obfuscate)(
+            self._pub_key,
+            cpu_encrypted,
+            cpu_obf_seeds,
+            [v.exponent for v in cpu_encoded],
+            None,
+            None,
+        )
+        compare_time(gpu_obfuscate_time, cpu_obfuscate_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Obfuscate",
+            gpu_obfuscate_time,
+            cpu_obfuscate_time,
+            DATA_SIZE,
+        )
+
+        # check intermediate result
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated]),
+        )
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print(
+            "This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n"
+        )
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_decrypted, gpu_decrypt_time = profile(pi_decrypt)(
+            self._gpu_pub_key,
+            self._gpu_priv_key,
+            gpu_obfuscated,
+            gpu_te_store,
+            fps_buffer,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_decrypt_time
+            )
+        )
+        cpu_decrypted, cpu_decrypt_time = profile(
+            lambda l: [self._priv_key.decrypt(v) for v in l]
+        )(cpu_obfuscated)
+        compare_time(gpu_decrypt_time, cpu_decrypt_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Decrypt",
+            gpu_decrypt_time,
+            cpu_decrypt_time,
+            DATA_SIZE)
+
+        # check decrypted results
+        assert_ndarray_diff(te_c2p(gpu_decrypted), np.asarray(cpu_decrypted))
+
+        print("\n>>>>> generating the other array")
+        # encode the other array
+        gpu_encoded2 = fp_encode(
+            gpu_te_store2,
+            self.n,
+            self.max_int,
+            res=gpu_fp_store2)
+        cpu_encoded2 = [
+            FixedPointNumber.encode(
+                v, self.n, self.max_int) for v in raw2]
+        # encrypt the other array
+        gpu_encrypted2 = pi_encrypt(
+            self._gpu_pub_key, gpu_encoded2, gpu_pi_store2, None
+        )
+        cpu_encrypted2 = [
+            self._pub_key.raw_encrypt(v.encoding, 1) for v in cpu_encoded2
+        ]
+        # generate obfuscation seeds (obfuscators) for the other array using a
+        # different random seed
+        gpu_obf_seeds2 = pi_gen_obf_seed(
+            gpu_bi_store2,
+            self._gpu_pub_key,
+            TEST_SIZE,
+            CIPHER_BITS // 6,
+            1,
+            None)
+        cpu_obf_seeds2 = cpu_pi_gen_obf_seed(
+            None, self._pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None
+        )
+        # obfuscate the other array
+        gpu_obfuscated2 = pi_obfuscate(
+            self._gpu_pub_key,
+            gpu_encrypted2,
+            gpu_obf_seeds2,
+            gpu_pi_store2,
+            None)
+        cpu_obfuscated2 = cpu_pi_obfuscate(
+            self._pub_key,
+            cpu_encrypted2,
+            cpu_obf_seeds2,
+            [v.exponent for v in cpu_encoded2],
+            None,
+            None,
+        )
+        # check intermediate result
+        assert_ndarray_diff(
+            np.asarray(pi_c2p(gpu_obfuscated2)[0]),
+            np.asarray([v.ciphertext(False) for v in cpu_obfuscated2]),
+        )
+
+        print("\n>>>>> fp_mul profiling begins")
+        gpu_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (gpu_fp_mul_res, _), gpu_fp_mul_time = profile(fp_mul)(
+            gpu_encoded,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_fp_mul_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_fp_mul_time
+            )
+        )
+        cpu_fp_mul_res, cpu_fp_mul_time = profile(
+            cpu_fp_mul)(cpu_encoded, cpu_encoded2)
+        compare_time(gpu_fp_mul_time, cpu_fp_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Fixed-point Number Multiply",
+            gpu_fp_mul_time,
+            cpu_fp_mul_time,
+            DATA_SIZE * 2,
+        )
+
+        # Compare results
+        received_fp_mul_res = fp_c2p(gpu_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(
+                received_fp_mul_res[i].encoding,
+                cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        (gpu_add_res, _), gpu_add_time = profile(pi_add)(
+            self._gpu_pub_key,
+            gpu_obfuscated,
+            gpu_obfuscated2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_add_time
+            )
+        )
+        cpu_add_res, cpu_add_time = profile(
+            lambda a, b: [a[i] + b[i] for i in range(TEST_SIZE)]
+        )(cpu_obfuscated, cpu_obfuscated2)
+        compare_time(gpu_add_time, cpu_add_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Add",
+            gpu_add_time,
+            cpu_add_time,
+            DATA_SIZE * 2)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (gpu_mul_res, _), gpu_mul_time = profile(pi_mul)(
+            self._gpu_pub_key,
+            gpu_add_res,
+            gpu_encoded2,
+            shape_store,
+            shape_store,
+            gpu_pi_store,
+            shape_store,
+            None,
+        )
+        print(
+            "GPU computation completed in {} second(s), waiting for CPU".format(
+                gpu_mul_time
+            )
+        )
+        cpu_mul_res, cpu_mul_time = profile(
+            lambda a, b: [a[i] * b[i] for i in range(TEST_SIZE)]
+        )(cpu_add_res, cpu_encoded2)
+        compare_time(gpu_mul_time, cpu_mul_time)
+        add_to_perf_reports(
+            self._perf_reports,
+            "Multiply",
+            gpu_mul_time,
+            cpu_mul_time,
+            DATA_SIZE * 2)
+
+        gpu_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        gpu_matmul_res, gpu_matmul_shape = gpu_mul_res, shape_store
+        cpu_matmul_res = np.asarray(cpu_mul_res).reshape(shape_tuple)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", gpu_matmul_shape.to_tuple())
+        gpu_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (gpu_sum_res, _), gpu_sum_time = profile(pi_sum)(
+                self._gpu_pub_key,
+                gpu_matmul_res,
+                gpu_matmul_shape,
+                axis,
+                gpu_pi_sum_store,
+                None,
+                None,
+            )
+            print(
+                "GPU computation completed in {} second(s), waiting for CPU".format(
+                    gpu_sum_time
+                )
+            )
+            cpu_sum_res, cpu_sum_time = profile(lambda a: np.sum(a, axis))(
+                cpu_matmul_res
+            )
+            compare_time(gpu_sum_time, cpu_sum_time)
+            add_to_perf_reports(
+                self._perf_reports,
+                "Sum (axis={})".format(axis),
+                gpu_sum_time,
+                cpu_sum_time,
+                DATA_SIZE,
+            )
+
+            # check result
+            gpu_decrypted = te_c2p(
+                pi_decrypt(
+                    self._gpu_pub_key,
+                    self._gpu_priv_key,
+                    gpu_sum_res,
+                    None,
+                    None,
+                    None))
+            cpu_decrypted = np.asarray(
+                [self._priv_key.decrypt(v) for v in cpu_sum_res.flat]
+                if axis is not None
+                else [self._priv_key.decrypt(cpu_sum_res)]
+            )
+            assert_ndarray_diff(gpu_decrypted, cpu_decrypted)
+
+        print("\n>>>>> free all allocated spaces")
+        bi_free(gpu_bi_store)
+        bi_free(gpu_bi_store2)
+        te_free(gpu_te_store)
+        te_free(gpu_te_store2)
+        fp_free(gpu_fp_store)
+        fp_free(gpu_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(gpu_fp_mul_store)
+        pi_free(gpu_pi_store)
+        pi_free(gpu_pi_store2)
+        pi_free(gpu_pi_matmul_store)
+        pi_free(gpu_pi_sum_store)
+
+    @classmethod
+    def tearDownClass(cls):  # class-level teardown: dump every collected performance report
+        for report in cls._perf_reports:
+            report.dump_result()
+
+
+if __name__ == "__main__":  # allow running this test module directly as a script
+    unittest.main()
diff --git a/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py
new file mode 100755
index 0000000000..1b09afc3d6
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/paillier_gpu/tests/test_gpu_performance.py
@@ -0,0 +1,276 @@
+#
+#  Copyright 2022 The FATE Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+
+import functools
+import time
+import unittest
+import numpy
+from fate_arch.tensor.impl.blocks.python_paillier_block import FixedPointNumber, PaillierKeypair
+
+from ..gpu_engine import (
+    FLOAT_TYPE,
+    INT64_TYPE,
+    pi_p2c_pub_key,
+    pi_p2c_priv_key,
+    pi_h2d_pub_key,
+    pi_h2d_priv_key,
+    TensorShapeStorage,
+    bi_alloc,
+    PLAIN_BYTE,
+    MEM_HOST,
+    te_alloc,
+    fp_alloc,
+    pi_alloc,
+    te_p2c,
+    fp_encode,
+    fp_decode,
+    te_c2p,
+    pi_encrypt,
+    pi_gen_obf_seed,
+    CIPHER_BITS,
+    pi_obfuscate,
+    pi_decrypt,
+    fp_mul,
+    fp_c2p,
+    pi_add,
+    pi_mul,
+    pi_sum,
+    bi_free,
+    te_free,
+    fp_free,
+    pi_free,
+    initialize_device,
+    pi_matmul,
+)
+
+RAND_TYPE = FLOAT_TYPE  # SWITCH DATA TYPE HERE: EITHER INT64_TYPE OR FLOAT_TYPE
+NUM_ROWS = 666  # first dimension of the test tensor shape
+NUM_COLS = 666  # second dimension of the test tensor shape
+TEST_SIZE = NUM_ROWS * NUM_COLS  # number of elements generated and processed per operation
+ERROR_TOLERANCE = 1e-10  # upper bound on the relative error accepted by assert_diff
+
+
+def generate_rand(test_size):
+    """Return test_size random samples: normal floats or integers, as selected by RAND_TYPE."""
+    if RAND_TYPE == FLOAT_TYPE:  # normal distribution with mean 0 and standard deviation 5
+        return numpy.random.normal(0, 5, test_size)
+    if RAND_TYPE == INT64_TYPE:  # integers drawn from [-1024, 1024)
+        return numpy.random.randint(-1024, 1024, test_size)
+    raise TypeError("Invalid data type")
+
+
+def assert_diff(res, ref):
+    """Assert res and ref are both zero, or differ by less than ERROR_TOLERANCE relative to each of them."""
+    if res == 0 or ref == 0:
+        assert res == 0 and ref == 0
+        return
+    delta = res - ref
+    assert abs(delta / res) < ERROR_TOLERANCE
+    assert abs(delta / ref) < ERROR_TOLERANCE
+
+
+def assert_ndarray_diff(res, ref):
+    """Assert two equally shaped arrays agree element-wise per assert_diff; print and re-raise the first mismatch."""
+    assert res.shape == ref.shape
+    for i, (got, want) in enumerate(zip(res.flatten(), ref.flatten())):
+        try:
+            assert_diff(got, want)
+        except AssertionError:
+            # Report where it failed, then propagate: swallowing the error here made every check a no-op.
+            print("Assertion Error at location", i, ", GPU result:", got, ", reference result:", want)
+            raise
+
+
+def profile(func):
+    """Decorate func so that each call returns a (result, elapsed_seconds) tuple."""
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # perf_counter is monotonic and high-resolution; time.time() can jump when the system clock is adjusted
+        begin = time.perf_counter()
+        res = func(*args, **kwargs)
+        return res, time.perf_counter() - begin
+    return wrapper
+
+
+def compare_time(gpu_time, cpu_time, num_instances=TEST_SIZE):  # print timings, throughputs and the CPU/GPU speedup
+    print("GPU time:", gpu_time, "second(s)")
+    print("CPU time:", cpu_time, "second(s)")
+    print("GPU throughput:", num_instances / gpu_time, "instance(s) per second")
+    print("CPU throughput:", num_instances / cpu_time, "instance(s) per second")
+    print("Speedup:", cpu_time / gpu_time)  # NOTE(review): a zero gpu_time or cpu_time raises ZeroDivisionError
+
+
+class TestGPUPerformance(unittest.TestCase):  # profiles GPU Paillier primitives against hard-coded CPU baselines
+
+    @classmethod
+    def setUpClass(cls):  # one-time setup: initialise the device, generate a key pair, convert it via p2c/h2d helpers
+        initialize_device()
+        cls._pub_key, cls._priv_key = PaillierKeypair.generate_keypair()
+        cls.n, cls.max_int = cls._pub_key.n, cls._pub_key.max_int
+        cls._cpu_pub_key = pi_p2c_pub_key(None, cls._pub_key)  # presumably "p2c" = Python -> C layout; verify
+        cls._cpu_priv_key = pi_p2c_priv_key(None, cls._priv_key)
+        cls._gpu_pub_key = pi_h2d_pub_key(None, cls._cpu_pub_key)  # presumably "h2d" = host -> device; verify
+        cls._gpu_priv_key = pi_h2d_priv_key(None, cls._cpu_priv_key)
+        print("\n\n", "*" * 100, "\n\nInitialization complete\nTest Size:", TEST_SIZE)
+
+    # Runs each GPU primitive once on TEST_SIZE random values, prints timings and checks selected results.
+    def test_performance(self):
+        print("\n\n", "*" * 100, "\n\nTest performance begins")
+
+        print("\n>>>>> generate data and allocate memory spaces")
+        raw, raw2 = generate_rand(TEST_SIZE), generate_rand(TEST_SIZE)  # two independent random inputs
+        shape_tuple, shape_tuple_T = (NUM_ROWS, NUM_COLS), (NUM_COLS, NUM_ROWS)
+        shape_store, shape_store_T = TensorShapeStorage(*shape_tuple), TensorShapeStorage(*shape_tuple_T)
+        gpu_bi_store, gpu_bi_store2 = bi_alloc(None, TEST_SIZE, PLAIN_BYTE, MEM_HOST), bi_alloc(None, TEST_SIZE,
+                                                                                                PLAIN_BYTE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_alloc(None, TEST_SIZE, MEM_HOST), te_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_fp_store, gpu_fp_store2 = fp_alloc(None, TEST_SIZE, MEM_HOST), fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_pi_store, gpu_pi_store2 = pi_alloc(None, TEST_SIZE, MEM_HOST), pi_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_te_store, gpu_te_store2 = te_p2c(raw, gpu_te_store), te_p2c(raw2, gpu_te_store2)
+
+        print("\n>>>>> fp_encode profiling begins")
+        gpu_encoded, gpu_encode_time = profile(fp_encode)(gpu_te_store, self.n, self.max_int, res=gpu_fp_store)
+        cpu_encode_time = TEST_SIZE / 62303.97  # CPU time comes from a hard-coded rate (presumably measured offline)
+        compare_time(gpu_encode_time, cpu_encode_time)
+
+        print("\n>>>>> fp_decode profiling begins")
+        gpu_decoded, gpu_decode_time = profile(fp_decode)(gpu_encoded, gpu_te_store, None)
+        cpu_decode_time = TEST_SIZE / 567913.21
+        compare_time(gpu_decode_time, cpu_decode_time)
+
+        # check decoded results: decoding the encoded input is compared against raw
+        assert_ndarray_diff(te_c2p(gpu_decoded), numpy.asarray(raw))
+
+        print("\n>>>>> pi_encrypt profiling begins")
+        print("This function calculates (encoding * n + 1) % nsquare")
+        gpu_encrypted, gpu_encrypt_time = profile(pi_encrypt)(self._gpu_pub_key, gpu_encoded, gpu_pi_store, None)
+        cpu_encrypt_time = TEST_SIZE / 205864.74
+        compare_time(gpu_encrypt_time, cpu_encrypt_time)
+
+        print("\n>>>>> pi_gen_obf_seed profiling begins")
+        print("This function calculates (rand() ^ n) % nsquare")
+        gpu_obf_seeds, gpu_gen_obf_seeds_time = profile(pi_gen_obf_seed)(gpu_bi_store, self._gpu_pub_key, TEST_SIZE,
+                                                                         CIPHER_BITS // 6, 0, None)
+        cpu_gen_obf_seefs_time = TEST_SIZE / 444.05  # NOTE(review): "seefs" looks like a typo for "seeds"
+        compare_time(gpu_gen_obf_seeds_time, cpu_gen_obf_seefs_time)
+
+        print("\n>>>>> pi_obfuscate profiling begins")
+        print("This function calculates (raw_cipher * obf_seed) % nsquare,")
+        print("\twhere raw_cipher and obf_seed are calculated in pi_encrypt and pi_gen_obf_seeds, respectively")
+        gpu_obfuscated, gpu_obfuscate_time = profile(pi_obfuscate)(self._gpu_pub_key, gpu_encrypted, gpu_obf_seeds,
+                                                                   gpu_pi_store, None)
+        cpu_obfuscate_time = TEST_SIZE / 60236.27
+        compare_time(gpu_obfuscate_time, cpu_obfuscate_time)
+
+        print("\n>>>>> pi_decrypt profiling begins")
+        print("This function calculates L(cipher ^ lambda % nsquare) * L(g ^ lambda % nsquare) ^ -1 % n")
+        print("fp_decode is by default included in pi_decrypt")
+        fps_buffer = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        gpu_decrypted, gpu_decrypt_time = profile(pi_decrypt)(self._gpu_pub_key, self._gpu_priv_key, gpu_obfuscated,
+                                                              gpu_te_store, fps_buffer)
+        cpu_decrypt_time = TEST_SIZE / 1590.48
+        compare_time(gpu_decrypt_time, cpu_decrypt_time)
+
+        # check decrypted results: the decrypted obfuscated ciphertext is compared against raw
+        assert_ndarray_diff(te_c2p(gpu_decrypted), numpy.asarray(raw))
+
+        print("\n>>>>> generating the other array")
+        gpu_encoded2 = fp_encode(gpu_te_store2, self.n, self.max_int, res=gpu_fp_store2)
+        gpu_encrypted2 = pi_encrypt(self._gpu_pub_key, gpu_encoded2, gpu_pi_store2, None)
+        gpu_obf_seeds2 = pi_gen_obf_seed(gpu_bi_store2, self._gpu_pub_key, TEST_SIZE, CIPHER_BITS // 6, 1, None)
+        gpu_obfuscated2 = pi_obfuscate(self._gpu_pub_key, gpu_encrypted2, gpu_obf_seeds2, gpu_pi_store2, None)
+
+        print("\n>>>>> fp_mul profiling begins")
+        gpu_fp_mul_store = fp_alloc(None, TEST_SIZE, MEM_HOST)
+        (gpu_fp_mul_res, _), gpu_fp_mul_time = profile(fp_mul)(gpu_encoded, gpu_encoded2, shape_store, shape_store,
+                                                               gpu_fp_mul_store, shape_store, None)
+        cpu_fp_mul_time = TEST_SIZE / 228424.79
+        compare_time(gpu_fp_mul_time, cpu_fp_mul_time)
+
+        # Compare results with a CPU reference: encodings multiplied modulo n, exponents added
+        cpu_encoded = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw]
+        cpu_encoded2 = [FixedPointNumber.encode(v, self.n, self.max_int) for v in raw2]
+        cpu_fp_mul_res = [FixedPointNumber((cpu_encoded[i].encoding * cpu_encoded2[i].encoding) % cpu_encoded[i].n,
+                                           cpu_encoded[i].exponent + cpu_encoded2[i].exponent, cpu_encoded[i].n,
+                                           cpu_encoded[i].max_int)
+                          for i in range(TEST_SIZE)]
+        received_fp_mul_res = fp_c2p(gpu_fp_mul_res)
+        for i in range(TEST_SIZE):
+            assert_diff(received_fp_mul_res[i].encoding, cpu_fp_mul_res[i].encoding)
+            assert received_fp_mul_res[i].BASE == cpu_fp_mul_res[i].BASE
+            assert received_fp_mul_res[i].exponent == cpu_fp_mul_res[i].exponent
+
+        print("\n>>>>> pi_add profiling begins")
+        (gpu_add_res, _), gpu_add_time = profile(pi_add)(self._gpu_pub_key, gpu_obfuscated, gpu_obfuscated2,
+                                                         shape_store, shape_store, gpu_pi_store, shape_store, None)
+        cpu_add_time = TEST_SIZE / 29759.90
+        compare_time(gpu_add_time, cpu_add_time)
+
+        print("\n>>>>> pi_mul profiling begins")
+        (gpu_mul_res, _), gpu_mul_time = profile(pi_mul)(self._gpu_pub_key, gpu_add_res, gpu_encoded2, shape_store,
+                                                         shape_store, gpu_pi_store, shape_store, None)
+        cpu_mul_time = TEST_SIZE / 6175.70
+        compare_time(gpu_mul_time, cpu_mul_time)
+
+        print("\n>>>>> pi_matmul profiling begins")
+        print("sizes are", shape_tuple, "and", shape_tuple_T)
+        gpu_pi_matmul_store = pi_alloc(None, NUM_ROWS * NUM_ROWS, MEM_HOST)
+        (gpu_matmul_res, gpu_matmul_shape), gpu_matmul_time = profile(pi_matmul)(self._gpu_pub_key, gpu_mul_res,
+                                                                                 gpu_encoded2, shape_store,
+                                                                                 shape_store_T, gpu_pi_matmul_store,
+                                                                                 None, None)
+        cpu_matmul_time = NUM_ROWS * TEST_SIZE / 4178.43  # same instance count is passed to compare_time below
+        compare_time(gpu_matmul_time, cpu_matmul_time, NUM_ROWS * TEST_SIZE)
+
+        print("\n>>>>> pi_sum profiling begins")
+        print("shape is", gpu_matmul_shape.to_tuple())
+        gpu_pi_sum_store = pi_alloc(None, max(NUM_ROWS, NUM_COLS), MEM_HOST)
+        decrypted_matmul_res = numpy.asarray(
+            te_c2p(pi_decrypt(self._gpu_pub_key, self._gpu_priv_key, gpu_matmul_res, None, None))).reshape(
+            gpu_matmul_shape.to_tuple())
+        for axis in [0, 1, None]:
+            print(">>> axis:", axis)
+            (gpu_sum_res, _), gpu_sum_time = profile(pi_sum)(self._gpu_pub_key, gpu_matmul_res, gpu_matmul_shape, axis,
+                                                             gpu_pi_sum_store, None, None)
+            cpu_sum_time = TEST_SIZE / (12865.10 if axis == 0 else (15919.62 if axis == 1 else 10277.66))
+            compare_time(gpu_sum_time, cpu_sum_time)
+
+            # check result: decrypted GPU sum vs. numpy sum of the decrypted matmul result along the same axis
+            gpu_decrypted = te_c2p(pi_decrypt(self._gpu_pub_key, self._gpu_priv_key, gpu_sum_res, None, None))
+            cpu_sum = decrypted_matmul_res.sum(axis)
+            if axis is None:
+                cpu_sum = numpy.asarray([cpu_sum])
+            assert_ndarray_diff(gpu_decrypted, cpu_sum)
+
+        print("\n>>>>> free all allocated spaces")  # NOTE(review): skipped if anything above raises (no try/finally)
+        bi_free(gpu_bi_store)
+        bi_free(gpu_bi_store2)
+        te_free(gpu_te_store)
+        te_free(gpu_te_store2)
+        fp_free(gpu_fp_store)
+        fp_free(gpu_fp_store2)
+        fp_free(fps_buffer)
+        fp_free(gpu_fp_mul_store)
+        pi_free(gpu_pi_store)
+        pi_free(gpu_pi_store2)
+        pi_free(gpu_pi_matmul_store)
+        pi_free(gpu_pi_sum_store)
+
+        print("test passed")
+
+
+if __name__ == "__main__":  # allow running this test module directly as a script
+    unittest.main()
diff --git a/gpu/tensor/paillier_gpu/pyproject.toml b/gpu/tensor/paillier_gpu/pyproject.toml
new file mode 100644
index 0000000000..2a5f5b3008
--- /dev/null
+++ b/gpu/tensor/paillier_gpu/pyproject.toml
@@ -0,0 +1,17 @@
+[tool.poetry]
+name = "paillier-gpu"
+version = "0.1.0"
+description = "This project is an industrial-level heterogeneous acceleration system to support and speed up federated learning. We've designed and implemented a heterogeneous acceleration solution using GPU that can significantly accelerate the Paillier cryptosystem while maintaining functionality, accuracy and scalability."
+authors = ["Xiaolong.Gao <1506957902@qq.com>"]
+
+[tool.poetry.dependencies]
+python = "^3.6"
+numpy = "~1.18.4"
+gmpy2 = "^2.0.8"
+
+[tool.poetry.dev-dependencies]
+pytest = "^5.2"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
diff --git a/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py b/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
index 66e5fdfd25..8a8919c760 100644
--- a/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
+++ b/python/fate_arch/tensor/impl/blocks/python_paillier_block/__init__.py
@@ -3,5 +3,15 @@
     BlockPaillierDecryptor,
     BlockPaillierEncryptor,
 )
+from ._fate_paillier import (
+    PaillierEncryptedNumber,
+    PaillierPrivateKey,
+    PaillierPublicKey,
+    PaillierKeypair,
+)
+from ._fixedpoint import FixedPointNumber, FixedPointEndec
+from . import _gmpy_math as gmpy_math
 
-__all__ = ["BlockPaillierCipher", "BlockPaillierEncryptor", "BlockPaillierDecryptor"]
+__all__ = ["BlockPaillierCipher", "BlockPaillierEncryptor", "BlockPaillierDecryptor", "PaillierEncryptedNumber",
+           "PaillierPrivateKey", "PaillierPublicKey", "PaillierKeypair", "FixedPointNumber", "FixedPointEndec",
+           "gmpy_math"]  # also re-exports the key, fixed-point and gmpy_math names imported above