testing torch.compile

Peter · Peter · commit 12460889b136 · 2023-11-02T14:39:27.000-04:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -71,5 +71,5 @@ mujoco = ["mujoco"]
 [build-system]
 # Including torch and ninja here are needed to build the native code.
 # They will be installed as dependencies during the build, which can take a while the first time.
-requires = ["setuptools>=60.0.0", "wheel", "torch", "ninja"]
+requires = ["setuptools>=60.0.0", "wheel", "torch==2.1.0", "ninja"]
 build-backend= "setuptools.build_meta"
diff --git a/src/pytorch_kinematics/chain.py b/src/pytorch_kinematics/chain.py
@@ -123,6 +123,7 @@ def __init__(self, root_frame, dtype=torch.float32, device="cpu"):
             idx += 1
         self.joint_type_indices = torch.tensor(self.joint_type_indices)
         self.joint_indices = torch.tensor(self.joint_indices)
+        # NOTE: torch.compile doesn't like lists of tensors, so this may need to become a dict (still a list here)
         self.parents_indices = [torch.tensor(p, dtype=torch.long, device=self.device) for p in self.parents_indices]
 
     def to(self, dtype=None, device=None):
@@ -317,6 +318,68 @@ def forward_kinematics(self, th, frame_indices: Optional = None):
 
         return frame_names_and_transform3ds
 
+    def forward_kinematics_py(self, th, frame_indices: Optional = None):
+        """
+        Compute forward kinematics using only Python-level PyTorch operations.
+
+        th is converted with ensure_tensor and promoted to at least 2D; its first dimension is used as
+        the batch size. frame_indices selects the frames to compute (each element must support .item());
+        when it is None, get_all_frame_indices() is used.
+
+        Returns a dict mapping each requested frame's name to a Transform3d built from its matrix.
+        """
+        if frame_indices is None:
+            frame_indices = self.get_all_frame_indices()
+
+        th = self.ensure_tensor(th)
+        th = torch.atleast_2d(th)
+
+        b = th.shape[0]
+        axes_expanded = self.axes.unsqueeze(0).repeat(b, 1, 1)
+
+        # compute all joint transforms at once first:
+        # every possible transform is created for every joint up front, and the one matching each joint's
+        # type is selected inside the loop below.
+        rev_jnt_transform = tensor_axis_and_angle_to_matrix(axes_expanded, th)
+        pris_jnt_transform = tensor_axis_and_d_to_pris_matrix(axes_expanded, th)
+
+        # transforms computed so far, keyed by integer frame index (reused by later chains)
+        frame_transforms = {}
+        for frame_idx in frame_indices:
+            frame_key = frame_idx.item()
+            frame_transform = torch.eye(4).to(th).unsqueeze(0).repeat(b, 1, 1)
+
+            # iterate down the list and compose the transform
+            for chain_idx in self.parents_indices[frame_key]:
+                chain_key = chain_idx.item()
+                if chain_key in frame_transforms:
+                    # already computed for an earlier frame: restart from the cached transform
+                    frame_transform = frame_transforms[chain_key]
+                    continue
+
+                link_offset_i = self.link_offsets[chain_idx]
+                if link_offset_i is not None:
+                    frame_transform = frame_transform @ link_offset_i
+
+                joint_offset_i = self.joint_offsets[chain_idx]
+                if joint_offset_i is not None:
+                    frame_transform = frame_transform @ joint_offset_i
+
+                jnt_idx = self.joint_indices[chain_idx]
+                jnt_type = self.joint_type_indices[chain_idx]
+                # joint type 0 contributes no joint transform
+                if jnt_type == 1:
+                    frame_transform = frame_transform @ rev_jnt_transform[:, jnt_idx]
+                elif jnt_type == 2:
+                    frame_transform = frame_transform @ pris_jnt_transform[:, jnt_idx]
+
+            frame_transforms[frame_key] = frame_transform
+
+        frame_names_and_transform3ds = {self.idx_to_frame[frame_idx]: tf.Transform3d(matrix=transform) for
+                                        frame_idx, transform in frame_transforms.items()}
+
+        return frame_names_and_transform3ds
+
     def ensure_tensor(self, th):
         """
         Converts a number of possible types into a tensor. The order of the tensor is determined by the order
diff --git a/tests/gen_fk_perf.py b/tests/gen_fk_perf.py
@@ -24,19 +24,31 @@ def main():
     number = 100
 
     # iterate over all combinations and store in a pandas dataframe
-    headers = ['chain', 'device', 'dtype', 'batch_size', 'time']
+    headers = ['method', 'chain', 'device', 'dtype', 'batch_size', 'time']
     data = []
 
+    def _fk_cpp(th):
+        return chain.forward_kinematics(th)
+
+    @torch.compile(backend='eager')
+    def _fk_torch_compile(th):
+        return chain.forward_kinematics_py(th)
+
+    method_names = ['fk_cpp', 'fk_torch_compile']
+    methods = [_fk_cpp, _fk_torch_compile]
+
     for chain, name in zip(chains, names):
         for device in devices:
             for dtype in dtypes:
                 for batch_size in batch_sizes:
-                    chain = chain.to(dtype=dtype, device=device)
-                    th = torch.zeros(batch_size, chain.n_joints).to(dtype=dtype, device=device)
+                    for method_name, method in zip(method_names, methods):
+                        chain = chain.to(dtype=dtype, device=device)
+                        th = torch.zeros(batch_size, chain.n_joints).to(dtype=dtype, device=device)
 
-                    dt = timeit.timeit(lambda: chain.forward_kinematics(th), number=number)
-                    data.append([name, device, dtype, batch_size, dt / number])
-                    print(f"{name=} {device=} {dtype=} {batch_size=} {dt / number:.4f}")
+                        dt = timeit.timeit(lambda: method(th), number=number)
+                        # each row must have one entry per column in headers, starting with the method
+                        data.append([method_name, name, device, dtype, batch_size, dt / number])
+                        print(f"{method_name} {name=} {device=} {dtype=} {batch_size=} {dt / number:.4f}")
 
     # pickle the data for visualization in jupyter notebook
     import pickle
diff --git a/tests/test_kinematics.py b/tests/test_kinematics.py
@@ -255,9 +255,34 @@ def test_mjcf_slide_joint_parsing():
 
 
 def test_fk_val():
+    dtype = torch.float64
+    d = "cuda" if torch.cuda.is_available() else "cpu"
+
     chain = pk.build_chain_from_mjcf(open(os.path.join(TEST_DIR, "val.xml")).read())
-    chain = chain.to(dtype=torch.float64)
-    ret = chain.forward_kinematics(torch.zeros([1000, chain.n_joints], dtype=torch.float64))
+    chain = chain.to(dtype=torch.float64, device=d)
+
+    th = torch.rand(1000, chain.n_joints, dtype=dtype, device=d)
+
+    def _fk_no_compile():
+        return chain.forward_kinematics_py(th)
+
+    @torch.compile(backend='inductor')
+    def _fk_compile():
+        return chain.forward_kinematics_py(th)
+
+    from timeit import timeit
+
+    # warmup
+    _fk_no_compile()
+    _fk_compile()
+
+    number = 10
+    ms_no_compile = timeit(_fk_no_compile, number=number) / number * 1000
+    print(f"elapsed {ms_no_compile:.1f}ms for no compile")
+    ms_compile = timeit(_fk_compile, number=number) / number * 1000
+    print(f"elapsed {ms_compile:.1f}ms for compile")
+
+    ret = chain.forward_kinematics_py(th)
     tg = ret['drive45']
     pos, rot = quat_pos_from_transform3d(tg)
     torch.set_printoptions(precision=6, sci_mode=False)