Merge branch 'master' into kwargs_py_api

narendasan · narendasan · commit fbce40274a4c · 2021-10-19T21:04:17.000-07:00
diff --git a/docker/Dockerfile b/docker/Dockerfile
@@ -0,0 +1,49 @@
+ARG BASE=21.06
+ARG BASE_IMG=nvcr.io/nvidia/pytorch:${BASE}-py3
+FROM ${BASE_IMG} AS base
+FROM base AS trtorch-builder-base
+
+# Removing any bazel or trtorch pre-installed from the base image
+RUN rm -rf /opt/pytorch/trtorch /usr/bin/bazel
+
+# Add the Bazel apt repository. The signing key is downloaded to a file first so that a failed
+# download fails the build instead of being hidden behind the exit status of a pipeline.
+RUN apt-get update && apt-get install --no-install-recommends -y curl gnupg
+RUN curl -fsSL https://bazel.build/bazel-release.pub.gpg -o /tmp/bazel-release.pub.gpg && gpg --dearmor < /tmp/bazel-release.pub.gpg > /etc/apt/trusted.gpg.d/bazel.gpg && rm -f /tmp/bazel-release.pub.gpg
+RUN echo "deb [arch=amd64] https://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list
+
+RUN apt-get update && apt-get install -y --no-install-recommends bazel-4.0.0
+RUN ln -s /usr/bin/bazel-4.0.0 /usr/bin/bazel
+
+# Workaround for bazel expecting both static and shared versions, we only use shared libraries inside container
+RUN cp /usr/lib/x86_64-linux-gnu/libnvinfer.so /usr/lib/x86_64-linux-gnu/libnvinfer_static.a
+
+RUN apt-get update && apt-get install -y --no-install-recommends locales ninja-build && rm -rf /var/lib/apt/lists/* && locale-gen en_US.UTF-8
+
+FROM trtorch-builder-base AS trtorch-builder
+
+# Place the repository sources in the build directory and select the container WORKSPACE file
+WORKDIR /workspace/trtorch/src
+COPY . .
+RUN cp ./docker/WORKSPACE.cu.docker WORKSPACE
+# This script builds both libtrtorch bin/lib/include tarball and the Python wheel, in dist/
+RUN ./docker/dist-build.sh
+
+FROM base AS trtorch
+
+# Copy the source repo, then stage the builder's dist/ artifacts at an explicit path (not the base image's WORKDIR)
+COPY . /workspace/trtorch
+COPY --from=trtorch-builder /workspace/trtorch/src/dist/ /workspace/trtorch/dist/
+RUN patch -u /opt/conda/lib/python3.8/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py -i /workspace/trtorch/docker/qat.patch
+RUN conda init bash
+# Jupyter widget support
+RUN pip3 install ipywidgets --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host=files.pythonhosted.org
+RUN jupyter nbextension enable --py widgetsnbextension
+# Unpack the libtrtorch tarball into /opt/trtorch, install the wheel, then delete the staged artifacts they came from
+RUN mkdir -p /opt/trtorch && tar xvf /workspace/trtorch/dist/libtrtorch.tar.gz --strip-components 2 -C /opt/trtorch --exclude=LICENSE && pip3 install /workspace/trtorch/dist/*.whl && rm -fr /workspace/trtorch/dist/*
+
+ENV LD_LIBRARY_PATH=/opt/conda/lib/python3.8/site-packages/torch/lib:/opt/trtorch/lib:${LD_LIBRARY_PATH}
+ENV PATH=/opt/trtorch/bin:${PATH}
+
+WORKDIR /workspace/trtorch/
+CMD ["/bin/bash"]
diff --git a/docker/Dockerfile.21.03 b/docker/Dockerfile.21.03
diff --git a/docker/Dockerfile.21.06 b/docker/Dockerfile.21.06
diff --git a/docker/Dockerfile.21.07 b/docker/Dockerfile.21.07
diff --git a/docker/WORKSPACE.cu.docker b/docker/WORKSPACE.cu.docker
@@ -29,13 +29,25 @@ git_repository(
     shallow_since = "1570114335 -0400"
 )
 
+# External dependency for trtorch if you already have precompiled binaries.
+# This is currently used in pytorch NGC container CI testing.
+local_repository(
+    name = "trtorch",
+    path = "/opt/conda/lib/python3.8/site-packages/trtorch"
+)
+
 # CUDA should be installed on the system locally
 new_local_repository(
     name = "cuda",
     path = "/usr/local/cuda",
     build_file = "@//third_party/cuda:BUILD",
 )
 
+new_local_repository(
+    name = "cublas",
+    build_file = "@//third_party/cublas:BUILD",
+    path = "/usr",
+)
 
 ####################################################################################
 # Locally installed dependencies (use in cases of custom dependencies or aarch64)
diff --git a/docker/dist-build.sh b/docker/dist-build.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+mkdir -p dist
+
+bazel build //:libtrtorch --compilation_mode opt
+
+cd py && MAX_JOBS=1 LANG=en_US.UTF-8 LANGUAGE=en_US:en LC_ALL=en_US.UTF-8 python3 setup.py bdist_wheel --use-cxx11-abi
+
+cd ..
+
+cp bazel-bin/libtrtorch.tar.gz dist/
+cp py/dist/* dist/
+
+pip3 install ipywidgets --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host=files.pythonhosted.org
+jupyter nbextension enable --py widgetsnbextension
+
+pip3 install timm
+
+# test install
+mkdir -p /opt/trtorch && tar xvf dist/libtrtorch.tar.gz --strip-components 2 -C /opt/trtorch --exclude=LICENSE && pip3 uninstall -y trtorch && pip3 install dist/*.whl
+
diff --git a/docker/dist-test.sh b/docker/dist-test.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+pip3 install timm --trusted-host pypi.org --trusted-host pypi.python.org --trusted-host=files.pythonhosted.org
+# Build and run unit tests
+cd tests/modules && python3 ./hub.py
+cd ../..
+
+bazel test //tests:tests //tests:python_api_tests --compilation_mode=opt --jobs=4
diff --git a/docker/mha.patch b/docker/mha.patch
@@ -0,0 +1,19 @@
+--- torch/nn/functional.py	2021-10-01 16:53:42.827338664 -0700
++++ functional.py	2021-10-01 16:53:34.639338618 -0700
+@@ -4975,7 +4975,7 @@
+         f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
+     if isinstance(embed_dim, torch.Tensor):
+         # embed_dim can be a tensor when JIT tracing
+-        head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
++        head_dim = int(embed_dim.div(num_heads, rounding_mode='trunc'))
+     else:
+         head_dim = embed_dim // num_heads
+     assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
+@@ -5044,6 +5044,7 @@
+     #
+     # reshape q, k, v for multihead attention and make em batch first
+     #
++    bsz = int(bsz)
+     q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
+     if static_k is None:
+         k = k.contiguous().view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
diff --git a/docker/qat.patch b/docker/qat.patch
@@ -0,0 +1,11 @@
+--- /opt/conda/lib/python3.8/site-packages/pytorch_quantization/nn/modules/tensor_quantizer.py	2021-08-16 22:50:37.000000000 +0000
++++ tensor_quantizer.py	2021-10-19 20:41:54.288077426 +0000
+@@ -291,7 +291,7 @@
+             quant_dim = list(amax.shape).index(list(amax_sequeeze.shape)[0])
+             scale = amax_sequeeze / bound
+             outputs = torch.fake_quantize_per_channel_affine(
+-                inputs, scale.data, torch.zeros_like(scale, dtype=torch.long).data, quant_dim,
++                inputs, scale.data, torch.zeros_like(scale, dtype=torch.int32).data, quant_dim,
+                 -bound - 1 if not self._unsigned else 0, bound)
+ 
+         return outputs
diff --git a/py/setup.py b/py/setup.py
@@ -234,7 +234,7 @@ def run(self):
       long_description=long_description,
       ext_modules=ext_modules,
       install_requires=[
-          'torch>=1.9.0+cu111,<1.10.0',
+          'torch>=1.9.0,<1.11.0',
       ],
       setup_requires=[],
       cmdclass={
diff --git a/tests/py/test_api.py b/tests/py/test_api.py
@@ -46,22 +46,22 @@ def test_from_torch_tensor(self):
             "enabled_precisions": {torch.float}
         }
 
-        trt_mod = trtorch.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
+        trt_mod = trtorch.compile(self.traced_model, **compile_spec)
+        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
         self.assertTrue(same < 2e-2)
 
     def test_device(self):
         compile_spec = {"inputs": [self.input], "device": trtorch.Device("gpu:0"), "enabled_precisions": {torch.float}}
 
-        trt_mod = trtorch.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
+        trt_mod = trtorch.compile(self.traced_model, **compile_spec)
+        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
         self.assertTrue(same < 2e-2)
 
     def test_default_device(self):
         compile_spec = {"inputs": [self.input], "enabled_precisions": {torch.float}}
 
-        trt_mod = trtorch.compile(self.scripted_model, **compile_spec)
-        same = (trt_mod(self.input) - self.scripted_model(self.input)).abs().max()
+        trt_mod = trtorch.compile(self.traced_model, **compile_spec)
+        same = (trt_mod(self.input) - self.traced_model(self.input)).abs().max()
         self.assertTrue(same < 2e-2)
 
     def test_compile_script_from_dict(self):
@@ -179,7 +179,7 @@ class TestPTtoTRTtoPT(ModelTestCase):
 
     def setUp(self):
         self.input = torch.randn((1, 3, 224, 224)).to("cuda")
-        self.ts_model = torch.jit.script(self.model)
+        self.ts_model = torch.jit.trace(self.model, [self.input])
 
     def test_pt_to_trt_to_pt(self):
         compile_spec = {
@@ -359,7 +359,7 @@ def test_suite():
     suite.addTest(TestCompile.parametrize(TestCompile, model=models.mobilenet_v2(pretrained=True)))
     suite.addTest(TestCompileHalf.parametrize(TestCompileHalf, model=models.resnet18(pretrained=True)))
     suite.addTest(TestCompileHalfDefault.parametrize(TestCompileHalfDefault, model=models.resnet18(pretrained=True)))
-    suite.addTest(TestPTtoTRTtoPT.parametrize(TestPTtoTRTtoPT, model=models.mobilenet_v2(pretrained=True)))
+    suite.addTest(TestPTtoTRTtoPT.parametrize(TestPTtoTRTtoPT, model=models.resnet18(pretrained=True)))
     suite.addTest(
         TestInputTypeDefaultsFP32Model.parametrize(TestInputTypeDefaultsFP32Model,
                                                    model=models.resnet18(pretrained=True)))
diff --git a/third_party/cublas/BUILD b/third_party/cublas/BUILD
@@ -16,7 +16,6 @@ config_setting(
     constraint_values = [
         "@platforms//cpu:aarch64",
         "@platforms//os:linux",
-        "@//toolchains/jetpack:4.6"
     ]
 )
 
@@ -67,4 +66,4 @@ cc_library(
         "cublas_lib",
         "cublas_lt_lib",
     ],
-)
+)
diff --git a/third_party/cuda/BUILD b/third_party/cuda/BUILD
@@ -1,13 +1,5 @@
 package(default_visibility = ["//visibility:public"])
 
-config_setting(
-    name = "aarch64_linux",
-    constraint_values = [
-        "@platforms//cpu:aarch64",
-        "@platforms//os:linux",
-    ],
-)
-
 config_setting(
     name = "windows",
     constraint_values = [
@@ -18,9 +10,6 @@ config_setting(
 cc_library(
     name = "cudart",
     srcs = select({
-        ":aarch64_linux": [
-            "targets/aarch64-linux/lib/libcudart.so",
-        ],
         ":windows": [
             "lib/x64/cudart.lib",
         ],
@@ -39,9 +28,6 @@ cc_library(
 cc_library(
     name = "nvToolsExt",
     srcs = select({
-        ":aarch64_linux": [
-            "targets/aarch64-linux/lib/libnvToolsExt.so.1",
-        ],
         ":windows": [
             "bin/nvToolsExt64_1.dll",
         ],
@@ -54,9 +40,6 @@ cc_library(
 cc_library(
     name = "cuda",
     srcs = select({
-        ":aarch64_linux": glob([
-            "targets/aarch64-linux/lib/**/lib*.so",
-        ]),
         ":windows": [
             "bin/*.dll",
         ],
@@ -76,14 +59,11 @@ cc_library(
 cc_library(
     name = "cublas",
     srcs = select({
-        ":aarch64_linux": glob([
-            "lib/**/*libcublas.so",
-        ]),
         ":windows": glob([
             "lib/x64/cublas.lib",
         ]),
         "//conditions:default": glob([
-            "lib/**/*libcublas.so",
+            "targets/x86_64-linux/lib/**/*libcublas.so",
         ]),
     }),
     hdrs = glob([
diff --git a/third_party/tensorrt/archive/BUILD b/third_party/tensorrt/archive/BUILD
diff --git a/third_party/tensorrt/local/BUILD b/third_party/tensorrt/local/BUILD