
Commit 6007373

fix: bindings unit tests for nanobind (NVIDIA#6221)
Signed-off-by: Linda-Stadter <[email protected]>
1 parent 04f2d4b commit 6007373

10 files changed: +157 −214 lines

cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ void initBindings(nb::module_& m)
         }
     });

-    PybindUtils::bindSet<tb::ReqIdsSet>(m, "ReqIdsSet");
+    NanobindUtils::bindSet<tb::ReqIdsSet>(m, "ReqIdsSet");

     nb::enum_<tb::LlmRequestType>(m, "LlmRequestType")
         .value("LLMREQUEST_TYPE_CONTEXT_AND_GENERATION", tb::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION)

cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp

Lines changed: 12 additions & 1 deletion
@@ -48,6 +48,9 @@ using SizeType32 = tensorrt_llm::runtime::SizeType32;
 using TokenIdType = tensorrt_llm::runtime::TokenIdType;
 using VecTokens = std::vector<TokenIdType>;
 using CudaStreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
+using CacheBlockIds = std::vector<std::vector<SizeType32>>;
+
+NB_MAKE_OPAQUE(CacheBlockIds);

 namespace
 {

@@ -424,7 +427,15 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
         .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds)
         .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents);

-    nb::bind_vector<std::vector<std::vector<SizeType32>>>(m, "CacheBlockIds");
+    nb::bind_vector<CacheBlockIds>(m, "CacheBlockIds")
+        .def("__getstate__", [](CacheBlockIds const& v) { return nb::make_tuple(v); })
+        .def("__setstate__",
+            [](CacheBlockIds& self, nb::tuple const& t)
+            {
+                if (t.size() != 1)
+                    throw std::runtime_error("Invalid state!");
+                new (&self) CacheBlockIds(nb::cast<std::vector<std::vector<SizeType32>>>(t[0]));
+            });

     nb::enum_<tbk::CacheType>(m, "CacheType")
        .value("SELF", tbk::CacheType::kSELF)
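Note: nanobind's pickle protocol differs from pybind11's — `__setstate__` receives a reference to uninitialized storage and must construct the object with placement new rather than return a new instance. Below is a minimal self-contained sketch of the same opaque-vector pickling pattern; the module name, element type, and state layout are illustrative and not part of this commit.

#include <stdexcept>
#include <vector>
#include <nanobind/nanobind.h>
#include <nanobind/stl/bind_vector.h>

namespace nb = nanobind;

using IntVector = std::vector<int>;
NB_MAKE_OPAQUE(IntVector); // expose the bound class instead of converting to a Python list

NB_MODULE(pickle_example, m)
{
    nb::bind_vector<IntVector>(m, "IntVector")
        // __getstate__ flattens the contents into plain Python objects so the
        // pickled state does not depend on the opaque type itself.
        .def("__getstate__",
            [](IntVector const& v)
            {
                nb::list items;
                for (int x : v)
                    items.append(x); // each element converts to a Python int
                return nb::make_tuple(items);
            })
        // __setstate__ runs on uninitialized storage: construct in place.
        .def("__setstate__",
            [](IntVector& self, nb::tuple const& t)
            {
                if (t.size() != 1)
                    throw std::runtime_error("Invalid state!");
                new (&self) IntVector();
                for (nb::handle h : nb::cast<nb::list>(t[0]))
                    self.push_back(nb::cast<int>(h));
            });
}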

cpp/tensorrt_llm/nanobind/bindings.cpp

Lines changed: 6 additions & 3 deletions
@@ -359,9 +359,12 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
             config.earlyStopping, config.noRepeatNgramSize, config.numReturnSequences, config.minP,
             config.beamWidthArray);
     };
-    auto SamplingConfigSetState = [](tr::SamplingConfig& self, nb::tuple t) -> tr::SamplingConfig
+    auto SamplingConfigSetState = [](tr::SamplingConfig& self, nb::tuple t)
     {
-        assert(t.size() == 19);
+        if (t.size() != 19)
+        {
+            throw std::runtime_error("Invalid SamplingConfig state!");
+        }

         tr::SamplingConfig config;
         config.beamWidth = nb::cast<SizeType32>(t[0]);

@@ -384,7 +387,7 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
         config.minP = nb::cast<OptVec<float>>(t[17]);
         config.beamWidthArray = nb::cast<OptVec<std::vector<SizeType32>>>(t[18]);

-        return config;
+        new (&self) tr::SamplingConfig(config);
     };

     nb::class_<tr::SamplingConfig>(m, "SamplingConfig")
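The switch from `assert` to a thrown exception matters because `assert` compiles to a no-op when NDEBUG is defined (typical release builds), so a malformed pickle tuple would previously pass silently; an exception surfaces in Python as a RuntimeError in every build type. A standalone sketch of the same get/set-state wiring on a toy config — the `MiniConfig` type and module name are illustrative, not from this commit:

#include <stdexcept>
#include <nanobind/nanobind.h>

namespace nb = nanobind;

struct MiniConfig
{
    int beamWidth = 1;
};

NB_MODULE(config_example, m)
{
    auto getState = [](MiniConfig const& c) { return nb::make_tuple(c.beamWidth); };
    auto setState = [](MiniConfig& self, nb::tuple t)
    {
        // Runtime check: unlike assert(), this also fires in NDEBUG builds.
        if (t.size() != 1)
        {
            throw std::runtime_error("Invalid MiniConfig state!");
        }
        MiniConfig config;
        config.beamWidth = nb::cast<int>(t[0]);
        // Construct in place; nanobind passes uninitialized storage to __setstate__.
        new (&self) MiniConfig(config);
    };

    nb::class_<MiniConfig>(m, "MiniConfig")
        .def(nb::init<>())
        .def_rw("beam_width", &MiniConfig::beamWidth)
        .def("__getstate__", getState)
        .def("__setstate__", setState);
}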

cpp/tensorrt_llm/nanobind/common/bindTypes.h

Lines changed: 3 additions & 36 deletions
@@ -21,44 +21,11 @@
 #include <nanobind/nanobind.h>
 #include <nanobind/stl/string.h>

-namespace PybindUtils
+namespace NanobindUtils
 {

 namespace nb = nanobind;

-template <typename T>
-void bindList(nb::module_& m, std::string const& name)
-{
-    nb::class_<T>(m, name.c_str())
-        .def(nb::init<>())
-        .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); })
-        .def("pop_back", [](T& lst) { lst.pop_back(); })
-        .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); })
-        .def("pop_front", [](T& lst) { lst.pop_front(); })
-        .def("__len__", [](T const& lst) { return lst.size(); })
-        .def(
-            "__iter__", [](T& lst) { return nb::make_iterator(nb::type<T>(), "iterator", lst.begin(), lst.end()); },
-            nb::keep_alive<0, 1>())
-        .def("__getitem__",
-            [](T const& lst, size_t index)
-            {
-                if (index >= lst.size())
-                    throw nb::index_error();
-                auto it = lst.begin();
-                std::advance(it, index);
-                return *it;
-            })
-        .def("__setitem__",
-            [](T& lst, size_t index, const typename T::value_type& value)
-            {
-                if (index >= lst.size())
-                    throw nb::index_error();
-                auto it = lst.begin();
-                std::advance(it, index);
-                *it = value;
-            });
-}
-
 template <typename T>
 void bindSet(nb::module_& m, std::string const& name)
 {

@@ -93,8 +60,8 @@ void bindSet(nb::module_& m, std::string const& name)
         {
             s.insert(item);
         }
-        return s;
+        new (&v) T(s);
     });
 }

-} // namespace PybindUtils
+} // namespace NanobindUtils
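The unused `bindList` helper is dropped along with the rename from `PybindUtils` to `NanobindUtils`, and `bindSet`'s `__setstate__` tail adopts the same placement-new convention shown above. For context, a minimal sketch of how `bindSet` might be instantiated in a standalone module; the module name, set type, and include path are illustrative:

#include <set>
#include <nanobind/nanobind.h>
#include "tensorrt_llm/nanobind/common/bindTypes.h" // assumed include path for NanobindUtils

namespace nb = nanobind;

using IdSet = std::set<unsigned long>;
NB_MAKE_OPAQUE(IdSet); // expose the bound class rather than a converted Python set

NB_MODULE(set_example, m)
{
    // Binds std::set<unsigned long> as a Python class named "IdSet",
    // with the pickle support defined in bindSet.
    NanobindUtils::bindSet<IdSet>(m, "IdSet");
}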

cpp/tensorrt_llm/nanobind/common/customCasters.h

Lines changed: 35 additions & 88 deletions
@@ -38,6 +38,7 @@
 #include <torch/csrc/autograd/variable.h>
 #include <torch/extension.h>
 #include <torch/torch.h>
+#include <vector>

 // Pybind requires to have a central include in order for type casters to work.
 // Opaque bindings add a type caster, so they have the same requirement.

@@ -48,7 +49,6 @@ NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
 NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
 NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::decoder_batch::Request>)
 NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)
-NB_MAKE_OPAQUE(std::vector<std::vector<tensorrt_llm::runtime::SizeType32>>)

 namespace nb = nanobind;

@@ -128,70 +128,6 @@ struct type_caster<tensorrt_llm::common::OptionalRef<T>>
     }
 };

-template <typename T>
-struct PathCaster
-{
-
-private:
-    static PyObject* unicode_from_fs_native(std::string const& w)
-    {
-        return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size()));
-    }
-
-    static PyObject* unicode_from_fs_native(std::wstring const& w)
-    {
-        return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size()));
-    }
-
-public:
-    static handle from_cpp(T const& path, rv_policy, cleanup_list* cleanup)
-    {
-        if (auto py_str = unicode_from_fs_native(path.native()))
-        {
-            return module_::import_("pathlib").attr("Path")(steal<object>(py_str), cleanup).release();
-        }
-        return nullptr;
-    }
-
-    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
-    {
-        PyObject* native = nullptr;
-        if constexpr (std::is_same_v<typename T::value_type, char>)
-        {
-            if (PyUnicode_FSConverter(src.ptr(), &native) != 0)
-            {
-                if (auto* c_str = PyBytes_AsString(native))
-                {
-                    // AsString returns a pointer to the internal buffer, which
-                    // must not be free'd.
-                    value = c_str;
-                }
-            }
-        }
-        else if constexpr (std::is_same_v<typename T::value_type, wchar_t>)
-        {
-            if (PyUnicode_FSDecoder(src.ptr(), &native) != 0)
-            {
-                if (auto* c_str = PyUnicode_AsWideCharString(native, nullptr))
-                {
-                    // AsWideCharString returns a new string that must be free'd.
-                    value = c_str; // Copies the string.
-                    PyMem_Free(c_str);
-                }
-            }
-        }
-        Py_XDECREF(native);
-        if (PyErr_Occurred())
-        {
-            PyErr_Clear();
-            return false;
-        }
-        return true;
-    }
-
-    NB_TYPE_CASTER(T, const_name("os.PathLike"));
-};
-
 template <>
 class type_caster<tensorrt_llm::executor::StreamPtr>
 {

@@ -311,34 +247,45 @@ struct type_caster<at::Tensor>

     bool from_python(nb::handle src, uint8_t, cleanup_list*) noexcept
     {
-        nb::object capsule = nb::getattr(src, "__dlpack__")();
-        DLManagedTensor* dl_managed = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule.ptr(), "dltensor"));
-        PyCapsule_SetDestructor(capsule.ptr(), nullptr);
-        value = at::fromDLPack(dl_managed).alias();
-        return true;
+        PyObject* obj = src.ptr();
+        if (THPVariable_Check(obj))
+        {
+            value = THPVariable_Unpack(obj);
+            return true;
+        }
+        return false;
     }

-    static handle from_cpp(at::Tensor tensor, rv_policy, cleanup_list*) noexcept
+    static handle from_cpp(at::Tensor src, rv_policy, cleanup_list*) noexcept
     {
-        DLManagedTensor* dl_managed = at::toDLPack(tensor);
-        if (!dl_managed)
-            return nullptr;
-
-        nanobind::object capsule = nb::steal(PyCapsule_New(dl_managed, "dltensor",
-            [](PyObject* obj)
-            {
-                DLManagedTensor* dl = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(obj, "dltensor"));
-                dl->deleter(dl);
-            }));
-        if (!capsule.is_valid())
-        {
-            dl_managed->deleter(dl_managed);
-            return nullptr;
-        }
-        nanobind::module_ torch = nanobind::module_::import_("torch");
-        nanobind::object result = torch.attr("from_dlpack")(capsule);
-        capsule.release();
-        return result.release();
+        return THPVariable_Wrap(src);
+    }
+};
+
+template <typename T>
+struct type_caster<std::vector<std::reference_wrapper<T const>>>
+{
+    using VectorType = std::vector<std::reference_wrapper<T const>>;
+
+    NB_TYPE_CASTER(VectorType, const_name("List[") + make_caster<T>::Name + const_name("]"));
+
+    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept
+    {
+        // Not needed for our use case since we only convert C++ to Python
+        return false;
+    }
+
+    static handle from_cpp(VectorType const& src, rv_policy policy, cleanup_list* cleanup) noexcept
+    {
+        std::vector<T> result;
+        result.reserve(src.size());
+        for (auto const& ref : src)
+        {
+            result.push_back(ref.get());
+        }
+
+        return make_caster<std::vector<T>>::from_cpp(result, policy, cleanup);
     }
 };
 } // namespace detail
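Two caster changes here: `at::Tensor` now crosses the language boundary via `THPVariable_Check`/`THPVariable_Unpack`/`THPVariable_Wrap`, which hands over the same tensor object directly instead of round-tripping through DLPack capsules, and a new one-way caster converts `std::vector<std::reference_wrapper<T const>>` into a Python list by copying the referenced values. A minimal sketch of a binding that relies on the tensor caster; the function and module names are illustrative, and the include path is an assumption:

#include <torch/torch.h>
#include <nanobind/nanobind.h>
#include "tensorrt_llm/nanobind/common/customCasters.h" // assumed path; provides the at::Tensor caster

namespace nb = nanobind;

// Doubles a tensor in place and returns it. With the THPVariable-based caster,
// the argument is unwrapped without a DLPack round-trip, so `t` aliases the
// caller's tensor and the returned object shares the same storage.
at::Tensor double_in_place(at::Tensor t)
{
    t.mul_(2);
    return t;
}

NB_MODULE(tensor_example, m)
{
    m.def("double_in_place", &double_in_place);
}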

cpp/tensorrt_llm/nanobind/executor/executor.cpp

Lines changed: 24 additions & 40 deletions
@@ -52,58 +52,37 @@ struct dtype_traits<half>

 namespace
 {
-// todo: Properly support FP8 and BF16 and verify functionality
-tle::Tensor numpyToTensor(nb::ndarray<nb::numpy> const& array)
+tle::Tensor numpyToTensor(nb::object const& object)
 {
-    auto npDtype = array.dtype();
-    char kind = '\0';
-    switch (npDtype.code)
-    {
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Int):
-        kind = 'i'; // signed integer
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::UInt):
-        kind = 'u'; // unsigned integer
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Float):
-        kind = 'f'; // floating point
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Bfloat):
-        kind = 'f'; // brain floating point (treat as float kind)
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Complex):
-        kind = 'c'; // complex
-        break;
-    default:
-        kind = 'V'; // void/other
-        break;
-    }
+    std::string dtype_name = nb::cast<std::string>(object.attr("dtype").attr("name"));
+    nb::object metadata = object.attr("dtype").attr("metadata");
+
     tle::DataType dtype;
-    if (npDtype == nb::dtype<half>())
+    if (dtype_name == "float16")
     {
         dtype = tle::DataType::kFP16;
     }
-    else if (npDtype == nb::dtype<float>())
+    else if (dtype_name == "float32")
     {
         dtype = tle::DataType::kFP32;
     }
-    else if (npDtype == nb::dtype<int8_t>())
+    else if (dtype_name == "int8")
     {
         dtype = tle::DataType::kINT8;
     }
-    else if (npDtype == nb::dtype<int32_t>())
+    else if (dtype_name == "int32")
     {
         dtype = tle::DataType::kINT32;
     }
-    else if (npDtype == nb::dtype<int64_t>())
+    else if (dtype_name == "int64")
     {
         dtype = tle::DataType::kINT64;
     }
-    else if (kind == 'V' && array.itemsize() == 1)
+    else if (dtype_name == "void8" && !metadata.is_none() && nb::cast<std::string>(metadata["dtype"]) == "float8")
     {
         dtype = tle::DataType::kFP8;
     }
-    else if (kind == 'V' && array.itemsize() == 2)
+    else if (dtype_name == "void16" && !metadata.is_none() && nb::cast<std::string>(metadata["dtype"]) == "bfloat16")
     {
         dtype = tle::DataType::kBF16;
     }

@@ -112,16 +91,21 @@ tle::Tensor numpyToTensor(nb::ndarray<nb::numpy> const& array)
         TLLM_THROW("Unsupported numpy dtype.");
     }

-    // todo: improve the following code
+    nb::object array_interface = object.attr("__array_interface__");
+    nb::object shape_obj = array_interface["shape"];
     std::vector<int64_t> dims;
-    dims.reserve(array.ndim());
-    for (size_t i = 0; i < array.ndim(); ++i)
+    dims.reserve(nb::len(shape_obj));
+
+    for (size_t i = 0; i < nb::len(shape_obj); ++i)
     {
-        dims.push_back(static_cast<int64_t>(array.shape(i)));
+        dims.push_back(nb::cast<int64_t>(shape_obj[i]));
     }
-    tle::Shape shape(dims.data(), dims.size());

-    return tle::Tensor::of(dtype, const_cast<void*>(array.data()), shape);
+    nb::object data_obj = array_interface["data"];
+    uintptr_t addr = nb::cast<uintptr_t>(data_obj[0]);
+    void* data_ptr = reinterpret_cast<void*>(addr);
+    tle::Shape shape(dims.data(), dims.size());
+    return tle::Tensor::of(dtype, data_ptr, shape);
 }

 } // namespace

@@ -153,8 +137,8 @@ Executor::Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigS
     for (auto const& [rawName, rawArray] : managedWeights.value())
     {
         std::string name = nb::cast<std::string>(rawName);
-        nb::ndarray<nb::numpy> array = nb::cast<nb::ndarray<nb::numpy>>(rawArray);
-        managedWeightsMap->emplace(name, numpyToTensor(array));
+        nb::object array_obj = nb::cast<nb::object>(rawArray);
+        managedWeightsMap->emplace(name, numpyToTensor(array_obj));
     }
 }
 mExecutor = std::make_unique<tle::Executor>(
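The rewritten `numpyToTensor` drops `nb::ndarray` and instead reads numpy's `__array_interface__` protocol, which also lets it consult `dtype.metadata` to identify custom void dtypes that carry their real type (float8, bfloat16) in metadata. A sketch of the protocol's relevant fields, as a standalone helper; the function name is illustrative and the object is assumed to wrap a numpy array:

#include <cstdint>
#include <cstdio>
#include <string>
#include <nanobind/nanobind.h>
#include <nanobind/stl/string.h>

namespace nb = nanobind;

void describeArray(nb::object const& obj)
{
    nb::object iface = obj.attr("__array_interface__");

    // "shape" is a tuple of ints describing the dimensions.
    nb::object shape = iface["shape"];
    size_t ndim = nb::len(shape);

    // "data" is a (pointer, read_only) pair; the first element is the
    // integer address of the first array element.
    nb::object data = iface["data"];
    auto addr = nb::cast<std::uintptr_t>(data[0]);
    bool readOnly = nb::cast<bool>(data[1]);

    // "typestr" encodes byte order, kind, and item size, e.g. "<f4".
    auto typestr = nb::cast<std::string>(iface["typestr"]);

    std::printf("ndim=%zu addr=%p read_only=%d typestr=%s\n", ndim,
        reinterpret_cast<void*>(addr), int(readOnly), typestr.c_str());
}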
