Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/vpux_compiler/include/vpux/compiler/compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,9 +100,16 @@ class CompilerImpl final : public intel_npu::ICompiler {
intel_npu::NetworkDescription compileWsIterative(const std::shared_ptr<ov::Model>& model,
const intel_npu::Config& config, size_t callIdx) const override;

// WS CiD-specific methods
// WS VCL-specific methods

/// @brief Sequentially compiles Init and Main schedules. The Main schedule is always last.
/// @brief Returns Init schedules and Main in a single call. The blobs are allocated using the provided allocator.
/// There is always exactly one Main schedule, placed at the back of the vector.
std::vector<std::shared_ptr<NetworkDescriptionView>> compileWsOneShot(const std::shared_ptr<ov::Model>& model,
const intel_npu::Config& config,
BlobAllocator& allocator) const;

/// @brief Sequentially compiles Init and Main schedules. The blob is allocated using the provided allocator. The
/// Main schedule is always last.
NetworkDescriptionView compileWsIterative(const std::shared_ptr<ov::Model>& model, const intel_npu::Config& config,
size_t callIdx, BlobAllocator& allocator) const;
};
Expand Down
35 changes: 35 additions & 0 deletions src/vpux_compiler/src/compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1204,6 +1204,41 @@ std::vector<std::shared_ptr<intel_npu::NetworkDescription>> CompilerImpl::compil
return networkDescrs;
}

std::vector<std::shared_ptr<NetworkDescriptionView>> CompilerImpl::compileWsOneShot(
        const std::shared_ptr<ov::Model>& model, const intel_npu::Config& config, BlobAllocator& allocator) const {
    OV_ITT_SCOPED_TASK(itt::domains::VPUXPlugin, "CompilerImpl::compileWsOneShot");
    checkPlatformSupportedForCompilation(config.get<intel_npu::PLATFORM>());

    Logger log("vpux-compiler", getLogLevel(config));
    log.info("Start oneshot WS compilation");

    auto setup = CompilerSetup::create(config);

    // Compile every schedule (Inits + Main) in one shot for the (possibly debatched) model.
    using CompilationReturnType = std::vector<CompilationResult>;
    auto oneShotCompile = [&](const std::shared_ptr<ov::Model>& debatchedModel,
                              const std::vector<std::shared_ptr<const ov::Node>>& originalParameters,
                              const std::vector<std::shared_ptr<const ov::Node>>& originalResults,
                              const intel_npu::Config& debatchedConfig) -> CompilationReturnType {
        return ws::compileImplWsOneShot(setup, originalParameters, originalResults, debatchedModel, debatchedConfig,
                                        log);
    };
    auto [schedules, finalConfig] =
            tryCompileDebatchedModel<CompilationReturnType>(model, config, log, oneShotCompile);

    OV_ITT_TASK_CHAIN(COMPILER_IMPLEMENTATION, itt::domains::VPUXPlugin, "CompilerImpl::compileWsOneShot",
                      "exportNetwork");
    // Export each compiled schedule into an allocator-backed blob view.
    std::vector<std::shared_ptr<NetworkDescriptionView>> descriptionViews;
    descriptionViews.reserve(schedules.size());
    for (const auto& schedule : schedules) {
        descriptionViews.emplace_back(std::make_shared<NetworkDescriptionView>(
                exportNetwork(schedule.moduleOp.get(), finalConfig, log, allocator)));
    }
    OV_ITT_TASK_SKIP(COMPILER_IMPLEMENTATION);

    // Plugin will collect the compilation memory usage
    return descriptionViews;
}

intel_npu::NetworkDescription CompilerImpl::compileWsIterative(const std::shared_ptr<ov::Model>& model,
const intel_npu::Config& config, size_t callIdx) const {
OV_ITT_SCOPED_TASK(itt::domains::VPUXPlugin, "CompilerImpl::compileWsIterative");
Expand Down
6 changes: 6 additions & 0 deletions src/vpux_driver_compiler/CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
Change Log:
-----------
VPUXCompilerL0 7.6.0:
- Added a new weightless compilation API:
    vclAllocatedExecutableCreateWSOneShot
    saveVclAllocatorBlobWS
- Updated compilerTest to support the new weightless compilation API

VPUXCompilerL0 7.5.0:
- Added support for the "USE_BASE_MODEL_SERIALIZER" config option. A new deserializer has been implemented,
one that matches the "no weights copy" serializer. This option can be used to switch between the two implementations.
Expand Down
6 changes: 5 additions & 1 deletion src/vpux_driver_compiler/include/npu_driver_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ extern "C" {
#endif

#define VCL_COMPILER_VERSION_MAJOR 7
#define VCL_COMPILER_VERSION_MINOR 5
#define VCL_COMPILER_VERSION_MINOR 6
#define VCL_PROFILING_VERSION_MAJOR 2
#define VCL_PROFILING_VERSION_MINOR 0

Expand Down Expand Up @@ -268,6 +268,10 @@ VCL_APIEXPORT vcl_result_t VCL_APICALL vclAllocatedExecutableCreate2(vcl_compile
vcl_allocator2_t* allocator, uint8_t** blobBuffer,
uint64_t* blobSize);

VCL_APIEXPORT vcl_result_t VCL_APICALL vclAllocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler,
vcl_executable_desc_t desc,
vcl_allocator2_t* allocator);

///////////////////////////////////////////////////////////////////////////////
/// @brief Destroys the executable and releases the cached blob.
VCL_APIEXPORT vcl_result_t VCL_APICALL vclExecutableDestroy(vcl_executable_handle_t executable);
Expand Down
12 changes: 12 additions & 0 deletions src/vpux_driver_compiler/include/vcl_compiler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,18 @@ class VPUXCompilerL0 final {
*/
vpux::NetworkDescriptionView importNetwork(BuildInfo& buildInfo, vpux::BlobAllocator& allocator);

/**
* @brief Use VPUX MLIR compiler to create one shot weight-separated blob with user info
* @note Blob storage is allocated via given allocator
*
* @param buildInfo Include the model data, ioInfo, compilation configs
* @param allocator Allocator for blob storage allocation
 * @return std::vector<std::shared_ptr<vpux::NetworkDescriptionView>> Non-owning
 * views into the compiled blobs and their metadata
*/
std::vector<std::shared_ptr<vpux::NetworkDescriptionView>> importNetworkWSOneShot(BuildInfo& buildInfo,
vpux::BlobAllocator& allocator);

/**
* @brief Check if a model can be supported by current compiler
*
Expand Down
50 changes: 50 additions & 0 deletions src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_bridge.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,51 @@ vcl_result_t allocatedExecutableCreate(vcl_compiler_handle_t compiler, vcl_execu
return VCL_RESULT_SUCCESS;
}

vcl_result_t allocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc,
                                                vcl_allocator2_t* allocator) {
    // Reject null handles and a missing serialized model up front.
    if (compiler == nullptr || allocator == nullptr || desc.modelIRData == nullptr) {
        return VCL_RESULT_ERROR_INVALID_ARGUMENT;
    }

    auto* compilerImpl = reinterpret_cast<VPUXDriverCompiler::VPUXCompilerL0*>(compiler);
    auto* logger = compilerImpl->getLogger();

    /// Copy the options into a std::string to avoid reading past the (possibly non-terminated) buffer
    const std::string descOptions(desc.options, desc.optionsSize);
    logger->info("config: {0}", descOptions);

    /// Build-info parser: holds the input/output settings and compilation configs
    VPUXDriverCompiler::BuildInfo buildInfo(compilerImpl);

    /// Parse the user description string (I/O info + config options)
    const vcl_result_t flagsStatus = buildInfo.prepareBuildFlags(descOptions);
    if (flagsStatus != VCL_RESULT_SUCCESS) {
        logger->outputError(formatv("Failed to prepare io info and config! DescOptions: {0}", descOptions));
        return flagsStatus;
    }

    /// Parse the serialized model data and create the model container for the compiler
    const vcl_result_t modelStatus = buildInfo.prepareModel(desc.modelIRData, desc.modelIRSize);
    if (modelStatus != VCL_RESULT_SUCCESS) {
        logger->outputError("Failed to parse model info! Incorrect format!");
        return modelStatus;
    }

    try {
        // The returned NetworkDescriptionView vector (blob views + metadata) is not
        // consumed at the VCL layer; it is only inspected for emptiness before being
        // destroyed when this scope ends.
        VCLBlobAllocator blobAllocator{allocator};
        const auto descriptions = compilerImpl->importNetworkWSOneShot(buildInfo, blobAllocator);
        if (descriptions.empty()) {
            logger->warning("Compiler successfully returned but the blob list is empty!");
        }
    } catch (const std::exception& error) {
        logger->outputError(formatv("Compiler returned msg:\n{0}", error.what()));
        return VCL_RESULT_ERROR_INVALID_ARGUMENT;
    } catch (...) {
        logger->outputError("Internal exception! Can't compile model!");
        return VCL_RESULT_ERROR_INVALID_ARGUMENT;
    }
    return VCL_RESULT_SUCCESS;
}

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -335,6 +380,11 @@ DLLEXPORT vcl_result_t vclAllocatedExecutableCreate(vcl_compiler_handle_t compil
return allocatedExecutableCreate(compiler, desc, allocator, blob, size);
}

// Exported C-ABI entry point for one-shot weight-separated compilation;
// thin shim that forwards all arguments to the internal implementation.
DLLEXPORT vcl_result_t vclAllocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc,
                                                             vcl_allocator2_t* allocator) {
    return allocatedExecutableCreateWSOneShot(compiler, desc, allocator);
}

DLLEXPORT vcl_result_t vclExecutableGetSerializableBlob(vcl_executable_handle_t executable, uint8_t* blobBuffer,
uint64_t* blobSize) {
vcl_result_t ret = VCL_RESULT_SUCCESS;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,12 @@ vcl_result_t BuildInfo::prepareConfig(const std::string& descOptions) {
/// Update maxtiles config with compiler desc
/// - If deviceDesc is valid, compare its tileCount with user config and use the smaller value
/// - If deviceDesc is empty, its tileCount is invalid, then just handle it according to the config
if (!pvc->isDeviceDescEmpty()) {
/// - If deviceDesc's tileCount is invalid, then skip updating maxtiles
if (!pvc->isDeviceDescEmpty() && (deviceDesc.tileCount != static_cast<uint32_t>(-1))) {
config[ov::intel_npu::max_tiles.name()] = getValidTileValue(config, deviceDesc);
logger->debug("NPU_MAX_TILES is updated to {0}", config[ov::intel_npu::max_tiles.name()]);
} else {
logger->debug("DeviceDesc is empty or tileCount is invalid, skip updating NPU_MAX_TILES");
}

/// When we use LOG_INFO, show vcl level profiling log
Expand Down
26 changes: 26 additions & 0 deletions src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -357,6 +357,32 @@ NetworkDescriptionView VPUXCompilerL0::importNetwork(BuildInfo& buildInfo, BlobA
});
}

std::vector<std::shared_ptr<NetworkDescriptionView>> VPUXCompilerL0::importNetworkWSOneShot(BuildInfo& buildInfo,
                                                                                            BlobAllocator& allocator) {
    StopWatch stopWatch;
    if (buildInfo.enableProfiling) {
        // Output time cost on vcl level
        stopWatch.start();
    }

    // RAII guard: reports the total compile time on every exit path (including exceptions).
    auto scoped = Scoped{[&stopWatch, &buildInfo, this]() {
        if (buildInfo.enableProfiling) {
            stopWatch.stop();
            _logger->info("Compile net time: {0} ms", stopWatch.delta_ms());
        }
    }};

    // Initialize at first use: apply the user-requested precision/layout overrides
    // to the parsed model before handing it to the compiler.
    const auto model = preprocessModel(buildInfo.model, buildInfo.inputPrecisions, buildInfo.outputPrecisions,
                                       buildInfo.inputLayouts, buildInfo.outputLayouts, _logger);

    // Isolate the MLIR thread to safely destroy MLIR thread_local objects before CiD unload
    return run_in_worker_thread_sync([&] {
        return _compiler->compileWsOneShot(model, buildInfo.parsedConfig, allocator);
    });
}

vcl_result_t VPUXCompilerL0::queryNetwork(const BuildInfo& buildInfo, VPUXQueryNetworkL0* pQueryNetwork) {
_logger->info("Start to call query function from compiler to get supported layers!");
ov::SupportedOpsMap queryNetworkResult;
Expand Down
1 change: 1 addition & 0 deletions src/vpux_driver_compiler/test/smoke/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ target_link_libraries(compilerTest
npu_driver_compiler # VCL function
npu_ov_utils # OV core function
npu_llvm_utils # CL command line
openvino::npu_logger_utils # logging utils
)
ov_add_api_validator_post_build_step(TARGET compilerTest)

Expand Down
Loading
Loading