Skip to content

Commit f1e04ef

Browse files
Add vclAllocatedExecutableCreateWSOneShot (#222)
1 parent a1ae54e commit f1e04ef

File tree

11 files changed

+852
-6
lines changed

11 files changed

+852
-6
lines changed

src/vpux_compiler/include/vpux/compiler/compiler.hpp

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,16 @@ class CompilerImpl final : public intel_npu::ICompiler {
100100
intel_npu::NetworkDescription compileWsIterative(const std::shared_ptr<ov::Model>& model,
101101
const intel_npu::Config& config, size_t callIdx) const override;
102102

103-
// WS CiD-specific methods
103+
// WS VCL-specific methods
104104

105-
/// @brief Sequentially compiles Init and Main schedules. The Main schedule is always last.
105+
/// @brief Compiles all Init schedules and the Main schedule in a single call. The blobs are allocated using the provided allocator.
106+
/// There is always exactly one Main schedule, placed at the back of the vector.
107+
std::vector<std::shared_ptr<NetworkDescriptionView>> compileWsOneShot(const std::shared_ptr<ov::Model>& model,
108+
const intel_npu::Config& config,
109+
BlobAllocator& allocator) const;
110+
111+
/// @brief Sequentially compiles Init and Main schedules. The blob is allocated using the provided allocator. The
112+
/// Main schedule is always last.
106113
NetworkDescriptionView compileWsIterative(const std::shared_ptr<ov::Model>& model, const intel_npu::Config& config,
107114
size_t callIdx, BlobAllocator& allocator) const;
108115
};

src/vpux_compiler/src/compiler.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1204,6 +1204,41 @@ std::vector<std::shared_ptr<intel_npu::NetworkDescription>> CompilerImpl::compil
12041204
return networkDescrs;
12051205
}
12061206

1207+
/// One-shot weight-separated (WS) compilation entry point.
/// Compiles `model` through ws::compileImplWsOneShot and exports every returned compilation result as a
/// NetworkDescriptionView, passing `allocator` to exportNetwork for each of them.
/// @param model      model to compile
/// @param config     compilation config; also used to pick the platform and the log level
/// @param allocator  allocator handed to exportNetwork for every exported schedule
/// @return one view per compilation result, in the order the results were produced
///         (the declaration documents that the Main schedule is the last element)
std::vector<std::shared_ptr<NetworkDescriptionView>> CompilerImpl::compileWsOneShot(
1208+
const std::shared_ptr<ov::Model>& model, const intel_npu::Config& config, BlobAllocator& allocator) const {
1209+
OV_ITT_SCOPED_TASK(itt::domains::VPUXPlugin, "CompilerImpl::compileWsOneShot");
1210+
// NOTE(review): presumably rejects unsupported platforms by throwing - confirm against the helper
checkPlatformSupportedForCompilation(config.get<intel_npu::PLATFORM>());
1211+
1212+
Logger log("vpux-compiler", getLogLevel(config));
1213+
log.info("Start oneshot WS compilation");
1214+
1215+
auto setup = CompilerSetup::create(config);
1216+
1217+
using CompilationReturnType = std::vector<CompilationResult>;
1218+
// Callback given to tryCompileDebatchedModel: forwards the model/config it receives, together with the
// original parameters and results, to the one-shot WS compilation routine
auto getCompilationResult = [&](const std::shared_ptr<ov::Model>& debatchedModel,
1219+
const std::vector<std::shared_ptr<const ov::Node>>& originalParameters,
1220+
const std::vector<std::shared_ptr<const ov::Node>>& originalResults,
1221+
const intel_npu::Config& debatchedConfig) -> CompilationReturnType {
1222+
return ws::compileImplWsOneShot(setup, originalParameters, originalResults, debatchedModel, debatchedConfig,
1223+
log);
1224+
};
1225+
// compiledConfig (not the incoming `config`) is the one used for the export step below
auto [compilationResults, compiledConfig] =
1226+
tryCompileDebatchedModel<CompilationReturnType>(model, config, log, getCompilationResult);
1227+
1228+
OV_ITT_TASK_CHAIN(COMPILER_IMPLEMENTATION, itt::domains::VPUXPlugin, "CompilerImpl::compileWsOneShot",
1229+
"exportNetwork");
1230+
std::vector<std::shared_ptr<NetworkDescriptionView>> networkDescrs;
1231+
networkDescrs.reserve(compilationResults.size());
1232+
// Export the results in order, so the output vector mirrors the order of compilationResults
for (const auto& result : compilationResults) {
1233+
networkDescrs.emplace_back(std::make_shared<NetworkDescriptionView>(
1234+
exportNetwork(result.moduleOp.get(), compiledConfig, log, allocator)));
1235+
}
1236+
OV_ITT_TASK_SKIP(COMPILER_IMPLEMENTATION);
1237+
1238+
// Plugin will collect the compilation memory usage
1239+
return networkDescrs;
1240+
}
1241+
12071242
intel_npu::NetworkDescription CompilerImpl::compileWsIterative(const std::shared_ptr<ov::Model>& model,
12081243
const intel_npu::Config& config, size_t callIdx) const {
12091244
OV_ITT_SCOPED_TASK(itt::domains::VPUXPlugin, "CompilerImpl::compileWsIterative");

src/vpux_driver_compiler/CHANGES.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
Change Log:
22
-----------
3+
VPUXCompilerL0 7.6.0:
4+
- Add new weightless compilation API:
5+
vclAllocatedExecutableCreateWSOneShot
6+
saveVclAllocatorBlobWS
7+
- Update compilerTest to support the new weightless compilation API
8+
39
VPUXCompilerL0 7.5.0:
410
- Added support for the "USE_BASE_MODEL_SERIALIZER" config option. A new deserializer has been implemented,
511
one that matches the "no weights copy" serializer. This option can be used to switch between the two implementations.

src/vpux_driver_compiler/include/npu_driver_compiler.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ extern "C" {
2323
#endif
2424

2525
#define VCL_COMPILER_VERSION_MAJOR 7
26-
#define VCL_COMPILER_VERSION_MINOR 5
26+
#define VCL_COMPILER_VERSION_MINOR 6
2727
#define VCL_PROFILING_VERSION_MAJOR 2
2828
#define VCL_PROFILING_VERSION_MINOR 0
2929

@@ -268,6 +268,10 @@ VCL_APIEXPORT vcl_result_t VCL_APICALL vclAllocatedExecutableCreate2(vcl_compile
268268
vcl_allocator2_t* allocator, uint8_t** blobBuffer,
269269
uint64_t* blobSize);
270270

271+
///////////////////////////////////////////////////////////////////////////////
/// @brief Compiles the model described by desc in one-shot weight-separated (WS) mode.
/// @note Unlike vclAllocatedExecutableCreate2 there are no blob buffer/size output parameters; blob storage
/// is requested through the given allocator.
/// @return VCL_RESULT_SUCCESS on success. VCL_RESULT_ERROR_INVALID_ARGUMENT if compiler, allocator or
/// desc.modelIRData is null, or if the compilation raises an exception. Errors from parsing the build options
/// or the model are returned as reported by the parser.
VCL_APIEXPORT vcl_result_t VCL_APICALL vclAllocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler,
272+
vcl_executable_desc_t desc,
273+
vcl_allocator2_t* allocator);
274+
271275
///////////////////////////////////////////////////////////////////////////////
272276
/// @brief Destroys the executable and releases the cached blob.
273277
VCL_APIEXPORT vcl_result_t VCL_APICALL vclExecutableDestroy(vcl_executable_handle_t executable);

src/vpux_driver_compiler/include/vcl_compiler.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,18 @@ class VPUXCompilerL0 final {
105105
*/
106106
vpux::NetworkDescriptionView importNetwork(BuildInfo& buildInfo, vpux::BlobAllocator& allocator);
107107

108+
/**
109+
* @brief Use the VPUX MLIR compiler to create one-shot weight-separated blobs from the user-provided build info
110+
* @note Blob storage is allocated via the given allocator
111+
*
112+
* @param buildInfo Holds the model data, ioInfo and compilation configs
113+
* @param allocator Allocator used for blob storage allocation
114+
* @return std::vector<std::shared_ptr<vpux::NetworkDescriptionView>> Non-owning
115+
* views into the blobs together with their metadata
116+
*/
117+
std::vector<std::shared_ptr<vpux::NetworkDescriptionView>> importNetworkWSOneShot(BuildInfo& buildInfo,
118+
vpux::BlobAllocator& allocator);
119+
108120
/**
109121
* @brief Check if a model can be supported by current compiler
110122
*

src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_bridge.cpp

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,51 @@ vcl_result_t allocatedExecutableCreate(vcl_compiler_handle_t compiler, vcl_execu
100100
return VCL_RESULT_SUCCESS;
101101
}
102102

103+
/// Implementation behind vclAllocatedExecutableCreateWSOneShot: parses the build options and the serialized
/// model from desc, then runs the one-shot weight-separated compilation with blob storage requested through
/// the given allocator.
/// @param compiler   compiler handle (a VPUXCompilerL0 instance)
/// @param desc       executable description: serialized model (modelIRData/modelIRSize) and options string
/// @param allocator  user allocator wrapped into a VCLBlobAllocator for the compilation
/// @return VCL_RESULT_SUCCESS on success; VCL_RESULT_ERROR_INVALID_ARGUMENT for null/inconsistent arguments or
///         when the compilation throws; otherwise the error reported by prepareBuildFlags/prepareModel
vcl_result_t allocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc,
104+
vcl_allocator2_t* allocator) {
105+
// A null options pointer with a non-zero size must be rejected here: the std::string construction below
// happens outside the try block and requires a valid [options, options + optionsSize) range
if (!compiler || !allocator || !desc.modelIRData || (!desc.options && desc.optionsSize != 0)) {
106+
return VCL_RESULT_ERROR_INVALID_ARGUMENT;
107+
}
108+
109+
VPUXDriverCompiler::VPUXCompilerL0* pCompiler = reinterpret_cast<VPUXDriverCompiler::VPUXCompilerL0*>(compiler);
110+
VPUXDriverCompiler::VCLLogger* vclLogger = pCompiler->getLogger();
111+
112+
/// To avoid access violation, need to convert to string
113+
std::string descOptions(desc.options, desc.optionsSize);
114+
vclLogger->info("config: {0}", descOptions);
115+
116+
/// Create info parser
117+
VPUXDriverCompiler::BuildInfo buildInfo(pCompiler);
118+
/// Parse user descriptions and store the input && output settings, compilation configs
119+
if (auto ret = buildInfo.prepareBuildFlags(descOptions); ret != VCL_RESULT_SUCCESS) {
120+
vclLogger->outputError(formatv("Failed to prepare io info and config! DescOptions: {0}", descOptions));
121+
return ret;
122+
}
123+
124+
/// Parse serialized model data and create the model container for compiler
125+
if (auto ret = buildInfo.prepareModel(desc.modelIRData, desc.modelIRSize); ret != VCL_RESULT_SUCCESS) {
126+
vclLogger->outputError("Failed to parse model info! Incorrect format!");
127+
return ret;
128+
}
129+
130+
try {
131+
// The returned views (blob views plus NetworkMetadata) are not used by VCL, so `result`
132+
// is simply dropped when this scope ends; only its emptiness is checked to emit a warning
133+
VCLBlobAllocator vcl_allocator{allocator};
134+
auto result = pCompiler->importNetworkWSOneShot(buildInfo, vcl_allocator);
135+
if (result.empty()) {
136+
vclLogger->warning("Compiler successfully returned but the blob list is empty!");
137+
}
138+
} catch (const std::exception& error) {
139+
vclLogger->outputError(formatv("Compiler returned msg:\n{0}", error.what()));
140+
return VCL_RESULT_ERROR_INVALID_ARGUMENT;
141+
} catch (...) {
142+
vclLogger->outputError("Internal exception! Can't compile model!");
143+
return VCL_RESULT_ERROR_INVALID_ARGUMENT;
144+
}
145+
return VCL_RESULT_SUCCESS;
146+
}
147+
103148
#ifdef __cplusplus
104149
extern "C" {
105150
#endif
@@ -335,6 +380,11 @@ DLLEXPORT vcl_result_t vclAllocatedExecutableCreate(vcl_compiler_handle_t compil
335380
return allocatedExecutableCreate(compiler, desc, allocator, blob, size);
336381
}
337382

383+
/// Exported C entry point for one-shot weight-separated compilation; forwards all arguments unchanged to
/// allocatedExecutableCreateWSOneShot and returns its result.
DLLEXPORT vcl_result_t vclAllocatedExecutableCreateWSOneShot(vcl_compiler_handle_t compiler, vcl_executable_desc_t desc,
384+
vcl_allocator2_t* allocator) {
385+
return allocatedExecutableCreateWSOneShot(compiler, desc, allocator);
386+
}
387+
338388
DLLEXPORT vcl_result_t vclExecutableGetSerializableBlob(vcl_executable_handle_t executable, uint8_t* blobBuffer,
339389
uint64_t* blobSize) {
340390
vcl_result_t ret = VCL_RESULT_SUCCESS;

src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_common.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -443,9 +443,12 @@ vcl_result_t BuildInfo::prepareConfig(const std::string& descOptions) {
443443
/// Update maxtiles config with compiler desc
444444
/// - If deviceDesc is valid, compare its tileCount with user config and use the smaller value
445445
/// - If deviceDesc is empty, its tileCount is invalid, then just handle it according to the config
446-
if (!pvc->isDeviceDescEmpty()) {
446+
/// - If deviceDesc's tileCount is invalid (equal to static_cast<uint32_t>(-1)), skip updating maxtiles
447+
if (!pvc->isDeviceDescEmpty() && (deviceDesc.tileCount != static_cast<uint32_t>(-1))) {
447448
config[ov::intel_npu::max_tiles.name()] = getValidTileValue(config, deviceDesc);
448449
logger->debug("NPU_MAX_TILES is updated to {0}", config[ov::intel_npu::max_tiles.name()]);
450+
} else {
451+
logger->debug("DeviceDesc is empty or tileCount is invalid, skip updating NPU_MAX_TILES");
449452
}
450453

451454
/// When we use LOG_INFO, show vcl level profiling log

src/vpux_driver_compiler/src/vpux_compiler_l0/vcl_compiler.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,32 @@ NetworkDescriptionView VPUXCompilerL0::importNetwork(BuildInfo& buildInfo, BlobA
357357
});
358358
}
359359

360+
// Preprocesses buildInfo.model with the requested input/output precisions and layouts, then runs the
// one-shot weight-separated compilation (compileWsOneShot) synchronously in a worker thread.
// `allocator` is forwarded to the compiler for blob storage. When buildInfo.enableProfiling is set,
// the elapsed time is logged at info level.
std::vector<std::shared_ptr<NetworkDescriptionView>> VPUXCompilerL0::importNetworkWSOneShot(BuildInfo& buildInfo,
361+
BlobAllocator& allocator) {
362+
StopWatch stopWatch;
363+
if (buildInfo.enableProfiling) {
364+
// Output time cost on vcl level
365+
stopWatch.start();
366+
}
367+
368+
// NOTE(review): Scoped presumably invokes the callback when it goes out of scope, so the timing is
// reported on every exit path - confirm against the Scoped helper
auto scoped = Scoped{[&stopWatch, &buildInfo, this]() {
369+
if (buildInfo.enableProfiling) {
370+
stopWatch.stop();
371+
_logger->info("Compile net time: {0} ms", stopWatch.delta_ms());
372+
}
373+
}};
374+
375+
std::shared_ptr<ov::Model> model;
376+
377+
model = preprocessModel(buildInfo.model, buildInfo.inputPrecisions, buildInfo.outputPrecisions,
378+
buildInfo.inputLayouts, buildInfo.outputLayouts, _logger);
379+
380+
// Isolate the MLIR thread to safely destroy MLIR thread_local objects before CiD unload
381+
return run_in_worker_thread_sync([&] {
382+
return _compiler->compileWsOneShot(model, buildInfo.parsedConfig, allocator);
383+
});
384+
}
385+
360386
vcl_result_t VPUXCompilerL0::queryNetwork(const BuildInfo& buildInfo, VPUXQueryNetworkL0* pQueryNetwork) {
361387
_logger->info("Start to call query function from compiler to get supported layers!");
362388
ov::SupportedOpsMap queryNetworkResult;

src/vpux_driver_compiler/test/smoke/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ target_link_libraries(compilerTest
1717
npu_driver_compiler # VCL function
1818
npu_ov_utils # OV core function
1919
npu_llvm_utils # CL command line
20+
openvino::npu_logger_utils # logging utils
2021
)
2122
ov_add_api_validator_post_build_step(TARGET compilerTest)
2223

0 commit comments

Comments
 (0)