Skip to content
Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HeterogeneousCore/SonicCore/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
<use name="FWCore/Utilities"/>
<export>
<lib name="1"/>
</export>i
</export>
6 changes: 5 additions & 1 deletion HeterogeneousCore/SonicTriton/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,13 @@
<use name="HeterogeneousCore/CUDAUtilities"/>
<use name="triton-inference-client"/>
<use name="protobuf"/>
<use name="catch2"/>
<iftool name="cuda">
<use name="cuda"/>
</iftool>

<export>
<lib name="1"/>
<lib name="1"/>
</export>

<test name="RetryActionDiffServer_test" command="RetryActionDiffServer.cc"/>
33 changes: 33 additions & 0 deletions HeterogeneousCore/SonicTriton/interface/RetryActionDiffServer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#ifndef HeterogeneousCore_SonicTriton_RetryActionDiffServer_h
#define HeterogeneousCore_SonicTriton_RetryActionDiffServer_h

#include "HeterogeneousCore/SonicCore/interface/RetryActionBase.h"

/**
* @class RetryActionDiffServer
* @brief A concrete implementation of RetryActionBase that attempts to retry an inference
* request on a different, user-specified Triton server.
*
* This class is designed to provide a fallback mechanism. If an initial inference
* request fails (e.g., due to server unavailability or a model-specific error),
* this action will be triggered. It reads an alternative server URL from the
* ParameterSet and instructs the TritonClient to reconnect to this new server
* for the retry attempt. This action is designed for one-time use per inference
* call; after the retry attempt, it disables itself until the next `start()` call.
*/

class RetryActionDiffServer : public RetryActionBase {
public:
RetryActionDiffServer(const edm::ParameterSet& conf, SonicClientBase* client);
~RetryActionDiffServer() override = default;

void retry() override;
void start() override;

private:
std::string alt_server_url_;
std::string alt_server_token_;
};

#endif

220 changes: 117 additions & 103 deletions HeterogeneousCore/SonicTriton/interface/TritonClient.h
Original file line number Diff line number Diff line change
@@ -1,103 +1,117 @@
#ifndef HeterogeneousCore_SonicTriton_TritonClient
#define HeterogeneousCore_SonicTriton_TritonClient

#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/ServiceToken.h"
#include "HeterogeneousCore/SonicCore/interface/SonicClient.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonData.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonService.h"

#include <map>
#include <vector>
#include <string>
#include <exception>
#include <unordered_map>

#include "grpc_client.h"
#include "grpc_service.pb.h"

enum class TritonBatchMode { Rectangular = 1, Ragged = 2 };

class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {
public:
struct ServerSideStats {
uint64_t inference_count_;
uint64_t execution_count_;
uint64_t success_count_;
uint64_t cumm_time_ns_;
uint64_t queue_time_ns_;
uint64_t compute_input_time_ns_;
uint64_t compute_infer_time_ns_;
uint64_t compute_output_time_ns_;
};

//constructor
TritonClient(const edm::ParameterSet& params, const std::string& debugName);

//destructor
~TritonClient() override;

//accessors
unsigned batchSize() const;
TritonBatchMode batchMode() const { return batchMode_; }
bool verbose() const { return verbose_; }
bool useSharedMemory() const { return useSharedMemory_; }
void setUseSharedMemory(bool useShm) { useSharedMemory_ = useShm; }
bool setBatchSize(unsigned bsize);
void setBatchMode(TritonBatchMode batchMode);
void resetBatchMode();
void reset() override;
TritonServerType serverType() const { return serverType_; }
bool isLocal() const { return isLocal_; }

//for fillDescriptions
static void fillPSetDescription(edm::ParameterSetDescription& iDesc);

protected:
//helpers
bool noOuterDim() const { return noOuterDim_; }
unsigned outerDim() const { return outerDim_; }
unsigned nEntries() const;
void getResults(const std::vector<std::shared_ptr<triton::client::InferResult>>& results);
void evaluate() override;
template <typename F>
bool handle_exception(F&& call);

void reportServerSideStats(const ServerSideStats& stats) const;
void updateServer(std::string serverName);
ServerSideStats summarizeServerStats(const inference::ModelStatistics& start_status,
const inference::ModelStatistics& end_status) const;

inference::ModelStatistics getServerSideStatus() const;

//members
unsigned maxOuterDim_;
unsigned outerDim_;
bool noOuterDim_;
unsigned nEntries_;
TritonBatchMode batchMode_;
bool manualBatchMode_;
bool verbose_;
bool useSharedMemory_;
TritonServerType serverType_;
bool isLocal_;
grpc_compression_algorithm compressionAlgo_;
triton::client::Headers headers_;

std::unique_ptr<triton::client::InferenceServerGrpcClient> client_;
//stores timeout, model name and version
std::vector<triton::client::InferOptions> options_;
edm::ServiceToken token_;

private:
friend TritonInputData;
friend TritonOutputData;

//private accessors only used by data
auto client() { return client_.get(); }
void addEntry(unsigned entry);
void resizeEntries(unsigned entry);
};

#endif
#ifndef HeterogeneousCore_SonicTriton_TritonClient
#define HeterogeneousCore_SonicTriton_TritonClient

#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/ServiceRegistry/interface/ServiceToken.h"
#include "HeterogeneousCore/SonicCore/interface/SonicClient.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonData.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonService.h"

#include <map>
#include <vector>
#include <string>
#include <exception>
#include <unordered_map>

#include "grpc_client.h"
#include "grpc_service.pb.h"

enum class TritonBatchMode { Rectangular = 1, Ragged = 2 };

class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {
public:
struct ServerSideStats {
uint64_t inference_count_;
uint64_t execution_count_;
uint64_t success_count_;
uint64_t cumm_time_ns_;
uint64_t queue_time_ns_;
uint64_t compute_input_time_ns_;
uint64_t compute_infer_time_ns_;
uint64_t compute_output_time_ns_;
};

//constructor
TritonClient(const edm::ParameterSet& params, const std::string& debugName);

//destructor
~TritonClient() override;

//accessors
unsigned batchSize() const;
TritonBatchMode batchMode() const { return batchMode_; }
bool verbose() const { return verbose_; }
bool useSharedMemory() const { return useSharedMemory_; }
void setUseSharedMemory(bool useShm) { useSharedMemory_ = useShm; }
bool setBatchSize(unsigned bsize);
void setBatchMode(TritonBatchMode batchMode);
void resetBatchMode();
void reset() override;
TritonServerType serverType() const { return serverType_; }
bool isLocal() const { return isLocal_; }
virtual void connectToServer(const std::string& url);

//for fillDescriptions
static void fillPSetDescription(edm::ParameterSetDescription& iDesc);

protected:
/**
* @brief Constructor for unit testing purposes only.
*
* This constructor is provided to allow the creation of a TritonClient
* instance (or a mock derived from it) without needing the full CMSSW
* Service framework, which is required by the standard constructor.
* This is essential for writing isolated unit tests that do not depend
* on external services. It initializes the base SonicClient with dummy
* parameters.
* @param is_testing A boolean flag to select this constructor.
*/
TritonClient(bool is_testing);

//helpers
bool noOuterDim() const { return noOuterDim_; }
unsigned outerDim() const { return outerDim_; }
unsigned nEntries() const;
void getResults(const std::vector<std::shared_ptr<triton::client::InferResult>>& results);
virtual void evaluate() override;
template <typename F>
bool handle_exception(F&& call);

void reportServerSideStats(const ServerSideStats& stats) const;
void updateServer(std::string serverName);
ServerSideStats summarizeServerStats(const inference::ModelStatistics& start_status,
const inference::ModelStatistics& end_status) const;

inference::ModelStatistics getServerSideStatus() const;

//members
unsigned maxOuterDim_;
unsigned outerDim_;
bool noOuterDim_;
unsigned nEntries_;
TritonBatchMode batchMode_;
bool manualBatchMode_;
bool verbose_;
bool useSharedMemory_;
TritonServerType serverType_;
bool isLocal_;
grpc_compression_algorithm compressionAlgo_;
triton::client::Headers headers_;

std::unique_ptr<triton::client::InferenceServerGrpcClient> client_;
//stores timeout, model name and version
std::vector<triton::client::InferOptions> options_;
edm::ServiceToken token_;

private:
friend TritonInputData;
friend TritonOutputData;

//private accessors only used by data
auto client() { return client_.get(); }
void addEntry(unsigned entry);
void resizeEntries(unsigned entry);
};

#endif
46 changes: 46 additions & 0 deletions HeterogeneousCore/SonicTriton/src/RetryActionDiffServer.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#include "HeterogeneousCore/SonicTriton/interface/RetryActionDiffServer.h"
#include "HeterogeneousCore/SonicTriton/interface/TritonClient.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"

RetryActionDiffServer::RetryActionDiffServer(
const edm::ParameterSet& conf,
SonicClientBase* client
): RetryActionBase(conf, client) {
alt_server_url_ = conf.getUntrackedParameter<std::string>("altServerUrl", "");
alt_server_token_ = conf.getUntrackedParameter<std::string>("altServerToken", "");

if (this->alt_server_url_.empty()) {
edm::LogWarning("RetryActionDiffServer")
<< "No alternative server URL provided. "
<< "This retry action will be disabled.";
this->shouldRetry_ = false;
}
}

void RetryActionDiffServer::start() {
this->shouldRetry_ = true;
}

void RetryActionDiffServer::retry() {
if (!this->shouldRetry_ || this->alt_server_url_.empty()) {
this->shouldRetry_ = false;
edm::LogInfo("RetryActionDiffServer") << "No alternative server available for retry.";
return;
}

try {
TritonClient* tritonClient = static_cast<TritonClient*>(client_);
edm::LogInfo("RetryActionDiffServer")
<< "Attempting retry by switching to server: "
<< this->alt_server_url_;
tritonClient->connectToServer(this->alt_server_url_);
eval();
} catch (const std::exception& e) {
edm::LogError("RetryActionDiffServer")
<< "Failed to retry with alternative server: "
<< e.what();
}
this->shouldRetry_ = false;
}

DEFINE_RETRY_ACTION(RetryActionDiffServer);
23 changes: 23 additions & 0 deletions HeterogeneousCore/SonicTriton/src/TritonClient.cc
Original file line number Diff line number Diff line change
Expand Up @@ -598,3 +598,26 @@ void TritonClient::fillPSetDescription(edm::ParameterSetDescription& iDesc) {
descClient.addUntracked<std::vector<std::string>>("outputs", {});
iDesc.add<edm::ParameterSetDescription>("Client", descClient);
}

void TritonClient::connectToServer(const std::string& url) {
// Update client state for a generic remote server
serverType_ = TritonServerType::Remote;
isLocal_ = false;

edm::LogInfo("TritonDiscovery") << debugName_ << " connecting to server: " << url;

// Use default SSL options
triton::client::SslOptions sslOptions;
bool useSsl = false; // Assuming no SSL for direct URL connection

// Connect to the server
TRITON_THROW_IF_ERROR(
triton::client::InferenceServerGrpcClient::Create(&client_, url, false, useSsl, sslOptions),
"TritonClient::connectToServer(): unable to create inference context",
false // isLocal is false
);
}

//constructor for testing
TritonClient::TritonClient(bool /*is_testing*/) : SonicClient(edm::ParameterSet(), "TritonClient_test", "TritonClient") {}

1 change: 1 addition & 0 deletions HeterogeneousCore/SonicTriton/test/BuildFile.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<test name="TestHeterogeneousCoreSonicTritonProducerCPU" command="unittest.sh ${LOCALTOP} CPU"/>
<test name="TestHeterogeneousCoreSonicTritonProducerGPU" command="unittest.sh ${LOCALTOP} GPU"/>
<test name="TestHeterogeneousCoreSonicTritonRetryAction" command="unittest.sh ${LOCALTOP} CPU tritonRetryActionTest_cfg.py"/>
<test name="TestHeterogeneousCoreSonicTritonVersionCheck" command="cmsTritonConfigTool versioncheck">
<use name="cmsswdata"/>
</test>
Expand Down
Loading