-
Notifications
You must be signed in to change notification settings - Fork 47
feat: replace per-conversation ZMQ fallback with dataset client metadata request #742
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -26,12 +26,13 @@ | |
| WorkerHealthMessage, | ||
| ) | ||
| from aiperf.common.messages.dataset_messages import ( | ||
| ConversationRequestMessage, | ||
| ConversationResponseMessage, | ||
| DatasetClientRequestMessage, | ||
| DatasetClientResponseMessage, | ||
| ) | ||
| from aiperf.common.mixins import ProcessHealthMixin | ||
| from aiperf.common.models import ( | ||
| Conversation, | ||
| DatasetClientMetadata, | ||
| ErrorDetails, | ||
| ModelEndpointInfo, | ||
| ReasoningResponseData, | ||
|
|
@@ -197,10 +198,9 @@ def __init__( | |
| or self.user_config.loadgen.warmup_prefill_concurrency is not None | ||
| ) | ||
|
|
||
| # Only used as a fallback when dataset client is not initialized | ||
| # or was not available when the credit was dropped. Must be created here | ||
| # so it can be attached to the worker lifecycle. | ||
| self.conversation_request_client: RequestClientProtocol = ( | ||
| # Used to request dataset client metadata from DatasetManager when | ||
| # the dataset client is not yet initialized (race condition at startup). | ||
| self._dataset_client_request_client: RequestClientProtocol = ( | ||
| self.comms.create_request_client( | ||
| address=CommAddress.DATASET_MANAGER_PROXY_FRONTEND, | ||
| bind=False, | ||
|
|
@@ -221,24 +221,30 @@ async def _send_worker_ready_message(self) -> None: | |
|
|
||
| @on_message(MessageType.DATASET_CONFIGURED_NOTIFICATION) | ||
| async def _on_dataset_configured(self, msg: DatasetConfiguredNotification) -> None: | ||
| """Initialize dataset client when configuration is received. | ||
| """Initialize dataset client when configuration is received.""" | ||
| await self._initialize_dataset_client(msg.client_metadata) | ||
| self.debug( | ||
| lambda: ( | ||
| f"Dataset client initialized: type={msg.client_metadata.client_type}" | ||
| ) | ||
| ) | ||
|
|
||
| async def _initialize_dataset_client( | ||
| self, client_metadata: DatasetClientMetadata | ||
| ) -> None: | ||
| """Create and initialize the dataset client from metadata. | ||
|
|
||
| Uses factory pattern to dynamically create the appropriate client. | ||
| The factory auto-extracts client_type from client_metadata, leveraging | ||
| the discriminated union pattern for type-safe routing. This allows new | ||
| storage backends (S3, Redis, etc.) to work without modifying Worker code. | ||
| """ | ||
| ClientStoreClass = plugins.get_class( | ||
| PluginType.DATASET_CLIENT_STORE, msg.client_metadata.client_type | ||
| PluginType.DATASET_CLIENT_STORE, client_metadata.client_type | ||
| ) | ||
| self._dataset_client = ClientStoreClass(client_metadata=msg.client_metadata) | ||
| self._dataset_client = ClientStoreClass(client_metadata=client_metadata) | ||
| await self._dataset_client.initialize() | ||
| self._dataset_configured_event.set() | ||
|
Comment on lines
+232
to
247
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🏁 Script executed: wc -l src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 92 🏁 Script executed: # Read the _initialize_dataset_client method and surrounding context
sed -n '230,250p' src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 951 🏁 Script executed: # Read the lines 580-586 where client is used
sed -n '575,590p' src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 794 🏁 Script executed: # Read the lines 588-605 fallback request
sed -n '585,610p' src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 1147 🏁 Script executed: # Search for _on_dataset_configured method
rg -n "_on_dataset_configured" src/aiperf/workers/worker.py -A 10Repository: ai-dynamo/aiperf Length of output: 578 🏁 Script executed: # Search for _dataset_configured_event usage
rg -n "_dataset_configured_event" src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 224 🏁 Script executed: # Check line 673 context to see how the event is used for synchronization
sed -n '670,685p' src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 761 🏁 Script executed: # Check if there's any lock mechanism used with _dataset_client
rg -n "asyncio.Lock\|RLock\|Semaphore\|_dataset_client" src/aiperf/workers/worker.py | head -30Repository: ai-dynamo/aiperf Length of output: 42 🏁 Script executed: # Get full context of the on_pull_message handler to see event wait usage
rg -n "@on_pull_message\|_dataset_configured_event" src/aiperf/workers/worker.py -B 2 -A 5Repository: ai-dynamo/aiperf Length of output: 42 🏁 Script executed: # Search for methods that call get_conversation to understand when it's invoked
rg -n "get_conversation" src/aiperf/workers/worker.py -B 3 -A 1Repository: ai-dynamo/aiperf Length of output: 565 🏁 Script executed: # Check the on_message handlers and their concurrency
rg -n "@on_message\|@on_pull_message\|@on_start\|@on_init" src/aiperf/workers/worker.py | head -20Repository: ai-dynamo/aiperf Length of output: 42 🏁 Script executed: # Get more context around line 580 to see the full get_conversation method
sed -n '570,595p' src/aiperf/workers/worker.pyRepository: ai-dynamo/aiperf Length of output: 1219 🏁 Script executed: # Check if there's any initialization guard or lock used elsewhere with _dataset_client
rg -n "self._dataset_client" src/aiperf/workers/worker.py | head -20Repository: ai-dynamo/aiperf Length of output: 773 Fix race condition in dataset client initialization. The assignment of 🤖 Prompt for AI Agents |
||
| self.debug( | ||
| lambda: ( | ||
| f"Dataset client initialized: type={msg.client_metadata.client_type}" | ||
| ) | ||
| ) | ||
|
|
||
| @on_stop | ||
| async def _send_worker_shutdown_message(self) -> None: | ||
|
|
@@ -576,52 +582,32 @@ async def _retrieve_conversation( | |
| elif self.stop_requested: | ||
| raise asyncio.CancelledError("Stop requested while retrieving conversation") | ||
|
|
||
| return await self._request_conversation_from_dataset_manager( | ||
| conversation_id, credit_context | ||
| ) | ||
| await self._request_dataset_client_from_dataset_manager() | ||
| return await self._dataset_client.get_conversation(conversation_id) | ||
|
|
||
| async def _request_conversation_from_dataset_manager( | ||
| self, conversation_id: str, credit_context: CreditContext | ||
| ) -> Conversation: | ||
| """Fallback: Request from DatasetManager via ZMQ""" | ||
| conversation_response: ( | ||
| ConversationResponseMessage | ErrorMessage | ||
| ) = await self.conversation_request_client.request( | ||
| ConversationRequestMessage( | ||
| service_id=self.service_id, | ||
| conversation_id=conversation_id, | ||
| credit_phase=credit_context.credit.phase, | ||
| ) | ||
| async def _request_dataset_client_from_dataset_manager(self) -> None: | ||
| """Fallback: Request dataset client metadata from DatasetManager and initialize client.""" | ||
| self.info( | ||
| "Dataset client not available, requesting metadata from DatasetManager" | ||
| ) | ||
| if self.is_trace_enabled: | ||
| self.trace(f"Received response message: {conversation_response}") | ||
|
|
||
| # Check for error in conversation response | ||
| if isinstance(conversation_response, ErrorMessage): | ||
| error = conversation_response.error | ||
| await self._send_inference_result_message( | ||
| RequestRecord( | ||
| request_info=RequestInfo( | ||
| model_endpoint=self.model_endpoint, | ||
| conversation_id=conversation_id, | ||
| turn_index=0, | ||
| turns=[], | ||
| credit_num=credit_context.credit.id, | ||
| credit_phase=credit_context.credit.phase, | ||
| x_request_id=str(uuid.uuid4()), | ||
| x_correlation_id=credit_context.credit.x_correlation_id, | ||
| drop_perf_ns=credit_context.drop_perf_ns, | ||
| ), | ||
| model_name=self.model_endpoint.primary_model_name, | ||
| timestamp_ns=time.time_ns(), | ||
| start_perf_ns=time.perf_counter_ns(), | ||
| end_perf_ns=time.perf_counter_ns(), | ||
| error=error, | ||
| ) | ||
| response: ( | ||
| DatasetClientResponseMessage | ErrorMessage | ||
| ) = await self._dataset_client_request_client.request( | ||
| DatasetClientRequestMessage(service_id=self.service_id) | ||
| ) | ||
|
|
||
| if isinstance(response, ErrorMessage): | ||
| raise ValueError( | ||
| f"Failed to retrieve dataset client metadata: {response.error}" | ||
| ) | ||
| raise ValueError(f"Failed to retrieve conversation response: {error}") | ||
|
|
||
| return conversation_response.conversation | ||
| await self._initialize_dataset_client(response.client_metadata) | ||
| self.info( | ||
| lambda: ( | ||
| f"Dataset client initialized via fallback request: " | ||
| f"type={response.client_metadata.client_type}" | ||
| ) | ||
| ) | ||
|
|
||
| async def _process_response(self, record: RequestRecord) -> Turn | None: | ||
| """Extract assistant response from RequestRecord and convert to Turn for session. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Don't initialize dataset clients from DatasetManager metadata in Kubernetes.
Lines 223-225 and lines 588-604 treat DatasetManager
`client_metadata` as worker-local in every run mode. But `src/aiperf/dataset/dataset_manager.py` lines 356-364 explicitly say that, in Kubernetes, those paths are control-plane paths that workers should ignore until WorkerPodManager provides local download paths. As written, a pod can try to mmap files that do not exist locally. Guard both paths on run type and keep waiting for the downloaded-path source in Kubernetes. Also applies to: lines 588-605.
🤖 Prompt for AI Agents