11/*
2- * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved.
2+ * Copyright (c) 2014-2016 Cisco Systems, Inc. All rights reserved.
33 * Copyright (c) 2015 Research Organization for Information Science
44 * and Technology (RIST). All rights reserved.
55 * $COPYRIGHT$
@@ -97,6 +97,10 @@ typedef enum {
9797 AGENT_MSG_TYPE_ACK
9898} agent_udp_message_type_t ;
9999
100+ // Arbitrary 64-bit magic numbers: MAGIC_ORIGINATOR tags a PING, MAGIC_TARGET tags an ACK
101+ #define MAGIC_ORIGINATOR 0x9a9e2fbce63a11e5
102+ #define MAGIC_TARGET 0x60735c68f368aace
103+
100104/*
101105 * Ping and ACK messages
102106 */
@@ -110,6 +114,11 @@ typedef struct {
110114 uint32_t src_ipv4_addr ;
111115 uint32_t src_udp_port ;
112116
117+ /* A magic number that helps determine that the sender was Open
118+ MPI, followed by the PING originator's Open MPI version */
119+ uint64_t magic_number ;
120+ uint32_t major_version , minor_version ;
121+
113122 /* If this is a PING, the message should be this size.
114123 If this is an ACK, we are ACKing a ping of this size. */
115124 uint32_t size ;
@@ -327,48 +336,6 @@ static void agent_sendto(int fd, char *buffer, ssize_t numbytes,
327336 * All of the following functions run in agent thread
328337 **************************************************************************/
329338
330- /*
331- * Check to ensure that we expected to receive a ping from this sender
332- * on the interface in which it was received (i.e., did the usnic
333- * module corresponding to the received interface choose to pair
334- * itself with the sender's interface). If not, discard it.
335- *
336- * Note that there may be a race condition here. We may get a ping
337- * before we've setup endpoints on the module in question. It's no
338- * problem -- if we don't find it, we'll drop the PING and let the
339- * sender try again later.
340- */
341- static bool agent_thread_is_ping_expected (opal_btl_usnic_module_t * module ,
342- uint32_t src_ipv4_addr )
343- {
344- bool found = false;
345- opal_list_item_t * item ;
346-
347- /* If we have a NULL value for the module, it means that the MPI
348- process that is the agent hasn't submitted the LISTEN command
349- yet (which can happen for a fast sender / slow receiver). So
350- just return "ping is not [yet] expected". */
351- if (NULL == module ) {
352- return false;
353- }
354-
355- opal_mutex_lock (& module -> all_endpoints_lock );
356- if (module -> all_endpoints_constructed ) {
357- OPAL_LIST_FOREACH (item , & module -> all_endpoints , opal_list_item_t ) {
358- opal_btl_usnic_endpoint_t * ep ;
359- ep = container_of (item , opal_btl_usnic_endpoint_t ,
360- endpoint_endpoint_li );
361- if (src_ipv4_addr == ep -> endpoint_remote_modex .ipv4_addr ) {
362- found = true;
363- break ;
364- }
365- }
366- }
367- opal_mutex_unlock (& module -> all_endpoints_lock );
368-
369- return found ;
370- }
371-
372339/*
373340 * Handle an incoming PING message (send an ACK)
374341 */
@@ -411,29 +378,36 @@ static void agent_thread_handle_ping(agent_udp_port_listener_t *listener,
411378 return ;
412379 }
413380
414- /* Finally, check that the ping is from an interface that the
415- module expects */
416- if (!agent_thread_is_ping_expected (listener -> module ,
417- src_addr_in -> sin_addr .s_addr )) {
381+ if (msg -> magic_number != MAGIC_ORIGINATOR ) {
418382 opal_output_verbose (20 , USNIC_OUT ,
419- "usNIC connectivity got bad ping (from unexpected address: listener %s not paired with peer interface %s, discarded)" ,
420- listener -> ipv4_addr_str ,
421- real_ipv4_addr_str );
383+ "usNIC connectivity got bad ping (magic number: %" PRIu64 ", discarded)" ,
384+ msg -> magic_number );
385+ return ;
386+ }
387+ if (msg -> major_version != OPAL_MAJOR_VERSION ||
388+ msg -> minor_version != OPAL_MINOR_VERSION ) {
389+ opal_output_verbose (20 , USNIC_OUT ,
390+ "usNIC connectivity got bad ping (originator version: %d.%d, expected %d.%d, discarded)" ,
391+ msg -> major_version , msg -> minor_version ,
392+ OPAL_MAJOR_VERSION , OPAL_MINOR_VERSION );
422393 return ;
423394 }
424395
425- /* Ok, this is a good ping. Send the ACK back */
396+ /* Ok, this is a good ping. Send the ACK back. The PING sender
397+ will verify that the ACK came back from the IP address that it
398+ expected. */
426399
427400 opal_output_verbose (20 , USNIC_OUT ,
428401 "usNIC connectivity got PING (size=%ld) from %s; sending ACK" ,
429402 numbytes , msg_ipv4_addr_str );
430403
431404 /* Send back an ACK. No need to allocate a new buffer; just
432405 re-use the same buffer we just got. Note that msg->size is
433- already set. */
406+ already set. We simply echo back the sender's IP address/port
407+ in the msg (the sender will use the msg fields and the
408+ recvfrom() src_addr to check for a match). */
434409 msg -> message_type = AGENT_MSG_TYPE_ACK ;
435- msg -> src_ipv4_addr = listener -> ipv4_addr ;
436- msg -> src_udp_port = listener -> udp_port ;
410+ msg -> magic_number = MAGIC_TARGET ;
437411
438412 agent_sendto (listener -> fd , (char * ) listener -> buffer , sizeof (* msg ), from );
439413}
@@ -457,12 +431,22 @@ static void agent_thread_handle_ack(agent_udp_port_listener_t *listener,
457431 (int ) numbytes , str , (int ) sizeof (* msg ));
458432 return ;
459433 }
434+ if (msg -> magic_number != MAGIC_TARGET ) {
435+ opal_output_verbose (20 , USNIC_OUT ,
436+ "usNIC connectivity got bad ACK (magic number: %" PRIu64 ", discarded)" ,
437+ msg -> magic_number );
438+ return ;
439+ }
460440
461- /* Find the pending ping request that this ACK is for */
441+ /* Find the pending ping request (on this interface) for this ACK.
442+ If we don't find a match, we'll drop it. */
462443 agent_ping_t * ap ;
444+ uint32_t src_in_port = ntohs (src_addr_in -> sin_port );
463445 OPAL_LIST_FOREACH (ap , & pings_pending , agent_ping_t ) {
464- if (ap -> dest_ipv4_addr == msg -> src_ipv4_addr &&
465- ap -> dest_udp_port == msg -> src_udp_port ) {
446+ if (ap -> dest_ipv4_addr == src_addr_in -> sin_addr .s_addr &&
447+ ap -> dest_udp_port == src_in_port &&
448+ ap -> src_ipv4_addr == msg -> src_ipv4_addr &&
449+ ap -> src_udp_port == msg -> src_udp_port ) {
466450 /* Found it -- indicate that it has been acked */
467451 for (int i = 0 ; i < NUM_PING_SIZES ; ++ i ) {
468452 if (ap -> sizes [i ] == msg -> size ) {
@@ -913,6 +897,9 @@ static void agent_thread_cmd_ping(agent_ipc_listener_t *ipc_listener)
913897 msg -> message_type = AGENT_MSG_TYPE_PING ;
914898 msg -> src_ipv4_addr = ap -> src_ipv4_addr ;
915899 msg -> src_udp_port = ap -> src_udp_port ;
900+ msg -> magic_number = MAGIC_ORIGINATOR ;
901+ msg -> major_version = OPAL_MAJOR_VERSION ;
902+ msg -> minor_version = OPAL_MINOR_VERSION ;
916903 msg -> size = ap -> sizes [i ];
917904 }
918905
0 commit comments