@@ -113,13 +113,30 @@ int qmanager_cb_t::post_sched_loop (flux_t *h, schedutil_t *schedutil,
113113 return rc;
114114}
115115
116+ /* The RFC 27 hello handshake occurs during scheduler initialization. Its
117+ * purpose is to inform the scheduler of jobs that already have resources
118+ * allocated. This callback is made once per job. The callback should return
119+ * 0 on success or -1 on failure. On failure, the job manager raises a fatal
120+ * exception on the job.
121+ *
122+ * Jobs that already have resources at hello need to be assigned to the correct
123+ * qmanager queue, but the queue is not provided in the hello metadata.
124+ * Therefore, jobspec is fetched from the KVS so that attributes.system.queue
125+ * can be extracted from it.
126+ *
127+ * Note that fluxion instantiates the "default" queue when no named queues
128+ * are configured. Therefore, when the queue attribute is not defined, we
129+ * put the job in the default queue.
130+ *
131+ * Fail the job if its queue attribute (or lack thereof) no longer matches a
132+ * valid queue. This can occur if queues have been reconfigured since job
133+ * submission.
134+ */
116135int qmanager_cb_t::jobmanager_hello_cb (flux_t *h, const flux_msg_t *msg,
117136 const char *R, void *arg)
118137
119138{
120- int rc = 0 ;
121- json_t *o = NULL ;
122- json_error_t err;
/* rc defaults to failure and is set to 0 only after reconstruct() succeeds,
 * so every "goto out" below returns -1.
 */
139+ int rc = -1 ;
123140 std::string R_out;
/* NOTE(review): qn_attr is filled by a Jansson "s" unpack further down.
 * Jansson documents that value as a read-only string owned by the JSON
 * object, so `const char *` would describe it more accurately -- confirm.
 */
124141 char *qn_attr = NULL ;
125142 std::string queue_name;
@@ -130,53 +147,68 @@ int qmanager_cb_t::jobmanager_hello_cb (flux_t *h, const flux_msg_t *msg,
130147 unsigned int prio;
131148 uint32_t uid;
132149 double ts;
/* jobspec is obtained through the "o" unpack format (from msg, or from the
 * lookup future f). Per the Jansson documentation "o" stores the object
 * without taking a new reference, which matches the absence of any
 * json_decref() on jobspec in this function. It therefore remains usable
 * only while its owner is alive; f is kept until the out label.
 */
150+ json_t *jobspec = NULL ;
151+ flux_future_t *f = NULL ;
133152
153+ /* Don't expect jobspec to be set here as it is not currently defined
154+ * in RFC 27. However, add it anyway in case the hello protocol
155+ * evolves to include it. If it is not set, it must be looked up.
156+ */
134157 if (flux_msg_unpack (msg,
135- " {s:I s:i s:i s:f}" ,
158+ " {s:I s:i s:i s:f s?o }" ,
136159 " id" , &id,
137160 " priority" , &prio,
138161 " userid" , &uid,
139- " t_submit" , &ts) < 0 ) {
162+ " t_submit" , &ts,
163+ " jobspec" , &jobspec) < 0 ) {
140164 flux_log_error (h, " %s: flux_msg_unpack" , __FUNCTION__);
141165 goto out;
142166 }
143-
144- if ( (o = json_loads (R, 0 , &err)) == NULL ) {
145- rc = -1 ;
146- errno = EPROTO;
147- flux_log (h, LOG_ERR, " %s: parsing R for job (id=%jd): %s %s@%d:%d" ,
148- __FUNCTION__, static_cast <intmax_t > (id),
149- err.text , err.source , err.line , err.column );
/* Fallback path: the hello message carried no jobspec, so build the job's
 * "jobspec" KVS key from its id and look it up.
 * NOTE(review): flux_kvs_lookup_get_unpack() is called directly on the
 * freshly created future, which appears to make this a synchronous KVS
 * round trip for each job announced at hello -- confirm that is acceptable.
 */
167+ if (!jobspec) {
/* key is zero-initialized, so if flux_job_kvs_key() itself is the call that
 * fails, the log line below prints an empty key.
 */
168+ char key[64 ] = { 0 };
169+ if (flux_job_kvs_key (key, sizeof (key), id, " jobspec" ) < 0
170+ || !(f = flux_kvs_lookup (h, NULL , 0 , key))
171+ || flux_kvs_lookup_get_unpack (f, " o" , &jobspec) < 0 ) {
172+ flux_log_error (h, " %s" , key);
173+ goto out;
174+ }
175+ }
/* Every level of the pattern is optional ("s?"), so a jobspec that lacks
 * attributes, attributes.system, or attributes.system.queue still unpacks
 * successfully and simply leaves qn_attr NULL.
 */
176+ if (json_unpack (jobspec,
177+ " {s?{s?{s?s}}}" ,
178+ " attributes" ,
179+ " system" ,
180+ " queue" , &qn_attr) < 0 ) {
/* NOTE(review): flux_log_error() reports the current errno alongside the
 * message, but Jansson's json_unpack() is not documented to set errno, so
 * the error text logged here may be stale. The removed code used
 * flux_log (LOG_ERR) and set errno = EPROTO on this kind of failure;
 * this path now leaves errno untouched -- confirm whether that is intended.
 */
181+ flux_log_error (h, " error parsing jobspec" );
150182 goto out;
151183 }
152- if ( (rc = json_unpack (o, " { s?:{s?:{s?:{s?:s}}} }" ,
153- " attributes" ,
154- " system" ,
155- " scheduler" ,
156- " queue" , &qn_attr)) < 0 ) {
157- json_decref (o);
158- errno = EPROTO;
159- flux_log (h, LOG_ERR, " %s: json_unpack for attributes" , __FUNCTION__);
/* Copy the queue name into the std::string queue_name; when the jobspec
 * names no queue, fall back to the configured default queue name.
 */
184+ if (qn_attr)
185+ queue_name = qn_attr;
186+ else
187+ queue_name = ctx->opts .get_opt ().get_default_queue_name ();
/* Reject jobs whose queue is not currently configured before calling
 * queues.at() below (presumably a std::map-like container whose at() would
 * throw on a missing key -- confirm).
 * NOTE(review): ctx is declared outside the visible lines. This failure
 * path returns -1 without setting errno -- confirm the caller does not
 * depend on errno here.
 */
188+ if (ctx->queues .find (queue_name) == ctx->queues .end ()) {
189+ flux_log (h,
190+ LOG_ERR,
191+ " %s: unknown queue name (id=%jd queue=%s)" ,
192+ __FUNCTION__,
193+ static_cast <intmax_t > (id),
194+ queue_name.c_str ());
160195 goto out;
161196 }
162-
163- queue_name = qn_attr? qn_attr : ctx->opts .get_opt ().get_default_queue_name ();
164- json_decref (o);
165197 queue = ctx->queues .at (queue_name);
/* Build a job object already in the RUNNING state, carrying the R passed to
 * this callback, and hand it to the selected queue's reconstruct().
 */
166198 running_job = std::make_shared<job_t > (job_state_kind_t ::RUNNING,
167199 id, uid, calc_priority (prio),
168200 ts, R);
169201
170- if ( (rc = queue->reconstruct (static_cast <void *> (h),
171- running_job, R_out)) < 0 ) {
202+ if (queue->reconstruct (static_cast <void *> (h), running_job, R_out) < 0 ) {
172203 flux_log_error (h, " %s: reconstruct (id=%jd queue=%s)" , __FUNCTION__,
173204 static_cast <intmax_t > (id), queue_name.c_str ());
174205 goto out;
175206 }
176207 flux_log (h, LOG_DEBUG, " requeue success (queue=%s id=%jd)" ,
177208 queue_name.c_str (), static_cast <intmax_t > (id));
178-
/* Only the fully successful path reaches this assignment. */
209+ rc = 0 ;
179210out:
/* f is still NULL on every path that never issued a KVS lookup; this relies
 * on flux_future_destroy() accepting NULL -- confirm.
 */
211+ flux_future_destroy (f);
180212 return rc;
181213}
182214
0 commit comments