Capturing return values from concurrent mpiexecs
Pete Wyckoff
pw at osc.edu
Tue Nov 8 15:06:31 EST 2005
martin.schaffoener at e-technik.uni-magdeburg.de wrote on Tue, 08 Nov 2005 21:50 +0200:
> Don't know if it worked before; I only started using mpiexec now (0.80) to
> replace all rsh-related stuff on our nodes.
Here's a little patch against CVS that seems to work here, at least for this test:
./mpiexec -server &
./mpiexec --comm=none -n 1 /bin/false
Let me know if it seems to work for you and I'll check it in. I
think the modified file hasn't changed much since 0.80,
so the patch should apply cleanly to that distribution.
It looks like propagating the exit statuses just never got coded
up in the first place, although it turned out to be pretty easy to add.
> > I'd imagine that "mpiexec -server" would always return 0 unless it
> > died in some unnatural way---it would never return the exit status
> > of tasks it started for any of its clients.
>
> If I send SIGTERM to "mpiexec -server", it returns 1.
Hrm, now that I think of it, is there any way that a pure server
can exit cleanly? Shouldn't it always return 1? Let me know if
you have any ideas.
> > The combined return value of a parallel task is always just the exit
> > status of task #0, although there will be warning lines on stderr to
> > report the non-zero exit statuses of other tasks. I couldn't come
> > up with any better way of reporting the array of exit statuses. Any
> > suggestions?
>
> Well, I guess only returning the exit status of task #0 and warning about
> others is fine, I think. Only if I start, say, 8 concurrent "mpiexec -n x"
> while "mpiexec -server" is running I would like to get 8 return values.
Agreed. Hopefully that is okay now.
-- Pete
-------------- next part --------------
Index: concurrent.c
===================================================================
RCS file: /cvs/mpiexec/concurrent.c,v
retrieving revision 1.9
diff -u -p -r1.9 concurrent.c
--- concurrent.c 8 Sep 2005 12:49:20 -0000 1.9
+++ concurrent.c 8 Nov 2005 20:01:08 -0000
@@ -65,8 +65,9 @@ typedef enum {
CLIENT_SPAWN,
CLIENT_KILL,
MASTER_RESPONSE, /* to one of the above client requests */
- MASTER_EVT, /* generated by master on its own */
- MASTER_EVT_START,
+ MASTER_EVT, /* generated by master on its own */
+ MASTER_EVT_START, /* specializations: start and obit have extra data */
+ MASTER_EVT_OBIT,
} cli_command_t;
/* make this configurable later */
@@ -652,7 +653,7 @@ handle_client_kill(int n)
typedef struct {
int cmd;
int evt;
- int obit_evt;
+ int extra;
} pushed_events_t;
static pushed_events_t *pushed_events = 0;
static int num_pushed_events = 0;
@@ -664,7 +665,7 @@ static int max_pushed_events = 0;
* but pop and memcpy from the front.
*/
static void
-push_event(int cmd, int evt, int obit_evt)
+push_event(int cmd, int evt, int extra)
{
if (num_pushed_events == max_pushed_events) {
void *x = pushed_events;
@@ -678,7 +679,7 @@ push_event(int cmd, int evt, int obit_ev
}
pushed_events[num_pushed_events].cmd = cmd;
pushed_events[num_pushed_events].evt = evt;
- pushed_events[num_pushed_events].obit_evt = obit_evt;
+ pushed_events[num_pushed_events].extra = extra;
++num_pushed_events;
}
@@ -706,7 +707,8 @@ concurrent_poll(int block)
fd_set rfs;
int n;
cli_command_t cmd;
- tm_event_t evt, obit_evt;
+ tm_event_t evt;
+ int extra;
pushed_events_t pe;
evts_t *ep;
@@ -718,7 +720,7 @@ concurrent_poll(int block)
if (pe.evt != -1) {
cmd = pe.cmd;
evt = pe.evt;
- obit_evt = pe.obit_evt;
+ extra = pe.extra;
goto found;
}
@@ -746,8 +748,9 @@ concurrent_poll(int block)
read_full(fifo, &evt, sizeof(evt));
break;
case MASTER_EVT_START:
+ case MASTER_EVT_OBIT:
read_full(fifo, &evt, sizeof(evt));
- read_full(fifo, &obit_evt, sizeof(obit_evt));
+ read_full(fifo, &extra, sizeof(extra));
break;
default:
error("%s: unknown command %d", __func__, cmd);
@@ -757,7 +760,9 @@ concurrent_poll(int block)
if (!ep)
error("%s: no event structure for %d", __func__, evt);
if (cmd == MASTER_EVT_START)
- ep->obit_evt = obit_evt;
+ ep->obit_evt = extra; /* autogenerated new obit event */
+ else if (cmd == MASTER_EVT_OBIT)
+ tasks[ep->task].status = extra; /* exit status of task */
}
out:
return ep;
@@ -771,7 +776,8 @@ static void
read_master_response(void)
{
cli_command_t cmd;
- tm_event_t evt, obit_evt;
+ tm_event_t evt;
+ int extra;
for (;;) {
read_full(fifo, &cmd, sizeof(cmd));
@@ -780,12 +786,13 @@ read_master_response(void)
return;
case MASTER_EVT:
read_full(fifo, &evt, sizeof(evt));
- push_event(cmd, evt, obit_evt);
+ push_event(cmd, evt, 0);
break;
case MASTER_EVT_START:
+ case MASTER_EVT_OBIT:
read_full(fifo, &evt, sizeof(evt));
- read_full(fifo, &obit_evt, sizeof(obit_evt));
- push_event(cmd, evt, obit_evt);
+ read_full(fifo, &extra, sizeof(extra));
+ push_event(cmd, evt, extra);
break;
default:
error("%s: unknown event %d", __func__, cmd);
@@ -908,6 +915,7 @@ void cm_forward_event(evts_t *ep)
cli_command_t cmd;
tids_t *tp;
tm_event_t obit_evt = TM_NULL_EVENT;
+ int exit_status = 0;
debug(2, "%s: event %d for client %d task %d type %s", __func__, ep->evt,
ep->client, ep->task, evt_type_string(ep->type));
@@ -950,6 +958,7 @@ void cm_forward_event(evts_t *ep)
if (!tp)
error("%s: lost tid for client %d task %d at obit", __func__,
ep->client, ep->task);
+ exit_status = tp->status;
tid_del(tp);
}
@@ -963,6 +972,13 @@ void cm_forward_event(evts_t *ep)
ret = write_client(n, &ep->evt, sizeof(ep->evt));
if (ret == 0)
write_client(n, &obit_evt, sizeof(obit_evt));
+ } else if (ep->type == EVT_OBIT) {
+ cmd = MASTER_EVT_OBIT;
+ ret = write_client(n, &cmd, sizeof(cmd));
+ if (ret == 0)
+ ret = write_client(n, &ep->evt, sizeof(ep->evt));
+ if (ret == 0)
+ write_client(n, &exit_status, sizeof(exit_status));
} else {
cmd = MASTER_EVT;
ret = write_client(n, &cmd, sizeof(cmd));
More information about the mpiexec mailing list