Patch for MVAPICH-1.0beta
Pete Wyckoff
pw at osc.edu
Mon Feb 4 14:52:43 EST 2008
frank.mietke at informatik.tu-chemnitz.de wrote on Mon, 04 Feb 2008 13:44 +0100:
> due to a change of the protocol version in MVAPICH-1.0beta from 5 to 6. I've
> made the following changes to ib.c
Looks like great stuff. Thanks for figuring all this out. I
applied your patch by hand as the diff wasn't against SVN so I
had to guess. Things like is_homogeneous did not show up as new
in your patch, so I could have missed something else too.
Can you look this over and test? The only substantive change I made
from your version (barring missed pieces) was to change the way you
managed the poll set to keep the sockets open for the second phase.
I made it more apparent that this was a hack, and put it up in the
phase transition code area, and handled poll() users too. Still not
so pretty, but I'm impressed you were able to accommodate this new
style in the existing code!
Also I ignored a possibly-important hunk that put the
accept(mport_fd) code under if (version == 6 && phase == 1) because
I didn't understand it. Maybe you could explain and/or find another
way to conditionalize that section if needed? I figured we could
just leave it open and that code to accept() would never trigger.
You might want to subscribe to the mailing list too.
-- Pete
Index: ib.c
===================================================================
--- ib.c (revision 409)
+++ ib.c (working copy)
@@ -30,7 +30,10 @@ static char *address = 0;
static int address_size = 0;
static char *pids = 0;
static int pids_size = 0;
-static int phase = 0; /* for two-phase version 5 */
+static int phase = 0; /* for two-phase versions 5 and 6 */
+static int *hca_type = NULL;
+static int hca_first_rank = -1; /* to determine if homogeneous or not */
+static int is_homogeneous = 1; /* until proven otherwise */
/* state of all the sockets */
static int num_waiting_to_accept; /* first accept all numtasks */
@@ -169,6 +172,24 @@ prepare_ib_startup_port(int *fd)
* pids[] # <pidlen> bytes
* Write back personalized out_addrs[] and full pids[].
*
+ * Version 6:
+ * It's 2 phase based like version 5 with a major change that the socket
+ * to each task stays open across phases, and some data tweaks.
+ * First phase distributes hostids and hca_types:
+ * version # 6
+ * rank # 0..np-1
+ * hostidlen # 4 bytes
+ * hostid # <hostidlen> bytes
+ * hca_type # 4 bytes
+ * Write back whether hca_types are homogeneous or not and send the
+ * entire hostid[] array. Keep the fds open, go to phase 2 and gather:
+ * addrlen # 4 bytes, could be 0
+ * addrs[] # <addrlen> bytes
+ * pidlen # 4 bytes
+ * pids[] # <pidlen> bytes
+ * Write back is_homogeneous (4 bytes), personalized out_addrs[] and
+ * full pids[].
+ *
* Return negative on error, or new rank number for success.
*/
static int read_ib_one(int fd)
@@ -178,15 +199,28 @@ static int read_ib_one(int fd)
int j, ret = -1;
pid_t pidlen;
- if (version == 5 && phase == 1) {
+ if ((version == 5 || version == 6) && phase == 1) {
/* no version again on second phase */
testvers = version;
} else {
if (read_full_ret(fd, &testvers, sizeof(int)) != sizeof(int))
goto out;
}
- if (read_full_ret(fd, &rank, sizeof(int)) != sizeof(int))
- goto out;
+
+ if (version == 6 && phase == 1) {
+ /* since socket stays open, tasks do not send rank, so figure it out
+ * from the fd the slow way. This can not fail.
+ */
+ for (j = 0; j < numtasks; j++)
+ if (fds[j] == fd) {
+ rank = j;
+ break;
+ }
+ } else{
+ if (read_full_ret(fd, &rank, sizeof(int)) != sizeof(int))
+ goto out;
+ }
+
if (read_full_ret(fd, &addrlen, sizeof(int)) != sizeof(int))
goto out;
@@ -218,11 +252,12 @@ static int read_ib_one(int fd)
if (version == -1) {
version = testvers;
- if (!(version == 1 || version == 2 || version == 3 || version == 5)) {
+ if (!(version == 1 || version == 2 || version == 3 || version == 5 ||
+ version == 6)) {
warning(
"%s: protocol version %d not known, but might still work",
__func__, version);
- version = 5; /* guess the latest still works */
+ version = 6; /* guess the latest still works */
}
debug(1, "%s: version %d startup%s", __func__, version,
non_versioned_092 ? " (unversioned)" : "");
@@ -249,6 +284,10 @@ static int read_ib_one(int fd)
__func__, rank, addrlen, address_size);
}
+ /* New to protocol version 6 of MVAPICH-1.0 */
+ if (version == 6 && hca_type == NULL)
+ hca_type = malloc(numtasks * sizeof(int));
+
if (non_versioned_092) {
/* push back the bit we accidentally read in guessing the version */
for (j=0; j<4; j++)
@@ -262,7 +301,16 @@ static int read_ib_one(int fd)
goto out;
}
- if (version == 3 || (version == 5 && phase == 1)) {
+ if (version == 6 && phase == 0) {
+ if (read_full_ret(fd, &hca_type[rank], sizeof(int)) != sizeof(int))
+ goto out;
+ if (hca_first_rank == -1)
+ hca_first_rank = rank;
+ if (hca_type[hca_first_rank] != hca_type[rank])
+ is_homogeneous = 0;
+ }
+
+ if (version == 3 || ((version == 5 || version == 6) && phase == 1)) {
read_full(fd, &pidlen, sizeof(pidlen));
if (!pids) {
pids_size = pidlen;
@@ -366,23 +414,45 @@ next_phase:
}
free(pids);
}
- } else if (version == 5) {
+ } else if (version == 5 || version == 6) {
if (phase == 0) {
/* These are actually the hostids, in mvapich parlance. Next
* phase will be the personalized addresses. */
for (i=0; i<numtasks; i++) {
+ if (version == 6)
+ if (write_full(fds[i], &is_homogeneous, sizeof(int)) < 0)
+ error_errno("%s: write homogeneous flag to rank %d",
+ __func__, i);
if (write_full(fds[i], address, numtasks * address_size) < 0)
error_errno("%s: write addresses to rank %d", __func__, i);
}
phase = 1;
- for (i=0; i<numtasks; i++) {
- close(fds[i]);
- fds[i] = -1;
+ if (version == 5) {
+ for (i=0; i<numtasks; i++) {
+ close(fds[i]);
+ fds[i] = -1;
+ }
+ num_waiting_to_accept = numtasks; /* will reconnect */
+ } else {
+ /* Hackity hack. Put these back on the ready-to-read list. */
+ for (i=0; i<numtasks; i++) {
+#ifdef HAVE_POLL
+ pfs[num_waiting_to_read].fd = fds[i];
+ pfs[num_waiting_to_read].events = POLLIN;
+ pfs[num_waiting_to_read].revents = 0;
+#else
+ FD_SET(fds[i], &rfs);
+#endif
+ }
+ num_waiting_to_read = numtasks; /* socks stay open */
}
address_size = 0;
free(address);
address = NULL;
- num_waiting_to_accept = numtasks;
+ if (version == 6) {
+ free(hca_type);
+ hca_type = NULL;
+ }
goto next_phase;
} else if (phase == 1) {
/*
More information about the mpiexec
mailing list