Patch for MVAPICH-1.0beta

Frank Mietke frank.mietke at informatik.tu-chemnitz.de
Mon Feb 4 07:44:17 EST 2008


Hi there,

Due to a change of the protocol version in MVAPICH-1.0beta from 5 to 6, I've
made the following changes to ib.c.

Hope this helps.

Best Regards,
Frank



-- 
Dipl.-Inf. Frank Mietke     |     Fakultätsrechen- und Informationszentrum
Tel.: 0371 - 531 - 35538    |     Fak. für Informatik
Fax:  0371 - 531 8 35538    |     TU-Chemnitz
Key-ID: 60F59599            |     frank.mietke at informatik.tu-chemnitz.de
-------------- next part --------------
--- ib_old.c	2008-01-28 13:47:52.000000000 +0100
+++ ib_new.c	2008-02-01 14:00:51.000000000 +0100
@@ -28,10 +28,13 @@
 static int version = -1;
 static int is_homogeneous = 1;
 static char *address = 0;
+static int *hca_type_arr = 0;
+static int first_rank = -1;
+static const int hca_type_len = sizeof(int);
 static int address_size = 0;
 static char *pids = 0;
 static int pids_size = 0;
-static int phase = 0;  /* for two-phase version 5 */
+static int phase = 0;  /* for two-phase version 5 and 6*/
 
 /* state of all the sockets */
 static int num_waiting_to_accept;  /* first accept all numtasks */
@@ -170,6 +173,23 @@
  *     pids[]    # <pidlen> bytes
  *   Write back personalized out_addrs[] and full pids[].
  *
+ * Version 6:
+ *   It is a two-phase protocol like version 5, but with some changes
+ *   and additions relative to that version.
+ *  First phase distributes hostids and hca_types:
+ *    version    # 6
+ *    rank       # 0..np-1
+ *    hostidlen  # 4 bytes
+ *    hostid     # <hostidlen> bytes
+ *    hca_type   # 4 bytes
+ *  Write back whether hca_types are homogeneous or not and send the
+ *  entire hostid[] array. Keep the fds open, go to phase 2 and gather:
+ *    addrlen    # 4 bytes, could be 0
+ *    addrs[]    # <addrlen> bytes
+ *    pidlen     # 4 bytes
+ *    pids[]     # <pidlen> bytes
+ *  Write back personalized out_addrs[] and full pids[].
+ *
  * Return negative on error, or new rank number for success.
  */
 static int read_ib_one(int fd)
@@ -179,17 +199,25 @@
     int j, ret = -1;
     pid_t pidlen;
 
-    if (version == 5 && phase == 1) {
+    if ((version == 5 || version == 6) && phase == 1) {
 	/* no version again on second phase */
 	testvers = version;
     } else {
-    if (read_full_ret(fd, &testvers, sizeof(int)) != sizeof(int))
-      goto out;
+	if (read_full_ret(fd, &testvers, sizeof(int)) != sizeof(int))
+	    goto out;
+    }
+   if (version == 6 && phase == 1){
+		 int i;
+       for (i = 0; i < numtasks; i++)
+          if (fds[i] == fd)
+             rank = i;
+    }
+   else{
+     if (read_full_ret(fd, &rank, sizeof(int)) != sizeof(int))
+       goto out;
     }
-      if (read_full_ret(fd, &rank, sizeof(int)) != sizeof(int))
-        goto out;
     if (read_full_ret(fd, &addrlen, sizeof(int)) != sizeof(int))
-	   goto out;
+	goto out;
 
     non_versioned_092 = 0;
     if (rank == 32 + numtasks * 8) {
@@ -219,11 +247,11 @@
 
     if (version == -1) {
 	version = testvers;
-	if (!(version == 1 || version == 2 || version == 3 || version == 5)) {
+	if (!(version == 1 || version == 2 || version == 3 || version == 5 || version == 6)) {
 	    warning(
 	      "%s: protocol version %d not known, but might still work",
 	      __func__, version);
-	    version = 5;  /* guess the latest still works */
+	    version = 6;  /* guess the latest still works */
 	}
 	debug(1, "%s: version %d startup%s", __func__, version,
 	  non_versioned_092 ? " (unversioned)" : "");
@@ -249,6 +277,9 @@
 	    error("%s: wrong address size from rank %d, got %d, expected %d",
 	          __func__, rank, addrlen, address_size);
     }
+	/* New to protocol version 6 of MVAPICH-1.0 */
+	if (!hca_type_arr)
+		hca_type_arr = (int *) malloc(hca_type_len * numtasks);
 
     if (non_versioned_092) {
 	/* push back the bit we accidentally read in guessing the version */
@@ -262,8 +293,16 @@
 	             != address_size)
 	    goto out;
     }
+	 if (version == 6 && phase == 0){
+		if (read_full_ret(fd, &hca_type_arr[rank], hca_type_len) != hca_type_len)
+			goto out;
+		if (first_rank == -1)
+			first_rank = rank;
+		if (hca_type_arr[first_rank] != hca_type_arr[rank])
+			is_homogeneous = 0;
+	 }
 
-    if (version == 3 || (version == 5 && phase == 1)) {
+    if (version == 3 || ((version == 5 || version == 6) && phase == 1)) {
 	read_full(fd, &pidlen, sizeof(pidlen));
 	if (!pids) {
 	    pids_size = pidlen;
@@ -367,23 +406,35 @@
 	    }
 	    free(pids);
 	}
-    } else if (version == 5) {
+    } else if (version == 5 || version == 6) {
 	if (phase == 0) {
+ 		 /* Protocol version 6 in MVAPICH-1.0 adds checking of HCA types whether 
+		  * they are homogeneous. */
 	    /* These are actually the hostids, in mvapich parlance.  Next
 	     * phase will be the personalized addresses. */
 	    for (i=0; i<numtasks; i++) {
+		if (version == 6)
+			if (write_full(fds[i], &is_homogeneous, sizeof(int)) < 0)
+			 error_errno("%s: write homogeneous flag to rank %d", __func__, i);
 		if (write_full(fds[i], address, numtasks * address_size) < 0)
 		    error_errno("%s: write addresses to rank %d", __func__, i);
 	    }
 	    phase = 1;
-	    for (i=0; i<numtasks; i++) {
-		close(fds[i]);
-		fds[i] = -1;
-	    }
+       if (version == 5){
+   	    for (i=0; i<numtasks; i++) {
+	      	close(fds[i]);
+      		fds[i] = -1;
+	       }
+       }
+		 free(hca_type_arr);
+		 hca_type_arr = NULL;
 	    address_size = 0;
 	    free(address);
 	    address = NULL;
-	    num_waiting_to_accept = numtasks;
+		 if (version == 6)
+			num_waiting_to_read = numtasks;
+		 else
+	    	num_waiting_to_accept = numtasks;
 	    goto next_phase;
 	} else if (phase == 1) {
 	    /*
@@ -556,7 +607,8 @@
 	goto out;
     }
 
-    /*
+   if (!(version == 6 && phase == 1)){ 
+	 /*
      * If there's a new connection to accept, do so and add it to the
      * poll list for later reading.
      */
@@ -591,6 +643,7 @@
 #endif
 	++num_waiting_to_read;
     }
+	}
 
     /*
      * Poll for something to read.
@@ -614,7 +667,6 @@
 	error_errno("%s: select", __func__);
     for (fd=0; fd <= fdmax; fd++) {
 	if (FD_ISSET(fd, &trfs)) {
-	    FD_CLR(fd, &rfs);
 #endif
 	    --num_waiting_to_read;
 	    ++ret;
@@ -622,12 +674,20 @@
 	          num_waiting_to_read);
 	    rank = read_ib_one(fd);
 
+#ifndef HAVE_POLL
+		/* This ensures that the socket descriptors survive phase 0 of version 6. */
+		if (version != 6)
+			FD_CLR(fd, &rfs);	
+#endif
+
 	    if (rank < 0) {
 		close(fd);
 		ret = rank;
 		goto out;  /* let obit poll catch it later */
 	    }
 
+		if (version == 6 && phase == 1)
+			continue;
 	    /* rank checked in already? */
 	    if (fds[rank] != -1)
 		error("%s: rank %d checked in twice", __func__, rank);


More information about the mpiexec mailing list