LAM/MPI support for Mpiexec (experimental)

Ben Webb ben at bellatrix.pcl.ox.ac.uk
Fri May 17 07:35:26 EDT 2002


On Thu, May 16, 2002 at 07:46:20PM +0100, Ben Webb wrote:
> 	The attached patch to LAM 6.5.6 makes it work with Mpiexec on a
> PBS cluster.
...
> - Processes started via LAM's mpirun don't record their CPU usage, etc.
>   with PBS, and neither do they get killed if you "qdel" the PBS job.
>   I'm not entirely sure why this is, as the processes are children of
>   the TM-spawned lamd process. I think lamd must be calling setsid()
>   somewhere. I will investigate further.

Yes, in share/kreq/kcreate.c. Fixed in the attached, updated patch.

> - Mpiexec reports I/O errors after startup from lamboot. I think this is
>   because it inherits lamboot's stdin etc., and am pretty sure that just
>   closing these descriptors will solve this problem.

Closing stdin appears to solve the problem; this is what I do with the
new patch.

	PBS now keeps track of LAM/MPI processes properly. There are
still some problems, however (suggestions appreciated)... "qdel" on a
LAM/MPI job tends to leave processes running on node 0, and any SysV IPC
shared memory or semaphores are not cleaned up. I have my kill_delay set
to 30, so I suspect this is because lamd doesn't catch SIGTERM.

	Ben
-- 
ben at bellatrix.pcl.ox.ac.uk           http://bellatrix.pcl.ox.ac.uk/~ben/
"So you found a girl who thinks really deep thoughts,
 What's so amazing about really deep thoughts?"
-------------- next part --------------
diff -Nur -Xlam.exclude lam-6.5.6/share/boot/lambootagent.c lam-6.5.6-patched/share/boot/lambootagent.c
--- lam-6.5.6/share/boot/lambootagent.c	Mon Nov 19 16:13:45 2001
+++ lam-6.5.6-patched/share/boot/lambootagent.c	Fri May 17 11:02:25 2002
@@ -73,8 +73,8 @@
 int
 lambootagent(struct lamnode *lamnet, int nlamnet, int *nboot, int *nrun)
 {
-	int		agent_port;	/* port number for replies */
-	int		agent_sd;	/* socket for replies */
+	int		agent_port[nlamnet];	/* port number for replies */
+	int		agent_sd[nlamnet];	/* socket for replies */
 	int		boot_sd;	/* connection to new node */
 	int		cmdc;		/* command vector count */
 	int		dlport;
@@ -84,7 +84,12 @@
 	int4		origin;		/* origin node ID */
 	char		**cmdv;		/* command vector */
 	char		*batchid;	/* batch job ID */
+	char		*mpiexec[10];	/* argv for mpiexec invocation */
+	char		tmpnam[80];
+	int		tmpfd;
+	FILE		*fp;
 	unsigned char	*p;
+	pid_t		childpid;
 
 	*nboot = 0;
 	*nrun = 0;
@@ -99,22 +104,43 @@
 	fl_verbose = opt_taken('v');
 	fl_fast = opt_taken('b');
 	fl_close = opt_taken('s');
+
 /*
- * Allocate a server socket and port.
+ * Write mpiexec config file.
  */
-	agent_port = 0;
-	agent_sd = sfh_sock_open_srv_inet_stm(&agent_port);
-	if (agent_sd < 0) {
-	  show_help("boot", "socket-fail", NULL);
-	  return(LAMERROR);
+	strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+	tmpfd = mkstemp(tmpnam);
+	if (tmpfd == -1) {
+		perror("Create temporary file failed");
+		exit(1);
+	}
+	fp = fdopen(tmpfd, "w");
+	if (!fp) {
+		perror("Open of temp file failed");
+		exit(1);
+	}
+	if (fl_verbose) {
+		printf("Using mpiexec config file %s\n", tmpnam);
 	}
+
 /*
- * Make the socket close on exec.
+ * Allocate server sockets and ports.
  */
-	if (fcntl(agent_sd, F_SETFD, 1) == -1) {
-	  show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)", 
-		    NULL);
-	  return(LAMERROR);
+	for (i = 0; i < nlamnet; i++) {
+	  agent_port[i] = 0;
+	  agent_sd[i] = sfh_sock_open_srv_inet_stm(&agent_port[i]);
+	  if (agent_sd[i] < 0) {
+	    show_help("boot", "socket-fail", NULL);
+	    return(LAMERROR);
+	  }
+/*
+ * Make the sockets close on exec.
+ */
+	  if (fcntl(agent_sd[i], F_SETFD, 1) == -1) {
+	    show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)", 
+		      NULL);
+	    return(LAMERROR);
+	  }
 	}
 /*
  * Find the local node.
@@ -160,18 +186,14 @@
 /*
  * Invoke hboot on the new host.
  */
-		cmdc = 0;
-		cmdv = 0;
-		argvadd(&cmdc, &cmdv, DEFTHBOOT);
-		argvadd(&cmdc, &cmdv, "-t");
-		argvadd(&cmdc, &cmdv, "-c");
-		argvadd(&cmdc, &cmdv, "lam-conf.lam");
+		fprintf(fp, "%s : %s -t -c lam-conf.lam", lamnet[i].lnd_hname,
+						          DEFTHBOOT);
 
 		if (fl_debug) {
-			argvadd(&cmdc, &cmdv, "-d");
+			fprintf(fp, " -d");
 		}
 		if (fl_verbose) {
-			argvadd(&cmdc, &cmdv, "-v");
+			fprintf(fp, " -v");
 		}
 /*
  * If remote node, close stdio of processes, unless forced by the
@@ -180,7 +202,7 @@
  * hboot/lamd on somenode to close their stdio so that rsh can finish.
  */
 		if (i != local || fl_close) {
-			argvadd(&cmdc, &cmdv, "-s");
+			fprintf(fp, " -s");
 		}
 /*
  * If this is under a batch system, pass the -b to both hboot and to
@@ -188,26 +210,22 @@
  */
 		batchid = get_batchid();
 		if (strlen(batchid) > 0) {
-		  argvadd(&cmdc, &cmdv, "-b");
-		  argvadd(&cmdc, &cmdv, batchid); 
+		  fprintf(fp, " -b %s", batchid);
 		}
 /*
  * Override the $inet_topo variable.
  */
 		p = (unsigned char *) &lamnet[local].lnd_addr.sin_addr;
-		argvadd(&cmdc, &cmdv, "-I");
-		sprintf(buf, "%c%s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s%c",
-			i == local ? ' ' : '"',
+		fprintf(fp, " -I \" %s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s\"",
 			opt_taken('x') ? "-x " : "",
 			(unsigned) p[0], (unsigned) p[1],
 			(unsigned) p[2], (unsigned) p[3],
-			agent_port,
+			agent_port[i],
 			i,
 			origin,
 			(strlen(batchid) == 0 ? " " : "-b"),
-			(strlen(batchid) == 0 ? " " : batchid),
-			i == local ? ' ' : '"');
-		argvadd(&cmdc, &cmdv, buf);
+			(strlen(batchid) == 0 ? " " : batchid));
+		fprintf(fp, "\n");
 
 		VERBOSE("Executing %s on n%d (%s - %d CPU%s)...\n", 
 			DEFTHBOOT, i, lamnet[i].lnd_hname,
@@ -215,48 +233,38 @@
 			(lamnet[i].lnd_ncpus > 1) ? "s" : "");
 
 		(*nboot)++;
+	}
 
-		if (i == local) {
-		        if (fl_debug) {
-			  int j;
-			  
-			  printf("lamboot: attempting to execute \"");
-			  for (j = 0; j < cmdc; j++) {
-			    if (j > 0)
-			      printf(" ");
-			    if (strchr(cmdv[j], ' ') != NULL)
-			      printf("\"%s\"", cmdv[j]);
-			    else
-			      printf("%s", cmdv[j]);
-			  }
-			  printf("\"\n");
-			}
-			r = _lam_few(cmdv);
-
-			if (r) {
-				(*nboot)--;
-				errno = r;
-				show_help("boot", "fork-fail", cmdv[0], NULL);
-				argvfree(cmdv);
-				return(LAMERROR);
-			}
-		} else {
-			r = inetexec(lamnet[i].lnd_hname, lamnet[i].lnd_uname,
-				     cmdv, (fl_debug ? "lamboot" : NULL),
-				     fl_fast);
-
-			if (r) {
-				(*nboot)--;
-				argvfree(cmdv);
-				/* inetexec will display errors if it
-                                   fails */
-				return(LAMERROR);
-			}
-		}
+/*
+ * Fire off mpiexec to start the hboot processes.
+ */
+	fclose(fp);
+	mpiexec[0] = MPIEXEC;
+	mpiexec[1] = "-comm=none";
+	mpiexec[2] = "-config";
+	mpiexec[3] = tmpnam;
+	mpiexec[4] = NULL;
+	(void) fflush(stdout);
+	(void) fflush(stderr);
+	childpid = fork();
+	if (childpid == -1) {
+		lamfail("lambootagent fork failed");
+	} else if (childpid == 0) {
+		fclose(stdin);
+		execv(mpiexec[0], mpiexec);
+		lamfail("execv failed");
+	}
+
+	for (i = 0; i < nlamnet; ++i) {
+/*
+ * Skip nodes that are invalid or already booted.
+ */
+		if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
+				!(lamnet[i].lnd_type & NT_BOOT)) continue;
 /*
  * Accept a connection from the new host.
  */
-		boot_sd = sfh_sock_accept_tmout(agent_sd, LAM_TO_BOOT);
+		boot_sd = sfh_sock_accept_tmout(agent_sd[i], LAM_TO_BOOT);
 		if (boot_sd < 0) return(LAMERROR);
 /*
  * Read the new host port numbers.
@@ -272,7 +280,17 @@
 		(*nrun)++;
 	}
 
-	if (close(agent_sd)) return(LAMERROR);
+	if (fl_verbose) {
+		printf("all nodes connected\n");
+	}
+/*
+ * mpiexec must have fired up by now, so we can remove the config file
+ */
+	unlink(tmpnam);
+
+	for (i = 0; i < nlamnet; ++i) {
+		if (close(agent_sd[i])) return(LAMERROR);
+	}
 
 	if (fl_verbose) {
 		nodespin_init("topology");
diff -Nur -Xlam.exclude lam-6.5.6/share/include/lamnet.h lam-6.5.6-patched/share/include/lamnet.h
--- lam-6.5.6/share/include/lamnet.h	Mon Nov 19 16:13:40 2001
+++ lam-6.5.6-patched/share/include/lamnet.h	Fri May 17 11:01:36 2002
@@ -61,6 +61,7 @@
 #define DEFTHBOOT	"hboot"
 #define DEFTTKILL	"tkill"
 #define DEFTWIPE	"wipe"
+#define MPIEXEC		"/usr/bin/mpiexec"
 
 /*
  * node description
diff -Nur -Xlam.exclude lam-6.5.6/share/kreq/kcreate.c lam-6.5.6-patched/share/kreq/kcreate.c
--- lam-6.5.6/share/kreq/kcreate.c	Mon Nov 19 16:13:52 2001
+++ lam-6.5.6-patched/share/kreq/kcreate.c	Fri May 17 10:59:18 2002
@@ -120,7 +120,10 @@
 	sigaction(SIGCHLD, &act, 0);
 	sigaction(SIGPIPE, &act, 0);
 
-	(void) setsid();
+/*
+ * Do NOT call setsid; this way PBS can keep track of the spawned process.
+ */
+/*	(void) setsid();*/
 /*
  * Redirect the stdio fd's
  */
diff -Nur -Xlam.exclude lam-6.5.6/tools/hboot/hboot.c lam-6.5.6-patched/tools/hboot/hboot.c
--- lam-6.5.6/tools/hboot/hboot.c	Mon Nov 19 16:14:48 2001
+++ lam-6.5.6-patched/tools/hboot/hboot.c	Thu May 16 16:15:52 2002
@@ -99,6 +99,8 @@
 	char		buf[32];	/* formatting buffer */
 	char		*full;		/* full pathname */
 	char		*tail;		/* tail of full pathname */
+	char **pt;
+	int		status;
 
 	/* Ensure that we are not root */
 
@@ -245,7 +247,7 @@
 	  exit(errno);
 	}
 
-#if 1
+#if 0
 	/* Comment this out to make the TM extensions to PBS work
            nicely -- everything will be in one session, so TM can kill
            it when it dies. */
@@ -304,6 +306,7 @@
 			if (fl_debug) {
 			  printf("hboot: attempting to execute \n");
 			}
+
 			execvp(p->psc_argv[0], p->psc_argv);
 			exit(errno);
 		}
@@ -323,6 +326,7 @@
 
 				printf("\n");
 			}
+			wait(&status);
 		}
 
 		if (p->psc_delay > 0) {
diff -Nur -Xlam.exclude lam-6.5.6/tools/lamboot/lamboot.c lam-6.5.6-patched/tools/lamboot/lamboot.c
--- lam-6.5.6/tools/lamboot/lamboot.c	Mon Nov 19 16:14:49 2001
+++ lam-6.5.6-patched/tools/lamboot/lamboot.c	Thu May 16 13:13:25 2002
@@ -271,6 +271,7 @@
  */
 	if (cmdc == 2) {
 		fname = cmdv[1];
+	} else if ((fname = getenv("PBS_NODEFILE"))) {
 	} else if ((fname = getenv("LAMBHOST"))) {
 	} else if ((fname = getenv("TROLLIUSBHOST"))) {
 	} else {
diff -Nur -Xlam.exclude lam-6.5.6/tools/wipe/wipe.c lam-6.5.6-patched/tools/wipe/wipe.c
--- lam-6.5.6/tools/wipe/wipe.c	Mon Nov 19 16:14:50 2001
+++ lam-6.5.6-patched/tools/wipe/wipe.c	Fri May 17 11:02:58 2002
@@ -86,6 +86,8 @@
 	int		badhost;	/* bad host index */
 	int		r, j, success = 1;
 	struct lamnode	*lamnet;	/* network description array */
+	char		tmpnam[80];
+	int		tmpfd;
 
 	/* Ensure that we are not root */
 
@@ -192,15 +194,23 @@
 	} else {
 	  DBUG("wipe: killing LAM from a non-member machine\n");
 	}
+
 /*
- * Build the tkill command.
+ * Write mpiexec config file.
  */
-	cmdn = 0;
-	cmdv = 0;
-	argvadd(&cmdn, &cmdv, DEFTTKILL);
-
-	if (fl_debug) {
-		argvadd(&cmdn, &cmdv, "-d");
+	strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+	tmpfd = mkstemp(tmpnam);
+	if (tmpfd == -1) {
+		perror("Create temporary file failed");
+		exit(1);
+	}
+	fp = fdopen(tmpfd, "w");
+	if (!fp) {
+		perror("Open of temp file failed");
+		exit(1);
+	}
+	if (fl_verbose) {
+		printf("Using mpiexec config file %s\n", tmpnam);
 	}
 
 	if (opt_taken('n')) {
@@ -208,71 +218,46 @@
 	} else {
 		limit = -1;
 	}
+
+	for (i = 0; (i < nlamnet) && limit; ++i) {
+		if (limit > 0) --limit;
+		fprintf(fp, lamnet[i].lnd_hname);
+	}
+	fprintf(fp, " : %s", DEFTTKILL);
+	if (fl_debug) {
+		fprintf(fp, " -d");
+	}
+
 /*
  * If we're running ounder a batch system, we have to propogate the
  * socket name to all the remote tkill instances.
  */
 	batchid = get_batchid();
 	if (strlen(batchid) > 0) {
-	  argvadd(&cmdn, &cmdv, "-b");
-	  argvadd(&cmdn, &cmdv, batchid);
+	  fprintf(fp, " -b %s", batchid);
 	}
+
 /*
- * Loop over all host nodes.
+ * Build the mpiexec command.
  */
-	global_ret = 0;
-
-	for (i = 0; (i < nlamnet) && limit; ++i) {
-
-		if (limit > 0) --limit;
-
-		VERBOSE("Executing %s on n%d (%s)...\n", DEFTTKILL,
-				lamnet[i].lnd_nodeid, lamnet[i].lnd_hname);
-
-                if (fl_debug) {
-		  printf("wipe: attempting to launch \"");
-		  for (j = 0; j < cmdn; j++) {
-		    if (j > 0)
-		      printf(" ");
-		    printf("%s", cmdv[j]);
-		  }
-		  printf("\" ");
-		}
-
-		if (lamnet[i].lnd_type & NT_ORIGIN) {
-		        DBUG("(local execution)\n");
-			r = _lam_few(cmdv);
-
-			if (r) {
-				errno = r;
-			}
-		} else {
-		        DBUG("(remote execution)\n");
-			r = inetexec(lamnet[i].lnd_hname,
-				     lamnet[i].lnd_uname, cmdv, 
-				     (fl_debug ? "wipe" : NULL),
-				     fl_fast);
-		}
-
-		if (r) {
-			fprintf(stderr, "wipe: %s failed on n%d (%s)\n",
-					DEFTTKILL, lamnet[i].lnd_nodeid,
-					lamnet[i].lnd_hname);
-
-			if (errno != EUNKNOWN) {
-				terror("wipe");
-			} else
-			  show_help(NULL, "unknown", NULL);
-
-			global_ret = errno;
-			success = 0;
-		}
-	}
-
-	if (success) {
-	  DBUG("wipe completed successfully\n");
+	cmdn = 0;
+	cmdv = 0;
+	argvadd(&cmdn, &cmdv, MPIEXEC);
+	argvadd(&cmdn, &cmdv, "-comm=none");
+	argvadd(&cmdn, &cmdv, "-config");
+	argvadd(&cmdn, &cmdv, tmpnam);
+
+	r = _lam_few(cmdv);
+	unlink(tmpnam);
+
+	if (r) {
+		errno = r;
+		if (errno != EUNKNOWN) {
+			terror("wipe");
+		} else
+		  show_help(NULL, "unknown", NULL);
 	} else {
-	  DBUG("wipe did NOT complete successfully\n");
+		DBUG("wipe completed successfully\n");
 	}
 
 	argvfree(cmdv);
@@ -308,6 +293,7 @@
  */
 	if (cmdc == 2) {
 		bhost = cmdv[1];
+	} else if ((bhost = getenv("PBS_NODEFILE"))) {
 	} else if ((bhost = getenv("LAMBHOST"))) {
 	} else if ((bhost = getenv("TROLLIUSBHOST"))) {
 	} else {


More information about the mpiexec mailing list