LAM/MPI support for Mpiexec (experimental)
Ben Webb
ben at bellatrix.pcl.ox.ac.uk
Fri May 17 07:35:26 EDT 2002
On Thu, May 16, 2002 at 07:46:20PM +0100, Ben Webb wrote:
> The attached patch to LAM 6.5.6 makes it work with Mpiexec on a
> PBS cluster.
...
> - Processes started via LAM's mpirun don't record their CPU usage, etc.
> with PBS, and neither do they get killed if you "qdel" the PBS job.
> I'm not entirely sure why this is, as the processes are children of
> the TM-spawned lamd process. I think lamd must be calling setsid()
> somewhere. I will investigate further.
Yes, in share/kreq/kcreate.c. Fixed in the attached, updated, patch.
> - Mpiexec reports I/O errors after startup from lamboot. I think this is
> because it inherits lamboot's stdin etc., and am pretty sure that just
> closing these descriptors will solve this problem.
Closing stdin appears to solve the problem; this is what I do with the
new patch.
PBS now keeps track of LAM/MPI processes properly. There are
still some problems, however (suggestions appreciated)... "qdel" on a
LAM/MPI job tends to leave processes running on node 0, and any SysV IPC
shared memory or semaphores are not cleaned up. I have my kill_delay set
to 30, so I suspect this is because lamd doesn't catch SIGTERM.
Ben
--
ben at bellatrix.pcl.ox.ac.uk http://bellatrix.pcl.ox.ac.uk/~ben/
"So you found a girl who thinks really deep thoughts,
What's so amazing about really deep thoughts?"
-------------- next part --------------
diff -Nur -Xlam.exclude lam-6.5.6/share/boot/lambootagent.c lam-6.5.6-patched/share/boot/lambootagent.c
--- lam-6.5.6/share/boot/lambootagent.c Mon Nov 19 16:13:45 2001
+++ lam-6.5.6-patched/share/boot/lambootagent.c Fri May 17 11:02:25 2002
@@ -73,8 +73,8 @@
int
lambootagent(struct lamnode *lamnet, int nlamnet, int *nboot, int *nrun)
{
- int agent_port; /* port number for replies */
- int agent_sd; /* socket for replies */
+ int agent_port[nlamnet]; /* port number for replies */
+ int agent_sd[nlamnet]; /* socket for replies */
int boot_sd; /* connection to new node */
int cmdc; /* command vector count */
int dlport;
@@ -84,7 +84,12 @@
int4 origin; /* origin node ID */
char **cmdv; /* command vector */
char *batchid; /* batch job ID */
+ char *mpiexec[10]; /* argv for mpiexec invocation */
+ char tmpnam[80];
+ int tmpfd;
+ FILE *fp;
unsigned char *p;
+ pid_t childpid;
*nboot = 0;
*nrun = 0;
@@ -99,22 +104,43 @@
fl_verbose = opt_taken('v');
fl_fast = opt_taken('b');
fl_close = opt_taken('s');
+
/*
- * Allocate a server socket and port.
+ * Write mpiexec config file.
*/
- agent_port = 0;
- agent_sd = sfh_sock_open_srv_inet_stm(&agent_port);
- if (agent_sd < 0) {
- show_help("boot", "socket-fail", NULL);
- return(LAMERROR);
+ strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+ tmpfd = mkstemp(tmpnam);
+ if (tmpfd == -1) {
+ perror("Create temporary file failed");
+ exit(1);
+ }
+ fp = fdopen(tmpfd, "w");
+ if (!fp) {
+ perror("Open of temp file failed");
+ exit(1);
+ }
+ if (fl_verbose) {
+ printf("Using mpiexec config file %s\n", tmpnam);
}
+
/*
- * Make the socket close on exec.
+ * Allocate server sockets and ports.
*/
- if (fcntl(agent_sd, F_SETFD, 1) == -1) {
- show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)",
- NULL);
- return(LAMERROR);
+ for (i = 0; i < nlamnet; i++) {
+ agent_port[i] = 0;
+ agent_sd[i] = sfh_sock_open_srv_inet_stm(&agent_port[i]);
+ if (agent_sd[i] < 0) {
+ show_help("boot", "socket-fail", NULL);
+ return(LAMERROR);
+ }
+/*
+ * Make the sockets close on exec.
+ */
+ if (fcntl(agent_sd[i], F_SETFD, 1) == -1) {
+ show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)",
+ NULL);
+ return(LAMERROR);
+ }
}
/*
* Find the local node.
@@ -160,18 +186,14 @@
/*
* Invoke hboot on the new host.
*/
- cmdc = 0;
- cmdv = 0;
- argvadd(&cmdc, &cmdv, DEFTHBOOT);
- argvadd(&cmdc, &cmdv, "-t");
- argvadd(&cmdc, &cmdv, "-c");
- argvadd(&cmdc, &cmdv, "lam-conf.lam");
+ fprintf(fp, "%s : %s -t -c lam-conf.lam", lamnet[i].lnd_hname,
+ DEFTHBOOT);
if (fl_debug) {
- argvadd(&cmdc, &cmdv, "-d");
+ fprintf(fp, " -d");
}
if (fl_verbose) {
- argvadd(&cmdc, &cmdv, "-v");
+ fprintf(fp, " -v");
}
/*
* If remote node, close stdio of processes, unless forced by the
@@ -180,7 +202,7 @@
* hboot/lamd on somenode to close their stdio so that rsh can finish.
*/
if (i != local || fl_close) {
- argvadd(&cmdc, &cmdv, "-s");
+ fprintf(fp, " -s");
}
/*
* If this is under a batch system, pass the -b to both hboot and to
@@ -188,26 +210,22 @@
*/
batchid = get_batchid();
if (strlen(batchid) > 0) {
- argvadd(&cmdc, &cmdv, "-b");
- argvadd(&cmdc, &cmdv, batchid);
+ fprintf(fp, " -b %s", batchid);
}
/*
* Override the $inet_topo variable.
*/
p = (unsigned char *) &lamnet[local].lnd_addr.sin_addr;
- argvadd(&cmdc, &cmdv, "-I");
- sprintf(buf, "%c%s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s%c",
- i == local ? ' ' : '"',
+ fprintf(fp, " -I \" %s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s\"",
opt_taken('x') ? "-x " : "",
(unsigned) p[0], (unsigned) p[1],
(unsigned) p[2], (unsigned) p[3],
- agent_port,
+ agent_port[i],
i,
origin,
(strlen(batchid) == 0 ? " " : "-b"),
- (strlen(batchid) == 0 ? " " : batchid),
- i == local ? ' ' : '"');
- argvadd(&cmdc, &cmdv, buf);
+ (strlen(batchid) == 0 ? " " : batchid));
+ fprintf(fp, "\n");
VERBOSE("Executing %s on n%d (%s - %d CPU%s)...\n",
DEFTHBOOT, i, lamnet[i].lnd_hname,
@@ -215,48 +233,38 @@
(lamnet[i].lnd_ncpus > 1) ? "s" : "");
(*nboot)++;
+ }
- if (i == local) {
- if (fl_debug) {
- int j;
-
- printf("lamboot: attempting to execute \"");
- for (j = 0; j < cmdc; j++) {
- if (j > 0)
- printf(" ");
- if (strchr(cmdv[j], ' ') != NULL)
- printf("\"%s\"", cmdv[j]);
- else
- printf("%s", cmdv[j]);
- }
- printf("\"\n");
- }
- r = _lam_few(cmdv);
-
- if (r) {
- (*nboot)--;
- errno = r;
- show_help("boot", "fork-fail", cmdv[0], NULL);
- argvfree(cmdv);
- return(LAMERROR);
- }
- } else {
- r = inetexec(lamnet[i].lnd_hname, lamnet[i].lnd_uname,
- cmdv, (fl_debug ? "lamboot" : NULL),
- fl_fast);
-
- if (r) {
- (*nboot)--;
- argvfree(cmdv);
- /* inetexec will display errors if it
- fails */
- return(LAMERROR);
- }
- }
+/*
+ * Fire off mpiexec to start the hboot processes.
+ */
+ fclose(fp);
+ mpiexec[0] = MPIEXEC;
+ mpiexec[1] = "-comm=none";
+ mpiexec[2] = "-config";
+ mpiexec[3] = tmpnam;
+ mpiexec[4] = NULL;
+ (void) fflush(stdout);
+ (void) fflush(stderr);
+ childpid = fork();
+ if (childpid == -1) {
+ lamfail("lambootagent fork failed");
+ } else if (childpid == 0) {
+ fclose(stdin);
+ execv(mpiexec[0], mpiexec);
+ lamfail("execv failed");
+ }
+
+ for (i = 0; i < nlamnet; ++i) {
+/*
+ * Skip nodes that are invalid or already booted.
+ */
+ if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
+ !(lamnet[i].lnd_type & NT_BOOT)) continue;
/*
* Accept a connection from the new host.
*/
- boot_sd = sfh_sock_accept_tmout(agent_sd, LAM_TO_BOOT);
+ boot_sd = sfh_sock_accept_tmout(agent_sd[i], LAM_TO_BOOT);
if (boot_sd < 0) return(LAMERROR);
/*
* Read the new host port numbers.
@@ -272,7 +280,17 @@
(*nrun)++;
}
- if (close(agent_sd)) return(LAMERROR);
+ if (fl_verbose) {
+ printf("all nodes connected\n");
+ }
+/*
+ * mpiexec must have fired up by now, so we can remove the config file
+ */
+ unlink(tmpnam);
+
+ for (i = 0; i < nlamnet; ++i) {
+ if (close(agent_sd[i])) return(LAMERROR);
+ }
if (fl_verbose) {
nodespin_init("topology");
diff -Nur -Xlam.exclude lam-6.5.6/share/include/lamnet.h lam-6.5.6-patched/share/include/lamnet.h
--- lam-6.5.6/share/include/lamnet.h Mon Nov 19 16:13:40 2001
+++ lam-6.5.6-patched/share/include/lamnet.h Fri May 17 11:01:36 2002
@@ -61,6 +61,7 @@
#define DEFTHBOOT "hboot"
#define DEFTTKILL "tkill"
#define DEFTWIPE "wipe"
+#define MPIEXEC "/usr/bin/mpiexec"
/*
* node description
diff -Nur -Xlam.exclude lam-6.5.6/share/kreq/kcreate.c lam-6.5.6-patched/share/kreq/kcreate.c
--- lam-6.5.6/share/kreq/kcreate.c Mon Nov 19 16:13:52 2001
+++ lam-6.5.6-patched/share/kreq/kcreate.c Fri May 17 10:59:18 2002
@@ -120,7 +120,10 @@
sigaction(SIGCHLD, &act, 0);
sigaction(SIGPIPE, &act, 0);
- (void) setsid();
+/*
+ * Do NOT call setsid; this way PBS can keep track of the spawned process.
+ */
+/* (void) setsid();*/
/*
* Redirect the stdio fd's
*/
diff -Nur -Xlam.exclude lam-6.5.6/tools/hboot/hboot.c lam-6.5.6-patched/tools/hboot/hboot.c
--- lam-6.5.6/tools/hboot/hboot.c Mon Nov 19 16:14:48 2001
+++ lam-6.5.6-patched/tools/hboot/hboot.c Thu May 16 16:15:52 2002
@@ -99,6 +99,8 @@
char buf[32]; /* formatting buffer */
char *full; /* full pathname */
char *tail; /* tail of full pathname */
+ char **pt;
+ int status;
/* Ensure that we are not root */
@@ -245,7 +247,7 @@
exit(errno);
}
-#if 1
+#if 0
/* Comment this out to make the TM extensions to PBS work
nicely -- everything will be in one session, so TM can kill
it when it dies. */
@@ -304,6 +306,7 @@
if (fl_debug) {
printf("hboot: attempting to execute \n");
}
+
execvp(p->psc_argv[0], p->psc_argv);
exit(errno);
}
@@ -323,6 +326,7 @@
printf("\n");
}
+ wait(&status);
}
if (p->psc_delay > 0) {
diff -Nur -Xlam.exclude lam-6.5.6/tools/lamboot/lamboot.c lam-6.5.6-patched/tools/lamboot/lamboot.c
--- lam-6.5.6/tools/lamboot/lamboot.c Mon Nov 19 16:14:49 2001
+++ lam-6.5.6-patched/tools/lamboot/lamboot.c Thu May 16 13:13:25 2002
@@ -271,6 +271,7 @@
*/
if (cmdc == 2) {
fname = cmdv[1];
+ } else if ((fname = getenv("PBS_NODEFILE"))) {
} else if ((fname = getenv("LAMBHOST"))) {
} else if ((fname = getenv("TROLLIUSBHOST"))) {
} else {
diff -Nur -Xlam.exclude lam-6.5.6/tools/wipe/wipe.c lam-6.5.6-patched/tools/wipe/wipe.c
--- lam-6.5.6/tools/wipe/wipe.c Mon Nov 19 16:14:50 2001
+++ lam-6.5.6-patched/tools/wipe/wipe.c Fri May 17 11:02:58 2002
@@ -86,6 +86,8 @@
int badhost; /* bad host index */
int r, j, success = 1;
struct lamnode *lamnet; /* network description array */
+ char tmpnam[80];
+ int tmpfd;
/* Ensure that we are not root */
@@ -192,15 +194,23 @@
} else {
DBUG("wipe: killing LAM from a non-member machine\n");
}
+
/*
- * Build the tkill command.
+ * Write mpiexec config file.
*/
- cmdn = 0;
- cmdv = 0;
- argvadd(&cmdn, &cmdv, DEFTTKILL);
-
- if (fl_debug) {
- argvadd(&cmdn, &cmdv, "-d");
+ strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+ tmpfd = mkstemp(tmpnam);
+ if (tmpfd == -1) {
+ perror("Create temporary file failed");
+ exit(1);
+ }
+ fp = fdopen(tmpfd, "w");
+ if (!fp) {
+ perror("Open of temp file failed");
+ exit(1);
+ }
+ if (fl_verbose) {
+ printf("Using mpiexec config file %s\n", tmpnam);
}
if (opt_taken('n')) {
@@ -208,71 +218,46 @@
} else {
limit = -1;
}
+
+ for (i = 0; (i < nlamnet) && limit; ++i) {
+ if (limit > 0) --limit;
+ fprintf(fp, lamnet[i].lnd_hname);
+ }
+ fprintf(fp, " : %s", DEFTTKILL);
+ if (fl_debug) {
+ fprintf(fp, " -d");
+ }
+
/*
* If we're running ounder a batch system, we have to propogate the
* socket name to all the remote tkill instances.
*/
batchid = get_batchid();
if (strlen(batchid) > 0) {
- argvadd(&cmdn, &cmdv, "-b");
- argvadd(&cmdn, &cmdv, batchid);
+ fprintf(fp, " -b %s", batchid);
}
+
/*
- * Loop over all host nodes.
+ * Build the mpiexec command.
*/
- global_ret = 0;
-
- for (i = 0; (i < nlamnet) && limit; ++i) {
-
- if (limit > 0) --limit;
-
- VERBOSE("Executing %s on n%d (%s)...\n", DEFTTKILL,
- lamnet[i].lnd_nodeid, lamnet[i].lnd_hname);
-
- if (fl_debug) {
- printf("wipe: attempting to launch \"");
- for (j = 0; j < cmdn; j++) {
- if (j > 0)
- printf(" ");
- printf("%s", cmdv[j]);
- }
- printf("\" ");
- }
-
- if (lamnet[i].lnd_type & NT_ORIGIN) {
- DBUG("(local execution)\n");
- r = _lam_few(cmdv);
-
- if (r) {
- errno = r;
- }
- } else {
- DBUG("(remote execution)\n");
- r = inetexec(lamnet[i].lnd_hname,
- lamnet[i].lnd_uname, cmdv,
- (fl_debug ? "wipe" : NULL),
- fl_fast);
- }
-
- if (r) {
- fprintf(stderr, "wipe: %s failed on n%d (%s)\n",
- DEFTTKILL, lamnet[i].lnd_nodeid,
- lamnet[i].lnd_hname);
-
- if (errno != EUNKNOWN) {
- terror("wipe");
- } else
- show_help(NULL, "unknown", NULL);
-
- global_ret = errno;
- success = 0;
- }
- }
-
- if (success) {
- DBUG("wipe completed successfully\n");
+ cmdn = 0;
+ cmdv = 0;
+ argvadd(&cmdn, &cmdv, MPIEXEC);
+ argvadd(&cmdn, &cmdv, "-comm=none");
+ argvadd(&cmdn, &cmdv, "-config");
+ argvadd(&cmdn, &cmdv, tmpnam);
+
+ r = _lam_few(cmdv);
+ unlink(tmpnam);
+
+ if (r) {
+ errno = r;
+ if (errno != EUNKNOWN) {
+ terror("wipe");
+ } else
+ show_help(NULL, "unknown", NULL);
} else {
- DBUG("wipe did NOT complete successfully\n");
+ DBUG("wipe completed successfully\n");
}
argvfree(cmdv);
@@ -308,6 +293,7 @@
*/
if (cmdc == 2) {
bhost = cmdv[1];
+ } else if ((bhost = getenv("PBS_NODEFILE"))) {
} else if ((bhost = getenv("LAMBHOST"))) {
} else if ((bhost = getenv("TROLLIUSBHOST"))) {
} else {
More information about the mpiexec
mailing list