patch for lam version 6.5.8
Mark Hartner
hartner at cs.utah.edu
Tue Mar 25 17:51:58 EST 2003
Howdy,
I ported the mpiexec patch to lam version 6.5.8. We've been running it
for several weeks on our 128 node cluster without a hitch. The patch is
attached.
Mark
-------------- next part --------------
diff -ru unpatched/acconfig.h patched/acconfig.h
--- unpatched/acconfig.h 2003-01-06 11:23:18.000000000 -0700
+++ patched/acconfig.h 2003-01-06 11:15:19.000000000 -0700
@@ -340,6 +340,16 @@
#define LAM_RSH_NEED_MINUSMINUS 0
/*
+ * Do we want to use 'mpiexec' for job startup?
+ */
+#define LAM_WITH_MPIEXEC 1
+
+/*
+ * The full path to 'mpiexec'
+ */
+#define LAM_MPIEXEC "/usr/local/bin/mpiexec"
+
+/*
* System libraries
*/
#define LAM_SYSLIBS "bogusness"
diff -ru unpatched/otb/sys/kernel/kernelio.c patched/otb/sys/kernel/kernelio.c
--- unpatched/otb/sys/kernel/kernelio.c 2003-01-06 11:21:26.000000000 -0700
+++ patched/otb/sys/kernel/kernelio.c 2003-01-06 11:17:06.000000000 -0700
@@ -71,6 +71,7 @@
int kio_recv(struct kmsg *recvkmsg, int4 minlen, int fd_client);
/* recv to internal proc */
int kio_to(struct timeval *delay, void (*f)()); /* register timeout */
+void kio_shutdown(); /* cleanup */
/*
* external functions
@@ -112,6 +113,7 @@
static struct sockaddr_un
kernel_un; /* kernel address */
+/*
+ * Set to 1 by the SIGTERM/SIGINT handler and polled by the request loop.
+ * It is written asynchronously from a signal handler, so it must be
+ * volatile and of type sig_atomic_t.
+ */
+static volatile sig_atomic_t shutdown_request;
static struct {
void (*kn_func)(); /* interrupt function */
@@ -123,7 +125,7 @@
/*
* local functions
*/
-static void kio_shutdown(); /* cleanup and exit */
+static void handle_sigend(void);
/*
* kio_init
@@ -168,9 +170,11 @@
/*
* Create the socket.
*/
+ i = 1;
if ((sd_kernel = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
lampanic("lamd kernel: problem with socket()");
+ setsockopt(sd_kernel, SOL_SOCKET, SO_REUSEADDR, &i, sizeof(i));
fd_max = sd_kernel;
/*
* Bind the kernel's address to the socket.
@@ -205,10 +209,11 @@
/*
* Catch SIGTERM and SIGINT and kill all attached processes.
*/
- if (_lam_signal(SIGTERM, kio_shutdown) == SIG_ERR)
+ shutdown_request = 0;
+ if (_lam_signal(SIGTERM, handle_sigend) == SIG_ERR)
lampanic("lamd kernel: problem with internal call _lam_signal() (2)");
- if (_lam_signal(SIGINT, kio_shutdown) == SIG_ERR)
+ if (_lam_signal(SIGINT, handle_sigend) == SIG_ERR)
lampanic("lamd kernel: problem with internal call _lam_signal() (3)");
FD_ZERO(&allfds);
@@ -262,13 +267,24 @@
}
}
+ if (shutdown_request) {
+ request.kq_req = KQSHUTDOWN;
+ return(&request);
+ }
+ /* N.B. We miss signals that come in right here; we have
+ * to rely on the timeout to pick them up */
while (((nfd_ready = select(fd_max + 1, &readfds,
(fd_set *) 0, (fd_set *) &exceptfds, pto)) < 0) &&
- (errno == EINTR)) {
+ (errno == EINTR && !shutdown_request)){
memcpy((char *) &readfds, (char *) &allfds, sizeof(fd_set));
FD_SET(sd_kernel, &readfds);
memcpy((char *) &exceptfds, (char *) &readfds, sizeof(fd_set));
}
+ if (shutdown_request) {
+ request.kq_req = KQSHUTDOWN;
+ return(&request);
+ }
+
if (nfd_ready < 0)
lampanic("lamd kernel: problem with select() (1)");
/*
@@ -599,18 +615,22 @@
return(fd_ready);
}
+/*
+ * handle_sigend
+ *
+ * Function: - signal handler installed for SIGTERM and SIGINT
+ * - only records that a shutdown was requested; the
+ * request loop notices the flag and does the real work
+ *
+ * NOTE(review): declared with no parameters although it is installed
+ * through _lam_signal(); confirm this matches the handler type that
+ * _lam_signal() expects.
+ */
+static void
+handle_sigend(void)
+{
+ shutdown_request = 1;
+}
+
/*
* kio_shutdown
*
- * Function: - cleanup and exit
+ * Function: - cleanup of IO system
+ * - calls kkillall(), shuts down the kernel socket
+ * and then calls kio_cleanup()
+ * - no longer static and no longer calls exit(); the
+ * caller is responsible for terminating the process
*/
-static void
+void
kio_shutdown()
{
kkillall();
+ /* how == 2 is the conventional value of SHUT_RDWR (no more sends or receives) */
shutdown(sd_kernel, 2);
kio_cleanup();
- exit(0);
}
/*
diff -ru unpatched/otb/sys/kernel/kouter.c patched/otb/sys/kernel/kouter.c
--- unpatched/otb/sys/kernel/kouter.c 2003-01-06 11:21:26.000000000 -0700
+++ patched/otb/sys/kernel/kouter.c 2003-01-06 11:17:06.000000000 -0700
@@ -31,6 +31,7 @@
#include <debug.h>
#include <kreq.h>
+#include <preq.h>
#include <net.h>
#include <terror.h>
#include <typical.h>
@@ -49,6 +50,7 @@
extern void kboot();
extern void kio_close();
extern void kio_init();
+extern void kio_shutdown();
extern void kio_reply();
extern void kio_send();
extern void kio_transfer();
@@ -61,6 +63,7 @@
/*
* local functions
*/
+static void kqshutdown(void);
static void kqattach(struct kproc *pclient, struct kreq *pkq);
static void kqdetach(struct kreq *pkq);
static void kqsurrender(struct kproc *pclient, struct kreq *pkq);
@@ -216,7 +219,9 @@
/*
* Service special or non-client requests.
*/
- if (pkq->kq_req == KQDETACH) {
+ if (pkq->kq_req == KQSHUTDOWN) {
+ kqshutdown();
+ } else if (pkq->kq_req == KQDETACH) {
kqdetach(pkq);
} else if ((pkq->kq_req == KQATTACH) && (pkq->kq_index == -1)) {
kqattach((struct kproc *) 0, pkq);
@@ -241,6 +246,31 @@
}
/*
+ * kqshutdown
+ *
+ * Function: - services a KQSHUTDOWN request, i.e. a shutdown
+ * requested by a fatal signal
+ * - nukes every process on the pready list, cleans up
+ * LAM objects, shuts down kernel I/O and exits with
+ * status 0
+ */
+static void
+kqshutdown(void)
+{
+ struct kproc *victim;
+
+ lamlog("kouter: shutting down");
+
+ /* Walk the pready list and nuke each process on it */
+ victim = pready;
+ while (victim) {
+ knuke(victim);
+ victim = victim->kp_next;
+ }
+
+ /* Free shared memory etc. (per the original author's note). Removal
+ * of the session directory is left disabled below. */
+ lam_cleanup_objects();
+ /* lam_rmsocknamedir(); */
+
+ /* kio_shutdown() only cleans up; terminating the process is done here */
+ kio_shutdown();
+ exit(0);
+}
+
+/*
* kqattach
*
* Function: - attaches a new client process
diff -ru unpatched/share/boot/lambootagent.c patched/share/boot/lambootagent.c
--- unpatched/share/boot/lambootagent.c 2003-01-06 11:20:49.000000000 -0700
+++ patched/share/boot/lambootagent.c 2003-01-06 11:18:50.000000000 -0700
@@ -55,8 +55,6 @@
int
lambootagent(struct lamnode *lamnet, int nlamnet, int *nboot, int *nrun)
{
- int agent_port; /* port number for replies */
- int agent_sd; /* socket for replies */
int boot_sd; /* connection to new node */
int cmdc; /* command vector count */
int dlport;
@@ -67,6 +65,18 @@
char **cmdv; /* command vector */
char *batchid; /* batch job ID */
unsigned char *p;
+ char sep = ' ';
+#if LAM_WITH_MPIEXEC
+ int agent_port[nlamnet]; /* port number for replies */
+ int agent_sd[nlamnet]; /* socket for replies */
+ char tmpnam[80]; /* mpiexec config file name */
+ int tmpfd;
+ FILE *fp;
+ pid_t childpid;
+#else
+ int agent_port; /* port number for replies */
+ int agent_sd; /* socket for replies */
+#endif
*nboot = 0;
*nrun = 0;
@@ -81,6 +91,48 @@
fl_verbose = opt_taken('v');
fl_fast = opt_taken('b');
fl_close = opt_taken('s');
+
+#if LAM_WITH_MPIEXEC
+/*
+ * Create mpiexec config file.
+ *
+ * The guard is "#if", not "#ifdef", to agree with the other
+ * LAM_WITH_MPIEXEC tests in this function (including the one around the
+ * declarations of tmpnam, tmpfd and fp). With "#ifdef" this code would
+ * still be compiled when the macro is defined as 0.
+ */
+ strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+ tmpfd = mkstemp(tmpnam);
+ if (tmpfd == -1) {
+ perror("Create temporary file failed");
+ exit(1);
+ }
+ /* Wrap the descriptor in a stdio stream for the fputs() calls below */
+ fp = fdopen(tmpfd, "w");
+ if (!fp) {
+ perror("Open of temp file failed");
+ exit(1);
+ }
+ if (fl_verbose) {
+ printf("Using mpiexec config file %s\n", tmpnam);
+ }
+#endif
+
+#if LAM_WITH_MPIEXEC
+/*
+ * Allocate server sockets and ports: one listening socket per entry in
+ * lamnet, so each node's reply port can be handed to it individually.
+ */
+ for (i = 0; i < nlamnet; i++) {
+ agent_port[i] = 0;
+ agent_sd[i] = sfh_sock_open_srv_inet_stm(&agent_port[i]);
+ if (agent_sd[i] < 0) {
+ show_help("boot", "socket-fail", NULL);
+ return(LAMERROR);
+ }
+/*
+ * Make the sockets close on exec.
+ */
+ if (fcntl(agent_sd[i], F_SETFD, 1) == -1) {
+ show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)",
+ NULL);
+ return(LAMERROR);
+ }
+ }
+#else
/*
* Allocate a server socket and port.
*/
@@ -98,6 +150,8 @@
NULL);
return(LAMERROR);
}
+#endif /* LAM_WITH_MPIEXEC */
+
/*
* Find the local node.
*/
@@ -176,19 +230,27 @@
/*
* Override the $inet_topo variable.
*/
+#if LAM_WITH_MPIEXEC
+ /* Mpiexec's configuration file needs to be escaped. */
+ sep = '"';
+#endif
p = (unsigned char *) &lamnet[local].lnd_addr.sin_addr;
argvadd(&cmdc, &cmdv, "-I");
sprintf(buf, "%c%s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s%c",
- i == local ? ' ' : '"',
+ sep,
opt_taken('x') ? "-x " : "",
(unsigned) p[0], (unsigned) p[1],
(unsigned) p[2], (unsigned) p[3],
+#if LAM_WITH_MPIEXEC
+ agent_port[i],
+#else
agent_port,
+#endif
i,
origin,
(strlen(batchid) == 0 ? " " : "-b"),
(strlen(batchid) == 0 ? " " : batchid),
- i == local ? ' ' : '"');
+ sep);
argvadd(&cmdc, &cmdv, buf);
VERBOSE("Executing %s on n%d (%s - %d CPU%s)...\n",
@@ -197,7 +259,7 @@
(lamnet[i].lnd_ncpus > 1) ? "s" : "");
(*nboot)++;
-
+#if !LAM_WITH_MPIEXEC
if (i == local) {
if (fl_debug) {
int j;
@@ -252,13 +314,84 @@
*/
if (close(boot_sd)) return(LAMERROR);
(*nrun)++;
+#else /* LAM_WITH_MPIEXEC */
+ /* Write out the Mpiexec configuration for this host */
+ fputs(lamnet[i].lnd_hname, fp);
+ fputs(" :", fp);
+ for (j = 0; j < cmdc; j++) {
+ fputc(' ', fp);
+ fputs(cmdv[j], fp);
+ }
+ fputc('\n', fp);
+ argvfree(cmdv);
+#endif /* LAM_WITH_MPIEXEC */
}
+#if LAM_WITH_MPIEXEC
+/*
+ * Fire off mpiexec to start the hboot processes.
+ */
+ fclose(fp);
+ cmdc = 0;
+ cmdv = 0;
+ argvadd(&cmdc, &cmdv, LAM_MPIEXEC);
+ argvadd(&cmdc, &cmdv, "-comm=none");
+ argvadd(&cmdc, &cmdv, "-config");
+ argvadd(&cmdc, &cmdv, tmpnam);
+ (void) fflush(stdout);
+ (void) fflush(stderr);
+ childpid = fork();
+ if (childpid == -1) {
+ lamfail("lambootagent fork failed");
+ } else if (childpid == 0) {
+ fclose(stdin);
+ execv(cmdv[0], cmdv);
+ lamfail("execv failed");
+ }
+ argvfree(cmdv);
+ for (i = 0; i < nlamnet; ++i) {
+/*
+ * Skip nodes that are invalid or already booted.
+ */
+ if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
+ !(lamnet[i].lnd_type & NT_BOOT)) continue;
+/*
+ * Accept a connection from the new host.
+ */
+ boot_sd = sfh_sock_accept_tmout(agent_sd[i], LAM_TO_BOOT);
+ if (boot_sd < 0) return(LAMERROR);
+/*
+ * Read the new host port numbers.
+ */
+ if (readcltcoord(boot_sd, &lamnet[i].lnd_bootport,
+ &dlport)) return(LAMERROR);
+ lamnet[i].lnd_addr.sin_port = htons((unsigned short) dlport);
+/*
+ * Close the host connection.
+ */
+ if (close(boot_sd)) return(LAMERROR);
+ (*nrun)++;
+ }
+
+ if (fl_verbose) {
+ printf("all nodes connected\n");
+ }
+/*
+ * mpiexec must have fired up by now, so we can remove the config file
+ */
+ unlink(tmpnam);
+
+ for (i = 0; i < nlamnet; ++i) {
+ if (close(agent_sd[i])) return(LAMERROR);
+ }
+
+#else
if (close(agent_sd)) return(LAMERROR);
+#endif /* LAM_WITH_MPIEXEC */
if (fl_verbose) {
- nodespin_init("topology");
- }
+ nodespin_init("topology");
+ }
/*
* Send link information to all nodes that have been booted.
*/
diff -ru unpatched/share/include/kreq.h patched/share/include/kreq.h
--- unpatched/share/include/kreq.h 2003-01-06 11:20:43.000000000 -0700
+++ patched/share/include/kreq.h 2003-01-06 11:18:25.000000000 -0700
@@ -86,6 +86,7 @@
*/
#define KQDETACH 7 /* end kernel session */
#define KQDUMP 8 /* print process descriptors */
+#define KQSHUTDOWN 9 /* shutdown on signal */
/*
* process states
diff -ru unpatched/share/kreq/kcreate.c patched/share/kreq/kcreate.c
--- unpatched/share/kreq/kcreate.c 2003-01-06 11:20:53.000000000 -0700
+++ patched/share/kreq/kcreate.c 2003-01-06 11:19:01.000000000 -0700
@@ -102,7 +102,14 @@
sigaction(SIGCHLD, &act, 0);
sigaction(SIGPIPE, &act, 0);
+#if !LAM_WITH_MPIEXEC
+/*
+ * We do NOT call setsid when using PBS+Mpiexec; this way PBS can keep track
+ * of the spawned process.
+ */
(void) setsid();
+#endif
+
/*
* Redirect the stdio fd's
*/
diff -ru unpatched/tools/hboot/hboot.c patched/tools/hboot/hboot.c
--- unpatched/tools/hboot/hboot.c 2003-01-06 11:21:00.000000000 -0700
+++ patched/tools/hboot/hboot.c 2003-01-06 11:19:10.000000000 -0700
@@ -25,6 +25,7 @@
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
+#include <sys/wait.h>
#include <unistd.h>
#include <all_list.h>
@@ -81,6 +82,9 @@
char buf[32]; /* formatting buffer */
char *full; /* full pathname */
char *tail; /* tail of full pathname */
+#if LAM_WITH_MPIEXEC
+ int status;
+#endif
/* Ensure that we are not root */
@@ -227,7 +231,7 @@
exit(errno);
}
-#if 1
+#if !LAM_WITH_MPIEXEC
/* Comment this out to make the TM extensions to PBS work
nicely -- everything will be in one session, so TM can kill
it when it dies. */
@@ -255,6 +259,8 @@
for (p = al_top(list_psc); p; p = al_next(list_psc, p)) {
DBUG("hboot: fork %s\n", p->psc_argv[0]);
+ fflush(stdout);
+ fflush(stderr);
if ((pid = fork()) < 0) {
show_help(NULL, "system-call-fail", "fork", NULL);
exit(errno);
@@ -311,6 +317,30 @@
sleep((unsigned int) p->psc_delay);
}
}
+#if LAM_WITH_MPIEXEC
+/*
+ * When using Mpiexec+PBS, we want the mpiexec spawned by lamboot (the one
+ * that spawned hboot) to last for the duration of the PBS job. Thus, we
+ * don't want to exit hboot until all processes have exited, and so we
+ * wait for them here.
+ */
+ do {
+ fflush(stdout);
+ fflush(stderr);
+ do {
+ pid = wait(&status);
+ } while (pid == -1 && errno == EINTR);
+ if (pid > 0 && fl_debug) {
+ printf("Child pid %d exited ", pid);
+ if (WIFEXITED(status)) {
+ printf("with status %d", WEXITSTATUS(status));
+ } else if (WIFSIGNALED(status)) {
+ printf("on signal %d", WTERMSIG(status));
+ }
+ printf("\n");
+ }
+ } while (pid > 0);
+#endif
return(0);
}
diff -ru unpatched/tools/lamboot/lamboot.c patched/tools/lamboot/lamboot.c
--- unpatched/tools/lamboot/lamboot.c 2003-01-06 11:21:10.000000000 -0700
+++ patched/tools/lamboot/lamboot.c 2003-01-06 11:19:24.000000000 -0700
@@ -253,6 +253,9 @@
*/
if (cmdc == 2) {
fname = cmdv[1];
+#if LAM_WITH_MPIEXEC
+ } else if ((fname = getenv("PBS_NODEFILE"))) {
+#endif
} else if ((fname = getenv("LAMBHOST"))) {
} else if ((fname = getenv("TROLLIUSBHOST"))) {
} else {
diff -ru unpatched/tools/wipe/wipe.c patched/tools/wipe/wipe.c
--- unpatched/tools/wipe/wipe.c 2003-01-06 11:21:13.000000000 -0700
+++ patched/tools/wipe/wipe.c 2003-01-06 11:19:29.000000000 -0700
@@ -68,6 +68,10 @@
int badhost; /* bad host index */
int r, j, success = 1;
struct lamnode *lamnet; /* network description array */
+#if LAM_WITH_MPIEXEC
+ char tmpnam[80];
+ int tmpfd;
+#endif
/* Ensure that we are not root */
@@ -174,6 +178,25 @@
} else {
DBUG("wipe: killing LAM from a non-member machine\n");
}
+#if LAM_WITH_MPIEXEC
+/*
+ * Create mpiexec config file.
+ */
+ strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+ tmpfd = mkstemp(tmpnam);
+ if (tmpfd == -1) {
+ perror("Create temporary file failed");
+ exit(1);
+ }
+ fp = fdopen(tmpfd, "w");
+ if (!fp) {
+ perror("Open of temp file failed");
+ exit(1);
+ }if (fl_verbose) {
+ printf("Using mpiexec config file %s\n", tmpnam);
+ }
+#endif
+
/*
* Build the tkill command.
*/
@@ -199,6 +222,46 @@
argvadd(&cmdn, &cmdv, "-b");
argvadd(&cmdn, &cmdv, batchid);
}
+
+#if LAM_WITH_MPIEXEC
+/* Write Mpiexec config file */
+ for (i = 0; (i < nlamnet) && limit; ++i) {
+ if (limit > 0) --limit;
+ fputs(lamnet[i].lnd_hname, fp);
+ fputc(' ', fp);
+ }
+ fputc(':', fp);
+ for (i = 0; i < cmdn; i++) {
+ fputc(' ', fp);
+ fputs(cmdv[i], fp);
+ }
+ fputc('\n', fp);
+ argvfree(cmdv);
+ fclose(fp);
+
+/* Run mpiexec */
+ cmdn = 0;
+ cmdv = 0;
+ argvadd(&cmdn, &cmdv, LAM_MPIEXEC);
+ argvadd(&cmdn, &cmdv, "-comm=none");
+ argvadd(&cmdn, &cmdv, "-config");
+ argvadd(&cmdn, &cmdv, tmpnam);
+
+ r = _lam_few(cmdv);
+ unlink(tmpnam);
+
+ if (r) {
+ errno = r;
+ if (errno != EUNKNOWN) {
+ terror("wipe");
+ } else
+ show_help(NULL, "unknown", NULL);
+ global_ret = r;
+ success = 0;
+ }
+
+#else
+
/*
* Loop over all host nodes.
*/
@@ -250,6 +313,7 @@
success = 0;
}
}
+#endif /* LAM_WITH_MPIEXEC */
if (success) {
DBUG("wipe completed successfully\n");
@@ -290,6 +354,9 @@
*/
if (cmdc == 2) {
bhost = cmdv[1];
+#if LAM_WITH_MPIEXEC
+ } else if ((bhost = getenv("PBS_NODEFILE"))) {
+#endif
} else if ((bhost = getenv("LAMBHOST"))) {
} else if ((bhost = getenv("TROLLIUSBHOST"))) {
} else {
More information about the mpiexec
mailing list