patch for lam version 6.5.8

Mark Hartner hartner at cs.utah.edu
Tue Mar 25 17:51:58 EST 2003


Howdy,
  I ported the mpiexec patch to LAM version 6.5.8. We've been running it
for several weeks on our 128-node cluster without a hitch. The patch is
attached.

Mark
-------------- next part --------------
diff -ru unpatched/acconfig.h patched/acconfig.h
--- unpatched/acconfig.h	2003-01-06 11:23:18.000000000 -0700
+++ patched/acconfig.h	2003-01-06 11:15:19.000000000 -0700
@@ -340,6 +340,16 @@
 #define LAM_RSH_NEED_MINUSMINUS 0
 
 /*
+ * Do we want to use 'mpiexec' for job startup?
+ */
+#define LAM_WITH_MPIEXEC       1
+
+/*
+ * The full path to 'mpiexec'
+ */
+#define LAM_MPIEXEC "/usr/local/bin/mpiexec"
+
+/*
  * System libraries
  */
 #define LAM_SYSLIBS             "bogusness"
diff -ru unpatched/otb/sys/kernel/kernelio.c patched/otb/sys/kernel/kernelio.c
--- unpatched/otb/sys/kernel/kernelio.c	2003-01-06 11:21:26.000000000 -0700
+++ patched/otb/sys/kernel/kernelio.c	2003-01-06 11:17:06.000000000 -0700
@@ -71,6 +71,7 @@
 int kio_recv(struct kmsg *recvkmsg, int4 minlen, int fd_client); 
                                                 /* recv to internal proc */
 int kio_to(struct timeval *delay, void (*f)()); /* register timeout */
+void                   kio_shutdown();         /* cleanup */
 
 /*
  * external functions
@@ -112,6 +113,7 @@
 
 static struct sockaddr_un
 			kernel_un;		/* kernel address */
+static int             shutdown_request;
 
 static struct {
 	void		(*kn_func)();		/* interrupt function */
@@ -123,7 +125,7 @@
 /*
  * local functions
  */
-static void		kio_shutdown();		/* cleanup and exit */
+static void            handle_sigend(void);
 
 /*
  *	kio_init
@@ -168,9 +170,11 @@
 /*
  * Create the socket.
  */
+	i = 1;
 	if ((sd_kernel = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
 	  lampanic("lamd kernel: problem with socket()");
 
+	setsockopt(sd_kernel, SOL_SOCKET, SO_REUSEADDR, &i, sizeof(i));
 	fd_max = sd_kernel;
 /*
  * Bind the kernel's address to the socket.
@@ -205,10 +209,11 @@
 /*
  * Catch SIGTERM and SIGINT and kill all attached processes.
  */
-	if (_lam_signal(SIGTERM, kio_shutdown) == SIG_ERR)
+        shutdown_request = 0;
+        if (_lam_signal(SIGTERM, handle_sigend) == SIG_ERR)
 	  lampanic("lamd kernel: problem with internal call _lam_signal() (2)");
 
-	if (_lam_signal(SIGINT, kio_shutdown) == SIG_ERR)
+       if (_lam_signal(SIGINT, handle_sigend) == SIG_ERR)
 	  lampanic("lamd kernel: problem with internal call _lam_signal() (3)");
 	
 	FD_ZERO(&allfds);
@@ -262,13 +267,24 @@
 		    }
 		}
 
+		if (shutdown_request) {
+		  request.kq_req = KQSHUTDOWN;
+		  return(&request);
+		}
+		/* N.B. A signal arriving between the check above and select()
+		 * goes unnoticed until the select() timeout expires and we re-check */
 		while (((nfd_ready = select(fd_max + 1, &readfds,
 			(fd_set *) 0, (fd_set *) &exceptfds, pto)) < 0) &&
-			(errno == EINTR)) {
+			(errno == EINTR && !shutdown_request)){
 		  memcpy((char *) &readfds, (char *) &allfds, sizeof(fd_set));
 		  FD_SET(sd_kernel, &readfds);
 		  memcpy((char *) &exceptfds, (char *) &readfds, sizeof(fd_set));
 		}
+		if (shutdown_request) {
+		  request.kq_req = KQSHUTDOWN;
+		  return(&request);
+		}	
+
 		if (nfd_ready < 0) 
 		  lampanic("lamd kernel: problem with select() (1)");
 /*
@@ -599,18 +615,22 @@
 	return(fd_ready);
 }
 
+static void handle_sigend(void)
+{
+  shutdown_request = 1;
+}
+
 /*
  *	kio_shutdown
  *
- *	Function:	- cleanup and exit
+ *	Function:       - cleanup of IO system
  */
-static void
+void
 kio_shutdown()
 {
 	kkillall();
 	shutdown(sd_kernel, 2);
 	kio_cleanup();
-	exit(0);
 }
 
 /*
diff -ru unpatched/otb/sys/kernel/kouter.c patched/otb/sys/kernel/kouter.c
--- unpatched/otb/sys/kernel/kouter.c	2003-01-06 11:21:26.000000000 -0700
+++ patched/otb/sys/kernel/kouter.c	2003-01-06 11:17:06.000000000 -0700
@@ -31,6 +31,7 @@
 
 #include <debug.h>
 #include <kreq.h>
+#include <preq.h>
 #include <net.h>
 #include <terror.h>
 #include <typical.h>
@@ -49,6 +50,7 @@
 extern void		kboot();
 extern void		kio_close();
 extern void		kio_init();
+extern void		kio_shutdown();
 extern void		kio_reply();
 extern void		kio_send();
 extern void		kio_transfer();
@@ -61,6 +63,7 @@
 /*
  * local functions
  */
+static void kqshutdown(void);
 static void kqattach(struct kproc *pclient, struct kreq *pkq);
 static void kqdetach(struct kreq *pkq);
 static void kqsurrender(struct kproc *pclient, struct kreq *pkq);
@@ -216,7 +219,9 @@
 /*
  * Service special or non-client requests.
  */
-	    if (pkq->kq_req == KQDETACH) {
+	    if (pkq->kq_req == KQSHUTDOWN) {
+		kqshutdown();
+	    } else if (pkq->kq_req == KQDETACH) {
 		kqdetach(pkq);
 	    } else if ((pkq->kq_req == KQATTACH) && (pkq->kq_index == -1)) {
 		kqattach((struct kproc *) 0, pkq);
@@ -241,6 +246,31 @@
 }
 
 /*
+ *     kqshutdown
+ *
+ *     Function:       - cleans up state and exits, in response to a
+ *                        fatal signal
+ */
+static void kqshutdown(void)
+{
+  struct kproc *p;
+
+  lamlog("kouter: shutting down");
+
+  /* Kill any active processes */
+  for (p = pready; p; p = p->kp_next) {
+    knuke(p);
+  }
+
+  /* Free shared memory etc.; session directory removal is disabled below */
+  lam_cleanup_objects();
+  /* lam_rmsocknamedir(); */
+
+  kio_shutdown();
+  exit(0);
+}
+
+/*
  *	kqattach
  *
  *	Function:	- attaches a new client process
diff -ru unpatched/share/boot/lambootagent.c patched/share/boot/lambootagent.c
--- unpatched/share/boot/lambootagent.c	2003-01-06 11:20:49.000000000 -0700
+++ patched/share/boot/lambootagent.c	2003-01-06 11:18:50.000000000 -0700
@@ -55,8 +55,6 @@
 int
 lambootagent(struct lamnode *lamnet, int nlamnet, int *nboot, int *nrun)
 {
-	int		agent_port;	/* port number for replies */
-	int		agent_sd;	/* socket for replies */
 	int		boot_sd;	/* connection to new node */
 	int		cmdc;		/* command vector count */
 	int		dlport;
@@ -67,6 +65,18 @@
 	char		**cmdv;		/* command vector */
 	char		*batchid;	/* batch job ID */
 	unsigned char	*p;
+	char		sep = ' ';
+#if LAM_WITH_MPIEXEC
+	int		agent_port[nlamnet];    /* port number for replies */
+	int		agent_sd[nlamnet];      /* socket for replies */
+	char		tmpnam[80];     /* mpiexec config file name */
+	int		tmpfd;
+	FILE		*fp;
+	pid_t		childpid;
+#else
+	int		agent_port;     /* port number for replies */
+	int		agent_sd;       /* socket for replies */
+#endif
 
 	*nboot = 0;
 	*nrun = 0;
@@ -81,6 +91,48 @@
 	fl_verbose = opt_taken('v');
 	fl_fast = opt_taken('b');
 	fl_close = opt_taken('s');
+
+#ifdef LAM_WITH_MPIEXEC
+/*
+ * Create mpiexec config file.
+ */
+       strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+       tmpfd = mkstemp(tmpnam);
+       if (tmpfd == -1) {
+               perror("Create temporary file failed");
+               exit(1);
+       }
+       fp = fdopen(tmpfd, "w");
+       if (!fp) {
+               perror("Open of temp file failed");
+               exit(1);
+       }
+       if (fl_verbose) {
+               printf("Using mpiexec config file %s\n", tmpnam);
+       }
+#endif
+
+#ifdef LAM_WITH_MPIEXEC
+/*
+ * Allocate server sockets and ports.
+ */
+       for (i = 0; i < nlamnet; i++) {
+         agent_port[i] = 0;
+         agent_sd[i] = sfh_sock_open_srv_inet_stm(&agent_port[i]);
+         if (agent_sd[i] < 0) {
+           show_help("boot", "socket-fail", NULL);
+           return(LAMERROR);
+         }
+/*
+ * Make the sockets close on exec.
+ */
+         if (fcntl(agent_sd[i], F_SETFD, 1) == -1) {
+           show_help(NULL, "system-call-fail", "fcntl (set close-on-exec)", 
+                     NULL);
+           return(LAMERROR);
+         }
+       }
+#else
 /*
  * Allocate a server socket and port.
  */
@@ -98,6 +150,8 @@
 		    NULL);
 	  return(LAMERROR);
 	}
+#endif /* LAM_WITH_MPIEXEC */
+
 /*
  * Find the local node.
  */
@@ -176,19 +230,27 @@
 /*
  * Override the $inet_topo variable.
  */
+#if LAM_WITH_MPIEXEC
+		/* The -I argument must be double-quoted in mpiexec's config file. */
+		sep = '"';
+#endif
 		p = (unsigned char *) &lamnet[local].lnd_addr.sin_addr;
 		argvadd(&cmdc, &cmdv, "-I");
 		sprintf(buf, "%c%s-H %u.%u.%u.%u -P %d -n %d -o %d %s %s%c",
-			i == local ? ' ' : '"',
+			sep,
 			opt_taken('x') ? "-x " : "",
 			(unsigned) p[0], (unsigned) p[1],
 			(unsigned) p[2], (unsigned) p[3],
+#if LAM_WITH_MPIEXEC
+			agent_port[i],
+#else
 			agent_port,
+#endif
 			i,
 			origin,
 			(strlen(batchid) == 0 ? " " : "-b"),
 			(strlen(batchid) == 0 ? " " : batchid),
-			i == local ? ' ' : '"');
+			sep);
 		argvadd(&cmdc, &cmdv, buf);
 
 		VERBOSE("Executing %s on n%d (%s - %d CPU%s)...\n", 
@@ -197,7 +259,7 @@
 			(lamnet[i].lnd_ncpus > 1) ? "s" : "");
 
 		(*nboot)++;
-
+#if !LAM_WITH_MPIEXEC
 		if (i == local) {
 		        if (fl_debug) {
 			  int j;
@@ -252,13 +314,84 @@
  */
 		if (close(boot_sd)) return(LAMERROR);
 		(*nrun)++;
+#else   /* LAM_WITH_MPIEXEC */
+	/* Write out the Mpiexec configuration for this host */
+	fputs(lamnet[i].lnd_hname, fp);
+	fputs(" :", fp);
+	for (j = 0; j < cmdc; j++) {
+	  fputc(' ', fp);
+	  fputs(cmdv[j], fp);
+	}
+	fputc('\n', fp);
+	argvfree(cmdv);
+#endif /* LAM_WITH_MPIEXEC */
 	}
 
+#if LAM_WITH_MPIEXEC
+/*
+ * Fire off mpiexec to start the hboot processes.
+ */
+       fclose(fp);
+       cmdc = 0;
+       cmdv = 0;
+       argvadd(&cmdc, &cmdv, LAM_MPIEXEC);
+       argvadd(&cmdc, &cmdv, "-comm=none");
+       argvadd(&cmdc, &cmdv, "-config");
+       argvadd(&cmdc, &cmdv, tmpnam);
+       (void) fflush(stdout);
+       (void) fflush(stderr);
+       childpid = fork();
+       if (childpid == -1) {
+               lamfail("lambootagent fork failed");
+       } else if (childpid == 0) {
+               fclose(stdin);
+               execv(cmdv[0], cmdv);
+               lamfail("execv failed");
+       }
+       argvfree(cmdv);
+       for (i = 0; i < nlamnet; ++i) {
+/*
+ * Skip nodes that are invalid or already booted.
+ */
+               if ((lamnet[i].lnd_nodeid == NOTNODEID) ||
+                               !(lamnet[i].lnd_type & NT_BOOT)) continue;
+/*
+ * Accept a connection from the new host.
+ */
+               boot_sd = sfh_sock_accept_tmout(agent_sd[i], LAM_TO_BOOT);
+               if (boot_sd < 0) return(LAMERROR);
+/*
+ * Read the new host port numbers.
+ */
+               if (readcltcoord(boot_sd, &lamnet[i].lnd_bootport,
+                                &dlport)) return(LAMERROR);
+               lamnet[i].lnd_addr.sin_port = htons((unsigned short) dlport);
+/*
+ * Close the host connection.
+ */
+               if (close(boot_sd)) return(LAMERROR);
+               (*nrun)++;
+       }
+
+       if (fl_verbose) {
+               printf("all nodes connected\n");
+       }
+/*
+ * mpiexec must have fired up by now, so we can remove the config file
+ */
+       unlink(tmpnam);
+
+       for (i = 0; i < nlamnet; ++i) {
+               if (close(agent_sd[i])) return(LAMERROR);
+       }
+
+#else
 	if (close(agent_sd)) return(LAMERROR);
+#endif /* LAM_WITH_MPIEXEC */
 
 	if (fl_verbose) {
-		nodespin_init("topology");
-	}
+                nodespin_init("topology");
+        }
 /*
  * Send link information to all nodes that have been booted.
  */
diff -ru unpatched/share/include/kreq.h patched/share/include/kreq.h
--- unpatched/share/include/kreq.h	2003-01-06 11:20:43.000000000 -0700
+++ patched/share/include/kreq.h	2003-01-06 11:18:25.000000000 -0700
@@ -86,6 +86,7 @@
  */
 #define KQDETACH	7		/* end kernel session */
 #define KQDUMP		8		/* print process descriptors */
+#define KQSHUTDOWN	9		/* shutdown on signal */
 
 /*
  * process states
diff -ru unpatched/share/kreq/kcreate.c patched/share/kreq/kcreate.c
--- unpatched/share/kreq/kcreate.c	2003-01-06 11:20:53.000000000 -0700
+++ patched/share/kreq/kcreate.c	2003-01-06 11:19:01.000000000 -0700
@@ -102,7 +102,14 @@
 	sigaction(SIGCHLD, &act, 0);
 	sigaction(SIGPIPE, &act, 0);
 
+#if !LAM_WITH_MPIEXEC
+/*
+ * We do NOT call setsid when using PBS+Mpiexec; this way PBS can keep track
+ * of the spawned process.
+ */
 	(void) setsid();
+#endif
+
 /*
  * Redirect the stdio fd's
  */
diff -ru unpatched/tools/hboot/hboot.c patched/tools/hboot/hboot.c
--- unpatched/tools/hboot/hboot.c	2003-01-06 11:21:00.000000000 -0700
+++ patched/tools/hboot/hboot.c	2003-01-06 11:19:10.000000000 -0700
@@ -25,6 +25,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <unistd.h>
 
 #include <all_list.h>
@@ -81,6 +82,9 @@
 	char		buf[32];	/* formatting buffer */
 	char		*full;		/* full pathname */
 	char		*tail;		/* tail of full pathname */
+#if LAM_WITH_MPIEXEC
+	int             status;
+#endif
 
 	/* Ensure that we are not root */
 
@@ -227,7 +231,7 @@
 	  exit(errno);
 	}
 
-#if 1
+#if !LAM_WITH_MPIEXEC
 	/* Comment this out to make the TM extensions to PBS work
            nicely -- everything will be in one session, so TM can kill
            it when it dies. */
@@ -255,6 +259,8 @@
 	for (p = al_top(list_psc); p; p = al_next(list_psc, p)) {
 		DBUG("hboot: fork %s\n", p->psc_argv[0]);
 
+		fflush(stdout);
+		fflush(stderr);
 		if ((pid = fork()) < 0) {
 		  show_help(NULL, "system-call-fail", "fork", NULL);
 		  exit(errno);
@@ -311,6 +317,30 @@
 			sleep((unsigned int) p->psc_delay);
 		}
 	}
+#if LAM_WITH_MPIEXEC
+/*
+ * When using Mpiexec+PBS, we want the mpiexec spawned by lamboot (the one
+ * that spawned hboot) to last for the duration of the PBS job. Thus, we
+ * don't want to exit hboot until all processes have exited, and so we
+ * wait for them here.
+ */
+       do {
+         fflush(stdout);
+         fflush(stderr);
+         do {
+           pid = wait(&status);
+         } while (pid == -1 && errno == EINTR);
+         if (pid > 0 && fl_debug) {
+           printf("Child pid %d exited ", pid);
+           if (WIFEXITED(status)) {
+             printf("with status %d", WEXITSTATUS(status));
+           } else if (WIFSIGNALED(status)) {
+             printf("on signal %d", WTERMSIG(status));
+           }
+           printf("\n");
+         }
+       } while (pid > 0);
+#endif
 
 	return(0);
 }
diff -ru unpatched/tools/lamboot/lamboot.c patched/tools/lamboot/lamboot.c
--- unpatched/tools/lamboot/lamboot.c	2003-01-06 11:21:10.000000000 -0700
+++ patched/tools/lamboot/lamboot.c	2003-01-06 11:19:24.000000000 -0700
@@ -253,6 +253,9 @@
  */
 	if (cmdc == 2) {
 		fname = cmdv[1];
+#if LAM_WITH_MPIEXEC
+	} else if ((fname = getenv("PBS_NODEFILE"))) {
+#endif
 	} else if ((fname = getenv("LAMBHOST"))) {
 	} else if ((fname = getenv("TROLLIUSBHOST"))) {
 	} else {
diff -ru unpatched/tools/wipe/wipe.c patched/tools/wipe/wipe.c
--- unpatched/tools/wipe/wipe.c	2003-01-06 11:21:13.000000000 -0700
+++ patched/tools/wipe/wipe.c	2003-01-06 11:19:29.000000000 -0700
@@ -68,6 +68,10 @@
 	int		badhost;	/* bad host index */
 	int		r, j, success = 1;
 	struct lamnode	*lamnet;	/* network description array */
+#if LAM_WITH_MPIEXEC
+	char            tmpnam[80];
+	int             tmpfd;
+#endif
 
 	/* Ensure that we are not root */
 
@@ -174,6 +178,25 @@
 	} else {
 	  DBUG("wipe: killing LAM from a non-member machine\n");
 	}
+#if LAM_WITH_MPIEXEC
+/*
+ * Create mpiexec config file.
+ */
+	strcpy(tmpnam, "/tmp/lam-mpiexec.cfg-XXXXXX");
+	tmpfd = mkstemp(tmpnam);
+	if (tmpfd == -1) {
+		perror("Create temporary file failed");
+		exit(1);
+	}
+	fp = fdopen(tmpfd, "w");
+	if (!fp) {
+		perror("Open of temp file failed");
+		exit(1);
+	}if (fl_verbose) {
+		printf("Using mpiexec config file %s\n", tmpnam);
+	}
+#endif
+
 /*
  * Build the tkill command.
  */
@@ -199,6 +222,46 @@
 	  argvadd(&cmdn, &cmdv, "-b");
 	  argvadd(&cmdn, &cmdv, batchid);
 	}
+
+#if LAM_WITH_MPIEXEC
+/* Write Mpiexec config file */
+       for (i = 0; (i < nlamnet) && limit; ++i) {
+               if (limit > 0) --limit;
+               fputs(lamnet[i].lnd_hname, fp);
+               fputc(' ', fp);
+       }
+       fputc(':', fp);
+       for (i = 0; i < cmdn; i++) {
+               fputc(' ', fp);
+               fputs(cmdv[i], fp);
+       }
+       fputc('\n', fp);
+       argvfree(cmdv);
+       fclose(fp);
+
+/* Run mpiexec */
+       cmdn = 0;
+       cmdv = 0;
+       argvadd(&cmdn, &cmdv, LAM_MPIEXEC);
+        argvadd(&cmdn, &cmdv, "-comm=none");
+        argvadd(&cmdn, &cmdv, "-config");
+        argvadd(&cmdn, &cmdv, tmpnam);
+
+        r = _lam_few(cmdv);
+        unlink(tmpnam);
+
+        if (r) {
+                errno = r;
+                if (errno != EUNKNOWN) {
+                        terror("wipe");
+                } else
+                  show_help(NULL, "unknown", NULL);
+               global_ret = r;
+               success = 0;
+        }
+
+#else
+
 /*
  * Loop over all host nodes.
  */
@@ -250,6 +313,7 @@
 			success = 0;
 		}
 	}
+#endif /* LAM_WITH_MPIEXEC */
 
 	if (success) {
 	  DBUG("wipe completed successfully\n");
@@ -290,6 +354,9 @@
  */
 	if (cmdc == 2) {
 		bhost = cmdv[1];
+#if LAM_WITH_MPIEXEC
+	} else if ((bhost = getenv("PBS_NODEFILE"))) {
+#endif
 	} else if ((bhost = getenv("LAMBHOST"))) {
 	} else if ((bhost = getenv("TROLLIUSBHOST"))) {
 	} else {


More information about the mpiexec mailing list