SIGTSTP propagation for 0.80 ?

David Golden dgolden at cp.dias.ie
Thu Sep 15 13:36:32 EDT 2005


Okay, the attached patch is my first attempt.  Use at your own risk, no warranty, etc.

I haven't implemented any "pause mpiexec and stdio only" variant as yet,
nor made anything command-line and/or ./configure configurable.

I've done some _very_ cursory testing with mpich2-1.0.2p1 with 2
concurrent mpiexecs, and it doesn't immediately fall over, so that's
nice.  

The next major trouble is that it *doesn't* work with Torque right now - I think
it might just be that the time between the TSTP and the STOP that Torque sends is
too short, as "qsig -s TSTP <jobid>" does appear to do the right thing even
though "qsig -s suspend <jobid>" doesn't.

I wasn't too sure about the usage of the word "kill" in the sources - I decided "kill"
should mean the traditional, confusing "signal" rather than "signal with SIGKILL",
to avoid needless duplication of functionality, but "have_killed" and stdio.c
are particularly inconsistent with that (for the stdio child there are kill_, stop_
and cont_ functions).


-------------- next part --------------
diff -Nar -U 8 mpiexec-0.80/concurrent.c mpiexec-0.80.tstp1/concurrent.c
--- mpiexec-0.80/concurrent.c	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/concurrent.c	2005-09-15 13:30:55.015298832 +0100
@@ -630,17 +630,17 @@
     ret = read_client(n, &tasknum, sizeof(tasknum));
     if (ret) return;
     ret = read_client(n, &signum, sizeof(signum));
     if (ret) return;
 
     evt = -1;
     tp = tid_find(n, tasknum);
     if (tp)
-	evt = kill_tid(tp);
+	evt = kill_tid(tp, signum);
 
     /* return evt */
     ret = write_client(n, &cmd, sizeof(cmd));
     if (ret == 0)
 	write_client(n, &evt, sizeof(evt));
 }
 
 
@@ -1171,17 +1171,17 @@
 
     debug(2, "%s: client %d", __func__, n);
     (void) close(clients[n].fd);
     clients[n].fd = -1;
 
     /* kill tids */
     list_for_each_entry_safe(tp, tpnext, tids, list) {
 	if (tp->client == n)
-	    kill_tid(tp);  /* might tid_del tp */
+	    kill_tid(tp, SIGKILL);  /* might tid_del tp */
     }
     free_client_if_zero(n);
 }
 
 /*
  * The connection to this client is closed but he still has events and
  * tids outstanding.  Clean up.  Don't delete the event.
  */
@@ -1194,40 +1194,41 @@
     debug(2, "%s: evt %d client %d task %d type %s", __func__, ep->evt,
       n, ep->task, evt_type_string(ep->type));
     switch (ep->type) {
 	case EVT_OBIT:
         case EVT_KILL: {
 	    /* if this was the last event, release the tid; they may come
 	     * in either order. */
 	    evts_t *eq;
-	    evt_type_t other;
 	    int found;
 
 	    tp = tid_find(n, ep->task);
 	    if (!tp)
 		error("%s: lost tid for client %d task %d", __func__, n,
 		  ep->task);
-	    other = (ep->type == EVT_OBIT) ? EVT_KILL : EVT_OBIT;
 	    found = 0;
+            /* check for any other events */
 	    list_for_each_entry(eq, evts, list) {
-		if (eq->client == ep->client
-		 && eq->task == ep->task
-		 && eq->type == other) {
+	      if (ep != eq) {
+		if (eq->client == ep->client && eq->task == ep->task) {
+		  if (eq->type == EVT_OBIT || eq->type == EVT_KILL ) {
 		    found = 1;
 		    break;
+		  }
 		}
+	      }
 	    }
 	    if (!found)
 		tid_del(tp);
 	    break;
 	}
 	case EVT_START:
 	    /* submit a kill and obit on this new task */
 	    tp = tid_add(c->pending_tids[ep->task], n, ep->task);
-	    kill_tid(tp);
+	    kill_tid(tp, SIGKILL);
 	    break;
 	default:
 	    error("%s: unknown event type %d", __func__, ep->type);
     }
     free_client_if_zero(n);
 }
 
diff -Nar -U 8 mpiexec-0.80/mpiexec.c mpiexec-0.80.tstp1/mpiexec.c
--- mpiexec-0.80/mpiexec.c	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/mpiexec.c	2005-09-15 17:44:12.004078328 +0100
@@ -153,36 +153,75 @@
     if (sig == SIGKILL) return "SIGKILL";
 #   endif
 #   if defined(SIGSEGV)
     if (sig == SIGSEGV) return "SIGSEGV";
 #   endif
 #   if defined(SIGTERM)
     if (sig == SIGTERM) return "SIGTERM";
 #   endif
+#   if defined(SIGTSTP)
+    if (sig == SIGTSTP) return "SIGTSTP";
+#   endif
+#   if defined(SIGCONT)
+    if (sig == SIGCONT) return "SIGCONT";
+#   endif
     return "unknown";
 #endif
 }
 
-static int killall_sig = 0;
+
 static jmp_buf jmp_env;
 
 /*
- * Signal handling.
+ * Signal handling for any propagated signals
+ *
+ * FIXME: lethal signals should be split out into a list,
+ *        and probably made a command-line  option with
+ *        configurable defaults.
+ * FIXME: TSTP propagating STOP vs. TSTP propagating TSTP then STOP 
+ *        vs. TSTP propagating TSTP should probably be a command-line
+ *        option with configurable defaults. 
  */
 void
 killall(int sig)
 {
     static int killall_count = 0;
-
+#if defined(SIGTSTP)
+    const int tstp_siglist[] = { SIGTSTP };
+#endif
     debug(1, "%s: caught signal %d (%s)", __func__, sig,
       parse_signal_number(sig));
-    ++killall_count;
-    killall_sig = sig;
-    longjmp(jmp_env, killall_count);
+    switch (sig) {
+    default:
+      kill_tasks(sig);
+      break;
+#if defined(SIGTSTP)
+    case SIGTSTP:
+      handle_signals(tstp_siglist,list_count(tstp_siglist), SIG_IGN); /* qualms about concurrent mpiexecs */ 
+      kill_tasks(SIGSTOP);
+      stop_stdio();
+      if (!concurrent_master) raise(SIGSTOP); /* uhoh. master has to stay awake to propagate for clients? */
+      break;
+#endif
+#if defined(SIGCONT)
+    case SIGCONT:
+      cont_stdio();
+      kill_tasks(SIGCONT);
+      handle_signals(tstp_siglist,list_count(tstp_siglist), killall); 
+      break;
+#endif
+    case SIGHUP:
+    case SIGINT:
+    case SIGTERM:
+      kill_tasks(SIGKILL);
+      ++killall_count;
+      longjmp(jmp_env, killall_count);
+      break;
+    }
 }
 
 /*
  * Enable one signal handler for a list of signals.  Do not defer
  * signal reception while handling these, to let the impatient user
  * interrupt again to really exit.
  */
 void
@@ -199,16 +238,34 @@
     sigemptyset(&act.sa_mask);
     act.sa_flags = SA_NODEFER;
     act.sa_handler = handler;
     for (i=0; i<num; i++)
 	sigaction(list[i], &act, 0);
 }
 
 /*
+ * Setup signals that should be propagated to all tasks if
+ * caught by the master.
+ * FIXME: Might be nice if the signal set was a command-line option with 
+ *        configurable defaults.
+ */
+void
+setup_propagated_signals() 
+{
+#if defined(SIGTSTP)
+  const int propagated_siglist[] = { SIGHUP, SIGINT, SIGTERM, SIGTSTP, SIGCONT };
+#else
+  const int propagated_siglist[] = { SIGHUP, SIGINT, SIGTERM };
+#endif
+  handle_signals(propagated_siglist, list_count(propagated_siglist), killall);
+}
+
+
+/*
  * Just print a little version string.
  */
 static void
 version(FILE *fp)
 {
     if (!strcmp(CONFIGURE_OPTIONS, ""))
 	fprintf(fp, "Version %s, no configure options\n", VERSION);
     else
@@ -649,17 +706,17 @@
 	  getpid(), getpid());
 	system(s);
 	sleep(1);  /* wait for attach */
     }
 #endif
 
     if (cl_args->server_only) {
 	cm_permit_new_clients(1);
-	handle_signals(0, 0, killall);
+        setup_propagated_signals(); /* FIXME: should server-only propagated sigs be different? */
 	numtasks = 0;
 	numspawned = 0;
 	goto server_only;
     }
 
     /*
      * Now look at the command-line constraints.
      */
@@ -689,16 +746,17 @@
 	    nodes[tasks[i].node].numcpu -= tasks[i].num_copies;
 	cm_permit_new_clients(1);
     } else
 	concurrent_node_alloc();
 
     /*
      * Run the tasks and wait for them to finish.  Use of setjmp is
      * to avoid complex shutdown activity in the signal handler.
+     * Any "lethal" signals (see killall()) will jump back to here.
      */
   server_only:
     jmp_return = setjmp(jmp_env);
     switch (jmp_return) {
         case 0:
 	    if (cl_args->server_only) {
 		cm_serve_clients();
 		concurrent_exit();   /* not reached, wait for ctrl-c path */
@@ -707,17 +765,16 @@
 		distribute_executable();
 		start_tasks();
 		wait_tasks();
 	    }
 	    break;
 
 	/* reached by setjmp return likely */
 	case 1:
-	    kill_tasks(killall_sig);
 	    if (concurrent_master) {
 		cm_kill_clients();
 		cm_permit_new_clients(0);  /* no new connections */
 	    }
 	    wait_tasks();
 	    break;
 
 	/* second ctrl-c, don't try to communicate with anything, just die */
diff -Nar -U 8 mpiexec-0.80/mpiexec.h mpiexec-0.80.tstp1/mpiexec.h
--- mpiexec-0.80/mpiexec.h	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/mpiexec.h	2005-09-15 17:17:26.297182880 +0100
@@ -172,16 +172,17 @@
 /*
  * Prototypes
  */
 /* mpiexec.c */
 const char *resolve_exe(const char *exe);
 const char *parse_signal_number(int sig);
 void killall(int sig);
 void handle_signals(const int *list, int num, void (*handler)(int sig));
+void setup_propagated_signals(void);
 int stat_exe(const char *exe, int complain);
 
 /* get_hosts.c */
 void get_hosts(void);
 void constrain_nodes(void);
 void reconnect_to_mom(void);
 
 /* start_tasks.c */
@@ -189,17 +190,17 @@
 int do_tm_poll(tm_event_t *evt, const char *caller, int block);
 
 /* task.c */
 tids_t *tid_add(int tid, int client, int task);
 tids_t *tid_find(int client, int task);
 void tid_del(tids_t *tp);
 void tid_dump(void);
 const char *node_name_from_nid(tm_node_id nid);
-tm_event_t kill_tid(tids_t *tp);
+tm_event_t kill_tid(tids_t *tp, int signum);
 void kill_tasks(int signum);
 void wait_tasks(void);
 
 /* event.c */
 void evt_add(int tid, int client, int task, evt_type_t type);
 evts_t *evt_lookup(int evt);
 void evt_del(evts_t *ep);
 const char *evt_type_string(evt_type_t type);
@@ -216,16 +217,18 @@
 const char *config_get_unique_executable(void);
 void config_set_unique_executable(const char *s);
 
 /* stdio.c */
 void stdio_fork(int expected_in[3], int abort_fd_in[2], int pmi_fd);
 int stdio_port(int n);
 void stdio_notice_streams(void);
 void kill_stdio(void);
+void stop_stdio(void);
+void cont_stdio(void);
 void try_kill_stdio(void);
 void kill_stdio_abort_fd(int abort_fd_index);
 void poll_set(int fd, fd_set *fds);
 void poll_del(int fd, fd_set *fds);
 void maybe_exit_stdio(void);  /* for use by device-specific code */
 
 /* concurrent.c */
 void concurrent_init(void);
diff -Nar -U 8 mpiexec-0.80/start_tasks.c mpiexec-0.80.tstp1/start_tasks.c
--- mpiexec-0.80/start_tasks.c	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/start_tasks.c	2005-09-15 13:06:33.250520856 +0100
@@ -380,17 +380,17 @@
     stdio_fork(conns, gmpi_fd, pmi_fd);
     if (pmi_fd >= 0)
 	close(pmi_fd);  /* child has it now */
 
     /*
      * Start signal handling _after_ stdio child is up.
      */
     numspawned = 0;
-    handle_signals(0, 0, killall);
+    setup_propagated_signals();
 
     /*
      * environment variables common to all tasks
      */
     env_init();
 
     /* override user env with these */
     if (cl_args->comm == COMM_MPICH_GM) {
diff -Nar -U 8 mpiexec-0.80/stdio.c mpiexec-0.80.tstp1/stdio.c
--- mpiexec-0.80/stdio.c	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/stdio.c	2005-09-15 17:15:40.445274816 +0100
@@ -1039,16 +1039,30 @@
 		  WTERMSIG(stat), parse_signal_number(WTERMSIG(stat)));
 	} else
 	    error("%s: wait stat 0x%x, ifsignaled %d, termsig %d,"
 	      " ifstopped %d, stopsig %d", __func__, stat, WIFSIGNALED(stat),
 	      WTERMSIG(stat), WIFSTOPPED(stat), WSTOPSIG(stat));
     }
 }
 
+/* called by parent to stop stdio child */
+void
+stop_stdio(void)
+{
+    kill(pid, SIGSTOP);
+}
+
+void
+cont_stdio(void)
+{
+   kill(pid, SIGCONT);
+}
+
+
 /*
  * Try to kill stdio, but fail quietly if unsuccessful.
  */
 void
 try_kill_stdio(void)
 {
     if (!pid) return;
     kill(pid, SIGTERM);
diff -Nar -U 8 mpiexec-0.80/task.c mpiexec-0.80.tstp1/task.c
--- mpiexec-0.80/task.c	2005-07-15 15:03:47.000000000 +0100
+++ mpiexec-0.80.tstp1/task.c	2005-09-15 15:53:44.831777664 +0100
@@ -89,23 +89,23 @@
     return nodes[i].name;
 }
 
 /*
  * Get rid of this task, but keep around any events it might generate.
  * Return the new event number that will report when the kill is completed.
  */
 tm_event_t
-kill_tid(tids_t *tp)
+kill_tid(tids_t *tp, int signum)
 {
     tm_event_t evt = -1;
     int ret;
 
     debug(2, "%s: kill client %d task %d", __func__, tp->client, tp->task);
-    ret = tm_kill(tp->tid, SIGKILL, &evt);
+    ret = tm_kill(tp->tid, signum, &evt);
     if (ret == TM_SUCCESS)
 	evt_add(evt, tp->client, tp->task, EVT_KILL);
     else if (ret == TM_ENOTFOUND) {
 	debug(2, "%s: delete already dead client %d task %d",
 	  __func__, tp->client, tp->task);
 	tid_del(tp);
     } else
 	error_tm(ret, "%s: tm_kill client %d task %d", __func__,
@@ -120,38 +120,39 @@
 /*
  * Use tm to send a signal to all tasks.
  */
 void
 kill_tasks(int signum)
 {
     int i;
 
-    debug(1, "%s: killing all tasks", __func__);
+    debug(1, "%s: killing all tasks with sig %d (%s)", __func__, 
+      signum, parse_signal_number(signum) );
     for (i=0; i<numtasks; i++) {
 	if (tasks[i].done) continue;
 	if (concurrent_master) {
 	    tm_event_t evt;
 	    int ret;
-	    debug(2, "%s: kill my task %d on %s", __func__,
-	      i, nodes[tasks[i].node].name);
-	    ret = tm_kill(tasks[i].tid, SIGKILL, &evt);
+	    debug(2, "%s: kill my task %d on %s with sig %d (%s)", __func__,
+	      i, nodes[tasks[i].node].name, signum, parse_signal_number(signum) );
+	    ret = tm_kill(tasks[i].tid, signum, &evt);
 	    if (ret == TM_SUCCESS)
 		evt_add(evt, -1, i, EVT_KILL);
 	    else if (ret == TM_ENOTFOUND) {
 		debug(2, "%s: tried to kill my already dead task %d",
 		  __func__, i);
 		/* but no tid to delete, and don't mark done until obit */
 	    } else
 		error_tm(ret, "%s: tm_kill my task %d", __func__, i);
 	} else {
 	    concurrent_request_kill(i, signum);
 	}
     }
-    have_killed = 1;
+    if (signum == SIGKILL) have_killed = 1;
 }
 
 /*
  * Wait for tasks to finish, if any exit with non-zero status, perhaps
  * kill the rest.  Also, if concurrent_master, pay attention to other
  * mpiexec requests.
  */
 void


More information about the mpiexec mailing list