Propagation of SIGTSTP?

Sébastien Georget Sebastien.Georget at sophia.inria.fr
Tue Apr 12 07:52:18 EDT 2005


Pete Wyckoff wrote:
> We had a bit of followup later:
> 
>     http://email.osc.edu/pipermail/mpiexec/2004/000329.html
> 
> but I never did anything because I wasn't sure what the right
> behavior would be.  And there seemed to be some question about
> modifications to qsig too.
> 
> If you and others can figure out all the details and handle
> the PBS side of things, I'll certainly help to get mpiexec
> to play nicely with SIGSTOP and SIGTSTP.

Hi,

Here are the patches we are testing to suspend/resume MPI jobs submitted
with mpiexec through Torque.

Comments are welcome.

Sébastien
-- 
Sébastien Georget
INRIA Sophia-Antipolis, Service DREAM, B.P. 93
06902 Sophia-Antipolis Cedex, FRANCE
E-mail : sebastien.georget at sophia.inria.fr
-------------- next part --------------
diff -ru mpiexec-0.78/mpiexec.c mpiexec-0.78.new/mpiexec.c
--- mpiexec-0.78/mpiexec.c	Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/mpiexec.c	Tue Apr 12 10:43:36 2005
@@ -8,6 +8,9 @@
  *
  * Copyright (C) 2000-5 Pete Wyckoff <pw at osc.edu>
  *
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ *                    et Automatique <sebastien.georget at sophia.inria.fr>
+ *
  * Distributed under the GNU Public License Version 2 or later (See LICENSE)
  */
 #define _GNU_SOURCE  /* hoping to get strsignal() from string.h */
@@ -176,6 +179,24 @@
 	  __func__, sig, parse_signal_number(sig));
     wait_tasks(sig, 0);
     exit(sig);
+}
+
+/*
+ * Signal handler for job suspend/resume: forward the request to all tasks.
+ * SIGCONT is passed on unchanged.  SIGTSTP is delivered to the tasks as
+ * SIGSTOP, which they cannot catch or ignore.  Use the symbolic names from
+ * <signal.h>; the numeric values differ between platforms.
+ */
+void
+notify_all(int sig)
+{
+    if (cl_args->verbose)
+	printf("%s: caught signal %d (%s), sending it to all tasks...\n",
+	  __func__, sig, parse_signal_number(sig));
+    if (sig == SIGCONT)
+	notify_tasks(SIGCONT);
+    else if (sig == SIGTSTP)
+	notify_tasks(SIGSTOP);
 }
 
 /*
diff -ru mpiexec-0.78/mpiexec.h mpiexec-0.78.new/mpiexec.h
--- mpiexec-0.78/mpiexec.h	Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/mpiexec.h	Mon Apr 11 16:56:57 2005
@@ -114,6 +114,8 @@
  */
 /* mpiexec.c */
 const char *resolve_exe(const char *exe);
+void notify_all(int sig);
+void notify_tasks(int signum);	/* implemented in wait_tasks.c */
 void killall(int sig);
 void handle_signals(const int *list, int num, void (*handler)(int sig));
 typedef int (*cull_compare_func)(task_cntrl_t *t1, task_cntrl_t *t2);
diff -ru mpiexec-0.78/start_tasks.c mpiexec-0.78.new/start_tasks.c
--- mpiexec-0.78/start_tasks.c	Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/start_tasks.c	Tue Apr 12 10:44:55 2005
@@ -5,6 +5,8 @@
  *
  * Copyright (C) 2000-3 Ohio Supercomputer Center.
  * Copyright (C) 2000-5 Pete Wyckoff <pw at osc.edu>
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ *                    et Automatique <sebastien.georget at sophia.inria.fr>
  *
  * Distributed under the GNU Public License Version 2 or later (See LICENSE)
  */
@@ -354,8 +356,10 @@
     {
 	const int siglist[] = { SIGHUP, SIGINT, SIGTERM };
 	const int alarm_list[] = { SIGALRM };
+	const int suspend_list[] = { SIGTSTP, SIGCONT };
 	handle_signals(siglist, list_count(siglist), killall);
 	handle_signals(alarm_list, list_count(alarm_list), kill_others_now);
+	handle_signals(suspend_list, list_count(suspend_list), notify_all);
     }
 
     /*
diff -ru mpiexec-0.78/wait_tasks.c mpiexec-0.78.new/wait_tasks.c
--- mpiexec-0.78/wait_tasks.c	Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/wait_tasks.c	Tue Apr 12 10:45:36 2005
@@ -5,6 +5,8 @@
  *
  * Copyright (C) 2000-3 Ohio Supercomputer Center.
  * Copyright (C) 2000-4 Pete Wyckoff <pw at osc.edu>
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ *                    et Automatique <sebastien.georget at sophia.inria.fr>
  *
  * Distributed under the GNU Public License Version 2 or later (See LICENSE)
  */
@@ -33,6 +35,28 @@
 	/* replace old event with the kill one */
 	tasks[i].evt_obit = tasks[i].evt;
 	tasks[i].evt = evt;
+    }
+}
+
+/*
+ * Deliver signum to every task that is not yet done, via tm_kill().
+ * The event handed back by tm_kill() is not recorded in the task table;
+ * a tm_kill() failure is reported through error_tm().
+ */
+void
+notify_tasks(int signum)
+{
+    int t;
+
+    for (t = 0; t < numtask; t++) {
+	tm_event_t ignored_evt;
+	int rc;
+
+	if (!tasks[t].done) {
+	    rc = tm_kill(tasks[t].tid, signum, &ignored_evt);
+	    if (rc != TM_SUCCESS)
+		error_tm(rc, "%s: tm_kill tid %d", __func__, tasks[t].tid);
+	}
     }
 }
 
-------------- next part --------------
diff -ru torque-1.2.0p2/src/resmom/mom_comm.c torque-1.2.0p2.new/src/resmom/mom_comm.c
--- torque-1.2.0p2/src/resmom/mom_comm.c	Mon Feb 28 18:17:55 2005
+++ torque-1.2.0p2.new/src/resmom/mom_comm.c	Tue Apr 12 11:02:56 2005
@@ -3,6 +3,8 @@
 * 
 * Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
 * All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+*                    et Automatique <sebastien.georget at sophia.inria.fr>
 * 
 * ---------------------------------------------------------------------------
 * For a license to use or redistribute the OpenPBS software under conditions
@@ -3929,7 +3931,9 @@
     goto err;
     }
 
-  if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) 
+  if ((pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) &&
+      (pjob->ji_qs.ji_substate != JOB_SUBSTATE_SUSPEND) &&
+      (command == TM_SIGNAL))
     {
     sprintf(log_buffer, "job %s not running", 
       jobid);
diff -ru torque-1.2.0p2/src/resmom/requests.c torque-1.2.0p2.new/src/resmom/requests.c
--- torque-1.2.0p2/src/resmom/requests.c	Tue Feb 22 21:59:14 2005
+++ torque-1.2.0p2.new/src/resmom/requests.c	Tue Apr 12 11:03:17 2005
@@ -3,6 +3,8 @@
 * 
 * Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
 * All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+*                    et Automatique <sebastien.georget at sophia.inria.fr>
 * 
 * ---------------------------------------------------------------------------
 * For a license to use or redistribute the OpenPBS software under conditions
@@ -1181,7 +1183,15 @@
   for (tp = (task *)GET_NEXT(pjob->ji_tasks);tp != NULL;tp = (task *)GET_NEXT(tp->ti_jobtask)) 
     {
     if (susp != 0)
+    {
+      /* For parallel jobs */
+      stat = kill_task(tp,SIGTSTP);
+
+      sleep(5);
+
+      /* For sequential jobs */
       stat = kill_task(tp,SIGSTOP);
+    }
     else
       stat = kill_task(tp,SIGCONT);
 
diff -ru torque-1.2.0p2/src/server/req_signal.c torque-1.2.0p2.new/src/server/req_signal.c
--- torque-1.2.0p2/src/server/req_signal.c	Tue Feb 22 21:59:23 2005
+++ torque-1.2.0p2.new/src/server/req_signal.c	Tue Apr 12 11:03:33 2005
@@ -3,6 +3,8 @@
 * 
 * Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
 * All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+*                    et Automatique <sebastien.georget at sophia.inria.fr>
 * 
 * ---------------------------------------------------------------------------
 * For a license to use or redistribute the OpenPBS software under conditions
@@ -227,9 +229,11 @@
       set_statechar(pjob);
 
       job_save(pjob,SAVEJOB_QUICK);
+      free_nodes(pjob);
       } 
     else if (strcmp(preq->rq_ind.rq_signal.rq_signame,SIG_RESUME) == 0) 
       {
+      set_old_nodes(pjob);
       pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;
 
       set_statechar(pjob);


More information about the mpiexec mailing list