Propagation of SIGTSTP?
Sébastien Georget
Sebastien.Georget at sophia.inria.fr
Tue Apr 12 07:52:18 EDT 2005
Pete Wyckoff wrote:
> We had a bit of followup later:
>
> http://email.osc.edu/pipermail/mpiexec/2004/000329.html
>
> but I never did anything because I wasn't sure what the right
> behavior would be. And there seemed to be some question about
> modifications to qsig too.
>
> If you and others can figure out all the details and handle
> the PBS side of things, I'll certainly help to get mpiexec
> to play nicely with SIGSTOP and SIGTSTP.
Hi,
Here are the patches we are testing to suspend/resume MPI jobs submitted
with mpiexec through Torque.
Comments are welcome.
Sébastien
--
Sébastien Georget
INRIA Sophia-Antipolis, Service DREAM, B.P. 93
06902 Sophia-Antipolis Cedex, FRANCE
E-mail : sebastien.georget at sophia.inria.fr
-------------- next part --------------
diff -ru mpiexec-0.78/mpiexec.c mpiexec-0.78.new/mpiexec.c
--- mpiexec-0.78/mpiexec.c Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/mpiexec.c Tue Apr 12 10:43:36 2005
@@ -8,6 +8,9 @@
*
* Copyright (C) 2000-5 Pete Wyckoff <pw at osc.edu>
*
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ * et Automatique <sebastien.georget at sophia.inria.fr>
+ *
* Distributed under the GNU Public License Version 2 or later (See LICENSE)
*/
#define _GNU_SOURCE /* hoping to get strsignal() from string.h */
@@ -176,6 +179,18 @@
__func__, sig, parse_signal_number(sig));
wait_tasks(sig, 0);
exit(sig);
+}
+
+/* Handler for SIGTSTP/SIGCONT: relay suspend/resume to every task. */
+void
+notify_all(int sig)
+{
+ if (cl_args->verbose)
+ printf("%s: caught signal %d (%s), sending it to all tasks...\n",
+ __func__, sig, parse_signal_number(sig));
+ /* use the <signal.h> names: the numeric values differ between platforms */
+ if (sig == SIGCONT) notify_tasks(SIGCONT); /* resume: forwarded unchanged */
+ if (sig == SIGTSTP) notify_tasks(SIGSTOP); /* suspend: tasks get SIGSTOP */
}
/*
diff -ru mpiexec-0.78/mpiexec.h mpiexec-0.78.new/mpiexec.h
--- mpiexec-0.78/mpiexec.h Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/mpiexec.h Mon Apr 11 16:56:57 2005
@@ -114,6 +114,7 @@
*/
/* mpiexec.c */
const char *resolve_exe(const char *exe);
+void notify_all(int sig);
void killall(int sig);
void handle_signals(const int *list, int num, void (*handler)(int sig));
typedef int (*cull_compare_func)(task_cntrl_t *t1, task_cntrl_t *t2);
diff -ru mpiexec-0.78/start_tasks.c mpiexec-0.78.new/start_tasks.c
--- mpiexec-0.78/start_tasks.c Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/start_tasks.c Tue Apr 12 10:44:55 2005
@@ -5,6 +5,8 @@
*
* Copyright (C) 2000-3 Ohio Supercomputer Center.
* Copyright (C) 2000-5 Pete Wyckoff <pw at osc.edu>
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ * et Automatique <sebastien.georget at sophia.inria.fr>
*
* Distributed under the GNU Public License Version 2 or later (See LICENSE)
*/
@@ -354,8 +356,10 @@
{
const int siglist[] = { SIGHUP, SIGINT, SIGTERM };
const int alarm_list[] = { SIGALRM };
+ const int suspend_list[] = { SIGTSTP, SIGCONT };
handle_signals(siglist, list_count(siglist), killall);
handle_signals(alarm_list, list_count(alarm_list), kill_others_now);
+ handle_signals(suspend_list, list_count(suspend_list), notify_all);
}
/*
diff -ru mpiexec-0.78/wait_tasks.c mpiexec-0.78.new/wait_tasks.c
--- mpiexec-0.78/wait_tasks.c Mon Apr 11 16:56:45 2005
+++ mpiexec-0.78.new/wait_tasks.c Tue Apr 12 10:45:36 2005
@@ -5,6 +5,8 @@
*
* Copyright (C) 2000-3 Ohio Supercomputer Center.
* Copyright (C) 2000-4 Pete Wyckoff <pw at osc.edu>
+ * Copyright (C) 2005 Institut National de Recherche en Informatique
+ * et Automatique <sebastien.georget at sophia.inria.fr>
*
* Distributed under the GNU Public License Version 2 or later (See LICENSE)
*/
@@ -33,6 +35,24 @@
/* replace old event with the kill one */
tasks[i].evt_obit = tasks[i].evt;
tasks[i].evt = evt;
+ }
+}
+
+/*
+ * Use tm to send signal signum to every task that is not marked done.
+ */
+void
+notify_tasks(int signum)
+{
+ int i, err;
+ tm_event_t evt;
+
+ /* evt is passed to tm_kill() by address and never read afterwards, */
+ /* i.e. ignore the generated event */
+ for (i=0; i<numtask; i++) {
+ if (tasks[i].done) continue; /* skip tasks flagged as done */
+ if ((err = tm_kill(tasks[i].tid, signum, &evt)) != TM_SUCCESS)
+ error_tm(err, "%s: tm_kill tid %d", __func__, tasks[i].tid); /* hand the tm error code to error_tm */
}
}
-------------- next part --------------
diff -ru torque-1.2.0p2/src/resmom/mom_comm.c torque-1.2.0p2.new/src/resmom/mom_comm.c
--- torque-1.2.0p2/src/resmom/mom_comm.c Mon Feb 28 18:17:55 2005
+++ torque-1.2.0p2.new/src/resmom/mom_comm.c Tue Apr 12 11:02:56 2005
@@ -3,6 +3,8 @@
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+* et Automatique <sebastien.georget at sophia.inria.fr>
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
@@ -3929,7 +3931,9 @@
goto err;
}
- if (pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING)
+ if ((pjob->ji_qs.ji_substate != JOB_SUBSTATE_RUNNING) &&
+ (pjob->ji_qs.ji_substate != JOB_SUBSTATE_SUSPEND) &&
+ (command == TM_SIGNAL))
{
sprintf(log_buffer, "job %s not running",
jobid);
diff -ru torque-1.2.0p2/src/resmom/requests.c torque-1.2.0p2.new/src/resmom/requests.c
--- torque-1.2.0p2/src/resmom/requests.c Tue Feb 22 21:59:14 2005
+++ torque-1.2.0p2.new/src/resmom/requests.c Tue Apr 12 11:03:17 2005
@@ -3,6 +3,8 @@
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+* et Automatique <sebastien.georget at sophia.inria.fr>
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
@@ -1181,7 +1183,15 @@
for (tp = (task *)GET_NEXT(pjob->ji_tasks);tp != NULL;tp = (task *)GET_NEXT(tp->ti_jobtask))
{
if (susp != 0)
+ {
+ /* For parallel jobs */
+ stat = kill_task(tp,SIGTSTP);
+
+ sleep(5);
+
+ /* For sequential jobs */
stat = kill_task(tp,SIGSTOP);
+ }
else
stat = kill_task(tp,SIGCONT);
diff -ru torque-1.2.0p2/src/server/req_signal.c torque-1.2.0p2.new/src/server/req_signal.c
--- torque-1.2.0p2/src/server/req_signal.c Tue Feb 22 21:59:23 2005
+++ torque-1.2.0p2.new/src/server/req_signal.c Tue Apr 12 11:03:33 2005
@@ -3,6 +3,8 @@
*
* Copyright (c) 1999-2000 Veridian Information Solutions, Inc.
* All rights reserved.
+* Copyright (C) 2005 Institut National de Recherche en Informatique
+* et Automatique <sebastien.georget at sophia.inria.fr>
*
* ---------------------------------------------------------------------------
* For a license to use or redistribute the OpenPBS software under conditions
@@ -227,9 +229,11 @@
set_statechar(pjob);
job_save(pjob,SAVEJOB_QUICK);
+ free_nodes(pjob);
}
else if (strcmp(preq->rq_ind.rq_signal.rq_signame,SIG_RESUME) == 0)
{
+ set_old_nodes(pjob);
pjob->ji_qs.ji_svrflags &= ~JOB_SVFLG_Suspend;
set_statechar(pjob);
More information about the mpiexec
mailing list