Problem with Mpiexec

Joseph Spadavecchia j.spadavecchia at ed.ac.uk
Sat Feb 22 07:39:26 EST 2003


Hello,

Our research group has a 160-processor (900 MHz each) Beowulf cluster and I
am trying to use mpiexec as a replacement for mpirun, but mpiexec does not
work and I would greatly appreciate your help!

With Open_PBS_2.3.12 I applied the pbs-2.3.12-mpiexec.diff patch, then
rebuilt PBS and installed it with --prefix=/usr/pbs.

PBS is configured as follows:

Qmgr: print server
#
# Create queues and set their attributes.
#
#
# Create and define queue small
#
create queue small
set queue small queue_type = Execution
set queue small Priority = 128
set queue small resources_max.nodect = 8
set queue small resources_min.nodect = 1
set queue small enabled = True
set queue small started = True
#
# Create and define queue big
#
create queue big
set queue big queue_type = Execution
set queue big Priority = 0
set queue big resources_max.nodect = 78
set queue big resources_min.nodect = 9
set queue big enabled = True
set queue big started = True
#
# Set server attributes.
#
set server scheduling = True
set server managers = joseph at master
set server managers += mmatrino at master
set server default_queue = big
set server log_events = 511
set server mail_from = admin
set server query_other_jobs = True
set server resources_available.mem = 39936mb
set server resources_available.ncpus = 156
set server scheduler_iteration = 600


mpich-1.2.5 is installed in /usr/local/mpich with the ch_p4 device.

I compiled mpiexec 0.72 and 0.68 with the following options:

./configure --with-pbs=/usr/pbs
--with-pbs-src=/usr/local/src/OpenPBS_2_3_12 --with-default-comm=mpich-p4
--with-mpicc=/usr/local/mpich/bin/mpicc

After typing make && make install

I try to use mpiexec and get the following errors:

mpiexec -n 10 /usr/local/src/mpiexec-0.68/hello
mpiexec: Error: PBS_JOBID not set in environment.  Code must be run from a
  PBS script, perhaps interactively using "qsub -I".

If I export PBS_JOBID=FOO_BAR, I get this error instead:

mpiexec -n 10 /usr/local/src/mpiexec-0.68/hello
mpiexec: Error: get_hosts: tm_init: tm: bad environment.

I've looked in all the mailing lists and in all the documentation, but cannot
find a remedy.

Below you will find a strace, which is probably not that helpful, but why
not...

If you could please help me I'd really appreciate it.

Thanks for your time,
*Joseph



strace mpiexec -n 10 /usr/local/src/mpiexec-0.68/hello
execve("/usr/local/bin/mpiexec", ["mpiexec", "-n", "10",
"/usr/local/src/mpiexec-0.68/hello"], [/* 28 vars */]) = 0
uname({sys="Linux", node="master", ...}) = 0
brk(0)                                  = 0x8057280
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1,
0) = 0x40017000
open("/etc/ld.so.preload", O_RDONLY)    = -1 ENOENT (No such file or
directory)
open("/opt/intel/compiler50/ia32/lib/i686/mmx/libc.so.6", O_RDONLY) = -1
ENOENT (No such file or directory)
stat64("/opt/intel/compiler50/ia32/lib/i686/mmx", 0xbfffebcc) = -1 ENOENT
(No such file or directory)
open("/opt/intel/compiler50/ia32/lib/i686/libc.so.6", O_RDONLY) = -1
ENOENT (No such file or directory)
stat64("/opt/intel/compiler50/ia32/lib/i686", 0xbfffebcc) = -1 ENOENT (No
such file or directory)
open("/opt/intel/compiler50/ia32/lib/mmx/libc.so.6", O_RDONLY) = -1 ENOENT
(No such file or directory)
stat64("/opt/intel/compiler50/ia32/lib/mmx", 0xbfffebcc) = -1 ENOENT (No
such file or directory)
open("/opt/intel/compiler50/ia32/lib/libc.so.6", O_RDONLY) = -1 ENOENT (No
such file or directory)
stat64("/opt/intel/compiler50/ia32/lib", {st_mode=S_IFDIR|0775,
st_size=4096, ...}) = 0
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=48070, ...}) = 0
old_mmap(NULL, 48070, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40018000
close(3)                                = 0
open("/lib/i686/libc.so.6", O_RDONLY)   = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0@\307\1"..., 1024)
= 1024
fstat64(3, {st_mode=S_IFREG|0755, st_size=5779542, ...}) = 0
old_mmap(NULL, 1291464, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) =
0x40024000
mprotect(0x40156000, 38088, PROT_NONE)  = 0
old_mmap(0x40156000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED,
3, 0x131000) = 0x40156000
old_mmap(0x4015c000, 13512, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x4015c000
close(3)                                = 0
munmap(0x40018000, 48070)               = 0
fstat64(0, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 9), ...}) = 0
fstat64(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 9), ...}) = 0
fstat64(2, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 9), ...}) = 0
brk(0)                                  = 0x8057280
brk(0x80572c8)                          = 0x80572c8
brk(0x8058000)                          = 0x8058000
access("/usr/local/src/mpiexec-0.68/hello", X_OK) = 0
stat64("/usr/local/src/mpiexec-0.68/hello", {st_mode=S_IFREG|0755,
st_size=361473, ...}) = 0
getuid32()                              = 511
socket(PF_UNIX, SOCK_STREAM, 0)         = 3
connect(3, {sin_family=AF_UNIX, path="/var/run/.nscd_socket"}, 110) = -1
ENOENT (No such file or directory)
close(3)                                = 0
open("/etc/nsswitch.conf", O_RDONLY)    = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=1762, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
= 0x40018000
read(3, "#\n# /etc/nsswitch.conf\n#\n# An ex"..., 4096) = 1762
read(3, "", 4096)                       = 0
close(3)                                = 0
munmap(0x40018000, 4096)                = 0
open("/opt/intel/compiler50/ia32/lib/libnss_files.so.2", O_RDONLY) = -1
ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=48070, ...}) = 0
old_mmap(NULL, 48070, PROT_READ, MAP_PRIVATE, 3, 0) = 0x40018000
close(3)                                = 0
open("/lib/libnss_files.so.2", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\200 \0"..., 1024)
= 1024
fstat64(3, {st_mode=S_IFREG|0755, st_size=262272, ...}) = 0
brk(0x8059000)                          = 0x8059000
old_mmap(NULL, 42600, PROT_READ|PROT_EXEC, MAP_PRIVATE, 3, 0) = 0x40160000
mprotect(0x4016a000, 1640, PROT_NONE)   = 0
old_mmap(0x4016a000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED, 3,
0x9000) = 0x4016a000
close(3)                                = 0
munmap(0x40018000, 48070)               = 0
open("/etc/passwd", O_RDONLY)           = 3
fcntl64(0x3, 0x1, 0, 0x1)               = 0
fcntl64(0x3, 0x2, 0x1, 0x1)             = 0
fstat64(3, {st_mode=S_IFREG|0644, st_size=2744, ...}) = 0
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)
= 0x40018000
read(3, "root:x:0:0:root:/root:/bin/bash\n"..., 4096) = 2744
close(3)                                = 0
munmap(0x40018000, 4096)                = 0
write(2, "mpiexec: Error: ", 16mpiexec: Error: )        = 16
write(2, "get_hosts: tm_init", 18get_hosts: tm_init)      = 18
write(2, ": tm: ", 6: tm: )                   = 6
write(2, "bad environment.\n", 17bad environment.
)      = 17
_exit(1)                                = ?

I've done some debugging on my own (by inserting fprintf's in tm.c)
and it seems like it fails because the COOKIE is bad.  Obviously your patch is
supposed to fix this by removing it, but it does not work for some reason.
Also, mpiexec can't get the JOBID from the environment, and I don't know why.





More information about the mpiexec mailing list