|
|
SysAdm /
Test Torque/PBS
Navigation
Problem
Our cluster uses SMP, multi-core CPUs. We were unable to run the right number of jobs per physical node. With "set queue batch resources_default.nodes = 1", only 8 jobs run - one per node. With "set queue batch resources_default.nodes = 0", an unlimited number of jobs run per node.
Test case
#include <stdlib.h>

/*
 * busywait - CPU-bound test job used to check how many jobs the batch
 * system places on each node.
 *
 * Runs 10 rounds of 1e9 floating-point accumulations fed by rand(), then
 * exits with status 0. Reads no input and prints nothing.
 */
int main() {
    int i = 0, j = 0;
    /* volatile so the compiler cannot discard the otherwise-unused sum and
     * optimise the loops away. */
    volatile double a = 0;

    /* The condition must test j; a constant condition would never end. */
    for (j = 0; j < 10; j++) {
        for (i = 0; i < 1e9; i++) {
            a += 0.001 * i * rand() / RAND_MAX;
        }
    }

    /* Do not return the sum as the exit status: it grows far beyond
     * INT_MAX, and converting an out-of-range double to int is undefined
     * behaviour. */
    return 0;
}
make busywait
for a in $(seq 1 100); do echo "$PWD/busywait" | qsub; done ; qstat
Kill all your running jobs:
qstat | grep R | cut -f1 -d. | xargs qdel
Fix
It's not really clear what fixed the problem. Settings that initially did not seem to work turned out to be right after restarting both Torque and Maui, and now it seems I cannot reproduce the problem. Anyway, I will record the current configuration.
#
# Create queues and set their attributes.
#
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short resources_default.walltime = 00:02:00
set queue short enabled = True
set queue short started = True
#
# Create and define queue gilda
#
create queue gilda
set queue gilda queue_type = Execution
set queue gilda resources_default.neednodes = gLite
set queue gilda resources_default.walltime = 00:05:00
set queue gilda enabled = True
set queue gilda started = True
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long resources_default.walltime = 24:00:00
set queue long enabled = True
set queue long started = True
#
# Create and define queue infinite
#
create queue infinite
set queue infinite queue_type = Execution
set queue infinite resources_default.walltime = 168:00:00
set queue infinite enabled = True
set queue infinite started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = gridvm
set server managers = clusteradm@gridvm.grid.uj.ac.za
set server managers += root@gridvm.grid.uj.ac.za
set server operators = edginfo@glite-ce.grid.uj.ac.za
set server operators += edguser@glite-ce.grid.uj.ac.za
set server operators += root@gridvm.grid.uj.ac.za
set server operators += root@glite-ce.grid.uj.ac.za
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server node_pack = False
set server pbs_version = 2.3.0-snap.200801151629
set server submit_hosts = gridvm
set server submit_hosts += glite-ce
set server submit_hosts += osg-ce
set server submit_hosts += glite-ui
set server submit_hosts += osg-ui
#
# MAUI configuration example
# @(#)maui.cfg David Groep 20031015.1
# for MAUI version 3.2.5
#
SERVERHOST gridvm
ADMIN1 root edginfo edguser rgma
ADMIN3 rgma edginfo
ADMINHOST gridvm glite-ce osg-ce
RMTYPE[0] PBS
RMHOST[0] gridvm
RMSERVER[0] gridvm
SERVERPORT 40559
SERVERMODE NORMAL
# Set PBS server polling interval. Since we have many short jobs
# and want fast turn-around, set this to 10 seconds (default: 2 minutes)
RMPOLLINTERVAL 00:00:10
# a max. 10 MByte log file in a logical location
LOGFILE /var/log/maui.log
LOGFILEMAXSIZE 10000000
LOGLEVEL 2
vwn001 i386
vwn002 i386
wn001 np=8 i386 x86_64
wn002 np=8 i386 x86_64
wn003 np=8 i386 x86_64
wn004 np=8 i386 x86_64
wn005 np=8 i386 x86_64
wn006 np=8 i386 x86_64
wn007 np=8 i386 x86_64
A nicely loaded node:
[root@gridvm pbs]# qnodes wn001
wn001
state = job-exclusive
np = 8
properties = i386,x86_64
ntype = cluster
jobs = 0/9698.gridvm.grid.uj.ac.za, 1/9699.gridvm.grid.uj.ac.za, 2/9700.gridvm.grid.uj.ac.za, 3/9701.gridvm.grid.uj.ac.za, 4/9702.gridvm.grid.uj.ac.za, 5/9703.gridvm.grid.uj.ac.za, 6/9704.gridvm.grid.uj.ac.za, 7/9705.gridvm.grid.uj.ac.za
status = opsys=linux,uname=Linux wn001 2.6.9-78.0.17.ELsmp #1 SMP Thu Mar 12 12:05:34 CDT 2009 x86_64,sessions=22562 22571 22587 22614 22646 22690 22729 22767,nsessions=8,nusers=1,idletime=719923,totmem=5907488kb,availmem=5728892kb,physmem=16442152kb,ncpus=8,loadave=9.00,netload=861203745,state=free,jobs=9698.gridvm.grid.uj.ac.za 9699.gridvm.grid.uj.ac.za 9700.gridvm.grid.uj.ac.za 9701.gridvm.grid.uj.ac.za 9702.gridvm.grid.uj.ac.za 9703.gridvm.grid.uj.ac.za 9704.gridvm.grid.uj.ac.za 9705.gridvm.grid.uj.ac.za,rectime=1238619769
[root@gridvm pbs]# checknode wn001
checking node wn001
State: Busy (in current state for 00:18:38)
Configured Resources: PROCS: 8 MEM: 15G SWAP: 15G DISK: 1M
Utilized Resources: PROCS: 8
Dedicated Resources: PROCS: 8
Opsys: linux Arch: [NONE]
Speed: 1.00 Load: 9.000
Network: [DEFAULT]
Features: [i386][x86_64]
Attributes: [Batch]
Classes: [short 8:8][gilda 8:8][batch 0:8][long 8:8][infinite 8:8]
Total Time: 74:15:20:11 Up: 74:13:47:42 (99.91%) Active: 2:41:02 (0.15%)
Reservations:
Job '9698'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9699'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9700'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9701'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9702'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9703'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9704'(x1) -00:18:39 -> 00:41:21 (1:00:00)
Job '9705'(x1) -00:18:39 -> 00:41:21 (1:00:00)
JobList: 9698,9699,9700,9701,9702,9703,9704,9705
Links |