Test Torque/PBS

On this page:

  1. Problem
  2. Test case
  3. Fix
  4. Links

1.  Problem

Our cluster uses SMP, multi-core nodes. We were unable to get Torque/Maui to run the right number of jobs per physical node - one job per core.

Using

set queue batch resources_default.nodes = 1

only 8 jobs run across the whole cluster - one per node.

Using

set queue batch resources_default.nodes = 0

an unlimited number of jobs run on each node...
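The attribute can be inspected and changed on the fly with qmgr; a sketch, assuming the default queue is still called batch:

qmgr -c "list queue batch resources_default"
qmgr -c "set queue batch resources_default.nodes = 1"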

2.  Test case

busywait.c

/* Burn CPU so each job keeps one core busy for a while. */
#include <stdlib.h>
int main(void) {
  int i = 0, j = 0;
  double a = 0;
  for (j = 0; j < 10; j++) {   /* was "1<10", which never terminates */
    for (i = 0; i < 1e9; i++) {
      a += 0.001 * i * rand() / RAND_MAX;
    }
  }
  return a;   /* use the result so the loops are not optimized away */
}
Compile it (the implicit make rule suffices) and submit 100 copies:

make busywait
for a in $(seq 1 100); do echo "$PWD/busywait" | qsub; done ; qstat
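The one-liner above pipes the binary's path to qsub as the job script. An explicit job script would look something like this (a sketch; the queue and walltime match the short queue defined below):

cat > busywait.pbs <<'EOF'
#PBS -q short
#PBS -l walltime=00:02:00
# qsub starts jobs in $HOME, so change to the submission directory first
cd $PBS_O_WORKDIR
./busywait
EOF
for a in $(seq 1 100); do qsub busywait.pbs; done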

To kill all your running jobs:

qstat | grep R | cut -f1 -d. | xargs qdel
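While the jobs run, their placement across the nodes can be watched with stock Torque commands, e.g. (a sketch):

watch -n 10 'pbsnodes -a | grep -E "^wn|jobs = "'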

3.  Fix

It's not really clear what fixed the problem: settings which initially did not seem to work turned out to be right after restarting both Torque and Maui, and now I cannot reproduce the problem.
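If the restart needs repeating, the init scripts shipped with the Torque and Maui packages would be used like this (script names assumed; adjust if yours differ):

service pbs_server restart
service maui restart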

Anyway, I will record the current configuration.

qmgr -c "print server"

#
# Create queues and set their attributes.
#
#
# Create and define queue short
#
create queue short
set queue short queue_type = Execution
set queue short resources_default.walltime = 00:02:00
set queue short enabled = True
set queue short started = True
#
# Create and define queue gilda
#
create queue gilda
set queue gilda queue_type = Execution
set queue gilda resources_default.neednodes = gLite
set queue gilda resources_default.walltime = 00:05:00
set queue gilda enabled = True
set queue gilda started = True
#
# Create and define queue batch
#
create queue batch
set queue batch queue_type = Execution
set queue batch resources_default.walltime = 01:00:00
set queue batch enabled = True
set queue batch started = True
#
# Create and define queue long
#
create queue long
set queue long queue_type = Execution
set queue long resources_default.walltime = 24:00:00
set queue long enabled = True
set queue long started = True
#
# Create and define queue infinite
#
create queue infinite
set queue infinite queue_type = Execution
set queue infinite resources_default.walltime = 168:00:00
set queue infinite enabled = True
set queue infinite started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = gridvm
set server managers = clusteradm@gridvm.grid.uj.ac.za
set server managers += root@gridvm.grid.uj.ac.za
set server operators = edginfo@glite-ce.grid.uj.ac.za
set server operators += edguser@glite-ce.grid.uj.ac.za
set server operators += root@gridvm.grid.uj.ac.za
set server operators += root@glite-ce.grid.uj.ac.za
set server default_queue = batch
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server node_pack = False
set server pbs_version = 2.3.0-snap.200801151629
set server submit_hosts = gridvm
set server submit_hosts += glite-ce
set server submit_hosts += osg-ce
set server submit_hosts += glite-ui
set server submit_hosts += osg-ui
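Since "print server" output is itself valid qmgr input, a dump like this can be saved and replayed into a freshly initialized pbs_server (the file name here is just an example):

qmgr -c "print server" > /root/torque-server.cfg
qmgr < /root/torque-server.cfg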

maui.cfg

# 
# MAUI configuration example
# @(#)maui.cfg David Groep 20031015.1
# for MAUI version 3.2.5
# 
SERVERHOST            gridvm
ADMIN1                root edginfo edguser rgma
ADMIN3                rgma edginfo
ADMINHOST             gridvm glite-ce osg-ce
RMTYPE[0]             PBS
RMHOST[0]             gridvm
RMSERVER[0]           gridvm

SERVERPORT            40559
SERVERMODE            NORMAL

# Set PBS server polling interval. Since we have many short jobs
# and want fast turn-around, set this to 10 seconds (default: 2 minutes)
RMPOLLINTERVAL        00:00:10

# a max. 10 MByte log file in a logical location
LOGFILE               /var/log/maui.log
LOGFILEMAXSIZE        10000000
LOGLEVEL              2
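After restarting, Maui's view of the active configuration can be double-checked with its showconfig client, e.g. (a sketch):

showconfig | grep -i RMPOLLINTERVAL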

server_priv/nodes

vwn001 i386
vwn002 i386
wn001 np=8 i386 x86_64
wn002 np=8 i386 x86_64
wn003 np=8 i386 x86_64
wn004 np=8 i386 x86_64
wn005 np=8 i386 x86_64
wn006 np=8 i386 x86_64
wn007 np=8 i386 x86_64
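The np=8 entries are what actually allow eight concurrent single-processor jobs per node; pbs_server rereads this file only on restart. To verify what it picked up (a sketch):

pbsnodes -a | grep -E "^wn|np = "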

A nicely loaded node - each of the eight CPU slots (0-7) holds exactly one job:

[root@gridvm pbs]# qnodes wn001
wn001
     state = job-exclusive
     np = 8
     properties = i386,x86_64
     ntype = cluster
     jobs = 0/9698.gridvm.grid.uj.ac.za, 1/9699.gridvm.grid.uj.ac.za, 2/9700.gridvm.grid.uj.ac.za, 3/9701.gridvm.grid.uj.ac.za, 4/9702.gridvm.grid.uj.ac.za, 5/9703.gridvm.grid.uj.ac.za, 6/9704.gridvm.grid.uj.ac.za, 7/9705.gridvm.grid.uj.ac.za
     status = opsys=linux,uname=Linux wn001 2.6.9-78.0.17.ELsmp #1 SMP Thu Mar 12 12:05:34 CDT 2009 x86_64,sessions=22562 22571 22587 22614 22646 22690 22729 22767,nsessions=8,nusers=1,idletime=719923,totmem=5907488kb,availmem=5728892kb,physmem=16442152kb,ncpus=8,loadave=9.00,netload=861203745,state=free,jobs=9698.gridvm.grid.uj.ac.za 9699.gridvm.grid.uj.ac.za 9700.gridvm.grid.uj.ac.za 9701.gridvm.grid.uj.ac.za 9702.gridvm.grid.uj.ac.za 9703.gridvm.grid.uj.ac.za 9704.gridvm.grid.uj.ac.za 9705.gridvm.grid.uj.ac.za,rectime=1238619769

[root@gridvm pbs]# checknode wn001
checking node wn001

State:      Busy  (in current state for 00:18:38)
Configured Resources: PROCS: 8  MEM: 15G  SWAP: 15G  DISK: 1M
Utilized   Resources: PROCS: 8
Dedicated  Resources: PROCS: 8
Opsys:         linux  Arch:      [NONE]
Speed:      1.00  Load:       9.000
Network:    [DEFAULT]
Features:   [i386][x86_64]
Attributes: [Batch]
Classes:    [short 8:8][gilda 8:8][batch 0:8][long 8:8][infinite 8:8]

Total Time: 74:15:20:11  Up: 74:13:47:42 (99.91%)  Active: 2:41:02 (0.15%)

Reservations:
  Job '9698'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9699'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9700'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9701'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9702'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9703'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9704'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
  Job '9705'(x1)  -00:18:39 -> 00:41:21 (1:00:00)
JobList:  9698,9699,9700,9701,9702,9703,9704,9705

4.  Links

Page last modified on April 06, 2009, at 11:30 AM