#!/bin/bash
#
# start_distributed_jobs:
#
# This is the program that starts the batch jobs.
# It is run once, on just one host.
# It runs `local_job_background_start' on each host,
# which then starts a specified number of instances of
# the `local_job_coordinator' script for that host.
#
# Input:
#   "hosts.list" (next to this script) is sourced and is expected to
#   set `usehosts' to a newline-separated list of entries of the form
#   "hostname[,nproc]"; nproc defaults to 1 when omitted.  Blank lines
#   are ignored.
#
# Exit status:
#   2  setup problem: hosts.list unreadable, no usable host entries,
#      an entry without a hostname, or the log directory could not
#      be created
#   3  the logfile already exists (the batch was already started)
#   4  the user did not answer "yes" at the confirmation prompt
#   otherwise the status of the final `tee' that writes the logfile

# determine base directory of the batch
basedir="$(dirname "$(realpath "$0")")"

# include the list of hostnames and numbers of processes
hostsfile="$basedir/hosts.list"
if [[ ! -r "$hostsfile" ]]; then
  echo "$0: cannot read $hostsfile" >&2
  exit 2
fi
# shellcheck source=/dev/null
source "$hostsfile"

# options for the ssh login (an array, so that each word reaches ssh
# as exactly one argument without relying on word splitting)
sshopts=(
  -o StrictHostKeyChecking=no
  -o NoHostAuthenticationForLocalhost=yes
  -o CheckHostIP=no
  -o PreferredAuthentications=publickey
  -o ForwardX11=no
  -o ForwardX11Trusted=no
)

# parse the host list once into two parallel arrays, so that the
# confirmation prompt and the launch loop see exactly the same data
sshhosts=()
nprocs=()
while IFS= read -r line || [[ -n "$line" ]]; do
  # ignore empty / whitespace-only lines
  [[ "$line" =~ [^[:space:]] ]] || continue

  # field 1: hostname (a single-variable `read' trims surrounding blanks)
  read -r sshhost <<< "${line%%,*}"

  # field 2: number of processes, empty when there is no comma
  nproc=""
  if [[ "$line" == *,* ]]; then
    rest="${line#*,}"
    read -r nproc <<< "${rest%%,*}"
  fi
  [[ -z "$nproc" ]] && nproc=1

  if [[ -z "$sshhost" ]]; then
    echo "$0: entry without a hostname in $hostsfile: \"$line\"" >&2
    exit 2
  fi

  sshhosts+=("$sshhost")
  nprocs+=("$nproc")
done <<< "$usehosts"

if (( ${#sshhosts[@]} == 0 )); then
  echo "$0: no hosts listed in \$usehosts ($hostsfile)" >&2
  exit 2
fi

# check that the script isn't being re-run on a started batch
logdir="$basedir/logs"
logfile="$logdir/$(basename "$0").log"
if [[ -f "$logfile" ]]; then
  echo "$0"
  echo "Logfile: $logfile exists."
  echo "The batch should only be started on a fresh tree."
  echo "Exiting."
  exit 3
fi

# get confirmation: it's tedious to try to cancel a batch!
echo "$0: "
echo "Log in to these hosts"
for i in "${!sshhosts[@]}"; do
  # %s rather than %d: show the value exactly as it will be passed on
  printf '%20s,%s\n' "${sshhosts[i]}" "${nprocs[i]}"
done
echo "to run the jobs?"
printf '%s' "Please confirm: (yes|NO) > "
# EOF on stdin leaves $approve empty, which counts as "no"
read -r approve
if [[ "$approve" != "yes" ]]; then
  echo "Answer $approve is not 'yes': exiting."
  echo " "
  exit 4
fi
echo " "

# make sure the logfile can be written: without it, the jobs would
# start but the "already started" check above could never trigger
if ! mkdir -p "$logdir"; then
  echo "$0: cannot create log directory $logdir" >&2
  exit 2
fi

# loop through the hosts, logging in and starting the local
# coordinator; the number of processes is passed as an argument
exename="$basedir/local_job_background_start"
(
  # information for screen and logs
  echo "$(hostname), $(date +'%F %H:%M:%S %Z'): $0"

  # action: for each parsed host entry, use ssh to do the remote
  # start.  Hosts are numbered from 1 in the order they are listed.
  # A failing ssh does not stop the loop; its messages end up on the
  # screen and in the logfile through the 2>&1 below.
  for i in "${!sshhosts[@]}"; do
    echo " "
    echo "host $((i + 1)): \"${sshhosts[i]}\", nproc: ${nprocs[i]}"
    # NOTE: ssh joins the command words with spaces and the remote
    # shell parses them again, so $exename must not contain blanks.
    ssh "${sshopts[@]}" "${sshhosts[i]}" "$exename" "${nprocs[i]}"
  done
  echo " "
) 2>&1 | tee -a "$logfile"