#!/bin/bash
#
# Loop through all job numbers, trying to claim and run any
# that is not already claimed; each job is waited for until
# it returns.  Jobs should therefore NOT background anything
# that would remain after their return!
#
# Layout, relative to the directory that contains this script:
#   work/NNNNNN/run            executable of job NNNNNN (run with cwd = its dir)
#   work/NNNNNN/log.stdout     stdout of the job
#   work/NNNNNN/log.stderr     stderr of the job
#   logs/NNNNNN.begun          claim marker; 1st line is "host,instance timestamp"
#   logs/NNNNNN.ended          appended to when the job has returned
#   logs/host_<host>_<instance>.ended
#                              written when this runner finds no further job
#
# NOTE(review): $instance is never assigned in this script; it is presumably
# exported by whatever launches the runner -- confirm.  If it is unset the
# runner identifies itself as "<hostname>,".

# CPU priority: 0 --> normal, 19 --> only if nothing else
jobnice=15

# define subdirectory names for work and logs
logssub=logs
worksub=work

# name of executable to try to run in each job directory
runexec=run

# basename of stdout and stderr logs for each job (in its pwd)
joblog=log

# identity of this runner, as recorded in the .begun / .ended files
host="$(hostname)"
me="$host,$instance"

#######################################
# Print the current time in the format used throughout the logs.
# Outputs: "YYYY-MM-DD HH:MM:SS ZONE" on stdout
#######################################
now() {
  date +'%F %H:%M:%S %Z'
}

#######################################
# Try to claim a job by appending our identity to its .begun file.
# Several hosts/processes may attempt this simultaneously, so be a
# little careful: whoever owns the FIRST line of the file has won.
# Globals:   me (read)
# Arguments: $1 - path of the .begun file
# Outputs:   explanation on stdout when the claim is lost
# Returns:   0 if the job is ours, 1 if somebody else got there first
#######################################
claim_job() {
  local begun=$1
  local winner

  touch "$begun"

  # flock only helps on the local computer (not over NFS)...
  # The claim text and file name are handed to sh as positional
  # parameters, so no quoting inside them can break the command.
  flock --timeout=0 "$begun" \
    sh -c 'printf "%s\n" "$1" >>"$2"; chmod a-w "$2"' sh "$me $(now)" "$begun"

  # ... so, check whether another host has somehow managed to get there first
  winner="$(head -n1 "$begun" | cut -d' ' -f1)"
  if [[ "$winner" != "$me" ]]; then
    echo "$begun file gives hostname and instance \"$winner\","
    echo "which is not my hostname and instance \"$me\""
    echo "==> going to next job."
    return 1
  fi
  return 0
}

#######################################
# Main loop: as soon as a job finishes, look for the next one that is
# not already claimed by the creation of a NNNNNN.begun file.  Stops at
# the first job number whose working directory does not exist.
# Globals:   logssub worksub runexec joblog jobnice host instance me (read)
# Arguments: $1 - absolute path of the batch's base directory
# Returns:   0 when no more jobs are found; exits the script with
#            status 2 if the base directory cannot be entered.
#######################################
run_batch() {
  local basedir=$1
  local iter=0
  local namebase logs work jobexe

  # exit if unable to change to the base directory
  if ! cd "$basedir"; then
    echo "error from cd to $basedir"
    echo "$0 exiting"
    exit 2
  fi
  pwd

  while true; do
    echo " "

    # take next job-number and make a zero-padded version for filenames
    iter=$((iter + 1))
    printf -v namebase '%06d' "$iter"

    # log each attempt
    echo "try job: $namebase: $(now)"

    # set names of the various executables, logs and workingdirs.
    logs="$logssub/$namebase"
    work="$worksub/$namebase"
    jobexe="$work/$runexec"

    # if there is no job directory, assume there are no more jobs -- stop
    if [[ ! -d "$work" ]]; then
      echo "job $namebase: directory $work not found"
      echo "*** NO MORE JOBS: $(now): Exiting."
      echo "$host $(now)" \
        >"$logssub/host_${host}_$instance.ended"
      break
    elif [[ ! -x "$jobexe" ]]; then
      echo "job $namebase: directory $work/ exists, but not executable $jobexe"
      echo "*** SKIPPING this job"
      continue
    fi

    # if this job-number already has associated "begun" file, go to the next
    if [[ -f "$logs.begun" ]]; then
      echo "$logs.begun already exists ($(now))."
      continue
    fi

    # if we get here, the job seems available
    # (i.e. "please clean me up, if you've knowledge, time and need")
    echo "job $jobexe is available: creating $logs.begun to claim it"
    claim_job "$logs.begun" || continue

    echo "job $namebase is mine: starting: $(now)"

    # change to job's own working directory
    if ! cd "$work"; then
      echo "Failed trying cd to $work : going to next job."
      continue
    fi

    # run the command:
    echo "pwd=$PWD/, running ./$runexec, nice=$jobnice"
    nice -n "$jobnice" "./$runexec" >"$joblog.stdout" 2>"$joblog.stderr"

    # log the finish of the job.  The path is anchored at $basedir rather
    # than "../..": ".." is resolved physically, so a relative path lands in
    # the wrong place when the job directory is a symlink (or when $worksub
    # is more than one level deep).
    echo "$me $(now)" >>"$basedir/$logs.ended"

    # go back to the batch's base directory, or exit
    if ! cd "$basedir"; then
      echo "error from cd to $basedir"
      echo "$0 exiting."
      exit 2
    fi
  done
}

# for the logs
echo "$me: $(now): $0"

# the batch's base directory is wherever this script lives
run_batch "$(dirname "$(realpath "$0")")"

echo " "