#!/bin/bash 
#
# Loop through all job numbers, trying to claim and run any 
# that is not already claimed; each job is waited for until 
# it returns. Jobs should therefore NOT background anything 
# that would remain after their return!

# Niceness given to every job: 0 is normal priority, 19 means
# "only run when nothing else wants the CPU".
jobnice=15

# Banner line for the logs: who we are, when we started, which script.
# NOTE(review): $instance is not assigned anywhere in this script;
# presumably the launcher exports it -- confirm.
echo "$(hostname),$instance: $(date +'%F %H:%M:%S %Z'): $0"

# The batch's base directory is the directory holding this script
# (symlinks resolved by realpath).
script_path="$(realpath "$0")"
basedir="$(dirname "$script_path")"

# All later paths are relative to the base directory, so there is
# no point in carrying on if we cannot get there.
cd "$basedir" || {
	echo "error from cd to $basedir"
	echo "$0 exiting"
	exit 2
}
pwd

# Subdirectories (of the base directory) holding the per-job logs
# and the per-job working directories.
logssub=logs
worksub=work
# Name of the executable expected inside each job directory.
runexec=run
# Basename of the stdout/stderr capture files written in each job's
# own working directory.
joblog=log

# Print the current time in the format used throughout the logs.
timestamp() {
	date +'%F %H:%M:%S %Z'
}

# Identity of this runner as recorded in claim files and logs.
# The host name is looked up once instead of on every use.
# NOTE(review): $instance is not assigned in this script; presumably
# it is exported by whatever launches the runner -- confirm.
myhost="$(hostname)"
myid="$myhost,$instance"

# main loop: walk the job numbers 000001, 000002, ... in order. For each
# one, try to claim it (by creating $logssub/nnnnnn.begun) and, if the
# claim succeeds, run it and wait for it to return before moving on.
# The loop ends at the first job number that has no working directory.
iter=0
while true
do
	echo " "

	# take next job-number and make a zero-padded version for filenames
	iter=$((iter+1))
	printf -v namebase '%06d' "$iter"

	# log each attempt
	echo "try job: $namebase: $(timestamp)"

	# per-job paths, all relative to the base directory
	logs="$logssub/$namebase"
	work="$worksub/$namebase"
	exec="$work/$runexec"

	# a missing job directory means there are no more jobs: record that
	# this runner has finished and leave the loop. A directory without
	# a usable executable is merely skipped.
	if [ ! -d "$work" ]
	then
		echo "job $namebase: directory $work not found"
		echo "*** NO MORE JOBS: $(timestamp):  Exiting."
		echo "$myhost $(timestamp)" \
			>"$logssub/host_${myhost}_$instance.ended"
		break
	elif [ ! -x "$exec" ]
	then
		echo "job $namebase: directory $work/ exists, but not executable $exec"
		echo "*** SKIPPING this job"
		continue
	fi

	# if this job-number already has associated "begun" file, go to the next
	if [ -f "$logs.begun" ]
	then
		echo "$logs.begun already exists ($(timestamp))."
		continue
	fi

	# The job looks available, but another host/process may be trying
	# to take it at the same moment (the test above and the touch below
	# are not atomic). So: create the file, append our identity under
	# a lock, make the file read-only, then check whose line is first.
	echo "job $exec is available: creating $logs.begun to claim it"
	touch "$logs.begun"
	claimstring="$myid  $(timestamp)"
	# flock only helps on the local computer (not over NFS)...
	# The claim text and file name are handed to the inner shell as
	# positional parameters rather than pasted into its command string,
	# so characters such as $, " or ` in them are taken literally.
	flock --timeout=0 "$logs.begun" \
		sh -c 'printf "%s\n" "$1" >>"$2" ; chmod a-w "$2"' sh \
		"$claimstring" "$logs.begun"
	# ... so, check whether another host has somehow managed to get there first
	host_that_won="$(head -n1 "$logs.begun" | cut -d' ' -f1)"
	if [ "$host_that_won" != "$myid" ]
	then
		echo "$logs.begun file gives hostname and instance \"$host_that_won\","
		echo "which is not my hostname and instance \"$myid\""
		echo "==> going to next job."
		continue
	fi
	echo "job $namebase is mine: starting: $(timestamp)"

	# change to job's own working directory
	if ! cd "$work"
	then
		echo "Failed trying cd to $work : going to next job."
		continue
	fi

	# run the command in the foreground at reduced priority, capturing
	# its stdout and stderr in the job's own directory
	echo "pwd=$PWD/, running ./$runexec, nice=$jobnice"
	nice -n "$jobnice" "./$runexec" >"$joblog.stdout" 2>"$joblog.stderr"
	status=$?
	echo "job $namebase finished with exit status $status: $(timestamp)"

	# log the finish of the job; addressed via $basedir so that it lands
	# in the logs directory however deep $worksub is
	echo "$myid  $(timestamp)" >>"$basedir/$logs.ended"

	# go back to the batch's base directory, or exit
	if ! cd "$basedir"
	then
		echo "error from cd to $basedir"
		echo "$0 exiting."
		exit 2
	fi

done

echo " "