#!/bin/bash

# the job assumes that it is a good idea to run everything in PWD
# the job manager should make sure that is true

# fail whenever something is fishy, use -x to get verbose logfiles
set -e -u -x

dssource="$1"
pushgitremote="$2"
subd="$3"
subid=$(basename "$3")

# get the analysis dataset, which includes the inputs as well
# importantly, we do not clone from the location that we want to push the
# results to, in order to avoid too many jobs blocking access to
# the same location and creating a throughput bottleneck
datalad clone "${dssource}" ds

# all following actions are performed in the context of the superdataset
cd ds

# in order to avoid accumulation of temporary git-annex availability information
# and to avoid a synchronization bottleneck by having to consolidate the
# git-annex branch across jobs, we will only push the main tracking branch
# back to the output store (plus the actual file content). Final availability
# information can be established via an eventual "git annex fsck -f cat12.8.1_out-storage".
# this remote is never fetched, it accumulates a large number of branches
# and we want to avoid progressive slowdown. Instead we only ever push
# a unique branch per job (subject AND process specific name)
git remote add outputstore "$pushgitremote"

# all results of this job will be put into a dedicated branch
git checkout -b "job-${JOBID}"

# we pull down the input subject manually in order to discover relevant
# files. We do this outside the recorded call, because on a potential
# re-run we want to be able to do fine-grained recomputing of individual
# outputs. The recorded calls will have specific paths that will enable
# recomputation outside the scope of the original Condor setup
datalad get -n "inputs/CORR/${subd}"

# the meat of the matter:
# look for T1w files in the input data for the given participant.
# it is critical for reproducibility that the command given to
# "containers-run" does not rely on any property of the immediate
# computational environment (env vars, services, etc.)
find \
  "inputs/CORR/${subd}" \
  -name "${subid}*T1w.nii.gz" \
  -exec sh -c '
    odir=$(echo {} | cut -d / -f3-5);
    datalad containers-run \
      -m "Compute $odir" \
      -n cat12-8-1 \
      --explicit \
      -o $odir \
      -i {} \
      sh -e -u -x -c "
        rm -rf {outputs[0]} ; mkdir -p {outputs[0]} \
        && cp {inputs[0]} {outputs[0]} \
        && /singularity -b code/cat_standalone_segment_enigma_subdir_rp_MSA.m {outputs[0]}/*.nii.gz \
        && /singularity -b code/pipeline/batches/cat_standalone_batch-surfext.m {outputs[0]}/surf/lh.central.* \
        && /singularity -b code/pipeline/batches/cat_standalone_batch-thick1.m {outputs[0]}/surf/lh.thickness.* \
        && /singularity -b code/pipeline/batches/cat_standalone_batch-thick2.m {outputs[0]}/surf/lh.thickness.* \
        && rm -f {outputs[0]}/*.nii* \
        && gzip {outputs[0]}/*/*.nii \
        " \
  ' \;

# remove big files from results after hashing, before pushing to the RIA store
datalad drop --what filecontent --reckless kill \
  */sub-*/*/mri/iy* */sub-*/*/mri/y* */sub-*/*/mri/anon_m* \
  */sub-*/*/*/*.pdf */sub-*/*/surf/*sphere*

#### Maybe remove symlinks without data and commit?
# rm -f */sub-*/*/mri/iy* */sub-*/*/mri/y* \
#   */sub-*/*/mri/anon_m* */sub-*/*/*/*.pdf \
#   */sub-*/*/surf/*sphere*
# datalad save -m "remove symlinks without data"

# file content first -- does not need a lock, no interaction with Git
datalad push --to cat12.8.1_out-storage
# and the output branch
flock --verbose "$DSLOCKFILE" git push outputstore

echo SUCCESS
# job handler should clean up workspace
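
# Example invocation -- a sketch only, with hypothetical paths, store URLs, and
# subject IDs (the script name "participant_job", the RIA URL, and the lock file
# location are assumptions; in the actual setup the batch system, e.g. HTCondor,
# supplies the three positional arguments plus the JOBID and DSLOCKFILE
# environment variables):
#
#   JOBID="sub-0001-job1" DSLOCKFILE=/tmp/ds_push.lock \
#     bash code/participant_job \
#       "ria+file:///data/project/input_store#~cat12-analysis" \
#       "/data/project/output_store/path/to/dataset" \
#       "SiteA/sub-0001"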