#!/bin/bash
# data_mask_concat_jobs.sh
##### DataLad Handbook 3.2 ######
##### DataLad-centric analysis with job scheduling and parallel computing
# http://handbook.datalad.org/en/latest/beyond_basics/101-170-dataladrun.html
# top-level analysis dataset with subdatasets
# $ datalad create parallel_analysis
# $ cd parallel_analysis
# a) pipeline dataset (with a configured software container)
# $ datalad clone -d . https://github.com/ReproNim/containers.git
# b) input dataset
# $ datalad clone -d . /path/to/my/rawdata
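# A hedged sketch (an assumption, not spelled out above): the analysis code this
# job script calls later (./code/data_mask_concat_runs.py) would also need to be
# tracked in the top-level dataset, e.g.
# $ mkdir code
# $ cp /path/to/data_mask_concat_runs.py code/
# $ datalad save -m "add analysis script" code/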
# data analysis with a software container that performs a set of analyses
# results will be aggregated into a top-level dataset
# Individual jobs are computed in throw-away dataset clones (& branches)
# to avoid unwanted interactions between parallel jobs.
# results are pushed back (as branches) into the target dataset.
# A manual merge aggregates all results into the master branch of the dataset.
# The following analysis processes rawdata with a pipeline from the
# ReproNim/containers subdataset and collects the outcomes in the top-level
# parallel_analysis dataset.
# You could also add and configure the container in the top-most dataset using
# datalad containers-add. That solution makes the container less usable,
# though: if you have more than one application for a container, keeping it
# as a standalone dataset makes reuse easier.
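# A hedged sketch (hypothetical container name and URL; requires the
# datalad-container extension) of what registering a container directly in the
# top-most dataset could look like:
# $ datalad containers-add fmri-pipeline --url shub://some-org/some-container:latest \
#       --call-fmt 'singularity exec {img} {cmd}'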
# What you will submit as a job to the job scheduler is a shell script that
# contains all relevant data analysis steps, not a bare datalad containers-run
# call, because datalad run does not support concurrent execution in the same
# dataset clone.
# Solution: create one throw-away dataset clone for each job.
# We treat cluster compute nodes like contributors to the analyses:
# They clone the analysis dataset hierarchy into a temporary location,
# run the computation, push the results, and remove their temporary dataset again.
# The compute job clones the dataset to a unique place, so that it can run a
# containers-run command inside it without interfering with any other job.
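# A hedged sketch of a submission (assumption: SLURM and hypothetical paths;
# the handbook chapter demonstrates the same pattern with HTCondor), starting
# one job per input directory and handing JOBID and DSLOCKFILE to this script
# via the environment:
#
#   export DSLOCKFILE="$PWD/.datalad_lock"
#   for sub in rawdata/sub-*; do
#       sbatch --export=ALL,JOBID="$(basename "$sub")" \
#           code/data_mask_concat_jobs.sh "$sub"
#   done
#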
# fail whenever something is fishy, use -x to get verbose logfiles
set -e -u -x
# we pass arbitrary arguments via the job scheduler and can use them as variables
indir=$1
# The first part of the script is therefore to navigate to a unique location
# and clone the analysis dataset to it.
# go into a job-specific location under /tmp so that concurrent jobs on the
# same node cannot collide
cd /tmp
# clone the analysis dataset. flock makes sure that this does not interfere
# with another job finishing and pushing results back at the same time
flock --verbose "$DSLOCKFILE" datalad clone /data/group/psyinf/studyforrest-srm-movies "chaeusler-concat-$JOBID"
cd "chaeusler-concat-$JOBID"
# This dataset clone is temporary: It will exist over the course of one analysis/job only,
# but before it is purged, all of the results it computed will be pushed
# to the original dataset. This requires a safe-guard: If the original dataset
# receives the results from the dataset clone, it knows about the clone and its
# state. In order to protect the results from someone accidentally synchronizing
# (updating) the dataset from its linked dataset after it has been deleted,
# the clone should be created as a “throw-away clone” right from the start. By
# running git annex dead here, git-annex disregards the clone, preventing the
# deletion of data in the clone from affecting the original dataset.
# announce the clone to be temporary
git annex dead here
# The datalad push to the original clone location of a dataset needs to be prepared
# carefully. The job computes one result (out of many results) and saves it,
# thus creating new data and a new entry with the run-record in the dataset
# history. But each job is unaware of the results and commits produced by other
# jobs. Should all jobs push their results back to the original place (the
# master branch of the original dataset), the individual jobs would conflict with
# each other or, worse, overwrite each other (if you don’t have the default
# push configuration of Git).
# The general procedure and standard Git workflow for collaboration, therefore,
# is to create the change on a different, unique branch, push this branch,
# and integrate the changes into the original master branch via a merge
# in the original dataset.
# In order to do this, prior to executing the analysis, the script will checkout
# a unique new branch in the analysis dataset. The most convenient name for the
# branch is the Job-ID, an identifier under which the job scheduler runs an
# individual job. This makes it easy to associate a result (via its branch)
# with the log, error, or output files that the job scheduler produces, and
# the real-life example will demonstrate these advantages more concretely.
# git checkout -b <name> creates a new branch and checks it out
# checkout a unique branch
git checkout -b "job-$JOBID"
# $JOBID isn’t hardcoded into the script; it can be given to the script as
# an environment or input variable at the time of job submission.
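# A hedged alternative (assumption: SLURM): if no JOBID is passed explicitly,
# a scheduler-provided identifier could serve as a fallback, e.g.
# JOBID=${JOBID:-${SLURM_JOB_ID:-unknown}}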
# Next, it’s time for the datalad run command. The invocation will depend on
# the container and dataset configuration (both of which are demonstrated in the
# real-life example in the next section). Below, the invocation only needs an
# input directory and an output directory. The input is specified via a bash
# variable ($indir) that is defined in the script and provided at the time of
# job submission via a command-line argument from the job scheduler; here the
# output location is the same directory.
# After the run execution in the script, the results can be pushed back to the
# dataset sibling origin:
# run the job
datalad run \
    -m "Concatenating and z-scoring runs of ${indir}" \
    --explicit \
    --input "$indir" \
    --output "$indir" \
    ./code/data_mask_concat_runs.py \
    -sub "{inputs}" -outdir "{outputs}"
# push, with file locking as a safe-guard
flock --verbose "$DSLOCKFILE" datalad push --to origin
# Done - the job handler should clean up the workspace
# Afterwards, in the original dataset, manually merge all job branches, e.g.:
# git merge -m "Merge results from job cluster XY" $(git branch -l | grep 'job-' | tr -d ' ')
# delete the merged job branches:
# git branch | grep 'job-' | xargs git branch -D
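# A hedged sketch (not part of the original workflow): before merging, check
# that every job delivered a result branch, and inspect the history afterwards:
# $ git branch -l | grep -c 'job-'
# $ git log --oneline --graph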