Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

splitting.py 3.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374
  1. #!/usr/bin/python
  2. """This module runs the trimming process."""
  3. import json
  4. import math
  5. import multiprocessing
  6. import os
  7. import sys
  8. MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
  9. INPUT_FOLDER = MOUNT_PATHS["dependencies"]["alignment_filtering"]
  10. # If a specific environment variable is set, appends the respective option.
  11. threads = math.floor(multiprocessing.cpu_count() * 0.8)
  12. if threads < 1:
  13. threads = 1
  14. # Iterates over all sample directories and processes them conserving the directory structure.
  15. BAM_SUFFIX = ".bam"
  16. for root, dirs, files in os.walk(INPUT_FOLDER):
  17. if len(files) > 0:
  18. for file in files:
  19. if file.endswith(BAM_SUFFIX):
  20. file_base_name = file.removesuffix(BAM_SUFFIX)
  21. file_base_path = os.path.join(root, file_base_name)
  22. file_base_output_path = os.path.join(
  23. MOUNT_PATHS["output"],
  24. file_base_path.removeprefix(INPUT_FOLDER + "/")
  25. )
  26. # 1 - 99 bp: nucleosome free region
  27. # 180 - 247 bp: mononucleosomal region
  28. # 315 - 473 bp: dinucleosomal region
  29. # 558 - 615 bp: trinucleosomal region
  30. full_commands = [
  31. (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
  32. "| awk '{if (($9 < 100 && $9 > -100 && $9 != 0) || $1 ~ /^@/) {print $0}}' "
  33. f"| samtools sort -@ {threads} "
  34. f"-O bam -o {file_base_output_path}_nucelosomefree.bam -"),
  35. (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
  36. "| awk '{if ((($9 <= 247 && $9 >= 180) || ($9 >= -247 && $9 <= -180)) || $1 ~ /^@/) {print $0}}' "
  37. f"| samtools sort -@ {threads} "
  38. f"-O bam -o {file_base_output_path}_mononucelosomal.bam -"
  39. ),
  40. (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
  41. "| awk '{if ((($9 <= 473 && $9 >= 315) || ($9 >= -473 && $9 <= -315)) || $1 ~ /^@/) {print $0}}' "
  42. f"| samtools sort -@ {threads} "
  43. f"-O bam -o {file_base_output_path}_dinucelosomal.bam -"
  44. )
  45. ]
  46. for full_command in full_commands:
  47. print(f"Running command: {full_command}")
  48. os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
  49. exit_code = os.waitstatus_to_exitcode(os.system(full_command))
  50. if exit_code != 0:
  51. sys.exit(exit_code)
  52. for root, dirs, files in os.walk(MOUNT_PATHS["output"]):
  53. if len(files) > 0:
  54. for file in files:
  55. if file.endswith(BAM_SUFFIX):
  56. file_base_name = file.removesuffix(BAM_SUFFIX)
  57. file_base_output_path = os.path.join(root, file_base_name)
  58. full_commands = [
  59. f"samtools index -@ {threads} {file_base_output_path}{BAM_SUFFIX}",
  60. (f"samtools flagstat -@ {threads} -O json {file_base_output_path}{BAM_SUFFIX} > "
  61. f"{file_base_output_path}.flagstat")
  62. ]
  63. for full_command in full_commands:
  64. print(f"Running command: {full_command}")
  65. os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
  66. exit_code = os.waitstatus_to_exitcode(os.system(full_command))
  67. if exit_code != 0:
  68. sys.exit(exit_code)