浏览代码

[7] Updated pipelines

Some new pipeline steps were added.
at-robins 1 年之前
父节点
当前提交
5145312923

+ 19 - 0
pipelines/wiedemann_atac_pipeline/container/alignment_filtering/Dockerfile

@@ -0,0 +1,19 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev ncurses-dev g++ zlib unzip
+RUN wget 'https://github.com/samtools/samtools/archive/refs/tags/1.18.zip' -O samtools.zip && unzip 'samtools.zip' && rm 'samtools.zip'
+RUN wget 'https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2' -O htslib.tar.bz2 && tar -xf 'htslib.tar.bz2' && rm 'htslib.tar.bz2'
+
+WORKDIR /htslib-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+WORKDIR /samtools-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+
+WORKDIR /
+COPY run_alignment_filtering.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 51 - 0
pipelines/wiedemann_atac_pipeline/container/alignment_filtering/run_alignment_filtering.py

@@ -0,0 +1,51 @@
+#!/usr/bin/python
+"""This module runs the post alignment filtering process."""
+
+import json
+import math
+import multiprocessing
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+INPUT_FOLDER = MOUNT_PATHS["dependencies"]["alignment"]
+
+# If a specific environment variable is set, appends the respective option.
+
+threads = math.floor(multiprocessing.cpu_count() * 0.8)
+if threads < 1:
+    threads = 1
+
+view_options = f"-@ {threads} -h "
+
+remove_invalid_reads = os.environ.get("REMOVE_INVALID")
+if remove_invalid_reads is not None and remove_invalid_reads == "true":
+    view_options += "-F 2828 "
+
+quality_filtering = os.environ.get("QUALITY_FILTER")
+if quality_filtering is not None:
+    view_options += f"-q {quality_filtering} "
+
+# Iterates over all sample directories and processes them conserving the directory structure.
+INPUT_SUFFIX = ".bam"
+for root, dirs, files in os.walk(INPUT_FOLDER):
+    if len(files) > 0:
+        for file in files:
+            if file.endswith(INPUT_SUFFIX):
+                file_base_name = file.removesuffix(INPUT_SUFFIX)
+                file_base_input_path = os.path.join(root, file_base_name)
+                file_base_output_path = os.path.join(
+                    MOUNT_PATHS["output"],
+                    file_base_input_path.removeprefix(INPUT_FOLDER + "/")
+                )
+                full_command = f"samtools view {view_options}{file_base_input_path}{INPUT_SUFFIX} "
+                remove_mitochondrial_reads = os.environ.get("REMOVE_M")
+                if remove_mitochondrial_reads is not None and remove_mitochondrial_reads == "true":
+                    full_command += "| awk '{if($3 != \"chrM\"){print $0}}' "
+                full_command += (f"| samtools sort -@ {threads} "
+                f"-O bam -o {file_base_output_path}.bam -")
+                print(f"Running command: {full_command}")
+                os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
+                exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+                if exit_code != 0:
+                    sys.exit(exit_code)

+ 23 - 0
pipelines/wiedemann_atac_pipeline/container/bowtie/Dockerfile

@@ -0,0 +1,23 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev ncurses-dev g++ zlib unzip
+RUN wget 'https://github.com/BenLangmead/bowtie2/archive/refs/tags/v2.5.1.zip' -O bowtie2.zip && unzip 'bowtie2.zip' && rm 'bowtie2.zip'
+RUN wget 'https://github.com/samtools/samtools/archive/refs/tags/1.18.zip' -O samtools.zip && unzip 'samtools.zip' && rm 'samtools.zip'
+RUN wget 'https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2' -O htslib.tar.bz2 && tar -xf 'htslib.tar.bz2' && rm 'htslib.tar.bz2'
+
+WORKDIR /htslib-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+WORKDIR /samtools-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+WORKDIR /bowtie2-2.5.1
+RUN make
+WORKDIR /
+RUN mv /bowtie2-2.5.1 /bowtie2
+
+COPY run_alignment.py /run_alignment.py
+ENTRYPOINT ["python", "run_alignment.py"]

+ 51 - 0
pipelines/wiedemann_atac_pipeline/container/bowtie/run_alignment.py

@@ -0,0 +1,51 @@
+#!/usr/bin/python
+"""This module runs the alignment process."""
+
+import json
+import math
+import multiprocessing
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+INPUT_FOLDER = MOUNT_PATHS["dependencies"]["trimming"]
+
+# If a specific environment variable is set, appends the respective option.
+options_bowtie = ("--end-to-end --no-mixed --dovetail --very-sensitive "
+f"-x {MOUNT_PATHS['globals']['GENOME']}/genome "
+f"--met 10")
+
+options_sort = ""
+threads = math.floor(multiprocessing.cpu_count() * 0.8)
+if threads > 0:
+    options_bowtie += f" --threads {threads}"
+    options_sort += f" -@ {threads}"
+
+
+print("Specified bowtie2 options:" + options_bowtie)
+print("Specified samtools options:" + options_sort)
+
+# Iterates over all sample directories and processes them conserving the directory structure.
+INPUT_SUFFIX_FORWARD = "_1_paired.fq.gz"
+INPUT_SUFFIX_REVERSE = "_2_paired.fq.gz"
+for root, dirs, files in os.walk(INPUT_FOLDER):
+    if len(files) > 0:
+        for file in files:
+            if file.endswith(INPUT_SUFFIX_FORWARD):
+                file_base_name = file.removesuffix(INPUT_SUFFIX_FORWARD)
+                file_base_input_path = os.path.join(root, file_base_name)
+                file_base_output_path = os.path.join(
+                    MOUNT_PATHS["output"],
+                    file_base_input_path.removeprefix(INPUT_FOLDER + "/")
+                )
+                full_command = (f"/bowtie2/bowtie2 {options_bowtie} "
+                f"--met-file {file_base_output_path}_metrics.txt "
+                f"-1 {file_base_input_path}{INPUT_SUFFIX_FORWARD} "
+                f"-2 {file_base_input_path}{INPUT_SUFFIX_REVERSE} | "
+                f"samtools sort {options_sort} "
+                f"-O bam -o {file_base_output_path}.bam -")
+                print(f"Running command: {full_command}")
+                os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
+                exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+                if exit_code != 0:
+                    sys.exit(exit_code)

+ 19 - 0
pipelines/wiedemann_atac_pipeline/container/splitting/Dockerfile

@@ -0,0 +1,19 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev ncurses-dev g++ zlib unzip
+RUN wget 'https://github.com/samtools/samtools/archive/refs/tags/1.18.zip' -O samtools.zip && unzip 'samtools.zip' && rm 'samtools.zip'
+RUN wget 'https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2' -O htslib.tar.bz2 && tar -xf 'htslib.tar.bz2' && rm 'htslib.tar.bz2'
+
+WORKDIR /htslib-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+WORKDIR /samtools-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+
+WORKDIR /
+COPY splitting.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 74 - 0
pipelines/wiedemann_atac_pipeline/container/splitting/splitting.py

@@ -0,0 +1,74 @@
+#!/usr/bin/python
+"""This module splits the filtered alignments by fragment size."""
+
+import json
+import math
+import multiprocessing
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+INPUT_FOLDER = MOUNT_PATHS["dependencies"]["alignment_filtering"]
+
+# If a specific environment variable is set, appends the respective option.
+
+threads = math.floor(multiprocessing.cpu_count() * 0.8)
+if threads < 1:
+    threads = 1
+
+# Iterates over all sample directories and processes them conserving the directory structure.
+BAM_SUFFIX = ".bam"
+for root, dirs, files in os.walk(INPUT_FOLDER):
+    if len(files) > 0:
+        for file in files:
+            if file.endswith(BAM_SUFFIX):
+                file_base_name = file.removesuffix(BAM_SUFFIX)
+                file_base_path = os.path.join(root, file_base_name)
+                file_base_output_path = os.path.join(
+                    MOUNT_PATHS["output"],
+                    file_base_path.removeprefix(INPUT_FOLDER + "/")
+                )
+                #   1 -  99 bp: nucleosome free region
+                # 180 - 247 bp: mononucleosomal region
+                # 315 - 473 bp: dinucleosomal region
+                # 558 - 615 bp: trinucleosomal region
+                full_commands = [
+                    (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
+                    "| awk '{if (($9 < 100 && $9 > -100 && $9 != 0) || $1 ~ /^@/) {print $0}}' "
+                    f"| samtools sort -@ {threads} "
+                    f"-O bam -o {file_base_output_path}_nucelosomefree.bam -"),
+                    (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
+                    "| awk '{if ((($9 <= 247 && $9 >= 180) || ($9 >= -247 && $9 <= -180)) || $1 ~ /^@/) {print $0}}' "
+                    f"| samtools sort -@ {threads} "
+                    f"-O bam -o {file_base_output_path}_mononucelosomal.bam -"
+                    ),
+                    (f"samtools view -@ {threads} -h {file_base_path}{BAM_SUFFIX} "
+                    "| awk '{if ((($9 <= 473 && $9 >= 315) || ($9 >= -473 && $9 <= -315)) || $1 ~ /^@/) {print $0}}' "
+                    f"| samtools sort -@ {threads} "
+                    f"-O bam -o {file_base_output_path}_dinucelosomal.bam -"
+                    )
+                ]
+                for full_command in full_commands:
+                    print(f"Running command: {full_command}")
+                    os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
+                    exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+                    if exit_code != 0:
+                        sys.exit(exit_code)
+
+for root, dirs, files in os.walk(MOUNT_PATHS["output"]):
+    if len(files) > 0:
+        for file in files:
+            if file.endswith(BAM_SUFFIX):
+                file_base_name = file.removesuffix(BAM_SUFFIX)
+                file_base_output_path = os.path.join(root, file_base_name)
+                full_commands = [
+                    f"samtools index -@ {threads} {file_base_output_path}{BAM_SUFFIX}",
+                    (f"samtools flagstat -@ {threads} -O json {file_base_output_path}{BAM_SUFFIX} > "
+                    f"{file_base_output_path}.flagstat")                   
+                ]
+                for full_command in full_commands:
+                    print(f"Running command: {full_command}")
+                    os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
+                    exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+                    if exit_code != 0:
+                        sys.exit(exit_code)

+ 27 - 4
pipelines/wiedemann_atac_pipeline/container/trimmomatic/run_trimming.py

@@ -6,10 +6,9 @@ import math
 import multiprocessing
 import os
 import sys
+from contextlib import suppress
 
 BASE_COMMAND = "java -jar /Trimmomatic-0.39/trimmomatic-0.39.jar PE"
-STEP_OPTIONS = ("ILLUMINACLIP:/Trimmomatic-0.39/adapters/NexteraPE-PE.fa:2:30:10:2:True "
-"LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36")
 MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
 INPUT_FOLDER = MOUNT_PATHS["input"] + "/"
 
@@ -34,6 +33,30 @@ if not options:
 else:
     print("Specified options:" + options)
 
+# Define the step options.
+step_options = ""
+adapters = ""
+
+with suppress(Exception):
+    adapters = f"{MOUNT_PATHS['globals']['ADAPTERS_CUSTOM']}/trimming_adapters.fa"
+
+adapters_fixed = os.environ.get("ADAPTERS_FIXED")
+if not adapters and adapters_fixed is not None:
+    adapters = f"/Trimmomatic-0.39/adapters/{adapters_fixed}"
+else:
+    # Defaults to Nextera adapters as those are standard for ATAC sequencing.
+    adapters = "/Trimmomatic-0.39/adapters/NexteraPE-PE.fa"
+
+if adapters:
+    step_options += f" ILLUMINACLIP:{adapters}:2:30:10:2:True"
+
+step_options += " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"
+
+if not options:
+    print("Running with default step options.")
+else:
+    print("Specified step options:" + options)
+
 # Iterates over all sample directories and processes them conserving the directory structure.
 for root, dirs, files in os.walk(INPUT_FOLDER):
     if len(files) > 0:
@@ -59,8 +82,8 @@ for root, dirs, files in os.walk(INPUT_FOLDER):
                 f"{file_base_output_path}_1_paired.fq.gz "
                 f"{file_base_output_path}_1_unpaired.fq.gz "
                 f"{file_base_output_path}_2_paired.fq.gz "
-                f"{file_base_output_path}_2_unpaired.fq.gz "
-                f"{STEP_OPTIONS}")
+                f"{file_base_output_path}_2_unpaired.fq.gz"
+                f"{step_options}")
                 os.makedirs(os.path.dirname(file_base_output_path), exist_ok = True)
                 exit_code = os.waitstatus_to_exitcode(os.system(full_command))
                 if exit_code != 0:

+ 97 - 3
pipelines/wiedemann_atac_pipeline/pipeline.json

@@ -6,7 +6,7 @@
     {
       "id": "qc_initial",
       "name": "Initial QC",
-      "description": "Performs an initial quality control of sequenced reads.",
+      "description": "<p>Performs an initial quality control of sequenced reads.</p><p><b>References</b><ol><li>Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: <a href=\"http://www.bioinformatics.babraham.ac.uk/projects/fastqc\">Babraham Bioinformatics</a></li></ol></p>",
       "container": "fastqc_initial",
       "dependencies": [],
       "variables": [
@@ -39,10 +39,45 @@
     {
       "id": "trimming",
       "name": "Trimming",
-      "description": "Performs trimming of adapters and low quality bases.",
+      "description": "<p>Performs trimming of adapters and low quality bases.</p><p><b>References</b><ol><li>Bolger, A. M., Lohse, M., & Usadel, B. (2014). Trimmomatic: A flexible trimmer for Illumina Sequence Data. Bioinformatics, btu170.</li></ol></p>",
       "container": "trimmomatic",
       "dependencies": [],
       "variables": [
+        {
+          "id": "ADAPTERS_CUSTOM",
+          "name": "Custom adapter sequences",
+          "description": "A custom list of sequencing adapters. The global data repository must contain a file called <var>trimming_adapters.fa</var> at its root. This will overwrite any of the predefined adapter sequences.",
+          "category": {
+            "tag": "Global"
+          }
+        },
+        {
+          "id": "ADAPTERS_FIXED",
+          "name": "Predefined adapter sequences",
+          "description": "Specify which of the predefined adapter sequences to use for trimming.",
+          "required": false,
+          "category": {
+            "tag": "Option",
+            "content": [
+              {
+                "name": "Nextera-PE",
+                "value": "NexteraPE-PE.fa"
+              },
+              {
+                "name": "TruSeq2-PE",
+                "value": "TruSeq2-PE.fa"
+              },
+              {
+                "name": "TruSeq3-PE",
+                "value": "TruSeq3-PE.fa"
+              },
+              {
+                "name": "TruSeq3-PE-2",
+                "value": "TruSeq3-PE-2.fa"
+              }
+            ]
+          }
+        },
         {
           "id": "PHRED",
           "name": "PHRED score",
@@ -67,7 +102,7 @@
     {
       "id": "qc_trimming",
       "name": "Trimming QC",
-      "description": "Performs a quality control of the trimmed reads.",
+      "description": "<p>Performs a quality control of the trimmed reads.</p><p><b>References</b><ol><li>Andrews S. (2010). FastQC: a quality control tool for high throughput sequence data. Available online at: <a href=\"http://www.bioinformatics.babraham.ac.uk/projects/fastqc\">Babraham Bioinformatics</a></li></ol></p>",
       "container": "fastqc_trimming",
       "dependencies": ["trimming"],
       "variables": [
@@ -96,6 +131,65 @@
           }
         }
       ]
+    },
+    {
+      "id": "alignment",
+      "name": "Alignment",
+      "description": "<p>Aligns trimmed reads against a reference genome and sorts the output.</p><p><b>References</b><ol><li>Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.</li><li>HTSlib: C library for reading/writing high-throughput sequencing data. James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies. GigaScience, Volume 10, Issue 2, February 2021, giab007, <a href=\"https://doi.org/10.1093/gigascience/giab007\">https://doi.org/10.1093/gigascience/giab007</a></li><li>Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. GigaScience, Volume 10, Issue 2, February 2021, giab008, <a href=\"https://doi.org/10.1093/gigascience/giab008\">https://doi.org/10.1093/gigascience/giab008</a></li></ol></p>",
+      "container": "bowtie",
+      "dependencies": ["trimming"],
+      "variables": [
+        {
+          "id": "GENOME",
+          "name": "Reference genome",
+          "description": "The reference genome to align the reads against. The global data repository must contain a file called <var>genome.fa</var> and the according indices at its root. Indices can be generated by the reference genome preprocessing pipeline.",
+          "category": {
+            "tag": "Global"
+          },
+          "required": true
+        }
+      ]
+    },
+    {
+      "id": "alignment_filtering",
+      "name": "Post alignment filtering",
+      "description": "<p>Filters low quality and misaligned reads.</p><p><b>References</b><ol><li>HTSlib: C library for reading/writing high-throughput sequencing data. James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies. GigaScience, Volume 10, Issue 2, February 2021, giab007, <a href=\"https://doi.org/10.1093/gigascience/giab007\">https://doi.org/10.1093/gigascience/giab007</a></li><li>Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. GigaScience, Volume 10, Issue 2, February 2021, giab008, <a href=\"https://doi.org/10.1093/gigascience/giab008\">https://doi.org/10.1093/gigascience/giab008</a></li></ol></p>",
+      "container": "alignment_filtering",
+      "dependencies": ["alignment"],
+      "variables": [
+        {
+          "id": "REMOVE_M",
+          "name": "Remove mitochondrial reads",
+          "description": "Remove all reads mapping to the mitochondrial genome.",
+          "category": {
+            "tag": "Boolean"
+          }
+        },
+        {
+          "id": "REMOVE_INVALID",
+          "name": "Removes misaligned reads",
+          "description": "Remove all read pairs that are not properly aligned.",
+          "category": {
+            "tag": "Boolean"
+          }
+        },
+        {
+          "id": "QUALITY_FILTER",
+          "name": "Quality filtering",
+          "description": "Filters all alignments with a PHRED quality score lower than the defined value.",
+          "category": {
+            "tag": "Number"
+          }
+        }
+      ]
+    },
+    {
+      "id": "splitting",
+      "name": "Splitting",
+      "description": "<p>Splits the alignment into nucleosome-specific chunks.</p><p><b>References</b><ol><li>Buenrostro JD, Giresi PG, Zaba LC, Chang HY, Greenleaf WJ. Transposition of native chromatin for fast and sensitive epigenomic profiling of open chromatin, DNA-binding proteins and nucleosome position. Nat Methods. 2013 Dec;10(12):1213-8. doi: 10.1038/nmeth.2688. Epub 2013 Oct 6. PMID: 24097267; PMCID: PMC3959825.</li><li>HTSlib: C library for reading/writing high-throughput sequencing data. James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies. GigaScience, Volume 10, Issue 2, February 2021, giab007, <a href=\"https://doi.org/10.1093/gigascience/giab007\">https://doi.org/10.1093/gigascience/giab007</a></li><li>Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. GigaScience, Volume 10, Issue 2, February 2021, giab008, <a href=\"https://doi.org/10.1093/gigascience/giab008\">https://doi.org/10.1093/gigascience/giab008</a></li></ol></p>",
+      "container": "splitting",
+      "dependencies": ["alignment_filtering"],
+      "variables": []
     }
   ]
 }

+ 11 - 0
pipelines/wiedemann_genome_preprocess/container/bowtie/Dockerfile

@@ -0,0 +1,11 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev ncurses-dev g++ zlib unzip
+RUN wget 'https://github.com/BenLangmead/bowtie2/archive/refs/tags/v2.5.1.zip' -O bowtie2.zip && unzip 'bowtie2.zip' && rm 'bowtie2.zip'
+
+WORKDIR /bowtie2-2.5.1
+RUN make
+WORKDIR /
+RUN mv /bowtie2-2.5.1 /bowtie2
+
+COPY run.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 28 - 0
pipelines/wiedemann_genome_preprocess/container/bowtie/run.py

@@ -0,0 +1,28 @@
+#!/usr/bin/python
+"""This module runs the bowtie2 indexing."""
+
+import json
+import math
+import multiprocessing
+import os
+import sys
+
+BASE_COMMAND = "/bowtie2/bowtie2-build"
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+
+# If a specific environment variable is set, appends the respective option.
+options = ""
+
+threads = math.floor(multiprocessing.cpu_count() * 0.8)
+if threads > 0:
+    options += f" --threads {threads}"
+
+if not options:
+    print("Running with default options.")
+else:
+    print("Specified options:" + options)
+
+full_command = f"{BASE_COMMAND}{options} {MOUNT_PATHS['dependencies']['unpack']}/genome.fa {MOUNT_PATHS['output']}/genome"
+exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+if exit_code != 0:
+    sys.exit(exit_code)

+ 19 - 0
pipelines/wiedemann_genome_preprocess/container/samtools/Dockerfile

@@ -0,0 +1,19 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev ncurses-dev unzip coreutils
+RUN wget 'https://github.com/samtools/samtools/archive/refs/tags/1.18.zip' -O samtools.zip && unzip 'samtools.zip' && rm 'samtools.zip'
+RUN wget 'https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2' -O htslib.tar.bz2 && tar -xf 'htslib.tar.bz2' && rm 'htslib.tar.bz2'
+
+WORKDIR /htslib-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+WORKDIR /samtools-1.18
+RUN autoreconf -i
+RUN ./configure
+RUN make
+RUN make install
+
+WORKDIR /
+COPY run.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 24 - 0
pipelines/wiedemann_genome_preprocess/container/samtools/run.py

@@ -0,0 +1,24 @@
+#!/usr/bin/python
+"""This module runs the samtools indexing."""
+
+import json
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+
+samtools_command = ("samtools faidx "
+                    f"{MOUNT_PATHS['dependencies']['unpack']}/genome.fa "
+                    f"-o {MOUNT_PATHS['output']}/genome.fa.fai")
+
+exit_code = os.waitstatus_to_exitcode(os.system(samtools_command))
+if exit_code != 0:
+    sys.exit(exit_code)
+
+cut_command = ("cut -f 1,2 "
+                f"{MOUNT_PATHS['output']}/genome.fa.fai "
+                f"> {MOUNT_PATHS['output']}/genome.chrom.sizes")
+
+exit_code = os.waitstatus_to_exitcode(os.system(cut_command))
+if exit_code != 0:
+    sys.exit(exit_code)

+ 36 - 0
pipelines/wiedemann_genome_preprocess/container/seqoutbias/Dockerfile

@@ -0,0 +1,36 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache curl make git cairo-dev pango-dev gcc g++
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- --default-toolchain=1.73.0 -y
+# Each RUN instruction starts a fresh shell, so sourcing the cargo env file
+# would not persist; extend PATH instead to expose cargo to later build steps.
+ENV PATH="/root/.cargo/bin:${PATH}"
+RUN wget 'https://github.com/genometools/genometools/archive/refs/tags/v1.6.4.tar.gz' -O genometools.tar.gz && tar -xvf 'genometools.tar.gz' && rm 'genometools.tar.gz'
+
+
+# RUN wget 'https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2' -O htslib.tar.bz2 && tar -xf 'htslib.tar.bz2' && rm 'htslib.tar.bz2'
+
+WORKDIR /genometools-1.6.4
+# RUN autoreconf -i
+# RUN ./configure
+RUN make -j4
+RUN make install
+
+WORKDIR /
+RUN git clone git://genome-source.soe.ucsc.edu/kent.git
+RUN apk add --no-cache mariadb-connector-c-dev bash shadow
+WORKDIR /kent/src
+# Sets bash as default shell to prevent errors during makefile script execution.
+# The replacement must be the absolute path, otherwise the login shell entry is invalid.
+RUN sed -i 's@/bin/ash@/bin/bash@g' /etc/passwd
+# Skips building the hg directory since we only need the wigToBigWig utility.
+# Also prevents a current error while building the hg utilities.
+RUN sed -i 's@cd hg && ${MAKE} utils@echo "skipping hg"@g' makefile
+RUN make
+
+# WORKDIR /samtools-1.18
+# RUN autoreconf -i
+# RUN ./configure
+# RUN make
+# RUN make install
+
+WORKDIR /
+RUN wigToBigWig
+COPY run.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 24 - 0
pipelines/wiedemann_genome_preprocess/container/seqoutbias/run.py

@@ -0,0 +1,24 @@
+#!/usr/bin/python
+"""This module runs the samtools indexing."""
+
+import json
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+
+samtools_command = ("samtools faidx "
+                    f"{MOUNT_PATHS['depenencies']['unpack']}/genome.fa "
+                    f"-o {MOUNT_PATHS['output']}/genome.fa.fai")
+
+exit_code = os.waitstatus_to_exitcode(os.system(samtools_command))
+if exit_code != 0:
+    sys.exit(exit_code)
+
+cut_command = ("cut -f 1,2 "
+                f"-o {MOUNT_PATHS['output']}/genome.fa.fai"
+                f"-o {MOUNT_PATHS['output']}/genome.chrom.sizes")
+
+exit_code = os.waitstatus_to_exitcode(os.system(cut_command))
+if exit_code != 0:
+    sys.exit(exit_code)

+ 4 - 0
pipelines/wiedemann_genome_preprocess/container/unpack/Dockerfile

@@ -0,0 +1,4 @@
+FROM python:3.11.6-alpine3.18
+RUN apk -U upgrade && apk add --no-cache gzip
+COPY run.py /run.py
+ENTRYPOINT ["python", "run.py"]

+ 19 - 0
pipelines/wiedemann_genome_preprocess/container/unpack/run.py

@@ -0,0 +1,19 @@
+#!/usr/bin/python
+"""This module unpacks the genome file."""
+
+import json
+import os
+import sys
+
+MOUNT_PATHS = json.loads(os.environ.get("MOUNT_PATHS"))
+
+# Extract file.
+for root, dirs, files in os.walk(MOUNT_PATHS['input']):
+    for file in files:
+        if file.casefold().endswith(".fa.gz"):
+            file_input_path = os.path.join(root, file)
+            full_command = (f"gzip -dkc {file_input_path} "
+            f"> {MOUNT_PATHS['output']}/genome.fa")
+            exit_code = os.waitstatus_to_exitcode(os.system(full_command))
+            if exit_code != 0:
+                sys.exit(exit_code)

+ 40 - 0
pipelines/wiedemann_genome_preprocess/pipeline.json

@@ -0,0 +1,40 @@
+{
+  "id": "wiedemann_genome_preprocess",
+  "name": "Wiedemann reference genome preprocessing pipeline",
+  "description": "<p>This pipeline preprocesses a reference genome for usage as a data repository for other pipelines.</p><p><b>Input specifications</b><br />The input is expected to be a gzipped FASTA file (<var>.fa.gz</var>) located at the root.</p>",
+  "steps": [
+    {
+      "id": "unpack",
+      "name": "Unpacking",
+      "description": "<p>Unpacks and renames the genome file.</p>",
+      "container": "unpack",
+      "dependencies": [],
+      "variables": []
+    },
+    {
+      "id": "samtools_index",
+      "name": "Samtools indexing",
+      "description": "<p>Uses Samtools to index the genome and generate a chromosome size file.</p><p><b>References</b><br />HTSlib: C library for reading/writing high-throughput sequencing data. James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies. GigaScience, Volume 10, Issue 2, February 2021, giab007, <a href=\"https://doi.org/10.1093/gigascience/giab007\">https://doi.org/10.1093/gigascience/giab007</a><br />Twelve years of SAMtools and BCFtools. Petr Danecek, James K Bonfield, Jennifer Liddle, John Marshall, Valeriu Ohan, Martin O Pollard, Andrew Whitwham, Thomas Keane, Shane A McCarthy, Robert M Davies, Heng Li. GigaScience, Volume 10, Issue 2, February 2021, giab008, <a href=\"https://doi.org/10.1093/gigascience/giab008\">https://doi.org/10.1093/gigascience/giab008</a></p>",
+      "container": "samtools",
+      "dependencies": ["unpack"],
+      "variables": []
+    },
+    {
+      "id": "bowtie2_index",
+      "name": "Bowtie2 indexing",
+      "description": "<p>Uses Bowtie2 to index the genome.</p><p><b>References</b><br />Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.</p>",
+      "container": "bowtie",
+      "dependencies": ["unpack"],
+      "variables": []
+    }
+    ,
+    {
+      "id": "seqoutbias_preprocessing",
+      "name": "SeqOutBias preprocessing",
+      "description": "<p>Uses SeqOutBias to generate tallymer files for the reference genome.</p><p><b>References</b><br />Langmead B, Salzberg S. Fast gapped-read alignment with Bowtie 2. Nature Methods. 2012, 9:357-359.</p>",
+      "container": "seqoutbias",
+      "dependencies": ["unpack"],
+      "variables": []
+    }
+  ]
+}