# Kallisto-Bustools Count subworkflow (https://github.com/ShalekLab/kallisto-bustools_workflow)
# A publicly available WDL workflow made by Shalek Lab for Kallisto and Bustools wrapped within kb_python
# Workflow by jgatter [at] broadinstitute.org, created November 2019. Jointly maintained with Cumulus Team.
# FULL DISCLOSURE: many optional parameters remain untested; post on GitHub with bug reports, etc.
# Kallisto and Bustools software made by Pachter Lab. Documentation: https://www.kallistobus.tools/kb_getting_started.html
# -----------------------------------------------------------------------------------------------------------
# COUNT INSTRUCTIONS: Align your reads and generate a count matrix (use_lamanno=true for RNA velocity)
# ex: kb count --verbose (--lamanno) -i index.idx -g transcripts_to_genes.txt -x DROPSEQ -t 32 -m 256G --filter bustools -o ~/count (use_lamanno==true: -c1 cDNA_t2c.txt -c2 intron_t2c.txt) R1.fastq.gz (R2.fastq.gz)
# Inputs: All outputs from the ref step, technology ("DROPSEQ", "10XV3", "10XV2", see kb --list for more), R1_fastq, optional R2_fastq.
#	Set use_lamanno=true for RNA velocity.
#	Set nucleus to true for calculating RNA velocity on single-nucleus RNA-seq reads.
#	Set h5ad or loom to true for outputting expression matrices in those formats.
#	Barcode whitelist for Seq-Well data will be generated by the program, but if you have one for 10X data you can provide it as an input.
#	There are several memory parameters you can tweak, but I haven't noticed any improvements in speed when adjusting them. I'll investigate it eventually.
#	Specifically, for running with use_lamanno, memory/disk space parameters may require tweaking! Let me know!
# Outputs: Count matrices filtered and unfiltered with their respective barcode and gene lists. Many other files as well.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 1
# Public release.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 2
# Removed failOnStdErr runtime parameter.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 3
# kb now outputs to /cromwell_root rather than the home directory.
# gsutil rsync now transfers out the files rather than gsutil cp -r.
# Modified several runtime parameters
# Task scattering now happens in this subworkflow
# Single-end FASTQ support
# -----------------------------------------------------------------------------------------------------------
version 1.0

# Scatters the `count` task over every row of `sample_sheet` and gathers the
# per-sample matrices and output paths into arrays.
workflow kallisto_bustools_count {
    input {
        # Runtime settings, forwarded unchanged to the `count` task.
        String docker = "shaleklab/kallisto-bustools:0.24.4"
        Int number_cpu_threads = 32
        Int task_memory_GB = 256
        Float program_memory_multiplier = 0.9
        Int preemptible = 1
        String zones = "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c"
        String disks = "local-disk 256 SSD"
        Int boot_disk_size_GB = 10

        # Output destination: bucket plus a path (which may be the empty string).
        String bucket
        String output_path

        # Sample sheet; each row must provide a `Sample` column.
        File sample_sheet

        # Reference files and kb count options.
        File index
        File T2G_mapping
        String technology # DROPSEQ, 10XV1, 10XV2, 10XV3 or see the README for more
        File? barcodes_whitelist
        Boolean use_lamanno
        File? cDNA_transcripts_to_capture
        File? intron_transcripts_to_capture
        Boolean nucleus = false
        Boolean bustools_filter = true
        Boolean loom = false
        Boolean h5ad = false
        Boolean delete_bus_files
    }

    # Normalise the bucket so that it ends in exactly one slash.
    String bucket_trimmed = sub(bucket, "/+$", '')
    String bucket_slash = bucket_trimmed + '/'

    # output_path with trailing slashes removed; the same for every sample,
    # so it is computed once outside the scatter.
    String output_path_trimmed = sub(output_path, "/+$", '')

    scatter (row in read_objects(sample_sheet)) {
        String sample_name = row.Sample

        # Per-sample folder name: "count_<sample_name>".
        String output_folder = if defined(sample_name) then "count_" + sample_name else "count"

        # Folder path for this sample, always terminated by a slash.
        String output_path_slash = if output_path == '' then output_folder + '/' else output_path_trimmed + '/' + output_folder + '/'

        # Remove the bucket prefix in case output_path already contained it.
        String base_output_path_slash = sub(output_path_slash, bucket_slash, '')

        call count {
            input:
                docker=docker,
                number_cpu_threads=number_cpu_threads,
                task_memory_GB=task_memory_GB,
                program_memory_multiplier=program_memory_multiplier,
                preemptible=preemptible,
                zones=zones,
                disks=disks,
                boot_disk_size_GB=boot_disk_size_GB,
                sample_sheet=sample_sheet,
                bucket_slash=bucket_slash,
                output_path_slash=base_output_path_slash,
                output_folder=output_folder,
                index=index,
                T2G_mapping=T2G_mapping,
                technology=technology,
                sample_name=sample_name,
                barcodes_whitelist=barcodes_whitelist,
                use_lamanno=use_lamanno,
                cDNA_transcripts_to_capture=cDNA_transcripts_to_capture,
                intron_transcripts_to_capture=intron_transcripts_to_capture,
                nucleus=nucleus,
                bustools_filter=bustools_filter,
                loom=loom,
                h5ad=h5ad,
                delete_bus_files=delete_bus_files
        }
    }

    output {
        # One element per sample-sheet row, gathered from the scattered task.
        Array[File] counts_unfiltered_matrices = count.counts_unfiltered_matrix
        Array[File] counts_filtered_matrices = count.counts_filtered_matrix
        Array[String] count_output_paths = count.count_output_path
    }
}

task count { input { String docker Int number_cpu_threads Int task_memory_GB Float program_memory_multiplier Int preemptible String zones String disks Int boot_disk_size_GB File sample_sheet String bucket_slash String output_folder String output_path_slash File index File T2G_mapping String technology String sample_name File? barcodes_whitelist Boolean use_lamanno File? cDNA_transcripts_to_capture File?
intron_transcripts_to_capture Boolean nucleus Boolean bustools_filter Boolean loom Boolean h5ad Boolean delete_bus_files } Int program_memory_GB = ceil(task_memory_GB * program_memory_multiplier) command { set -e export TMPDIR=/tmp python <