# Kallisto-Bustools Count subworkflow (https://github.com/ShalekLab/kallisto-bustools_workflow)
# A publicly available WDL workflow made by Shalek Lab for Kallisto and Bustools wrapped within kb_python
# Workflow by jgatter [at] broadinstitute.org, created November 2019. Jointly maintained with Cumulus Team.
# FULL DISCLOSURE: many optional parameters remain untested, post on GitHub with bug reports, etc.
# Kallisto and Bustools software made by Pachter Lab. Documentation: https://www.kallistobus.tools/kb_getting_started.html
# -----------------------------------------------------------------------------------------------------------
# COUNT INSTRUCTIONS: Align your reads and generate a count matrix (use_lamanno=true for RNA velocity)
# ex: kb count --verbose (--lamanno) -i index.idx -g transcripts_to_genes.txt -x DROPSEQ -t 32 -m 256G --filter bustools -o ~/count (use_lamanno==true: -c1 cDNA_t2c.txt -c2 intron_t2c.txt) R1.fastq.gz (R2.fastq.gz)
# Inputs: All outputs from the ref step, technology (“DROPSEQ”, “10XV3”, “10XV2”, see kb --list for more), R1_fastq, optional R2_fastq, set use_lamanno=true for RNA velocity,
#	Set nucleus to true for calculating RNA velocity on single-nucleus RNA-seq reads 
#	h5ad or loom to true for outputting expression matrices in those formats.	
#	Barcode whitelist for Seq-Well data will be generated by the program, but if you have one for 10X data you can provide it as an input.
# 	There are several memory parameters you can tweak, but I haven't noticed any improvements in speed when adjusting them. I'll investigate it eventually.
#	Specifically, for running with use_lamanno, memory/disk space parameters may require tweaking! Let me know!
# Outputs: Count matrices filtered and unfiltered with their respective barcode and gene lists. Many other files as well.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 1
# Public release.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 2
# Removed failOnStdErr runtime parameter.
# -----------------------------------------------------------------------------------------------------------
# SNAPSHOT 3
# kb now outputs to /cromwell_root rather than the home directory.
# gsutil rsync now transfers out the files rather than gsutil cp -r.
# Modified several runtime parameters
# Task scattering now happens in this subworkflow
# Single-end FASTQ support
# -----------------------------------------------------------------------------------------------------------

version 1.0

workflow kallisto_bustools_count {
	input {
		# Runtime settings, forwarded unchanged to every scattered count task.
		String docker = "shaleklab/kallisto-bustools:0.24.4"
		Int number_cpu_threads = 32
		Int task_memory_GB = 256
		Float program_memory_multiplier = 0.9
		Int preemptible = 1
		String zones = "us-central1-a us-central1-b us-central1-c us-central1-f us-east1-b us-east1-c us-east1-d us-west1-a us-west1-b us-west1-c"
		String disks = "local-disk 256 SSD"
		Int boot_disk_size_GB = 10

		# Output destination and the sample sheet (read with read_objects, must have a 'Sample' column).
		String bucket
		String output_path
		File sample_sheet

		# Reference inputs for kb count.
		File index
		File T2G_mapping
		String technology # DROPSEQ, 10XV1, 10XV2, 10XV3 or see the README for more

		# Optional inputs and kb count switches.
		File? barcodes_whitelist
		Boolean use_lamanno
		File? cDNA_transcripts_to_capture
		File? intron_transcripts_to_capture
		Boolean nucleus = false
		Boolean bustools_filter = true
		Boolean loom = false
		Boolean h5ad = false
		Boolean delete_bus_files
	}

	# Bucket URL normalized to end in exactly one '/'.
	String bucket_slash = sub(bucket, "/+$", '') + '/'

	# One count task per row of the sample sheet.
	scatter (sheet_row in read_objects(sample_sheet)) {
		String sample_name = sheet_row.Sample

		# Per-sample folder name; it is both the local kb output directory and the last path component in the bucket.
		String output_folder = if defined(sample_name) then "count_" + sample_name else "count"

		# output_path with trailing slashes collapsed and the per-sample folder appended.
		String sample_output_path_slash = if output_path == '' then output_folder + '/' else sub(output_path, "/+$", '') + '/' + output_folder + '/'

		# Strip the bucket prefix (if present) so the task can re-attach bucket_slash itself.
		String relative_output_path_slash = sub(sample_output_path_slash, bucket_slash, '')

		call count {
			input:
				# Runtime
				docker = docker,
				number_cpu_threads = number_cpu_threads,
				task_memory_GB = task_memory_GB,
				program_memory_multiplier = program_memory_multiplier,
				preemptible = preemptible,
				zones = zones,
				disks = disks,
				boot_disk_size_GB = boot_disk_size_GB,
				# Sample and destination
				sample_sheet = sample_sheet,
				sample_name = sample_name,
				bucket_slash = bucket_slash,
				output_path_slash = relative_output_path_slash,
				output_folder = output_folder,
				# kb count arguments
				index = index,
				T2G_mapping = T2G_mapping,
				technology = technology,
				barcodes_whitelist = barcodes_whitelist,
				use_lamanno = use_lamanno,
				cDNA_transcripts_to_capture = cDNA_transcripts_to_capture,
				intron_transcripts_to_capture = intron_transcripts_to_capture,
				nucleus = nucleus,
				bustools_filter = bustools_filter,
				loom = loom,
				h5ad = h5ad,
				delete_bus_files = delete_bus_files
		}
	}

	# Gathered per-sample results, in sample sheet order.
	output {
		Array[File] counts_unfiltered_matrices = count.counts_unfiltered_matrix
		Array[File] counts_filtered_matrices = count.counts_filtered_matrix
		Array[String] count_output_paths = count.count_output_path
	}
}

# Runs `kb count` for one sample of the sample sheet and syncs the results to the bucket.
# Steps performed by the command section:
#   1. Look up the sample's comma-separated R1_Paths/R2_Paths in the sample sheet (TSV).
#   2. Copy those FASTQs into the working directory with gsutil.
#   3. Run kb count, writing into the local directory named by output_folder.
#   4. Optionally delete the *.bus files, then gsutil rsync output_folder to
#      bucket_slash + output_path_slash.
# Outputs are bucket path strings built from the inputs; they are not checked for existence
# (counts_filtered_matrix is reported even when bustools_filter is false).
task count {
	input {
		# Runtime settings
		String docker
		Int number_cpu_threads
		Int task_memory_GB
		Float program_memory_multiplier
		Int preemptible
		String zones
		String disks
		Int boot_disk_size_GB

		# Sample sheet (TSV with 'Sample', 'R1_Paths' and 'R2_Paths' columns) and destination
		File sample_sheet
		String bucket_slash
		String output_folder
		String output_path_slash
		File index
		File T2G_mapping
		String technology
		
		# Sample to process and optional kb count switches
		String sample_name
		File? barcodes_whitelist
		Boolean use_lamanno
		File? cDNA_transcripts_to_capture
		File? intron_transcripts_to_capture
		Boolean nucleus
		Boolean bustools_filter
		Boolean loom
		Boolean h5ad
		Boolean delete_bus_files
	}
	# Memory handed to kb via -m: a fraction of the task memory, rounded up.
	Int program_memory_GB = ceil(task_memory_GB * program_memory_multiplier)

	command {
		set -e
		export TMPDIR=/tmp

		python <<CODE
		import pandas as pd
		import os.path as osp
		
		sample="~{sample_name}"
		pd.options.display.max_colwidth = 100000 # Ensure the entire cell prints out
		df = pd.read_csv("~{sample_sheet}", sep='\t', header=0)
		
		# Both columns are required, so fail if EITHER one is missing.
		if "R1_Paths" not in df.columns or "R2_Paths" not in df.columns:
			raise Exception("ERROR: You must include both a 'R1_Paths' column and an 'R2_Paths' column in your sample sheet. "
				"If you have only one single-pair FASTQ, enter 'null' under 'R2_Paths' for each entry"
			)
		
		# Compare as strings so that numeric-looking sample names (parsed as ints by pandas) still match.
		sample_rows = df["Sample"].astype(str) == sample
		if not sample_rows.any():
			raise Exception("ERROR: Sample '" + sample + "' was not found in the 'Sample' column of the sample sheet.")
		R1_paths = df.loc[sample_rows, "R1_Paths"].to_string(index=False).strip().split(',')
		R2_paths = df.loc[sample_rows, "R2_Paths"].to_string(index=False).strip().split(',')
		
		fastq_pairs_zipped = list(zip(R1_paths, R2_paths))
		fastq_pairs = list(sum(fastq_pairs_zipped, ())) # Flatten the list of pairs.
		
		# NOTE: Values that will render as NaN:
		# '#N/A', '#NA', '-NaN', '-nan','N/A', 'NA', 'NULL', 'NaN', 'n/a', 'nan', 'null'
		fastq_pairs_filtered = [fastq for fastq in fastq_pairs if fastq != "NaN"]
		if not fastq_pairs_filtered:
			raise Exception("ERROR: No FASTQs were detected. If you have more than one FASTQ pair per sample, "
				"please separate FASTQs with only a comma, no whitespace. "
				"If you have only one single-pair FASTQ, enter 'null' under 'R2_Paths' for each entry"
			)
		
		# The URI list is fed to gsutil, so it must contain nothing but URLs.
		with open("fastq_pairs_gsURIs.tsv", 'w') as uri_file:
			uri_file.write('\n'.join(fastq_pairs_filtered))
		
		# The localized list is spliced onto the kb count command line; the single-end
		# flag belongs here, in front of the lone FASTQ, not in the gsutil URL list.
		# NOTE(review): confirm the kb version in the docker image accepts --single.
		fastq_pairs_localized = [osp.basename(fastq) for fastq in fastq_pairs_filtered]
		if len(fastq_pairs_filtered) == 1:
			fastq_pairs_localized.insert(0, "--single")
		with open("fastq_pairs_localized.tsv", 'w') as localized_file:
			localized_file.write(' '.join(fastq_pairs_localized))
		CODE
		
		# Copy every listed FASTQ into the working directory (URLs are read from stdin).
		cat fastq_pairs_gsURIs.tsv | gsutil -m cp -I .

		kb count --verbose \
			-i ~{index} \
			-g ~{T2G_mapping} \
			-x ~{technology} \
			-o ~{output_folder} \
			~{"-w "+barcodes_whitelist} \
			~{true="--lamanno" false='' use_lamanno} \
			~{"-c1 "+cDNA_transcripts_to_capture} \
			~{"-c2 "+intron_transcripts_to_capture} \
			~{true="--nucleus" false='' nucleus} \
			~{true="--filter bustools" false='' bustools_filter} \
			~{true="--loom" false='' loom} \
			~{true="--h5ad" false='' h5ad} \
			~{"-t "+number_cpu_threads} \
			~{"-m "+program_memory_GB+'G'} \
			$(cat fastq_pairs_localized.tsv)
		
		# Drop the BUS files before the upload when requested.
		if [ "~{delete_bus_files}" = "true" ]; then
			rm -vf ~{output_folder}/*.bus
		fi
		
		gsutil -m rsync -r ~{output_folder} ~{bucket_slash}~{output_path_slash}
	}
	output {
		String count_output_path = "~{bucket_slash}~{output_path_slash}"
		String counts_unfiltered_matrix = "~{bucket_slash}~{output_path_slash}counts_unfiltered/cells_x_genes.mtx"
		String counts_filtered_matrix = "~{bucket_slash}~{output_path_slash}counts_filtered/cells_x_genes.mtx"
	}
	runtime {
		docker: "~{docker}"
		preemptible: preemptible
		memory: "~{task_memory_GB}G"
		zones: "~{zones}"
		bootDiskSizeGb: boot_disk_size_GB
		disks: "~{disks}"
		cpu: number_cpu_threads
	}
}