# Splits the batch description table into the per-column files used downstream.
#
# Inputs:
#   batchInfo - table processed with `cut`, so columns are tab-delimited.
#               Column 1 is used as the batch identifier. Columns 2 and 4
#               presumably hold the metadata path and the CNV calls path
#               (inferred from the output names - confirm against the
#               producer of this file). Column 3 is not read here.
# Outputs:
#   batchList         - column 1, one entry per input row
#   batchMdPathMap    - column 1 -> column 2
#   batchCnvCallsList - column 4, one entry per input row
task parseBatchInfo {
    File batchInfo

    command {
        cut -f 1 ${batchInfo} > batch.list
        cut -f 1-2 ${batchInfo} > batch_md_path.map
        cut -f 4 ${batchInfo} > batch_cnv_calls_path.list
    }

    output {
        Array[String] batchList = read_lines("batch.list")
        Map[String, String] batchMdPathMap = read_map("batch_md_path.map")
        Array[String] batchCnvCallsList = read_lines("batch_cnv_calls_path.list")
    }

    runtime {
        docker: "skashin/genome-strip:latest"
    }
}

# Merges the per-batch CNV call files into a single sites VCF.
#
# Inputs:
#   cnvCallsList    - CNV call files, one per batch
#   referenceBundle - reference bundle handed to the merge script
#   diskSize        - size of the local disk (passed to "local-disk <n> HDD")
#   numPreempt      - value for the `preemptible` runtime attribute
# Outputs:
#   mergedSitesVcf  - gs_cnv.sites.vcf.gz, expected in the working directory
#                     after merge_cnv_calls.sh has run
task mergeCnvCalls {
    Array[File] cnvCallsList
    File referenceBundle

    Int diskSize
    Int numPreempt

    # The localized paths are written one per line into cnv_calls.list; the
    # list is built by splitting on spaces, so paths must not contain spaces.
    command {
        echo ${sep=' ' cnvCallsList} | tr ' ' '\n' > cnv_calls.list
        $SV_DIR/scripts/firecloud/cnv/merge_cnv_calls.sh \
            cnv_calls.list \
            ${referenceBundle}
    }

    output {
        File mergedSitesVcf = "gs_cnv.sites.vcf.gz"
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        disks: "local-disk ${diskSize} HDD"
        preemptible: "${numPreempt}"
    }
}

# Computes the genotyping partitions for a sites VCF.
#
# Inputs:
#   vcfFile         - VCF to partition
#   parallelRecords - passed to compute_vcf_partitions.sh; presumably the
#                     number of records per partition (confirm in the script)
#   numPreempt      - value for the `preemptible` runtime attribute
# Outputs:
#   partitionFile   - partitions.dat as written by the script
#   partitionList   - first tab-delimited column of partitions.dat, one entry
#                     per line
task setupGenotyping {
    File vcfFile
    Int parallelRecords

    Int numPreempt

    command {
        $SV_DIR/scripts/firecloud/compute_vcf_partitions.sh \
            ${vcfFile} \
            ${parallelRecords} \
            partitions.dat
        cut -f1 partitions.dat > partitions.list
    }

    output {
        File partitionFile = "partitions.dat"
        Array[String] partitionList = read_lines("partitions.list")
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        preemptible: "${numPreempt}"
    }
}

# Genotypes one partition of the sites VCF for one batch.
#
# Inputs:
#   vcfFile            - sites VCF to genotype
#   batch              - batch identifier; not used by the command, only
#                        echoed back through the `batchList` output
#   partitionFile      - table whose first column is the partition name and
#                        whose second tab-delimited column is the partition
#                        argument
#   partitionName      - name of the partition to process
#   mdPath             - metadata for the batch (also drives the disk size)
#   referenceBundle    - reference bundle handed to the genotyping script
#   credentialsKeyFile - credentials file handed to the genotyping script
#   memory             - memory request in GB
#   numPreempt         - value for the `preemptible` runtime attribute
# Outputs:
#   batchList          - the `batch` input, returned so gathered results can
#                        be matched back to their batch
#   partitionVcf       - gtrun/<partitionName>.genotypes.vcf.gz
task runParallelGenotyper {
    File vcfFile
    String batch
    File partitionFile
    String partitionName
    File mdPath
    File referenceBundle
    File credentialsKeyFile

    Int memory
    Int numPreempt

    # Disk request: rounded size of mdPath (in "G" units) plus 30.
    Int diskSize = round(size(mdPath, "G")) + 30

    # The row for this partition is selected from partitionFile and its second
    # column is stored in partition.arg, then passed as one quoted argument.
    # The literal NULL and true arguments are positional parameters of
    # run_parallel_genotyper.sh; their meaning is defined by that script.
    command {
        awk -v partitionName=${partitionName} '$1 == partitionName' ${partitionFile} | cut -f 2 > partition.arg

        $SV_DIR/scripts/firecloud/genotyping/run_parallel_genotyper.sh \
            ${vcfFile} \
            ${partitionName} \
            "$(cat partition.arg)" \
            ${mdPath} \
            NULL \
            ${referenceBundle} \
            true \
            ${credentialsKeyFile}
    }

    output {
        String batchList = "${batch}"
        File partitionVcf = "gtrun/${partitionName}.genotypes.vcf.gz"
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        memory: "${memory}GB"
        disks: "local-disk ${diskSize} HDD"
        preemptible: "${numPreempt}"
    }
}

# Selects, from two index-aligned arrays, the partition VCFs that belong to
# one batch.
#
# Inputs:
#   batch            - batch identifier to select
#   batchList        - batch identifier of every genotyping shard
#   partitionVcfList - partition VCF path of every genotyping shard, in the
#                      same order as batchList
#   numPreempt       - value for the `preemptible` runtime attribute
# Outputs:
#   batchVcfList     - entries of partitionVcfList whose batchList entry
#                      equals `batch`
task computeBatchVcfList {
    String batch
    Array[String] batchList
    Array[String] partitionVcfList

    Int numPreempt

    # Both arrays are written one entry per line, pasted side by side
    # (tab-separated) and filtered on the first column. Entries are split on
    # spaces, so neither batch names nor paths may contain spaces.
    command {
        echo ${sep=' ' batchList} | tr ' ' '\n' > batch.list
        echo ${sep=' ' partitionVcfList} | tr ' ' '\n' > vcf.list
        paste batch.list vcf.list \
            | awk -v batch=${batch} '$1 == batch' \
            | cut -f 2 > batch_vcf.list
    }

    output {
        Array[String] batchVcfList = read_lines("batch_vcf.list")
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        preemptible: "${numPreempt}"
    }
}

# Merges the partition VCFs of one batch into a single genotypes VCF using
# VCFMerge.
#
# Inputs:
#   partitionVcfList - partition VCFs to merge
#   referenceBundle  - reference bundle; gs_extract_reference.sh is sourced
#                      with it and is expected to set $referenceFile
#   diskSize         - base local disk size; 5 is added to it
#   numPreempt       - value for the `preemptible` runtime attribute
# Outputs:
#   batchVcf         - gs_cnv.genotypes.vcf.gz
task mergeBatchPartitions {
    Array[File] partitionVcfList
    File referenceBundle

    Int diskSize
    Int numPreempt

    # Use size(partitionVcfList), once it's available
    Int adjustedDiskSize = diskSize + 5
    
    command {
        echo ${sep=' ' partitionVcfList} | sed 's/ /\n/g' > vcf.list

        source $SV_DIR/scripts/firecloud/gs_extract_reference.sh ${referenceBundle} || exit 1

        java -cp $SV_CLASSPATH -Xmx4g \
            org.broadinstitute.sv.apps.VCFMerge \
            -R $referenceFile \
            -vcf vcf.list \
            -includeInfoTag END \
            -includeInfoTag GSELENGTH \
            -includeInfoTag SVTYPE \
            -O gs_cnv.genotypes.vcf.gz \
            || exit 1
    }

    output {
        File batchVcf = "gs_cnv.genotypes.vcf.gz"
    }
    
    # The JVM above may grow its heap to 4 GB (-Xmx4g). Without an explicit
    # memory request the task gets the backend default, which can be smaller
    # than the heap cap, so request the heap plus headroom for the JVM itself.
    runtime {
        docker: "skashin/genome-strip:latest"
        memory: "5GB"
        disks: "local-disk ${adjustedDiskSize} HDD"
        preemptible: "${numPreempt}"
    }
}

# Merges the per-batch genotype VCFs into one VCF using VCFMerge.
#
# Inputs:
#   batchVcfList    - per-batch VCFs to merge
#   referenceBundle - reference bundle; gs_extract_reference.sh is sourced
#                     with it and is expected to set $referenceFile
#   cpu             - value for the `cpu` runtime attribute
#   memory          - memory request in GB
#   diskSize        - size of the local disk
# Outputs:
#   mergedGenotypesVcf      - gs_cnv.genotypes.vcf.gz
#   mergedGenotypesVcfIndex - gs_cnv.genotypes.vcf.gz.tbi; nothing in this
#                             command creates it explicitly, so it is
#                             presumably written by VCFMerge - TODO confirm
#
# NOTE(review): the JVM heap is fixed at 4 GB (-Xmx4g) regardless of the
# `memory` input; a larger memory request does not give the merge more heap.
task mergeBatches {
    Array[File] batchVcfList
    File referenceBundle

    Int cpu
    Int memory
    Int diskSize

    command {
        echo ${sep=' ' batchVcfList} | tr ' ' '\n' > vcf.list

        source $SV_DIR/scripts/firecloud/gs_extract_reference.sh ${referenceBundle} || exit 1

        java -cp $SV_CLASSPATH -Xmx4g \
            org.broadinstitute.sv.apps.VCFMerge \
            -R $referenceFile \
            -vcf vcf.list \
            -includeInfoTag END \
            -includeInfoTag GSELENGTH \
            -includeInfoTag SVTYPE \
            -O gs_cnv.genotypes.vcf.gz \
            || exit 1
    }

    output {
        File mergedGenotypesVcf = "gs_cnv.genotypes.vcf.gz"
        File mergedGenotypesVcfIndex = "gs_cnv.genotypes.vcf.gz.tbi"
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        cpu: "${cpu}"
        memory: "${memory}GB"
        disks: "local-disk ${diskSize} HDD"
    }
}

# Produces the list of genome intervals over which redundancy filtering is
# scattered.
#
# Inputs:
#   vcfFile         - VCF handed to create_genome_partitions.sh
#   referenceBundle - reference bundle handed to the script
#   windowSize      - passed to the script; presumably the interval width
#                     (confirm in create_genome_partitions.sh)
#   diskSize        - size of the local disk
#   numPreempt      - value for the `preemptible` runtime attribute
# Outputs:
#   intervalsList   - lines of intervals.list as written by the script
task setupRedundancyFiltering {
    File vcfFile
    File referenceBundle

    Int windowSize

    Int diskSize
    Int numPreempt

    command {
        $SV_DIR/scripts/firecloud/common/create_genome_partitions.sh \
            ${vcfFile} \
            ${windowSize} \
            intervals.list \
            ${referenceBundle}
    }

    output {
        Array[String] intervalsList = read_lines("intervals.list")
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        disks: "local-disk ${diskSize} HDD"
        preemptible: "${numPreempt}"
    }
}

# Runs FilterRedundantSites on one interval of the merged genotypes VCF.
#
# Inputs:
#   vcfFile         - genotypes VCF to filter
#   vcfFileIndex    - index of vcfFile; not referenced by the command, it is
#                     declared as a File input so that it is localized with
#                     the VCF (presumably next to it - confirm on the backend)
#   interval        - interval passed to -L
#   referenceBundle - reference bundle; gs_extract_reference.sh is sourced
#                     with it and is expected to set $referenceFile
#   diskSize        - base local disk size; the rounded size of vcfFile
#                     (in "G" units) is added to it
#   numPreempt      - value for the `preemptible` runtime attribute
# Outputs:
#   dedupedVcf      - gs_cnv.genotypes.vcf.gz
task filterRedundantSites {
    File vcfFile
    File vcfFileIndex
    String interval
    File referenceBundle

    Int diskSize
    Int numPreempt
    
    Int adjustedDiskSize = diskSize + round(size(vcfFile, "G"))

    command {
        source $SV_DIR/scripts/firecloud/gs_extract_reference.sh ${referenceBundle} || exit 1

        java -cp $SV_CLASSPATH -Xmx4g \
            org.broadinstitute.sv.apps.FilterRedundantSites \
            -R $referenceFile \
            -vcf ${vcfFile} \
            -L ${interval} \
            -O gs_cnv.genotypes.vcf.gz \
            || exit 1
    }

    output {
        File dedupedVcf = "gs_cnv.genotypes.vcf.gz"
    }
    
    # The JVM above may grow its heap to 4 GB (-Xmx4g). Without an explicit
    # memory request the task gets the backend default, which can be smaller
    # than the heap cap, so request the heap plus headroom for the JVM itself.
    runtime {
        docker: "skashin/genome-strip:latest"
        memory: "5GB"
        disks: "local-disk ${adjustedDiskSize} HDD"
        preemptible: "${numPreempt}"
    }
}


# Packages the per-interval filtered VCFs into the final CNV callset archive.
#
# Inputs:
#   vcfFileList     - filtered VCFs, one per interval
#   referenceBundle - reference bundle handed to create_final_callset.sh
#   diskSize        - size of the local disk
# Outputs:
#   cnvCallset      - cnv_callset.tar.gz, expected in the working directory
#                     after create_final_callset.sh has run
#
# Runs with fixed resources (7GB, 2 CPUs) and preemptible set to 0.
task createFinalCallset {
    Array[File] vcfFileList
    File referenceBundle

    Int diskSize

    # The localized paths are written one per line into vcf_files.list; the
    # list is built by splitting on spaces, so paths must not contain spaces.
    command {
        echo ${sep=' ' vcfFileList} | tr ' ' '\n' > vcf_files.list
        $SV_DIR/scripts/firecloud/cnv/create_final_callset.sh \
            vcf_files.list \
            ${referenceBundle}
    }

    output {
        File cnvCallset = "cnv_callset.tar.gz"
    }

    runtime {
        docker: "skashin/genome-strip:latest"
        memory: "7GB"
        cpu: "2"
        disks: "local-disk ${diskSize} HDD"
        preemptible: 0
    }
}

# Builds a CNV callset from per-batch CNV calls:
#   1. parse the batch table;
#   2. merge the per-batch CNV calls into one sites VCF;
#   3. partition the sites VCF and genotype every (batch, partition) pair;
#   4. merge the partition VCFs per batch, then merge the batches;
#   5. filter redundant sites per genome interval and package the callset.
#
# Outputs:
#   mergedVcf  - merged genotypes VCF produced by mergeBatches (before
#                redundancy filtering)
#   cnvCallset - archive produced by createFinalCallset
workflow gs_create_cnv_callset_wf {
    File batchInfo
    Int genotypingParallelRecords
    File referenceBundle
    File credentialsKeyFile

    # memory is forwarded to runParallelGenotyper only; diskSize and
    # numPreempt are shared by several calls below.
    Int memory
    Int diskSize
    # NOTE(review): numThreads is declared but not referenced anywhere in
    # this workflow.
    Int numThreads
    Int numPreempt

    call parseBatchInfo {
        input:
            batchInfo = batchInfo
    }

    # The CNV call paths arrive as strings and are bound to an Array[File]
    # input here.
    call mergeCnvCalls {
        input:
            cnvCallsList = parseBatchInfo.batchCnvCallsList,
            referenceBundle = referenceBundle,
            diskSize = diskSize,
            numPreempt = numPreempt
    }

    call setupGenotyping {
        input:
            vcfFile = mergeCnvCalls.mergedSitesVcf,
            parallelRecords = genotypingParallelRecords,
            numPreempt = numPreempt
    }

    # One genotyping shard per (batch, partition) combination.
    Array[Pair[String, String]] batchPartitionPairs = cross(parseBatchInfo.batchList, setupGenotyping.partitionList)
    scatter(pair in batchPartitionPairs) {
        # Bound to a name so it can be used as the map key below.
        String batch = pair.left
        call runParallelGenotyper {
            input: 
                vcfFile = mergeCnvCalls.mergedSitesVcf,
                batch = batch,
                partitionFile = setupGenotyping.partitionFile,
                partitionName = pair.right,
                mdPath = parseBatchInfo.batchMdPathMap[batch],
                referenceBundle = referenceBundle,
                credentialsKeyFile = credentialsKeyFile,
                memory = memory,
                numPreempt = numPreempt
        }
    }

    # runParallelGenotyper.batchList and .partitionVcf are the gathered,
    # index-aligned results of all shards; each call below picks out the
    # partition VCFs that belong to one batch.
    scatter(batch in parseBatchInfo.batchList) {
        call computeBatchVcfList {
            input:
                batch = batch,
                batchList = runParallelGenotyper.batchList,
                partitionVcfList = runParallelGenotyper.partitionVcf,
                numPreempt = numPreempt
        }
    }
    
    # One merge per batch; the selected paths are bound to an Array[File]
    # input here.
    scatter(batchVcfList in computeBatchVcfList.batchVcfList) {
        call mergeBatchPartitions {
            input:
                partitionVcfList = batchVcfList,
                referenceBundle = referenceBundle,
                diskSize = diskSize,
                numPreempt = numPreempt
        }
    }

    # NOTE(review): mergeBatches also declares cpu, memory and diskSize. They
    # are not set here, so they must be supplied as call-level inputs
    # (e.g. gs_create_cnv_callset_wf.mergeBatches.cpu).
    call mergeBatches {
        input:
            batchVcfList = mergeBatchPartitions.batchVcf,
            referenceBundle = referenceBundle
    }

    # Intervals are computed from the merged sites VCF, not from the merged
    # genotypes VCF.
    call setupRedundancyFiltering {
        input:
            vcfFile = mergeCnvCalls.mergedSitesVcf,
            referenceBundle = referenceBundle,
            diskSize = diskSize,
            numPreempt = numPreempt
    }

    scatter(interval in setupRedundancyFiltering.intervalsList) {
        call filterRedundantSites {
            input:
                vcfFile = mergeBatches.mergedGenotypesVcf,
                vcfFileIndex = mergeBatches.mergedGenotypesVcfIndex,
                interval = interval,
                referenceBundle = referenceBundle,
                diskSize = diskSize,
                numPreempt = numPreempt
        }
    }

    # NOTE(review): createFinalCallset also declares diskSize. It is not set
    # here, so it must be supplied as a call-level input
    # (gs_create_cnv_callset_wf.createFinalCallset.diskSize).
    call createFinalCallset {
        input:
            vcfFileList = filterRedundantSites.dedupedVcf,
            referenceBundle = referenceBundle
    } 

    output {
        File mergedVcf = mergeBatches.mergedGenotypesVcf
        File cnvCallset = createFinalCallset.cnvCallset
    }
}