# Copyright (c) 2018 Talkowski Laboratory
# Contact: Ryan Collins
# Distributed under terms of the MIT license.

# Helper workflow to calculate basic AF statistics for a single chromosome on
# an input VCF.
#
# Steps:
#   1. shard_vcf            - extract `contig` from the input VCF and split it
#                             into shards of `sv_per_shard` records
#   2. compute_shard_AFs    - run compute_AFs.py on every shard (scattered)
#   3. combine_sharded_vcfs - concatenate, sort, optionally prune, bgzip and
#                             tabix-index the annotated shards
#
# Outputs:
#   vcf_wAFs     - bgzipped VCF for `contig` as written by combine_sharded_vcfs
#   vcf_wAFs_idx - tabix index (.tbi) for vcf_wAFs

workflow getAFs_singleChrom {
  File vcf
  File vcf_idx
  String contig
  Int sv_per_shard
  String prefix
  File? sample_pop_assignments  # Two-column file with sample ID & pop assignment. "." for pop will ignore sample
  File? famfile                 # Used for M/F AF calculations
  String? drop_empty_records    # Pruning of all-ref records runs only when this is "TRUE" (the default when unset)

  # Tabix to chromosome of interest, and shard input VCF for stats collection
  call shard_vcf {
    input:
      vcf=vcf,
      vcf_idx=vcf_idx,
      contig=contig,
      sv_per_shard=sv_per_shard
  }

  # Scatter over VCF shards
  scatter ( shard in shard_vcf.shard_vcfs ) {
    # Collect AF summary stats
    call compute_shard_AFs {
      input:
        vcf=shard,
        prefix="${prefix}.${contig}",
        sample_pop_assignments=sample_pop_assignments,
        famfile=famfile
    }
  }

  # Merge shards into single VCF
  call combine_sharded_vcfs {
    input:
      vcfs=compute_shard_AFs.shard_wAFs,
      prefix="${prefix}.${contig}",
      drop_empty_records=drop_empty_records
  }

  # Final output
  output {
    File vcf_wAFs = combine_sharded_vcfs.vcf_out
    File vcf_wAFs_idx = combine_sharded_vcfs.vcf_out_idx
  }
}


# Shard VCF into fixed size chunks
#
# Inputs:
#   vcf, vcf_idx - bgzipped VCF and its index. vcf_idx is never referenced in
#                  the command; declaring it as an input only gets it localized.
#                  NOTE(review): presumably tabix finds it next to `vcf` after
#                  localization - confirm on the backend in use.
#   contig       - region passed to tabix
#   sv_per_shard - shard size handed to shard_VCF.sh
# Output:
#   shard_vcfs   - every file matching vcf.shard.*.vcf.gz in the working dir
task shard_vcf {
  File vcf
  File vcf_idx
  String contig
  Int sv_per_shard

  command {
    set -euo pipefail
    #Tabix chromosome of interest
    tabix -h "${vcf}" "${contig}" | bgzip -c > "${contig}.vcf.gz"
    #Then shard VCF
    /opt/sv-pipeline/scripts/shard_VCF.sh \
      "${contig}.vcf.gz" \
      ${sv_per_shard} \
      "vcf.shard."
  }

  output {
    Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz")
  }

  runtime {
    preemptible: 1
    maxRetries: 1
    docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a"
    memory: "4 GB"
    disks: "local-disk 250 SSD"
  }
}


# Run compute_AFs.py on a single VCF shard and bgzip the result
#
# Inputs:
#   vcf                    - one shard produced by shard_vcf
#   prefix                 - output file name prefix
#   sample_pop_assignments - optional; forwarded to compute_AFs.py as -p
#   famfile                - optional; forwarded to compute_AFs.py as -f
# Output:
#   shard_wAFs             - <prefix>.wAFs.vcf.gz
task compute_shard_AFs {
  File vcf
  String prefix
  File? sample_pop_assignments
  File? famfile

  command <<<
    set -euo pipefail
    # Unset optional inputs render as the sentinel SKIP. The operands are
    # quoted so that a path containing whitespace cannot make the test error
    # out and silently drop the flag.
    optionals=" "
    if [ "${default="SKIP" sample_pop_assignments}" != "SKIP" ]; then
      optionals="$optionals -p ${sample_pop_assignments}"
    fi
    if [ "${default="SKIP" famfile}" != "SKIP" ]; then
      optionals="$optionals -f ${famfile}"
    fi
    echo -e "OPTIONALS INTERPRETED AS: $optionals"
    echo -e "NOW RUNNING: /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals ${vcf} stdout"
    #Compute AN, AC, and AF
    # $optionals is left unquoted on purpose: it must word-split into
    # separate flag/value arguments.
    /opt/sv-pipeline/05_annotation/scripts/compute_AFs.py $optionals "${vcf}" stdout \
      | bgzip -c \
      > "${prefix}.wAFs.vcf.gz"
  >>>

  output {
    File shard_wAFs = "${prefix}.wAFs.vcf.gz"
  }

  runtime {
    docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a"
    preemptible: 1
    maxRetries: 1
    memory: "4 GB"
    disks: "local-disk 20 SSD"
  }
}


# Merge VCF shards & drop records with zero remaining non-ref alleles
#
# Inputs:
#   vcfs               - annotated shards to concatenate and sort
#   prefix             - output file name prefix
#   drop_empty_records - optional; prune_allref_records.py runs only when this
#                        is "TRUE" (also the value used when it is unset)
# Outputs:
#   vcf_out            - <prefix>.wAFs.vcf.gz
#   vcf_out_idx        - <prefix>.wAFs.vcf.gz.tbi
task combine_sharded_vcfs {
  Array[File] vcfs
  String prefix
  String? drop_empty_records

  command {
    set -euo pipefail
    vcf-concat ${sep=" " vcfs} \
      | vcf-sort \
      > merged.vcf
    # Quoted operand and POSIX "=" so that an empty or multi-word value selects
    # the else branch instead of raising a test syntax error.
    if [ "${default="TRUE" drop_empty_records}" = "TRUE" ]; then
      /opt/sv-pipeline/05_annotation/scripts/prune_allref_records.py \
        merged.vcf stdout \
        | bgzip -c \
        > "${prefix}.wAFs.vcf.gz"
    else
      bgzip -c merged.vcf > "${prefix}.wAFs.vcf.gz"
    fi
    tabix -p vcf "${prefix}.wAFs.vcf.gz"
  }

  runtime {
    preemptible: 1
    maxRetries: 1
    docker: "talkowski/sv-pipeline@sha256:193d18c26100fdd603c569346722513f5796685e990ec3abcaeb4be887062a1a"
    disks: "local-disk 250 SSD"
    bootDiskSizeGb: 30
    memory: "4 GB"
  }

  output {
    File vcf_out = "${prefix}.wAFs.vcf.gz"
    File vcf_out_idx = "${prefix}.wAFs.vcf.gz.tbi"
  }
}