Source: urmo_imputation_hg38.smk
#!/usr/bin/env python
shell.executable('bash')

# Converts BIM to BED and converts the BED file via CrossMap.
# Finds excluded SNPs and removes them from the original plink file.
# Then replaces the BIM with CrossMap's output.
rule crossmap:
    input:
        pgen = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.pgen",
        psam = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.psam",
        pvar = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.pvar"
    output:
        bed = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.bed",
        bim = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.bim",
        fam = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.fam",
        inbed = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmap_input.bed",
        outbed = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmap_output.bed",
        excluded_ids = output_dict["output_dir"] + "/crossmapped/{ancestry}_excluded_ids.txt"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["crossmap_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["crossmap_memory"]
    threads:
        imputation_dict["crossmap_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        in_plink = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset",
        out = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink",
        chain_file = "/opt/GRCh37_to_GRCh38.chain"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print $1,$2,$2+1,$3,$4,$5}}' {input.pvar} > {output.inbed}
        singularity exec --bind {params.bind} {params.sif} CrossMap.py bed {params.chain_file} {output.inbed} {output.outbed}
        singularity exec --bind {params.bind} {params.sif} awk '{{print $4}}' {output.outbed}.unmap > {output.excluded_ids}
        singularity exec --bind {params.bind} {params.sif} plink2 --pfile {params.in_plink} --exclude {output.excluded_ids} --make-bed --output-chr MT --out {params.out}
        singularity exec --bind {params.bind} {params.sif} awk -F'\t' 'BEGIN {{OFS=FS}} {{print $1,$4,0,$2,$6,$5}}' {output.outbed} > {output.bim}
        """

rule sort_bed:
    input:
        pgen = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.bed",
        psam = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.bim",
        pvar = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink.fam"
    output:
        bed = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.bed",
        bim = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.bim",
        fam = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.fam"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["sort_bed_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["sort_bed_memory"]
    threads:
        imputation_dict["sort_bed_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/crossmapped/{ancestry}_crossmapped_plink",
        out = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --bfile {params.infile} --make-bed --max-alleles 2 --output-chr MT --out {params.out}
        """

rule harmonize_hg38:
    input:
        bed = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.bed",
        bim = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.bim",
        fam = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted.fam",
        vcf = vcf_dir + "/30x-GRCh38_NoSamplesSorted.vcf.gz",
        index = vcf_dir + "/30x-GRCh38_NoSamplesSorted.vcf.gz.tbi"
    output:
        bed = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.bed",
        bim = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.bim",
        fam = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.fam"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["harmonize_hg38_memory"],
        java_mem = lambda wildcards, attempt: attempt * imputation_dict["harmonize_hg38_java_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["harmonize_hg38_memory"]
    threads:
        imputation_dict["harmonize_hg38_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/crossmapped_sorted/{ancestry}_crossmapped_sorted",
        out = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}",
        jar = "/opt/GenotypeHarmonizer-1.4.23/GenotypeHarmonizer.jar"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} java -Xmx{resources.java_mem}g -jar {params.jar}\
            --input {params.infile}\
            --inputType PLINK_BED\
            --ref {input.vcf}\
            --refType VCF\
            --update-id\
            --output {params.out}
        """

rule plink_to_vcf:
    input:
        bed = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.bed",
        bim = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.bim",
        fam = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}.fam"
    output:
        data_vcf_gz = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}_harmonised_hg38.vcf.gz",
        index = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}_harmonised_hg38.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["plink_to_vcf_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["plink_to_vcf_memory"]
    threads:
        imputation_dict["plink_to_vcf_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}",
        out = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}_harmonised_hg38"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --bfile {params.infile} --recode vcf id-paste=iid --chr 1-22 --out {params.out}
        singularity exec --bind {params.bind} {params.sif} bgzip {params.out}.vcf
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.data_vcf_gz}
        """

rule vcf_fixref_hg38:
    input:
        fasta = fasta,
        vcf = vcf_dir + "/30x-GRCh38_NoSamplesSorted.vcf.gz",
        index = vcf_dir + "/30x-GRCh38_NoSamplesSorted.vcf.gz.tbi",
        data_vcf = output_dict["output_dir"] + "/harmonize_hg38/{ancestry}_harmonised_hg38.vcf.gz"
    output:
        vcf = output_dict["output_dir"] + "/vcf_fixref_hg38/{ancestry}_fixref_hg38.vcf.gz",
        index = output_dict["output_dir"] + "/vcf_fixref_hg38/{ancestry}_fixref_hg38.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["vcf_fixref_hg38_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["vcf_fixref_hg38_memory"]
    threads:
        imputation_dict["vcf_fixref_hg38_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} bcftools +fixref {input.data_vcf} -- -f {input.fasta} -i {input.vcf} | \
        singularity exec --bind {params.bind} {params.sif} bcftools norm --check-ref x -f {input.fasta} -Oz -o {output.vcf}
        #Index
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.vcf}
        """

rule filter_preimpute_vcf:
    input:
        vcf = output_dict["output_dir"] + "/vcf_fixref_hg38/{ancestry}_fixref_hg38.vcf.gz"
    output:
        tagged_vcf = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_tagged.vcf.gz",
        filtered_vcf = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_filtered.vcf.gz",
        filtered_index = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_filtered.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["filter_preimpute_vcf_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["filter_preimpute_vcf_memory"]
    threads:
        imputation_dict["filter_preimpute_vcf_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        maf = lambda wildcards: float(maf_df["MAF"][maf_df.Ancestry == wildcards.ancestry].values),
        missing = imputation_dict["snp_missing_pct"],
        hwe = imputation_dict["snp_hwe"]
    shell:
        """
        #Add tags
        singularity exec --bind {params.bind} {params.sif} bcftools +fill-tags {input.vcf} -Oz -o {output.tagged_vcf}
        #Filter rare and non-HWE variants and those with abnormal alleles and duplicates
        singularity exec --bind {params.bind} {params.sif} bcftools filter -i 'INFO/HWE > {params.hwe} & F_MISSING < {params.missing} & MAF[0] > {params.maf}' {output.tagged_vcf} |\
        singularity exec --bind {params.bind} {params.sif} bcftools filter -e 'REF="N" | REF="I" | REF="D"' |\
        singularity exec --bind {params.bind} {params.sif} bcftools filter -e "ALT='.'" |\
        singularity exec --bind {params.bind} {params.sif} bcftools norm -d all |\
        singularity exec --bind {params.bind} {params.sif} bcftools norm -m+any |\
        singularity exec --bind {params.bind} {params.sif} bcftools view -m2 -M2 -Oz -o {output.filtered_vcf}
        #Index the output file
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.filtered_vcf}
        """

rule het:
    input:
        vcf = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_filtered.vcf.gz",
    output:
        tmp_vcf = temp(output_dict["output_dir"] + "/het/{ancestry}_filtered_temp.vcf"),
        inds = output_dict["output_dir"] + "/het/{ancestry}_het_failed.inds",
        het = output_dict["output_dir"] + "/het/{ancestry}_het.het",
        passed = output_dict["output_dir"] + "/het/{ancestry}_het_passed.inds",
        passed_list = output_dict["output_dir"] + "/het/{ancestry}_het_passed.txt"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["het_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["het_memory"]
    threads: imputation_dict["het_threads"]
    params:
        het_base = output_dict["output_dir"] + "/het/{ancestry}_het",
        script = "/opt/WG1-pipeline-QC/Imputation/scripts/filter_het.R",
        bind = input_dict["bind_paths"],
        hwe = output_dict["output_dir"] + "/hwe/{ancestry}_hwe",
        out = output_dict["output_dir"] + "/het/{ancestry}_het",
        sif = input_dict["singularity_image"],
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} gunzip -c {input.vcf} | sed 's/^##fileformat=VCFv4.3/##fileformat=VCFv4.2/' > {output.tmp_vcf}
        singularity exec --bind {params.bind} {params.sif} vcftools --vcf {output.tmp_vcf} --het --out {params.het_base}
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {output.het} {output.inds} {output.passed} {output.passed_list}
        """

rule het_filter:
    input:
        passed_list = output_dict["output_dir"] + "/het/{ancestry}_het_passed.txt",
        vcf = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_filtered.vcf.gz"
    output:
        vcf = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz",
        index = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["het_filter_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["het_filter_memory"]
    threads: imputation_dict["het_filter_threads"]
    params:
        bind = input_dict["bind_paths"],
        hwe = output_dict["output_dir"] + "/hwe/{ancestry}_hwe",
        out = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filter",
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} bcftools view -S {input.passed_list} {input.vcf} -Oz -o {output.vcf}
        #Index the output file
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.vcf}
        """

rule calculate_missingness:
    input:
        filtered_vcf = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz",
        filtered_index = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz.csi"
    output:
        tmp_vcf = temp(output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_het_filtered.vcf"),
        miss = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_genotypes.imiss",
        individuals = output_dict["output_dir"] + "/genotype_donor_annotation/{ancestry}_individuals.tsv"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["calculate_missingness_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["calculate_missingness_memory"]
    threads:
        imputation_dict["calculate_missingness_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        out = output_dict["output_dir"] + "/filter_preimpute_vcf/{ancestry}_genotypes"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} gunzip -c {input.filtered_vcf} | sed 's/^##fileformat=VCFv4.3/##fileformat=VCFv4.2/' > {output.tmp_vcf}
        singularity exec --bind {params.bind} {params.sif} vcftools --gzvcf {output.tmp_vcf} --missing-indv --out {params.out}
        singularity exec --bind {params.bind} {params.sif} bcftools query -l {input.filtered_vcf} >> {output.individuals}
        """

rule split_by_chr:
    input:
        filtered_vcf = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz",
        filtered_index = output_dict["output_dir"] + "/het_filter/{ancestry}_het_filtered.vcf.gz.csi"
    output:
        vcf = output_dict["output_dir"] + "/split_by_chr/{ancestry}_chr_{chr}.vcf.gz",
        index = output_dict["output_dir"] + "/split_by_chr/{ancestry}_chr_{chr}.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["split_by_chr_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["split_by_chr_memory"]
    threads:
        imputation_dict["split_by_chr_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} bcftools view -r {wildcards.chr} {input.filtered_vcf} -Oz -o {output.vcf}
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.vcf}
        """

rule eagle_prephasing:
    input:
        vcf = output_dict["output_dir"] + "/split_by_chr/{ancestry}_chr_{chr}.vcf.gz",
        map_file = genetic_map,
        phasing_file = phasing_dir + "/chr{chr}.bcf"
    output:
        vcf = output_dict["output_dir"] + "/eagle_prephasing/{ancestry}_chr{chr}_phased.vcf.gz"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["eagle_prephasing_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["eagle_prephasing_memory"]
    threads: imputation_dict["eagle_prephasing_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        out = output_dict["output_dir"] + "/eagle_prephasing/{ancestry}_chr{chr}_phased"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} eagle --vcfTarget={input.vcf} \
            --vcfRef={input.phasing_file} \
            --geneticMapFile={input.map_file} \
            --chrom={wildcards.chr} \
            --outPrefix={params.out} \
            --numThreads={threads}
        """

rule minimac_imputation:
    input:
        vcf = output_dict["output_dir"] + "/eagle_prephasing/{ancestry}_chr{chr}_phased.vcf.gz",
        impute_file = impute_dir + "/chr{chr}.m3vcf.gz"
    output:
        output_dict["output_dir"] + "/minimac_imputed/{ancestry}_chr{chr}.dose.vcf.gz"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["minimac_imputation_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["minimac_imputation_memory"]
    threads:
        imputation_dict["minimac_imputation_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        out = output_dict["output_dir"] + "/minimac_imputed/{ancestry}_chr{chr}",
        minimac4 = "/opt/bin/minimac4",
        chunk_length = imputation_dict["chunk_length"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} {params.minimac4} --refHaps {input.impute_file} \
            --haps {input.vcf} \
            --prefix {params.out} \
            --format GT,DS,GP \
            --noPhoneHome \
            --cpus {threads} \
            --ChunkLengthMb {params.chunk_length}
        """

rule combine_vcfs_ancestry:
    input:
        vcfs = lambda wildcards: expand(output_dict["output_dir"] + "/minimac_imputed/{ancestry}_chr{chr}.dose.vcf.gz", chr = chromosomes, ancestry = ancestry_subsets)
    output:
        combined = output_dict["output_dir"] + "/vcf_merged_by_ancestries/{ancestry}_imputed_hg38.vcf.gz",
        ind = output_dict["output_dir"] + "/vcf_merged_by_ancestries/{ancestry}_imputed_hg38.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["combine_vcfs_ancestry_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["combine_vcfs_ancestry_memory"]
    threads: imputation_dict["combine_vcfs_ancestry_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = input_dict["bind_paths"],
        files_begin = output_dict["output_dir"] + "/minimac_imputed/{ancestry}_chr*.dose.vcf.gz"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} bcftools concat -Oz {params.files_begin} > {output.combined}
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.combined}
        """

rule combine_vcfs_all:
    input:
        vcfs = lambda wildcards: expand(output_dict["output_dir"] + "/vcf_merged_by_ancestries/{ancestry}_imputed_hg38.vcf.gz", ancestry = ancestry_subsets)
    output:
        combined = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38.vcf.gz",
        ind = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38.vcf.gz.csi"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["combine_vcfs_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["combine_vcfs_memory"]
    threads: imputation_dict["combine_vcfs_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = input_dict["bind_paths"],
    shell:
        """
        if [[ $(ls -l {input.vcfs} | wc -l) > 1 ]]
        then
            singularity exec --bind {params.bind} {params.sif} bcftools merge -Oz {input.vcfs} > {output.combined}
        else
            cp {input.vcfs} {output.combined}
        fi
        singularity exec --bind {params.bind} {params.sif} bcftools index {output.combined}
        """

rule filter4demultiplexing:
    input:
        output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38.vcf.gz"
    output:
        info_filled = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_info_filled.vcf.gz",
        qc_filtered = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05.vcf.gz",
        location_filtered = temp(output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05_exons.recode.vcf"),
        complete_cases = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05_exons_complete_cases.recode.vcf"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["filter4demultiplexing_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["filter4demultiplexing_memory"]
    threads: imputation_dict["filter4demultiplexing_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = input_dict["bind_paths"],
        out = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05_exons",
        complete_out = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05_exons_complete_cases",
        bed = "/opt/hg38exonsUCSC.bed"
    shell:
        """
        ##### Add all the info fields
        singularity exec --bind {params.bind} {params.sif} bcftools +fill-tags -Oz --output {output.info_filled} {input}
        ##### Filter the Imputed SNP Genotype by Minor Allele Frequency (MAF) and INFO scores #####
        singularity exec --bind {params.bind} {params.sif} bcftools filter --include 'MAF>=0.05 & R2>=0.3' -Oz --output {output.qc_filtered} {output.info_filled}
        singularity exec --bind {params.bind} {params.sif} vcftools \
            --gzvcf {output.qc_filtered} \
            --max-alleles 2 \
            --remove-indels \
            --bed {params.bed} \
            --recode \
            --recode-INFO-all \
            --out {params.out}
        singularity exec --bind {params.bind} {params.sif} vcftools --recode --recode-INFO-all --vcf {output.location_filtered} --max-missing 1 --out {params.complete_out}
        """

rule sort4demultiplexing:
    input:
        complete_cases = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05_exons_complete_cases.recode.vcf"
    output:
        complete_cases_sorted = output_dict["output_dir"] + "/vcf_4_demultiplex/imputed_hg38_R2_0.3_MAF0.05_exons_sorted.vcf"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["sort4demultiplexing_memory"],
        java_mem = lambda wildcards, attempt: attempt * imputation_dict["sort4demultiplexing_java_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["sort4demultiplexing_memory"]
    threads: imputation_dict["sort4demultiplexing_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = input_dict["bind_paths"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} java -Xmx{resources.java_mem}g -Xms{resources.java_mem}g -jar /opt/picard/build/libs/picard.jar SortVcf \
            I={input.complete_cases} \
            O={output.complete_cases_sorted}
        """

rule count_snps:
    input:
        info_filled = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_info_filled.vcf.gz",
        qc_filtered = output_dict["output_dir"] + "/vcf_all_merged/imputed_hg38_R2_0.3_MAF0.05.vcf.gz",
        complete_cases_sorted = output_dict["output_dir"] + "/vcf_4_demultiplex/imputed_hg38_R2_0.3_MAF0.05_exons_sorted.vcf"
    output:
        report(output_dict["output_dir"] + "/metrics/Number_SNPs.png", category = "SNP Numbers", caption = "../report_captions/counts_snps.rst")
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["count_snps_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["count_snps_memory"]
    threads: imputation_dict["count_snps_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = input_dict["bind_paths"],
        basedir = output_dict["output_dir"],
        outdir = output_dict["output_dir"] + "/metrics/",
        script = "/opt/WG1-pipeline-QC/Imputation/scripts/SNP_numbers.R"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {params.basedir} {params.outdir}
        """

rule genotype_donor_annotation:
    input:
        individuals = lambda wildcards: expand(output_dict["output_dir"] + "/genotype_donor_annotation/{ancestry}_individuals.tsv", ancestry = ancestry_subsets),
        updated_psam = output_dict["output_dir"] + "/update_sex_ancestry/update_sex.psam"
    output:
        out_temp = temp(output_dict["output_dir"] + "/genotype_donor_annotation/genotype_donor_annotation_temp.tsv"),
        combined_individuals = output_dict["output_dir"] + "/genotype_donor_annotation/combined_individuals.tsv",
        final = output_dict["output_dir"] + "/genotype_donor_annotation/genotype_donor_annotation.tsv",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["genotype_donor_annotation_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * imputation_dict["genotype_donor_annotation_memory"]
    threads: imputation_dict["genotype_donor_annotation_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} cut -f2,5- {input.updated_psam} | awk 'NR<2{{print $0;next}}{{print $0| "sort -k1,1"}}' > {output.out_temp}
        singularity exec --bind {params.bind} {params.sif} cat {input.individuals} >> {output.combined_individuals}
        singularity exec --bind {params.bind} {params.sif} sed -i '1 i\IID' {output.combined_individuals}
        singularity exec --bind {params.bind} {params.sif} awk -F"\t" 'NR==FNR {{a[$1]; next}} $1 in a' {output.combined_individuals} {output.out_temp} | awk 'BEGIN{{FS=OFS="\t"}}{{sub("1","M",$2);print}}' | awk 'BEGIN{{FS=OFS="\t"}}{{sub("2","F",$2);print}}' > {output.final}
        singularity exec --bind {params.bind} {params.sif} sed -i 's/^IID\tSEX\tProvided_Ancestry/donor_id\tsex\tethnicity_super_population/g' {output.final}
        singularity exec --bind {params.bind} {params.sif} sed -i 's/$/\tsceQTL-Gen_hg38_imputation_pipeline\t1000g_30x_GRCh38_ref/' {output.final}
        singularity exec --bind {params.bind} {params.sif} sed -i '1s/SangerImputationServer/imputation_method/g' {output.final}
        singularity exec --bind {params.bind} {params.sif} sed -i '1s/1000g_30x_GRCh38_ref/imputation_reference/g' {output.final}
        ...
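The rules above index into several dictionaries (output_dict, input_dict, imputation_dict) and a maf_df data frame that are populated elsewhere in the pipeline from its configuration. A minimal sketch of the shape those lookups assume, with the keys taken from the rules above but every value a hypothetical placeholder rather than the pipeline's defaults:

# Hypothetical values only; the real settings come from the pipeline's config.
input_dict = {
    "bind_paths": "/hpc/projects",        # paths bind-mounted into the container
    "singularity_image": "/opt/wg1.sif",  # image providing plink2, bcftools, eagle, ...
}
output_dict = {"output_dir": "/hpc/projects/imputation_out"}
imputation_dict = {
    "crossmap_memory": 4,     # GB per thread; scaled up on retries via `attempt`
    "crossmap_threads": 2,
    # ... one *_memory / *_threads pair per rule, plus filter settings:
    "snp_missing_pct": 0.05,  # F_MISSING threshold in filter_preimpute_vcf
    "snp_hwe": 1e-6,          # INFO/HWE threshold in filter_preimpute_vcf
    "chunk_length": 20,       # Mb per Minimac4 chunk (--ChunkLengthMb)
}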
Source: plink_gender_ancestry_QC.smk
#!/usr/bin/env python
shell.executable('bash')

rule indiv_missingness:
    input:
        pgen = pgen,
        pvar = pvar,
        psam = psam,
    output:
        bed = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pgen",
        bim = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pvar",
        fam = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.psam",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["indiv_missingness_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["indiv_missingness_memory"]
    threads: plink_gender_ancestry_QC_dict["indiv_missingness_threads"]
    params:
        bind = input_dict["bind_paths"],
        infile = re.sub(".psam", "", psam),
        out = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness",
        mind = plink_gender_ancestry_QC_dict["indiv_missingness_mind"],
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} echo {params.infile}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} \
            --pfile {params.infile} \
            --make-pgen 'psam-cols='fid,parents,sex,phenos \
            --mind {params.mind} \
            --out {params.out}
        """

rule check_sex:
    input:
        bed = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pgen",
        bim = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pvar",
        fam = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.psam",
    output:
        bed = output_dict["output_dir"] + "/check_sex/check_sex.bed",
        bim = output_dict["output_dir"] + "/check_sex/check_sex.bim",
        fam = output_dict["output_dir"] + "/check_sex/check_sex.fam",
        hh = output_dict["output_dir"] + "/check_sex/check_sex.hh",
        log = output_dict["output_dir"] + "/check_sex/check_sex.log",
        nosex = output_dict["output_dir"] + "/check_sex/check_sex.nosex",
        sexcheck = output_dict["output_dir"] + "/check_sex/check_sex.sexcheck",
        sexcheck_tab = output_dict["output_dir"] + "/check_sex/check_sex.sexcheck.tsv"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["check_sex_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["check_sex_memory"]
    threads: plink_gender_ancestry_QC_dict["check_sex_threads"]
    params:
        bind = input_dict["bind_paths"],
        infile = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness",
        out = output_dict["output_dir"] + "/check_sex/check_sex",
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} --make-bed --max-alleles 2 --out {params.out}
        singularity exec --bind {params.bind} {params.sif} plink --threads {threads} --bfile {params.out} --check-sex --out {params.out}
        singularity exec --bind {params.bind} {params.sif} touch {output.nosex}
        singularity exec --bind {params.bind} {params.sif} sed 's/^ \+//g' {output.sexcheck} | singularity exec --bind {params.bind} {params.sif} sed 's/ \+/\t/g' > {output.sexcheck_tab}
        """

### Pull just common SNPs between two groups ###
rule common_snps:
    input:
        bed = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pgen",
        bim = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pvar",
        fam = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.psam",
    output:
        snps_data = output_dict["output_dir"] + "/common_snps/snps_data.tsv",
        snps_1000g = output_dict["output_dir"] + "/common_snps/snps_1000g.tsv",
        bed = output_dict["output_dir"] + "/common_snps/subset_data.pgen",
        bim = output_dict["output_dir"] + "/common_snps/subset_data.pvar",
        fam = output_dict["output_dir"] + "/common_snps/subset_data.psam",
        bed_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.pgen",
        bim_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.pvar",
        fam_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.psam",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["common_snps_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["common_snps_memory"]
    threads: plink_gender_ancestry_QC_dict["common_snps_threads"]
    params:
        bim_1000 = "/opt/1000G/all_phase3_filtered.pvar",
        bind = input_dict["bind_paths"],
        infile = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness",
        infile_1000g = "/opt/1000G/all_phase3_filtered",
        out = output_dict["output_dir"] + "/common_snps/subset_data",
        out_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g",
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} awk 'NR==FNR{{a[$1,$2,$4,$5];next}} ($1,$2,$4,$5) in a{{print $3}}' {input.bim} {params.bim_1000} > {output.snps_1000g}
        singularity exec --bind {params.bind} {params.sif} awk 'NR==FNR{{a[$1,$2,$4,$5];next}} ($1,$2,$4,$5) in a{{print $3}}' {params.bim_1000} {input.bim} > {output.snps_data}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} --extract {output.snps_data} --make-pgen 'psam-cols='fid,parents,sex,phenos --out {params.out}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile_1000g} --extract {output.snps_1000g} --make-pgen --out {params.out_1000g}
        """

### Prune with --indep,
rule prune_1000g:
    input:
        bed_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.pgen",
        bim_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.pvar",
        fam_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.psam",
        bim = output_dict["output_dir"] + "/common_snps/subset_data.pvar",
        bed = output_dict["output_dir"] + "/common_snps/subset_data.pgen",
        fam = output_dict["output_dir"] + "/common_snps/subset_data.psam",
    output:
        prune_out_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.prune.out",
        prune_out = output_dict["output_dir"] + "/common_snps/subset_data.prune.out",
        bed_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.pgen",
        bim_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.pvar",
        fam_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.psam",
        bed = output_dict["output_dir"] + "/common_snps/subset_pruned_data.pgen",
        bim = output_dict["output_dir"] + "/common_snps/subset_pruned_data.pvar",
        bim_temp = output_dict["output_dir"] + "/common_snps/subset_pruned_data_temp.pvar",
        bim_old = output_dict["output_dir"] + "/common_snps/subset_pruned_data_original.pvar",
        fam = output_dict["output_dir"] + "/common_snps/subset_pruned_data.psam",
        data_1000g_key = output_dict["output_dir"] + "/common_snps/subset_pruned_data_1000g_key.txt",
        SNPs2keep = output_dict["output_dir"] + "/common_snps/SNPs2keep.txt"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["prune_1000g_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["prune_1000g_memory"]
    threads: plink_gender_ancestry_QC_dict["prune_1000g_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        out_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g",
        infile_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g",
        infile = output_dict["output_dir"] + "/common_snps/subset_data",
        out = output_dict["output_dir"] + "/common_snps/subset_pruned_data"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile_1000g} \
            --indep-pairwise 50 5 0.5 \
            --out {params.out_1000g}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile_1000g} --extract {output.prune_out_1000g} --make-pgen --out {params.out_1000g}
        if [[ $(grep "##" {input.bim} | wc -l) > 0 ]]
        then
            singularity exec --bind {params.bind} {params.sif} grep "##" {input.bim} > {output.data_1000g_key}
        fi
        singularity exec --bind {params.bind} {params.sif} awk -F"\\t" 'BEGIN{{OFS=FS = "\\t"}} NR==FNR{{a[$1 FS $2 FS $4 FS $5] = $0; next}} {{ind = $1 FS $2 FS $4 FS $5}} ind in a {{print a[ind], $3}}' {output.bim_1000g} {input.bim} | singularity exec --bind {params.bind} {params.sif} grep -v "##" >> {output.data_1000g_key}
        singularity exec --bind {params.bind} {params.sif} grep -v "##" {output.data_1000g_key} | singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print $NF}}' > {output.prune_out}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} --extract {output.prune_out} --make-pgen 'psam-cols='fid,parents,sex,phenos --out {params.out}
        singularity exec --bind {params.bind} {params.sif} cp {output.bim} {output.bim_old}
        singularity exec --bind {params.bind} {params.sif} grep -v "#" {output.bim_old} | singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print($3)}}' > {output.SNPs2keep}
        singularity exec --bind {params.bind} {params.sif} grep "#CHROM" {output.data_1000g_key} > {output.bim}
        singularity exec --bind {params.bind} {params.sif} grep -Ff {output.SNPs2keep} {output.data_1000g_key} >> {output.bim}
        singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}NF{{NF-=1}};1' < {output.bim} > {output.bim_temp}
        singularity exec --bind {params.bind} {params.sif} grep "##" {output.bim_1000g} > {output.bim}
        singularity exec --bind {params.bind} {params.sif} cat {output.bim_temp} >> {output.bim}
        """

rule final_pruning: ### put in contingency for duplicated snps - remove from both 1000G and your dataset
    input:
        bed = output_dict["output_dir"] + "/common_snps/subset_pruned_data.pgen",
        bim = output_dict["output_dir"] + "/common_snps/subset_pruned_data.pvar",
        fam = output_dict["output_dir"] + "/common_snps/subset_pruned_data.psam",
    output:
        bed = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.pgen",
        bim = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.pvar",
        fam = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.psam",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["prune_1000g_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["prune_1000g_memory"]
    threads: plink_gender_ancestry_QC_dict["prune_1000g_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/common_snps/subset_pruned_data",
        out = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --rm-dup 'force-first' -threads {threads} --pfile {params.infile} --make-pgen 'psam-cols='fid,parents,sex,phenos --out {params.out}
        """

### use PCA from plink for PCA and projection
rule pca_1000g:
    input:
        bed_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.pgen",
        bim_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.pvar",
        fam_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g.psam",
        bed = output_dict["output_dir"] + "/common_snps/subset_pruned_data.pgen"
    output:
        out = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.acount",
        eig_all = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.eigenvec.allele",
        eig_vec = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.eigenvec",
        eig = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.eigenval",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["pca_1000g_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["pca_1000g_memory"]
    threads: plink_gender_ancestry_QC_dict["pca_1000g_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g",
        out = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} \
            --freq counts \
            --pca allele-wts \
            --out {params.out}
        """

### use plink pca results to plot with R ###
rule pca_project:
    input:
        bed = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.pgen",
        bim = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.pvar",
        fam = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data.psam",
        frq = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.acount",
        scores = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs.eigenvec.allele"
    output:
        projected_scores = output_dict["output_dir"] + "/pca_projection/final_subset_pruned_data_pcs.sscore",
        projected_1000g_scores = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs_projected.sscore"
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["pca_project_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["pca_project_memory"]
    threads: plink_gender_ancestry_QC_dict["pca_project_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/common_snps/final_subset_pruned_data",
        infile_1000g = output_dict["output_dir"] + "/common_snps/subset_pruned_1000g",
        out = output_dict["output_dir"] + "/pca_projection/final_subset_pruned_data_pcs",
        out_1000g = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs_projected"
    shell:
        """
        export OMP_NUM_THREADS={threads}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} \
            --read-freq {input.frq} \
            --score {input.scores} 2 5 header-read no-mean-imputation \
                    variance-standardize \
            --score-col-nums 6-15 \
            --out {params.out}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile_1000g} \
            --read-freq {input.frq} \
            --score {input.scores} 2 5 header-read no-mean-imputation \
                    variance-standardize \
            --score-col-nums 6-15 \
            --out {params.out_1000g}
        """

rule pca_projection_assign:
    input:
        projected_scores = output_dict["output_dir"] + "/pca_projection/final_subset_pruned_data_pcs.sscore",
        projected_1000g_scores = output_dict["output_dir"] + "/pca_projection/subset_pruned_1000g_pcs_projected.sscore",
        fam_1000g = output_dict["output_dir"] + "/common_snps/subset_1000g.psam",
        psam = psam,
        sexcheck = output_dict["output_dir"] + "/check_sex/check_sex.sexcheck.tsv",
    output:
        sexcheck = output_dict["output_dir"] + "/pca_sex_checks/check_sex_update_remove.tsv",
        anc_check = output_dict["output_dir"] + "/pca_sex_checks/ancestry_update_remove.tsv",
        anc_fig = report(output_dict["output_dir"] + "/pca_sex_checks/Ancestry_PCAs.png", category = "Ancestry", caption = "../report_captions/ancestry_pca.rst")
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * 10,
        disk_per_thread_gb=lambda wildcards, attempt: attempt * 10
    threads: 2
    params:
        variables = output_dict["output_dir"] + "/pca_sex_checks/variables.tsv",
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        outdir = output_dict["output_dir"] + "/pca_sex_checks/",
        script = "/opt/WG1-pipeline-QC/Imputation/scripts/PCA_Projection_Plotting.R"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} echo {params.outdir} > {params.variables}
        singularity exec --bind {params.bind} {params.sif} echo {input.projected_scores} >> {params.variables}
        singularity exec --bind {params.bind} {params.sif} echo {input.projected_1000g_scores} >> {params.variables}
        singularity exec --bind {params.bind} {params.sif} echo {input.fam_1000g} >> {params.variables}
        singularity exec --bind {params.bind} {params.sif} echo {input.psam} >> {params.variables}
        singularity exec --bind {params.bind} {params.sif} echo {input.sexcheck} >> {params.variables}
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {params.variables}
        """

rule summary_ancestry_sex:
    input:
        sexcheck = output_dict["output_dir"] + "/check_sex/check_sex.sexcheck.tsv",
        sexcheck_tsv = output_dict["output_dir"] + "/pca_sex_checks/check_sex_update_remove.tsv",
        fam = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.psam",
        anc_check = output_dict["output_dir"] + "/pca_sex_checks/ancestry_update_remove.tsv"
    output:
        report(output_dict["output_dir"] + "/metrics/sex_summary.png", category = "Ancestry and Sex Summary", caption = "../report_captions/sex_summary.rst"),
        report(output_dict["output_dir"] + "/metrics/ancestry_summary.png", category = "Ancestry and Sex Summary", caption = "../report_captions/ancestry_summary.rst")
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["summary_ancestry_sex_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["summary_ancestry_sex_memory"]
    threads: plink_gender_ancestry_QC_dict["summary_ancestry_sex_threads"]
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        outdir = output_dict["output_dir"] + "/metrics/",
        basedir = output_dict["output_dir"],
        script = "/opt/WG1-pipeline-QC/Imputation/scripts/sex_ancestry_summaries.R"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {params.basedir} {params.outdir}
        """

rule separate_indivs:
    input:
        sexcheck = output_dict["output_dir"] + "/pca_sex_checks/check_sex_update_remove.tsv",
        anc_check = output_dict["output_dir"] + "/pca_sex_checks/ancestry_update_remove.tsv"
    output:
        update_sex = output_dict["output_dir"] + "/separate_indivs/sex_update_indivs.tsv",
        remove_indiv = output_dict["output_dir"] + "/separate_indivs/remove_indivs.tsv",
        remove_indiv_temp = output_dict["output_dir"] + "/separate_indivs/remove_indivs_temp.tsv",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * 5,
        disk_per_thread_gb=lambda wildcards, attempt: attempt * 5
    threads: 1
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} grep "UPDATE" {input.sexcheck} | singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print($1,$2,$4)}}' | singularity exec --bind {params.bind} {params.sif} sed 's/SNPSEX/SEX/g' > {output.update_sex}
        singularity exec --bind {params.bind} {params.sif} grep "REMOVE" {input.sexcheck} | singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print($1,$2)}}' > {output.remove_indiv_temp}
        singularity exec --bind {params.bind} {params.sif} grep "REMOVE" {input.anc_check} | singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print($1,$2)}}' >> {output.remove_indiv_temp}
        singularity exec --bind {params.bind} {params.sif} sort -u {output.remove_indiv_temp} > {output.remove_indiv}
        """

rule update_sex_ancestry:
    input:
        bim = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness.pgen",
        psam = psam,
        update_sex = output_dict["output_dir"] + "/separate_indivs/sex_update_indivs.tsv",
        remove_indiv = output_dict["output_dir"] + "/separate_indivs/remove_indivs.tsv",
    output:
        bed = output_dict["output_dir"] + "/update_sex_ancestry/update_sex.pgen",
        bim = output_dict["output_dir"] + "/update_sex_ancestry/update_sex.pvar",
        psam = output_dict["output_dir"] + "/update_sex_ancestry/update_sex.psam",
    resources:
        mem_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["update_sex_memory"],
        disk_per_thread_gb=lambda wildcards, attempt: attempt * plink_gender_ancestry_QC_dict["update_sex_memory"]
    threads: plink_gender_ancestry_QC_dict["update_sex_threads"]
    params:
        anc_updated_psam = output_dict["output_dir"] + "/pca_sex_checks/updated_psam.psam",
        infile = output_dict["output_dir"] + "/indiv_missingness/indiv_missingness",
        psam_temp = output_dict["output_dir"] + "/update_sex_ancestry/temp/indiv_missingness.psam_temp",
        tdir = output_dict["output_dir"] + "/update_sex_ancestry/temp/",
        bind = input_dict["bind_paths"],
        out = output_dict["output_dir"] + "/update_sex_ancestry/update_sex",
        sif = input_dict["singularity_image"]
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} mkdir -p {params.tdir}
        singularity exec --bind {params.bind} {params.sif} cp {params.infile}* {params.tdir}
        singularity exec --bind {params.bind} {params.sif} cp {params.anc_updated_psam} {params.tdir}/indiv_missingness.psam
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.tdir}/indiv_missingness --update-sex {input.update_sex} --remove {input.remove_indiv} --make-pgen 'psam-cols='fid,parents,sex,phenos --out {params.out}
        """

rule subset_ancestry:
    input:
        psam = output_dict["output_dir"] + "/update_sex_ancestry/update_sex.psam"
    output:
        keep = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_individuals.psam",
        pgen = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.pgen",
        psam = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.psam",
        pvar = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset.pvar"
    resources:
        mem_per_thread_gb = lambda wildcards, attempt: attempt * 1,
        disk_per_thread_gb = lambda wildcards, attempt: attempt * 1
    threads: 1
    params:
        bind = input_dict["bind_paths"],
        sif = input_dict["singularity_image"],
        infile = output_dict["output_dir"] + "/update_sex_ancestry/update_sex",
        out = output_dict["output_dir"] + "/subset_ancestry/{ancestry}_subset"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} grep {wildcards.ancestry} {input.psam} > {output.keep}
        singularity exec --bind {params.bind} {params.sif} plink2 --threads {threads} --pfile {params.infile} --keep {output.keep} --max-alleles 2 --make-pgen 'psam-cols='fid,parents,sex,phenos --out {params.out}
        ...
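Every rule in both files above uses the same retry-scaling idiom in its resources block: Snakemake calls the resources function with the current attempt number (1 on the first try, incremented on each restart), so a job killed for exceeding its memory limit is resubmitted with proportionally more. A minimal runnable sketch of that behavior, with a base of 4 GB standing in for the dictionary lookup:

# The callable receives (wildcards, attempt); only `attempt` matters here.
def mem_per_thread_gb(wildcards, attempt, base_gb=4):
    # base_gb stands in for plink_gender_ancestry_QC_dict["<rule>_memory"]
    return attempt * base_gb

print(mem_per_thread_gb(None, 1))  # 4 GB on the first attempt
print(mem_per_thread_gb(None, 2))  # 8 GB after one restart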
Source: Snakefile
1import os2import sys3import argparse4import glob5from pathlib import Path6import re7if os.path.exists('config.yaml'):8 configfile: 'config.yaml'9# print function for debugging in snakemake10def eprint(*args, **kwargs):11 print(*args, file=sys.stderr, **kwargs)12input_dir=config['input_dir']13input_sample=config['input_sample']14core_genome=config['core_genome']15output_dir=config['output_dir']16include_to_final=config['include']17SNP_distance=config['SNP_distance']18prefix=config['prefix']19# check if files are fastq.gz:20def check_fastq_end():21 fastqs = glob.glob(input_dir+'/'+input_sample+"*R1*"); fastqs += glob.glob(input_dir+'/'+input_sample+"*R2*")22 for fastq in fastqs:23 if not fastq.endswith('fastq.gz') and not fastq.endswith('fq.gz'):24 raise Exception("ERROR Input files must be fastq.gz or fq.gz !")25 return26check_fastq_end()27READS={}28READS['R1'] = glob.glob(input_dir+'/'+input_sample+"*R1*")[0]29READS['R2'] = glob.glob(input_dir+'/'+input_sample+"*R2*")[0]30# check if the file is already present in the matrix. Authomatically set include to False if the file is already in the matrix.31def check_if_exists_in_matrix():32 myfile_dist=Path(output_dir+'/output_files/complete_clonetype_distance_matrix.txt')33 if myfile_dist.is_file():34 with open(output_dir+'/output_files/complete_clonetype_distance_matrix.txt') as f:35 names = [line.split('\t')[0] for line in f][1:]36 trimmed_names = [name.split('::')[1] for name in names]37 if input_sample in trimmed_names:38 if include_to_final is True:39 config['include']=False40 eprint("The file already exists in the matrix. Parameter \"Include\" is set to False")41check_if_exists_in_matrix()42def number_of_existing_samples():43 myfile_aln=Path(output_dir+'/output_files/complete_clonetype_sorted_alignment.fa')44 if myfile_aln.is_file():45 sample_no=int(sum(1 for line in open(output_dir+'/output_files/complete_clonetype_sorted_alignment.fa') if line.rstrip())/2)46 else:47 sample_no=048 return(sample_no)49sample_no=number_of_existing_samples()50# exclude all the names which are non-numeric51def follows_the_pattern(name):52 if(re.compile("[0-9][0-9][0-9]").match(name)):53 return True54 else:55 return False56def new_clonetype():57 myfile_dist=Path(output_dir+'/output_files/complete_clonetype_distance_matrix.txt')58 if myfile_dist.is_file():59 with open(output_dir+'/output_files/complete_clonetype_distance_matrix.txt') as f:60 names = [line.split('\t')[0] for line in f]61 num = [name.split('::')[0][-3:] for name in names]62 numbers=int(max(filter(follows_the_pattern, num)))+163 elif sample_no==1:64 numbers=265 else:66 numbers=167 return("{:03d}".format(numbers))68new_ct=new_clonetype()69now_ct=float(new_ct)-170eprint("Previously analyzed and added sample number:",sample_no)71eprint("Number of already existing clone types in the data set is:", int(now_ct))72rule all:73 input:74 vcf_gz=expand("{output_dir}/sample_alignments/{sample}/{sample}.vcf.gz", sample=config['input_sample'], output_dir=config['output_dir']),75 aligned=expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir']),76 stats=expand("{output_dir}/sample_alignments/{sample}/alignment_statistics.txt", sample=config['input_sample'], output_dir=config['output_dir']),77 snp_dist=expand("{output_dir}/sample_alignments/{sample}/{sample}_clonetype_snp_distances.txt", output_dir=config['output_dir'], sample=config['input_sample']) if sample_no>=1 else [],78 
initial_alignment=expand("{output_dir}/output_files/complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir']) if sample_no==0 else [],79 clonetype_stats=expand("{output_dir}/sample_alignments/{sample}/{sample}_clonetype_summary.txt", output_dir=config['output_dir'], sample=config['input_sample']) if sample_no>=1 else []80 message:81 "Pipeline complete"82# Run snippy on the input sample using the core reference genome83rule run_snippy:84 input:85 R1=READS['R1'],86 R2=READS['R2'] 87 output:88 vcf_gz=expand("{output_dir}/sample_alignments/{sample}/{sample}.vcf.gz", sample=config['input_sample'], output_dir=config['output_dir']),89 aligned=expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir']),90 ref=temp(expand("{output_dir}/sample_alignments/{sample}/ref.fa", sample=config['input_sample'], output_dir=config['output_dir'])),91 ref_fai=temp(expand("{output_dir}/sample_alignments/{sample}/ref.fa.fai", sample=config['input_sample'], output_dir=config['output_dir'])),92 bam=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.bam", sample=config['input_sample'], output_dir=config['output_dir'])),93 bam_bai=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.bam.bai", sample=config['input_sample'], output_dir=config['output_dir'])),94 raw_vcf=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.raw.vcf", sample=config['input_sample'], output_dir=config['output_dir'])),95 filt_vcf=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.filt.vcf", sample=config['input_sample'], output_dir=config['output_dir'])),96 vcf=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.vcf", sample=config['input_sample'], output_dir=config['output_dir'])),97 tab=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.tab", sample=config['input_sample'], output_dir=config['output_dir'])),98 subs_vcf=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.subs.vcf", sample=config['input_sample'], output_dir=config['output_dir'])),99 consensus_fa=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.consensus.fa", sample=config['input_sample'], output_dir=config['output_dir'])),100 consensus_subs_fa=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.consensus.subs.fa", sample=config['input_sample'], output_dir=config['output_dir'])),101 log_file=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.log", sample=config['input_sample'], output_dir=config['output_dir'])),102 bed=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.bed", sample=config['input_sample'], output_dir=config['output_dir'])),103 gff=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.gff", sample=config['input_sample'], output_dir=config['output_dir'])),104 csv=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.csv", sample=config['input_sample'], output_dir=config['output_dir'])),105 html=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.html", sample=config['input_sample'], output_dir=config['output_dir'])),106 txt=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.txt", sample=config['input_sample'], output_dir=config['output_dir']))107 108 params:109 outdir=config['output_dir'],110 ref=config['core_genome'],111 prefix=config['input_sample'],112 cpus=10,113 mincov=10114 message:115 "Running snippy variant calling for {params.prefix}."116 log:117 expand("{output_dir}/logs/snippy_{sample}.log", 
output_dir=config["output_dir"], sample=config['input_sample'])118 shell:119 """120 module unload anaconda3/4.0.0121 module load anaconda2/4.0.0122 module load tbl2asn/20190117123 module load parallel/20190122124 module load bwa/0.7.15125 module load samtools/1.9126 module load java/1.8.0127 module load jre/1.8.0128 module load perl/5.24.0129 module load freebayes/1.1.0-50-g61527c5130 module load vcflib/1.0.0-rc2131 module load vcftools/0.1.16132 module load snpeff/4.3r133 module load prokka/1.12134 module load minimap2/2.6135 module load seqtk/1.0-r82-dirty136 module load snp-sites/2.4.0137 module load emboss/6.6.0138 module load bcftools/1.9139 module load snippy/4.4.0140 module load vt/0.5772141 (snippy --outdir {params.outdir}/sample_alignments/{params.prefix} --ref {params.ref} --R1 {input.R1} --R2 {input.R2} --prefix {params.prefix} --cpus {params.cpus} --force --mincov {params.mincov}) 2> {log}142 # how to add to a previous SNP distance? cp - if include is No, add if include is Yes?143 """144def do_QC(fasta_file, output_file, log_file):145 with open(fasta_file, "r") as f:146 sequence = "".join(line.strip() for line in f if line[:1]!=">")147 seq_len = len(sequence)148 missing_len = sequence.count("N")+sequence.count("-")149 cons_positions = seq_len - missing_len150 missing_percent = (missing_len*100)/seq_len151 present_percent = 100 - missing_percent152 f.closed153 if missing_percent >20:154 with open(log_file, "w") as logf:155 logf.write("Core genome consists of {total} positions.\n\156In total {considered} positions were covered by not less than 10 reads.\n\157{covered}% of positions were covered by not less than 10 reads.\n".format(total=seq_len, considered=cons_positions, covered=round(present_percent, 2)))158 raise Exception("ERROR: The sequence alignment is less than 80% of the reference sequence. 
# Check if the alignment covers >=80% of the reference core genome
rule do_QC:
    input:
        expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir'])
    output:
        expand("{output_dir}/sample_alignments/{sample}/alignment_statistics.txt", sample=config['input_sample'], output_dir=config['output_dir'])
    log:
        expand("{output_dir}/sample_alignments/{sample}/unsuccessful_alignment_statistics.txt", sample=config['input_sample'], output_dir=config['output_dir'])
    message:
        "Performing quality check of the alignment coverage."
    run:
        do_QC(input[0], output[0], log[0])

if sample_no >= 2:
    # SNP distance analysis on the provided samples
    rule snp_dists:
        input:
            aligned_file=expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir']),
            initial_alignment=expand("{output_dir}/output_files/complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir']),
            alignment_statistics=expand("{output_dir}/sample_alignments/{sample}/alignment_statistics.txt", sample=config['input_sample'], output_dir=config['output_dir'])
        output:
            consensus_sample=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.consensus.fa", sample=config['input_sample'], output_dir=config['output_dir'])),
            snp_dist=expand("{output_dir}/sample_alignments/{sample}/{sample}_clonetype_snp_distances.txt", output_dir=config['output_dir'], sample=config['input_sample']),
            temp_names=temp(expand("{output_dir}/output_files/temporary_names_snp_dist.txt", output_dir=config['output_dir'])),
            temp_dist=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}_temp_distances.txt", sample=config['input_sample'], output_dir=config['output_dir'])),
            temp_fasta=temp(expand("{output_dir}/sample_alignments/{sample}/temp_fasta.fa", output_dir=config['output_dir'], sample=config['input_sample']))
        params:
            initial_distance=expand("{output_dir}/output_files/complete_clonetype_distance_matrix.txt", output_dir=config['output_dir']) if include_to_final == True else [],
            temp_consensus=temp(expand("{output_dir}/output_files/temp_complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir'])),
            sample=config['input_sample'],
            incl=config['include']
        message:
            "Performing SNP distance analysis on the {params.sample}."
        shell:
            """
            module load pigz/2.3.4
            module load seqkit/0.7.1
            # create a sorted FASTA file with all the genes merged into one sequence
            seqkit sort --id-regexp "(.+)" {input.aligned_file} | grep -v "^>" | awk '!/^>/ {{ printf "%s", $0; n = "\\n" }} /^>/ {{ print n $0; n = "" }} END {{ printf "%s", n }}' | sed "1s/^/>{params.sample}\\n/" > {output.consensus_sample}

            # run snp-dists for each sample pair (new sample vs every older sample)
            cat {input.initial_alignment} | nl | grep ">" > {output.temp_names}
            echo {params.sample} > {output.temp_dist}

            while read line name
            do
                # add 1 to the line number so that both the name and the sequence are included
                new_line=`echo "$line"+1 | bc`
                seq=`head -n $new_line {input.initial_alignment} | tail -2`
                new_seq=`cat {output.consensus_sample}`
                echo "$seq" > {output.temp_fasta}
                echo "$new_seq" >> {output.temp_fasta}
                dist=`snp-dists -q -b {output.temp_fasta} | tail -1 | cut -f 2`
                echo "$dist"
            done < {output.temp_names} >> {output.temp_dist}

            # merge the initial distance table with the new results
            paste {params.initial_distance} {output.snp_dist} > /dev/null 2>&1 || true
            paste {params.initial_distance} {output.temp_dist} > {output.snp_dist}

            # append the new sample's row (its column transposed, with 0 as the self-distance)
            hline=`cat {output.temp_dist} | tr "\\n" "\\t" | sed "s/$/0\\n/"`
            echo "$hline" >> {output.snp_dist}

            # if include is True, the complete SNP distance file and the initial alignment
            # file are updated with the new sample; the file name is changed in the next rule.
            if [[ {params.incl} == "True" ]]
            then
                cat {input.initial_alignment} {output.consensus_sample} > {params.temp_consensus}
                mv {params.temp_consensus} {input.initial_alignment}

                cp {output.snp_dist} {params.initial_distance}
            fi
            """
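For intuition, the while-loop above computes one pairwise distance per previously typed sample. A rough pure-Python equivalent, assuming snp-dists simply counts mismatching positions between equal-length aligned sequences (names and data below are hypothetical):

```python
# Rough sketch of the pairwise loop above (illustrative names and data).
def snp_distance(a: str, b: str) -> int:
    # Count differing positions; N/- (missing) positions are skipped here,
    # which is an assumption about how missing data should be treated.
    return sum(1 for x, y in zip(a, b)
               if x != y and x not in "N-" and y not in "N-")

older_samples = {"CT001::sampleA": "ACGTACGT", "CT002::sampleB": "ACCTACGA"}  # hypothetical
new_consensus = "ACGTACGA"
for name, seq in older_samples.items():
    print(name, snp_distance(seq, new_consensus))   # one distance per older sample
```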
if sample_no == 1:
    rule snp_dists:
        input:
            aligned_file=expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir']),
            initial_alignment=expand("{output_dir}/output_files/complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir']),
            alignment_statistics=expand("{output_dir}/sample_alignments/{sample}/alignment_statistics.txt", sample=config['input_sample'], output_dir=config['output_dir'])
        output:
            consensus_sample=temp(expand("{output_dir}/sample_alignments/{sample}/{sample}.consensus.fa", sample=config['input_sample'], output_dir=config['output_dir'])),
            snp_dist=expand("{output_dir}/sample_alignments/{sample}/{sample}_clonetype_snp_distances.txt", output_dir=config['output_dir'], sample=config['input_sample']),
            temp_fasta=temp(expand("{output_dir}/sample_alignments/{sample}/temp_fasta.fa", output_dir=config['output_dir'], sample=config['input_sample'])),
            initial_distance=expand("{output_dir}/output_files/complete_clonetype_distance_matrix.txt", output_dir=config['output_dir'])
        params:
            temp_consensus=temp(expand("{output_dir}/output_files/temp_complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir'])),
            sample=config['input_sample'],
            incl=config['include']
        message:
            "Performing SNP distance analysis on the {params.sample}."
        shell:
            """
            module load pigz/2.3.4
            module load seqkit/0.7.1
            # create a sorted FASTA file with all the genes merged into one sequence
            seqkit sort --id-regexp "(.+)" {input.aligned_file} | grep -v "^>" | awk '!/^>/ {{ printf "%s", $0; n = "\\n" }} /^>/ {{ print n $0; n = "" }} END {{ printf "%s", n }}' | sed "1s/^/>{params.sample}\\n/" > {output.consensus_sample}
            # run snp-dists on the only two samples that are available
            cat {input.initial_alignment} {output.consensus_sample} > {output.temp_fasta}
            snp-dists -q -b {output.temp_fasta} > {output.snp_dist}

            # the SNP distance file and the initial alignment file are updated with the
            # new sample; the file name is changed in the next rule.
            cat {input.initial_alignment} {output.consensus_sample} > {params.temp_consensus}
            mv {params.temp_consensus} {input.initial_alignment}
            cp {output.snp_dist} {output.initial_distance}
            """
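The seqkit | grep | awk | sed one-liner used by the snp_dists rules above (and by the rule after this sketch) collapses a per-gene multi-FASTA into a single renamed record. A minimal Python sketch of the same idea, with hypothetical record names:

```python
# Sketch of the FASTA-merging one-liner (hypothetical data).
def merge_fasta(records: dict, sample: str) -> str:
    # Sort by record ID, concatenate the sequences, emit one record named after the sample.
    merged = "".join(seq for _, seq in sorted(records.items()))
    return ">{}\n{}\n".format(sample, merged)

print(merge_fasta({"geneB": "CCGG", "geneA": "AATT"}, "sample1"), end="")
# >sample1
# AATTCCGG
```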
# If there are no previous alignments, the sample's alignment becomes the complete
# alignment file (if include is YES) and no further action is taken.
rule create_complete_alignment:
    input:
        aligned_file=expand("{output_dir}/sample_alignments/{sample}/{sample}.aligned.fa", sample=config['input_sample'], output_dir=config['output_dir'])
    output:
        initial_alignment=expand("{output_dir}/output_files/complete_clonetype_sorted_alignment.fa", output_dir=config['output_dir']) if sample_no<1 else []
    params:
        prefix=config['prefix'],
        sample=config['input_sample']
    message:
        "Adding the sample to the multiple sequence FASTA file"
    shell:
        """
        module load pigz/2.3.4
        module load seqkit/0.7.1
        # create a sorted FASTA file with all the genes merged into one sequence
        seqkit sort --id-regexp "(.+)" {input.aligned_file} | grep -v "^>" | awk '!/^>/ {{ printf "%s", $0; n = "\\n" }} /^>/ {{ print n $0; n = "" }} END {{ printf "%s", n }}' | sed "1s/^/>{params.prefix}001::{params.sample}\\n/" > {output.initial_alignment}
        """
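Clone-type bookkeeping relies on FASTA record names of the form {prefix}001::{sample}; the sed 's/::.*//' call in the next rule strips the sample part to recover the clone type. A one-line Python sketch, with a hypothetical name:

```python
# Sketch of the "{clonetype}::{sample}" naming convention (hypothetical name).
name = "CT001::sample42"
clonetype = name.split("::", 1)[0]   # what `sed 's/::.*//'` extracts -> "CT001"
print(clonetype)
```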
-z "$close_clonetypes" ]]313 then314 while read line315 do316 new_num=`echo "$line" +1 | bc`317 head -1 {input.initial_distance} | cut -f "$new_num" | sed 's/::.*//'318 done <<< "$close_clonetypes" >> {output.temp_clonetype_list}319 320 clonetype_number=`cat {output.temp_clonetype_list} | sort | uniq -c | sort -k1,1n`321 322 line_number=`echo "$clonetype_number" | wc -l`323 324 else325 touch {output.temp_clonetype_list}326 line_number=0327 fi328 329 # if only one clone type is predicted: 330 if [[ "$line_number" == 1 ]]331 then332 number=`echo "$clonetype_number" | awk '{{print $1}}'`333 clonetype=`echo "$clonetype_number" | awk '{{print $2}}'`334 total_number=`cat {input.snp_dist} | cut -f 1 | grep -w "$clonetype" | wc -l`335 336 echo "The {params.sample} sample has less than {params.distance} SNP distance to the "$clonetype" clone type.337In total "$number" out of "$total_number" samples with this clone type fall below the SNP distance threshold." > {output.stats}338 elif [[ "$line_number" > 1 ]]339 then340 echo "WARNING! More than one clone type has smaller SNP distance than {params.distance} to the {params.sample}.341You might want to reconsider SNP distance threshold. If the sample is set to be included in the final SNP distance matrix, the most often occuring clone type will be used. It might cause problems in the future analysis. \The following clone types are reported to be close to the {params.sample} sample:" > {output.stats}342 while read clonetype_line343 do344 number=`echo "$clonetype_line" | awk '{{print $1}}'`345 clonetype=`echo "$clonetype_line" | awk '{{print $2}}'`346 total_number=`cat {input.snp_dist} | cut -f 1 | grep -w "$clonetype" | wc -l`347 echo "Clonetype "$clonetype" in "$number" out of "$total_number" samples with this clonetype which fall below the SNP distance threshold." >> {output.stats}348 349 done <<< "$clonetype_number"350 351 elif [[ "$line_number" == 0 ]]352 then353 echo "No already existing clone types are closer than the {params.distance} SNP distance threshold.354New clone type assigned: {params.new_clonetype}" > {output.stats}355 clonetype={params.prefix}{params.new_clonetype}356 fi357 # if include is true change the name of complete snp distance and sorted alignment files to the name of the assigned clone type358 if [[ {params.incl} == "True" ]]359 then360 sed -i "s/^>{params.sample}/>\"$clonetype\"::{params.sample}/" {input.initial_alignment}361 sed -i "s/{params.sample}/\"$clonetype\"::{params.sample}/" {input.initial_distance}362 fi363 """...
Snakefile_CombineResults.smk
Source:Snakefile_CombineResults.smk
#!/usr/bin/env python
import os
import pandas as pd
from glob import glob

#################################
######## COMBINE RESULTS ########
#################################
if os.path.exists(output_dict["output_dir"] + "/manual_selections/scrublet/scrublet_percentile_manual_selection.tsv"):
    scrublet_selection = pd.read_csv(output_dict["output_dir"] + "/manual_selections/scrublet/scrublet_percentile_manual_selection.tsv", sep = "\t")

rule join_results:
    input:
        demuxlet = output_dict["output_dir"] + "/{pool}/CombinedResults/demuxlet_results.txt",
        souporcell = output_dict["output_dir"] + "/{pool}/CombinedResults/souporcell_results.txt",
        scrublet = lambda wildcards: expand(output_dict["output_dir"] + "/{pool}/CombinedResults/{pctl}_scrublet_results.txt", zip, pool = wildcards.pool, pctl = scrublet_selection.scrublet_Percentile[scrublet_selection.Pool == wildcards.pool]),
        scds = output_dict["output_dir"] + "/{pool}/CombinedResults/scds_results.txt",
        DoubletDetection = output_dict["output_dir"] + "/{pool}/CombinedResults/DoubletDetection_results.txt"
    output:
        output_dict["output_dir"] + "/{pool}/CombinedResults/CombinedDropletAssignments.tsv"
    resources:
        mem_per_thread_gb=5,
        disk_per_thread_gb=5
    threads: 1
    params:
        sif = input_dict["singularity_image"],
        bind = bind_path
    log: output_dict["output_dir"] + "/logs/join_results.{pool}.log"
    shell:
        """
        join -a1 -a2 -1 1 -2 1 -t "\t" -e"NA" -o "0,1.2,1.3,1.4,1.5,1.6,1.7,2.2,2.3,2.4,2.5" {input.demuxlet} {input.souporcell} | awk 'NR<2{{print $0;next}}{{print $0| "sort -k1"}}' | \
        join -a1 -a2 -1 1 -2 1 -t "\t" -e"NA" -o "0,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,2.2,2.3" - {input.scrublet} | awk 'NR<2{{print $0;next}}{{print $0| "sort -k1"}}' | \
        join -a1 -a2 -1 1 -2 1 -t "\t" -e"NA" -o "0,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,2.2,2.3" - {input.scds} | awk 'NR<2{{print $0;next}}{{print $0| "sort -k1"}}' | \
        join -a1 -a2 -1 1 -2 1 -t "\t" -e"NA" -o "0,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,2.2" - {input.DoubletDetection} > {output}
        """
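The chained join calls above perform a full outer join of the five per-tool tables on their first column (presumably the droplet barcode), filling gaps with NA. A pandas sketch of the same operation, with hypothetical column names and data:

```python
# Sketch of the chained `join` pipeline above as a pandas outer merge (hypothetical columns).
import pandas as pd
from functools import reduce

demuxlet = pd.DataFrame({"Barcode": ["AAA", "CCC"], "demuxlet_assign": ["s1", "s2"]})
scds = pd.DataFrame({"Barcode": ["AAA", "GGG"], "scds_score": [0.1, 0.9]})

tables = [demuxlet, scds]  # in the rule: demuxlet, souporcell, scrublet, scds, DoubletDetection
combined = reduce(lambda l, r: l.merge(r, on="Barcode", how="outer"), tables)
combined.to_csv("CombinedDropletAssignments.tsv", sep="\t", index=False, na_rep="NA")
```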
#####################################################################
############ SCRIPT FOR CALLING FINAL BARCODE ASSIGNMENT ############
#####################################################################
rule final_assignments:
    input:
        assignments = output_dict["output_dir"] + "/{pool}/CombinedResults/CombinedDropletAssignments_w_genotypeIDs.tsv"
    output:
        figure = report(output_dict["output_dir"] + "/{pool}/CombinedResults/DropletType_Assignment_BarPlot.png", category = "Number Individuals Summary", caption = "../report_captions/final_assignments.rst"),
        table = output_dict["output_dir"] + "/{pool}/CombinedResults/Final_Assignments_demultiplexing_doublets.txt",
        variables = output_dict["output_dir"] + "/{pool}/CombinedResults/variables.txt"
    resources:
        mem_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["FinalAssignments_memory"],
        disk_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["FinalAssignments_memory"]
    threads: CombineResults_dict["FinalAssignments_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = bind_path,
        out = output_dict["output_dir"],
        script = "/opt/WG1-pipeline-QC/Demultiplexing/scripts/FinalBarcodeAssignments.R"
    log: output_dict["output_dir"] + "/logs/final_assignments.{pool}.log"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} echo {params.out} > {output.variables}
        singularity exec --bind {params.bind} {params.sif} echo {wildcards.pool} >> {output.variables}
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {output.variables}
        [[ -s {output.figure} ]]
        echo $?
        """

####################################################
############ SCRIPT TO PRODUCE QC PLOTS ############
####################################################
rule echo_final_assignments:
    input:
        expand(output_dict["output_dir"] + "/{pool}/CombinedResults/Final_Assignments_demultiplexing_doublets.txt", pool = samples.Pool)
    output:
        output_dict["output_dir"] + "/QC_figures/final_assignments.txt"
    resources:
        mem_per_thread_gb=1,
        disk_per_thread_gb=1
    threads: 1
    params:
        sif = input_dict["singularity_image"],
        bind = bind_path
    log: output_dict["output_dir"] + "/logs/echo_final_assignments.log"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} echo {input} | singularity exec --bind {params.bind} {params.sif} tr ' ' '\n' >> {output}
        """

rule final_assignments_check:
    input:
        assignment_list = output_dict["output_dir"] + "/QC_figures/final_assignments.txt",
        meta = input_dict["samplesheet_filepath"]
    output:
        assignment_list = output_dict["output_dir"] + "/QC_figures/final_assignments_comparison.txt",
        meta = output_dict["output_dir"] + "/QC_figures/meta_comparison.txt"
    resources:
        mem_per_thread_gb=1,
        disk_per_thread_gb=1
    threads: 1
    params:
        sif = input_dict["singularity_image"],
        bind = bind_path
    log: output_dict["output_dir"] + "/logs/final_assignments_check.log"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} cat {input.assignment_list} > {output.assignment_list}
        singularity exec --bind {params.bind} {params.sif} awk 'BEGIN{{FS=OFS="\t"}}{{print $1}}' {input.meta} | singularity exec --bind {params.bind} {params.sif} tail -n+2 > {output.meta}
        if [ "$(wc -l < {output.meta})" -eq "$(wc -l < {output.assignment_list})" ]
        then
            echo 0
        else
            echo "The number of pools in the final_assignments.txt file doesn't match the number of pools in your sample sheet" > {log}
            rm {input.assignment_list}
            rm {output.assignment_list}
            rm {output.meta}
            echo 1
        fi
        """

rule expected_observed_numbers:
    input:
        final_assignment = output_dict["output_dir"] + "/QC_figures/final_assignments.txt",
        sample_sheet = input_dict["samplesheet_filepath"]
    output:
        report(output_dict["output_dir"] + "/QC_figures/expected_observed_individuals_classifications.png", category = "Number Individuals Summary", caption = "../report_captions/expected_observed_numbers.rst")
    resources:
        mem_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["expected_observed_numbers_memory"],
        disk_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["expected_observed_numbers_memory"]
    threads: CombineResults_dict["expected_observed_numbers_threads"]
    params:
        sif = input_dict["singularity_image"],
        bind = bind_path,
        script = "/opt/WG1-pipeline-QC/Demultiplexing/scripts/expected_observed_individuals_doublets.R",
        out = output_dict["output_dir"] + "/QC_figures/",
        basedir = output_dict["output_dir"]
    log: output_dict["output_dir"] + "/logs/expected_observed_numbers.log"
    shell:
        """
        singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {input.sample_sheet} {params.out} {params.basedir}
        """
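The final_assignments rule above and the QC_plots rule below both finish their shell block with `[[ -s <figure> ]]` followed by `echo $?`, so the job output records 0 only when the expected figure exists and is non-empty. A Python sketch of the same check, with a hypothetical path:

```python
# Sketch of the `[[ -s file ]]; echo $?` success check (hypothetical path).
import os

figure = "QC_figures/DropletType_Assignment_BarPlot.png"
exists_and_nonempty = os.path.isfile(figure) and os.path.getsize(figure) > 0
print(0 if exists_and_nonempty else 1)   # mirrors the shell exit status
```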
"/QC_figures/meta_comparison.txt"137 output:138 variables = temp(output_dict["output_dir"] + "/QC_figures/R_variables.txt"),139 fig1 = report(output_dict["output_dir"] + "/QC_figures/nCount_RNA_violin_MAD_All.png", category = "QC", caption = "../report_captions/QC_plots_nCount.rst"),140 fig2 = report(output_dict["output_dir"] + "/QC_figures/nCount_RNA_violin_MADper_Pool.png", category = "QC", caption = "../report_captions/QC_plots_nCount_RNA_MADall.rst"),141 fig3 = report(output_dict["output_dir"] + "/QC_figures/nCount_RNA_violin_noMADlines.png", category = "QC", caption = "../report_captions/QC_plots_nCount.rst"),142 fig4 = report(output_dict["output_dir"] + "/QC_figures/nFeature_RNA_violin_MAD_All.png", category = "QC", caption = "../report_captions/QC_plots_feature_MADall.rst"),143 fig5 = report(output_dict["output_dir"] + "/QC_figures/nFeature_RNA_violin_MADper_Pool.png", category = "QC", caption = "../report_captions/QC_plots_feature_MADperPool.rst"),144 fig6 = report(output_dict["output_dir"] + "/QC_figures/nFeature_RNA_violin_noMADlines.png", category = "QC", caption = "../report_captions/QC_plots_feature.rst"),145 fig7 = report(output_dict["output_dir"] + "/QC_figures/nFeatures_vs_percentMT_QC_scatter_colorPool.png", category = "QC", caption = "../report_captions/QC_plots_features_mt_pool.rst"),146 fig8 = report(output_dict["output_dir"] + "/QC_figures/nFeatures_vs_percentMT_QC_scatter_w_MADlines.png", category = "QC", caption = "../report_captions/QC_plots_features_mt_MAD.rst"),147 fig9 = report(output_dict["output_dir"] + "/QC_figures/percent.mt_violin_MAD_All.png", category = "QC", caption = "../report_captions/QC_plots_mt_MADall.rst"),148 fig10 = report(output_dict["output_dir"] + "/QC_figures/percent.mt_violin_MADper_Pool.png", category = "QC", caption = "../report_captions/QC_plots_mt_MADpool.rst"),149 fig11 = report(output_dict["output_dir"] + "/QC_figures/percent.mt_violin_noMADlines.png", category = "QC", caption = "../report_captions/QC_plots_mt.rst"),150 fig12 = report(output_dict["output_dir"] + "/QC_figures/percent.rb_violin_MAD_All.png", category = "QC", caption = "../report_captions/QC_plots_rb_MADall.rst"),151 fig13 = report(output_dict["output_dir"] + "/QC_figures/percent.rb_violin_MADper_Pool.png", category = "QC", caption = "../report_captions/QC_plots_rb_MADperPool.rst"),152 fig14 = report(output_dict["output_dir"] + "/QC_figures/percent.rb_violin_noMADlines.png", category = "QC", caption = "../report_captions/QC_plots_rb.rst"),153 fig15 = report(output_dict["output_dir"] + "/QC_figures/UMI_vs_Genes_QC_scatter.png", category = "QC", caption = "../report_captions/QC_plots_UMI_features.rst"),154 fig16 = report(output_dict["output_dir"] + "/QC_figures/UMI_vs_Genes_QC_scatter_w_MADlines.png", category = "QC", caption = "../report_captions/QC_plots_UMI_features_MADall.rst"),155 fig17 = report(output_dict["output_dir"] + "/QC_figures/UMI_vs_percentMT_QC_scatter_colorPool.png", category = "QC", caption = "../report_captions/QC_plots_UMI_mt_pool.rst"),156 fig18 = report(output_dict["output_dir"] + "/QC_figures/UMI_vs_percentMT_QC_scatter_w_MADlines.png", category = "QC", caption = "../report_captions/QC_plots_UMI_mt_MDA_all.rst"),157 resources:158 mem_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["FinalQC_memory"],159 disk_per_thread_gb = lambda wildcards, attempt: attempt * CombineResults_dict["FinalQC_memory"]160 threads: CombineResults_dict["FinalQC_threads"]161 params:162 sif = input_dict["singularity_image"],163 bind = bind_path,164 
script = "/opt/WG1-pipeline-QC/Demultiplexing/scripts/Singlet_QC_Figures.R",165 main_dir = output_dict["output_dir"],166 dirs10x = output_dict["output_dir"] + '/file_directories.txt',167 out = output_dict["output_dir"] + "/QC_figures/",168 rb_genes = "/opt/WG1-pipeline-QC/Demultiplexing/Ribosomal_genes.txt",169 mt_genes = "/opt/WG1-pipeline-QC/Demultiplexing/Mitochondrial_genes.txt"170 log: output_dict["output_dir"] + "/logs/QC_plots.log"171 shell:172 """173 singularity exec --bind {params.bind} {params.sif} echo {params.main_dir} > {output.variables}174 singularity exec --bind {params.bind} {params.sif} echo {input.pools} >> {output.variables}175 singularity exec --bind {params.bind} {params.sif} echo {params.dirs10x} >> {output.variables}176 singularity exec --bind {params.bind} {params.sif} echo {params.out} >> {output.variables}177 singularity exec --bind {params.bind} {params.sif} echo {params.rb_genes} >> {output.variables}178 singularity exec --bind {params.bind} {params.sif} echo {params.mt_genes} >> {output.variables}179 singularity exec --bind {params.bind} {params.sif} Rscript {params.script} {output.variables}180 [[ -s {output.fig18} ]]181 echo $?...
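QC_plots (like final_assignments above) passes its parameters to the R script through a temporary variables file, one value per line in a fixed order, which the script presumably reads back positionally. A minimal sketch of that convention, with illustrative paths:

```python
# Sketch of the one-value-per-line variables file consumed positionally by the Rscript
# (illustrative paths; the real rule writes main_dir, pools, dirs10x, out, rb_genes, mt_genes).
values = ["/out", "/out/QC_figures/meta_comparison.txt", "/out/file_directories.txt",
          "/out/QC_figures/", "Ribosomal_genes.txt", "Mitochondrial_genes.txt"]
with open("R_variables.txt", "w") as fh:
    fh.write("\n".join(values) + "\n")
```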