From 6d1c53e9122f1ea1e695fc4bee32f4be3942d36a Mon Sep 17 00:00:00 2001 From: gaow Date: Wed, 16 Feb 2022 11:17:31 -0500 Subject: [PATCH 01/63] Create README.md --- TWAS/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 TWAS/README.md diff --git a/TWAS/README.md b/TWAS/README.md new file mode 100644 index 0000000..d9975ba --- /dev/null +++ b/TWAS/README.md @@ -0,0 +1 @@ +# [obsolete] this workflow will soon be replaced by TWAS workflows in molecular phenotype analysis repo From ce719190c4c39bbb6684b258fec848dd2ea9b718 Mon Sep 17 00:00:00 2001 From: gaow Date: Wed, 16 Feb 2022 11:17:53 -0500 Subject: [PATCH 02/63] Create README.md --- multivariate-prediction/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 multivariate-prediction/README.md diff --git a/multivariate-prediction/README.md b/multivariate-prediction/README.md new file mode 100644 index 0000000..d9975ba --- /dev/null +++ b/multivariate-prediction/README.md @@ -0,0 +1 @@ +# [obsolete] this workflow will soon be replaced by TWAS workflows in molecular phenotype analysis repo From bb2c1f79f28280a003d8c91704d7249bfa559dd8 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Tue, 22 Feb 2022 16:38:47 -0500 Subject: [PATCH 03/63] add rename parameter --- GWAS/liftover.ipynb | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 9cd83cd..438e43f 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -62,7 +62,8 @@ "- `--ouput_file`, the name of ouput file which will be saved under `cwd` path\n", "- `--fr`, From reference genome, defaut is `hg19`\n", "- `--to`,To reference genome, defaut is `hg38`\n", - "- `--remove-missing`, boolen, Remove SNPs failed to liftover (defaults to False)" + "- `--remove-missing`, boolen, Remove SNPs failed to liftover (default to False)\n", + "- `--no-rename`, boolen, Rename variants' ID (default to False). 
**Only implemented to sumstat liftover**" ] }, { @@ -149,6 +150,8 @@ "parameter: to = 'hg38'\n", "# Remove SNPs failed to liftover (defaults to False)\n", "parameter: remove_missing = False\n", + "# Rename Variant ID\n", + "parameter: no_rename = False\n", "# Container\n", "#parameter: container = str" ] @@ -172,7 +175,7 @@ " from LDtools.genodata import *\n", " from LDtools.sumstat import Sumstat\n", " from LDtools.liftover import Liftover\n", - " def liftover(input_path,output_path,fr='hg19',to='hg38',remove_missing=True):\n", + " def liftover(input_path,output_path,fr='hg19',to='hg38',remove_missing=True,rename=True):\n", " lf = Liftover(fr,to)\n", " print(\"liftover from \" + fr +\" to \" +to)\n", " print(\"Removing SNPs failed to liftover is\", remove_missing)\n", @@ -203,8 +206,8 @@ " lf.vcf_liftover(input_path,output_path,remove_missing)\n", " else:\n", " print(\"This file is considered as sumstat format file\")\n", - " sums = Sumstat(input_path)\n", - " new_sums = lf.sumstat_liftover(sums.ss)\n", + " sums = Sumstat(input_path,rename=rename)\n", + " new_sums = lf.sumstat_liftover(sums.ss,rename)\n", " idx = new_sums.CHR == 0\n", " if remove_missing:\n", " new_sums[~idx].to_csv(output_path, compression='gzip', sep = \"\\t\", header = True, index = False)\n", @@ -239,8 +242,9 @@ " fr = f'${fr}'\n", " to = f'${to}'\n", " remove_missing=${remove_missing}\n", + " rename = ${no_rename}==False\n", " print(fr,to,remove_missing)\n", - " liftover(input_path,output_path,fr,to,remove_missing)" + " liftover(input_path,output_path,fr,to,remove_missing,rename)" ] } ], From bd491ebef67e11a70da93cfd784c1b0d5654fcc7 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Wed, 23 Feb 2022 09:42:43 -0500 Subject: [PATCH 04/63] update --- GWAS/liftover.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 438e43f..7209a1e 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -63,7 +63,7 @@ "- `--fr`, From 
reference genome, defaut is `hg19`\n", "- `--to`,To reference genome, defaut is `hg38`\n", "- `--remove-missing`, boolen, Remove SNPs failed to liftover (default to False)\n", - "- `--no-rename`, boolen, Rename variants' ID (default to False). **Only implemented to sumstat liftover**" + "- `--rename`, boolen, Rename variants' ID (default to True). **Only implemented to sumstat liftover**" ] }, { @@ -151,9 +151,9 @@ "# Remove SNPs failed to liftover (defaults to False)\n", "parameter: remove_missing = False\n", "# Rename Variant ID\n", - "parameter: no_rename = False\n", + "parameter: rename = True\n", "# Container\n", - "#parameter: container = str" + "parameter: container = str" ] }, { @@ -242,7 +242,7 @@ " fr = f'${fr}'\n", " to = f'${to}'\n", " remove_missing=${remove_missing}\n", - " rename = ${no_rename}==False\n", + " rename = ${rename}\n", " print(fr,to,remove_missing)\n", " liftover(input_path,output_path,fr,to,remove_missing,rename)" ] From b78f09ae33c659b5d6bbb8d209964ad45b1c1b37 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Thu, 24 Feb 2022 10:01:15 -0500 Subject: [PATCH 05/63] mtag formating --- GWAS/data/mtag_template.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 GWAS/data/mtag_template.yml diff --git a/GWAS/data/mtag_template.yml b/GWAS/data/mtag_template.yml new file mode 100644 index 0000000..83837ce --- /dev/null +++ b/GWAS/data/mtag_template.yml @@ -0,0 +1,11 @@ +# mtag summary statistics template +snpid: ID +chr: CHROM +bpos: GENPOS +a1: ALLELE1 #A1 needs to be the effect allele +a2: ALLELE0 # The other allele +freq: A1FREQ +beta: BETA +se: SE +pval: LOG10P +n: N From c9e97a1578170c5fe4fabca2d41384152b3efd9e Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Thu, 24 Feb 2022 15:46:52 -0500 Subject: [PATCH 06/63] Create ReadME.md --- LDSC/Deep_Learning/ReadME.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 LDSC/Deep_Learning/ReadME.md diff --git 
a/LDSC/Deep_Learning/ReadME.md b/LDSC/Deep_Learning/ReadME.md new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/LDSC/Deep_Learning/ReadME.md @@ -0,0 +1 @@ + From 5d426164be528fba50cbd91eeba9683fd64d3efa Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Thu, 24 Feb 2022 15:48:01 -0500 Subject: [PATCH 07/63] Started Work on Notebook with Deep Learning Included --- LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb | 403 +++++++++++++++++++++ 1 file changed, 403 insertions(+) create mode 100644 LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb new file mode 100644 index 0000000..d7e0c42 --- /dev/null +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb @@ -0,0 +1,403 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## SoS Workflow:\n", + "\n", + "This is the options and the SoS code to run the LDSC pipeline using your own data. 
\n", + "\n", + "## Command Interface:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "kernel": "SoS" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: sos run LDSC_DeepSea_Code.ipynb\n", + " [workflow_name | -t targets] [options] [workflow_options]\n", + " workflow_name: Single or combined workflows defined in this script\n", + " targets: One or more targets to generate\n", + " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", + " workflow_options: Double-hyphen workflow-specific parameters\n", + "\n", + "Workflows:\n", + " train_model\n", + " make_annot\n", + " format_annot\n", + " munge_sumstats_no_sign\n", + " munge_sumstats_sign\n", + " calc_ld_score\n", + " calc_enrichment\n", + "\n", + "Sections\n", + " train_model:\n", + " make_annot:\n", + " format_annot:\n", + " Workflow Options:\n", + " --full-annot VAL (as str, required)\n", + " path to full annotation file\n", + " --output VAL (as str, required)\n", + " path to output file directory\n", + " munge_sumstats_no_sign: This option is for when the summary statistic file\n", + " does not contain a signed summary statistic (Z or Beta).\n", + " In this case,the program will calculate Z for you based\n", + " on A1 being the risk allele\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output VAL (as str, required)\n", + " path to output file\n", + " munge_sumstats_sign: This option is for when the summary statistic file does\n", + " contain a signed summary statistic (Z or Beta)\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the 
munge_sumstats program\n", + " --output VAL (as str, required)\n", + " path to output file\n", + " calc_ld_score: Calculate LD Scores **Make sure to delete SNP,CHR, and\n", + " BP columns from annotation files if they are present\n", + " otherwise this code will not work. Before deleting, if\n", + " these columns are present, make sure that the annotation\n", + " file is sorted.**\n", + " Workflow Options:\n", + " --bim VAL (as str, required)\n", + " Path to bim file\n", + " --annot-file VAL (as str, required)\n", + " Path to annotation File. Make sure to remove the SNP,\n", + " CHR, and BP columns from the annotation file if present\n", + " before running.\n", + " --output VAL (as str, required)\n", + " name of output file\n", + " --snplist 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, remove the A1 and A2 columns\n", + " for the Calculate LD Scores program\n", + " calc_enrichment:\n", + " Workflow Options:\n", + " --sumstats VAL (as str, required)\n", + " Path to Summary statistics File\n", + " --ref-ld VAL (as str, required)\n", + " Path to Reference LD Scores Files (Base Annotation +\n", + " Annotation you want to analyze, format like minimal\n", + " working example)\n", + " --w-ld VAL (as str, required)\n", + " Path to LD Weight Files (Format like minimal working\n", + " example)\n", + " --frq-file VAL (as str, required)\n", + " path to frequency files (Format like minimal working\n", + " example)\n", + " --output VAL (as str, required)\n", + " Output name\n" + ] + } + ], + "source": [ + "!sos run LDSC_DeepSea_Code.ipynb -h" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Train Model:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[train_model]\n", + "\n", + "bash: 
container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", + "\n", + " python3.7 /mnt/mfs/statgen/Anmol/training_files/run_neuron_full_pipeline.py " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Make Full Annotation File Based on Trained Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[make_annot]\n", + "\n", + "bash: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", + "\n", + " python3.7 /mnt/mfs/statgen/Anmol/training_files/variant_pred_pipeline.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Format Annotation File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[format_annot]\n", + "\n", + "#path to full annotation file\n", + "parameter: full_annot = str\n", + "#path to output file directory\n", + "parameter: output = str\n", + "\n", + "R: container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", + " library(data.table)\n", + " library(tidyverse)\n", + " data = fread({full_annot})\n", + " data$V1 = gsub(\"chr\",\"\",data$V1)\n", + " data$V1 = as.numeric(data$V1)\n", + " features = colnames(data)[9:ncol(data)]\n", + " features = data.frame(features)\n", + " features$encoding = paste0(\"feat_\",seq(1,nrow(features)))\n", + " fwrite(features,paste0({output},\"feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " for (i in seq(1,22)){\n", + " data_2 = filter(data,V1==i)\n", + " data_2 = select(data_2,-c(seq(4,8)))\n", + " for (j in seq(4,ncol(data_2))){\n", + " data_3 = select(data_2,c(\"V1\",\"V2\",\"V3\",j))\n", + " data_3$CM = 0\n", + " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j),\"CM\")\n", + " fwrite(data_3,paste0({output},\"chr_\",j,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " }\n", + " 
}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Munge Summary Statistics (Option 1: No Signed Summary Statistic):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). \n", + "#In this case,the program will calculate Z for you based on A1 being the risk allele\n", + "[munge_sumstats_no_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output = str\n", + "\n", + "bash: \n", + " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Munge Summary Statistics (Option 2: No Signed Summary Statistic):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)\n", + "[munge_sumstats_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output = str\n", + "\n", + "bash: \n", + " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Calculate LD Scores:\n", + "\n", + "**Make sure to 
delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#Calculate LD Scores\n", + "#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**\n", + "[calc_ld_score]\n", + "\n", + "#Path to bim file\n", + "parameter: bim = str\n", + "#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running.\n", + "parameter: annot_file = str\n", + "#name of output file\n", + "parameter: output = str\n", + "#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program \n", + "parameter: snplist = \"w_hm3.snplist\"\n", + "\n", + "bash: \n", + " python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Calculate Functional Enrichment using Annotations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#Calculate Enrichment Scores for Functional Annotations\n", + "\n", + "[calc_enrichment]\n", + "\n", + "#Path to Summary statistics File\n", + "parameter: sumstats = str\n", + "#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example)\n", + "parameter: ref_ld = str\n", + "#Path to LD Weight Files (Format like minimal working example)\n", + "parameter: w_ld = str\n", + "#path to frequency files (Format like minimal working 
example)\n", + "parameter: frq_file = str\n", + "#Output name\n", + "parameter: output = str\n", + "\n", + "bash:\n", + " python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + "sos": { + "kernels": [ + [ + "Python 3 (ipykernel)", + "python3", + "python3", + "", + { + "name": "ipython", + "version": 3 + } + ], + [ + "SoS", + "sos", + "", + "", + "sos" + ] + ], + "panel": { + "displayed": true, + "height": 0 + }, + "version": "0.22.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From ca093b14008fe50f83a3fc1165213355b5b6fba8 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Thu, 24 Feb 2022 20:51:55 -0500 Subject: [PATCH 08/63] Delete LDSC_DeepSea_Code.ipynb --- LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb | 403 --------------------- 1 file changed, 403 deletions(-) delete mode 100644 LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb deleted file mode 100644 index d7e0c42..0000000 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Code.ipynb +++ /dev/null @@ -1,403 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## SoS Workflow:\n", - "\n", - "This is the options and the SoS code to run the LDSC pipeline using your own data. 
\n", - "\n", - "## Command Interface:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "kernel": "SoS" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "usage: sos run LDSC_DeepSea_Code.ipynb\n", - " [workflow_name | -t targets] [options] [workflow_options]\n", - " workflow_name: Single or combined workflows defined in this script\n", - " targets: One or more targets to generate\n", - " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", - " workflow_options: Double-hyphen workflow-specific parameters\n", - "\n", - "Workflows:\n", - " train_model\n", - " make_annot\n", - " format_annot\n", - " munge_sumstats_no_sign\n", - " munge_sumstats_sign\n", - " calc_ld_score\n", - " calc_enrichment\n", - "\n", - "Sections\n", - " train_model:\n", - " make_annot:\n", - " format_annot:\n", - " Workflow Options:\n", - " --full-annot VAL (as str, required)\n", - " path to full annotation file\n", - " --output VAL (as str, required)\n", - " path to output file directory\n", - " munge_sumstats_no_sign: This option is for when the summary statistic file\n", - " does not contain a signed summary statistic (Z or Beta).\n", - " In this case,the program will calculate Z for you based\n", - " on A1 being the risk allele\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the munge_sumstats program\n", - " --output VAL (as str, required)\n", - " path to output file\n", - " munge_sumstats_sign: This option is for when the summary statistic file does\n", - " contain a signed summary statistic (Z or Beta)\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the 
munge_sumstats program\n", - " --output VAL (as str, required)\n", - " path to output file\n", - " calc_ld_score: Calculate LD Scores **Make sure to delete SNP,CHR, and\n", - " BP columns from annotation files if they are present\n", - " otherwise this code will not work. Before deleting, if\n", - " these columns are present, make sure that the annotation\n", - " file is sorted.**\n", - " Workflow Options:\n", - " --bim VAL (as str, required)\n", - " Path to bim file\n", - " --annot-file VAL (as str, required)\n", - " Path to annotation File. Make sure to remove the SNP,\n", - " CHR, and BP columns from the annotation file if present\n", - " before running.\n", - " --output VAL (as str, required)\n", - " name of output file\n", - " --snplist 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, remove the A1 and A2 columns\n", - " for the Calculate LD Scores program\n", - " calc_enrichment:\n", - " Workflow Options:\n", - " --sumstats VAL (as str, required)\n", - " Path to Summary statistics File\n", - " --ref-ld VAL (as str, required)\n", - " Path to Reference LD Scores Files (Base Annotation +\n", - " Annotation you want to analyze, format like minimal\n", - " working example)\n", - " --w-ld VAL (as str, required)\n", - " Path to LD Weight Files (Format like minimal working\n", - " example)\n", - " --frq-file VAL (as str, required)\n", - " path to frequency files (Format like minimal working\n", - " example)\n", - " --output VAL (as str, required)\n", - " Output name\n" - ] - } - ], - "source": [ - "!sos run LDSC_DeepSea_Code.ipynb -h" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Train Model:" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "\n", - "[train_model]\n", - "\n", - "bash: 
container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", - "\n", - " python3.7 /mnt/mfs/statgen/Anmol/training_files/run_neuron_full_pipeline.py " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Make Full Annotation File Based on Trained Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "\n", - "[make_annot]\n", - "\n", - "bash: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", - "\n", - " python3.7 /mnt/mfs/statgen/Anmol/training_files/variant_pred_pipeline.py" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Format Annotation File" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "\n", - "[format_annot]\n", - "\n", - "#path to full annotation file\n", - "parameter: full_annot = str\n", - "#path to output file directory\n", - "parameter: output = str\n", - "\n", - "R: container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", - " library(data.table)\n", - " library(tidyverse)\n", - " data = fread({full_annot})\n", - " data$V1 = gsub(\"chr\",\"\",data$V1)\n", - " data$V1 = as.numeric(data$V1)\n", - " features = colnames(data)[9:ncol(data)]\n", - " features = data.frame(features)\n", - " features$encoding = paste0(\"feat_\",seq(1,nrow(features)))\n", - " fwrite(features,paste0({output},\"feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " for (i in seq(1,22)){\n", - " data_2 = filter(data,V1==i)\n", - " data_2 = select(data_2,-c(seq(4,8)))\n", - " for (j in seq(4,ncol(data_2))){\n", - " data_3 = select(data_2,c(\"V1\",\"V2\",\"V3\",j))\n", - " data_3$CM = 0\n", - " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j),\"CM\")\n", - " fwrite(data_3,paste0({output},\"chr_\",j,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " }\n", - " 
}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Munge Summary Statistics (Option 1: No Signed Summary Statistic):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). \n", - "#In this case,the program will calculate Z for you based on A1 being the risk allele\n", - "[munge_sumstats_no_sign]\n", - "\n", - "\n", - "\n", - "#path to summary statistic file\n", - "parameter: sumst = str\n", - "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", - "parameter: alleles = \"w_hm3.snplist\"\n", - "#path to output file\n", - "parameter: output = str\n", - "\n", - "bash: \n", - " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Munge Summary Statistics (Option 2: No Signed Summary Statistic):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)\n", - "[munge_sumstats_sign]\n", - "\n", - "\n", - "\n", - "#path to summary statistic file\n", - "parameter: sumst = str\n", - "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", - "parameter: alleles = \"w_hm3.snplist\"\n", - "#path to output file\n", - "parameter: output = str\n", - "\n", - "bash: \n", - " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Calculate LD Scores:\n", - "\n", - "**Make sure to 
delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#Calculate LD Scores\n", - "#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**\n", - "[calc_ld_score]\n", - "\n", - "#Path to bim file\n", - "parameter: bim = str\n", - "#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running.\n", - "parameter: annot_file = str\n", - "#name of output file\n", - "parameter: output = str\n", - "#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program \n", - "parameter: snplist = \"w_hm3.snplist\"\n", - "\n", - "bash: \n", - " python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Calculate Functional Enrichment using Annotations:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "#Calculate Enrichment Scores for Functional Annotations\n", - "\n", - "[calc_enrichment]\n", - "\n", - "#Path to Summary statistics File\n", - "parameter: sumstats = str\n", - "#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example)\n", - "parameter: ref_ld = str\n", - "#Path to LD Weight Files (Format like minimal working example)\n", - "parameter: w_ld = str\n", - "#path to frequency files (Format like minimal working 
example)\n", - "parameter: frq_file = str\n", - "#Output name\n", - "parameter: output = str\n", - "\n", - "bash:\n", - " python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output}" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "SoS", - "language": "sos", - "name": "sos" - }, - "language_info": { - "codemirror_mode": "sos", - "file_extension": ".sos", - "mimetype": "text/x-sos", - "name": "sos", - "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", - "pygments_lexer": "sos" - }, - "sos": { - "kernels": [ - [ - "Python 3 (ipykernel)", - "python3", - "python3", - "", - { - "name": "ipython", - "version": 3 - } - ], - [ - "SoS", - "sos", - "", - "", - "sos" - ] - ], - "panel": { - "displayed": true, - "height": 0 - }, - "version": "0.22.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From 905f21556cd58703c3c4aa399f2b636098260aac Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Thu, 24 Feb 2022 20:52:10 -0500 Subject: [PATCH 09/63] Add files via upload --- .../LDSC_DeepSea_Minimal_Example.ipynb | 445 ++++++++++++++++++ .../LDSC_DeepSea_Minimal_Example.sos | 139 ++++++ 2 files changed, 584 insertions(+) create mode 100644 LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb create mode 100644 LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb new file mode 100644 index 0000000..ca0012d --- /dev/null +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb @@ -0,0 +1,445 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## SoS Workflow:\n", + "\n", + "This is the options and the SoS code to run the LDSC pipeline using your own data. 
\n", + "\n", + "## Command Interface:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "kernel": "SoS" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: sos run LDSC_DeepSea_Code.ipynb\n", + " [workflow_name | -t targets] [options] [workflow_options]\n", + " workflow_name: Single or combined workflows defined in this script\n", + " targets: One or more targets to generate\n", + " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", + " workflow_options: Double-hyphen workflow-specific parameters\n", + "\n", + "Workflows:\n", + " train_model\n", + " make_annot\n", + " format_annot\n", + " munge_sumstats_no_sign\n", + " munge_sumstats_sign\n", + " calc_ld_score\n", + " calc_enrichment\n", + "\n", + "Sections\n", + " train_model:\n", + " make_annot:\n", + " Workflow Options:\n", + " --feature-list VAL (as str, required)\n", + " path to feature list file\n", + " --model VAL (as str, required)\n", + " path to trained model location\n", + " --output VAL (as str, required)\n", + " path to output directory\n", + " --num-features VAL (as int, required)\n", + " number of features\n", + " format_annot:\n", + " Workflow Options:\n", + " --full-annot VAL (as str, required)\n", + " path to full annotation file\n", + " --output VAL (as str, required)\n", + " path to output file directory\n", + " munge_sumstats_no_sign: This option is for when the summary statistic file\n", + " does not contain a signed summary statistic (Z or Beta).\n", + " In this case,the program will calculate Z for you based\n", + " on A1 being the risk allele\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output VAL (as str, required)\n", + " path to output file\n", + " munge_sumstats_sign: This option is for 
when the summary statistic file does\n", + " contain a signed summary statistic (Z or Beta)\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output VAL (as str, required)\n", + " path to output file\n", + " calc_ld_score: Calculate LD Scores **Make sure to delete SNP,CHR, and\n", + " BP columns from annotation files if they are present\n", + " otherwise this code will not work. Before deleting, if\n", + " these columns are present, make sure that the annotation\n", + " file is sorted.**\n", + " Workflow Options:\n", + " --bim VAL (as str, required)\n", + " Path to bim file\n", + " --annot-file VAL (as str, required)\n", + " Path to annotation File. Make sure to remove the SNP,\n", + " CHR, and BP columns from the annotation file if present\n", + " before running.\n", + " --output VAL (as str, required)\n", + " name of output file\n", + " --snplist 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, remove the A1 and A2 columns\n", + " for the Calculate LD Scores program\n", + " calc_enrichment:\n", + " Workflow Options:\n", + " --sumstats VAL (as str, required)\n", + " Path to Summary statistics File\n", + " --ref-ld VAL (as str, required)\n", + " Path to Reference LD Scores Files (Base Annotation +\n", + " Annotation you want to analyze, format like minimal\n", + " working example)\n", + " --w-ld VAL (as str, required)\n", + " Path to LD Weight Files (Format like minimal working\n", + " example)\n", + " --frq-file VAL (as str, required)\n", + " path to frequency files (Format like minimal working\n", + " example)\n", + " --output VAL (as str, required)\n", + " Output name\n" + ] + } + ], + "source": [ + "!sos run LDSC_DeepSea_Code.ipynb -h" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + 
"source": [] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Train Model:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[train_model]\n", + "\n", + "bash: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", + "\n", + " python3.7 /mnt/mfs/statgen/Anmol/training_files/tutorial/run_neuron_full_tutorial.py " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Make Full Annotation File Based on Trained Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[make_annot]\n", + "\n", + "#path to feature list file\n", + "parameter: feature_list = str\n", + "#path to trained model location\n", + "parameter: model = str\n", + "#path to output directory\n", + "parameter: output = str\n", + "#number of features\n", + "parameter: num_features = int\n", + "\n", + "python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", + "\n", + " from selene_sdk.utils import load_path\n", + " from selene_sdk.utils import parse_configs_and_run\n", + " from selene_sdk.predict import AnalyzeSequences\n", + " from selene_sdk.sequences import Genome\n", + " from selene_sdk.utils import load_features_list\n", + " from selene_sdk.utils import NonStrandSpecific\n", + " from selene_sdk.utils import DeeperDeepSEA\n", + " import glob\n", + " import os\n", + " distinct_features = load_features_list({feature_list})\n", + "\n", + " model_predict = AnalyzeSequences(\n", + " NonStrandSpecific(DeeperDeepSEA(1000,{num_features})),\n", + " {model},\n", + " sequence_length=1000,\n", + " features=distinct_features,\n", + " reference_sequence=Genome(\"/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta\"),\n", + " use_cuda=False # update this to False if you do not have CUDA on your machine.\n", + " 
)\n", + "\n", + " for i in range(1,22):\n", + " model_predict.variant_effect_prediction(\n", + " \"/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_\"+str(i)+\".vcf\",\n", + " save_data=[\"abs_diffs\"], # only want to save the absolute diff score data\n", + " output_dir={output})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Format Annotation File" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[format_annot]\n", + "\n", + "#path to tsv files directory\n", + "parameter: tsv = path()\n", + "#path to output file directory\n", + "parameter: output = path()\n", + "\n", + "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", + " library(data.table)\n", + " library(tidyverse)\n", + " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",22,\"_abs_diffs.tsv\"))\n", + " features = colnames(data)[9:ncol(data)]\n", + " features = data.frame(features)\n", + " features$encoding = paste0(\"feat_\",seq(1,nrow(features)))\n", + " fwrite(features,paste0(\"${output}\",\"/feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " for (i in seq(1,22)){\n", + " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",i,\"_abs_diffs.tsv\"))\n", + " data$V1 = gsub(\"chr\",\"\",data$V1)\n", + " data$V1 = as.numeric(data$V1)\n", + " data_2 = select(data,-c(seq(4,8)))\n", + " for (j in seq(4,ncol(data_2))){\n", + " data_3 = select(data_2,c(1,2,3,j))\n", + " data_3$CM = 0\n", + " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j),\"CM\")\n", + " fwrite(data_3,paste0(\"${output}\",\"/feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " }\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Munge Summary Statistics (Option 1: No Signed Summary Statistic):" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). \n", + "#In this case,the program will calculate Z for you based on A1 being the risk allele\n", + "[munge_sumstats_no_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output = str\n", + "\n", + "bash: \n", + " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Munge Summary Statistics (Option 2: No Signed Summary Statistic):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)\n", + "[munge_sumstats_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output = str\n", + "\n", + "bash: \n", + " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Calculate LD Scores:\n", + "\n", + "**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. 
Before deleting, if these columns are present, make sure that the annotation file is sorted.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#Calculate LD Scores\n", + "#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**\n", + "[calc_ld_score]\n", + "\n", + "#Path to bim file\n", + "parameter: bim = str\n", + "#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running.\n", + "parameter: annot_file = str\n", + "#name of output file\n", + "parameter: output = str\n", + "#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program \n", + "parameter: snplist = \"w_hm3.snplist\"\n", + "\n", + "bash: \n", + " python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Calculate Functional Enrichment using Annotations:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "#Calculate Enrichment Scores for Functional Annotations\n", + "\n", + "[calc_enrichment]\n", + "\n", + "#Path to Summary statistics File\n", + "parameter: sumstats = str\n", + "#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example)\n", + "parameter: ref_ld = str\n", + "#Path to LD Weight Files (Format like minimal working example)\n", + "parameter: w_ld = str\n", + "#path to frequency files (Format like minimal working example)\n", + "parameter: frq_file = str\n", + "#Output name\n", + "parameter: output = str\n", + "\n", + 
"bash:\n", + " python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + "sos": { + "kernels": [ + [ + "Python 3 (ipykernel)", + "python3", + "python3", + "", + { + "name": "ipython", + "version": 3 + } + ], + [ + "SoS", + "sos", + "", + "", + "sos" + ] + ], + "panel": { + "displayed": true, + "height": 0 + }, + "version": "0.22.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos new file mode 100644 index 0000000..c082c8c --- /dev/null +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos @@ -0,0 +1,139 @@ +#!/usr/bin/env sos-runner +#fileformat=SOS1.0 + +[train_model] + +bash: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' + + python3.7 /mnt/mfs/statgen/Anmol/training_files/tutorial/run_neuron_full_tutorial.py + +[make_annot] + +#path to feature list file +parameter: feature_list = str +#path to trained model location +parameter: model = str +#path to output directory +parameter: output = str +#number of features +parameter: num_features = int + +python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' + + from selene_sdk.utils import load_path + from selene_sdk.utils import parse_configs_and_run + from selene_sdk.predict import AnalyzeSequences + from selene_sdk.sequences import Genome + from selene_sdk.utils import load_features_list + from selene_sdk.utils import NonStrandSpecific + from selene_sdk.utils import DeeperDeepSEA + import glob + import os + distinct_features = load_features_list({feature_list}) + + model_predict = 
AnalyzeSequences( + NonStrandSpecific(DeeperDeepSEA(1000,{num_features})), + {model}, + sequence_length=1000, + features=distinct_features, + reference_sequence=Genome("/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta"), + use_cuda=False # update this to False if you do not have CUDA on your machine. + ) + + for i in range(1,22): + model_predict.variant_effect_prediction( + "/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_"+str(i)+".vcf", + save_data=["abs_diffs"], # only want to save the absolute diff score data + output_dir={output}) + +[format_annot] + +#path to tsv files directory +parameter: tsv = path() +#path to output file directory +parameter: output = path() + +R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" + library(data.table) + library(tidyverse) + data = fread(paste0("${tsv}","/tutorial_1000G_chr_",22,"_abs_diffs.tsv")) + features = colnames(data)[9:ncol(data)] + features = data.frame(features) + features$encoding = paste0("feat_",seq(1,nrow(features))) + fwrite(features,paste0("${output}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T) + for (i in seq(1,22)){ + data = fread(paste0("${tsv}","/tutorial_1000G_chr_",i,"_abs_diffs.tsv")) + data$V1 = gsub("chr","",data$V1) + data$V1 = as.numeric(data$V1) + data_2 = select(data,-c(seq(4,8))) + for (j in seq(4,ncol(data_2))){ + data_3 = select(data_2,c(1,2,3,j)) + data_3$CM = 0 + colnames(data_3) = c("CHR","BP","SNP",paste0("feat_",j),"CM") + fwrite(data_3,paste0("${output}","/feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) + } + } + +#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). 
+#In this case,the program will calculate Z for you based on A1 being the risk allele +[munge_sumstats_no_sign] + + + +#path to summary statistic file +parameter: sumst = str +#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program +parameter: alleles = "w_hm3.snplist" +#path to output file +parameter: output = str + +bash: + python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc + +# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta) +[munge_sumstats_sign] + + + +#path to summary statistic file +parameter: sumst = str +#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program +parameter: alleles = "w_hm3.snplist" +#path to output file +parameter: output = str + +bash: + python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} + +#Calculate LD Scores +#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.** +[calc_ld_score] + +#Path to bim file +parameter: bim = str +#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running. 
+parameter: annot_file = str +#name of output file +parameter: output = str +#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program +parameter: snplist = "w_hm3.snplist" + +bash: + python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist} + +[calc_enrichment] + +#Path to Summary statistics File +parameter: sumstats = str +#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example) +parameter: ref_ld = str +#Path to LD Weight Files (Format like minimal working example) +parameter: w_ld = str +#path to frequency files (Format like minimal working example) +parameter: frq_file = str +#Output name +parameter: output = str + +bash: + python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output} + From 5c47a0a6610ab1e078587e6185aaf985e8cc80c6 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Fri, 25 Feb 2022 11:57:31 -0500 Subject: [PATCH 10/63] add yml parameter --- GWAS/liftover.ipynb | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 7209a1e..e82c88c 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -59,6 +59,18 @@ " - if plink format, provide the path of `bim` file \n", " - if gvcf/vcf format, the file must have gvcf and vcf in suffixes\n", " - other format will be considered as sumstat format\n", + "- `--yml_file`, if the sumstat header doesn't have CHR, POS, A0 and A1 columns, you need to provide a ymal file to describe the format of your file, such as following. 
the first 5 row is required.\n", + "```\n", + "ID: CHR,POS,A0,A1\n", + "CHR: CHR\n", + "POS: POS\n", + "A0: REF\n", + "A1: ALT\n", + "SNP: SNP\n", + "STAT: BETA\n", + "SE: SE\n", + "P: P\n", + "```\n", "- `--ouput_file`, the name of ouput file which will be saved under `cwd` path\n", "- `--fr`, From reference genome, defaut is `hg19`\n", "- `--to`,To reference genome, defaut is `hg38`\n", @@ -129,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4665b6b1-5e14-41b1-8537-70082c3f38dd", "metadata": { "kernel": "SoS", @@ -142,6 +154,8 @@ "parameter: cwd = path\n", "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", + "# The path of yaml file with input file format, only for sumstat file.\n", + "parameter: yml_file = path #Fixme setting defualt to `None`\n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -175,7 +189,7 @@ " from LDtools.genodata import *\n", " from LDtools.sumstat import Sumstat\n", " from LDtools.liftover import Liftover\n", - " def liftover(input_path,output_path,fr='hg19',to='hg38',remove_missing=True,rename=True):\n", + " def liftover(input_path,output_path,yml=None,fr='hg19',to='hg38',remove_missing=True,rename=True):\n", " lf = Liftover(fr,to)\n", " print(\"liftover from \" + fr +\" to \" +to)\n", " print(\"Removing SNPs failed to liftover is\", remove_missing)\n", @@ -206,7 +220,7 @@ " lf.vcf_liftover(input_path,output_path,remove_missing)\n", " else:\n", " print(\"This file is considered as sumstat format file\")\n", - " sums = Sumstat(input_path,rename=rename)\n", + " sums = Sumstat(input_path,config_file=yml,rename=rename)\n", " new_sums = lf.sumstat_liftover(sums.ss,rename)\n", " idx = new_sums.CHR == 0\n", " if remove_missing:\n", @@ -243,8 +257,9 @@ " to = f'${to}'\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", + " 
yml_file = f'${yml_file}'\n", " print(fr,to,remove_missing)\n", - " liftover(input_path,output_path,fr,to,remove_missing,rename)" + " liftover(input_path,output_path,yml_file,fr,to,remove_missing,rename)" ] } ], From 2f8ffdc8908e58d168b39f54ed4cd046fa894e44 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Fri, 25 Feb 2022 13:26:50 -0500 Subject: [PATCH 11/63] update --- GWAS/liftover.ipynb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index e82c88c..15dac8d 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -58,7 +58,7 @@ "- `--input_file`, the path of input file which can be plink format, gvcf/vcf format, sumstat format.\n", " - if plink format, provide the path of `bim` file \n", " - if gvcf/vcf format, the file must have gvcf and vcf in suffixes\n", - " - other format will be considered as sumstat format\n", + " - other format will be considered as sumstat format, whose header should have CHR, POS, A0 and A1 columns\n", "- `--yml_file`, if the sumstat header doesn't have CHR, POS, A0 and A1 columns, you need to provide a ymal file to describe the format of your file, such as following. 
the first 5 row is required.\n", "```\n", "ID: CHR,POS,A0,A1\n", @@ -155,7 +155,7 @@ "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", "# The path of yaml file with input file format, only for sumstat file.\n", - "parameter: yml_file = path #Fixme setting defualt to `None`\n", + "parameter: yml_file = path() \n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -257,7 +257,10 @@ " to = f'${to}'\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = f'${yml_file}'\n", + " if yml_file.is_file(): \n", + " yml_file = f'${yml_file}'\n", + " else:\n", + " yml_file = None\n", " print(fr,to,remove_missing)\n", " liftover(input_path,output_path,yml_file,fr,to,remove_missing,rename)" ] From 6e9370652df733d4580edfd43794d2feec8c2345 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Sun, 27 Feb 2022 19:18:58 -0500 Subject: [PATCH 12/63] Add files via upload --- .../LDSC_DeepSea_Minimal_Example.ipynb | 188 +++++++----------- .../LDSC_DeepSea_Minimal_Example.sos | 67 +++++-- 2 files changed, 119 insertions(+), 136 deletions(-) diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb index ca0012d..d2a9d44 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb @@ -15,7 +15,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": { "kernel": "SoS" }, @@ -24,95 +24,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "usage: sos run LDSC_DeepSea_Code.ipynb\n", - " [workflow_name | -t targets] [options] [workflow_options]\n", - " workflow_name: Single or combined workflows defined in this script\n", - " targets: One or more targets to generate\n", - " options: Single-hyphen sos 
parameters (see \"sos run -h\" for details)\n", - " workflow_options: Double-hyphen workflow-specific parameters\n", - "\n", - "Workflows:\n", - " train_model\n", - " make_annot\n", - " format_annot\n", - " munge_sumstats_no_sign\n", - " munge_sumstats_sign\n", - " calc_ld_score\n", - " calc_enrichment\n", - "\n", - "Sections\n", - " train_model:\n", - " make_annot:\n", - " Workflow Options:\n", - " --feature-list VAL (as str, required)\n", - " path to feature list file\n", - " --model VAL (as str, required)\n", - " path to trained model location\n", - " --output VAL (as str, required)\n", - " path to output directory\n", - " --num-features VAL (as int, required)\n", - " number of features\n", - " format_annot:\n", - " Workflow Options:\n", - " --full-annot VAL (as str, required)\n", - " path to full annotation file\n", - " --output VAL (as str, required)\n", - " path to output file directory\n", - " munge_sumstats_no_sign: This option is for when the summary statistic file\n", - " does not contain a signed summary statistic (Z or Beta).\n", - " In this case,the program will calculate Z for you based\n", - " on A1 being the risk allele\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the munge_sumstats program\n", - " --output VAL (as str, required)\n", - " path to output file\n", - " munge_sumstats_sign: This option is for when the summary statistic file does\n", - " contain a signed summary statistic (Z or Beta)\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the munge_sumstats program\n", - " --output VAL (as str, required)\n", - " path to output file\n", - " calc_ld_score: Calculate LD Scores **Make sure to delete SNP,CHR, 
and\n", - " BP columns from annotation files if they are present\n", - " otherwise this code will not work. Before deleting, if\n", - " these columns are present, make sure that the annotation\n", - " file is sorted.**\n", - " Workflow Options:\n", - " --bim VAL (as str, required)\n", - " Path to bim file\n", - " --annot-file VAL (as str, required)\n", - " Path to annotation File. Make sure to remove the SNP,\n", - " CHR, and BP columns from the annotation file if present\n", - " before running.\n", - " --output VAL (as str, required)\n", - " name of output file\n", - " --snplist 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, remove the A1 and A2 columns\n", - " for the Calculate LD Scores program\n", - " calc_enrichment:\n", - " Workflow Options:\n", - " --sumstats VAL (as str, required)\n", - " Path to Summary statistics File\n", - " --ref-ld VAL (as str, required)\n", - " Path to Reference LD Scores Files (Base Annotation +\n", - " Annotation you want to analyze, format like minimal\n", - " working example)\n", - " --w-ld VAL (as str, required)\n", - " Path to LD Weight Files (Format like minimal working\n", - " example)\n", - " --frq-file VAL (as str, required)\n", - " path to frequency files (Format like minimal working\n", - " example)\n", - " --output VAL (as str, required)\n", - " Output name\n" + "No help information is available for script run: Failed to locate LDSC_DeepSea_Code.ipynb.sos\r\n" ] } ], @@ -180,8 +92,7 @@ "parameter: model = str\n", "#path to output directory\n", "parameter: output = str\n", - "#number of features\n", - "parameter: num_features = int\n", + "\n", "\n", "python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", "\n", @@ -247,13 +158,14 @@ " fwrite(features,paste0(\"${output}\",\"/feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (i in seq(1,22)){\n", " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",i,\"_abs_diffs.tsv\"))\n", - " data$V1 = gsub(\"chr\",\"\",data$V1)\n", - " 
data$V1 = as.numeric(data$V1)\n", - " data_2 = select(data,-c(seq(4,8)))\n", + " data_2 = select(data,-seq(4,8))\n", + " base = data.frame(base=rep(1,nrow(data_2)))\n", + " fwrite(base,paste0(\"${output}\",\"/base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (j in seq(4,ncol(data_2))){\n", " data_3 = select(data_2,c(1,2,3,j))\n", - " data_3$CM = 0\n", - " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j),\"CM\")\n", + " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j))\n", + " data_3 = setorder(data_3,BP)\n", + " data_3 = select(data_3,-c(\"CHR\",\"BP\",\"SNP\"))\n", " fwrite(data_3,paste0(\"${output}\",\"/feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " }\n", " }" @@ -345,21 +257,71 @@ }, "outputs": [], "source": [ - "#Calculate LD Scores\n", - "#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**\n", + "\n", "[calc_ld_score]\n", "\n", - "#Path to bim file\n", - "parameter: bim = str\n", - "#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running.\n", - "parameter: annot_file = str\n", - "#name of output file\n", - "parameter: output = str\n", - "#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program \n", - "parameter: snplist = \"w_hm3.snplist\"\n", + "#Path to directory with bim files\n", + "parameter: bim = path()\n", + "#Path to directory with annotation files, output will appear here too. 
Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running.\n", + "parameter: annot_files = path()\n", + "#number of features\n", + "parameter: num_features = int\n", "\n", "bash: \n", - " python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist}" + " #echo {annot_files} > out.txt\n", + " for i in $(seq 1 {num_features});do for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/feat_${i}_chr_${j}.annot.gz --thin-annot --out {annot_files}/feat_${i}_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done; done\n", + " for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/base_chr_${j}.annot.gz --thin-annot --out {annot_files}/base_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Convert LD Score SNPs to AD Summary Statistic Format:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[convert_ld_snps]\n", + "\n", + "#Path to directory with ld score files AND annotation files\n", + "parameter: ld_scores = str\n", + "\n", + "parameter: num_features = int\n", + "\n", + "\n", + "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", + " library(tidyverse)\n", + " #library(R.utils)\n", + " library(data.table)\n", + " for (i in seq(1,22)){\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".l2.ldscore.gz\")))\n", + " data_2 = fread(paste0(\"${ld_scores}/base_chr_\",i,\".l2.M_5_50\"))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".annot.gz\")))\n", + " data$SNP 
= paste0(data$CHR,\":\",data$BP)\n", + " fwrite(data,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(data_2,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", + " fwrite(data_3,paste0(\"${ld_scores}/AD_base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " for (j in seq(1,${num_features})){\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\")))\n", + " data_2 = fread(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.M_5_50\"))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".annot.gz\")))\n", + " data$SNP = paste0(data$CHR,\":\",data$BP)\n", + " fwrite(data,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(data_2,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", + " fwrite(data_3,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " }\n", + " }\n", + " \n" ] }, { @@ -415,16 +377,6 @@ }, "sos": { "kernels": [ - [ - "Python 3 (ipykernel)", - "python3", - "python3", - "", - { - "name": "ipython", - "version": 3 - } - ], [ "SoS", "sos", diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos index c082c8c..d3bee84 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos @@ -15,8 +15,7 @@ parameter: feature_list = str parameter: model = str #path to output directory parameter: output = str -#number of features -parameter: num_features = int + python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' @@ -63,13 +62,14 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" 
fwrite(features,paste0("${output}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T) for (i in seq(1,22)){ data = fread(paste0("${tsv}","/tutorial_1000G_chr_",i,"_abs_diffs.tsv")) - data$V1 = gsub("chr","",data$V1) - data$V1 = as.numeric(data$V1) - data_2 = select(data,-c(seq(4,8))) + data_2 = select(data,-seq(4,8)) + base = data.frame(base=rep(1,nrow(data_2))) + fwrite(base,paste0("${output}","/base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) for (j in seq(4,ncol(data_2))){ data_3 = select(data_2,c(1,2,3,j)) - data_3$CM = 0 - colnames(data_3) = c("CHR","BP","SNP",paste0("feat_",j),"CM") + colnames(data_3) = c("CHR","BP","SNP",paste0("feat_",j)) + data_3 = setorder(data_3,BP) + data_3 = select(data_3,-c("CHR","BP","SNP")) fwrite(data_3,paste0("${output}","/feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) } } @@ -105,21 +105,52 @@ parameter: output = str bash: python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} -#Calculate LD Scores -#**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.** [calc_ld_score] -#Path to bim file -parameter: bim = str -#Path to annotation File. Make sure to remove the SNP, CHR, and BP columns from the annotation file if present before running. -parameter: annot_file = str -#name of output file -parameter: output = str -#path to Hapmap3 SNPs file, remove the A1 and A2 columns for the Calculate LD Scores program -parameter: snplist = "w_hm3.snplist" +#Path to directory with bim files +parameter: bim = path() +#Path to directory with annotation files, output will appear here too. Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running. 
+parameter: annot_files = path() +#number of features +parameter: num_features = int bash: - python2 ldsc.py --bfile {bim} --l2 --ld-wind-cm 1 --annot {annot_file} --thin-annot --out {output} --print-snps {snplist} + #echo {annot_files} > out.txt + for i in $(seq 1 {num_features});do for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/feat_${i}_chr_${j}.annot.gz --thin-annot --out {annot_files}/feat_${i}_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done; done + for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/base_chr_${j}.annot.gz --thin-annot --out {annot_files}/base_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done + +[convert_ld_snps] + +#Path to directory with ld score files AND annotation files +parameter: ld_scores = str + +parameter: num_features = int + + +R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" + library(tidyverse) + #library(R.utils) + library(data.table) + for (i in seq(1,22)){ + data = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".l2.ldscore.gz"))) + data_2 = fread(paste0("${ld_scores}/base_chr_",i,".l2.M_5_50")) + data_3 = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".annot.gz"))) + data$SNP = paste0(data$CHR,":",data$BP) + fwrite(data,paste0("${ld_scores}/AD_base_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) + fwrite(data_2,paste0("${ld_scores}/AD_base_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) + fwrite(data_3,paste0("${ld_scores}/AD_base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) + for (j in seq(1,${num_features})){ + data = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.ldscore.gz"))) + data_2 = fread(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.M_5_50")) + 
data_3 = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".annot.gz"))) + data$SNP = paste0(data$CHR,":",data$BP) + fwrite(data,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) + fwrite(data_2,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) + fwrite(data_3,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) + } + } + + [calc_enrichment] From 0581278daf3d4b5bfa9660877ce81233b0f464eb Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Tue, 1 Mar 2022 12:28:24 -0500 Subject: [PATCH 13/63] updates to liftover --- GWAS/LMM.ipynb | 5 ++++- GWAS/data/mtag_template.yml | 15 +++++++-------- GWAS/liftover.ipynb | 10 +++++----- 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index 4d6e6aa..506de68 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -1051,6 +1051,8 @@ "parameter: trait = 'bt'\n", "# in the case of bgen data from UKBB ref_first should be set to true\n", "parameter: ref_first= False\n", + "# Specify dominant or recessive test. 
Default is additiveß\n", + "parameter: test = ''\n", "input: genoFile, group_by = 1, group_with = dict(info=[(path(f'{cwd}/{phenoFile:bn}_' + \"_\".join([x for x in phenoCol]) + '.regenie_pred.list'))] * len(genoFile))\n", "input_options = f\"--bgen {_input} --sample {sampleFile}\" if _input.suffix == \".bgen\" else f\"--bed {_input:n}\"\n", "output: [f'{cwd}/cache/{_input:bn}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]\n", @@ -1070,6 +1072,7 @@ " --bsize ${bsize} \\\n", " --minMAC ${minMAC} \\\n", " --minINFO ${bgenMinINFO}\\\n", + " ${('--test ' + test) if test in ['dominant','recessive','additive'] else ''} \\\n", " --threads ${numThreads} \\\n", " --out ${cwd}/cache/${_input:bn} && \\\n", " gzip -f --best ${_output:n}" @@ -2398,7 +2401,7 @@ "displayed": true, "height": 0 }, - "version": "0.22.9" + "version": "0.22.6" }, "toc-showcode": false }, diff --git a/GWAS/data/mtag_template.yml b/GWAS/data/mtag_template.yml index 83837ce..6c8fcd4 100644 --- a/GWAS/data/mtag_template.yml +++ b/GWAS/data/mtag_template.yml @@ -1,11 +1,10 @@ # mtag summary statistics template -snpid: ID -chr: CHROM -bpos: GENPOS -a1: ALLELE1 #A1 needs to be the effect allele -a2: ALLELE0 # The other allele -freq: A1FREQ +chr: CHR +bpos: POS +a2: A0 # The other allele +a1: A1 #A1 needs to be the effect allele +freq: freq beta: BETA se: SE -pval: LOG10P -n: N +pval: P +n: n diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index e82c88c..8757b14 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -155,7 +155,7 @@ "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", "# The path of yaml file with input file format, only for sumstat file.\n", - "parameter: yml_file = path #Fixme setting defualt to `None`\n", + "parameter: yml_file = path('.') \n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -253,11 
+253,11 @@ " \n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = f'${fr}'\n", - " to = f'${to}'\n", + " fr = '${fr}'\n", + " to = '${to}'\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = f'${yml_file}'\n", + " yml_file = '${yml_file}'\n", " print(fr,to,remove_missing)\n", " liftover(input_path,output_path,yml_file,fr,to,remove_missing,rename)" ] @@ -294,7 +294,7 @@ "sos" ] ], - "version": "0.22.7" + "version": "0.22.6" } }, "nbformat": 4, From 2e6a75da79b6defdef6ec28412e63dcc05701b00 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Wed, 2 Mar 2022 08:30:14 -0500 Subject: [PATCH 14/63] Finished Minimal Example Code and Added yml file to run it --- .../LDSC_DeepSea_Minimal_Example.ipynb | 224 ++++++++++++++---- .../LDSC_DeepSea_Minimal_Example.sos | 85 ++++--- LDSC/Deep_Learning/all_neuron_tutorial.yml | 50 ++++ 3 files changed, 277 insertions(+), 82 deletions(-) create mode 100644 LDSC/Deep_Learning/all_neuron_tutorial.yml diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb index d2a9d44..e1f2147 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb @@ -6,16 +6,25 @@ "kernel": "SoS" }, "source": [ - "## SoS Workflow:\n", + "## Tutorial Workflow for LDSC with DeepSea Integration:\n", "\n", - "This is the options and the SoS code to run the LDSC pipeline using your own data. \n", + "This is the code to run the minimal working example for LDSC with DeepSea Integration. The code will train the deepsea model on the set of features provided using the .yml file on the google drive folder, get predictions on the reference genome from the trained model, and run LDSC on the resulting predictions to output enrichments. 
\n", "\n", - "## Command Interface:" + "If you would like to use a different set of features or change training parameters, please edit the .yml file provided and everything else will still work.\n", + "\n", + "This is the command to run the Minimal Working Example:\n", + "\n", + "\n", + "sos run LDSC_DeepSea_Code.ipynb --model /mnt/mfs/statgen/Anmol/training_files/tutorial/training_outputs/model --feature_list /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt --output_tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --annot_files /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files --sumst /mnt/mfs/statgen/Anmol/polyfun/Dey/PGCALZ2sumstatsExcluding23andMe.txt --output_sumst /mnt/mfs/statgen/Anmol/polyfun/Dey/2021.Updated.sumstats.gz --signed True --bim /mnt/mfs/statgen/Anmol/training_files/tutorial/plink_files --num_features 7 --ld_scores /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files --ctrl_sumstats /mnt/mfs/statgen/Anmol/polyfun/Dey/AMD.sumstats.gz --AD_sumstats /mnt/mfs/statgen/Anmol/polyfun/Dey/2021.Updated.sumstats.gz --w_ld_ctrl /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/tutorial_data/weights_hm3_no_hla/weights. --frq_file_ctrl /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/frq/1000G.EUR.QC. --w_ld_AD /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/tutorial_data/weights_hm3_no_hla/weights.2021. --frq_file_AD /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/frq/1000G.2021.EUR.QC. 
--ref_ld annot_files --pheno AMD\n", + "\n", + "## Command Interface:\n", + "\n", + "This is the list of commands and workflows with explanations for each one" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 18, "metadata": { "kernel": "SoS" }, @@ -24,21 +33,116 @@ "name": "stdout", "output_type": "stream", "text": [ - "No help information is available for script run: Failed to locate LDSC_DeepSea_Code.ipynb.sos\r\n" + "usage: sos run LDSC_DeepSea_Minimal_Example.ipynb\n", + " [workflow_name | -t targets] [options] [workflow_options]\n", + " workflow_name: Single or combined workflows defined in this script\n", + " targets: One or more targets to generate\n", + " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", + " workflow_options: Double-hyphen workflow-specific parameters\n", + "\n", + "Workflows:\n", + " train_model\n", + " make_annot\n", + " format_annot\n", + " munge_sumstats_no_sign\n", + " munge_sumstats_sign\n", + " calc_ld_score\n", + " convert_ld_snps\n", + " calc_enrichment\n", + "\n", + "Sections\n", + " train_model:\n", + " make_annot:\n", + " Workflow Options:\n", + " --feature-list VAL (as str, required)\n", + " path to feature list file\n", + " --model VAL (as str, required)\n", + " path to trained model location\n", + " --output-tsv VAL (as str, required)\n", + " path to output directory\n", + " format_annot:\n", + " Workflow Options:\n", + " --tsv . (as path)\n", + " path to tsv files directory\n", + " --annot-files . 
(as path)\n", + " path to output file directory\n", + " munge_sumstats_no_sign:\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst VAL (as str, required)\n", + " path to output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " munge_sumstats_sign: This option is for when the summary statistic file does\n", + " contain a signed summary statistic (Z or Beta)\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst-2 VAL (as str, required)\n", + " path to output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " calc_ld_score:\n", + " Workflow Options:\n", + " --bim . (as path)\n", + " Path to directory with bim files\n", + " --annot-files . (as path)\n", + " Path to directory with annotation files, output will\n", + " appear here too. 
Make sure to remove the SNP, CHR, and\n", + " BP columns from the annotation files if present before\n", + " running.\n", + " --num-features VAL (as int, required)\n", + " number of features\n", + " convert_ld_snps:\n", + " Workflow Options:\n", + " --ld-scores VAL (as str, required)\n", + " Path to directory with ld score files AND annotation\n", + " files\n", + " --num-features VAL (as int, required)\n", + " calc_enrichment:\n", + " Workflow Options:\n", + " --ctrl-sumstats VAL (as str, required)\n", + " Path to Control Summary statistics File\n", + " --AD-sumstats VAL (as str, required)\n", + " Path to AD Summary statistics File\n", + " --ref-ld VAL (as str, required)\n", + " Path to Reference LD Scores File Directory\n", + " --w-ld-ctrl VAL (as str, required)\n", + " Path to LD Weight Files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --frq-file-ctrl VAL (as str, required)\n", + " path to frequency files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --w-ld-AD VAL (as str, required)\n", + " Path to LD Weight Files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --frq-file-AD VAL (as str, required)\n", + " path to frequency files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --num-features VAL (as int, required)\n", + " Number of Features\n", + " --pheno VAL (as str, required)\n", + " Control Phenotype, For Output\n", + "\n" ] } ], "source": [ - "!sos run LDSC_DeepSea_Code.ipynb -h" + "!sos run LDSC_DeepSea_Code.ipynb -h\n" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { - "kernel": "SoS" + "kernel": "Python 3 (ipykernel)" }, - "outputs": [], "source": [] }, { @@ -83,6 +187,8 @@ }, "outputs": [], "source": [ + "# Get Predictions for Features based on Trained Model\n", + "\n", "\n", "[make_annot]\n", "\n", @@ -91,7 +197,7 @@ "#path to trained model location\n", "parameter: model = str\n", "#path to output 
directory\n", - "parameter: output = str\n", + "parameter: output_tsv = str\n", "\n", "\n", "python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", @@ -109,14 +215,14 @@ "\n", " model_predict = AnalyzeSequences(\n", " NonStrandSpecific(DeeperDeepSEA(1000,{num_features})),\n", - " {model},\n", + " {model}+\"/best_model.pth.tar\",\n", " sequence_length=1000,\n", " features=distinct_features,\n", " reference_sequence=Genome(\"/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta\"),\n", " use_cuda=False # update this to False if you do not have CUDA on your machine.\n", " )\n", "\n", - " for i in range(1,22):\n", + " for i in range(1,23):\n", " model_predict.variant_effect_prediction(\n", " \"/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_\"+str(i)+\".vcf\",\n", " save_data=[\"abs_diffs\"], # only want to save the absolute diff score data\n", @@ -140,13 +246,15 @@ }, "outputs": [], "source": [ + "# Separate Annotation Files by Chromosome\n", + "\n", "\n", "[format_annot]\n", "\n", "#path to tsv files directory\n", "parameter: tsv = path()\n", "#path to output file directory\n", - "parameter: output = path()\n", + "parameter: annot_files = path()\n", "\n", "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", " library(data.table)\n", @@ -155,18 +263,18 @@ " features = colnames(data)[9:ncol(data)]\n", " features = data.frame(features)\n", " features$encoding = paste0(\"feat_\",seq(1,nrow(features)))\n", - " fwrite(features,paste0(\"${output}\",\"/feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(features,paste0(\"${annot_files}\",\"/feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (i in seq(1,22)){\n", " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",i,\"_abs_diffs.tsv\"))\n", " data_2 = select(data,-seq(4,8))\n", " base = data.frame(base=rep(1,nrow(data_2)))\n", - " 
fwrite(base,paste0(\"${output}\",\"/base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(base,paste0(\"${annot_files}\",\"/base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (j in seq(4,ncol(data_2))){\n", " data_3 = select(data_2,c(1,2,3,j))\n", " colnames(data_3) = c(\"CHR\",\"BP\",\"SNP\",paste0(\"feat_\",j))\n", " data_3 = setorder(data_3,BP)\n", " data_3 = select(data_3,-c(\"CHR\",\"BP\",\"SNP\"))\n", - " fwrite(data_3,paste0(\"${output}\",\"/feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(data_3,paste0(\"${annot_files}\",\"/feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " }\n", " }" ] @@ -188,8 +296,8 @@ }, "outputs": [], "source": [ - "#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). \n", - "#In this case,the program will calculate Z for you based on A1 being the risk allele\n", + "# Option when Summary Statistic File does not contain a Z or Beta Column (Signed Summary Statistic)\n", + "\n", "[munge_sumstats_no_sign]\n", "\n", "\n", @@ -199,10 +307,15 @@ "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", "parameter: alleles = \"w_hm3.snplist\"\n", "#path to output file\n", - "parameter: output = str\n", - "\n", - "bash: \n", - " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc" + "parameter: output_sumst = str\n", + "#does summary statistic contain Z or Beta\n", + "parameter: signed = False\n", + "\n", + "bash: expand = '${ }'\n", + " if [${signed}==True]\n", + " then\n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc\n", + " fi" ] }, { @@ -211,7 +324,7 @@ "kernel": "SoS" }, "source": [ - "## Munge Summary Statistics (Option 2: No Signed Summary Statistic):" + "## 
Munge Summary Statistics (Option 2: Contains Signed Summary Statistic):" ] }, { @@ -232,10 +345,15 @@ "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", "parameter: alleles = \"w_hm3.snplist\"\n", "#path to output file\n", - "parameter: output = str\n", - "\n", - "bash: \n", - " python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output}" + "parameter: output_sumst_2 = str\n", + "#does summary statistic contain Z or Beta\n", + "parameter: signed = False\n", + "\n", + "bash: expand = '${ }'\n", + " if [${signed}==False]\n", + " then\n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2}\n", + " fi" ] }, { @@ -267,10 +385,10 @@ "#number of features\n", "parameter: num_features = int\n", "\n", - "bash: \n", + "bash: expand = '${ }'\n", " #echo {annot_files} > out.txt\n", - " for i in $(seq 1 {num_features});do for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/feat_${i}_chr_${j}.annot.gz --thin-annot --out {annot_files}/feat_${i}_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done; done\n", - " for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/base_chr_${j}.annot.gz --thin-annot --out {annot_files}/base_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done" + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile 
${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt" ] }, { @@ -290,6 +408,8 @@ }, "outputs": [], "source": [ + "# Convert SNP format in LD Score Files to CHR:BP to match with AD Summary Statistic Format\n", + "\n", "\n", "[convert_ld_snps]\n", "\n", @@ -304,17 +424,17 @@ " #library(R.utils)\n", " library(data.table)\n", " for (i in seq(1,22)){\n", - " data = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".l2.ldscore.gz\")))\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", " data_2 = fread(paste0(\"${ld_scores}/base_chr_\",i,\".l2.M_5_50\"))\n", - " data_3 = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".annot.gz\")))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".annot.gz\")),header=T)\n", " data$SNP = paste0(data$CHR,\":\",data$BP)\n", " fwrite(data,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " fwrite(data_2,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", " fwrite(data_3,paste0(\"${ld_scores}/AD_base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (j in seq(1,${num_features})){\n", - " data = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\")))\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", " data_2 = fread(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.M_5_50\"))\n", - " data_3 = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".annot.gz\")))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".annot.gz\")),header=T)\n", " data$SNP = paste0(data$CHR,\":\",data$BP)\n", " 
fwrite(data,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " fwrite(data_2,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", @@ -341,23 +461,31 @@ }, "outputs": [], "source": [ - "#Calculate Enrichment Scores for Functional Annotations\n", "\n", "[calc_enrichment]\n", "\n", - "#Path to Summary statistics File\n", - "parameter: sumstats = str\n", - "#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example)\n", + "#Path to Control Summary statistics File\n", + "parameter: ctrl_sumstats = str\n", + "#Path to AD Summary statistics File\n", + "parameter: AD_sumstats = str\n", + "#Path to Reference LD Scores File Directory \n", "parameter: ref_ld = str\n", - "#Path to LD Weight Files (Format like minimal working example)\n", - "parameter: w_ld = str\n", - "#path to frequency files (Format like minimal working example)\n", - "parameter: frq_file = str\n", - "#Output name\n", - "parameter: output = str\n", - "\n", - "bash:\n", - " python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output}" + "#Path to LD Weight Files for Control Sumstats (Format like minimal working example)\n", + "parameter: w_ld_ctrl = str\n", + "#path to frequency files for Control Sumstats (Format like minimal working example)\n", + "parameter: frq_file_ctrl = str\n", + "#Path to LD Weight Files for AD Sumstats (Format like minimal working example)\n", + "parameter: w_ld_AD = str\n", + "#path to frequency files for AD Sumstats (Format like minimal working example)\n", + "parameter: frq_file_AD = str\n", + "#Number of Features\n", + "parameter: num_features = int \n", + "#Control Phenotype, For Output\n", + "parameter: pheno = str\n", + "\n", + "bash: expand = '${ }'\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 
/mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j" ] } ], diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos index d3bee84..ee3dd6f 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos @@ -14,7 +14,7 @@ parameter: feature_list = str #path to trained model location parameter: model = str #path to output directory -parameter: output = str +parameter: output_tsv = str python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' @@ -32,14 +32,14 @@ python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' model_predict = AnalyzeSequences( NonStrandSpecific(DeeperDeepSEA(1000,{num_features})), - {model}, + {model}+"/best_model.pth.tar", sequence_length=1000, features=distinct_features, reference_sequence=Genome("/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta"), use_cuda=False # update this to False if you do not have CUDA on your machine. 
) - for i in range(1,22): + for i in range(1,23): model_predict.variant_effect_prediction( "/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_"+str(i)+".vcf", save_data=["abs_diffs"], # only want to save the absolute diff score data @@ -50,7 +50,7 @@ python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' #path to tsv files directory parameter: tsv = path() #path to output file directory -parameter: output = path() +parameter: annot_files = path() R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" library(data.table) @@ -59,23 +59,21 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" features = colnames(data)[9:ncol(data)] features = data.frame(features) features$encoding = paste0("feat_",seq(1,nrow(features))) - fwrite(features,paste0("${output}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T) + fwrite(features,paste0("${annot_files}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T) for (i in seq(1,22)){ data = fread(paste0("${tsv}","/tutorial_1000G_chr_",i,"_abs_diffs.tsv")) data_2 = select(data,-seq(4,8)) base = data.frame(base=rep(1,nrow(data_2))) - fwrite(base,paste0("${output}","/base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) + fwrite(base,paste0("${annot_files}","/base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) for (j in seq(4,ncol(data_2))){ data_3 = select(data_2,c(1,2,3,j)) colnames(data_3) = c("CHR","BP","SNP",paste0("feat_",j)) data_3 = setorder(data_3,BP) data_3 = select(data_3,-c("CHR","BP","SNP")) - fwrite(data_3,paste0("${output}","/feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) + fwrite(data_3,paste0("${annot_files}","/feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) } } -#This option is for when the summary statistic file does not contain a signed summary statistic (Z or Beta). 
-#In this case,the program will calculate Z for you based on A1 being the risk allele [munge_sumstats_no_sign] @@ -85,10 +83,15 @@ parameter: sumst = str #path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program parameter: alleles = "w_hm3.snplist" #path to output file -parameter: output = str +parameter: output_sumst = str +#does summary statistic contain Z or Beta +parameter: signed = False -bash: - python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} --a1-inc +bash: expand = '${ }' + if [${signed}==True] + then + python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc + fi # This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta) [munge_sumstats_sign] @@ -100,10 +103,15 @@ parameter: sumst = str #path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program parameter: alleles = "w_hm3.snplist" #path to output file -parameter: output = str +parameter: output_sumst_2 = str +#does summary statistic contain Z or Beta +parameter: signed = False -bash: - python2 munge_sumstats.py --sumstats {sumst} --merge-alleles {alleles} --out {output} +bash: expand = '${ }' + if [${signed}==False] + then + python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2} + fi [calc_ld_score] @@ -114,10 +122,10 @@ parameter: annot_files = path() #number of features parameter: num_features = int -bash: +bash: expand = '${ }' #echo {annot_files} > out.txt - for i in $(seq 1 {num_features});do for j in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/feat_${i}_chr_${j}.annot.gz --thin-annot --out {annot_files}/feat_${i}_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done; done - for j 
in {1..22}; do python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile {bim}/1000G.EUR.QC.${j} --l2 --ld-wind-cm 1 --annot {annot_files}/base_chr_${j}.annot.gz --thin-annot --out {annot_files}/base_chr_${j} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt; done + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt [convert_ld_snps] @@ -132,17 +140,17 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" #library(R.utils) library(data.table) for (i in seq(1,22)){ - data = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".l2.ldscore.gz"))) + data = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".l2.ldscore.gz")),header=T) data_2 = fread(paste0("${ld_scores}/base_chr_",i,".l2.M_5_50")) - data_3 = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".annot.gz"))) + data_3 = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".annot.gz")),header=T) data$SNP = paste0(data$CHR,":",data$BP) fwrite(data,paste0("${ld_scores}/AD_base_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) fwrite(data_2,paste0("${ld_scores}/AD_base_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) fwrite(data_3,paste0("${ld_scores}/AD_base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) for (j in seq(1,${num_features})){ - data = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.ldscore.gz"))) + data = 
read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.ldscore.gz")),header=T) data_2 = fread(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.M_5_50")) - data_3 = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".annot.gz"))) + data_3 = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".annot.gz")),header=T) data$SNP = paste0(data$CHR,":",data$BP) fwrite(data,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) fwrite(data_2,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) @@ -154,17 +162,26 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" [calc_enrichment] -#Path to Summary statistics File -parameter: sumstats = str -#Path to Reference LD Scores Files (Base Annotation + Annotation you want to analyze, format like minimal working example) +#Path to Control Summary statistics File +parameter: ctrl_sumstats = str +#Path to AD Summary statistics File +parameter: AD_sumstats = str +#Path to Reference LD Scores File Directory parameter: ref_ld = str -#Path to LD Weight Files (Format like minimal working example) -parameter: w_ld = str -#path to frequency files (Format like minimal working example) -parameter: frq_file = str -#Output name -parameter: output = str - -bash: - python2 ldsc.py --h2 {sumstats} --ref-ld-chr {ref_ld} --w-ld-chr {w_ld} --overlap-annot --frqfile-chr {frq_file} --out {output} +#Path to LD Weight Files for Control Sumstats (Format like minimal working example) +parameter: w_ld_ctrl = str +#path to frequency files for Control Sumstats (Format like minimal working example) +parameter: frq_file_ctrl = str +#Path to LD Weight Files for AD Sumstats (Format like minimal working example) +parameter: w_ld_AD = str +#path to frequency files for AD Sumstats (Format like minimal working example) +parameter: frq_file_AD = str +#Number of Features +parameter: num_features = int +#Control Phenotype, For Output 
+parameter: pheno = str + +bash: expand = '${ }' + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j diff --git a/LDSC/Deep_Learning/all_neuron_tutorial.yml b/LDSC/Deep_Learning/all_neuron_tutorial.yml new file mode 100644 index 0000000..30ca02a --- /dev/null +++ b/LDSC/Deep_Learning/all_neuron_tutorial.yml @@ -0,0 +1,50 @@ +--- +ops: [train, evaluate] +model: { + path: /mnt/mfs/statgen/Anmol/training_files/deeperdeepsea.py,#UPDATE + class: DeeperDeepSEA, + class_args: { + sequence_length: 1000, + n_targets: 7, + }, + non_strand_specific: mean +} +sampler: !obj:selene_sdk.samplers.IntervalsSampler { + reference_sequence: !obj:selene_sdk.sequences.Genome { + input_path: /mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta,#UPDATE + blacklist_regions: hg19 + }, + features: !obj:selene_sdk.utils.load_features_list { + input_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt #UPDATE + }, + target_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial.bed.gz, #UPDATE + intervals_path: /mnt/mfs/statgen/Anmol/training_files/DNase_Intervals_FULL.txt, #UPDATE + seed: 127, + # A positive example is an 1000bp sequence with at least 1 class/feature annotated to it. + # A negative sample has no classes/features annotated to the sequence. 
+ sample_negative: True, + sequence_length: 1000, + center_bin_to_predict: 200, + test_holdout: 0.2, + validation_holdout: 0.3, + # The feature must take up 50% of the bin (200bp) for it to be considered + # a feature annotated to that sequence. + feature_thresholds: 0.25, + mode: train, + save_datasets: [validate, test] +} +train_model: !obj:selene_sdk.TrainModel { + batch_size: 64, + max_steps: 500, # update this value for longer training + report_stats_every_n_steps: 250, + n_validation_samples: 25000, + n_test_samples: 125000, + cpu_n_threads: 32, + use_cuda: False, # TODO: update this if CUDA is not on your machine + data_parallel: False +} +random_seed: 1447 +output_dir: ./tutorial/training_outputs/model #UPDATE +create_subdirectory: False +load_test_set: False +... \ No newline at end of file From c840df1b73bff8e241b5599de3af8b1c96388c4a Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Wed, 2 Mar 2022 17:08:06 -0500 Subject: [PATCH 15/63] change walltime and memory to be global variables easily modifiable --- GWAS/LMM.ipynb | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index 506de68..1790ad4 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -356,6 +356,8 @@ "parameter: bgenMinINFO = 0.8\n", "# For cluster jobs, number commands to run per job\n", "parameter: job_size = 1\n", + "parameter: mem = '15G'\n", + "parameter: walltime = '10h'\n", "# The container with the lmm software. 
Can be either a dockerhub image or a singularity `sif` file.\n", "# Default is set to using dockerhub image\n", "parameter: container_lmm = 'statisticalgenetics/lmm:2.4'\n", @@ -703,7 +705,7 @@ "input: genoFile, group_by = 1\n", "output: f'{cwd}/cache/{_input:bn}.{phenoFile:bn}_{phenoCol[0]}.boltlmm.snp_stats.gz'\n", "file_options=f\"--bfile {bfile:n} --bgenFile={_input} --bgenMinMAF={bgenMinMAF} --bgenMinINFO={bgenMinINFO} --sampleFile={sampleFile} --statsFileBgenSnps={_output} --statsFile={_output:nn}.ref_stats.gz \" if _input.suffix == \".bgen\" else f\"--bfile={_input:n} --statsFile={_output} \"\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f\"{cwd:a}:{cwd:a}\"]\n", " bolt \\\n", " --phenoFile=${phenoFile} \\\n", @@ -772,7 +774,7 @@ "output: f'{cwd}/cache/{_input:bn}.part_{_part_number}.grm.bin', \n", " f'{cwd}/cache/{_input:bn}.part_{_part_number}.grm.N.bin', \n", " f'{cwd}/cache/{_input:bn}.part_{_part_number}.grm.id'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '48G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " gcta64 \\\n", " --bfile ${_input[0]:n} \\\n", @@ -804,7 +806,7 @@ "output: f'{cwd}/{bfile:bn}.grm.bin', \n", " f'{cwd}/{bfile:bn}.grm.N.bin', \n", " f'{cwd}/{bfile:bn}.grm.id' \n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '2h', mem = '6G', cores = 1, 
tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = 1, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " # here input is results all parts each having 3 items. We need to get the corresponding every other 3 items\n", " cat ${paths(_input[::3])} > ${_output[0]}\n", @@ -833,7 +835,7 @@ "# Make a sparse GRM from the merged full-dense GRM\n", "[gcta_3]\n", "output: f'{cwd}/{bfile:bn}.grm.sp' \n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '48G', cores = 1, tags = f'{step_name}_{_output:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = 1, tags = f'{step_name}_{_output:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'\n", " gcta64 --grm ${_output:nn} --make-bK-sparse 0.05 --out ${_output:nn}" ] @@ -874,7 +876,7 @@ "input_options = f\"--bgen {_input} --info {bgenMinINFO} --sample {sampleFile}\" if _input.suffix == \".bgen\" else f\"--bfile {_input:n}\"\n", "output: f'{cwd}/cache/{_input:bnn}.{phenoFile:bn}.fastGWA.gz'\n", "fail_if(not path(f'{_input}.bgi').is_file() and _input.suffix == '.bgen', msg = f'Cannot find file ``{_input}.bgi``. 
Please generate it using command ``bgenix -g {_input} -index``.') if _input.suffix == \".bgen\" else f\"continue\"\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '5G', cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", " gcta64 \\\n", " ${input_options} \\\n", @@ -954,7 +956,7 @@ "parameter: mind_filter = 0.0\n", "input: bfile\n", "output: f'{cwd}/cache/{bfile:bn}.qc_pass.id', f'{cwd}/cache/{bfile:bn}.qc_pass.snplist' \n", - "task: trunk_workers = 1, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: container=container_lmm, expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' \n", " plink2 \\\n", " --bfile ${bfile:n} --mac 1 \\\n", @@ -1009,7 +1011,7 @@ "depends: f'{cwd}/cache/{bfile:bn}.qc_pass.snplist', f'{cwd}/cache/{bfile:bn}.qc_pass.id'\n", "input: geno = bfile, pheno = f\"{cwd}/{phenoFile:bn}.regenie_phenotype\", covar = f\"{cwd}/{phenoFile:bn}.regenie_covar\", qc = output_from(\"regenie_qc\")\n", "output: f'{cwd}/{phenoFile:bn}_' + \"_\".join([x for x in phenoCol]) + '.regenie_pred.list'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: container=container_lmm, expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', volumes = [f\"{lowmem_dir:a}:{lowmem_dir:a}\"]\n", " regenie \\\n", " 
--step 1 \\\n", @@ -1052,11 +1054,11 @@ "# in the case of bgen data from UKBB ref_first should be set to true\n", "parameter: ref_first= False\n", "# Specify dominant or recessive test. Default is additiveß\n", - "parameter: test = ''\n", + "parameter: test = 'additive'\n", "input: genoFile, group_by = 1, group_with = dict(info=[(path(f'{cwd}/{phenoFile:bn}_' + \"_\".join([x for x in phenoCol]) + '.regenie_pred.list'))] * len(genoFile))\n", "input_options = f\"--bgen {_input} --sample {sampleFile}\" if _input.suffix == \".bgen\" else f\"--bed {_input:n}\"\n", "output: [f'{cwd}/cache/{_input:bn}_'+ str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash:container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f\"{cwd:a}:{cwd:a}\"]\n", " set -e\n", " regenie \\\n", @@ -1071,8 +1073,8 @@ " --pred ${_input.info} \\\n", " --bsize ${bsize} \\\n", " --minMAC ${minMAC} \\\n", - " --minINFO ${bgenMinINFO}\\\n", - " ${('--test ' + test) if test in ['dominant','recessive','additive'] else ''} \\\n", + " --minINFO ${bgenMinINFO} \\\n", + " ${('--test ' + test) if test in ['dominant','recessive','additive'] else ''} \\\n", " --threads ${numThreads} \\\n", " --out ${cwd}/cache/${_input:bn} && \\\n", " gzip -f --best ${_output:n}" @@ -1120,7 +1122,7 @@ "input: genoFile, group_by = 1, group_with = dict(info=[(path(f'{cwd}/{phenoFile:bn}_' + \"_\".join([x for x in phenoCol]) + '.regenie_pred.list'))] * len(genoFile))\n", "input_options = f\"--bgen {_input} --sample {sampleFile}\" if _input.suffix == \".bgen\" else f\"--bed {_input:n}\"\n", "output: [f'{cwd}/cache/{_input:bn}_burden_'+ 
str(phenoCol[i]) + '.regenie.gz' for i in range(len(phenoCol))]\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash:container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.log', volumes = [f\"{cwd:a}:{cwd:a}\"]\n", " set -e\n", " regenie \\\n", @@ -1166,7 +1168,7 @@ "f = open(mask_file, \"r\")\n", "masks = [i.split(\" \")[0] for i in f.readlines()]\n", "output: [f'{cwd}/cache/{_input:bn}_'+ str(phenoCol[i]) + \"_\" + str(masks[j]) + \".\" + str(aaf_bins[k]) + '.regenie.gz' for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))] \n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '12h', mem = '15G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "python: container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/cache/{_input:bn}.stderr', stdout = f'{cwd}/cache/{_input:bn}.stdout', volumes = [f\"{cwd:a}:{cwd:a}\"]\n", " import pandas as pd \n", "\n", @@ -1228,7 +1230,7 @@ "parameter: invNormalize = 'FALSE'\n", "input: bfile, phenoFile\n", "output: f'{cwd}/{bfile:bn}.{phenoFile:bn}.SAIGE.rda', f'{cwd}/{bfile:bn}.{phenoFile:bn}.SAIGE.varianceRatio.txt'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: expand = \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', 
template_name='conda', env_name='RSAIGE'\n", " Rscript ${script_path} \\\n", " --plinkFile=${_input[0]:n} \\\n", @@ -1278,7 +1280,7 @@ "input_options = f\"--bgenFile={_genoFile} --bgenFileIndex=${_genoFile}.bgi --sampleFile=${cwd}/{sampleFile:bn}.SAIGE_sample --minInfo=${bgenMinINFO}\" if _input.suffix == \".bgen\" else f\"--plinkFile={_input:n}\"\n", "output: f'{cwd}/cache/{_genoFile:bn}.{phenoFile:bn}.SAIGE.gz'\n", "fail_if(not path(f'{_genoFile}.bgi').is_file() and _input.suffix == '.bgen', msg = f'Cannot find file ``{_genoFile}.bgi``. Please generate it using command ``bgenix -g {_genoFile} -index``.')\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '48h', mem = '60G', tags = f'{step_name}_{_output:bn}'\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output:bn}'\n", "bash: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', template_name='conda', env_name='RSAIGE'\n", " Rscript ${script_path} \\\n", " ${input_options}\\\n", @@ -1589,7 +1591,7 @@ "input: group_by = lambda x: [x[i::len(phenoCol)] for i in range(len(phenoCol))], group_with='phenoCol'\n", "output: f'{cwd}/{phenoFile:bn}_{_phenoCol}.{step_name.rsplit(\"_\",1)[0]}.snp_stats.gz',\n", " f'{cwd}/{phenoFile:bn}_{_phenoCol}.{step_name.rsplit(\"_\",1)[0]}.snp_counts.txt'\n", - "task: trunk_workers = 1, trunk_size = 1, walltime = '1h', mem = '36G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = 1, walltime = walltime, mem = mem, cores = 1, tags = f'{step_name}_{_output[0]:bn}'\n", "python: container=container_lmm, expand ='${ }'\n", " import gzip\n", " import pandas as pd\n", @@ -1647,7 +1649,7 @@ " f'{cwd}/cache/nonsin.genelist',\n", " f'{cwd}/cache/nondup.snplist',\n", " f'{cwd}/cache/someannoslim.csv'\n", - "task: trunk_workers = 1, walltime = '10h', mem = '60G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 
1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "\n", "# Extract target fields (CADD and GWAS catelog) from snpannofile to a smaller file someanno.txt\n", "bash:container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'\n", @@ -1731,7 +1733,7 @@ "bins=[str(phenoCol[i]) + f'.{step_name.rsplit(\"_\",1)[0]}.' + str(masks[j]) + \".\" + str(aaf_bins[k]) for i in range(len(phenoCol)) for j in range(len(masks)) for k in range(len(aaf_bins))]\n", "input: [f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) +f'.{step_name.rsplit(\"_\",1)[0]}.snp_stats.gz' for i in range(len(phenoCol))]+[f'{cwd}/{phenoFile:bn}_' + str(phenoCol[i]) +f'.{step_name.rsplit(\"_\",1)[0]}.snp_counts.txt' for i in range(len(phenoCol))],group_by = lambda x: [x[i::len(phenoCol)] for i in range(len(phenoCol))], group_with='phenoCol'\n", "output: [f'{cwd}/{phenoFile:bn}_' + bins[n] + '.snp_stats.gz' for n in range(len(bins))]+[f'{cwd}/{phenoFile:bn}_' + bins[n] + '.snp_counts.txt' for n in range(len(bins))]+[f'{cwd}/{phenoFile:bn}_' + bins[n] + '.remove_sin.snp_stats.gz' for n in range(len(bins))]\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_input[0]:bn}' \n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_input[0]:bn}' \n", "python: container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'\n", " import gzip\n", " import pandas as pd\n", @@ -1811,7 +1813,7 @@ " annotated_manhattan = f'{_input[0]:nn}.manhattan_annotated.png',\n", " analysis_summary = f'{_input[0]:nn}.analysis_summary.md',\n", " plot_data = f'{_input[0]:nn}.plot_data.rds'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}' \n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = 
walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n", "bash: container=container_lmm, expand = \"${ }\"\n", " echo '''---\n", " theme: base-theme\n", @@ -2055,7 +2057,7 @@ " annotated_manhattan = f'{_input[0]:nnn}.manhattan_annotated.png',\n", " analysis_summary = f'{_input[0]:nnn}.analysis_summary.md',\n", " plot_data = f'{_input[0]:nnn}.plot_data.rds'\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '3h', mem = '64G', tags = f'{step_name}_{_output[0]:bn}' \n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, tags = f'{step_name}_{_output[0]:bn}' \n", "bash: container=container_lmm, expand = \"${ }\"\n", " echo '''---\n", " theme: base-theme\n", From 54a2d42e0baeb7474ed286db15c7bfe64a634bc5 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Wed, 2 Mar 2022 17:08:48 -0500 Subject: [PATCH 16/63] changes to parameters for perfect_genecov --- GWAS/MTAG.ipynb | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/GWAS/MTAG.ipynb b/GWAS/MTAG.ipynb index 52e747c..5a148c0 100644 --- a/GWAS/MTAG.ipynb +++ b/GWAS/MTAG.ipynb @@ -159,7 +159,7 @@ "## MWE\n", "```\n", "sumstatsFiles=`echo ~/output/*hg19.snp_stats_original_columns.gz`\n", - "sos dryrun ~/project/UKBB_GWAS_dev/workflow/MTAG.ipynb mtag \\\n", + "sos run ~/project/UKBB_GWAS_dev/workflow/MTAG.ipynb mtag \\\n", "--cwd ~/output \\\n", "--sumstatsFiles $sumstatsFiles \\\n", "--formatFile ~/project/bioworkflows/GWAS/data/mtag_template.yml \\\n", @@ -213,7 +213,7 @@ "# If there's no overlap between samples\n", "parameter: no_overlap = False\n", "# If the traits are perfectly correlated\n", - "parameter: perfect_gencov = False\n", + "parameter: perfect_gencov = True\n", "# Assume equal heritability of traits\n", "parameter: h2_equal = False\n", "# Reference Ld used by ldsc.py needs to be splitted by chromosome\n", @@ -302,9 +302,9 @@ "[mtag_2]\n", "parameter: job_name=''\n", "input: group_by='all'\n", - "output: 
f'{cwd}/{job_name}_sigma_hat.mtag.txt',\n", - " f'{cwd}/{job_name}_omega_hat.mtag.txt',\n", - " [f'{cwd}/{job_name}.trait_{x}.mtag.txt' for x in range(len(sumstatsFiles))]\n", + "output: f'{cwd}/{job_name}_sigma_hat.txt',\n", + " f'{cwd}/{job_name}_omega_hat.txt',\n", + " [f'{cwd}/{job_name}.trait_{x}.txt' for x in range(len(sumstatsFiles))]\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = '10h', mem = '30G', cores = numThreads, tags = f'{step_name}_{_output[1]:bn}'\n", "bash: expand = \"${ }\", stderr = f'{_output[1]:n}.stderr', stdout = f'{_output[1]:n}.log'\n", "\n", @@ -321,9 +321,9 @@ " --out ${cwd}/${job_name} \\\n", " --n_min 0.0 \\\n", " ${('--ld_ref_panel ' + ld_ref_panel + '/') } \\\n", - " ${('--' + no_overlap ) if no_overlap is True else ''} \\\n", - " ${('--' + perfect_gencov) if perfect_gencov is True else ''} \\\n", - " ${('--' + h2_equal) if h2_equal is True else ''} \\\n", + " ${('--no_overlap') if no_overlap else ''} \\\n", + " ${('--perfect_gencov') if perfect_gencov else ''} \\\n", + " ${('--h2_equal') if h2_equal else ''} \\\n", " --force" ] }, From 6914175b5cce7db7a44cc17a276f2eaf15124357 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Wed, 2 Mar 2022 21:31:26 -0500 Subject: [PATCH 17/63] Add files via upload --- .../LDSC_DeepSea_Minimal_Example.ipynb | 508 +++++++----------- .../LDSC_DeepSea_Minimal_Example.sos | 133 +---- 2 files changed, 196 insertions(+), 445 deletions(-) diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb index e1f2147..1e39798 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb @@ -6,25 +6,20 @@ "kernel": "SoS" }, "source": [ - "## Tutorial Workflow for LDSC with DeepSea Integration:\n", + "## DeepSea Pipeline:\n", "\n", "This is the code to run the minimal working example for LDSC with DeepSea Integration. 
The code will train the deepsea model on the set of features provided using the .yml file on the google drive folder, get predictions on the reference genome from the trained model, and run LDSC on the resulting predictions to output enrichments. \n", "\n", "If you would like to use a different set of features or change training parameters, please edit the .yml file provided and everything else will still work.\n", "\n", - "This is the command to run the Minimal Working Example:\n", - "\n", - "\n", - "sos run LDSC_DeepSea_Code.ipynb --model /mnt/mfs/statgen/Anmol/training_files/tutorial/training_outputs/model --feature_list /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt --output_tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --annot_files /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files --sumst /mnt/mfs/statgen/Anmol/polyfun/Dey/PGCALZ2sumstatsExcluding23andMe.txt --output_sumst /mnt/mfs/statgen/Anmol/polyfun/Dey/2021.Updated.sumstats.gz --signed True --bim /mnt/mfs/statgen/Anmol/training_files/tutorial/plink_files --num_features 7 --ld_scores /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files --ctrl_sumstats /mnt/mfs/statgen/Anmol/polyfun/Dey/AMD.sumstats.gz --AD_sumstats /mnt/mfs/statgen/Anmol/polyfun/Dey/2021.Updated.sumstats.gz --w_ld_ctrl /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/tutorial_data/weights_hm3_no_hla/weights. --frq_file_ctrl /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/frq/1000G.EUR.QC. --w_ld_AD /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/tutorial_data/weights_hm3_no_hla/weights.2021. --frq_file_AD /mnt/mfs/statgen/Anmol/training_files/testing/ldsc/AD_Variants/frq/1000G.2021.EUR.QC. 
--ref_ld annot_files --pheno AMD\n", - "\n", "## Command Interface:\n", "\n", - "This is the list of commands and workflows with explanations for each one" + "This is the list of commands and workflows with explanations for each one, detailed information for each step will be presented below." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "metadata": { "kernel": "SoS" }, @@ -33,104 +28,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "usage: sos run LDSC_DeepSea_Minimal_Example.ipynb\n", - " [workflow_name | -t targets] [options] [workflow_options]\n", - " workflow_name: Single or combined workflows defined in this script\n", - " targets: One or more targets to generate\n", - " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", - " workflow_options: Double-hyphen workflow-specific parameters\n", - "\n", - "Workflows:\n", - " train_model\n", - " make_annot\n", - " format_annot\n", - " munge_sumstats_no_sign\n", - " munge_sumstats_sign\n", - " calc_ld_score\n", - " convert_ld_snps\n", - " calc_enrichment\n", - "\n", - "Sections\n", - " train_model:\n", - " make_annot:\n", - " Workflow Options:\n", - " --feature-list VAL (as str, required)\n", - " path to feature list file\n", - " --model VAL (as str, required)\n", - " path to trained model location\n", - " --output-tsv VAL (as str, required)\n", - " path to output directory\n", - " format_annot:\n", - " Workflow Options:\n", - " --tsv . (as path)\n", - " path to tsv files directory\n", - " --annot-files . 
(as path)\n", - " path to output file directory\n", - " munge_sumstats_no_sign:\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the munge_sumstats program\n", - " --output-sumst VAL (as str, required)\n", - " path to output file\n", - " --[no-]signed (default to False)\n", - " does summary statistic contain Z or Beta\n", - " munge_sumstats_sign: This option is for when the summary statistic file does\n", - " contain a signed summary statistic (Z or Beta)\n", - " Workflow Options:\n", - " --sumst VAL (as str, required)\n", - " path to summary statistic file\n", - " --alleles 'w_hm3.snplist'\n", - " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", - " and A2) for the munge_sumstats program\n", - " --output-sumst-2 VAL (as str, required)\n", - " path to output file\n", - " --[no-]signed (default to False)\n", - " does summary statistic contain Z or Beta\n", - " calc_ld_score:\n", - " Workflow Options:\n", - " --bim . (as path)\n", - " Path to directory with bim files\n", - " --annot-files . (as path)\n", - " Path to directory with annotation files, output will\n", - " appear here too. 
Make sure to remove the SNP, CHR, and\n", - " BP columns from the annotation files if present before\n", - " running.\n", - " --num-features VAL (as int, required)\n", - " number of features\n", - " convert_ld_snps:\n", - " Workflow Options:\n", - " --ld-scores VAL (as str, required)\n", - " Path to directory with ld score files AND annotation\n", - " files\n", - " --num-features VAL (as int, required)\n", - " calc_enrichment:\n", - " Workflow Options:\n", - " --ctrl-sumstats VAL (as str, required)\n", - " Path to Control Summary statistics File\n", - " --AD-sumstats VAL (as str, required)\n", - " Path to AD Summary statistics File\n", - " --ref-ld VAL (as str, required)\n", - " Path to Reference LD Scores File Directory\n", - " --w-ld-ctrl VAL (as str, required)\n", - " Path to LD Weight Files for Control Sumstats (Format\n", - " like minimal working example)\n", - " --frq-file-ctrl VAL (as str, required)\n", - " path to frequency files for Control Sumstats (Format\n", - " like minimal working example)\n", - " --w-ld-AD VAL (as str, required)\n", - " Path to LD Weight Files for AD Sumstats (Format like\n", - " minimal working example)\n", - " --frq-file-AD VAL (as str, required)\n", - " path to frequency files for AD Sumstats (Format like\n", - " minimal working example)\n", - " --num-features VAL (as int, required)\n", - " Number of Features\n", - " --pheno VAL (as str, required)\n", - " Control Phenotype, For Output\n", - "\n" + "No help information is available for script run: Failed to locate LDSC_DeepSea_Code.ipynb.sos\r\n" ] } ], @@ -143,7 +41,172 @@ "metadata": { "kernel": "Python 3 (ipykernel)" }, - "source": [] + "source": [ + "## Background:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Training Model: \n", + "\n", + "**Workflow Command to train model for Minimal Example:** `sos run LDSC_DeepSea_Code.ipynb train_model`\n", + "\n", + "To train the model using different data you can use the .yml file 
provided to change the training parameters and files. An example of how this file looks for the minimal working example is shown below." + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "kernel": "SoS" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---\n", + "ops: [train, evaluate]\n", + "model: {\n", + " path: /mnt/mfs/statgen/Anmol/training_files/deeperdeepsea.py,#UPDATE\n", + " class: DeeperDeepSEA,\n", + " class_args: {\n", + " sequence_length: 1000,\n", + " n_targets: 7,\n", + " },\n", + " non_strand_specific: mean\n", + "}\n", + "sampler: !obj:selene_sdk.samplers.IntervalsSampler {\n", + " reference_sequence: !obj:selene_sdk.sequences.Genome {\n", + " input_path: /mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta,#UPDATE\n", + " blacklist_regions: hg19\n", + " },\n", + " features: !obj:selene_sdk.utils.load_features_list {\n", + " input_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt #UPDATE\n", + " },\n", + " target_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial.bed.gz, #UPDATE\n", + " intervals_path: /mnt/mfs/statgen/Anmol/training_files/DNase_Intervals_FULL.txt, #UPDATE\n", + " seed: 127,\n", + " # A positive example is an 1000bp sequence with at least 1 class/feature annotated to it.\n", + " # A negative sample has no classes/features annotated to the sequence.\n", + " sample_negative: True,\n", + " sequence_length: 1000,\n", + " center_bin_to_predict: 200,\n", + " test_holdout: 0.2,\n", + " validation_holdout: 0.3,\n", + " # The feature must take up 50% of the bin (200bp) for it to be considered\n", + " # a feature annotated to that sequence.\n", + " feature_thresholds: 0.25,\n", + " mode: train,\n", + " save_datasets: [validate, test]\n", + "}\n", + "train_model: !obj:selene_sdk.TrainModel {\n", + " batch_size: 64,\n", + " max_steps: 501, # update this value for longer training\n", + " report_stats_every_n_steps: 250,\n", + " 
n_validation_samples: 6000,\n", + " n_test_samples: 22000,\n", + " cpu_n_threads: 32,\n", + " use_cuda: False, # TODO: update this if CUDA is not on your machine\n", + " data_parallel: False\n", + "}\n", + "random_seed: 1447\n", + "output_dir: ./tutorial/training_outputs/model #UPDATE\n", + "create_subdirectory: False\n", + "load_test_set: False\n", + "...\n" + ] + } + ], + "source": [ + "with open('all_neuron_tutorial.yml') as f:\n", + " contents = f.read()\n", + " print(contents)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "Above shows the .yml file used to train the minimal working example model in this workflow. \n", + "\n", + "To use your own data you must update the:\n", + "\n", + "1. n_targets (Number of Features you are training on):\n", + "\n", + "`n_targets: 7`\n", + "\n", + "2. Feature list file (a list of all the distinct features you are training on):\n", + "\n", + "`features: !obj:selene_sdk.utils.load_features_list {\n", + " input_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt #UPDATE\n", + " }`\n", + " \n", + "3. Target Path File (A combined bed file for all of your features):\n", + "\n", + "`target_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial.bed.gz, #UPDATE`\n", + "\n", + "4. Max_Steps (Maximum number of Training Steps), n_validation_samples (Number of Validation Samples), n_test_samples (Number of Testing Samples):\n", + "\n", + "`train_model: !obj:selene_sdk.TrainModel {\n", + " batch_size: 64,\n", + " max_steps: 501, # update this value for longer training\n", + " report_stats_every_n_steps: 250,\n", + " n_validation_samples: 6000,\n", + " n_test_samples: 22000,\n", + " cpu_n_threads: 32,\n", + " use_cuda: False, # TODO: update this if CUDA is not on your machine\n", + " data_parallel: False\n", + "}`\n", + "\n", + "Generally you want to train until the validation and training loss are not decreasing anymore. 
For the full 2032-feature example, I have found that this occurs at around 250,000 training steps, which is what I set the `max_steps` parameter to for that case. Depending on how many features you are using, this could be more or less. You will also want to change how frequently the model statistics (ROC, AUC, etc.) are reported, especially if you are training for a large number of steps. For 250,000 steps, I would recommend setting the `report_stats_every_n_steps` parameter to 10,000, so that you can assess how training is progressing often enough without wasting too much time recalculating the ROC and AUC. For validation and testing samples, I used 40,000 and 500,000 respectively for 2032 features. Again, this depends on the number of features you have, so adjust these numbers up or down according to the number of features you are training the model on. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Getting Predictions from Trained Model:\n", + "\n", + "**Workflow Command to get predictions from trained model for Minimal Example:** `sos run LDSC_DeepSea_Minimal_Example.ipynb make_annot --feature_list /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt --model /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial/training_outputs/model --output_tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --num_features 7 --vcf /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_1000G_chr_`\n", + "\n", + "**Explanations of Parameters so that you can change to run with your own data:**\n", + "\n", + "1. feature_list: Path to the list of distinct features, the same one used in the .yml file\n", + "\n", + "2. model: Path to the trained model folder; this is the `output_dir` parameter in the .yml training file\n", + "\n", + "3. output_tsv: Path to the directory where you want the predictions to be written\n", + "\n", + "4. 
num_features: Number of Features you trained the model on\n", + "\n", + "5. vcf: Path to location of reference genome vcf files you want to use for predictions, program will append the chromosome number and .vcf to the end of file name automatically to loop over all the chromosomes so format file name in command as 1000G_chr_ and leave the chr numbers and .vcf out. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Formatting Resulting Predictions to Annotation File for LDSC: \n", + "\n", + "**Workflow Command to format Prediction files to LDSC Annotation Files:** `sos run LDSC_DeepSea_Minimal_Example.ipynb format_annot --tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --annot_files /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files\n", + "\n", + "**Explanations of Parameters so that you can change to run with your own data:**\n", + "\n", + "1. tsv: Path to where prediction files (.tsv files) are located\n", + "\n", + "2. 
annot_files: Path to location where you want the annotation files to be outputted" + ] }, { "cell_type": "markdown", @@ -198,9 +261,13 @@ "parameter: model = str\n", "#path to output directory\n", "parameter: output_tsv = str\n", + "#number of features\n", + "parameter: num_features = int\n", + "#VCF files for Reference Genome [Give in this format: tutorial_1000G_chr_, as program will loop over chromsomes and add vcf extension automatically]\n", + "parameter: vcf = str()\n", "\n", "\n", - "python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif'\n", + "python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif',expand = \"${ }\"\n", "\n", " from selene_sdk.utils import load_path\n", " from selene_sdk.utils import parse_configs_and_run\n", @@ -211,11 +278,11 @@ " from selene_sdk.utils import DeeperDeepSEA\n", " import glob\n", " import os\n", - " distinct_features = load_features_list({feature_list})\n", + " distinct_features = load_features_list('${feature_list}')\n", "\n", " model_predict = AnalyzeSequences(\n", - " NonStrandSpecific(DeeperDeepSEA(1000,{num_features})),\n", - " {model}+\"/best_model.pth.tar\",\n", + " NonStrandSpecific(DeeperDeepSEA(1000,${num_features})),\n", + " '${model}'+\"/best_model.pth.tar\",\n", " sequence_length=1000,\n", " features=distinct_features,\n", " reference_sequence=Genome(\"/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta\"),\n", @@ -224,9 +291,9 @@ "\n", " for i in range(1,23):\n", " model_predict.variant_effect_prediction(\n", - " \"/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_\"+str(i)+\".vcf\",\n", + " ${vcf}+str(i)+\".vcf\",\n", " save_data=[\"abs_diffs\"], # only want to save the absolute diff score data\n", - " output_dir={output})" + " output_dir='${output_tsv}')" ] }, { @@ -251,7 +318,7 @@ "\n", "[format_annot]\n", "\n", - "#path to tsv files directory\n", + "#path to tsv files [Give in this format: tutorial_1000G_chr_, as program will loop over chromsomes and add tsv extension 
automatically]\n", "parameter: tsv = path()\n", "#path to output file directory\n", "parameter: annot_files = path()\n", @@ -259,13 +326,13 @@ "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", " library(data.table)\n", " library(tidyverse)\n", - " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",22,\"_abs_diffs.tsv\"))\n", + " data = fread(paste0(\"${tsv}\",22,\"_abs_diffs.tsv\"))\n", " features = colnames(data)[9:ncol(data)]\n", " features = data.frame(features)\n", " features$encoding = paste0(\"feat_\",seq(1,nrow(features)))\n", " fwrite(features,paste0(\"${annot_files}\",\"/feature_encoding.txt\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", " for (i in seq(1,22)){\n", - " data = fread(paste0(\"${tsv}\",\"/tutorial_1000G_chr_\",i,\"_abs_diffs.tsv\"))\n", + " data = fread(paste0(\"${tsv}\",i,\"_abs_diffs.tsv\"))\n", " data_2 = select(data,-seq(4,8))\n", " base = data.frame(base=rep(1,nrow(data_2)))\n", " fwrite(base,paste0(\"${annot_files}\",\"/base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", @@ -278,215 +345,6 @@ " }\n", " }" ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Munge Summary Statistics (Option 1: No Signed Summary Statistic):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# Option when Summary Statistic File does not contain a Z or Beta Column (Signed Summary Statistic)\n", - "\n", - "[munge_sumstats_no_sign]\n", - "\n", - "\n", - "\n", - "#path to summary statistic file\n", - "parameter: sumst = str\n", - "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", - "parameter: alleles = \"w_hm3.snplist\"\n", - "#path to output file\n", - "parameter: output_sumst = str\n", - "#does summary statistic contain Z or Beta\n", - "parameter: signed = False\n", - "\n", - "bash: expand = '${ }'\n", 
- " if [${signed}==True]\n", - " then\n", - " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc\n", - " fi" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Munge Summary Statistics (Option 2: Contains Signed Summary Statistic):" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)\n", - "[munge_sumstats_sign]\n", - "\n", - "\n", - "\n", - "#path to summary statistic file\n", - "parameter: sumst = str\n", - "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", - "parameter: alleles = \"w_hm3.snplist\"\n", - "#path to output file\n", - "parameter: output_sumst_2 = str\n", - "#does summary statistic contain Z or Beta\n", - "parameter: signed = False\n", - "\n", - "bash: expand = '${ }'\n", - " if [${signed}==False]\n", - " then\n", - " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2}\n", - " fi" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Calculate LD Scores:\n", - "\n", - "**Make sure to delete SNP,CHR, and BP columns from annotation files if they are present otherwise this code will not work. Before deleting, if these columns are present, make sure that the annotation file is sorted.**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "\n", - "[calc_ld_score]\n", - "\n", - "#Path to directory with bim files\n", - "parameter: bim = path()\n", - "#Path to directory with annotation files, output will appear here too. 
Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running.\n", - "parameter: annot_files = path()\n", - "#number of features\n", - "parameter: num_features = int\n", - "\n", - "bash: expand = '${ }'\n", - " #echo {annot_files} > out.txt\n", - " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", - " seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Convert LD Score SNPs to AD Summary Statistic Format:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# Convert SNP format in LD Score Files to CHR:BP to match with AD Summary Statistic Format\n", - "\n", - "\n", - "[convert_ld_snps]\n", - "\n", - "#Path to directory with ld score files AND annotation files\n", - "parameter: ld_scores = str\n", - "\n", - "parameter: num_features = int\n", - "\n", - "\n", - "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", - " library(tidyverse)\n", - " #library(R.utils)\n", - " library(data.table)\n", - " for (i in seq(1,22)){\n", - " data = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", - " data_2 = fread(paste0(\"${ld_scores}/base_chr_\",i,\".l2.M_5_50\"))\n", - " data_3 = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".annot.gz\")),header=T)\n", - " data$SNP = 
paste0(data$CHR,\":\",data$BP)\n", - " fwrite(data,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " fwrite(data_2,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", - " fwrite(data_3,paste0(\"${ld_scores}/AD_base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " for (j in seq(1,${num_features})){\n", - " data = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", - " data_2 = fread(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.M_5_50\"))\n", - " data_3 = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".annot.gz\")),header=T)\n", - " data$SNP = paste0(data$CHR,\":\",data$BP)\n", - " fwrite(data,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " fwrite(data_2,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", - " fwrite(data_3,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", - " }\n", - " }\n", - " \n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "Python 3 (ipykernel)" - }, - "source": [ - "## Calculate Functional Enrichment using Annotations:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "\n", - "[calc_enrichment]\n", - "\n", - "#Path to Control Summary statistics File\n", - "parameter: ctrl_sumstats = str\n", - "#Path to AD Summary statistics File\n", - "parameter: AD_sumstats = str\n", - "#Path to Reference LD Scores File Directory \n", - "parameter: ref_ld = str\n", - "#Path to LD Weight Files for Control Sumstats (Format like minimal working example)\n", - "parameter: w_ld_ctrl = str\n", - "#path to frequency files for Control Sumstats (Format like minimal 
working example)\n", - "parameter: frq_file_ctrl = str\n", - "#Path to LD Weight Files for AD Sumstats (Format like minimal working example)\n", - "parameter: w_ld_AD = str\n", - "#path to frequency files for AD Sumstats (Format like minimal working example)\n", - "parameter: frq_file_AD = str\n", - "#Number of Features\n", - "parameter: num_features = int \n", - "#Control Phenotype, For Output\n", - "parameter: pheno = str\n", - "\n", - "bash: expand = '${ }'\n", - " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j\n", - " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j" - ] } ], "metadata": { diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos index ee3dd6f..31b9dd3 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.sos @@ -15,9 +15,13 @@ parameter: feature_list = str parameter: model = str #path to output directory parameter: output_tsv = str +#number of features +parameter: num_features = int +#VCF files for Reference Genome [Give in this format: tutorial_1000G_chr_, as program will loop over chromsomes and add vcf extension automatically] +parameter: vcf = str() -python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' +python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif',expand = "${ }" from selene_sdk.utils import load_path from selene_sdk.utils import parse_configs_and_run @@ -28,11 +32,11 @@ python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' from selene_sdk.utils import 
DeeperDeepSEA import glob import os - distinct_features = load_features_list({feature_list}) + distinct_features = load_features_list('${feature_list}') model_predict = AnalyzeSequences( - NonStrandSpecific(DeeperDeepSEA(1000,{num_features})), - {model}+"/best_model.pth.tar", + NonStrandSpecific(DeeperDeepSEA(1000,${num_features})), + '${model}'+"/best_model.pth.tar", sequence_length=1000, features=distinct_features, reference_sequence=Genome("/mnt/mfs/statgen/Anmol/training_files/male.hg19.fasta"), @@ -41,13 +45,13 @@ python3: container='/mnt/mfs/statgen/Anmol/deepsea_latest.sif' for i in range(1,23): model_predict.variant_effect_prediction( - "/mnt/mfs/statgen/Anmol/training_files/testing/1000G_chr_"+str(i)+".vcf", + ${vcf}+str(i)+".vcf", save_data=["abs_diffs"], # only want to save the absolute diff score data - output_dir={output}) + output_dir='${output_tsv}') [format_annot] -#path to tsv files directory +#path to tsv files [Give in this format: tutorial_1000G_chr_, as program will loop over chromsomes and add tsv extension automatically] parameter: tsv = path() #path to output file directory parameter: annot_files = path() @@ -55,13 +59,13 @@ parameter: annot_files = path() R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" library(data.table) library(tidyverse) - data = fread(paste0("${tsv}","/tutorial_1000G_chr_",22,"_abs_diffs.tsv")) + data = fread(paste0("${tsv}",22,"_abs_diffs.tsv")) features = colnames(data)[9:ncol(data)] features = data.frame(features) features$encoding = paste0("feat_",seq(1,nrow(features))) fwrite(features,paste0("${annot_files}","/feature_encoding.txt"),quote=F,sep="\t",row.names=F,col.names=T) for (i in seq(1,22)){ - data = fread(paste0("${tsv}","/tutorial_1000G_chr_",i,"_abs_diffs.tsv")) + data = fread(paste0("${tsv}",i,"_abs_diffs.tsv")) data_2 = select(data,-seq(4,8)) base = data.frame(base=rep(1,nrow(data_2))) 
fwrite(base,paste0("${annot_files}","/base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) @@ -74,114 +78,3 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" } } -[munge_sumstats_no_sign] - - - -#path to summary statistic file -parameter: sumst = str -#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program -parameter: alleles = "w_hm3.snplist" -#path to output file -parameter: output_sumst = str -#does summary statistic contain Z or Beta -parameter: signed = False - -bash: expand = '${ }' - if [${signed}==True] - then - python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc - fi - -# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta) -[munge_sumstats_sign] - - - -#path to summary statistic file -parameter: sumst = str -#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program -parameter: alleles = "w_hm3.snplist" -#path to output file -parameter: output_sumst_2 = str -#does summary statistic contain Z or Beta -parameter: signed = False - -bash: expand = '${ }' - if [${signed}==False] - then - python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2} - fi - -[calc_ld_score] - -#Path to directory with bim files -parameter: bim = path() -#Path to directory with annotation files, output will appear here too. Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running. 
-parameter: annot_files = path() -#number of features -parameter: num_features = int - -bash: expand = '${ }' - #echo {annot_files} > out.txt - seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt - seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt - -[convert_ld_snps] - -#Path to directory with ld score files AND annotation files -parameter: ld_scores = str - -parameter: num_features = int - - -R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" - library(tidyverse) - #library(R.utils) - library(data.table) - for (i in seq(1,22)){ - data = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".l2.ldscore.gz")),header=T) - data_2 = fread(paste0("${ld_scores}/base_chr_",i,".l2.M_5_50")) - data_3 = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".annot.gz")),header=T) - data$SNP = paste0(data$CHR,":",data$BP) - fwrite(data,paste0("${ld_scores}/AD_base_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) - fwrite(data_2,paste0("${ld_scores}/AD_base_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) - fwrite(data_3,paste0("${ld_scores}/AD_base_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) - for (j in seq(1,${num_features})){ - data = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.ldscore.gz")),header=T) - data_2 = fread(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.M_5_50")) - data_3 = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".annot.gz")),header=T) - data$SNP = 
paste0(data$CHR,":",data$BP) - fwrite(data,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.ldscore.gz"),quote=F,sep="\t",row.names=F,col.names=T) - fwrite(data_2,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.M_5_50"),quote=F,sep="\t",row.names=F,col.names=F) - fwrite(data_3,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".annot.gz"),quote=F,sep="\t",row.names=F,col.names=T) - } - } - - - -[calc_enrichment] - -#Path to Control Summary statistics File -parameter: ctrl_sumstats = str -#Path to AD Summary statistics File -parameter: AD_sumstats = str -#Path to Reference LD Scores File Directory -parameter: ref_ld = str -#Path to LD Weight Files for Control Sumstats (Format like minimal working example) -parameter: w_ld_ctrl = str -#path to frequency files for Control Sumstats (Format like minimal working example) -parameter: frq_file_ctrl = str -#Path to LD Weight Files for AD Sumstats (Format like minimal working example) -parameter: w_ld_AD = str -#path to frequency files for AD Sumstats (Format like minimal working example) -parameter: frq_file_AD = str -#Number of Features -parameter: num_features = int -#Control Phenotype, For Output -parameter: pheno = str - -bash: expand = '${ }' - seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j - seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j - From 62812e9db6cd3228e4ee2f82fc269a1887f572a1 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Thu, 3 Mar 2022 13:22:50 -0500 Subject: [PATCH 18/63] Add files via upload --- .../LDSC_DeepSea_Minimal_Example.ipynb | 49 
+++++++++++++------ 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb index 1e39798..4ca5ada 100644 --- a/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb +++ b/LDSC/Deep_Learning/LDSC_DeepSea_Minimal_Example.ipynb @@ -37,12 +37,29 @@ ] }, { + "attachments": { + "image.png": { + "image/png": "" + } + }, "cell_type": "markdown", "metadata": { "kernel": "Python 3 (ipykernel)" }, "source": [ - "## Background:" + "## Background:\n", + "\n", + "DeepSea is a deep learning model developed by researchers at Princeton University to integrate epigenomic feature data and generate functional predictions for these features on input variants given by the user. DeepSEA can accurately predict the epigenetic state of a sequence, including transcription factors binding, DNase I sensitivities and histone marks in multiple cell types, and further utilize this capability to predict the chromatin effects of sequence variants and prioritize regulatory variants. This model allows for the user to predict effects based on an increased context such as a 1 kB window around the variant which allows for better accuracy and predictions from the model. \n", + "\n", + "![image.png](attachment:image.png)\n", + "\n", + "This diagram shows the DeepSea pipeline. First the model takes an input of 1 kB sequence from the reference genome and trains the model on epigenomic features such as DNase Sites and TFs. Then, using the trained model you can generate functional predictions, predictions of the effect the variant will have on the features in the model, on your own set of variants. These predictions can be used to identify and prioritize functionally important variants for certain features in your model.\n", + "\n", + "For more background and information please refer to two papers written by the creators at Princeton University:\n", + "\n", + "1. 
[Predicting effects of noncoding variants with deep learning–based sequence model](https://www.nature.com/articles/nmeth.3547)\n", + "\n", + "2. [Whole-genome deep-learning analysis identifies contribution of noncoding mutations to autism risk](https://www.nature.com/articles/s41588-019-0420-0)" ] }, { @@ -138,21 +155,21 @@ "\n", "To use your own data you must update the:\n", "\n", - "1. n_targets (Number of Features you are training on):\n", + "**1. n_targets (Number of Features you are training on):**\n", "\n", "`n_targets: 7`\n", "\n", - "2. Feature list file (a list of all the distinct features you are training on):\n", + "**2. Feature list file (a list of all the distinct features you are training on):**\n", "\n", "`features: !obj:selene_sdk.utils.load_features_list {\n", " input_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt #UPDATE\n", " }`\n", " \n", - "3. Target Path File (A combined bed file for all of your features):\n", + "**3. Target Path File (A combined bed file for all of your features):**\n", "\n", "`target_path: /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial.bed.gz, #UPDATE`\n", "\n", - "4. Max_Steps (Maximum number of Training Steps), n_validation_samples (Number of Validation Samples), n_test_samples (Number of Testing Samples):\n", + "**4. 
Max_Steps (Maximum number of Training Steps), n_validation_samples (Number of Validation Samples), n_test_samples (Number of Testing Samples):**\n", "\n", "`train_model: !obj:selene_sdk.TrainModel {\n", " batch_size: 64,\n", @@ -176,19 +193,21 @@ "source": [ "## Getting Predictions from Trained Model:\n", "\n", - "**Workflow Command to get predictions from trained model for Minimal Example:** `sos run LDSC_DeepSea_Minimal_Example.ipynb make_annot --feature_list /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt --model /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial/training_outputs/model --output_tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --num_features 7 --vcf /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_1000G_chr_`\n", + "**Workflow Command to get predictions from trained model for Minimal Example:** \n", + "\n", + "`sos run LDSC_DeepSea_Minimal_Example.ipynb make_annot --feature_list /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_features.txt --model /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial/training_outputs/model --output_tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --num_features 7 --vcf /mnt/mfs/statgen/Anmol/training_files/tutorial/tutorial_1000G_chr_`\n", "\n", "**Explanations of Parameters so that you can change to run with your own data:**\n", "\n", - "1. feature_list: Path to list of distinct features, same as one used in .yml file\n", + "**1. feature_list:** Path to list of distinct features, same as one used in .yml file\n", "\n", - "2. model: Path to location of trained model folder, is the output_dir parameter in the .yml training file\n", + "**2. model:** Path to location of trained model folder, is the output_dir parameter in the .yml training file\n", "\n", - "3. output_tsv: Path to directory where you want to output the predictions to \n", + "**3. output_tsv:** Path to directory where you want to output the predictions to \n", "\n", - "4. 
num_features: Number of Features you trained the model on\n", + "**4. num_features:** Number of Features you trained the model on\n", "\n", - "5. vcf: Path to location of reference genome vcf files you want to use for predictions, program will append the chromosome number and .vcf to the end of file name automatically to loop over all the chromosomes so format file name in command as 1000G_chr_ and leave the chr numbers and .vcf out. " + "**5. vcf:** Path to location of reference genome vcf files you want to use for predictions, program will append the chromosome number and .vcf to the end of file name automatically to loop over all the chromosomes so format file name in command as `1000G_chr_` and leave the chr numbers and .vcf out. " ] }, { @@ -199,13 +218,15 @@ "source": [ "## Formatting Resulting Predictions to Annotation File for LDSC: \n", "\n", - "**Workflow Command to format Prediction files to LDSC Annotation Files:** `sos run LDSC_DeepSea_Minimal_Example.ipynb format_annot --tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --annot_files /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files\n", + "**Workflow Command to format Prediction files to LDSC Annotation Files:** \n", + "\n", + "`sos run LDSC_DeepSea_Minimal_Example.ipynb format_annot --tsv /mnt/mfs/statgen/Anmol/training_files/tutorial/testing --annot_files /mnt/mfs/statgen/Anmol/training_files/tutorial/annot_files`\n", "\n", "**Explanations of Parameters so that you can change to run with your own data:**\n", "\n", - "1. tsv: Path to where prediction files (.tsv files) are located\n", + "**1. tsv:** Path to where prediction files (.tsv files) are located\n", "\n", - "2. annot_files: Path to location where you want the annotation files to be outputted" + "**2. 
annot_files:** Path to location where you want the annotation files to be outputted" ] }, { From 49c376529c8b7db6481d7329ccaec3ae78f6ca05 Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Tue, 8 Mar 2022 12:10:08 -0500 Subject: [PATCH 19/63] Updated LDSC Code Workflow with new sections and improvements --- LDSC/LDSC_Code.ipynb | 402 +++++++++++++++++++++++++++++++++++++++++++ LDSC/LDSC_Code.sos | 127 ++++++++++++++ 2 files changed, 529 insertions(+) create mode 100644 LDSC/LDSC_Code.ipynb create mode 100644 LDSC/LDSC_Code.sos diff --git a/LDSC/LDSC_Code.ipynb b/LDSC/LDSC_Code.ipynb new file mode 100644 index 0000000..73b5fc1 --- /dev/null +++ b/LDSC/LDSC_Code.ipynb @@ -0,0 +1,402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## SoS Workflow:\n", + "\n", + "These are the options and the SoS code to run the LDSC pipeline using your own data. \n", + "\n", + "## Command Interface:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "kernel": "SoS" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: sos run LDSC_Code.ipynb [workflow_name | -t targets] [options] [workflow_options]\n", + " workflow_name: Single or combined workflows defined in this script\n", + " targets: One or more targets to generate\n", + " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", + " workflow_options: Double-hyphen workflow-specific parameters\n", + "\n", + "Workflows:\n", + " make_annot\n", + " munge_sumstats_no_sign\n", + " munge_sumstats_sign\n", + " calc_ld_score\n", + " convert_ld_snps\n", + " calc_enrichment\n", + "\n", + "Sections\n", + " make_annot:\n", + " Workflow Options:\n", + " --bed VAL (as str, required)\n", + " path to bed file\n", + " --bim VAL (as str, required)\n", + " path to bim file\n", + " --annot VAL (as str, required)\n", + " name of output annotation file\n", + " 
munge_sumstats_no_sign:\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst VAL (as str, required)\n", + " path to output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " munge_sumstats_sign: This option is for when the summary statistic file does\n", + " contain a signed summary statistic (Z or Beta)\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst-2 VAL (as str, required)\n", + " path to output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " calc_ld_score:\n", + " Workflow Options:\n", + " --bim . (as path)\n", + " Path to directory with bim files\n", + " --annot-files . (as path)\n", + " Path to directory with annotation files, output will\n", + " appear here too. 
Make sure to remove the SNP, CHR, and\n", + " BP columns from the annotation files if present before\n", + " running.\n", + " --num-features VAL (as int, required)\n", + " number of features\n", + " convert_ld_snps:\n", + " Workflow Options:\n", + " --ld-scores VAL (as str, required)\n", + " Path to directory with ld score files AND annotation\n", + " files\n", + " --num-features VAL (as int, required)\n", + " calc_enrichment:\n", + " Workflow Options:\n", + " --ctrl-sumstats VAL (as str, required)\n", + " Path to Control Summary statistics File\n", + " --AD-sumstats VAL (as str, required)\n", + " Path to AD Summary statistics File\n", + " --ref-ld VAL (as str, required)\n", + " Path to Reference LD Scores File Directory\n", + " --w-ld-ctrl VAL (as str, required)\n", + " Path to LD Weight Files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --frq-file-ctrl VAL (as str, required)\n", + " path to frequency files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --w-ld-AD VAL (as str, required)\n", + " Path to LD Weight Files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --frq-file-AD VAL (as str, required)\n", + " path to frequency files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --num-features VAL (as int, required)\n", + " Number of Features\n", + " --pheno VAL (as str, required)\n", + " Control Phenotype, For Output\n" + ] + } + ], + "source": [ + "!sos run LDSC_Code.ipynb -h" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Python 3 (ipykernel)" + }, + "source": [ + "## Make Annotation File:" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "\n", + "[make_annot]\n", + "\n", + "# Make Annotated Bed File\n", + "\n", + "# path to bed file\n", + "parameter: bed = str \n", + "#path to bim file\n", + "parameter: bim = str\n", + "#name of output annotation file\n", 
+ "parameter: annot = str\n", + "bash: \n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file {bed} --bimfile {bim} --annot-file {annot}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Munge Summary Statistics (Option 1: No Signed Summary Statistic File)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# Option when Summary Statistic File does not contain a Z or Beta Column (Signed Summary Statistic)\n", + "\n", + "[munge_sumstats_no_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output_sumst = str\n", + "#does summary statistic contain Z or Beta\n", + "parameter: signed = False\n", + "\n", + "bash: expand = '${ }'\n", + " if [${signed}==True]\n", + " then\n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc\n", + " fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Munge Summary Statistics (Option 2: Signed Summary Statistic File)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta)\n", + "[munge_sumstats_sign]\n", + "\n", + "\n", + "\n", + "#path to summary statistic file\n", + "parameter: sumst = str\n", + "#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program\n", + "parameter: alleles = \"w_hm3.snplist\"\n", + "#path to output file\n", + "parameter: output_sumst_2 = str\n", + "#does summary statistic contain 
Z or Beta\n", + "parameter: signed = False\n", + "\n", + "bash: expand = '${ }'\n", + " if [${signed}==False]\n", + " then\n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2}\n", + " fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Calculate LD Scores" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "[calc_ld_score]\n", + "\n", + "#Path to directory with bim files\n", + "parameter: bim = path()\n", + "#Path to directory with annotation files, output will appear here too. Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running.\n", + "parameter: annot_files = path()\n", + "#number of features\n", + "parameter: num_features = int\n", + "\n", + "bash: expand = '${ }'\n", + " #echo {annot_files} > out.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Create Separate LD Score Files for AD Summary Statistic SNP Format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# Convert SNP format in LD Score Files to CHR:BP to match with AD Summary Statistic 
Format\n", + "\n", + "\n", + "[convert_ld_snps]\n", + "\n", + "#Path to directory with ld score files AND annotation files\n", + "parameter: ld_scores = str\n", + "\n", + "parameter: num_features = int\n", + "\n", + "\n", + "R: expand = \"${ }\", container=\"/mnt/mfs/statgen/Anmol/r-packages.sif\"\n", + " library(tidyverse)\n", + " #library(R.utils)\n", + " library(data.table)\n", + " for (i in seq(1,22)){\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", + " data_2 = fread(paste0(\"${ld_scores}/base_chr_\",i,\".l2.M_5_50\"))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/base_chr_\",i,\".annot.gz\")),header=T)\n", + " data$SNP = paste0(data$CHR,\":\",data$BP)\n", + " fwrite(data,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(data_2,paste0(\"${ld_scores}/AD_base_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", + " fwrite(data_3,paste0(\"${ld_scores}/AD_base_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " for (j in seq(1,${num_features})){\n", + " data = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\")),header=T)\n", + " data_2 = fread(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".l2.M_5_50\"))\n", + " data_3 = read.table(gzfile(paste0(\"${ld_scores}/feat_\",j,\"_chr_\",i,\".annot.gz\")),header=T)\n", + " data$SNP = paste0(data$CHR,\":\",data$BP)\n", + " fwrite(data,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.ldscore.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " fwrite(data_2,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".l2.M_5_50\"),quote=F,sep=\"\\t\",row.names=F,col.names=F)\n", + " fwrite(data_3,paste0(\"${ld_scores}/AD_feat_\",j,\"_chr_\",i,\".annot.gz\"),quote=F,sep=\"\\t\",row.names=F,col.names=T)\n", + " }\n", + " }\n", + " \n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + 
"source": [ + "## Calculate Enrichments" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "[calc_enrichment]\n", + "\n", + "#Path to Control Summary statistics File\n", + "parameter: ctrl_sumstats = str\n", + "#Path to AD Summary statistics File\n", + "parameter: AD_sumstats = str\n", + "#Path to Reference LD Scores File Directory \n", + "parameter: ref_ld = str\n", + "#Path to LD Weight Files for Control Sumstats (Format like minimal working example)\n", + "parameter: w_ld_ctrl = str\n", + "#path to frequency files for Control Sumstats (Format like minimal working example)\n", + "parameter: frq_file_ctrl = str\n", + "#Path to LD Weight Files for AD Sumstats (Format like minimal working example)\n", + "parameter: w_ld_AD = str\n", + "#path to frequency files for AD Sumstats (Format like minimal working example)\n", + "parameter: frq_file_AD = str\n", + "#Number of Features\n", + "parameter: num_features = int \n", + "#Control Phenotype, For Output\n", + "parameter: pheno = str\n", + "\n", + "bash: expand = '${ }'\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + 
"sos": { + "kernels": [ + [ + "Python 3 (ipykernel)", + "python3", + "python3", + "", + { + "name": "ipython", + "version": 3 + } + ], + [ + "SoS", + "sos", + "", + "", + "sos" + ] + ], + "panel": { + "displayed": true, + "height": 0 + }, + "version": "0.22.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/LDSC/LDSC_Code.sos b/LDSC/LDSC_Code.sos new file mode 100644 index 0000000..6d928fa --- /dev/null +++ b/LDSC/LDSC_Code.sos @@ -0,0 +1,127 @@ +#!/usr/bin/env sos-runner +#fileformat=SOS1.0 + +[make_annot] + +# Make Annotated Bed File + +# path to bed file +parameter: bed = str +#path to bim file +parameter: bim = str +#name of output annotation file +parameter: annot = str +bash: + python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file {bed} --bimfile {bim} --annot-file {annot} + +[munge_sumstats_no_sign] + + + +#path to summary statistic file +parameter: sumst = str +#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program +parameter: alleles = "w_hm3.snplist" +#path to output file +parameter: output_sumst = str +#does summary statistic contain Z or Beta +parameter: signed = False + +bash: expand = '${ }' + if [${signed}==True] + then + python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc + fi + +# This option is for when the summary statistic file does contain a signed summary statistic (Z or Beta) +[munge_sumstats_sign] + + + +#path to summary statistic file +parameter: sumst = str +#path to Hapmap3 SNPs file, keep all columns (SNP, A1, and A2) for the munge_sumstats program +parameter: alleles = "w_hm3.snplist" +#path to output file +parameter: output_sumst_2 = str +#does summary statistic contain Z or Beta +parameter: signed = False + +bash: expand = '${ }' + if [${signed}==False] + then + python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2} + fi + 
+[calc_ld_score]
+
+#Path to directory with bim files
+parameter: bim = path()
+#Path to directory with annotation files, output will appear here too. Make sure to remove the SNP, CHR, and BP columns from the annotation files if present before running.
+parameter: annot_files = path()
+#number of features
+parameter: num_features = int
+# Feature annotations are scored on chromosome 22 only, the base annotation on chromosomes 1-22; xargs substitutes each number from seq for {} and runs up to 4 ldsc.py jobs in parallel.
+bash: expand = '${ }'
+    #echo {annot_files} > out.txt
+    seq 1 ${num_features} | xargs -I {} -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_{}_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_{}_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt
+    seq 1 22 | xargs -I {} -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.{} --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_{}.annot.gz --thin-annot --out ${annot_files}/base_chr_{} --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt
+
+[convert_ld_snps]
+
+#Path to directory with ld score files AND annotation files
+parameter: ld_scores = str
+#number of features
+parameter: num_features = int
+
+# For every chromosome, write AD_-prefixed copies of the base and per-feature LD score, M_5_50 and annotation files; only the SNP column of the LD score file changes (to CHR:BP).
+R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif"
+    library(tidyverse)
+    #library(R.utils)
+    library(data.table)
+    for (i in seq(1,22)){
+        data = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".l2.ldscore.gz")),header=TRUE)
+        data_2 = fread(paste0("${ld_scores}/base_chr_",i,".l2.M_5_50"))
+        data_3 = read.table(gzfile(paste0("${ld_scores}/base_chr_",i,".annot.gz")),header=TRUE)
+        data$SNP = paste0(data$CHR,":",data$BP)
+        fwrite(data,paste0("${ld_scores}/AD_base_chr_",i,".l2.ldscore.gz"),quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)
+        fwrite(data_2,paste0("${ld_scores}/AD_base_chr_",i,".l2.M_5_50"),quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
+        fwrite(data_3,paste0("${ld_scores}/AD_base_chr_",i,".annot.gz"),quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)
+        for (j in seq_len(${num_features})){
+            data = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.ldscore.gz")),header=TRUE)
+            data_2 = fread(paste0("${ld_scores}/feat_",j,"_chr_",i,".l2.M_5_50"))
+            data_3 = read.table(gzfile(paste0("${ld_scores}/feat_",j,"_chr_",i,".annot.gz")),header=TRUE)
+            data$SNP = paste0(data$CHR,":",data$BP)
+            fwrite(data,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.ldscore.gz"),quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)
+            fwrite(data_2,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".l2.M_5_50"),quote=FALSE,sep="\t",row.names=FALSE,col.names=FALSE)
+            fwrite(data_3,paste0("${ld_scores}/AD_feat_",j,"_chr_",i,".annot.gz"),quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)
+        }
+    }
+
+
+
+[calc_enrichment]
+
+#Path to Control Summary statistics File
+parameter: ctrl_sumstats = str
+#Path to AD Summary statistics File
+parameter: AD_sumstats = str
+#Path to Reference LD Scores File Directory
+parameter: ref_ld = str
+#Path to LD Weight Files for Control Sumstats (Format like minimal working example)
+parameter: w_ld_ctrl = str
+#path to frequency files for Control Sumstats (Format like minimal working example)
+parameter: frq_file_ctrl = str
+#Path to LD Weight Files for AD Sumstats (Format like minimal working example)
+parameter: w_ld_AD = str
+#path to frequency files for AD Sumstats (Format like minimal working example)
+parameter: frq_file_AD = str
+#Number of Features
+parameter: num_features = int
+#Control Phenotype, For Output
+parameter: pheno = str
+# Run ldsc.py --h2 once per feature, first for the control and then for the AD summary statistics; {} is the xargs placeholder for the feature number.
+bash: expand = '${ }'
+    seq 1 ${num_features} | xargs -I {} -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_{}_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_{}
+    seq 1 ${num_features} | xargs -I {} -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_{}_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_{}
+ From caac1a743b2aaf42344ca626c4b34459a3c8732e Mon Sep 17 00:00:00 2001 From: asingh100 <55717171+asingh100@users.noreply.github.com> Date: Wed, 9 Mar 2022 23:01:50 -0500 Subject: [PATCH 20/63] Finished LDSC Code Notebook and Minimal Working Example (Tutorial) Notebook --- LDSC/LDSC.ipynb | 433 +++++++++++++++++++++++++++++++++++++++++++ LDSC/LDSC.sos | 3 + LDSC/LDSC_Code.ipynb | 54 ++++-- LDSC/LDSC_Code.sos | 54 ++++-- 4 files changed, 504 insertions(+), 40 deletions(-) create mode 100644 LDSC/LDSC.ipynb create mode 100644 LDSC/LDSC.sos diff --git a/LDSC/LDSC.ipynb b/LDSC/LDSC.ipynb new file mode 100644 index 0000000..240f180 --- /dev/null +++ b/LDSC/LDSC.ipynb @@ -0,0 +1,433 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "# Stratified LD Score Regression \n", + "This notebook implements the pipeline of [S-LDSC](https://github.com/bulik/ldsc/wiki) for LD score and functional enrichment analysis. It is written by Anmol Singh (singh.anmol@columbia.edu), with input from Dr. Gao Wang.\n", + "\n", + "**FIXME: the initial draft is complete but pending Gao's review and documentation with minimal working example**" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Markdown" + }, + "source": [ + "The pipeline is developed to integrate GWAS summary statistics data, annotation data, and LD reference panel data to compute functional enrichment for each of the epigenomic annotations that the user provides using the S-LDSC model. We will first start off with an introduction, instructions to set up, and the minimal working examples. Then the workflow code that can be run using SoS on any data will be at the end. \n", + "\n", + "## A brief review on Stratified LD score regression\n", + "\n", + "Here I briefly review LD Score Regression and what it is used for. For more in-depth information on LD Score Regression please read the following three papers:\n", + "\n", + "1. 
\"LD Score regression distinguishes confounding from polygenicity in genome-wide association studies\" by Bulik-Sullivan et al (2015)\n", + "\n", + "2. \"Partitioning heritability by functional annotation using genome-wide association summary statistics\" by Finucane et al (2015)\n", + "\n", + "3. \"Linkage disequilibrium–dependent architecture of human complex traits shows action of negative selection\" by Gazal et al (2017)\n", + "\n", + "As stated in Bulik-Sullivan et al 2015, confounding factors and polygenic effects can cause inflated test statistics and other methods cannot distinguish between inflation from confounding bias and a true signal. LD Score Regression (LDSC) is a technique that aims to identify the impact of confounding factors and polygenic effects using information from GWAS summary statistics. \n", + "\n", + "This approach involves using regression to measure the relationship between Linkage Disequilibrium (LD) scores and test statistics of SNPs from the GWAS summary statistics. Variants in LD with a \"causal\" variant show an elevation in test statistics in association analysis proportional to their LD (measured by $r^2$) with the causal variant within a certain window size (could be 1 cM, 1 kb, etc.). In contrast, inflation from confounders such as population stratification that occurs purely from genetic drift will not correlate with LD. For a polygenic trait, SNPs with a high LD score will have more significant $\\chi^2$ statistics on average than SNPs with a low LD score. Thus, if we regress the $\\chi^2$ statistics from GWAS against LD Score, the intercept minus one is an estimator of the mean contribution of confounding bias to the inflation in the test statistics. The regression model is known as LD Score regression. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "### LDSC model\n", + "\n", + "Under a polygenic assumption, in which effect sizes for variants are drawn independently from distributions with variance proportional to $1/(p(1-p))$ where $p$ is the minor allele frequency (MAF), the expected $\\chi^2$ statistic of variant $j$ is:\n", + "\n", + "$$E[\\chi^2|l_j] = Nh^2l_j/M + Na + 1 \\quad (1)$$\n", + "\n", + "where $N$ is the sample size; $M$ is the number of SNPs, such that $h^2/M$ is the average heritability explained per SNP; $a$ measures the contribution of confounding biases, such as cryptic relatedness and population stratification; and $l_j = \\sum_k r^2_{jk}$ is the LD Score of variant $j$, which measures the amount of genetic variation tagged by $j$. A full derivation of this equation is provided in the Supplementary Note of Bulik-Sullivan et al (2015). An alternative derivation is provided in Supplementary Note of Zhu and Stephens (2017) AoAS.\n", + "\n", + "From this we can see that LD Score regression can be used to compute SNP-based heritability for a phenotype or trait, from GWAS summary statistics and does not require genotype information like other methods such as REML do. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "### Stratified LDSC\n", + "\n", + "Heritability is the proportion of phenotypic variation (VP) that is due to variation in genetic values (VG) and thus can tell us how much of the difference in observed phenotypes in a sample is due to difference in genetics in the sample. It can also be extended to analyze partitioned heritability for a phenotype/trait split over categories. 
\n", + "\n", + "For Partitioned Heritability or Stratified LD Score Regression (S-LDSC) more power is added to our analysis by leveraging LD Score information as well as using SNPs that haven't reached Genome Wide Significance to partition heritability for a trait over categories which many other methods do not do. \n", + "\n", + "\n", + "S-LDSC relies on the fact that the $\\chi^2$ association statistic for a given SNP includes the effects of all SNPs tagged by this SNP meaning that in a region of high LD in the genome the given SNP from the GWAS represents the effects of a group of SNPs in that region.\n", + "\n", + "S-LDSC determines that a category of SNPs is enriched for heritability if SNPs with high LD to that category have more significant $\\chi^2$ statistics than SNPs with low LD to that category.\n", + "\n", + "Here, enrichment of a category is defined as the proportion of SNP heritability in the category divided by the proportion of SNPs in that category.\n", + "\n", + "More precisely, under a polygenic model, the expected $\\chi^2$ statistic of SNP $j$ is\n", + "\n", + "$$E[\\chi^2_j] = N\\sum_C \\tau_C l(j,C) + Na + 1 \\quad (2)$$\n", + "\n", + "where $N$ is sample size, $C$ indexes categories, $l(j,C) = \\sum_{k \\in C} r^2_{jk}$ is the LD score of SNP $j$ with respect to category $C$, $a$ is a term that measures the contribution of confounding biases, and if the categories are disjoint, $\\tau_C$ is the per-SNP heritability in category $C$; if the categories overlap, then the per-SNP heritability of SNP $j$ is $\\sum_{C: j \\in C} \\tau_C$. Equation 2 allows us to estimate $\\tau_C$ via a (computationally simple) multiple regression of $\\chi^2$ against $l(j,C)$, for either a quantitative or case-control study. 
\n", + "\n", + "To see how these methods have been applied to real world data as well as a further discussion on methods and comparisons to other methods please read the three papers listed at the top of the document."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Command Interface" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "kernel": "SoS" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "usage: sos run LDSC_Code.ipynb [workflow_name | -t targets] [options] [workflow_options]\n", + " workflow_name: Single or combined workflows defined in this script\n", + " targets: One or more targets to generate\n", + " options: Single-hyphen sos parameters (see \"sos run -h\" for details)\n", + " workflow_options: Double-hyphen workflow-specific parameters\n", + "\n", + "Workflows:\n", + " make_annot\n", + " munge_sumstats_no_sign\n", + " munge_sumstats_sign\n", + " calc_ld_score\n", + " convert_ld_snps\n", + " calc_enrichment\n", + "\n", + "Sections\n", + " make_annot:\n", + " Workflow Options:\n", + " --bed VAL (as str, required)\n", + " path to bed file\n", + " --bim VAL (as str, required)\n", + " path to bim file\n", + " --annot VAL (as str, required)\n", + " name of output annotation file\n", + " munge_sumstats_no_sign:\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst VAL (as str, required)\n", + " path to output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " munge_sumstats_sign: This option is for when the summary statistic file does\n", + " contain a signed summary statistic (Z or Beta)\n", + " Workflow Options:\n", + " --sumst VAL (as str, required)\n", + " path to summary statistic file\n", + " --alleles 'w_hm3.snplist'\n", + " path to Hapmap3 SNPs file, keep all columns (SNP, A1,\n", + " and A2) for the munge_sumstats program\n", + " --output-sumst-2 VAL (as str, required)\n", + " path to 
output file\n", + " --[no-]signed (default to False)\n", + " does summary statistic contain Z or Beta\n", + " calc_ld_score:\n", + " Workflow Options:\n", + " --bim . (as path)\n", + " Path to directory with bim files\n", + " --annot-files . (as path)\n", + " Path to directory with annotation files, output will\n", + " appear here too. Make sure to remove the SNP, CHR, and\n", + " BP columns from the annotation files if present before\n", + " running.\n", + " --num-features VAL (as int, required)\n", + " number of features\n", + " convert_ld_snps:\n", + " Workflow Options:\n", + " --ld-scores VAL (as str, required)\n", + " Path to directory with ld score files AND annotation\n", + " files\n", + " --num-features VAL (as int, required)\n", + " calc_enrichment:\n", + " Workflow Options:\n", + " --ctrl-sumstats VAL (as str, required)\n", + " Path to Control Summary statistics File\n", + " --AD-sumstats VAL (as str, required)\n", + " Path to AD Summary statistics File\n", + " --ref-ld VAL (as str, required)\n", + " Path to Reference LD Scores File Directory\n", + " --w-ld-ctrl VAL (as str, required)\n", + " Path to LD Weight Files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --frq-file-ctrl VAL (as str, required)\n", + " path to frequency files for Control Sumstats (Format\n", + " like minimal working example)\n", + " --w-ld-AD VAL (as str, required)\n", + " Path to LD Weight Files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --frq-file-AD VAL (as str, required)\n", + " path to frequency files for AD Sumstats (Format like\n", + " minimal working example)\n", + " --num-features VAL (as int, required)\n", + " Number of Features\n", + " --pheno VAL (as str, required)\n", + " Control Phenotype, For Output\n" + ] + } + ], + "source": [ + "!sos run LDSC_Code.ipynb -h" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Example Analysis 1: Setting up Summary Statistic File\n", 
+ "\n", + "This section will go over how to set up the summary statistic file for the phenotype you are trying to analyze. The summary statistic file we will use is for BMI and it can be downloaded here: http://www.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files. For the tutorial you also need the list of HapMap SNPs to restrict the summary statistic file to the recommended HapMap Phase 3 SNPs that will be used in the regression. The authors recommend restricting the analysis to HapMap Phase 3 SNPs because most GWAS summary statistics do not have information about imputation quality, thus using HapMap SNPs ensures that you are using well-imputed and common variants for the analysis. This file can be downloaded here: https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/w_hm3.snplist.bz2. The summary statistic file should have the following columns with the following names for the analysis to work:\n", + "\n", + "SNP -- SNP identifier (e.g., rs number)\n", + "\n", + "N -- sample size (which may vary from SNP to SNP).\n", + "\n", + "P -- p-value.\n", + "\n", + "A1 -- first allele (effect allele)\n", + "\n", + "A2 -- second allele (other allele)\n", + "\n", + "Signed Summary Statistic (Can be Z, BETA, or Odds Ratio(label as OR)), is optional if A1 is the risk increasing allele as you can use the other munge_sumstats option in the workflow.\n", + "\n", + "For the tutorial, the BMI summary statistic file is not signed so we will use the `munge_sumstats_no_sign` step in the workflow. 
Once you have set up the summary statistic file with these column headers you can reformat it for the analysis using the following command in the SoS workflow:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "sos run LDSC_Code.ipynb munge_sumstats_no_sign --sumst GIANT_BMI_Speliotes2010_publicrelease_HapMapCeuFreq.txt \\\n", + " --output-sumst BMI \\\n", + " --alleles w_hm3.snplist \\\n", + " --no-signed" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Markdown" + }, + "source": [ + "Explanation of flags:\n", + " \n", + " 1. `--sumst`: path to summary statistic file\n", + " \n", + " 2. `--output-sumst`: Prefix for output file name\n", + " \n", + " 3. `--alleles`: path to hm3 alleles for merging\n", + " \n", + " 4. `--signed` / `--no-signed`: whether the summary statistic file is signed; boolean switch (default is `--no-signed`)\n", + " \n", + "This will return a file called BMI.sumstats.gz which is a gzipped file that will be used as the summary statistic file in our analysis. It contains a row for each variant as well as the Allele Information and the Z score calculated by the workflow.\n", + "\n", + "## Example Analysis 2: Partitioned LD Score Regression\n", + "We first make the annotation file with respect to a specific annotation bed file using the make_annot option in the workflow. For the purposes of this tutorial we will use a Histone Mark annotation from Adipose Tissue, Adipose_Tissue.H3K27ac. I have provided the bed file for this annotation on a google drive folder (https://drive.google.com/drive/folders/1HdG-QsCl6fAspSxGsuoOCapwfnXCyfnU?usp=sharing) so you can download it to run the commands below. **Please place the plink files into a folder called plink_files and make the output annotation directory annot_files before running the command below. 
When running on your own data in the future, please encode the annotation file names with feat_#_chr_# as the prefix so that the future commands in the pipeline will work. The command to make the annotation file for this annotation for one chromosome of the 1000 Genome Phase 3 variants (the reference data) for the tutorial is listed here:**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "sos run LDSC_Code.ipynb make_annot \\\n", + "\t\t--bed Adipose_Tissue.H3K27ac.bed \\\n", + "\t\t--bim plink_files/1000G.EUR.QC.22.bim \\\n", + "\t\t--annot annot_files/feat_1_chr_22.annot.gz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "Markdown" + }, + "source": [ + "Explanation of flags:\n", + " \n", + " 1. `--bim`: path to 1000 Genome bim file\n", + " \n", + " 2. `--bed`: Path to bed file for annotation\n", + " \n", + " 3. `--annot`: output file name\n", + " \n", + "This command will output a file with 0/1 for each variant in the bim file which corresponds to whether this specific variant is within the regions described in the annotation file.\n", + "\n", + "## Example Analysis 3: Calculate LD Scores\n", + "\n", + "After the annotation file is made we can use it to calculate the LD Scores for this annotation. In this case the program recommends that you only print LD Scores for HapMap Phase 3 SNPs. This can be achieved by using the hapmap snplist file which can be found here: https://storage.googleapis.com/broad-alkesgroup-public/LDSCORE/w_hm3.snplist.bz2. 
You must get rid of the A1 and A2 columns in this file and keep only the SNP column before using the command below.\n", + "\n", + "Make sure your annotation files have the same prefix as your LD Score files that you will create as the workflow will not be able to read the annotation files if they have a different prefix when you try to conduct the regression.\n", + "\n", + "The code below shows how to use the workflow to calculate LD Scores for the tutorial annotation. The workflow will automatically calculate LD Scores for each chromosome and the base or reference annotation using one simple command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "sos run LDSC_Code.ipynb calc_ld_score \\\n", + "\t\t--bim plink_files \\\n", + "\t\t--annot_files annot_files \\\n", + "\t\t--num_features 1" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "Explanation of flags:\n", + " \n", + " 1. `--bim`: path to bim files directory\n", + " \n", + " 2. `--num_features`: Number of Epigenomic Features in analysis\n", + " \n", + " 3. `--annot_files`: path to annotation files directory\n", + " \n", + " \n", + "This command outputs the same gzipped LD score file as the simple case but instead of just an LD Score column, it will have one LD Score column for each annotation that you are calculating LD Scores for.\n", + "\n", + "![image.png](attachment:image.png)\n", + "\n", + "Now that we have calculated the LD Scores for each chromosome for our annotation, we can use these LD Scores to conduct the Partitioned LD Score Regression for our annotation. In this case we have to make sure that our annotation files are in the same folder and have the same prefix name as our LD Score files. 
Now we can conduct the Regression for our annotation:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "sos run LDSC_code.ipynb calc_enrichment --sumstats BMI.sumstats.gz\\\n", + " --ref_ld annot_files/base.,annot_files/feat_1_chr_\\\n", + " --w_ld weights_hm3_no_hla/weights.\\\n", + " --overlap-annot\\\n", + " --frq_file 1000G_frq/1000G.mac5eur.\\\n", + " --pheno BMI\\\n", + " --num_features 1" + ] + }, + { + "attachments": { + "image.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "Here the comma indicates that we are concatenating the baseline model with the new annotation.\n", + "\n", + "The results are output in a .results file which shows the proportion of heritability and enrichment attributable to each category for the trait you are studying, in this case BMI.\n", + "\n", + "The results file for this analysis looks like this, where L2_1 represents our Adipose Tissue Annotation and baseL2_0 describes the baseline annotation:\n", + "\n", + "![image.png](attachment:image.png)\n", + "\n", + "\n", + "Explanation of flags:\n", + " \n", + " \n", + " 1. `sumstats`: Path to Summary statistics File\n", + "\n", + " 2. `ref_ld`: Path to Reference LD Scores File Directory \n", + " \n", + " 3. `w_ld`: Path to LD Weight Files\n", + " \n", + " 4. `frq_file`: Path to Frequency Files\n", + " \n", + " 5. `pheno`: Phenotype of Summary statistic file, for output prefix\n", + " \n", + " 6. 
`num_features`: Number of Features in Analysis \n", + " " + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + "sos": { + "kernels": [ + [ + "Markdown", + "markdown", + "markdown", + "", + "" + ], + [ + "SoS", + "sos", + "", + "", + "sos" + ] + ], + "panel": { + "displayed": true, + "height": 0 + }, + "version": "0.22.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/LDSC/LDSC.sos b/LDSC/LDSC.sos new file mode 100644 index 0000000..3acb973 --- /dev/null +++ b/LDSC/LDSC.sos @@ -0,0 +1,3 @@ +#!/usr/bin/env sos-runner +#fileformat=SOS1.0 + diff --git a/LDSC/LDSC_Code.ipynb b/LDSC/LDSC_Code.ipynb index 73b5fc1..4193e03 100644 --- a/LDSC/LDSC_Code.ipynb +++ b/LDSC/LDSC_Code.ipynb @@ -146,8 +146,8 @@ "parameter: bim = str\n", "#name of output annotation file\n", "parameter: annot = str\n", - "bash: \n", - " python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file {bed} --bimfile {bim} --annot-file {annot}" + "bash: expand = '${ }'\n", + " python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file ${bed} --bimfile ${bim} --annot-file ${annot}" ] }, { @@ -183,7 +183,7 @@ "parameter: signed = False\n", "\n", "bash: expand = '${ }'\n", - " if [${signed}==True]\n", + " if [${signed}==False]\n", " then\n", " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc\n", " fi" @@ -221,7 +221,7 @@ "parameter: signed = False\n", "\n", "bash: expand = '${ }'\n", - " if [${signed}==False]\n", + " if [${signed}==True]\n", " then\n", " python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2}\n", " fi" @@ -252,9 +252,30 @@ "parameter: annot_files = 
path()\n", "#number of features\n", "parameter: num_features = int\n", - "\n", + " \n", "bash: expand = '${ }'\n", " #echo {annot_files} > out.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.1 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_1.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_1 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.2 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_2.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_2 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.3 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_3.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_3 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.4 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_4.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_4 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.5 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_5.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_5 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.6 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_6.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_6 --print-snps 
/mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.7 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_7.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_7 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.8 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_8.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_8 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.9 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_9.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_9 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.10 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_10.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_10 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.11 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_11.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_11 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.12 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_12.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_12 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " 
seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.13 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_13.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_13 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.14 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_14.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_14 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.15 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_15.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_15 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.16 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_16.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_16 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.17 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_17.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_17 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.18 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_18.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_18 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 
/mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.19 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_19.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_19 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.20 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_20.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_20 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.21 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_21.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_21 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt\n", " seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt" ] @@ -331,28 +352,21 @@ "source": [ "[calc_enrichment]\n", "\n", - "#Path to Control Summary statistics File\n", - "parameter: ctrl_sumstats = str\n", - "#Path to AD Summary statistics File\n", - "parameter: AD_sumstats = str\n", + "#Path to Summary statistics File\n", + "parameter: sumstats = str\n", "#Path to Reference LD Scores File Directory \n", "parameter: ref_ld = str\n", - "#Path to LD Weight Files for Control Sumstats (Format like minimal working example)\n", 
- "parameter: w_ld_ctrl = str\n", - "#path to frequency files for Control Sumstats (Format like minimal working example)\n", - "parameter: frq_file_ctrl = str\n", - "#Path to LD Weight Files for AD Sumstats (Format like minimal working example)\n", - "parameter: w_ld_AD = str\n", - "#path to frequency files for AD Sumstats (Format like minimal working example)\n", - "parameter: frq_file_AD = str\n", - "#Number of Features\n", + "#Path to LD Weight Files (Format like minimal working example)\n", + "parameter: w_ld = str\n", + "#path to frequency files (Format like minimal working example)\n", + "parameter: frq_file = str\n", "parameter: num_features = int \n", "#Control Phenotype, For Output\n", "parameter: pheno = str\n", "\n", "bash: expand = '${ }'\n", - " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j\n", - " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j" + " seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld} --overlap-annot --frqfile-chr ${frq_file} --out ${ref_ld}/${pheno}_feat_j\n", + " #seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j" ] } ], diff --git a/LDSC/LDSC_Code.sos b/LDSC/LDSC_Code.sos index 6d928fa..454c612 100644 --- a/LDSC/LDSC_Code.sos +++ b/LDSC/LDSC_Code.sos @@ -11,8 +11,8 @@ 
parameter: bed = str parameter: bim = str #name of output annotation file parameter: annot = str -bash: - python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file {bed} --bimfile {bim} --annot-file {annot} +bash: expand = '${ }' + python2 /mnt/mfs/statgen/Anmol/ldsc/make_annot.py --bed-file ${bed} --bimfile ${bim} --annot-file ${annot} [munge_sumstats_no_sign] @@ -28,7 +28,7 @@ parameter: output_sumst = str parameter: signed = False bash: expand = '${ }' - if [${signed}==True] + if [${signed}==False] then python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst} --a1-inc fi @@ -48,7 +48,7 @@ parameter: output_sumst_2 = str parameter: signed = False bash: expand = '${ }' - if [${signed}==False] + if [${signed}==True] then python2 /mnt/mfs/statgen/Anmol/ldsc/munge_sumstats.py --sumstats ${sumst} --merge-alleles ${alleles} --out ${output_sumst_2} fi @@ -61,9 +61,30 @@ parameter: bim = path() parameter: annot_files = path() #number of features parameter: num_features = int - + bash: expand = '${ }' #echo {annot_files} > out.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.1 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_1.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_1 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.2 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_2.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_2 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.3 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_3.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_3 --print-snps 
/mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.4 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_4.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_4 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.5 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_5.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_5 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.6 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_6.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_6 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.7 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_7.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_7 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.8 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_8.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_8 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.9 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_9.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_9 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 
/mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.10 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_10.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_10 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.11 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_11.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_11 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.12 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_12.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_12 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.13 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_13.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_13 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.14 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_14.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_14 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.15 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_15.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_15 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.16 --l2 --ld-wind-cm 1 --annot 
${annot_files}/feat_j_chr_16.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_16 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.17 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_17.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_17 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.18 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_18.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_18 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.19 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_19.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_19 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.20 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_20.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_20 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.21 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_21.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_21 --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.22 --l2 --ld-wind-cm 1 --annot ${annot_files}/feat_j_chr_22.annot.gz --thin-annot --out ${annot_files}/feat_j_chr_22 --print-snps 
/mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt seq 1 22| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --bfile ${bim}/1000G.EUR.QC.j --l2 --ld-wind-cm 1 --annot ${annot_files}/base_chr_j.annot.gz --thin-annot --out ${annot_files}/base_chr_j --print-snps /mnt/mfs/statgen/Anmol/ldsc/tutorial_data/w_hm3.snplist/snplist.txt @@ -102,26 +123,19 @@ R: expand = "${ }", container="/mnt/mfs/statgen/Anmol/r-packages.sif" [calc_enrichment] -#Path to Control Summary statistics File -parameter: ctrl_sumstats = str -#Path to AD Summary statistics File -parameter: AD_sumstats = str +#Path to Summary statistics File +parameter: sumstats = str #Path to Reference LD Scores File Directory parameter: ref_ld = str -#Path to LD Weight Files for Control Sumstats (Format like minimal working example) -parameter: w_ld_ctrl = str -#path to frequency files for Control Sumstats (Format like minimal working example) -parameter: frq_file_ctrl = str -#Path to LD Weight Files for AD Sumstats (Format like minimal working example) -parameter: w_ld_AD = str -#path to frequency files for AD Sumstats (Format like minimal working example) -parameter: frq_file_AD = str -#Number of Features +#Path to LD Weight Files (Format like minimal working example) +parameter: w_ld = str +#path to frequency files (Format like minimal working example) +parameter: frq_file = str parameter: num_features = int #Control Phenotype, For Output parameter: pheno = str bash: expand = '${ }' - seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${ctrl_sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld_ctrl} --overlap-annot --frqfile-chr ${frq_file_ctrl} --out ${ref_ld}/${pheno}_feat_j - seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr 
${frq_file_AD} --out ${ref_ld}/AD_feat_j + seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${sumstats} --ref-ld-chr ${ref_ld}/base_chr_,${ref_ld}/feat_j_chr_ --w-ld-chr ${w_ld} --overlap-annot --frqfile-chr ${frq_file} --out ${ref_ld}/${pheno}_feat_j + #seq 1 ${num_features}| xargs -n 1 -I j -P 4 python2 /mnt/mfs/statgen/Anmol/ldsc/ldsc.py --h2 ${AD_sumstats} --ref-ld-chr ${ref_ld}/AD_base_chr_,${ref_ld}/AD_feat_j_chr_ --w-ld-chr ${w_ld_AD} --overlap-annot --frqfile-chr ${frq_file_AD} --out ${ref_ld}/AD_feat_j From 824514541e10975d38dc53e489cb2d4ae84f5f43 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Tue, 15 Mar 2022 09:54:53 -0400 Subject: [PATCH 21/63] fixes to burden part --- GWAS/LMM.ipynb | 57 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index 1790ad4..57aec3c 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -956,7 +956,7 @@ "parameter: mind_filter = 0.0\n", "input: bfile\n", "output: f'{cwd}/cache/{bfile:bn}.qc_pass.id', f'{cwd}/cache/{bfile:bn}.qc_pass.snplist' \n", - "task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, walltime = '10h', mem = '20G', cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "bash: container=container_lmm, expand= \"${ }\", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout' \n", " plink2 \\\n", " --bfile ${bfile:n} --mac 1 \\\n", @@ -1160,7 +1160,7 @@ "outputs": [], "source": [ "[regenie_burden_3]\n", - "# Select the annotations to be used in the mask file. format: mask# annotatio type\n", + "# Select the annotations to be used in the mask file. 
format: mask# annotation type\n", "parameter: mask_file = path(\".\")\n", "# Select the upper MAF to generate masks\n", "parameter: aaf_bins =[0.05]\n", @@ -1645,35 +1645,39 @@ "# Annotation file format: variantID, gene and functional annotation (space/tab delimited)\n", "parameter: anno_file = path\n", "input: snpannofile, anno_file\n", - "output: f'{cwd}/cache/someanno.txt',\n", - " f'{cwd}/cache/nonsin.genelist',\n", - " f'{cwd}/cache/nondup.snplist',\n", - " f'{cwd}/cache/someannoslim.csv'\n", + "output: f'{cwd}/cache/{snpannofile:nn}.subset.csv',\n", + " f'{cwd}/cache/non_singleton.genelist',\n", + " f'{cwd}/cache/non_duplicated.snplist',\n", + " f'{cwd}/cache/{anno_file:bnn}.burden_variants.csv'\n", "task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'\n", "\n", - "# Extract target fields (CADD and GWAS catelog) from snpannofile to a smaller file someanno.txt\n", - "bash:container=container_lmm, expand = \"${ }\", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'\n", - " awk -F \",\" -v OFS=\"\\t\" '\n", - " NR==1 {\n", - " for (i=1; i<=NF; i++){\n", - " f[$i] = i}\n", - " } \n", - " {print $(f[\"Chr\"]), $(f[\"Start\"]), $(f[\"End\"]), $(f[\"Ref\"]),$(f[\"Alt\"]),$(f[\"avsnp150\"]),$(f[\"gwasCatalog\"]), $(f[\"CADD_raw\"]), $(f[\"CADD_phred\"])}\n", - " ' ${_input[0]} > ${_output[0]}\n", - "\n", "python: container=container_lmm, expand= \"${ }\", stderr = f'{cwd}/{step_name}.stderr', stdout = f'{cwd}/{step_name}.stdout'\n", " import pandas as pd\n", " import os\n", + " # Get the columns from the annotation file needed for this analysis and make a smaller file\n", + " combined_cols= ['Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'avsnp150','CADD_phred', 'AF_nfe_exome']\n", + " df = pd.read_csv(${snpannofile:r}, compression='gzip', quotechar = '\"', dtype=\"string\", usecols=combined_cols)\n", + " print('Reading the annotation file done')\n", + " 
df.to_csv(${_output[0]:r}, index=False)\n", + " print('Saving subset of columns from the annotation file to output finished')\n", + "\n", + "\n", " snplist=[]\n", " nonsingenelist=[]\n", - " for file in os.listdir(\"${cwd}/cache/\"):\n", + " for file in os.listdir(\"${cwd}/cache\"):\n", " if file.endswith(\"burden_masks.snplist\"):\n", - " data=pd.read_csv(os.path.join(\"${cwd}/cache/\", file),header=None, sep=\"\\t\")\n", - "\n", + " data=pd.read_csv(os.path.join(\"${cwd}/cache\", file),header=None, sep=\"\\t\")\n", " # Gather all the SNPs \n", " snpset=data.iloc[:,1].str.split(\",\")\n", " snplist=snplist+[snp for snps in snpset for snp in snps]\n", - " # Remove the single variant genes and gather all the non-sin genes\n", + " def get_single_var_genes(list):\n", + " single_var = 0\n", + " for snp in range(len(snpset)):\n", + " single_var += len(snpset[snp])==1\n", + " return single_var\n", + " print('There are', get_single_var_genes(snpset), 'genes with only one variant')\n", + " print('There are', len(snpset) - get_single_var_genes(snpset), 'genes with more than one variant')\n", + " # Remove the single variant genes and gather all the non-single variant genes\n", " nonsingeneindex=[]\n", " for i in range(len(snpset)):\n", " if len(snpset[i])!=1:\n", @@ -1694,14 +1698,16 @@ " snplistfile.close()\n", "\n", " # Extract target SNPs from someanno.txt to an even smaller file someannoslim.csv\n", - " data=pd.read_csv(${_output[0]:r},sep=\"\\t\",lineterminator=\"\\n\",header=None,names=['Chr', 'Start', 'End', 'Ref', 'Alt', 'avsnp150', 'gwasCatalog', 'CADD_raw', 'CADD_phred'])\n", + " data=pd.read_csv(${_output[0]:r},lineterminator=\"\\n\")\n", " data=data.iloc[1:,:]\n", " data = data.astype(str)\n", " data['Chr'] = 'chr' + data['Chr'].astype(str)\n", " data[\"varID\"] = data.Chr.str.cat(others=[data.Start, data.Ref, data.Alt], sep=':')\n", - " anno_file=pd.read_csv(${_input[1]:r},sep=\" \",header=None,names=[snptag,'gene','funcion'])\n", + " 
anno_file=pd.read_csv(${_input[1]:r},sep=\" \",header=None,names=[snptag,'gene','function'])\n", + " print('Annotation input file for regenie contains', len(anno_file),'variants')\n", " minianno=data[data[snptag].isin(snplistslim)]\n", " minianno=pd.merge(minianno,anno_file[[snptag,'gene']],on=[snptag],how=\"inner\")\n", + " print('After merging non duplicated variants in burden_mask.snplist with the annotation file',len(minianno),'variants remain')\n", " minianno.to_csv(${_output[3]:r},index=False)" ] }, @@ -1722,7 +1728,7 @@ "# A given list of gene for annotation\n", "parameter: genelist = \"\"\n", "# Path to the SNP counts\n", - "parameter: nonsingenelist = f'{cwd}/cache/nonsin.genelist'\n", + "parameter: nonsingenelist = f'{cwd}/cache/non_singleton.genelist'\n", "# Select the annotations to be used in the mask file. format: mask# annotatio type\n", "parameter: mask_file = path(\".\")\n", "# Select the upper MAF to generate masks\n", @@ -1742,7 +1748,7 @@ " nonsingenelist=pd.read_csv(\"${nonsingenelist}\",sep=\"\\n\",header=None).iloc[:,0].to_list()\n", "\n", " for bin in binlist:\n", - " # Separate regenie resulta into mask and MAF bins\n", + " # Separate regenie results into mask and MAF bins\n", " data[data['ALT']==bin].to_csv(\"${_input[0]:nn}.\"+bin+'.snp_stats.gz', compression='gzip', sep='\\t', header = True, index = False)\n", " # Remove the single variant genes\n", " data2=data[data['SNP'].isin(nonsingenelist)]\n", @@ -2036,7 +2042,8 @@ "# A given list of gene for annotation\n", "parameter: genelist = \"\"\n", "# Annotation file\n", - "parameter: snpsomeanno = f'{cwd}/cache/someannoslim.csv'\n", + "parameter: anno_file = path\n", + "parameter: snpsomeanno = f'{cwd}/cache/{anno_file:b nn}.burden_variants.csv'\n", "# Select the annotations to be used in the mask file. 
format: mask# annotatio type\n", "parameter: mask_file = path(\".\")\n", "# Select the upper MAF to generate masks\n", From 0d2815f9d81219c288fb27225581ad974da3a343 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Tue, 15 Mar 2022 11:31:19 -0400 Subject: [PATCH 22/63] deal with one input geno --- GWAS/Region_Extraction.ipynb | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/GWAS/Region_Extraction.ipynb b/GWAS/Region_Extraction.ipynb index 1cb5a27..3dc2a27 100644 --- a/GWAS/Region_Extraction.ipynb +++ b/GWAS/Region_Extraction.ipynb @@ -151,11 +151,11 @@ "# Work directory where output will be saved to\n", "parameter: cwd = path\n", "# Region specifications\n", - "parameter: region_file = path\n", + "parameter: region_file = path()\n", "# Genotype file inventory\n", - "parameter: geno_path = path\n", + "parameter: geno_path = path()\n", "# Phenotype path\n", - "parameter: pheno_path = path\n", + "parameter: pheno_path = path()\n", "# Sample file path, for bgen format\n", "parameter: bgen_sample_path = path('.')\n", "# Path to summary stats file\n", @@ -163,13 +163,13 @@ "# Path to summary stats format configuration\n", "parameter: format_config_path = path('.')\n", "# Path to samples of unrelated individuals\n", - "parameter: unrelated_samples = path\n", + "parameter: unrelated_samples = path()\n", + "# imputed Genotype file inventory\n", + "parameter: imp_geno_path = path()\n", + "# Path to summary stats file\n", + "parameter: imp_sumstats_path = path()\n", "# Number of tasks to run in each job on cluster\n", "parameter: job_size = int\n", - "# Number of tasks to run in each job on cluster\n", - "parameter: imp_geno_path = path\n", - "# Path to summary stats file\n", - "parameter: imp_sumstats_path = path\n", "# The reference genome of imputed genotype data\n", "parameter: imp_ref = str\n", "parameter: walltime = '12h'\n", @@ -348,11 +348,14 @@ " \n", " input_format_config = ${format_config_path:r} if 
${format_config_path.is_file()} else None\n", "\n", - " \n", + " chrom = \"${_regions[0]}\"\n", " # Load genotype file for the region of interest\n", " geno_inventory = dict([x.strip().split() for x in open(input_geno_path).readlines() if x.strip()])\n", - " imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n", - " chrom = \"${_regions[0]}\"\n", + " if yml_path.is_file(): \n", + " imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n", + " else:\n", + " imp_geno_inventory={'0':None,chrom:None}\n", + " \n", " if chrom.startswith('chr'):\n", " chrom = chrom[3:]\n", " if chrom not in geno_inventory:\n", From e2f6d4b82f8c10ff1bb373542039bab1cf1ecb7a Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Thu, 17 Mar 2022 09:27:41 -0400 Subject: [PATCH 23/63] added Gao's comment on the specificaton of regenie_qc --- GWAS/LMM.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index 57aec3c..ffca4ee 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -937,7 +937,9 @@ "kernel": "SoS" }, "source": [ - "Documentation can be found [here](https://rgcgithub.github.io/regenie/). Binary and quantitative traits should be analyzed separately. " + "Documentation can be found [here](https://rgcgithub.github.io/regenie/). Binary and quantitative traits should be analyzed separately. \n", + "\n", + "**step regenie_qc needs a minimum of 20G and 10h to be able to run on the cluster. 
FIXME: determine these variables in a better way so that they are not hard-coded; refer to [PR](https://github.com/cumc/bioworkflows/pull/152)** " ] }, { From d6e29caad7b24cfcc7874c15a903dc3233a27c53 Mon Sep 17 00:00:00 2001 From: yuliu426 Date: Thu, 31 Mar 2022 16:43:40 -0400 Subject: [PATCH 24/63] update GMMAT--add bhat and sbhat --- GWAS/LMM.ipynb | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index ffca4ee..acf218c 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -1452,6 +1452,10 @@ " MAF.range = c(${bgenMinMAF},1-${bgenMinMAF}), \n", " miss.cutoff = ${geno_filter},\n", " nperbatch = ${nperbatch})\n", + " score=read.table('${cwd}/${_input:bn}.${phenoFile:bn}.gmmat.score.txt', header=T)\n", + " score$BHAT=score$SCORE/score$VAR\n", + " score$SBHAT=1/sqrt(score$VAR)\n", + " write.table(score,'${cwd}/${_input:bn}.${phenoFile:bn}.gmmat.score.txt', sep = '\\t', quote = F, col.names = T, row.names = F)\n", "bash: container=container_lmm,expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " gzip ${cwd}/${_input:bn}.${phenoFile:bn}.gmmat.score.txt" ] @@ -1520,7 +1524,7 @@ "#maximum minor allele frequencies of variants\n", "parameter: maf_max_filter = float\n", "output: f'{cwd}/{_input[0]:bn}.smmat.burden.txt.gz'\n", - "task: trunk_workers = 1, trunk_size = 1, walltime = '10h', mem = '120G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'\n", + "task: trunk_workers = 1, trunk_size = 1, walltime = '10h', mem = '12G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'\n", "R: container=container_lmm, expand='${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " library('dplyr')\n", " library('GMMAT')\n", From 147d24ddf08fd4079b2160a78fefd2d0c1cc0fcb Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Mon, 18 Apr 2022 16:23:31 -0400 Subject: [PATCH 25/63] change LDtools to cugg --- GWAS/liftover.ipynb | 24 ++++++++++++++---------- 1 file changed, 14
insertions(+), 10 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 635bc44..566aab2 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -33,7 +33,7 @@ "Make sure you install the pre-requisited before running this notebook:\n", "\n", "```\n", - "pip install LDtoolsets -U\n", + "pip install cugg -U\n", "```" ] }, @@ -155,7 +155,7 @@ "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", "# The path of yaml file with input file format, only for sumstat file.\n", - "parameter: yml_file = path('.') \n", + "parameter: yml_file = path() \n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -180,15 +180,15 @@ "outputs": [], "source": [ "[default_1 (export utils script)]\n", - "depends: Py_Module('LDtools'), Py_Module('pathlib'),Py_Module('pandas')\n", + "depends: Py_Module('cugg'), Py_Module('pathlib'),Py_Module('pandas')\n", "output: f'{cwd:a}/utils.py'\n", "report: expand = '${ }', output=f'{cwd:a}/utils.py'\n", "\n", " import pandas as pd\n", " from pathlib import Path\n", - " from LDtools.genodata import *\n", - " from LDtools.sumstat import Sumstat\n", - " from LDtools.liftover import Liftover\n", + " from cugg.genodata import *\n", + " from cugg.sumstat import Sumstat\n", + " from cugg.liftover import Liftover\n", " def liftover(input_path,output_path,yml=None,fr='hg19',to='hg38',remove_missing=True,rename=True):\n", " lf = Liftover(fr,to)\n", " print(\"liftover from \" + fr +\" to \" +to)\n", @@ -251,13 +251,17 @@ "output: f'{cwd}/{output_file}'\n", "python: input = f'{cwd:a}/utils.py', expand = '${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " \n", + " \n", + " import os.path\n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = '${fr}'\n", - " to = '${to}'\n", + " fr = f'${fr}'\n", + " to = f'${to}'\n", " 
remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = '${yml_file}'\n", + " yml_file = f'${yml_file}'\n", + " if not os.path.isfile(yml_file):\n", + " yml_file = None\n", " print(fr,to,remove_missing)\n", " liftover(input_path,output_path,yml_file,fr,to,remove_missing,rename)" ] @@ -294,7 +298,7 @@ "sos" ] ], - "version": "0.22.6" + "version": "0.22.7" } }, "nbformat": 4, From 27b0f1d44c6c6372fada5a0e262ad3f64793dca3 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Mon, 18 Apr 2022 16:29:06 -0400 Subject: [PATCH 26/63] update --- GWAS/Region_Extraction.ipynb | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/GWAS/Region_Extraction.ipynb b/GWAS/Region_Extraction.ipynb index 3dc2a27..9fb3530 100644 --- a/GWAS/Region_Extraction.ipynb +++ b/GWAS/Region_Extraction.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "7a09f4ce", + "id": "feda1754-b5fc-4c37-beb7-4f43b39174fa", "metadata": { "kernel": "SoS" }, @@ -12,7 +12,7 @@ }, { "cell_type": "markdown", - "id": "8259e900", + "id": "2a3789d2-ef07-4513-bcb8-881ba984b967", "metadata": { "kernel": "SoS" }, @@ -22,7 +22,7 @@ }, { "cell_type": "markdown", - "id": "185fb76b", + "id": "212f861a-0038-4112-a1d3-00b7b40183c3", "metadata": { "kernel": "SoS" }, @@ -32,7 +32,7 @@ }, { "cell_type": "markdown", - "id": "c94dcb53", + "id": "de3fdee2-9312-4d3e-82c6-78c1c52d69d0", "metadata": { "kernel": "SoS" }, @@ -56,7 +56,6 @@ "cell_type": "markdown", "id": "9070e9aa", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "kernel": "SoS", "tags": [] }, @@ -151,11 +150,11 @@ "# Work directory where output will be saved to\n", "parameter: cwd = path\n", "# Region specifications\n", - "parameter: region_file = path()\n", + "parameter: region_file = path\n", "# Genotype file inventory\n", - "parameter: geno_path = path()\n", + "parameter: geno_path = path\n", "# Phenotype path\n", - "parameter: pheno_path = path()\n", + "parameter: pheno_path = path\n", "# Sample file 
path, for bgen format\n", "parameter: bgen_sample_path = path('.')\n", "# Path to summary stats file\n", From 940c0aa93b12ecd140dc4321468f692871cf313d Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Wed, 20 Apr 2022 15:16:52 -0400 Subject: [PATCH 27/63] update --- GWAS/liftover.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 566aab2..9c8ab05 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -255,11 +255,11 @@ " import os.path\n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = f'${fr}'\n", - " to = f'${to}'\n", + " fr = ${fr}\n", + " to = ${to}\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = f'${yml_file}'\n", + " yml_file = ${yml_file}\n", " if not os.path.isfile(yml_file):\n", " yml_file = None\n", " print(fr,to,remove_missing)\n", From f9c2588a499dfd3b4eff7871cdb3ec415e511bd4 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Fri, 29 Apr 2022 16:43:52 -0400 Subject: [PATCH 28/63] fix path issue --- GWAS/Region_Extraction.ipynb | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/GWAS/Region_Extraction.ipynb b/GWAS/Region_Extraction.ipynb index 9fb3530..578d1c9 100644 --- a/GWAS/Region_Extraction.ipynb +++ b/GWAS/Region_Extraction.ipynb @@ -48,7 +48,7 @@ "Make sure you install the pre-requisited before running this notebook:\n", "\n", "```\n", - "pip install LDtoolsets\n", + "pip install cugg\n", "```" ] }, @@ -71,7 +71,7 @@ "- `--pheno-path`, the path of a phenotype. Only for one genotype data. 
If `None`, only `pld` will be calculated.\n", " - The phenotype file should have a column with the name `IID`, which is used to represent the sample ID.\n", "- `--sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n", - " - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n", + " - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n", "- `--unrelated-samples`, the file path of unrelated samples with a column named `IID`. If `None`, all samples will be considered unrelative. \n", "- `--cwd`, the path of output directory\n", "\n", @@ -81,7 +81,7 @@ " - The first column is chromosome ID, the 2nd file is genotype for that chromosome.\n", " - When chromosome ID is 0, it implies that the genotype file contains all the genotypes.\n", "- `--imp-sumstats-path`, the path of the GWAS file, including all summary statistics (eg, $\\hat{\\beta}$, $SE(\\hat{\\beta})$ and p-values)\n", - " - These summary statistics should contain at least these columns: `chrom, pos, ref, alt, snp_id, bhat, sbhat, p`\n", + " - These summary statistics should contain at least these columns: `CHR,POS,A0,A1,BETA,SE,P`\n", "- `--imp-ref`, the reference genome if exome genotype and imputed genotype are different. 
If `None`, The two genotype data will be considered from the same " ] }, @@ -155,12 +155,10 @@ "parameter: geno_path = path\n", "# Phenotype path\n", "parameter: pheno_path = path\n", - "# Sample file path, for bgen format\n", - "parameter: bgen_sample_path = path('.')\n", "# Path to summary stats file\n", "parameter: sumstats_path = path\n", - "# Path to summary stats format configuration\n", - "parameter: format_config_path = path('.')\n", + "# Sample file path, for bgen format\n", + "parameter: bgen_sample_path = path()\n", "# Path to samples of unrelated individuals\n", "parameter: unrelated_samples = path()\n", "# imputed Genotype file inventory\n", @@ -170,7 +168,7 @@ "# Number of tasks to run in each job on cluster\n", "parameter: job_size = int\n", "# The reference genome of imputed genotype data\n", - "parameter: imp_ref = str\n", + "parameter: imp_ref = str('')\n", "parameter: walltime = '12h'\n", "parameter: mem = '60G'\n", "fail_if(not region_file.is_file(), msg = 'Cannot find regions to extract. 
Please specify them using ``--region-file`` option.')\n", @@ -189,18 +187,18 @@ "outputs": [], "source": [ "[default_1 (export utils script)]\n", - "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('LDtools')\n", + "depends: Py_Module('pandas'), Py_Module('numpy'), Py_Module('dask'), Py_Module('cugg')\n", "parameter: scan_window = 500000\n", "output: f'{cwd:a}/utils.py'\n", "report: expand = '${ }', output=f'{cwd:a}/utils.py'\n", " import pandas as pd\n", " import numpy as np\n", " import dask.array as da\n", - " from LDtools.liftover import *\n", - " from LDtools.genodata import *\n", - " from LDtools.sumstat import *\n", - " from LDtools.ldmatrix import *\n", - " from LDtools.utils import *\n", + " from cugg.liftover import *\n", + " from cugg.genodata import *\n", + " from cugg.sumstat import *\n", + " from cugg.ldmatrix import *\n", + " from cugg.utils import *\n", "\n", "\n", " def main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD,bgen_sample_path):\n", @@ -344,16 +342,24 @@ " imp_sumstats_path = ${_input[5]:r}\n", " bgen_sample_path = ${_input[6]:r}\n", " imp_ref = '${imp_ref}'\n", + "\n", + " if not imp_ref:\n", + " imp_ref=None\n", + "\n", + " if not os.path.isfile(bgen_sample_path):\n", + " bgen_sample_path=None\n", + " print('If the genotype data is bgen format, please provide the path of bgen sample')\n", " \n", " input_format_config = ${format_config_path:r} if ${format_config_path.is_file()} else None\n", "\n", " chrom = \"${_regions[0]}\"\n", " # Load genotype file for the region of interest\n", " geno_inventory = dict([x.strip().split() for x in open(input_geno_path).readlines() if x.strip()])\n", - " if yml_path.is_file(): \n", + " if os.path.isfile(imp_geno_path): \n", " imp_geno_inventory = dict([x.strip().split() for x in open(imp_geno_path).readlines() if x.strip()])\n", " else:\n", " imp_geno_inventory={'0':None,chrom:None}\n", + " 
imp_sumstats_path = None\n", " \n", " if chrom.startswith('chr'):\n", " chrom = chrom[3:]\n", From 9e331e687116be1a42e422edff4a5fdede905c35 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Thu, 12 May 2022 16:20:42 -0400 Subject: [PATCH 29/63] fix path in burden test --- GWAS/LMM.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index acf218c..d43b5fe 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -1651,7 +1651,7 @@ "# Annotation file format: variantID, gene and functional annotation (space/tab delimited)\n", "parameter: anno_file = path\n", "input: snpannofile, anno_file\n", - "output: f'{cwd}/cache/{snpannofile:nn}.subset.csv',\n", + "output: f'{cwd}/cache/{snpannofile:bnn}.subset.csv',\n", " f'{cwd}/cache/non_singleton.genelist',\n", " f'{cwd}/cache/non_duplicated.snplist',\n", " f'{cwd}/cache/{anno_file:bnn}.burden_variants.csv'\n", @@ -2049,7 +2049,7 @@ "parameter: genelist = \"\"\n", "# Annotation file\n", "parameter: anno_file = path\n", - "parameter: snpsomeanno = f'{cwd}/cache/{anno_file:b nn}.burden_variants.csv'\n", + "parameter: snpsomeanno = f'{cwd}/cache/{anno_file:bnn}.burden_variants.csv'\n", "# Select the annotations to be used in the mask file. 
format: mask# annotatio type\n", "parameter: mask_file = path(\".\")\n", "# Select the upper MAF to generate masks\n", @@ -2226,7 +2226,7 @@ " topgene<-data[data$P<=${plim},] \n", " } \n", " if (${1 if k !=\"\" else 0} ){\n", - " topgene<-data[order(data$P),][1:${k},] \n", + " topgene<-data[order(data$P),][1:as.double(${k}),] \n", " } \n", " \n", " if (${1 if genelist !=\"\" else 0}){\n", From 9fef882f8b9026bd1dd470167f6e835991b5bfcb Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 12 May 2022 20:51:23 -0400 Subject: [PATCH 30/63] Fix the PR #156 --- GWAS/LMM.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index d43b5fe..ea2af99 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -2042,7 +2042,7 @@ "# ylim set to 0 to use maximum -log10(p) in data\n", "parameter: ylim = 0\n", "# Top k genes to be annotated\n", - "parameter: k = \"\"\n", + "parameter: k = 0\n", "# P value limitation for annotation\n", "parameter: plim = 2.5E-6\n", "# A given list of gene for annotation\n", @@ -2225,8 +2225,8 @@ " if (${1 if plim !=\"\" else 0}){\n", " topgene<-data[data$P<=${plim},] \n", " } \n", - " if (${1 if k !=\"\" else 0} ){\n", - " topgene<-data[order(data$P),][1:as.double(${k}),] \n", + " if (${k}>0){\n", + " topgene<-data[order(data$P),][1:${k},] \n", " } \n", " \n", " if (${1 if genelist !=\"\" else 0}){\n", @@ -2416,7 +2416,7 @@ "displayed": true, "height": 0 }, - "version": "0.22.6" + "version": "0.22.4" }, "toc-showcode": false }, From 8deee736b5a235395aca6c051e80a073276f8a08 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Tue, 17 May 2022 16:09:05 -0400 Subject: [PATCH 31/63] Update GATK to latest version --- variant-calling/README.md | 7 +++++++ variant-calling/gatk4-annovar.dockerfile | 22 ++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/variant-calling/README.md b/variant-calling/README.md index a9941eb..68881d0 100644 --- a/variant-calling/README.md +++ 
b/variant-calling/README.md @@ -4,3 +4,10 @@ To build and upload the docker container for the GATK+ANNOVAR pipeline, docker build --build-arg DUMMY=`date +%s` -t gaow/gatk4-annovar -f gatk4-annovar.dockerfile . docker push gaow/gatk4-annovar ``` + +to build singularity container, + +``` +spython recipe gatk4-annovar.dockerfile | sed 's/Stage: spython-base//g' &> gatk4-annovar.def +singularity build --fakeroot gatk4-annovar.sif gatk4-annovar.def +``` diff --git a/variant-calling/gatk4-annovar.dockerfile b/variant-calling/gatk4-annovar.dockerfile index 0bfed94..8c6ef45 100644 --- a/variant-calling/gatk4-annovar.dockerfile +++ b/variant-calling/gatk4-annovar.dockerfile @@ -1,32 +1,30 @@ -# Add GATK4 to debian-ngs +FROM debian:stable-slim -FROM gaow/debian-ngs:latest - -# :) MAINTAINER Gao Wang, wang.gao@columbia.edu # Install tools WORKDIR /tmp -## https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=863199 -RUN mkdir -p /usr/share/man/man1 + + RUN apt-get update -y \ && apt-get install -qq -y --no-install-recommends \ + curl ca-certificates \ + tabix samtools \ default-jdk python3 python3-matplotlib r-base \ build-essential zlib1g-dev libbz2-dev liblzma-dev \ && apt-get autoclean \ && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log -ENV GATK_VERSION 4.1.6.0 -ADD https://raw.githubusercontent.com/broadinstitute/gatk/${GATK_VERSION}/scripts/docker/gatkbase/install_R_packages.R /opt +ENV GATK_VERSION 4.2.6.1 +ADD https://raw.githubusercontent.com/broadinstitute/gatk/4.1.6.0/scripts/docker/gatkbase/install_R_packages.R /opt +RUN Rscript /opt/install_R_packages.R && rm -rf /tmp/* RUN curl -L \ https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip -o gatk.zip \ && unzip gatk.zip \ && mv gatk-${GATK_VERSION} /opt \ && ln -s /opt/gatk-${GATK_VERSION}/gatk /usr/local/bin/gatk \ - && rm -rf /tmp/* -#RUN Rscript /opt/install_R_packages.R && rm -rf /tmp/* + && rm -rf /tmp/gatk.zip RUN ln -s /usr/bin/python3 /usr/local/bin/python -COPY 
Annovar.tar.gz /tmp -RUN tar zxvf Annovar.tar.gz -C /usr/local/bin && rm -f /tmp/* +RUN curl http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz -o /tmp/annovar.latest.tar.gz && tar zxvf /tmp/annovar.latest.tar.gz -C /usr/local/bin && rm -f /tmp/annovar.latest.tar.gz # Default command CMD ["bash"] From 362af6492748d5da3b597ccf76e30fad1cd6ceed Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Tue, 17 May 2022 16:12:10 -0400 Subject: [PATCH 32/63] Minor change --- variant-calling/gatk4-annovar.dockerfile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/variant-calling/gatk4-annovar.dockerfile b/variant-calling/gatk4-annovar.dockerfile index 8c6ef45..66418ac 100644 --- a/variant-calling/gatk4-annovar.dockerfile +++ b/variant-calling/gatk4-annovar.dockerfile @@ -5,7 +5,6 @@ MAINTAINER Gao Wang, wang.gao@columbia.edu # Install tools WORKDIR /tmp - RUN apt-get update -y \ && apt-get install -qq -y --no-install-recommends \ curl ca-certificates \ @@ -14,9 +13,9 @@ RUN apt-get update -y \ build-essential zlib1g-dev libbz2-dev liblzma-dev \ && apt-get autoclean \ && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log -ENV GATK_VERSION 4.2.6.1 ADD https://raw.githubusercontent.com/broadinstitute/gatk/4.1.6.0/scripts/docker/gatkbase/install_R_packages.R /opt RUN Rscript /opt/install_R_packages.R && rm -rf /tmp/* +ENV GATK_VERSION 4.2.6.1 RUN curl -L \ https://github.com/broadinstitute/gatk/releases/download/${GATK_VERSION}/gatk-${GATK_VERSION}.zip -o gatk.zip \ && unzip gatk.zip \ From fba718efb9cd13794aa8671ed2f06ccabd797d3d Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 May 2022 17:32:36 -0400 Subject: [PATCH 33/63] Add progress track for LDSC --- ldpred/ldpred.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ldpred/ldpred.ipynb b/ldpred/ldpred.ipynb index 6e1d88a..9d8ae14 100644 --- a/ldpred/ldpred.ipynb +++ b/ldpred/ldpred.ipynb @@ -1080,7 +1080,7 @@ " # Open a temporary file\n", " tmp = 
tempfile(tmpdir = \"${cwd}/ld-cache\")\n", " on.exit(file.remove(paste0(tmp, \".sbk\")), add = TRUE)\n", - " \n", + " print(\"Computing LD matrix\") \n", " for (chr in 1:22) {\n", " # Extract SNPs that are included in the chromosome\n", " ind.chr <- which(info_snp$chr == chr)\n", @@ -1101,12 +1101,13 @@ " corr$add_columns(corr0, nrow(corr))\n", " }\n", " }\n", - " \n", + " print(\"LD matrix computed. Performing LDSC\")\n", " ldsc <- snp_ldsc(ld, \n", " length(ld), \n", " chi2 = (info_snp$beta / info_snp$beta_se)^2,\n", " sample_size = info_snp$n_eff,\n", " blocks = NULL)\n", + " print(\"LDSC completed\")\n", " saveRDS(list(ld=ld,corr=corr,ldsc=ldsc), file = \"${_output}\")" ] }, @@ -1504,7 +1505,7 @@ "displayed": true, "height": 0 }, - "version": "0.22.4" + "version": "0.22.9" }, "toc-autonumbering": false, "toc-showmarkdowntxt": false From 64fde8bf40cf48502ca5304f730bc0753ce8c88e Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Tue, 21 Jun 2022 17:39:55 -0400 Subject: [PATCH 34/63] Add chmod +x to docker image --- variant-calling/gatk4-annovar.dockerfile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/variant-calling/gatk4-annovar.dockerfile b/variant-calling/gatk4-annovar.dockerfile index 66418ac..25439bb 100644 --- a/variant-calling/gatk4-annovar.dockerfile +++ b/variant-calling/gatk4-annovar.dockerfile @@ -23,7 +23,13 @@ RUN curl -L \ && ln -s /opt/gatk-${GATK_VERSION}/gatk /usr/local/bin/gatk \ && rm -rf /tmp/gatk.zip RUN ln -s /usr/bin/python3 /usr/local/bin/python -RUN curl http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz -o /tmp/annovar.latest.tar.gz && tar zxvf /tmp/annovar.latest.tar.gz -C /usr/local/bin && rm -f /tmp/annovar.latest.tar.gz +#RUN curl http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz -o /tmp/annovar.latest.tar.gz +COPY annovar.latest.tar.gz /tmp +RUN tar zxvf /tmp/annovar.latest.tar.gz -C /usr/local/bin && rm -f /tmp/annovar.latest.tar.gz && chmod +x 
/usr/local/bin/*.pl # Default command CMD ["bash"] + +# To build singularity image out of this: +# spython recipe gatk4-annovar.dockerfile | sed 's/Stage: spython-base//g' &> gatk4-annovar.def +# singularity build --fakeroot gatk4-annovar.sif gatk4-annovar.def From 73307f711b487bb57af676b25ee6ba0dfac5fe14 Mon Sep 17 00:00:00 2001 From: UxxUnet Date: Thu, 23 Jun 2022 15:56:43 -0400 Subject: [PATCH 35/63] Fix the unzip folder for .pl and bugs in install_R_packages.R --- variant-calling/gatk4-annovar.dockerfile | 7 +++-- variant-calling/install_R_packages.R | 34 ++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 variant-calling/install_R_packages.R diff --git a/variant-calling/gatk4-annovar.dockerfile b/variant-calling/gatk4-annovar.dockerfile index 25439bb..9e2bfed 100644 --- a/variant-calling/gatk4-annovar.dockerfile +++ b/variant-calling/gatk4-annovar.dockerfile @@ -13,7 +13,7 @@ RUN apt-get update -y \ build-essential zlib1g-dev libbz2-dev liblzma-dev \ && apt-get autoclean \ && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log -ADD https://raw.githubusercontent.com/broadinstitute/gatk/4.1.6.0/scripts/docker/gatkbase/install_R_packages.R /opt +ADD https://raw.githubusercontent.com/cumc/bioworkflows/master/variant-calling/install_R_packages.R /opt RUN Rscript /opt/install_R_packages.R && rm -rf /tmp/* ENV GATK_VERSION 4.2.6.1 RUN curl -L \ @@ -25,7 +25,10 @@ RUN curl -L \ RUN ln -s /usr/bin/python3 /usr/local/bin/python #RUN curl http://www.openbioinformatics.org/annovar/download/0wgxR2rIVP/annovar.latest.tar.gz -o /tmp/annovar.latest.tar.gz COPY annovar.latest.tar.gz /tmp -RUN tar zxvf /tmp/annovar.latest.tar.gz -C /usr/local/bin && rm -f /tmp/annovar.latest.tar.gz && chmod +x /usr/local/bin/*.pl +RUN tar zxvf /tmp/annovar.latest.tar.gz \ + && mv /tmp/annovar/* /usr/local/bin/ \ + && rm -f /tmp/annovar.latest.tar.gz \ + && chmod +x /usr/local/bin/*.pl # Default command CMD ["bash"] diff --git 
a/variant-calling/install_R_packages.R b/variant-calling/install_R_packages.R new file mode 100644 index 0000000..06455a2 --- /dev/null +++ b/variant-calling/install_R_packages.R @@ -0,0 +1,34 @@ +############################################################################### +# If you edit this file you MUST release a new version of the gatkbase docker # +# built with the updated r dependencies # +# # +# you MUST also manually clear the travis cache for master before running the # +# pull request tests in order to make sure it's still working # +############################################################################### + +options(warn = 2) # treat warnings as errors, otherwise script can fail silently if a package fails to install + +InstallPackageFromArchive = function(packageName, packageURL) { + # make sure to use http not https as this will give an "unsupported URL scheme" error + if (!(packageName %in% rownames(installed.packages()))) { + install.packages(packageURL, repos = NULL, type = "source", clean = TRUE) + } +} + +dependencies = c("gplots", + "digest", "gtable", "MASS", "plyr", "reshape2", "scales", "tibble", "lazyeval", # for ggplot2 + "tidyselect", "BH", "plogr") # for dplyr +repos <- c("http://cran.r-project.org") +install.packages(dependencies, repos = repos, clean = TRUE) + +InstallPackageFromArchive("getopt", "http://cran.r-project.org/src/contrib/Archive/getopt/getopt_1.20.0.tar.gz") +InstallPackageFromArchive("optparse", "http://cran.r-project.org/src/contrib/Archive/optparse/optparse_1.3.2.tar.gz") +install.packages("data.table") +InstallPackageFromArchive("gsalib", "http://cran.r-project.org/src/contrib/gsalib_2.1.tar.gz") +InstallPackageFromArchive("ggplot2", "http://cran.r-project.org/src/contrib/Archive/ggplot2/ggplot2_2.2.1.tar.gz") +install.packages("dplyr") + +# HMM is only required for testing and not in production: +install.packages("HMM") + +q(save = "no") \ No newline at end of file From 0e214889266e67940ebe8561eafa715ce36cf28f 
Mon Sep 17 00:00:00 2001 From: UxxUnet Date: Wed, 29 Jun 2022 13:59:27 -0400 Subject: [PATCH 36/63] Update the annotation mapping --- variant-annotation/annovar.ipynb | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/variant-annotation/annovar.ipynb b/variant-annotation/annovar.ipynb index 09da05b..6065601 100644 --- a/variant-annotation/annovar.ipynb +++ b/variant-annotation/annovar.ipynb @@ -642,12 +642,18 @@ " df2[\"Gene\"] = df2.apply(rename_chrom, axis=1)\n", " \n", " # Match annovar annotations with regenie_burden needs \n", - " annotation_mappings = {\"nonsynonymous\":'missense', \"frameshift\":'LoF', \"stopgain\":'LoF', \"stoploss\":'LoF', \"synonymous\":'synonymous'}\n", + " annotation_mappings = {\"nonsynonymous SNV\":'missense', \n", + " \"synonymous SNV\":'synonymous',\n", + " \"frameshift substitution\":'LoF', \n", + " \"stopgain\":'LoF', \n", + " \"stoploss\":'LoF',\n", + " \"startloss\":'LoF',\n", + " \"nonframeshift substitution\":\"inframe\"\n", + " }\n", + "\n", " def annotation(x):\n", - " x = x.strip().split()\n", - " for i in x:\n", - " if i in annotation_mappings.keys():\n", - " return annotation_mappings[i]\n", + " if x in annotation_mappings.keys():\n", + " return annotation_mappings[x]\n", " return 'other'\n", " df2[\"anno_cat\"] = df2[\"ExonicFunc.refGene\"].apply(annotation)\n", " \n", @@ -771,7 +777,7 @@ "sos" ] ], - "version": "0.22.6" + "version": "0.23.3" } }, "nbformat": 4, From 7800069c54fd05170d7562037c9106dcf7265a9e Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Fri, 1 Jul 2022 12:08:48 -0400 Subject: [PATCH 37/63] add the option to keep certain samples when making the GRM --- GWAS/LMM.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GWAS/LMM.ipynb b/GWAS/LMM.ipynb index d43b5fe..8ef74af 100644 --- a/GWAS/LMM.ipynb +++ b/GWAS/LMM.ipynb @@ -767,6 +767,7 @@ "source": [ "# Partition the GRM into 100 parts and allocate 8GB memory to each job\n", "[gcta_1]\n", + "parameter: keep_samples = 
path('.')\n", "# Number of parts the GRM calculation is to be partitioned\n", "parameter: parts = 100\n", "part_number = [f'{parts}_{format(x+1, \"0\" + str(len(str(parts))))}' for x in range(parts)]\n", @@ -779,6 +780,7 @@ " gcta64 \\\n", " --bfile ${_input[0]:n} \\\n", " --make-grm-part ${parts} ${_index+1} \\\n", + " ${(\"--keep \" ) if keep_samples.is_file() else \"\"}\n", " --thread-num ${numThreads} \\\n", " --out ${_output[0]:nnn}" ] From 60b48fcdf3e2031bf81d5e6cb4a88843f5ba0350 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Fri, 1 Jul 2022 13:33:34 -0400 Subject: [PATCH 38/63] fixed bugs in the liftover pipeline --- GWAS/liftover.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 9c8ab05..e633521 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -155,7 +155,7 @@ "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", "# The path of yaml file with input file format, only for sumstat file.\n", - "parameter: yml_file = path() \n", + "parameter: yml_file = path('.') \n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -167,7 +167,7 @@ "# Rename Variant ID\n", "parameter: rename = True\n", "# Container\n", - "parameter: container = str" + "parameter: container = path('.')" ] }, { @@ -255,11 +255,11 @@ " import os.path\n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = ${fr}\n", - " to = ${to}\n", + " fr = \"${fr}\"\n", + " to = \"${to}\"\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = ${yml_file}\n", + " yml_file = \"${yml_file}\"\n", " if not os.path.isfile(yml_file):\n", " yml_file = None\n", " print(fr,to,remove_missing)\n", @@ -298,7 +298,7 @@ "sos" ] ], - "version": "0.22.7" + "version": "0.22.6" } }, "nbformat": 4, From 
bddc6811898611d56a4b75ae87cbf938321d3dfc Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Tue, 5 Jul 2022 15:27:59 -0400 Subject: [PATCH 39/63] update --- GWAS/liftover.ipynb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 9c8ab05..896208b 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -59,7 +59,7 @@ " - if plink format, provide the path of `bim` file \n", " - if gvcf/vcf format, the file must have gvcf and vcf in suffixes\n", " - other format will be considered as sumstat format, whose header should have CHR, POS, A0 and A1 columns\n", - "- `--yml_file`, if the sumstat header doesn't have CHR, POS, A0 and A1 columns, you need to provide a ymal file to describe the format of your file, such as following. the first 5 row is required.\n", + "- `--yml_file`, if the sumstat header doesn't have CHR, POS, A0 and A1 columns, you need to provide a ymal file to describe the format of your file, such as following. the first 5 row is required. 
**ID is the combination of the key words that appear before `:` in the yml file.**\n", "```\n", "ID: CHR,POS,A0,A1\n", "CHR: CHR\n", @@ -155,7 +155,7 @@ "# Input file which can be plink format, gvcf/vcf format, sumstat format.\n", "parameter: input_file = path\n", "# The path of yaml file with input file format, only for sumstat file.\n", - "parameter: yml_file = path() \n", + "parameter: yml_file = path('.') \n", "# the name of ouput file which will be saved under cwd path\n", "parameter: output_file = path\n", "# From reference genome, defaut is hg19\n", @@ -167,7 +167,7 @@ "# Rename Variant ID\n", "parameter: rename = True\n", "# Container\n", - "parameter: container = str" + "parameter: container = '.'" ] }, { @@ -255,11 +255,11 @@ " import os.path\n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = ${fr}\n", - " to = ${to}\n", + " fr = f'${fr}'\n", + " to = f'${to}'\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = ${yml_file}\n", + " yml_file = f'${yml_file}'\n", " if not os.path.isfile(yml_file):\n", " yml_file = None\n", " print(fr,to,remove_missing)\n", From d43ac57bb55b7a4799a833021604c0c9890a38ef Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Wed, 6 Jul 2022 14:25:49 -0400 Subject: [PATCH 40/63] update --- GWAS/liftover.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 896208b..0d50112 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -167,7 +167,7 @@ "# Rename Variant ID\n", "parameter: rename = True\n", "# Container\n", - "parameter: container = '.'" + "parameter: container = ''" ] }, { From 374e4e3972cfdc3bce863f4e55d6595ce64b68b4 Mon Sep 17 00:00:00 2001 From: Yin Huang Date: Thu, 7 Jul 2022 16:24:52 -0400 Subject: [PATCH 41/63] update --- GWAS/liftover.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index e6a85bf..443e38a 100644 ---
a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -255,11 +255,11 @@ " import os.path\n", " input_path=${_input[0]:r}\n", " output_path=${_output[0]:r}\n", - " fr = f'${fr}'\n", - " to = f'${to}'\n", + " fr = '${fr}'\n", + " to = '${to}'\n", " remove_missing=${remove_missing}\n", " rename = ${rename}\n", - " yml_file = f'${yml_file}'\n", + " yml_file = '${yml_file}'\n", " if not os.path.isfile(yml_file):\n", " yml_file = None\n", " print(fr,to,remove_missing)\n", @@ -298,7 +298,7 @@ "sos" ] ], - "version": "0.22.6" + "version": "0.22.7" } }, "nbformat": 4, From ba4b304b0540205aaef1c199b0142b75ad1fa88a Mon Sep 17 00:00:00 2001 From: UxxUnet Date: Thu, 7 Jul 2022 16:25:56 -0400 Subject: [PATCH 42/63] Make the annotation mapping more general --- variant-annotation/annovar.ipynb | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/variant-annotation/annovar.ipynb b/variant-annotation/annovar.ipynb index 6065601..4ff64f0 100644 --- a/variant-annotation/annovar.ipynb +++ b/variant-annotation/annovar.ipynb @@ -642,18 +642,19 @@ " df2[\"Gene\"] = df2.apply(rename_chrom, axis=1)\n", " \n", " # Match annovar annotations with regenie_burden needs \n", - " annotation_mappings = {\"nonsynonymous SNV\":'missense', \n", - " \"synonymous SNV\":'synonymous',\n", - " \"frameshift substitution\":'LoF', \n", + " annotation_mappings = {\"nonsynonymous\":'missense', \n", + " \"synonymous\":'synonymous',\n", + " \"frameshift\":'LoF',\n", " \"stopgain\":'LoF', \n", " \"stoploss\":'LoF',\n", " \"startloss\":'LoF',\n", - " \"nonframeshift substitution\":\"inframe\"\n", + " \"nonframeshift\":\"inframe\"\n", " }\n", - "\n", " def annotation(x):\n", - " if x in annotation_mappings.keys():\n", - " return annotation_mappings[x]\n", + " x = x.strip().split()\n", + " for i in x:\n", + " if i in annotation_mappings.keys():\n", + " return annotation_mappings[i]\n", " return 'other'\n", " df2[\"anno_cat\"] = df2[\"ExonicFunc.refGene\"].apply(annotation)\n", " \n", From 
905257b71f2b0a927d5d1c5a4b0160c06fbcfff7 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Wed, 13 Jul 2022 13:56:36 -0400 Subject: [PATCH 43/63] fix error handeling indels in avinput forma --- variant-annotation/annovar.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/variant-annotation/annovar.ipynb b/variant-annotation/annovar.ipynb index 4ff64f0..ab9a026 100644 --- a/variant-annotation/annovar.ipynb +++ b/variant-annotation/annovar.ipynb @@ -454,8 +454,8 @@ "task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{_output:bn}'\n", "bash: expand= \"${ }\", stderr = f'{_output:n}.err', stdout = f'{_output:n}.out' \n", " # $6 ref_allele, $5 alt_allele in the bim files \n", - " # Output as annovar avinput chr, start, end (has to be calculated depending on allele length), reference, alternative\n", - " awk '{if ($6 > $5) {print $1, $4, $4 + (length ($6) - length ($5)), $6, $5, $2} else {print $1, $4, $4, $6, $5, $2}}' ${_input} > ${_output}" + " # Output as annovar avinput chr, start, end (has to be calculated depending on reference allele length), reference, alternative\n", + " awk '{if (length ($6) > 1) {print $1, $4, $4 + (length ($6) - 1), $6, $5, $2} else {print $1, $4, $4, $6, $5, $2}}' ${_input} > ${_output}" ] }, { @@ -524,7 +524,7 @@ " operation = ['g', 'g', 'g', 'g', 'r', 'r', 'f', 'f', 'f', 'f', 'f']\n", " arg = ['\"-splicing 12 -exonicsplicing\"', '\"-splicing 30\"', '\"-splicing 12 -exonicsplicing\"', '\"-splicing 12\"', '', '', '', '', '', '', '']\n", "else:\n", - " protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'phastConsElements30way', 'encRegTfbsClustered', 'gwasCatalog', 'gnomad30_genome', 'gnomad211_exome', 'gme', 'kaviar_20150923', 'avsnp150', 'dbnsfp41a', 'dbscsnv11', 'clinvar_20200316', 'gene4denovo201907']\n", + " protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'phastConsElements30way', 'encRegTfbsClustered', 'gwasCatalog', 'gnomad30_genome', 
'gnomad211_exome', 'gme', 'kaviar_20150923', 'avsnp150', 'dbnsfp41a', 'dbscsnv11', 'clinvar_20220320', 'gene4denovo201907']\n", " operation = ['g', 'g', 'g', 'gx', 'r', 'r', 'r', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']\n", " arg = ['\"-splicing 12 -exonicsplicing\"', '\"-splicing 30\"', '\"-splicing 12 -exonicsplicing\"', '\"-splicing 12\"', '', '', '', '', '', '', '', '', '', '', '', '']\n", "\n", @@ -778,7 +778,7 @@ "sos" ] ], - "version": "0.23.3" + "version": "0.22.6" } }, "nbformat": 4, From 3040b1148fb5886e03cf197fb229990c057dd1e1 Mon Sep 17 00:00:00 2001 From: hsun3163 <54919134+hsun3163@users.noreply.github.com> Date: Tue, 18 Oct 2022 18:06:29 -0400 Subject: [PATCH 44/63] Add container to liftover.ipynb --- GWAS/liftover.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GWAS/liftover.ipynb b/GWAS/liftover.ipynb index 443e38a..9b95f77 100644 --- a/GWAS/liftover.ipynb +++ b/GWAS/liftover.ipynb @@ -249,7 +249,7 @@ "depends: f'{cwd:a}/utils.py'\n", "input: input_file\n", "output: f'{cwd}/{output_file}'\n", - "python: input = f'{cwd:a}/utils.py', expand = '${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", + "python: input = f'{cwd:a}/utils.py',container = container, expand = '${ }', stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout'\n", " \n", " \n", " import os.path\n", From 763e83627106f47b18ad8fbfac06aff559f719fc Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Wed, 21 Dec 2022 17:34:00 -0500 Subject: [PATCH 45/63] changes to calling pipeline by Isabelles wishes --- variant-calling/gatk_joint_calling.ipynb | 379 +++++++++++++---------- 1 file changed, 216 insertions(+), 163 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 7ff34fb..2ca9e38 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -78,11 +78,12 @@ "This SoS workflow notebook contains four workflows:\n", "\n", "- 
`gatk_call`\n", - "- `gatk_filter`\n", - "- `annovar`\n", + "- `gatk_filter_strict`\n", + "- `gatk_filter_basic`\n", + "- `vcf_qc`\n", "- `submit_csg`\n", "\n", - "The first three workflows are for the analysis and the last one is for submitting jobs on the cluster.\n", + "The first four workflows are for the analysis and the last one is for submitting jobs on the cluster.\n", "\n", "All workflow steps are numerically ordered to reflect the execution logic. This is the most straightforward SoS workflow style, the \"process-oriented\" style. " ] @@ -190,7 +191,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 8, "metadata": { "kernel": "Bash" }, @@ -208,17 +209,22 @@ "\n", "Workflows:\n", " call\n", - " filter\n", - " annovar\n", - " submit_csg\n", + " strict_filter\n", + " basic_filter\n", + " vcf_qc\n", "\n", "Global Workflow Options:\n", " --vcf-prefix joint_call_output (as path)\n", " Combined VCF file prefix, including path to the output\n", " but without vcf.gz extension, eg\n", " \"/path/to/output_filename\".\n", + " --cwd VAL (as path, required)\n", + " Working directory\n", " --build hg19\n", " Human genome build\n", + " --vcf-filter strict\n", + " VCF filtering strategy e.x: strict or basic (default is\n", + " strict)\n", " --mem 12 (as int)\n", " Memory allocated to a job, in terms of Gigabyte\n", " --container-option 'gaow/gatk4-annovar'\n", @@ -238,46 +244,27 @@ " Workflow Options:\n", " --ref-genome VAL (as path, required)\n", " Path to reference genome file\n", - " filter_1: Split into SNP and INDEL for separate PASS filters\n", - " filter_2: PASS or filter for indels and SNPs (Note | not\n", + " strict_filter_1: Split into SNP and INDEL for separate PASS filters\n", + " strict_filter_2: PASS or filter for indels and SNPs (Note | not\n", " recommended for filters) Ignore MQRankSum warnings <-\n", " can only be calculated for het sites (not homs)\n", " Workflow Options:\n", " --snp-filters QD < 2.0, QD2 QUAL < 30.0, QUAL30 SOR 
> 3.0, SOR3 FS > 60.0, FS60 MQ < 40.0, MQ40 MQRankSum < -12.5, MQRankSum-12.5 ReadPosRankSum < -8.0, ReadPosRankSum-8 (as list)\n", " --indel-filters QD < 2.0, QD2 QUAL < 30.0, QUAL30 FS > 200.0, FS200 ReadPosRankSum < -20.0, ReadPosRankSum-20 (as list)\n", - " filter_3: Merge back SNP and INDEL\n", - " filter_4: remove non-PASS variants if wanted\n", - " annovar_1: Annotate\n", + " strict_filter_3: Merge back SNP and INDEL\n", + " strict_filter_4: remove non-PASS variants if wanted\n", + " basic_filter_1: remove all coverage < 4x, strand bias and end of read\n", + " bias\n", " Workflow Options:\n", - " --humandb VAL (as path, required)\n", - " humandb path for ANNOVAR\n", - " --x-ref path(f\"{humandb}/mart_export_2019_LOFtools3.txt\")\n", - "\n", - " add xreffile to option without -exonicsplicing\n", - " mart_export_2019_LOFtools3.txt #xreffile latest option\n", - " -> Phenotype description,HGNC symbol,MIM morbid descript\n", - " ion,CGD_CONDITION,CGD_inh,CGD_man,CGD_comm,LOF_tools\n", - " --protocol refGene refGeneWithVer knownGene ensGene wgEncodeBroadHmmGm12878HMM wgEncodeBroadHmmHmecHMM wgEncodeBroadHmmHepg2HMM wgEncodeBroadHmmH1hescHMM wgEncodeRegDnaseClusteredV3 wgEncodeRegTfbsClusteredV3 genomicSuperDups wgRna targetScanS phastConsElements46way tfbsConsSites gwasCatalog gnomad211_genome gnomad211_exome popfreq_max_20150413 gme kaviar_20150923 abraom avsnp150 dbnsfp35a dbscsnv11 regsnpintron cadd13gt20 clinvar_20200316 mcap13 gene4denovo201907 (as list)\n", - " Annovar protocol\n", - " --operation g g g gx r r r r r r r r r r r r f f f f f f f f f f f f f f (as list)\n", - " Annovar operation\n", - " --arg \"-splicing 12 -exonicsplicing\" \"-splicing 30\" \"-splicing 12 -exonicsplicing\" \"-splicing 12\" (as list)\n", - " Annovar args\n", - " annovar_2: Filter out common variants (from 3 databases) with\n", - " annovar\n", - " Workflow Options:\n", - " --humandb humandb (as path)\n", - " humandb path for ANNOVAR\n", - " --keep 'splic|exonic'\n", - " keep 
pathogenic: use 'pathogenic|Pathogenic', keep\n", - " splice_exonic: use 'splic|exonic'\n", - " submit_csg: Job submission on CSG cluster\n", - " Workflow Options:\n", - " --cmd-file VAL (as path, required)\n", - " Path to job file\n", - " --time '24:00:00'\n", - " Total run time allocated to the script\n", - " --[no-]dryrun (default to False)\n" + " --ref-genome refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa (as path)\n", + " Path to reference genome file\n", + " --variant-filter QUAL < 30.0 , QUAL30 FS > 200.0, FS200 ReadPosRankSum < -20.0, ReadPosRankSum-20 DP < 4, DP4 (as list)\n", + " basic_filter_2: Remove non-PASS variants\n", + " vcf_qc_1: QC VCF for relatedness\n", + " vcf_qc_2: QC VCF for sex check\n", + " vcf_qc_3: QC VCF for IBD\n", + " vcf_qc_4: QC VCF for relatedness\n", + " vcf_qc_5: QC VCF for homozygosity mapping\n" ] } ], @@ -312,26 +299,41 @@ " --vcf-prefix output/minimal_example \\\n", " --samples /mnt/mfs/statgen/data_private/gatk_joint_call_example/20200820_sample_manifest.txt \\\n", " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", - " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\n", + " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa \\\n", + " --cwd output \\\n", + " --vcf_filter strict\n", "```\n", "\n", - "Filtering:\n", + "Filtering with strict filters:\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb filter \\\n", + "sos run gatk_joint_calling.ipynb strict_filter \\\n", " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", - " --vcf-prefix output/minimal_example\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output \\\n", + " --vcf_filter strict\n", "```\n", "\n", - "Annotating:\n", + "Filtering with basic filters:\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb annovar \\\n", + "sos run gatk_joint_calling.ipynb basic_filter \\\n", + " --container-option 
/mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\\\n", + " --cwd output \\\n", + " --vcf_filter strict\n", + "```\n", + "\n", + "VCF quality control (sex checks, IBD, heterozygosity, etc):\n", + "\n", + "```\n", + "sos run gatk_joint_calling.ipynb vcf_qc \\\n", " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", " --vcf-prefix output/minimal_example.snp_indel.filter.PASS \\\n", - " --keep \"splic|exonic\" \\\n", - " --humandb /mnt/mfs/statgen/isabelle/REF/humandb \\\n", - " --x-ref /mnt/mfs/statgen/isabelle/REF/humandb/mart_export_2019_LOFtools3.txt\n", + " --cwd output \\\n", + " --vcf_filter strict\n", + " \n", "```" ] }, @@ -385,8 +387,12 @@ "# Combined VCF file prefix, including path to the output but without vcf.gz extension, \n", "# eg \"/path/to/output_filename\".\n", "parameter: vcf_prefix = path('joint_call_output')\n", + "# Working directory\n", + "parameter: cwd = path\n", "# Human genome build\n", "parameter: build = 'hg19'\n", + "# VCF filtering strategy e.x: strict or basic (default is strict)\n", + "parameter: vcf_filter = 'strict'\n", "# Memory allocated to a job, in terms of Gigabyte\n", "parameter: mem=12\n", "# Software container option\n", @@ -476,6 +482,15 @@ "Since we have two types of variants SNP and Indels, the first two steps of the filter workflow pipeline process the two variant types in parallel, then merge them and do additional filtering wiht steps 3 and 4." 
] }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "### Strict filter" + ] + }, { "cell_type": "code", "execution_count": 10, @@ -485,7 +500,7 @@ "outputs": [], "source": [ "# Split into SNP and INDEL for separate PASS filters\n", - "[filter_1]\n", + "[strict_filter_1]\n", "variant_type = ['SNP', 'INDEL']\n", "input: f'{vcf_prefix:a}.vcf.gz', for_each='variant_type', concurrent = True\n", "output: f'{vcf_prefix:a}.{_variant_type.lower()}.vcf.gz'\n", @@ -508,7 +523,7 @@ "source": [ "# PASS or filter for indels and SNPs (Note | not recommended for filters)\n", "# Ignore MQRankSum warnings <- can only be calculated for het sites (not homs)\n", - "[filter_2]\n", + "[strict_filter_2]\n", "parameter: snp_filters = ['QD < 2.0, QD2', 'QUAL < 30.0, QUAL30', 'SOR > 3.0, SOR3', 'FS > 60.0, FS60', 'MQ < 40.0, MQ40', 'MQRankSum < -12.5, MQRankSum-12.5', 'ReadPosRankSum < -8.0, ReadPosRankSum-8']\n", "parameter: indel_filters = [\"QD < 2.0, QD2\", \"QUAL < 30.0, QUAL30\", \"FS > 200.0, FS200\", \"ReadPosRankSum < -20.0, ReadPosRankSum-20\"]\n", "input: paired_with = dict(filter_option=[snp_filters, indel_filters])\n", @@ -531,7 +546,7 @@ "outputs": [], "source": [ "# Merge back SNP and INDEL\n", - "[filter_3]\n", + "[strict_filter_3]\n", "input: group_by = 'all'\n", "output: f'{vcf_prefix:a}.snp_indel.filter.vcf.gz'\n", "\n", @@ -550,8 +565,8 @@ "outputs": [], "source": [ "# remove non-PASS variants if wanted\n", - "[filter_4]\n", - "output: f'{vcf_prefix:a}.snp_indel.filter.PASS.vcf.gz'\n", + "[strict_filter_4]\n", + "output: strict_out= f'{vcf_prefix:a}.snp_indel.filter.strict_QC.PASS.vcf.gz'\n", "\n", "\n", "bash: container=container_option, expand=\"${ }\", stderr=f'{_output:nn}.err', stdout=f'{_output:nn}.out'\n", @@ -566,7 +581,7 @@ "kernel": "SoS" }, "source": [ - "## Annotation" + "### Basic filter" ] }, { @@ -577,61 +592,38 @@ }, "outputs": [], "source": [ - "# convert vcf to annovar input format\n", - "[annovar_1]\n", + "# remove 
all coverage < 4x, strand bias and end of read bias\n", + "[basic_filter_1]\n", + "# Path to reference genome file\n", + "parameter: ref_genome = path('refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa')\n", + "parameter: variant_filter = ['QUAL < 30.0 , QUAL30', 'FS > 200.0, FS200', 'ReadPosRankSum < -20.0, ReadPosRankSum-20', 'DP < 4, DP4']\n", "input: f'{vcf_prefix:a}.vcf.gz'\n", - "output: f'{_input:nn}.avinput'\n", - "\n", - "bash: container=container_option,expand=\"${ }\", stderr=f'{_output[0]:n}.err', stdout=f'{_output[0]:n}.out'\n", - "\n", - " convert2annovar.pl \\\n", - " -includeinfo \\\n", - " -allsample \\\n", - " -withfreq \\\n", - " -format vcf4 ${_input} > ${_output[0]} " + "output: f'{_input:nn}.snp_indel.filter.basic_QC.vcf.gz'\n", + "bash: container=container_option, expand=\"${ }\", stderr=f'{_output:nn}.err', stdout=f'{_output:nn}.out'\n", + " gatk --java-options '-Xmx${mem}g' VariantFiltration \\\n", + " -R ${ref_genome} \\\n", + " -V ${_input} \\\n", + " ${\" \".join(['-filter \"%s\" --filter-name \"%s\"' % tuple([y.strip() for y in x.split(',')]) for x in variant_filter])} \\\n", + " -O ${_output}\n", + " " ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "kernel": "SoS" }, "outputs": [], "source": [ - "# Annotate \n", - "[annovar_2]\n", - "# humandb path for ANNOVAR\n", - "parameter: humandb = path\n", - "#add xreffile to option without -exonicsplicing\n", - "#mart_export_2019_LOFtools3.txt #xreffile latest option -> Phenotype description,HGNC symbol,MIM morbid description,CGD_CONDITION,CGD_inh,CGD_man,CGD_comm,LOF_tools\n", - "parameter: x_ref = path(f\"{humandb}/mart_export_2019_LOFtools3.txt\")\n", - "# Annovar protocol\n", - "parameter: protocol = ['refGene', 'refGeneWithVer', 'knownGene', 'ensGene', 'wgEncodeBroadHmmGm12878HMM', 'wgEncodeBroadHmmHmecHMM', 'wgEncodeBroadHmmHepg2HMM', 'wgEncodeBroadHmmH1hescHMM', 'wgEncodeRegDnaseClusteredV3', 'wgEncodeRegTfbsClusteredV3', 
'genomicSuperDups', 'wgRna', 'targetScanS', 'phastConsElements46way', 'tfbsConsSites', 'gwasCatalog', 'gnomad211_genome', 'gnomad211_exome', 'popfreq_max_20150413', 'gme', 'kaviar_20150923', 'abraom', 'avsnp150', 'dbnsfp35a', 'dbscsnv11', 'regsnpintron', 'cadd13gt20', 'clinvar_20200316', 'mcap13', 'gene4denovo201907']\n", - "# Annovar operation\n", - "parameter: operation = ['g', 'g', 'g', 'gx', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'r', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f']\n", - "# Annovar args\n", - "parameter: arg = ['\"-splicing 12 -exonicsplicing\"', '\"-splicing 30\"', '\"-splicing 12 -exonicsplicing\"', '\"-splicing 12\"', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']\n", - "input: f'{vcf_prefix:a}.avinput'\n", - "output: f'{vcf_prefix:a}.{build}_multianno.csv'\n", - "\n", - "bash: container=container_option, volumes=[f'{humandb:a}:{humandb:a}', f'{x_ref:ad}:{x_ref:ad}'], expand=\"${ }\", stderr=f'{vcf_prefix:a}.err', stdout=f'{vcf_prefix:a}.out'\n", - " #do not add -intronhgvs as option -> writes cDNA variants as HGVS but creates issues (+2 splice site reported only)\n", - " #-nastring . can only be . for VCF files\n", - " #regsnpintron might cause shifted lines (be carefull using)\n", - " table_annovar.pl \\\n", - " ${_input} \\\n", - " ${humandb} \\\n", - " -buildver ${build} \\\n", - " -out ${_output:nn}\\\n", - " -remove \\\n", - " -polish \\\n", - " -nastring . 
\\\n", - " -protocol ${\",\".join(protocol)} \\\n", - " -operation ${\",\".join(operation)} \\\n", - " -arg ${\",\".join(arg)} \\\n", - " -csvout \\\n", - " -xreffile ${x_ref}" + "# Remove non-PASS variants\n", + "[basic_filter_2]\n", + "output: basic_out=f'{_input:nn}.PASS.vcf.gz'\n", + "\n", + "bash: container=container_option, expand=\"${ }\", stderr=f'{_output:nn}.err', stdout=f'{_output:nn}.out'\n", + " gatk --java-options '-Xmx${mem}g' SelectVariants \\\n", + " -V ${_input} -O ${_output} \\\n", + " --exclude-filtered" ] }, { @@ -640,64 +632,120 @@ "kernel": "SoS" }, "source": [ - "The step below provides some annotation filtered results. If you want to run your own annotation you can do it by running `ANNOVAR` from the singularity image directly, for example:\n", - "\n", - "```\n", - "singularity exec /mnt/mfs/statgen/containers/gatk4-annovar.sif annotate_variation.pl \\\n", - " -filter -dbtype gnomad211_exome \\\n", - " -build hg19 \\\n", - " -score_threshold 0.005 \\\n", - " minimal_example.snp_indel.filter.PASS.hg19_multianno.exonic_splic.txt \\\n", - " humandb \\\n", - " -out minimal_example.snp_indel.filter.PASS.hg19_multianno.exonic_splic\n", - "```" + "## Extra VCF QC filters" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "kernel": "SoS" }, "outputs": [], "source": [ - "# Filter out common variants (from 3 databases) with annovar \n", - "[annovar_3]\n", - "# humandb path for ANNOVAR\n", - "parameter: humandb = path(\"humandb/\")\n", - "# keep pathogenic: use 'pathogenic|Pathogenic',\n", - "# keep splice_exonic: use 'splic|exonic'\n", - "parameter: keep=\"splic|exonic\"\n", - "tag = '_'.join(sorted(set(keep.lower().split('|'))))\n", - "input: f'{vcf_prefix:a}.vcf.gz'\n", - "output: f'{_input[0]:n}.{tag}.txt', \n", - " f'{_input[0]:n}.{tag}.exome_genome.{build}_popfreq_max_20150413_filtered'\n", - "\n", - "bash: container=container_option, volumes=[f'{humandb:a}:{humandb:a}'], expand=\"${ }\", 
stderr=f'{_output[0]:n}.err', stdout=f'{_output[0]:n}.out'\n", - " set -e\n", - " awk 'FNR == 1 {print} /${keep}/{print}' ${_input[0]} > ${_output[0]}\n", - " \n", - " annotate_variation.pl -filter -dbtype gnomad211_exome \\\n", - " -build ${build} \\\n", - " -score_threshold 0.005 \\\n", - " ${_output[0]} \\\n", - " ${humandb} \\\n", - " -out ${_output[0]:n}\n", - " \n", - " annotate_variation.pl -filter -dbtype gnomad211_genome \\\n", - " -build ${build} \\\n", - " -score_threshold 0.005 \\\n", - " ${_output[0]:n}.${build}_gnomad211_exome_filtered \\\n", - " ${humandb} \\\n", - " -out ${_output[0]:n}.exome\n", - "\n", - " annotate_variation.pl -filter -dbtype popfreq_max_20150413 \\\n", - " -build ${build} \\\n", - " -score_threshold 0.005 \\\n", - " ${_output[0]:n}.exome.${build}_gnomad211_genome_filtered \\\n", - " ${humandb} \\\n", - " -out ${_output[0]:n}.exome_genome\n", - " rm ${_output[0]:nn}*_dropped" + "# QC VCF for relatedness\n", + "[vcf_qc_1 (check relatedness)]\n", + "input: f\"{vcf_prefix:a}.snp_indel.filter.{vcf_filter}_QC.PASS.vcf.gz\" \n", + "output: f'{cwd}/vcf_qc/{_input:bnn}.relatedness', f'{cwd}/vcf_qc/{_input:bnn}.relatedness2'\n", + "bash: expand=\"${ }\", stderr=f'{cwd}/vcf_qc/{_output[0]:b}.err', stdout=f'{cwd}/vcf_qc/{_output[0]:b}.log'\n", + "\n", + " vcftools --relatedness --gzvcf ${_input} --out ${_output[0]:n}\n", + " vcftools --relatedness2 --gzvcf ${_input} --out ${_output[1]:n}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# QC VCF for sex check\n", + "[vcf_qc_2 (check sex)]\n", + "input: f\"{vcf_prefix:a}.snp_indel.filter.{vcf_filter}_QC.PASS.vcf.gz\"\n", + "output: f\"{cwd}/vcf_qc/{_input:bnn}.bed\", f\"{cwd}/vcf_qc/{_input:bnn}.sex.sexcheck\", f\"{cwd}/vcf_qc/{_input:bnn}.sex2.sexcheck\"\n", + "bash: expand=\"${ }\", stderr=f\"{_output[1]:n}.err\",stdout=f\"{_output[1]:n}.log\"\n", + " plink --vcf ${_input} --double-id --make-bed --out 
${_output[0]:n} --allow-extra-chr\n", + " plink --bfile ${_output[0]:n} --check-sex --out ${_output[1]:n} --allow-extra-chr\n", + " plink --bfile ${_output[0]:n} --check-sex 0.35 0.65 --out ${_output[2]:n} --allow-extra-chr\n", + " rm ${_output[1]:n}.nosex && rm ${_output[2]:n}.nosex && rm ${_output[1]:n}.log && rm ${_output[2]:n}.log && rm ${_output[0]:n}.nosex" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# QC VCF for IBD\n", + "[vcf_qc_3 (IBD)]\n", + "input: f\"{cwd}/vcf_qc/{vcf_prefix:b}.snp_indel.filter.{vcf_filter}_QC.PASS.bed\" \n", + "output: f'{_input:n}.IBD.genome',\n", + " f'{_input:n}.HET.het',\n", + " f'{_input:n}.IBC.ibc',\n", + " f'{_input:n}.SEX.2.C.sexcheck',\n", + " vcf=f'{_input:n}.C.VCF.vcf'\n", + "bash: expand=\"${ }\", stderr=f'{_output[0]}.err', stdout=f'{_output[0]}.log'\n", + " #add plink IBD\n", + " #missing rate per SNP MAF and HWE cut-off\n", + " plink --bfile ${_input:n} --geno 0.1 --hwe 0.00001 --maf 0.05 --make-bed --out ${_input:n}.C --allow-extra-chr\n", + " #LD pruning with window size 100 step size 10 and r^2 threshold 0.5 (MAF <0.05)\n", + " plink --bfile ${_input:n}.C --indep-pairwise 50 5 0.5 --make-bed --out ${_input:n}.CP --allow-extra-chr\n", + " #IBD sharing\n", + " plink --bfile ${_input:n}.CP --genome --make-bed --out ${_output[0]:n} --allow-extra-chr\n", + " #het (Inbreeding and absence of heterozygosity)\n", + " plink --bfile ${_input:n}.CP --het --make-bed --out ${_output[1]:n} --allow-extra-chr\n", + " #IBCs (Inbreeding coeff)\n", + " plink --bfile ${_input:n}.CP --ibc --make-bed --out ${_output[2]:n} --allow-extra-chr\n", + " ###cleaned sex\n", + " plink --bfile ${_input:n}.C --check-sex 0.35 0.65 --out ${_output[3]:n} --allow-extra-chr\n", + " ####cleaned relatedness\n", + " plink --bfile ${_input:n}.C --recode vcf --out ${_output[4]:n} --allow-extra-chr\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# QC VCF for relatedness\n", + "[vcf_qc_4 (vcftools)]\n", + "input: named_output(\"vcf\")\n", + "output: f'{_input:n}.C.relatedness', f'{_input:n}.C.2.relatedness2'\n", + "bash: expand=\"${ }\", stderr=f'{_output[0]}.err', stdout=f'{_output[0]}.log'\n", + "\n", + " bgzip ${_input} && tabix -p vcf ${_input}.gz\n", + " vcftools --relatedness --gzvcf ${_input}.gz --out ${_output[0]:n}\n", + " vcftools --relatedness2 --gzvcf ${_input}.gz --out ${_output[1]:n}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# QC VCF for homozygosity mapping \n", + "[vcf_qc_5 (homozygosity mapping)]\n", + "parameter: vcf_filter = 'strict'\n", + "input: f\"{cwd}/vcf_qc/{vcf_prefix:b}.snp_indel.filter.{vcf_filter}_QC.PASS.bed\"\n", + "output: f'{_input:n}.HOM.hom'\n", + "bash: expand=\"${ }\", stderr=f'{_output:nn}.err', stdout=f'{_output:nn}.log'\n", + " ##hom_mapping per sample (at least 100 SNPs, and of total length ≥ 1000 (1Mb) - 0.01 MAF\n", + " plink --bfile ${_input:n} --geno 0.1 --hwe 0.00001 --maf 0.01 --make-bed --out ${_input:n}.CH --allow-extra-chr\n", + " plink --bfile ${_input:n}.CH --homozyg --make-bed --out ${_output:n} --allow-extra-chr\n", + " #remove all the unwanted files at the end\n", + " ##FIXME\n", + " mkdir ${cwd}/vcf_qc/cache\n", + " mv ${cwd}/vcf_qc/*.{bed,bim,fam,log,nosex,in,out,gz,tbi} ${cwd}/vcf_qc/cache\n" ] }, { @@ -718,16 +766,23 @@ " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\n", "\n", - "sos run gatk_joint_calling.ipynb filter \\\n", - " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", - " --vcf-prefix output/minimal_example\n", + "sos run gatk_joint_calling.ipynb strict_filter \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd 
output/\\\n", + " --variant_filter 'strict'\n", + " \n", + "sos run gatk_joint_calling.ipynb basic_filter \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output/\\\n", + " --variant_filter 'basic'\n", "\n", - "sos run gatk_joint_calling.ipynb annovar \\\n", - " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", - " --vcf-prefix output/minimal_example.snp_indel.filter.PASS \\\n", - " --keep \"splic|exonic\" \\\n", - " --humandb /mnt/mfs/statgen/isabelle/REF/humandb \\\n", - " --x-ref /mnt/mfs/statgen/isabelle/REF/humandb/mart_export_2019_LOFtools3.txt\n", + "module load Singularity\n", + "module load VCFTOOLS/0.1.17\n", + "module load PLINK/1.9.10\n", + "sos run gatk_joint_calling.ipynb vcf_qc \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output/\\\n", + " --variant_filter 'basic'\n", " \n", " \n", "```\n", @@ -753,12 +808,10 @@ ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "raw", "metadata": { "kernel": "SoS" }, - "outputs": [], "source": [ "# Job submission on CSG cluster\n", "[submit_csg]\n", @@ -830,7 +883,7 @@ "height": 0, "style": "side" }, - "version": "0.22.4" + "version": "0.22.6" } }, "nbformat": 4, From 315ae21b4b405adbbc96180f33703bc70ac23e6e Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Tue, 27 Dec 2022 10:56:02 -0500 Subject: [PATCH 46/63] fixed variant calling --- variant-calling/gatk_joint_calling.ipynb | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 2ca9e38..ef2fb26 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -764,7 +764,9 @@ " --vcf-prefix output/minimal_example \\\n", " --samples /mnt/mfs/statgen/data_private/gatk_joint_call_example/20200820_sample_manifest.txt \\\n", " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", - " --ref-genome 
/mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\n", + " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\\\n", + " --cwd output\\ \\\n", + " --variant_filter 'strict'\n", "\n", "sos run gatk_joint_calling.ipynb strict_filter \\\n", " --vcf-prefix output/minimal_example \\\n", @@ -793,9 +795,12 @@ "\n", "```\n", "sos run gatk_joint_calling.ipynb submit_csg \\\n", - " --cmd_file command_1027.txt \n", + " --cmd_file command_1027.txt \\\n", + " --cwd output\n", " \n", - "sos run ~/gatk_joint_calling_test.ipynb submit_csg --cmd_file ~/gatk_joint_calling/command_1027.txt \n", + "sos run ~/gatk_joint_calling_test.ipynb submit_csg \\\n", + " --cmd_file ~/gatk_joint_calling/command_1027.txt \\\n", + " --cwd output\n", "```\n", "\n", "\n", @@ -803,15 +808,18 @@ "```\n", "sos run gatk_joint_calling.ipynb submit_csg \\\n", " --cmd_file analysis_commands_20200825.txt \\\n", + " --cwd output \\\n", " --dryrun True\n", "```" ] }, { - "cell_type": "raw", + "cell_type": "code", + "execution_count": null, "metadata": { "kernel": "SoS" }, + "outputs": [], "source": [ "# Job submission on CSG cluster\n", "[submit_csg]\n", From 6c1ebcd749f32141b38b39f3ad8eb282c5346895 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Tue, 27 Dec 2022 11:09:01 -0500 Subject: [PATCH 47/63] fixed variant calling --- variant-calling/gatk_joint_calling.ipynb | 55 +++--------------------- 1 file changed, 5 insertions(+), 50 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index ef2fb26..74c5247 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -6,63 +6,18 @@ "kernel": "SoS" }, "source": [ - "# WGS GVCF samples joint calling, filtering and annotation\n", + "# WGS GVCF samples joint calling, filtering and quality control \n", "\n", - "Implementing a GATK + ANNOVAR workflow in [SoS](https://github.com/vatlab/SOS), 
written by Isabelle Schrauwen with software containers built by Gao Wang. " + "Implementing a GATK + VCF_QC workflow in [SoS](https://github.com/vatlab/SOS), written by Isabelle Schrauwen with software containers built by Gao Wang. " ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "kernel": "SoS" }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
RevisionAuthorDateMessage
4762450haoyueshuai2020-08-25update submit_csg
b7958f9Gao Wang2020-08-25Fix a bash variable bug
2986c3cGao Wang2020-08-25Update documentation
c1da803Gao Wang2020-08-25Add job submission template for CSG cluster
69e450aGao Wang2020-08-24Add documentation
3b5a1e8Gao Wang2020-08-24Fix ANNOVAR step
43b3150Gao Wang2020-08-23Update joint variant calling pipeline with minimal working example
2cacdc2Gao Wang2020-08-20Remove the need to mount workdir due to recent changes in SoS
afe343cGao Wang2020-08-20Add variant calling pipeline
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "%revisions -s -n 10" ] @@ -134,7 +89,7 @@ "*.dict\n", "```\n", "\n", - "- `ANNOVAR` reference files ship with `ANNOVAR` software, under a folder called `humandb`.\n", + "- `VCF_QC` provides quality control measurementes on the VCF such as sex checks, heterozygosity, and relatedness. \n", "\n", "This workflow assumes that the required files already exit. This pipeline does not provide steps to download or to generate them automatically, which you could find in the tutorial slides. The pipeline will indeed check the availability of the reference files and quit on error if they are missing." ] From d4f00c4d93bfb6ec1814c27c08e5cfff437fdbb8 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Tue, 27 Dec 2022 11:15:20 -0500 Subject: [PATCH 48/63] fixed variant calling --- variant-calling/gatk_joint_calling.ipynb | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 74c5247..4c267df 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -117,20 +117,32 @@ " ...\n", "```\n", "\n", - "to run variant filtering, \n", + "to run variant filtering both strict or basic, \n", "\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb filter \\\n", + "sos run gatk_joint_calling.ipynb filter_strict \\\n", " --vcf-prefix /path/to/some_vcf_file_prefix \\\n", + " --cwd output \\\n", + " --variant_filter strict\n", " ...\n", "```\n", "\n", - "to run annotation,\n", + "```\n", + "sos run gatk_joint_calling.ipynb filter_basic \\\n", + " --vcf-prefix /path/to/some_vcf_file_prefix \\\n", + " --cwd output \\\n", + " --variant_filter basic\n", + " ...\n", + "```\n", + "\n", + "to run vcf_qc,\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb annovar \\\n", + "sos run gatk_joint_calling.ipynb vcf_qc \\\n", " --vcf-prefix 
/path/to/some_vcf_file_prefix \\\n", + " --cwd output \\\n", + " --variant_filter basic\n", " ...\n", "```\n", "\n", From 17bbb67960d3f70f5701087a793b00998974d017 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Wed, 28 Dec 2022 10:48:23 -0500 Subject: [PATCH 49/63] add plink and vcftools to the submit script --- variant-calling/gatk_joint_calling.ipynb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 4c267df..a7d2626 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -732,25 +732,22 @@ " --samples /mnt/mfs/statgen/data_private/gatk_joint_call_example/20200820_sample_manifest.txt \\\n", " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\\\n", - " --cwd output\\ \\\n", + " --cwd output/ \\ \n", " --variant_filter 'strict'\n", "\n", "sos run gatk_joint_calling.ipynb strict_filter \\\n", " --vcf-prefix output/minimal_example \\\n", - " --cwd output/\\\n", + " --cwd output/ \\\n", " --variant_filter 'strict'\n", " \n", "sos run gatk_joint_calling.ipynb basic_filter \\\n", " --vcf-prefix output/minimal_example \\\n", - " --cwd output/\\\n", + " --cwd output/ \\\n", " --variant_filter 'basic'\n", "\n", - "module load Singularity\n", - "module load VCFTOOLS/0.1.17\n", - "module load PLINK/1.9.10\n", "sos run gatk_joint_calling.ipynb vcf_qc \\\n", " --vcf-prefix output/minimal_example \\\n", - " --cwd output/\\\n", + " --cwd output/ \\\n", " --variant_filter 'basic'\n", " \n", " \n", @@ -806,6 +803,8 @@ " #$ -j y\n", " #$ -S /bin/bash\n", " module load Singularity\n", + " module load VCFTOOLS/0.1.17\n", + " module load PLINK/1.9.10 \n", " export PATH=$HOME/miniconda3/bin:$PATH\n", " set -e\n", " '''\n", From 65d96c5ce69075dbee6499d1bd774a1a5d0e1db1 Mon Sep 17 00:00:00 2001 From: 
dmc2245 Date: Fri, 30 Dec 2022 10:31:55 -0500 Subject: [PATCH 50/63] fixed vcf-qc_3 --- variant-calling/gatk_joint_calling.ipynb | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index a7d2626..0f50861 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -631,12 +631,11 @@ "# QC VCF for sex check\n", "[vcf_qc_2 (check sex)]\n", "input: f\"{vcf_prefix:a}.snp_indel.filter.{vcf_filter}_QC.PASS.vcf.gz\"\n", - "output: f\"{cwd}/vcf_qc/{_input:bnn}.bed\", f\"{cwd}/vcf_qc/{_input:bnn}.sex.sexcheck\", f\"{cwd}/vcf_qc/{_input:bnn}.sex2.sexcheck\"\n", - "bash: expand=\"${ }\", stderr=f\"{_output[1]:n}.err\",stdout=f\"{_output[1]:n}.log\"\n", - " plink --vcf ${_input} --double-id --make-bed --out ${_output[0]:n} --allow-extra-chr\n", - " plink --bfile ${_output[0]:n} --check-sex --out ${_output[1]:n} --allow-extra-chr\n", - " plink --bfile ${_output[0]:n} --check-sex 0.35 0.65 --out ${_output[2]:n} --allow-extra-chr\n", - " rm ${_output[1]:n}.nosex && rm ${_output[2]:n}.nosex && rm ${_output[1]:n}.log && rm ${_output[2]:n}.log && rm ${_output[0]:n}.nosex" + "output: bed=f'{cwd}/vcf_qc/{_input:bnn}.bed'\n", + "bash: expand=\"${ }\", stderr=f\"{_output:n}.sex.err\",stdout=f\"{_output:n}.sex.log\"\n", + " plink --vcf ${_input} --double-id --make-bed --out ${_output:n} --allow-extra-chr\n", + " plink --bfile ${_output:n} --check-sex --out ${_output:n}.sex --allow-extra-chr\n", + " plink --bfile ${_output:n} --check-sex 0.35 0.65 --out ${_output:n}.sex2 --allow-extra-chr" ] }, { @@ -649,7 +648,7 @@ "source": [ "# QC VCF for IBD\n", "[vcf_qc_3 (IBD)]\n", - "input: f\"{cwd}/vcf_qc/{vcf_prefix:b}.snp_indel.filter.{vcf_filter}_QC.PASS.bed\" \n", + "input: output_from('vcf_qc_2')['bed']\n", "output: f'{_input:n}.IBD.genome',\n", " f'{_input:n}.HET.het',\n", " f'{_input:n}.IBC.ibc',\n", From 
3d3392e97b7cfd957a1e32b0d245d093ad59e7b2 Mon Sep 17 00:00:00 2001 From: dmc2245 Date: Thu, 19 Jan 2023 15:08:21 -0500 Subject: [PATCH 51/63] added csg2 to the notebook --- variant-calling/gatk_joint_calling.ipynb | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 0f50861..990be9f 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -773,6 +773,13 @@ " --cmd_file analysis_commands_20200825.txt \\\n", " --cwd output \\\n", " --dryrun True\n", + "```\n", + "\n", + "```\n", + "sos run gatk_joint_calling.ipynb submit_csg2 \\\n", + " --cmd_file analysis_commands_20200825.txt \\\n", + " --cwd output \\\n", + " --dryrun True\n", "```" ] }, @@ -817,6 +824,49 @@ " if output:\n", " print(output)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# Job submission on CSG cluster\n", + "[submit_csg2]\n", + "# Path to job file\n", + "parameter: cmd_file=path\n", + "# Total run time allocated to the script\n", + "parameter: time='36:00:00'\n", + "parameter: dryrun = False\n", + "input: cmd_file\n", + "python3: expand = '$[ ]'\n", + " tpl = '''\n", + " #!/bin/sh\n", + " #$ -l h_rt=$[time]\n", + " #$ -l h_vmem=$[mem+6]G\n", + " #$ -N gatk_joint_call\n", + " #$ -cwd\n", + " #$ -j y\n", + " #$ -q csg2.q -l t_pri\n", + " #$ -S /bin/bash\n", + " module load Singularity\n", + " module load VCFTOOLS/0.1.17\n", + " module load PLINK/1.9.10 \n", + " export PATH=$HOME/miniconda3/bin:$PATH\n", + " set -e\n", + " '''\n", + " script = tpl.lstrip() + ''.join(open($[_input:r]).readlines())\n", + " exe = 'cat' if $[dryrun] else 'qsub'\n", + " from subprocess import Popen, PIPE\n", + " import sys\n", + " p = Popen(exe, shell = False, stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True)\n", + " for item in 
p.communicate(script.encode(sys.getdefaultencoding())):\n", + " output = item.decode(sys.getdefaultencoding()).rstrip()\n", + " if output:\n", + " print(output)" + ] } ], "metadata": { From 023e6cff7f5a6b15076db09696b599191c527d1c Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 15:31:21 -0500 Subject: [PATCH 52/63] split submit CSG from the variant calling pipeline --- admin/submit_csg.ipynb | 210 +++++++++++++++++++++++ variant-calling/gatk_joint_calling.ipynb | 159 +---------------- 2 files changed, 211 insertions(+), 158 deletions(-) create mode 100644 admin/submit_csg.ipynb diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb new file mode 100644 index 0000000..fbf4940 --- /dev/null +++ b/admin/submit_csg.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "# A utility workflow to submit jobs to CSG nodes\n", + "\n", + "This notebook provides a short-cut to submit bash scripts to CSG computing nodes." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "\n", + "Suppose we would like to submit these lines of commands to the cluster:\n", + "\n", + "```\n", + "sos run gatk_joint_calling.ipynb call \\\n", + " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --samples /mnt/mfs/statgen/data_private/gatk_joint_call_example/20200820_sample_manifest.txt \\\n", + " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", + " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\\\n", + " --cwd output/ \\ \n", + " --variant_filter 'strict'\n", + "\n", + "sos run gatk_joint_calling.ipynb strict_filter \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output/ \\\n", + " --variant_filter 'strict'\n", + " \n", + "sos run gatk_joint_calling.ipynb basic_filter \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output/ \\\n", + " --variant_filter 'basic'\n", + "\n", + "sos run gatk_joint_calling.ipynb vcf_qc \\\n", + " --vcf-prefix output/minimal_example \\\n", + " --cwd output/ \\\n", + " --variant_filter 'basic'\n", + " \n", + " \n", + "```\n", + "\n", + "First, we save the above lines to a text file, e.g. 
call it `analysis_commands_20200825.txt`, then use the following workflow steps to allocate resources and submit the jobs.\n", + "\n", + "Example to submit a job:\n", + "\n", + "```\n", + "sos run gatk_joint_calling.ipynb submit_csg \\\n", + " --cmd_file command_1027.txt \\\n", + " --cwd output\n", + " \n", + "sos run ~/gatk_joint_calling_test.ipynb submit_csg \\\n", + " --cmd_file ~/gatk_joint_calling/command_1027.txt \\\n", + " --cwd output\n", + "```\n", + "\n", + "\n", + "If you want to run in a dryrun mode, meaning just simply test the process but do not genrate results\n", + "```\n", + "sos run gatk_joint_calling.ipynb submit_csg \\\n", + " --cmd_file analysis_commands_20200825.txt \\\n", + " --cwd output \\\n", + " --dryrun True\n", + "```\n", + "\n", + "```\n", + "sos run gatk_joint_calling.ipynb submit_csg2 \\\n", + " --cmd_file analysis_commands_20200825.txt \\\n", + " --cwd output \\\n", + " --dryrun True\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# Job submission on CSG cluster\n", + "[submit_csg]\n", + "# Path to job file\n", + "parameter: cmd_file=path\n", + "# Total run time allocated to the script\n", + "parameter: time='36:00:00'\n", + "parameter: dryrun = False\n", + "input: cmd_file\n", + "python3: expand = '$[ ]'\n", + " tpl = '''\n", + " #!/bin/sh\n", + " #$ -l h_rt=$[time]\n", + " #$ -l h_vmem=$[mem+6]G\n", + " #$ -N gatk_joint_call\n", + " #$ -cwd\n", + " #$ -j y\n", + " #$ -S /bin/bash\n", + " module load Singularity\n", + " module load VCFTOOLS/0.1.17\n", + " module load PLINK/1.9.10 \n", + " export PATH=$HOME/miniconda3/bin:$PATH\n", + " set -e\n", + " '''\n", + " script = tpl.lstrip() + ''.join(open($[_input:r]).readlines())\n", + " exe = 'cat' if $[dryrun] else 'qsub'\n", + " from subprocess import Popen, PIPE\n", + " import sys\n", + " p = Popen(exe, shell = False, stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True)\n", + " 
for item in p.communicate(script.encode(sys.getdefaultencoding())):\n", + " output = item.decode(sys.getdefaultencoding()).rstrip()\n", + " if output:\n", + " print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# Job submission on CSG cluster\n", + "[submit_csg2]\n", + "# Path to job file\n", + "parameter: cmd_file=path\n", + "# Total run time allocated to the script\n", + "parameter: time='36:00:00'\n", + "parameter: dryrun = False\n", + "input: cmd_file\n", + "python3: expand = '$[ ]'\n", + " tpl = '''\n", + " #!/bin/sh\n", + " #$ -l h_rt=$[time]\n", + " #$ -l h_vmem=$[mem+6]G\n", + " #$ -N gatk_joint_call\n", + " #$ -cwd\n", + " #$ -j y\n", + " #$ -q csg2.q -l t_pri\n", + " #$ -S /bin/bash\n", + " module load Singularity\n", + " module load VCFTOOLS/0.1.17\n", + " module load PLINK/1.9.10 \n", + " export PATH=$HOME/miniconda3/bin:$PATH\n", + " set -e\n", + " '''\n", + " script = tpl.lstrip() + ''.join(open($[_input:r]).readlines())\n", + " exe = 'cat' if $[dryrun] else 'qsub'\n", + " from subprocess import Popen, PIPE\n", + " import sys\n", + " p = Popen(exe, shell = False, stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True)\n", + " for item in p.communicate(script.encode(sys.getdefaultencoding())):\n", + " output = item.decode(sys.getdefaultencoding()).rstrip()\n", + " if output:\n", + " print(output)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + "sos": { + "default_kernel": "SoS", + "kernels": [ + [ + "Bash", + "bash", + "Bash", + "#E6EEFF", + "" + ], + [ + "SoS", + "sos", + "", + "", + "sos" + ] + ], + "panel": { + "displayed": true, + "height": 0, + "style": "side" + }, 
+ "version": "0.22.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/variant-calling/gatk_joint_calling.ipynb b/variant-calling/gatk_joint_calling.ipynb index 990be9f..f8a9081 100644 --- a/variant-calling/gatk_joint_calling.ipynb +++ b/variant-calling/gatk_joint_calling.ipynb @@ -36,9 +36,6 @@ "- `gatk_filter_strict`\n", "- `gatk_filter_basic`\n", "- `vcf_qc`\n", - "- `submit_csg`\n", - "\n", - "The first four workflows are for the analysis and the last one is for submitting jobs on the cluster.\n", "\n", "All workflow steps are numerically ordered to reflect the execution logic. This is the most straightforward SoS workflow style, the \"process-oriented\" style. " ] @@ -713,160 +710,6 @@ " mkdir ${cwd}/vcf_qc/cache\n", " mv ${cwd}/vcf_qc/*.{bed,bim,fam,log,nosex,in,out,gz,tbi} ${cwd}/vcf_qc/cache\n" ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Submit jobs to the cluster\n", - "\n", - "Suppose we would like to submit these lines of commands to the cluster:\n", - "\n", - "```\n", - "sos run gatk_joint_calling.ipynb call \\\n", - " --container-option /mnt/mfs/statgen/containers/gatk4-annovar.sif \\\n", - " --vcf-prefix output/minimal_example \\\n", - " --samples /mnt/mfs/statgen/data_private/gatk_joint_call_example/20200820_sample_manifest.txt \\\n", - " --samples-dir /mnt/mfs/statgen/data_private/gatk_joint_call_example/ \\\n", - " --ref-genome /mnt/mfs/statgen/isabelle/REF/refs/Homo_sapiens.GRCh37.75.dna_sm.primary_assembly.fa\\\n", - " --cwd output/ \\ \n", - " --variant_filter 'strict'\n", - "\n", - "sos run gatk_joint_calling.ipynb strict_filter \\\n", - " --vcf-prefix output/minimal_example \\\n", - " --cwd output/ \\\n", - " --variant_filter 'strict'\n", - " \n", - "sos run gatk_joint_calling.ipynb basic_filter \\\n", - " --vcf-prefix output/minimal_example \\\n", - " --cwd output/ \\\n", - " --variant_filter 'basic'\n", - "\n", - "sos run gatk_joint_calling.ipynb vcf_qc \\\n", - " 
--vcf-prefix output/minimal_example \\\n", - " --cwd output/ \\\n", - " --variant_filter 'basic'\n", - " \n", - " \n", - "```\n", - "\n", - "First, we save the above lines to a text file, e.g. call it `analysis_commands_20200825.txt`, then use the following workflow steps to allocate resources and submit the jobs.\n", - "\n", - "Example to submit a job:\n", - "\n", - "```\n", - "sos run gatk_joint_calling.ipynb submit_csg \\\n", - " --cmd_file command_1027.txt \\\n", - " --cwd output\n", - " \n", - "sos run ~/gatk_joint_calling_test.ipynb submit_csg \\\n", - " --cmd_file ~/gatk_joint_calling/command_1027.txt \\\n", - " --cwd output\n", - "```\n", - "\n", - "\n", - "If you want to run in a dryrun mode, meaning just simply test the process but do not genrate results\n", - "```\n", - "sos run gatk_joint_calling.ipynb submit_csg \\\n", - " --cmd_file analysis_commands_20200825.txt \\\n", - " --cwd output \\\n", - " --dryrun True\n", - "```\n", - "\n", - "```\n", - "sos run gatk_joint_calling.ipynb submit_csg2 \\\n", - " --cmd_file analysis_commands_20200825.txt \\\n", - " --cwd output \\\n", - " --dryrun True\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# Job submission on CSG cluster\n", - "[submit_csg]\n", - "# Path to job file\n", - "parameter: cmd_file=path\n", - "# Total run time allocated to the script\n", - "parameter: time='36:00:00'\n", - "parameter: dryrun = False\n", - "input: cmd_file\n", - "python3: expand = '$[ ]'\n", - " tpl = '''\n", - " #!/bin/sh\n", - " #$ -l h_rt=$[time]\n", - " #$ -l h_vmem=$[mem+6]G\n", - " #$ -N gatk_joint_call\n", - " #$ -cwd\n", - " #$ -j y\n", - " #$ -S /bin/bash\n", - " module load Singularity\n", - " module load VCFTOOLS/0.1.17\n", - " module load PLINK/1.9.10 \n", - " export PATH=$HOME/miniconda3/bin:$PATH\n", - " set -e\n", - " '''\n", - " script = tpl.lstrip() + ''.join(open($[_input:r]).readlines())\n", - " exe = 'cat' 
if $[dryrun] else 'qsub'\n", - " from subprocess import Popen, PIPE\n", - " import sys\n", - " p = Popen(exe, shell = False, stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True)\n", - " for item in p.communicate(script.encode(sys.getdefaultencoding())):\n", - " output = item.decode(sys.getdefaultencoding()).rstrip()\n", - " if output:\n", - " print(output)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# Job submission on CSG cluster\n", - "[submit_csg2]\n", - "# Path to job file\n", - "parameter: cmd_file=path\n", - "# Total run time allocated to the script\n", - "parameter: time='36:00:00'\n", - "parameter: dryrun = False\n", - "input: cmd_file\n", - "python3: expand = '$[ ]'\n", - " tpl = '''\n", - " #!/bin/sh\n", - " #$ -l h_rt=$[time]\n", - " #$ -l h_vmem=$[mem+6]G\n", - " #$ -N gatk_joint_call\n", - " #$ -cwd\n", - " #$ -j y\n", - " #$ -q csg2.q -l t_pri\n", - " #$ -S /bin/bash\n", - " module load Singularity\n", - " module load VCFTOOLS/0.1.17\n", - " module load PLINK/1.9.10 \n", - " export PATH=$HOME/miniconda3/bin:$PATH\n", - " set -e\n", - " '''\n", - " script = tpl.lstrip() + ''.join(open($[_input:r]).readlines())\n", - " exe = 'cat' if $[dryrun] else 'qsub'\n", - " from subprocess import Popen, PIPE\n", - " import sys\n", - " p = Popen(exe, shell = False, stdin = PIPE, stdout = PIPE, stderr = PIPE, close_fds = True)\n", - " for item in p.communicate(script.encode(sys.getdefaultencoding())):\n", - " output = item.decode(sys.getdefaultencoding()).rstrip()\n", - " if output:\n", - " print(output)" - ] } ], "metadata": { @@ -906,7 +749,7 @@ "height": 0, "style": "side" }, - "version": "0.22.6" + "version": "0.22.9" } }, "nbformat": 4, From 8a6a1728e8c150410e12aff2bcc6232e2a2cde44 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 15:33:47 -0500 Subject: [PATCH 53/63] split submit CSG from the variant calling pipeline --- admin/submit_csg.ipynb | 8 
++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb index fbf4940..cdb8fcd 100644 --- a/admin/submit_csg.ipynb +++ b/admin/submit_csg.ipynb @@ -53,11 +53,11 @@ "Example to submit a job:\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb submit_csg \\\n", + "sos run submit_csg.ipynb submit_csg \\\n", " --cmd_file command_1027.txt \\\n", " --cwd output\n", " \n", - "sos run ~/gatk_joint_calling_test.ipynb submit_csg \\\n", + "sos run submit_csg.ipynb submit_csg \\\n", " --cmd_file ~/gatk_joint_calling/command_1027.txt \\\n", " --cwd output\n", "```\n", @@ -65,14 +65,14 @@ "\n", "If you want to run in a dryrun mode, meaning just simply test the process but do not genrate results\n", "```\n", - "sos run gatk_joint_calling.ipynb submit_csg \\\n", + "sos run submit_csg.ipynb submit_csg \\\n", " --cmd_file analysis_commands_20200825.txt \\\n", " --cwd output \\\n", " --dryrun True\n", "```\n", "\n", "```\n", - "sos run gatk_joint_calling.ipynb submit_csg2 \\\n", + "sos run submit_csg.ipynb submit_csg2 \\\n", " --cmd_file analysis_commands_20200825.txt \\\n", " --cwd output \\\n", " --dryrun True\n", From 04a4bb96f4a3062ae8050a89ea99f9d1c5cc1451 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 16:19:30 -0500 Subject: [PATCH 54/63] Fix typo --- admin/submit_csg.ipynb | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb index cdb8fcd..a41edf4 100644 --- a/admin/submit_csg.ipynb +++ b/admin/submit_csg.ipynb @@ -54,28 +54,21 @@ "\n", "```\n", "sos run submit_csg.ipynb submit_csg \\\n", - " --cmd_file command_1027.txt \\\n", - " --cwd output\n", - " \n", + " --cmd_file command_1027.txt \n", "sos run submit_csg.ipynb submit_csg \\\n", - " --cmd_file ~/gatk_joint_calling/command_1027.txt \\\n", - " --cwd output\n", + " --cmd_file ~/gatk_joint_calling/command_1027.txt \n", "```\n", "\n", "\n", "If you want to run in a 
dryrun mode, meaning just simply test the process but do not genrate results\n", "```\n", - "sos run submit_csg.ipynb submit_csg \\\n", - " --cmd_file analysis_commands_20200825.txt \\\n", - " --cwd output \\\n", - " --dryrun True\n", + "sos dryrun submit_csg.ipynb submit_csg \\\n", + " --cmd_file analysis_commands_20200825.txt\n", "```\n", "\n", "```\n", - "sos run submit_csg.ipynb submit_csg2 \\\n", - " --cmd_file analysis_commands_20200825.txt \\\n", - " --cwd output \\\n", - " --dryrun True\n", + "sos dryrun submit_csg.ipynb submit_csg2 \\\n", + " --cmd_file analysis_commands_20200825.txt\n", "```" ] }, From 971d4d700cd1394266abc74ddcf575e0a29004e0 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 16:20:36 -0500 Subject: [PATCH 55/63] Fix typo --- admin/submit_csg.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb index a41edf4..320a202 100644 --- a/admin/submit_csg.ipynb +++ b/admin/submit_csg.ipynb @@ -128,6 +128,7 @@ "parameter: cmd_file=path\n", "# Total run time allocated to the script\n", "parameter: time='36:00:00'\n", + "parameter: mem=12\n", "parameter: dryrun = False\n", "input: cmd_file\n", "python3: expand = '$[ ]'\n", From aefcb23abb81930ce1887f062918cb0829915c6e Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 16:21:31 -0500 Subject: [PATCH 56/63] Fix typo --- admin/submit_csg.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb index 320a202..b22d3f0 100644 --- a/admin/submit_csg.ipynb +++ b/admin/submit_csg.ipynb @@ -62,13 +62,15 @@ "\n", "If you want to run in a dryrun mode, meaning just simply test the process but do not genrate results\n", "```\n", - "sos dryrun submit_csg.ipynb submit_csg \\\n", - " --cmd_file analysis_commands_20200825.txt\n", + "sos run submit_csg.ipynb submit_csg \\\n", + " --cmd_file analysis_commands_20200825.txt \\\n", + " --dryrun\n", "```\n", "\n", "```\n", - 
"sos dryrun submit_csg.ipynb submit_csg2 \\\n", - " --cmd_file analysis_commands_20200825.txt\n", + "sos run submit_csg.ipynb submit_csg2 \\\n", + " --cmd_file analysis_commands_20200825.txt \\\n", + " --dryrun\n", "```" ] }, @@ -86,6 +88,7 @@ "parameter: cmd_file=path\n", "# Total run time allocated to the script\n", "parameter: time='36:00:00'\n", + "parameter: mem=12\n", "parameter: dryrun = False\n", "input: cmd_file\n", "python3: expand = '$[ ]'\n", From 8eac9c2f855c19b587b166b7a4491075d63aaee8 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 19 Jan 2023 16:22:32 -0500 Subject: [PATCH 57/63] Fix typo --- admin/submit_csg.ipynb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/admin/submit_csg.ipynb b/admin/submit_csg.ipynb index b22d3f0..d590dc6 100644 --- a/admin/submit_csg.ipynb +++ b/admin/submit_csg.ipynb @@ -88,6 +88,7 @@ "parameter: cmd_file=path\n", "# Total run time allocated to the script\n", "parameter: time='36:00:00'\n", + "# Memory allocated to a job, in terms of Gigabyte\n", "parameter: mem=12\n", "parameter: dryrun = False\n", "input: cmd_file\n", @@ -131,6 +132,7 @@ "parameter: cmd_file=path\n", "# Total run time allocated to the script\n", "parameter: time='36:00:00'\n", + "# Memory allocated to a job, in terms of Gigabyte\n", "parameter: mem=12\n", "parameter: dryrun = False\n", "input: cmd_file\n", From 07256c0d4c9834fd995a57c05a02fc718a68daa2 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 16 Mar 2023 15:39:32 -0400 Subject: [PATCH 58/63] minor changes to toy examples --- admin/Job_Example.ipynb | 2 +- admin/csg.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/admin/Job_Example.ipynb b/admin/Job_Example.ipynb index f9fdcf7..c5f3946 100644 --- a/admin/Job_Example.ipynb +++ b/admin/Job_Example.ipynb @@ -65,7 +65,7 @@ "input: for_each = 'n'\n", "output: f'File_{_n}.out'\n", "task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = ncore, tags = f'{step_name}_{_output:bn}'\n", - 
"bash: expand = True\n", + "bash: expand = True, stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", " echo {_n} > {_output}" ] }, diff --git a/admin/csg.yml b/admin/csg.yml index 63c5a30..abbf9b7 100644 --- a/admin/csg.yml +++ b/admin/csg.yml @@ -25,7 +25,7 @@ hosts: #$ -S /bin/bash #{partition} module load Singularity - module load R + module load R/4.2 export PATH=$HOME/miniconda3/bin:$PATH export SINGULARITY_BIND="/mnt/mfs/:/mnt/mfs/" set -e From 715cdf4e483aa25305c0f11c26f21aaa802c6f53 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Thu, 16 Mar 2023 23:52:18 -0400 Subject: [PATCH 59/63] Update job template --- admin/Job_Example.ipynb | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/admin/Job_Example.ipynb b/admin/Job_Example.ipynb index c5f3946..d0a6a81 100644 --- a/admin/Job_Example.ipynb +++ b/admin/Job_Example.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "reserved-hampshire", + "id": "serial-reproduction", "metadata": { "kernel": "SoS" }, @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "occupational-thanksgiving", + "id": "beginning-breach", "metadata": { "kernel": "SoS" }, @@ -24,7 +24,7 @@ }, { "cell_type": "markdown", - "id": "religious-processing", + "id": "tracked-arthritis", "metadata": { "kernel": "SoS" }, @@ -37,7 +37,7 @@ { "cell_type": "code", "execution_count": null, - "id": "dried-spyware", + "id": "broadband-replica", "metadata": { "kernel": "SoS" }, @@ -53,7 +53,7 @@ { "cell_type": "code", "execution_count": null, - "id": "norwegian-vocabulary", + "id": "sudden-cursor", "metadata": { "kernel": "SoS" }, @@ -71,7 +71,7 @@ }, { "cell_type": "markdown", - "id": "characteristic-estate", + "id": "representative-composite", "metadata": { "kernel": "SoS" }, @@ -81,7 +81,7 @@ }, { "cell_type": "markdown", - "id": "integral-storm", + "id": "phantom-coverage", "metadata": { "kernel": "SoS" }, @@ -105,7 +105,7 @@ }, { "cell_type": "markdown", - "id": 
"equal-accreditation", + "id": "developmental-nashville", "metadata": { "kernel": "SoS" }, @@ -139,7 +139,15 @@ "cat toy_example.log\n", "```\n", "\n", - "At the end of the job you should see exactly the same content as you have seen earlier on the screen when you submit jobs from login node." + "At the end of the job you should see exactly the same content as you have seen earlier on the screen when you submit jobs from login node.\n", + "\n", + "## Highlights on the job configuration template\n", + "\n", + "You can modify the job template file (`*.yml` file) as you see fit. A few places you might want to edit:\n", + "\n", + "1. `max_mem: 128G` is default to 128G. That means even if you specified more than this amount of memory, it is not going to request that much. Typically this default should reflect the maximum available memory from a specific queue of computing nodes. You can increase this value. For example all the nodes in `csg.q` at Columbia Neurology have at least 258G of memory.\n", + "2. `max_running_jobs: 50` is to ensure the job you actually submit to the cluster at the same time, shown in `qstat`, is less than this value. We generally do not want to overflood the queue. But you can adjust this to be a bit higher especially when you notice the cluster is mostly idle and you want to exploit it some more.\n", + "3. Add a new host, either based on existing host or from scratch. For example you can put in a big memory queue with a much larger `max_mem` and also changes to the job template as necessary." 
] } ], From 3257b336afe1ec056c1b6a0582300362ed2594b7 Mon Sep 17 00:00:00 2001 From: gaow Date: Fri, 17 Mar 2023 00:28:49 -0400 Subject: [PATCH 60/63] Update csg.yml --- admin/csg.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/admin/csg.yml b/admin/csg.yml index abbf9b7..4c464b9 100644 --- a/admin/csg.yml +++ b/admin/csg.yml @@ -29,6 +29,7 @@ hosts: export PATH=$HOME/miniconda3/bin:$PATH export SINGULARITY_BIND="/mnt/mfs/:/mnt/mfs/" set -e + echo $HOSTNAME >& 2 # to write the compute node name to *.err file sos execute {task} -v {verbosity} -s {sig_mode} kill_cmd: qdel {job_id} max_cores: 40 From fb1a6df1d1583ffbe77dae29aedeb69a6fd2bb93 Mon Sep 17 00:00:00 2001 From: gaow Date: Fri, 17 Mar 2023 08:51:28 -0400 Subject: [PATCH 61/63] Update csg.yml --- admin/csg.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/admin/csg.yml b/admin/csg.yml index 4c464b9..e8e3a17 100644 --- a/admin/csg.yml +++ b/admin/csg.yml @@ -19,8 +19,8 @@ hosts: #$ -l h_vmem={mem//10**9}G #$ -pe {PE} {cores} #$ -N job_{job_name} - #$ -o {cur_dir}/{job_name}.out - #$ -e {cur_dir}/{job_name}.err + #$ -o /home/{user_name}/.sos/{job_name}.out + #$ -e /home/{user_name}/.sos/{job_name}.err #$ -cwd #$ -S /bin/bash #{partition} @@ -29,6 +29,7 @@ hosts: export PATH=$HOME/miniconda3/bin:$PATH export SINGULARITY_BIND="/mnt/mfs/:/mnt/mfs/" set -e + cd {cur_dir} echo $HOSTNAME >& 2 # to write the compute node name to *.err file sos execute {task} -v {verbosity} -s {sig_mode} kill_cmd: qdel {job_id} From cf3c4f66fd4f2d19e93eb88201dd21be73c8bab6 Mon Sep 17 00:00:00 2001 From: gaow Date: Fri, 17 Mar 2023 08:52:49 -0400 Subject: [PATCH 62/63] Update csg.yml --- admin/csg.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/admin/csg.yml b/admin/csg.yml index e8e3a17..e55e196 100644 --- a/admin/csg.yml +++ b/admin/csg.yml @@ -27,7 +27,7 @@ hosts: module load Singularity module load R/4.2 export PATH=$HOME/miniconda3/bin:$PATH - export 
SINGULARITY_BIND="/mnt/mfs/:/mnt/mfs/" + export SINGULARITY_BIND="/mnt/mfs/:/mnt/mfs/,/mnt/vast/:/mnt/vast/" set -e cd {cur_dir} echo $HOSTNAME >& 2 # to write the compute node name to *.err file From 12af9c8b1addaa53949eaf68918d2f4790b89384 Mon Sep 17 00:00:00 2001 From: Gao Wang Date: Wed, 11 Oct 2023 11:49:16 -0400 Subject: [PATCH 63/63] Purge obsolete mixture prior codes --- .../mash_data_preprocessing.ipynb | 335 +++++++++ multivariate-fine-mapping/mixture_prior.ipynb | 655 ------------------ 2 files changed, 335 insertions(+), 655 deletions(-) create mode 100644 multivariate-fine-mapping/mash_data_preprocessing.ipynb delete mode 100644 multivariate-fine-mapping/mixture_prior.ipynb diff --git a/multivariate-fine-mapping/mash_data_preprocessing.ipynb b/multivariate-fine-mapping/mash_data_preprocessing.ipynb new file mode 100644 index 0000000..377eed1 --- /dev/null +++ b/multivariate-fine-mapping/mash_data_preprocessing.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "# Data munging for multi-variant summary stats" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Minimal working example" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "To see the input requirements and output data formats, please [download a minimal working example here](https://drive.google.com/file/d/1838xUOQuWTszQ0WJGXNiJMszY05cw3RS/view?usp=sharing), and run the following:\n", + "\n", + "### Merge univariate results\n", + "\n", + "```\n", + "sos run mash_data_preprocessing.ipynb merge \\\n", + " --analysis-units \\\n", + " --plink-sumstats \\\n", + " --name gtex_mixture\n", + "```\n", + "\n", + "### Select and merge univariate effects\n", + "\n", + "```\n", + "m=/path/to/data\n", + "cd $m && ls *.rds | sed 's/\\.rds//g' > analysis_units.txt && cd -\n", + "sos run mash_data_preprocessing.ipynb extract_effects \\\n", + " --analysis-units
$m/analysis_units.txt \\\n", + " --datadir $m --name `basename $m`\n", + "```\n", + "\n", + "Notice that for production use, each `sos run` command should be submitted to the cluster as a job." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Global parameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "[global]\n", + "import os\n", + "# Work directory & output directory\n", + "parameter: cwd = path('./output')\n", + "# The filename prefix for output data\n", + "parameter: name = str\n", + "parameter: mixture_components = ['flash', 'flash_nonneg', 'pca', 'canonical']\n", + "parameter: job_size = 1# Residual correlation file\n", + "parameter: resid_cor = path(\".\")\n", + "fail_if(not (resid_cor.is_file() or resid_cor == path('.')), msg = f'Cannot find ``{resid_cor}``')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Merge PLINK univariate association summary statistics to RDS format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "[merge]\n", + "parameter: molecular_pheno = path\n", + "# Analysis units file. 
For RDS files it can be generated by `ls *.rds | sed 's/\\.rds//g' > analysis_units.txt`\n", + "parameter: analysis_units = path\n", + "regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]\n", + "input: molecular_pheno, for_each = \"regions\"\n", + "output: f'{cwd:a}/RDS/{_regions[0]}'\n", + "\n", + "task: trunk_workers = 1, trunk_size = job_size, walltime = '4h', mem = '6G', tags = f'{step_name}_{_output:bn}' \n", + "\n", + "R: expand = \"$[ ]\", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'\n", + " library(\"dplyr\")\n", + " library(\"tibble\")\n", + " library(\"purrr\")\n", + " library(\"readr\")\n", + " molecular_pheno = read_delim($[molecular_pheno:r], delim = \"\\t\")\n", + " molecular_pheno = molecular_pheno%>%mutate(dir = map_chr(`#molc_pheno`,~paste(c(`.x`,\"$[_regions[0]]\"),collapse = \"\")))\n", + " n = nrow(molecular_pheno)\n", + " # For every condition read rds and extract the bhat and sbhat.\n", + " genos = tibble( i = 1:n)\n", + " genos = genos%>%mutate(bhat = map(i, ~readRDS(molecular_pheno[[.x,2]])$bhat%>%as.data.frame%>%rownames_to_column),\n", + " sbhat = map(i, ~readRDS(molecular_pheno[[.x,2]])$sbhat%>%as.data.frame%>%rownames_to_column))\n", + " \n", + " # Join first two conditions\n", + " genos_join_bhat = full_join((genos%>%pull(bhat))[[1]],(genos%>%pull(bhat))[[2]],by = \"rowname\")\n", + " genos_join_sbhat = full_join((genos%>%pull(sbhat))[[1]],(genos%>%pull(sbhat))[[2]],by = \"rowname\")\n", + " \n", + " # If there are more conditions, join the rest\n", + " if(n > 2){\n", + " for(j in 3:n){\n", + " genos_join_bhat = full_join(genos_join_bhat,(genos%>%pull(bhat))[[j]],by = \"rowname\")%>%select(-rowname)%>%as.matrix\n", + " genos_join_sbhat = full_join(genos_join_sbhat,(genos%>%pull(sbhat))[[j]],by = \"rowname\")%>%select(-rowname)%>%as.matrix\n", + " }\n", + " }\n", + " \n", + " name = molecular_pheno%>%mutate(name = map(`#molc_pheno`, ~read.table(text = .x,sep 
= \"/\")),\n", + " name = map_chr(name, ~.x[,ncol(.x)-2]%>%as.character) )%>%pull(name)\n", + " colnames(genos_join_bhat) = name\n", + " colnames(genos_join_sbhat) = name\n", + " \n", + " \n", + " # save the rds file\n", + " saveRDS(file = \"$[_output]\", list(bhat=genos_join_bhat, sbhat=genos_join_sbhat))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "kernel": "SoS" + }, + "source": [ + "## Get top, random and null effects per analysis unit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "# extract data for MASH from summary stats\n", + "[extract_effects_1]\n", + "parameter: table_name = \"\"\n", + "parameter: bhat = \"bhat\"\n", + "parameter: sbhat = \"sbhat\"\n", + "parameter: expected_ncondition = 0\n", + "parameter: datadir = path\n", + "parameter: seed = 999\n", + "parameter: n_random = 4\n", + "parameter: n_null = 4\n", + "parameter: z_only = True\n", + "# Analysis units file. For RDS files it can be generated by `ls *.rds | sed 's/\\.rds//g' > analysis_units.txt`\n", + "parameter: analysis_units = path\n", + "# handle N = per_chunk data-set in one job\n", + "parameter: per_chunk = 1000\n", + "regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]\n", + "input: [f'{datadir}/{x[0]}.rds' for x in regions], group_by = per_chunk\n", + "output: f\"{cwd}/{name}/cache/{name}_{_index+1}.rds\"\n", + "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '4G', cores = 1, tags = f'{_output:bn}'\n", + "R: expand = \"${ }\"\n", + " set.seed(${seed})\n", + " matxMax <- function(mtx) {\n", + " return(arrayInd(which.max(mtx), dim(mtx)))\n", + " }\n", + " remove_rownames = function(x) {\n", + " for (name in names(x)) rownames(x[[name]]) = NULL\n", + " return(x)\n", + " }\n", + " handle_nan_etc = function(x) {\n", + " x$bhat[which(is.nan(x$bhat))] = 0\n", + " x$sbhat[which(is.nan(x$sbhat) | 
is.infinite(x$sbhat))] = 1E3\n", + " return(x)\n", + " }\n", + " extract_one_data = function(dat, n_random, n_null, infile) {\n", + " if (is.null(dat)) return(NULL)\n", + " z = abs(dat$${bhat}/dat$${sbhat})\n", + " max_idx = matxMax(z)\n", + " # strong effect samples\n", + " strong = list(bhat = dat$${bhat}[max_idx[1],,drop=F], sbhat = dat$${sbhat}[max_idx[1],,drop=F])\n", + " # random samples excluding the top one\n", + " if (max_idx[1] == 1) {\n", + " sample_idx = 2:nrow(z)\n", + " } else if (max_idx[1] == nrow(z)) {\n", + " sample_idx = 1:(max_idx[1]-1)\n", + " } else {\n", + " sample_idx = c(1:(max_idx[1]-1), (max_idx[1]+1):nrow(z))\n", + " }\n", + " random_idx = sample(sample_idx, min(n_random, length(sample_idx)), replace = F)\n", + " random = list(bhat = dat$${bhat}[random_idx,,drop=F], sbhat = dat$${sbhat}[random_idx,,drop=F])\n", + " # null samples defined as |z| < 2\n", + " null.id = which(apply(abs(z), 1, max) < 2)\n", + " if (length(null.id) == 0) {\n", + " warning(paste(\"Null data is empty for input file\", infile))\n", + " null = list()\n", + " } else {\n", + " null_idx = sample(null.id, min(n_null, length(null.id)), replace = F)\n", + " null = list(bhat = dat$${bhat}[null_idx,,drop=F], sbhat = dat$${sbhat}[null_idx,,drop=F])\n", + " }\n", + " dat = (list(random = remove_rownames(random), null = remove_rownames(null), strong = remove_rownames(strong)))\n", + " dat$random = handle_nan_etc(dat$random)\n", + " dat$null = handle_nan_etc(dat$null)\n", + " dat$strong = handle_nan_etc(dat$strong)\n", + " return(dat)\n", + " }\n", + " reformat_data = function(dat, z_only = TRUE) {\n", + " # make output consistent in format with \n", + " # https://github.com/stephenslab/gtexresults/blob/master/workflows/mashr_flashr_workflow.ipynb \n", + " res = list(random.z = dat$random$bhat/dat$random$sbhat, \n", + " strong.z = dat$strong$bhat/dat$strong$sbhat, \n", + " null.z = dat$null$bhat/dat$null$sbhat)\n", + " if (!z_only) {\n", + " res = c(res, list(random.b = 
dat$random$bhat,\n", + " strong.b = dat$strong$bhat,\n", + " null.b = dat$null$bhat,\n", + " null.s = dat$null$sbhat,\n", + " random.s = dat$random$sbhat,\n", + " strong.s = dat$strong$sbhat))\n", + " }\n", + " return(res)\n", + " }\n", + " merge_data = function(res, one_data) {\n", + " if (length(res) == 0) {\n", + " return(one_data)\n", + " } else if (is.null(one_data)) {\n", + " return(res)\n", + " } else {\n", + " for (d in names(one_data)) {\n", + " if (is.null(one_data[[d]])) {\n", + " next\n", + " } else {\n", + " res[[d]] = rbind(res[[d]], one_data[[d]])\n", + " }\n", + " }\n", + " return(res)\n", + " }\n", + " }\n", + " res = list()\n", + " for (f in c(${_input:r,})) {\n", + " # If cannot read the input for some reason then we just skip it, assuming we have other enough data-sets to use.\n", + " dat = tryCatch(readRDS(f), error = function(e) return(NULL))${(\"$\"+table_name) if table_name != \"\" else \"\"}\n", + " if (is.null(dat)) {\n", + " message(paste(\"Skip loading file\", f, \"due to load failure.\"))\n", + " next\n", + " }\n", + " if (${expected_ncondition} > 0 && (ncol(dat$${bhat}) != ${expected_ncondition} || ncol(dat$${sbhat}) != ${expected_ncondition})) {\n", + " message(paste(\"Skip loading file\", f, \"because it has\", ncol(dat$${bhat}), \"columns different from required\", ${expected_ncondition}))\n", + " next\n", + " }\n", + " res = merge_data(res, reformat_data(extract_one_data(dat, ${n_random}, ${n_null}, f), ${\"TRUE\" if z_only else \"FALSE\"}))\n", + " }\n", + " saveRDS(res, ${_output:r})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "kernel": "SoS" + }, + "outputs": [], + "source": [ + "[extract_effects_2]\n", + "input: group_by = \"all\"\n", + "output: f\"{cwd}/{name}.rds\"\n", + "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'\n", + "R: expand = \"${ }\"\n", + " merge_data = function(res, one_data) {\n", + " if (length(res) == 0) {\n", + " 
return(one_data)\n", + " } else {\n", + " for (d in names(one_data)) {\n", + " res[[d]] = rbind(res[[d]], one_data[[d]])\n", + " }\n", + " return(res)\n", + " }\n", + " }\n", + " dat = list()\n", + " for (f in c(${_input:r,})) {\n", + " dat = merge_data(dat, readRDS(f))\n", + " }\n", + " # compute empirical covariance XtX\n", + " X = dat$strong.z\n", + " X[is.na(X)] = 0\n", + " dat$XtX = t(as.matrix(X)) %*% as.matrix(X) / nrow(X)\n", + " saveRDS(dat, ${_output:r})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "SoS", + "language": "sos", + "name": "sos" + }, + "language_info": { + "codemirror_mode": "sos", + "file_extension": ".sos", + "mimetype": "text/x-sos", + "name": "sos", + "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", + "pygments_lexer": "sos" + }, + "sos": { + "kernels": [ + [ + "R" + ], + [ + "SoS" + ] + ], + "version": "0.22.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/multivariate-fine-mapping/mixture_prior.ipynb b/multivariate-fine-mapping/mixture_prior.ipynb deleted file mode 100644 index a242ad4..0000000 --- a/multivariate-fine-mapping/mixture_prior.ipynb +++ /dev/null @@ -1,655 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "# A multivariate EBNM approach for mixture multivariate distribution estimate\n", - "\n", - "An earlier version of the approach is outlined in Urbut et al 2019. This workflow implements a few improvements including using additional EBMF methods as well as the new `udr` package to fit the mixture model." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Overview of approach\n", - "\n", - "1. A workflow step is provided to merge PLINK univariate association analysis results to RDS files for extracting effect estimate samples\n", - "2. Estimated effects are analyzed by FLASH and PCA to extract patterns of sharing\n", - "3. 
Estimate the weights for patterns extracted from previous step" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Minimal working example" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "To see the input requirements and output data formats, please [download a minimal working example here](https://drive.google.com/file/d/1838xUOQuWTszQ0WJGXNiJMszY05cw3RS/view?usp=sharing), and run the following:\n", - "\n", - "### Merge univariate results\n", - "\n", - "```\n", - "sos run mixture_prior.ipynb merge \\\n", - " --analysis-units \\\n", - " --plink-sumstats \\\n", - " --name gtex_mixture\n", - "```\n", - "\n", - "### Select and merge univariate effects\n", - "\n", - "```\n", - "m=/path/to/data\n", - "cd $m && ls *.rds | sed 's/\\.rds//g' > analysis_units.txt && cd -\n", - "sos run mixture_prior.ipynb extract_effects \\\n", - " --analysis-units $m/analysis_units.txt \\\n", - " --datadir $m --name `basename $m`\n", - "```\n", - "\n", - "### Perform mixture model fitting\n", - "\n", - "```\n", - "sos run mixture_prior.ipynb ud \\\n", - " --datadir $m --name `basename $m` &> ed_$m.log\n", - "sos run mixture_prior.ipynb ud --ud-method ted \\\n", - " --datadir $m --name `basename $m` &> ted_$m.log\n", - "sos run mixture_prior.ipynb ed \\\n", - " --datadir $m --name `basename $m` &> bovy_$m.log\n", - "```\n", - "\n", - "### Plot results\n", - "\n", - "```\n", - "sos run mixture_prior.ipynb plot_U --model-data /path/to/mixture_model.rds --cwd output\n", - "```\n", - "\n", - "Notice that for production use, each `sos run` command should be submitted to the cluster as a job." 
- ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Global parameters" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[global]\n", - "import os\n", - "# Work directory & output directory\n", - "parameter: cwd = path('./output')\n", - "# The filename prefix for output data\n", - "parameter: name = str\n", - "parameter: mixture_components = ['flash', 'flash_nonneg', 'pca', 'canonical']\n", - "parameter: job_size = 1# Residual correlatoin file\n", - "parameter: resid_cor = path(\".\")\n", - "fail_if(not (resid_cor.is_file() or resid_cor == path('.')), msg = f'Cannot find ``{resid_cor}``')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Merge PLINK univariate association summary statistic to RDS format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[merge]\n", - "parameter: molecular_pheno = path\n", - "# Analysis units file. 
For RDS files it can be generated by `ls *.rds | sed 's/\\.rds//g' > analysis_units.txt`\n", - "parameter: analysis_units = path\n", - "regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]\n", - "input: molecular_pheno, for_each = \"regions\"\n", - "output: f'{cwd:a}/RDS/{_regions[0]}'\n", - "\n", - "task: trunk_workers = 1, trunk_size = job_size, walltime = '4h', mem = '6G', tags = f'{step_name}_{_output:bn}' \n", - "\n", - "R: expand = \"$[ ]\", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'\n", - " library(\"dplyr\")\n", - " library(\"tibble\")\n", - " library(\"purrr\")\n", - " library(\"readr\")\n", - " molecular_pheno = read_delim($[molecular_pheno:r], delim = \"\\t\")\n", - " molecular_pheno = molecular_pheno%>%mutate(dir = map_chr(`#molc_pheno`,~paste(c(`.x`,\"$[_regions[0]]\"),collapse = \"\")))\n", - " n = nrow(molecular_pheno)\n", - " # For every condition read rds and extract the bhat and sbhat.\n", - " genos = tibble( i = 1:n)\n", - " genos = genos%>%mutate(bhat = map(i, ~readRDS(molecular_pheno[[.x,2]])$bhat%>%as.data.frame%>%rownames_to_column),\n", - " sbhat = map(i, ~readRDS(molecular_pheno[[.x,2]])$sbhat%>%as.data.frame%>%rownames_to_column))\n", - " \n", - " # Join first two conditions\n", - " genos_join_bhat = full_join((genos%>%pull(bhat))[[1]],(genos%>%pull(bhat))[[2]],by = \"rowname\")\n", - " genos_join_sbhat = full_join((genos%>%pull(sbhat))[[1]],(genos%>%pull(sbhat))[[2]],by = \"rowname\")\n", - " \n", - " # If there are more conditions, join the rest\n", - " if(n > 2){\n", - " for(j in 3:n){\n", - " genos_join_bhat = full_join(genos_join_bhat,(genos%>%pull(bhat))[[j]],by = \"rowname\")%>%select(-rowname)%>%as.matrix\n", - " genos_join_sbhat = full_join(genos_join_sbhat,(genos%>%pull(sbhat))[[j]],by = \"rowname\")%>%select(-rowname)%>%as.matrix\n", - " }\n", - " }\n", - " \n", - " name = molecular_pheno%>%mutate(name = map(`#molc_pheno`, ~read.table(text = .x,sep 
= \"/\")),\n", - " name = map_chr(name, ~.x[,ncol(.x)-2]%>%as.character) )%>%pull(name)\n", - " colnames(genos_join_bhat) = name\n", - " colnames(genos_join_sbhat) = name\n", - " \n", - " \n", - " # save the rds file\n", - " saveRDS(file = \"$[_output]\", list(bhat=genos_join_bhat, sbhat=genos_join_sbhat))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Get top, random and null effects per analysis unit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# extract data for MASH from summary stats\n", - "[extract_effects_1]\n", - "parameter: table_name = \"\"\n", - "parameter: bhat = \"bhat\"\n", - "parameter: sbhat = \"sbhat\"\n", - "parameter: expected_ncondition = 0\n", - "parameter: datadir = path\n", - "parameter: seed = 999\n", - "parameter: n_random = 4\n", - "parameter: n_null = 4\n", - "parameter: z_only = True\n", - "# Analysis units file. For RDS files it can be generated by `ls *.rds | sed 's/\\.rds//g' > analysis_units.txt`\n", - "parameter: analysis_units = path\n", - "# handle N = per_chunk data-set in one job\n", - "parameter: per_chunk = 1000\n", - "regions = [x.strip().split() for x in open(analysis_units).readlines() if x.strip() and not x.strip().startswith('#')]\n", - "input: [f'{datadir}/{x[0]}.rds' for x in regions], group_by = per_chunk\n", - "output: f\"{cwd}/{name}/cache/{name}_{_index+1}.rds\"\n", - "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '4G', cores = 1, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\"\n", - " set.seed(${seed})\n", - " matxMax <- function(mtx) {\n", - " return(arrayInd(which.max(mtx), dim(mtx)))\n", - " }\n", - " remove_rownames = function(x) {\n", - " for (name in names(x)) rownames(x[[name]]) = NULL\n", - " return(x)\n", - " }\n", - " handle_nan_etc = function(x) {\n", - " x$bhat[which(is.nan(x$bhat))] = 0\n", - " x$sbhat[which(is.nan(x$sbhat) | 
is.infinite(x$sbhat))] = 1E3\n", - " return(x)\n", - " }\n", - " extract_one_data = function(dat, n_random, n_null, infile) {\n", - " if (is.null(dat)) return(NULL)\n", - " z = abs(dat$${bhat}/dat$${sbhat})\n", - " max_idx = matxMax(z)\n", - " # strong effect samples\n", - " strong = list(bhat = dat$${bhat}[max_idx[1],,drop=F], sbhat = dat$${sbhat}[max_idx[1],,drop=F])\n", - " # random samples excluding the top one\n", - " if (max_idx[1] == 1) {\n", - " sample_idx = 2:nrow(z)\n", - " } else if (max_idx[1] == nrow(z)) {\n", - " sample_idx = 1:(max_idx[1]-1)\n", - " } else {\n", - " sample_idx = c(1:(max_idx[1]-1), (max_idx[1]+1):nrow(z))\n", - " }\n", - " random_idx = sample(sample_idx, min(n_random, length(sample_idx)), replace = F)\n", - " random = list(bhat = dat$${bhat}[random_idx,,drop=F], sbhat = dat$${sbhat}[random_idx,,drop=F])\n", - " # null samples defined as |z| < 2\n", - " null.id = which(apply(abs(z), 1, max) < 2)\n", - " if (length(null.id) == 0) {\n", - " warning(paste(\"Null data is empty for input file\", infile))\n", - " null = list()\n", - " } else {\n", - " null_idx = sample(null.id, min(n_null, length(null.id)), replace = F)\n", - " null = list(bhat = dat$${bhat}[null_idx,,drop=F], sbhat = dat$${sbhat}[null_idx,,drop=F])\n", - " }\n", - " dat = (list(random = remove_rownames(random), null = remove_rownames(null), strong = remove_rownames(strong)))\n", - " dat$random = handle_nan_etc(dat$random)\n", - " dat$null = handle_nan_etc(dat$null)\n", - " dat$strong = handle_nan_etc(dat$strong)\n", - " return(dat)\n", - " }\n", - " reformat_data = function(dat, z_only = TRUE) {\n", - " # make output consistent in format with \n", - " # https://github.com/stephenslab/gtexresults/blob/master/workflows/mashr_flashr_workflow.ipynb \n", - " res = list(random.z = dat$random$bhat/dat$random$sbhat, \n", - " strong.z = dat$strong$bhat/dat$strong$sbhat, \n", - " null.z = dat$null$bhat/dat$null$sbhat)\n", - " if (!z_only) {\n", - " res = c(res, list(random.b = 
dat$random$bhat,\n", - " strong.b = dat$strong$bhat,\n", - " null.b = dat$null$bhat,\n", - " null.s = dat$null$sbhat,\n", - " random.s = dat$random$sbhat,\n", - " strong.s = dat$strong$sbhat))\n", - " }\n", - " return(res)\n", - " }\n", - " merge_data = function(res, one_data) {\n", - " if (length(res) == 0) {\n", - " return(one_data)\n", - " } else if (is.null(one_data)) {\n", - " return(res)\n", - " } else {\n", - " for (d in names(one_data)) {\n", - " if (is.null(one_data[[d]])) {\n", - " next\n", - " } else {\n", - " res[[d]] = rbind(res[[d]], one_data[[d]])\n", - " }\n", - " }\n", - " return(res)\n", - " }\n", - " }\n", - " res = list()\n", - " for (f in c(${_input:r,})) {\n", - " # If cannot read the input for some reason then we just skip it, assuming we have other enough data-sets to use.\n", - " dat = tryCatch(readRDS(f), error = function(e) return(NULL))${(\"$\"+table_name) if table_name != \"\" else \"\"}\n", - " if (is.null(dat)) {\n", - " message(paste(\"Skip loading file\", f, \"due to load failure.\"))\n", - " next\n", - " }\n", - " if (${expected_ncondition} > 0 && (ncol(dat$${bhat}) != ${expected_ncondition} || ncol(dat$${sbhat}) != ${expected_ncondition})) {\n", - " message(paste(\"Skip loading file\", f, \"because it has\", ncol(dat$${bhat}), \"columns different from required\", ${expected_ncondition}))\n", - " next\n", - " }\n", - " res = merge_data(res, reformat_data(extract_one_data(dat, ${n_random}, ${n_null}, f), ${\"TRUE\" if z_only else \"FALSE\"}))\n", - " }\n", - " saveRDS(res, ${_output:r})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[extract_effects_2]\n", - "input: group_by = \"all\"\n", - "output: f\"{cwd}/{name}.rds\"\n", - "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\"\n", - " merge_data = function(res, one_data) {\n", - " if (length(res) == 0) {\n", - " 
return(one_data)\n", - " } else {\n", - " for (d in names(one_data)) {\n", - " res[[d]] = rbind(res[[d]], one_data[[d]])\n", - " }\n", - " return(res)\n", - " }\n", - " }\n", - " dat = list()\n", - " for (f in c(${_input:r,})) {\n", - " dat = merge_data(dat, readRDS(f))\n", - " }\n", - " # compute empirical covariance XtX\n", - " X = dat$strong.z\n", - " X[is.na(X)] = 0\n", - " dat$XtX = t(as.matrix(X)) %*% as.matrix(X) / nrow(X)\n", - " saveRDS(dat, ${_output:r})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Factor analyses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[flash]\n", - "input: f\"{cwd}/{name}.rds\"\n", - "output: f\"{cwd}/{name}.flash.rds\"\n", - "task: trunk_workers = 1, walltime = '6h', trunk_size = 1, mem = '8G', cores = 2, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", - " library(\"mashr\")\n", - " bhat = readRDS(${_input:r})$strong.z\n", - " sbhat = bhat\n", - " sbhat[!is.na(sbhat)] = 1\n", - " dat = mashr::mash_set_data(bhat,sbhat)\n", - " res = mashr::cov_flash(dat, factors=\"default\", remove_singleton=${\"TRUE\" if \"canonical\" in mixture_components else \"FALSE\"}, output_model=\"${_output:n}.model.rds\")\n", - " saveRDS(res, ${_output:r})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[flash_nonneg]\n", - "input: f\"{cwd}/{name}.rds\"\n", - "output: f\"{cwd}/{name}.flash_nonneg.rds\"\n", - "task: trunk_workers = 1, walltime = '6h', trunk_size = 1, mem = '8G', cores = 2, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", - " library(\"mashr\")\n", - " bhat = readRDS(${_input:r})$strong.z\n", - " sbhat = bhat\n", - " sbhat[!is.na(sbhat)] = 1\n", - " dat = 
mashr::mash_set_data(bhat,sbhat)\n", - " res = mashr::cov_flash(dat, factors=\"nonneg\", remove_singleton=${\"TRUE\" if \"canonical\" in mixture_components else \"FALSE\"}, output_model=\"${_output:n}.model.rds\")\n", - " saveRDS(res, ${_output:r})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[pca]\n", - "parameter: npc = 3\n", - "input: f\"{cwd}/{name}.rds\"\n", - "output: f\"{cwd}/{name}.pca.rds\"\n", - "task: trunk_workers = 1, walltime = '2h', trunk_size = 1, mem = '8G', cores = 2, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", - " library(\"mashr\")\n", - " bhat = readRDS(${_input:r})$strong.z\n", - " sbhat = bhat\n", - " sbhat[!is.na(sbhat)] = 1\n", - " dat = mashr::mash_set_data(bhat,sbhat)\n", - " res = mashr::cov_pca(dat, ${npc})\n", - " saveRDS(res, ${_output:r})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[canonical]\n", - "input: f\"{cwd}/{name}.rds\"\n", - "output: f\"{cwd}/{name}.canonical.rds\"\n", - "task: trunk_workers = 1, walltime = '1h', trunk_size = 1, mem = '8G', cores = 1, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", - " library(\"mashr\")\n", - " bhat = readRDS(${_input:r})$strong.z\n", - " sbhat = bhat\n", - " sbhat[!is.na(sbhat)] = 1\n", - " dat = mashr::mash_set_data(bhat,sbhat)\n", - " res = mashr::cov_canonical(dat)\n", - " saveRDS(res, ${_output:r})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Fit mixture model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "# Installed commit d6d4c0e\n", - "[ud]\n", - "# Method is `ed` or `ted`\n", - "parameter: ud_method = \"ed\"\n", 
- "# A list of models where we only update the scales and not the matrices\n", - "# A typical choice is to estimate scales only for canonical components\n", - "parameter: scale_only = []\n", - "# Tolerance for change in likelihood\n", - "parameter: ud_tol_lik = 1e-3\n", - "input: [f\"{cwd}/{name}.rds\"] + [f\"{cwd}/{name}.{m}.rds\" for m in mixture_components]\n", - "output: f'{cwd}/{name}.{ud_method}{\"_unconstrained\" if len(scale_only) == 0 else \"\"}{(\".\" + os.path.basename(resid_cor)[:-4]) if resid_cor.is_file() else \"\"}.rds'\n", - "task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\"\n", - " library(stringr)\n", - " rds_files = c(${_input:r,})\n", - " dat = readRDS(rds_files[1])\n", - " U = list(XtX = dat$XtX)\n", - " U_scaled = list()\n", - " mixture_components = c(${paths(mixture_components):r,})\n", - " scale_only = c(${paths(scale_only):r,})\n", - " scale_idx = which(mixture_components %in% scale_only )\n", - " for (f in 2:length(rds_files) ) {\n", - " if ((f - 1) %in% scale_idx ) {\n", - " U_scaled = c(U_scaled, readRDS(rds_files[f]))\n", - " } else {\n", - " U = c(U, readRDS(rds_files[f]))\n", - " }\n", - " }\n", - " #\n", - " if (${\"TRUE\" if resid_cor.is_file() else \"FALSE\"}) {\n", - " V = readRDS(${resid_cor:r})\n", - " } else {\n", - " V = cor(dat$null.z)\n", - " }\n", - " # Fit mixture model using udr package\n", - " library(udr)\n", - " message(paste(\"Running ${ud_method.upper()} via udr package for\", length(U), \"mixture components\"))\n", - " f0 = ud_init(X = as.matrix(dat$strong.z), V = V, U_scaled = U_scaled, U_unconstrained = U, n_rank1=0)\n", - " res = ud_fit(f0, X = na.omit(f0$X), control = list(unconstrained.update = \"${ud_method}\", resid.update = 'none', scaled.update = \"fa\", maxiter=5000, tol.lik = ${ud_tol_lik}), verbose=TRUE)\n", - " saveRDS(list(U=res$U, w=res$w, 
loglik=res$loglik), ${_output:r})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[ed]\n", - "parameter: ed_tol = 1e-6\n", - "input: [f\"{cwd}/{name}.rds\"] + [f\"{cwd}/{name}.{m}.rds\" for m in mixture_components]\n", - "output: f'{cwd}/{name}.ed_bovy{(\".\" + os.path.basename(resid_cor)[:-4]) if resid_cor.is_file() else \"\"}.rds'\n", - "task: trunk_workers = 1, walltime = '36h', trunk_size = 1, mem = '10G', cores = 4, tags = f'{_output:bn}'\n", - "R: expand = \"${ }\", stderr = f\"{_output:n}.stderr\", stdout = f\"{_output:n}.stdout\"\n", - " rds_files = c(${_input:r,})\n", - " dat = readRDS(rds_files[1])\n", - " U = list(XtX = dat$XtX)\n", - " for (f in rds_files[2:length(rds_files)]) U = c(U, readRDS(f))\n", - " if (${\"TRUE\" if resid_cor.is_file() else \"FALSE\"}) {\n", - " V = readRDS(${resid_cor:r})\n", - " } else {\n", - " V = cor(dat$null.z)\n", - " }\n", - " # Fit mixture model using ED code by J. Bovy\n", - " mash_data = mashr::mash_set_data(dat$strong.z, V=V)\n", - " message(paste(\"Running ED via J. Bovy's code for\", length(U), \"mixture components\"))\n", - " res = mashr:::bovy_wrapper(mash_data, U, logfile=${_output:nr}, tol = ${ed_tol})\n", - " saveRDS(list(U=res$Ulist, w=res$pi, loglik=scan(\"${_output:n}_loglike.log\")), ${_output:r})" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "kernel": "SoS" - }, - "source": [ - "## Plot patterns of sharing\n", - "\n", - "This is a simple utility function that takes the output from the pipeline above and make some heatmap to show major patterns of multivariate effects. The plots will be ordered by their mixture weights." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "kernel": "SoS" - }, - "outputs": [], - "source": [ - "[plot_U]\n", - "parameter: model_data = path\n", - "# number of components to show\n", - "parameter: max_comp = -1\n", - "# whether or not to convert to correlation\n", - "parameter: to_cor = False\n", - "parameter: tol = \"1E-6\"\n", - "parameter: remove_label = False\n", - "parameter: name = \"\"\n", - "input: model_data\n", - "output: f'{cwd:a}/{_input:bn}{(\"_\" + name.replace(\"$\", \"_\")) if name != \"\" else \"\"}.pdf'\n", - "R: expand = \"${ }\", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'\n", - " library(reshape2)\n", - " library(ggplot2)\n", - " plot_sharing = function(X, col = 'black', to_cor=FALSE, title=\"\", remove_names=F) {\n", - " clrs <- colorRampPalette(rev(c(\"#D73027\",\"#FC8D59\",\"#FEE090\",\"#FFFFBF\",\n", - " \"#E0F3F8\",\"#91BFDB\",\"#4575B4\")))(128)\n", - " if (to_cor) lat <- cov2cor(X)\n", - " else lat = X/max(diag(X))\n", - " lat[lower.tri(lat)] <- NA\n", - " n <- nrow(lat)\n", - " if (remove_names) {\n", - " colnames(lat) = paste('t',1:n, sep = '')\n", - " rownames(lat) = paste('t',1:n, sep = '')\n", - " }\n", - " melted_cormat <- melt(lat[n:1,], na.rm = TRUE)\n", - " p = ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+\n", - " geom_tile(color = \"white\")+ggtitle(title) + \n", - " scale_fill_gradientn(colors = clrs, limit = c(-1,1), space = \"Lab\") +\n", - " theme_minimal()+ \n", - " coord_fixed() +\n", - " theme(axis.title.x = element_blank(),\n", - " axis.title.y = element_blank(),\n", - " axis.text.x = element_text(color=col, size=8,angle=45,hjust=1),\n", - " axis.text.y = element_text(color=rev(col), size=8),\n", - " title =element_text(size=10),\n", - " # panel.grid.major = element_blank(),\n", - " panel.border = element_blank(),\n", - " panel.background = element_blank(),\n", - " axis.ticks = element_blank(),\n", - " legend.justification = c(1, 0),\n", - " 
legend.position = c(0.6, 0),\n", - " legend.direction = \"horizontal\")+\n", - " guides(fill = guide_colorbar(title=\"\", barwidth = 7, barheight = 1,\n", - " title.position = \"top\", title.hjust = 0.5))\n", - " if(remove_names){\n", - " p = p + scale_x_discrete(labels= 1:n) + scale_y_discrete(labels= n:1)\n", - " }\n", - " return(p)\n", - " }\n", - " \n", - " dat = readRDS(${_input:r})\n", - " name = \"${name}\"\n", - " if (name != \"\") {\n", - " if (is.null(dat[[name]])) stop(\"Cannot find data ${name} in ${_input}\")\n", - " dat = dat[[name]]\n", - " }\n", - " if (is.null(names(dat$U))) names(dat$U) = paste0(\"Comp_\", 1:length(dat$U))\n", - " meta = data.frame(names(dat$U), dat$w, stringsAsFactors=F)\n", - " colnames(meta) = c(\"U\", \"w\")\n", - " tol = ${tol}\n", - " n_comp = length(meta$U[which(meta$w>tol)])\n", - " meta = head(meta[order(meta[,2], decreasing = T),], ${max_comp if max_comp > 1 else \"nrow(meta)\"})\n", - " message(paste(n_comp, \"components out of\", length(dat$w), \"total components have weight greater than\", tol))\n", - " res = list()\n", - " for (i in 1:n_comp) {\n", - " title = paste(meta$U[i], \"w =\", round(meta$w[i], 6))\n", - " ##Handle updated udr data structure\n", - " if(is.list(dat$U[[meta$U[i]]])){\n", - " res[[i]] = plot_sharing(dat$U[[meta$U[i]]]$mat, to_cor = ${\"T\" if to_cor else \"F\"}, title=title, remove_names = ${\"TRUE\" if remove_label else \"FALSE\"})\n", - " } else if(is.matrix(dat$U[[meta$U[i]]])){\n", - " res[[i]] = plot_sharing(dat$U[[meta$U[i]]], to_cor = ${\"T\" if to_cor else \"F\"}, title=title, remove_names = ${\"TRUE\" if remove_label else \"FALSE\"})\n", - " }\n", - " }\n", - " unit = 4\n", - " n_col = 5\n", - " n_row = ceiling(n_comp / n_col)\n", - " pdf(${_output:r}, width = unit * n_col, height = unit * n_row)\n", - " do.call(gridExtra::grid.arrange, c(res, list(ncol = n_col, nrow = n_row, bottom = \"Data source: readRDS(${_input:br})${('$'+name) if name else ''}\")))\n", - " dev.off()" - ] - } - ], 
- "metadata": { - "kernelspec": { - "display_name": "SoS", - "language": "sos", - "name": "sos" - }, - "language_info": { - "codemirror_mode": "sos", - "file_extension": ".sos", - "mimetype": "text/x-sos", - "name": "sos", - "nbconvert_exporter": "sos_notebook.converter.SoS_Exporter", - "pygments_lexer": "sos" - }, - "sos": { - "kernels": [ - [ - "R" - ], - [ - "SoS" - ] - ], - "version": "0.22.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}