Alina Fedorova
committed on
Commit
·
81b5a62
1
Parent(s):
0b7ce57
env via pydantic added
Browse files
- environment.yml +4 -0
- preprocess/.env_paths +7 -0
- preprocess/run_beagle.py +32 -16
- preprocess/setting.py +23 -0
environment.yml
CHANGED
|
@@ -132,6 +132,7 @@ dependencies:
|
|
| 132 |
- zipp=3.15.0=pyhd8ed1ab_0
|
| 133 |
- zlib=1.2.11=h166bdaf_1014
|
| 134 |
- pip:
|
|
|
|
| 135 |
- anyio==3.7.0
|
| 136 |
- argon2-cffi==21.3.0
|
| 137 |
- argon2-cffi-bindings==21.2.0
|
|
@@ -193,11 +194,14 @@ dependencies:
|
|
| 193 |
- prompt-toolkit==3.0.38
|
| 194 |
- psutil==5.9.5
|
| 195 |
- ptyprocess==0.7.0
|
|
|
|
|
|
|
| 196 |
- pygments==2.15.1
|
| 197 |
- pyparsing==3.1.0
|
| 198 |
- pyrsistent==0.19.3
|
| 199 |
- pytest==7.4.0
|
| 200 |
- python-dateutil==2.8.2
|
|
|
|
| 201 |
- pytz==2023.3
|
| 202 |
- pyzmq==25.1.0
|
| 203 |
- qtconsole==5.4.3
|
|
|
|
| 132 |
- zipp=3.15.0=pyhd8ed1ab_0
|
| 133 |
- zlib=1.2.11=h166bdaf_1014
|
| 134 |
- pip:
|
| 135 |
+
- annotated-types==0.5.0
|
| 136 |
- anyio==3.7.0
|
| 137 |
- argon2-cffi==21.3.0
|
| 138 |
- argon2-cffi-bindings==21.2.0
|
|
|
|
| 194 |
- prompt-toolkit==3.0.38
|
| 195 |
- psutil==5.9.5
|
| 196 |
- ptyprocess==0.7.0
|
| 197 |
+
- pydantic==2.0.3
|
| 198 |
+
- pydantic-core==2.3.0
|
| 199 |
- pygments==2.15.1
|
| 200 |
- pyparsing==3.1.0
|
| 201 |
- pyrsistent==0.19.3
|
| 202 |
- pytest==7.4.0
|
| 203 |
- python-dateutil==2.8.2
|
| 204 |
+
- python-dotenv==0.21.1
|
| 205 |
- pytz==2023.3
|
| 206 |
- pyzmq==25.1.0
|
| 207 |
- qtconsole==5.4.3
|
preprocess/.env_paths
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
vcf='/absolute_path_to_your_file/example.vcf'
|
| 2 |
+
samples='/absolute_path_to_your_files/hapmap-ceu-all.lift.vcf'
|
| 3 |
+
conform='/absolute_your_path/conform-gt.24May16.cee.jar'
|
| 4 |
+
beagle='/absolute_your_path/beagle.22Jul22.46e.jar'
|
| 5 |
+
ref='/absolute_your_path/reference'
|
| 6 |
+
maps='/absolute_your_path/maps'
|
| 7 |
+
gb=20
|
preprocess/run_beagle.py
CHANGED
|
@@ -1,8 +1,9 @@
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
import re
|
| 3 |
-
import click
|
| 4 |
import os
|
| 5 |
import glob
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
script for running Beagle 5.4
|
| 8 |
All kind of data for this script like human reference panel, genetic maps
|
|
@@ -45,7 +46,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
|
|
| 45 |
reference: files was downloaded from from Beagle human reference link
|
| 46 |
https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
|
| 47 |
for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
|
| 48 |
-
print('conform ', ref_file)
|
| 49 |
if re.search("chr(\d+)", ref_file):
|
| 50 |
chr_type = (re.search("chr(\d+)", ref_file))[1]
|
| 51 |
elif re.search("chrX", ref_file):
|
|
@@ -58,7 +58,6 @@ def run_conform(conform, vcf_gz_file, ref_folder):
|
|
| 58 |
def ensure_biallelic_ref(ref_dir):
|
| 59 |
for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
|
| 60 |
ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
|
| 61 |
-
print('ensure ', ref_file, ref_biall_path)
|
| 62 |
os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
|
| 63 |
os.system(f'bcftools index {ref_biall_path}.gz')
|
| 64 |
os.remove(ref_file) # remove initial ref file
|
|
@@ -78,25 +77,42 @@ def run_beagle(beagle, gb, map_dir, ref_dir):
|
|
| 78 |
f' out=imputed_{chr_type} map={map_file}')
|
| 79 |
|
| 80 |
|
| 81 |
-
@click.command()
|
| 82 |
-
@click.option('--vcf', help='Path to the target vcf file')
|
| 83 |
-
@click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
|
| 84 |
-
|
| 85 |
-
@click.option('--conform', help='Path to conform .jar file')
|
| 86 |
-
@click.option('--beagle', help='Path to beagle .jar file')
|
| 87 |
-
@click.option('--ref', help='Path to folder with reference genome:'
|
| 88 |
-
|
| 89 |
-
@click.option('--
|
| 90 |
-
|
| 91 |
-
@click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
|
| 92 |
-
def main(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
bgzip_and_index(vcf, samples)
|
| 94 |
if samples:
|
| 95 |
merge(vcf, samples)
|
| 96 |
cleaned_file = clean_and_gzip(vcf, samples) # returned cleaned file in .vcf.gz (gzip) format
|
| 97 |
ensure_biallelic_ref(ref)
|
| 98 |
run_conform(conform, cleaned_file, ref)
|
| 99 |
-
run_beagle(beagle, gb,
|
| 100 |
|
| 101 |
|
| 102 |
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#!/usr/bin/env python
|
| 2 |
import re
|
|
|
|
| 3 |
import os
|
| 4 |
import glob
|
| 5 |
+
from setting import IMPUTATION_SETTINGS
|
| 6 |
+
|
| 7 |
"""
|
| 8 |
script for running Beagle 5.4
|
| 9 |
All kind of data for this script like human reference panel, genetic maps
|
|
|
|
| 46 |
reference: files was downloaded from from Beagle human reference link
|
| 47 |
https://bochet.gcc.biostat.washington.edu/beagle/1000_Genomes_phase3_v5a/b37.vcf/"""
|
| 48 |
for ref_file in glob.glob(f'{ref_folder}/**/chr*.vcf.gz', recursive=True):
|
|
|
|
| 49 |
if re.search("chr(\d+)", ref_file):
|
| 50 |
chr_type = (re.search("chr(\d+)", ref_file))[1]
|
| 51 |
elif re.search("chrX", ref_file):
|
|
|
|
| 58 |
def ensure_biallelic_ref(ref_dir):
|
| 59 |
for ref_file in glob.glob(f'{ref_dir}/chr*.v5a.vcf.gz'):
|
| 60 |
ref_biall_path = os.path.join(ref_dir, f'{ref_file.split("vcf")[0]}biallelic.vcf.gz')
|
|
|
|
| 61 |
os.system(f'bcftools view -m2 -M2 -v snps -Oz -o {ref_biall_path} {ref_file}')
|
| 62 |
os.system(f'bcftools index {ref_biall_path}.gz')
|
| 63 |
os.remove(ref_file) # remove initial ref file
|
|
|
|
| 77 |
f' out=imputed_{chr_type} map={map_file}')
|
| 78 |
|
| 79 |
|
| 80 |
+
# @click.command()
|
| 81 |
+
# @click.option('--vcf', help='Path to the target vcf file')
|
| 82 |
+
# @click.option('--samples', help='Path to VCF with other samples for conform checks, not required if target VCF'
|
| 83 |
+
# 'contains data for at least 20 individuals', required=False)
|
| 84 |
+
# @click.option('--conform', help='Path to conform .jar file')
|
| 85 |
+
# @click.option('--beagle', help='Path to beagle .jar file')
|
| 86 |
+
# @click.option('--ref', help='Path to folder with reference genome:'
|
| 87 |
+
# ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
|
| 88 |
+
# @click.option('--maps', help='Path to folder with PLINK format genetic maps, files are expected to start with'
|
| 89 |
+
# '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
|
| 90 |
+
# @click.option('--gb', help='Number of gigabytes for running beagle', default=10, show_default=True)
|
| 91 |
+
def main():
    """Run the preprocessing and imputation steps in sequence.

    Every input (target VCF, optional extra samples VCF, conform and beagle
    jar paths, reference and genetic-map folders, memory in gigabytes) is
    read from the module-level ``IMPUTATION_SETTINGS`` object rather than
    from command-line options.
    """
    settings = IMPUTATION_SETTINGS
    target_vcf, extra_samples = settings.vcf, settings.samples

    bgzip_and_index(target_vcf, extra_samples)
    # Merging is only needed when a separate samples VCF was configured.
    if extra_samples:
        merge(target_vcf, extra_samples)
    # clean_and_gzip returns the cleaned file in .vcf.gz (gzip) format
    cleaned_vcf = clean_and_gzip(target_vcf, extra_samples)

    ensure_biallelic_ref(settings.ref)
    run_conform(settings.conform, cleaned_vcf, settings.ref)
    run_beagle(settings.beagle, settings.gb, settings.maps, settings.ref)
|
| 106 |
|
| 107 |
|
| 108 |
main()
|
| 109 |
+
"""
|
| 110 |
+
python preprocess/run_beagle.py
|
| 111 |
+
--vcf /Users/alina/Documents/longevity/genomes/antonkulaga.hg37.pickard.annotate_bcf_alldbsnp.vcf
|
| 112 |
+
--samples /Users/alina/Documents/longevity/genomes/hapmap-ceu-all.lift.vcf
|
| 113 |
+
--conform /Users/alina/tools/conform-gt.24May16.cee.jar
|
| 114 |
+
--beagle /Users/alina/tools/beagle.22Jul22.46e.jar
|
| 115 |
+
--ref /Users/alina/progproj/gennet/test_beagle/reference
|
| 116 |
+
--map /Users/alina/progproj/gennet/test_beagle/maps --gb 20
|
| 117 |
+
|
| 118 |
+
"""
|
preprocess/setting.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Settings for the Beagle imputation script, loaded from ``.env_paths``."""
import os
from typing import Optional

# The environment pins pydantic 2.x, where ``from pydantic import BaseSettings``
# raises PydanticImportError (the class moved to the separate
# ``pydantic-settings`` package, which is not installed here).  pydantic 2.x
# bundles the 1.x API under ``pydantic.v1``, which still provides
# ``BaseSettings`` and the ``Config.env_file`` mechanism used below.
# The fallback keeps the module importable on pydantic 1.x installs that do
# not ship the ``pydantic.v1`` namespace.
try:
    from pydantic.v1 import BaseSettings, Field
except ImportError:
    from pydantic import BaseSettings, Field


class EnvBeagle(BaseSettings):
    """Paths and resources needed to run conform-gt and Beagle.

    Values are taken from environment variables or from the ``.env_paths``
    file that sits next to this module.  Every field except ``samples`` is
    required; a missing value makes instantiation fail with a validation
    error.
    """

    vcf: str = Field(description='Path to the target vcf file')
    samples: Optional[str] = Field(default=None,
                                   description='Path to VCF with other samples for conform checks, not required if '
                                               'target VCF contains data for at least 20 individuals')
    conform: str = Field(description='Path to conform .jar file')
    beagle: str = Field(description='Path to beagle .jar file')
    ref: str = Field(description='Path to folder with reference genome:'
                                 ' .vcf.gz files are expected to start with "chr1."..."chr22.", "chrX."')
    maps: str = Field(description='Path to folder with PLINK format genetic maps, files are expected to start with'
                                  '"plink.chr1.", ..."plink.chr22.", "plink.chrX."')
    gb: int = Field(description='Number of gigabytes for running beagle')

    class Config:
        # Resolve the env file relative to this module, not the current
        # working directory, so the script can be started from anywhere.
        env_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), ".env_paths")


# Instantiated at import time: importing this module reads and validates
# the settings immediately.
IMPUTATION_SETTINGS = EnvBeagle()
|