-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathformat_vcf_for_CharGer.py
91 lines (70 loc) · 2.71 KB
/
format_vcf_for_CharGer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
#title :format_vcf_for_CharGer.py
#author :Fernanda Martins Rodrigues (@fernanda);
#date :20190820
"""
This script takes as input a VEP annotated VCF file and removes INFO fields generated by variant caller pipeline, leaving only VEP annotation fields.
This formatting step is necessary prior to running CharGer, so that it takes the AF from gnomAD into account rather than the AF from the variant calling pipeline.
Input must be a VEP annotated VCF file (.vcf or .vcf.gz accepted)
Usage:
python format_vcf_for_CharGer.py [-h] -i <input VCF file> -O <output directory>
Arguments:
-i, --inputVCF: input VEP annotated VCF file; .vcf or .vcf.gz accepted
-O, --outputDirectory: directory to write output files to
-h, --help: prints usage documentation
"""
import sys
import argparse
import getopt
import gzip
import os
def argument_parser():
    """Parse the command line and make sure the output directory exists.

    Reads the two required options ``-i/--inputVCF`` and
    ``-O/--outputDirectory`` from ``sys.argv``. The output directory is
    created (including missing parents) when it does not exist yet.

    Returns:
        tuple: ``(inputVCF, outputDirectory)``. ``outputDirectory`` always
        ends with ``/`` so a file name can be appended to it directly.

    Raises:
        SystemExit: raised by argparse on invalid arguments, including an
            empty output directory string.
        OSError: if the output directory cannot be created.
    """
    # create parser
    parser = argparse.ArgumentParser(description=__doc__)

    # add arguments
    parser.add_argument("-i", "--inputVCF", required=True, help="input VEP annotated VCF file; .vcf or .vcf.gz accepted")
    parser.add_argument("-O", "--outputDirectory", required=True, help="directory to write output files to")

    args = parser.parse_args()
    inputVCF = args.inputVCF
    outputDirectory = args.outputDirectory

    # An empty string cannot be indexed and, with a '/' appended, would
    # silently point at the filesystem root -- reject it explicitly.
    if not outputDirectory:
        parser.error("output directory (-O) must not be an empty string")

    if not outputDirectory.endswith('/'):
        outputDirectory = outputDirectory + '/'

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(outputDirectory, exist_ok=True)

    return inputVCF, outputDirectory
###############
## MAIN CODE ##
###############
def main():
    """Rewrite the input VCF so that INFO holds only its last entry.

    Header lines (starting with ``#``) are copied unchanged. For every
    variant record the INFO column (8th tab-separated column) is replaced by
    its last ``;``-separated entry; all other columns are written back as
    they were read. Blank lines are skipped.

    The result is written to ``<outputDirectory>/<input stem>.infoFixed.vcf``
    as plain (uncompressed) text, where the stem is the input file name with
    a trailing ``.vcf`` / ``.vcf.gz`` removed.

    NOTE(review): keeping the *last* INFO entry assumes the VEP annotation is
    the final entry of the INFO column -- confirm for the VEP setup in use.

    Raises:
        SystemExit: if the input file cannot be opened or a variant record
            has fewer than 8 columns.
    """
    inputVCF, outputDirectory = argument_parser()

    # Look at the end of the name only: a '.gz' somewhere in a directory
    # name must not make a plain-text VCF be read as gzip.
    is_gzipped = inputVCF.endswith(".gz")
    input_extension = ".vcf.gz" if is_gzipped else ".vcf"

    outFile_suffix = ".infoFixed.vcf"
    inputFile_basename = os.path.basename(inputVCF)
    if inputFile_basename.endswith(input_extension):
        stem = inputFile_basename[:-len(input_extension)]
    else:
        # Unknown extension: still append the suffix so the output name can
        # never equal the input name (which would truncate the input).
        stem = inputFile_basename
    outFile = os.path.join(outputDirectory, stem + outFile_suffix)

    try:
        vcfF = gzip.open(inputVCF, "rt") if is_gzipped else open(inputVCF, "r")
    except IOError as err:
        # Stop here; continuing would only fail later on the unopened file.
        sys.exit("Could not open VCF file '{}': {}".format(inputVCF, err))

    # Start parsing input VCF file; both handles are closed on any exit path.
    with vcfF, open(outFile, "w") as outF:
        for line_number, line in enumerate(vcfF, start=1):
            # print the info lines to output file
            if line.startswith('#'):
                outF.write(line)
                continue
            if not line.strip():
                continue
            try:
                outF.write(_keep_last_info_entry(line))
            except ValueError as err:
                sys.exit("{}, line {}: {}".format(inputVCF, line_number, err))


def _keep_last_info_entry(record):
    """Return *record* with its INFO column reduced to the last ';' entry."""
    columns = record.strip().split("\t")
    if len(columns) < 8:
        raise ValueError(
            "expected at least 8 tab-separated columns, found {}".format(len(columns))
        )
    columns[7] = columns[7].split(";")[-1]
    return '\t'.join(columns) + '\n'
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
## END ##################################