I’m trying to perform a GO annotation using the SIMAP database, which is
Blast2GO-annotated. Everything works, except that I have problems when I try
to find the accession number in the file where entry numbers are
associated with their GO terms. The problem is that the script does not find
the number in the input file even though it really is there. I tried several things
without good results (re.match, inserting into a list and then extracting the element, etc.).
The file where the GO terms are associated with entry numbers has this structure (accession number, GO term, Blast2GO score):
1f0ba1d119f52ff28e907d2b5ea450db GO:0007154 79
1f0ba1d119f52ff28e907d2b5ea450db GO:0005605 99
The Python code:
import re
from Bio.Blast import NCBIXML
from Bio import SeqIO
# NOTE(review): indentation was lost in this listing and has been
# reconstructed; the nesting of the two statements that follow the inner
# loops (the newline write and hits.append) is an assumption -- confirm.

# Open the query FASTA file, the BLAST XML results, the output report and the
# table that associates accession numbers with GO terms (hard-coded paths).
input_file = open('/home/fpiston/Desktop/test_go/test2.fasta', 'rU')
result_handle = open('/home/fpiston/Desktop/test_go/test2.xml', 'rU')
save_file = open('/home/fpiston/Desktop/test_go/test2.out', 'w')
fh = open('/home/fpiston/Desktop/test_go/Os_Bd_Ta_blat2go_fake', 'rU')
# Index the query sequences by identifier; the keys are used at the end to
# work out which queries produced no BLAST alignment.
q_dict = SeqIO.to_dict(SeqIO.parse(input_file, "fasta"))
blast_records = NCBIXML.parse(result_handle)
# First word of the query title of every record that has alignments.
hits = []
for blast_record in blast_records:
    if blast_record.alignments:
        list = (blast_record.query).split()
        # NOTE(review): the '|' characters are not escaped, so this pattern is
        # an alternation and '\w*' can match the empty string; the test
        # therefore succeeds for any input -- confirm the intended pattern.
        if re.match('ENA|\w*|\w*', list[0]) != None:
            # Write the second '|'-separated field of the query identifier.
            list2 = list[0].split("|")
            save_file.write('%s\t' % list2[1])
        else:
            save_file.write('%s\t' % list[0])
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                h = alignment.hit_def
                # NOTE(review): fh is never rewound anywhere in this listing,
                # so this loop only yields lines the first time it runs.
                for l in fh:
                    ls = l.split()  # at this point everything is all right
                    if h in ls:  # here, 'h' is not found in 'fh'
                        print h
                        print 'ok'
                        # Second column of the table line: the GO term.
                        save_file.write('%s\t' % ls[1])
        save_file.write('\n')
        hits.append(blast_record.query.split()[0])
# Queries present in the FASTA file that never appeared with an alignment.
misses =set(q_dict.keys()) - set(hits)
for i in misses:
    list = i.split("|")
    if len(list) > 1:
        save_file.write('%s\t' % list[1])
    else:
        # NOTE(review): this formats the whole list object, not its single
        # element -- confirm that is intended.
        save_file.write('%s\t' % list)
    save_file.write('%s\n' % 'no_match')
save_file.close()
This is the code with martineau's correction (`fh.seek(0)`):
#!/usr/bin/env python
import sys
import re
from Bio.Blast import NCBIXML
from Bio import SeqIO
# Default table associating accession numbers with GO terms. It can be
# overridden with an optional fourth command-line argument.
DEFAULT_GO_FILE = '/home/fpiston/Desktop/test_go/Os_Bd_Ta_blat2go'


def short_query_id(query_id):
    """Return the label written to the report for *query_id*.

    Identifiers containing ``|`` (for example ``ENA|accession|name``) are
    reduced to their second ``|``-separated field; any other identifier is
    returned unchanged.
    """
    fields = query_id.split('|')
    return fields[1] if len(fields) > 1 else query_id


def load_go_table(lines):
    """Return ``(accession, GO term)`` pairs parsed from *lines*, in order.

    Each line holds whitespace-separated columns: accession number, GO term
    and Blast2GO score. Lines with fewer than two columns (for example blank
    lines) are skipped instead of raising ``IndexError``.
    """
    table = []
    for line in lines:
        fields = line.split()
        if len(fields) >= 2:
            table.append((fields[0], fields[1]))
    return table


def go_terms_for_hit(hit, go_table):
    """Return the GO terms whose accession occurs in the hit definition.

    The accession is matched as a substring of *hit*. Terms are returned in
    table order and duplicates are kept.
    """
    return [term for accession, term in go_table if accession in hit]


def main(argv=None):
    """Write a tab-separated GO annotation report for a BLAST XML result.

    *argv* holds, in order: the query sequences in FASTA format, the BLAST
    results in XML format, the name of the output file and, optionally, the
    accession/GO table (defaults to ``DEFAULT_GO_FILE``). When *argv* is
    ``None`` the arguments are taken from ``sys.argv[1:]``.

    Every query with at least one alignment gets one output line holding its
    short id followed, for each HSP, by the hit definition and the GO terms
    of that hit. Queries without alignments are listed with ``no_match``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 3:
        sys.exit('usage: QUERIES_FASTA BLAST_XML OUTPUT_FILE [GO_TABLE]')
    input_file, out_blast_file, output_file = args[:3]
    go_file = args[3] if len(args) > 3 else DEFAULT_GO_FILE

    # Read the GO table once instead of rewinding and rescanning the file for
    # every HSP. Plain 'r' mode is used because 'rU' was removed in
    # Python 3.11.
    with open(go_file) as go_handle:
        go_table = load_go_table(go_handle)
    with open(input_file) as fasta_handle:
        query_ids = set(SeqIO.to_dict(SeqIO.parse(fasta_handle, "fasta")))

    hits = set()
    with open(out_blast_file) as result_handle:
        with open(output_file, 'w') as save_file:
            for blast_record in NCBIXML.parse(result_handle):
                if not blast_record.alignments:
                    continue
                query_id = blast_record.query.split()[0]
                # Each report line starts with a newline, so the file has a
                # leading blank line and no trailing newline, as before.
                save_file.write('\n%s\t' % short_query_id(query_id))
                for alignment in blast_record.alignments:
                    hit = alignment.hit_def
                    terms = go_terms_for_hit(hit, go_table)
                    # The hit and its GO terms are repeated once per HSP,
                    # which keeps the original output layout.
                    for _hsp in alignment.hsps:
                        save_file.write('%s\t' % hit)
                        for term in terms:
                            save_file.write('%s\t' % term)
                hits.add(query_id)
            # Sorted so that the report is reproducible between runs.
            for query_id in sorted(query_ids - hits):
                save_file.write('\n%s\t' % short_query_id(query_id))
                save_file.write('%s' % 'no_match')


if __name__ == '__main__':
    main()
I really have no idea what you’re talking about here, but I noticed that within the outer
`for blast_record in blast_records:` and `for alignment in blast_record.alignments:` loops you have a `for l in fh:` but never rewind the file with a `fh.seek(0)` anywhere, which means it only reads the lines in the file the first time it’s executed — which seems illogical. You could fix this by adding the
`fh.seek(0)` just before the inner loop. Although unnecessary the very first time the inner loop executes, it’s needed all the following times, and doing it one extra time won’t hurt anything.