提取细菌基因组ORF思路:

1.通过FNA文件得到细菌基因组序列

2.分正负链和三个相位共6种情况统计ORF

3.写入文件

转载请保留出处!

统计细菌基因组ORF

贴上Python代码(版本:3.6)

 # -*- coding: utf-8 -*-
"""
Created on Thu Dec 14 13:19:00 2017 @author: zxzhu
""" import re
def N2M(sequence): #正负链转换
hash = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C','N':'N'}
sequence = ''.join([hash[i] for i in sequence])
return sequence[::-1] def translate(seq): #将序列转换为起始,终止,其他密码子
pa1 = re.compile(r'TAA|TAG|TGA')
after_trans = ''
for i in range(0,len(seq),3):
if seq[i:i+3]=='ATG':
after_trans+='I'
elif pa1.match(seq[i:i+3]):
after_trans+='T'
else:
after_trans+='O'
return after_trans def get_orf(seq,length=90):
pa2 = re.compile(r'I[IO]+?T') #匹配模式:起始1非终止1~N终止1
trans_seq = translate(seq)
m = pa2.finditer(trans_seq) #所有匹配结果的迭代
index = []
orf = []
for i in m:
index.append(i.span()) #序列起始,终止位置
for i in index:
orf_start = i[0]*3
orf_end = i[1]*3
#print(orf_start,orf_end)
if orf_end - orf_start >= length: #不小于90bp
orf.append(seq[orf_start:orf_end])
return orf def Seq2AA(sequence,hash): #翻译为AA序列
AA=''
for i in range(0, len(sequence) - 3, 3):
AA += hash[sequence[i:i + 3]]
return AA def main(fna,length=90):
fn = open(fna)
pa = re.compile(r'\s+')
hash_seq = {} # CDS hash,CDS2sequence
result1 = open('orf_seq.txt','w')
result2 = open('orf_AA.txt','w')
start = [0,1,2] #相位
strain = '+-' #正负链
hash_AA = {} # AA hash,sequence2AA
with open('AA.txt', 'r') as f: #AA.txt 为密码子表
for line in f:
line = line.strip()
if line:
line = pa.split(line)
hash_AA[line[0]] = line[1] #AA hash for line in fn: #获取序列
line = line.strip()
if line.startswith('>'):
A = pa.split(line)[0].replace('>', '')
hash_seq[A] = ''
else:
hash_seq[A] += line for key in hash_seq.keys(): #分+-链,3个相位统计ORF
seq = hash_seq[key]
for r in strain:
if r == '-':
seq = N2M(seq)
for s in start:
seq = seq[s:]
#trans_seq = translate(seq)
orf = get_orf(seq)
for i in orf:
if 'N' not in i: #去除N
AA =Seq2AA(i,hash_AA)
result1.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+i+'\n')
result2.write('>'+key+'\t'+r+'\t'+str(s)+'\n'+AA+'\n')
fn.close()
result1.close()
result2.close() fna = 'GCA_000160075.2_ASM16007v2_genomic.fna'
main(fna)

NCBI可以找ORF,很方便。码一下:ORFfinder

05-06 03:13