我几乎可以完成我的needleman-wunsch实现,但是对于如何处理特定案例的回溯我感到困惑。
这个想法是为了重建序列(最长路径),我们需要重新计算以确定得分所来自的矩阵。我遇到的一个极端情况是,右下角的分数不在匹配矩阵中,而在插入列矩阵中(这意味着结果回溯序列应该有一个插入。
这些序列以a2m格式记录,其中序列中的插入被记录为小写字符。因此,在最终输出中,ZZ
与AAAC
的对齐方式应为AAac
。当我手动回溯时,我最终得到AAAc
,因为我只访问了一次Ic矩阵。 Here是我的白板的图片。如您所见,我有三个黑色箭头和一个绿色箭头,这就是为什么我的回溯给我AAAc
的原因。我应该先计数第一个单元格,然后在位置1,1处停止吗?我不确定如何更改实现此方法的方式。
请注意,此处使用的替换矩阵为BLOSUM62。递归关系是
M(i,j) = max(Ic(i-1,j-1)+subst, M(i-1,j-1)+subst, Ir(i-1,j-1)+subst)
Ic(i,j) = max(Ic(i,j-1)-extend, M(i,j-1)-open, Ir(i,j-1)-double)
Ir(i,j) = max(Ic(i-1,j)-double, M(i-1,j)-open, Ir(i-1,j)-extend)
编辑:这是traceback_col_seq函数重写为更干净。请注意,score_cell现在返回thisM,thisC,thisR而不是其中的最大值。这个版本将比对记为AaAc,仍然有相同的问题,现在还有另一个问题,为什么它会在1,2再次进入Ic。但是,此版本更易读。
def traceback_col_seq(self):
i, j = self.maxI-1, self.maxJ-1
self.traceback = list()
matrixDict = {0:'M',1:'Ic',2:'Ir',3:'M',4:'Ic',5:'Ir',6:'M',7:'Ic',8:'Ir'}
while i > 0 or j > 0:
chars = self.col_seq[j-1] + self.row_seq[i-1]
thisM, thisC, thisR = self.score_cell(i, j, chars)
cell = thisM + thisC + thisR
prevMatrix = matrixDict[cell.index(max(cell))]
print(cell, prevMatrix,i,j)
if prevMatrix == 'M':
i -= 1; j -= 1
self.traceback.append(self.col_seq[j])
elif prevMatrix == 'Ic':
j -= 1
self.traceback.append(self.col_seq[j].lower())
elif prevMatrix == 'Ir':
i -= 1
self.traceback.append('-')
return ''.join(self.traceback[::-1])
这是生成动态编程矩阵并追溯对齐方式的python类。还有一个计分功能,用于检查产生的对齐方式是否正确。
class global_aligner():
def __init__(self, subst, open=12, extend=1, double=3):
self.extend, self.open, self.double, self.subst = extend, open, double, subst
def __call__(self, row_seq, col_seq):
#add alphabet error checking?
score_align(row_seq, col_seq)
return traceback_col_seq()
def init_array(self):
"""initialize three numpy arrays, set values in 1st column and row"""
self.M = zeros((self.maxI, self.maxJ), float)
self.Ic = zeros((self.maxI, self.maxJ), float)
self.Ir = zeros((self.maxI, self.maxJ), float)
for i in xrange(1,self.maxI):
self.M[i][0], self.Ic[i][0], self.Ir[i][0] = \
-float('inf'), -float('inf'), -(self.open+self.extend*(i-1))
for j in xrange(1,self.maxJ):
self.M[0][j], self.Ir[0][j], self.Ic[0][j] = \
-float('inf'), -float('inf'), -(self.open+self.extend*(j-1))
self.Ic[0][0] = self.Ir[0][0] = -float('inf')
def score_cell(self, i, j, chars):
"""score a matrix cell based on the 9 total neighbors (3 each direction)"""
thisM = [self.M[i-1][j-1]+self.subst[chars], self.Ic[i-1][j-1]+ \
self.subst[chars], self.Ir[i-1][j-1]+self.subst[chars]]
thisC = [self.M[i][j-1]-self.open, self.Ic[i][j-1]-self.extend, \
self.Ir[i][j-1]-self.double]
thisR = [self.M[i-1][j]-self.open, self.Ic[i-1][j]-self.double, \
self.Ir[i-1][j]-self.extend]
return max(thisM), max(thisC), max(thisR)
def score_align(self, row_seq, col_seq):
"""build dynamic programming matrices to align two sequences"""
self.row_seq, self.col_seq = list(row_seq), list(col_seq)
self.maxI, self.maxJ = len(self.row_seq)+1, len(self.col_seq)+1
self.init_array() #initialize arrays
for i in xrange(1, self.maxI):
row_char = self.row_seq[i-1]
for j in xrange(1, self.maxJ):
chars = row_char+self.col_seq[j-1]
self.M[i][j], self.Ic[i][j], self.Ir[i][j] = self.score_cell(i, j, chars)
def traceback_col_seq(self):
"""trace back column sequence in matrices in a2m format"""
i, j = self.maxI-1, self.maxJ-1
self.traceback = list()
#find which matrix to start in
#THIS IS WHERE THE PROBLEM LIES I THINK
cell = (self.M[i][j], self.Ic[i][j], self.Ir[i][j])
prevMatrix = cell.index(max(cell))
while i > 1 and j > 1:
if prevMatrix == 0: #M matrix
i -= 1; j -= 1 #step up diagonally
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
diag = self.score_cell(i, j, prevChars) #re-score diagonal cell
prevMatrix = diag.index(max(diag)) #determine which matrix that was
self.traceback.append(self.col_seq[j])
elif prevMatrix == 1: #Ic matrix
j -= 1
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
left = self.score_cell(i, j, prevChars)
prevMatrix = left.index(max(left))
self.traceback.append(self.col_seq[j].lower())
elif prevMatrix == 2: #Ir matrix
i -= 1
prevChars = self.row_seq[i-1]+self.col_seq[j-1]
up = self.score_cell(i, j, prevChars)
prevMatrix = up.index(max(up))
self.traceback.append('-')
for j in xrange(j,0,-1): #hit top of matrix before ending, add chars
self.traceback.append(self.col_seq[j-1])
for i in xrange(i,0,-1): #hit left of matrix before ending, add gaps
self.traceback.append('-')
print(''.join(self.row[::-1]))
return ''.join(self.traceback[::-1])
def score_a2m(self, s1, s2):
"""scores an a2m alignment of two sequences. I believe this function correctly
scores alignments and is used to test my alignments. The value produced by this
function should be the same as the largest value in the bottom right of the three
matrices"""
s1, s2 = list(s1.strip('.')), list(s2.strip('.'))
s1_pos, s2_pos = len(s1)-1, len(s2)-1
score, gap = 0, False
while s1_pos >= 0 and s2_pos >= 0:
if s1[s1_pos].islower() and gap is False:
score -= self.open; s1_pos -= 1; gap = True
elif s1[s1_pos].islower() and gap is True:
score -= self.extend; s1_pos -= 1
elif s2[s2_pos].islower() and gap is False:
score -= self.open; s2_pos -= 1; gap = True
elif s2[s2_pos].islower() and gap is True:
score -= self.extend; s2_pos -= 1
elif s1[s1_pos] == '-' and gap is False:
score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
elif s1[s1_pos] == '-' and gap is True:
score -= self.extend; s1_pos -= 1; s2_pos -= 1
elif s2[s2_pos] == '-' and gap is False:
score -= self.open; s1_pos -= 1; s2_pos -= 1; gap = True
elif s2[s2_pos] == '-' and gap is True:
score -= self.extend; s1_pos -= 1; s2_pos -= 1
elif gap is True:
score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
s1_pos -= 1; s2_pos -= 1; gap = False
else:
score += self.subst[s1[s1_pos].upper() + s2[s2_pos].upper()]
s1_pos -= 1; s2_pos -= 1
if s1_pos >= 0 and gap is True:
score -= self.extend*s1_pos
elif s1_pos >= 0 and gap is False:
score -= self.open+s1_pos*self.extend
if s2_pos >= 0 and gap is True:
score -= self.extend*s2_pos
elif s2_pos >= 0 and gap is False:
score -= self.open+s2_pos*self.extend
return score
test = global_aligner(blosumMatrix)
s1,s2 = 'ZZ','AAAC'
test.score_align(s1, s2)
align = test.traceback_col_seq()
print('This score: ', test.score_a2m(s1,align)
print('Correct score: ', test.score_a2m(s1,'AAac'))
Blosum解析器
def parse_blosum(blosumFile):
blosumMatrix, commentFlag = dict(), False
for line in blosumFile:
if not line.startswith('#') and not commentFlag:
alphabet = line.rstrip().split()
commentFlag = True
elif commentFlag:
line = line.rstrip().split()
thisChar, line = line[0], line[1:]
for x in xrange(len(line)):
alphaChar, thisValue = alphabet[x], line[x]
blosumMatrix[thisChar+alphaChar] = int(thisValue)
return blosumMatrix
最佳答案
def traceback_col_seq(self):
"""
Traces back the column sequence to determine final global alignment.
Recalculates the score using score_cell.
"""
i, j = self.maxI-1, self.maxJ-1
self.traceback = list() #stores final sequence
matrixDict = {0:'M',1:'Ic',2:'Ir'} #mapping between numeric value and matrix
chars = self.col_seq[j-1] + self.row_seq[i-1] #store first characters
thisM, thisC, thisR = self.score_cell(i,j, chars)
cell = max(thisM), max(thisC), max(thisR) #determine where to start
prevMatrix = matrixDict[cell.index(max(cell))] #store where to go first
while i > 0 or j > 0:
#loop until the top left corner of the matrix is reached
if prevMatrix == 'M':
self.traceback.append(self.col_seq[j-1])
i -= 1; j -= 1
prevMatrix = matrixDict[thisM.index(max(thisM))]
elif prevMatrix == 'Ic':
self.traceback.append(self.col_seq[j-1].lower())
j -= 1
prevMatrix = matrixDict[thisC.index(max(thisC))]
elif prevMatrix == 'Ir':
self.traceback.append('-')
i -= 1
prevMatrix = matrixDict[thisR.index(max(thisR))]
chars = self.col_seq[j-1] + self.row_seq[i-1] #store next characters
thisM, thisC, thisR = self.score_cell(i,j, chars) #rescore next cell
return ''.join(self.traceback[::-1])