我必须先整理一条数据才能对其进行分析。
数据(样本):
3.30.67.10 ['2i69A','1sfkA','1sfkB','1sfkH','2hcnA','2hcsA','2hfzA','2of6A','2qeqA','2qeqB','2wa1A','2wa1B ,'2wa2A','2wa2B','4r05A','4r8rA','4r8sA','1pjwA','2m0sA','4uifA','4o6bA','2vbcA','3gczA','1ztxE', '3c5xC','4ctjA','4ctkA','3u1iA','3u1iB','3u1jA','3u1jB','3c6eA','3j42A','3l6pA','3lkwA','4al8C','4gsxA ,'4gt0A','4oigA','3j8dB','1p58A','2m9pA','2m9qA','3uzvA','1uzgA','3p8zA','3p97A','3uzeC','3uzeD', '3vttA','2bhrA','2bmfA','2fomA','2fomB','4m9fA','4m9iA','4m9kA','4m9mA','4m9tA','2jlqA','2jlrA','2jlsA ,'2jluA','2jlvA','2jlwA','2jlxA','2jlyA','2jlzA','2whxA','2whxC','2wzqA','2wzqC','3uypB','1yzoA', '1k4rA','4alaC','1z66A','2gg1A','3ixyA','2hg0A','2v6iA','2v6jA','4r8tA','1yksA','1ymfA','3evaA','3evbA ,'3evcA','3evdA','3eveA','3evfA','1p58D','2b6bA','1s6nA','1l9kA','1oamA','1oanA','1ok8A','1okeA', '1r6aA','1r6rA','1thdA','2p1dA','3j8dG','3j8dH','3zkoA','4tplA','4uifB','4ut6A','4ut 9A','4utaA','4utbA','4utcA','3g7tA','3j05A','4cauA','4cbfA','4cbfB','3j35A','1tc7C','2fp7A','2fp7B' ,'2g05D','2ggvA','2ggvB','2ijoA','2ijoB','2p5pA','2yolA','3e90A','3e90B','3e90C','4r8tB','4c2iA',' 2oxtA','3egpA','3ircA','4ffyA','4ffzA','4l5fE','2jqmA','2jv6A','4am0A','4am0Q','4am0R','4fg0A','4bz1A' ,'4bz2A','2p3qA','2xbmA','3c5xA','2oy0A','3i50E','3iywA','3j0bA','3lkzA','4o6cA','4o6dA','4oieA',' 4oiiA','3c6dA','2r6pA','2p3lA','2p3oA','2p40A','2p41A','1urzA','2px2A','2px4A','2px5A','2px8A','2pxaA' ,'2pxcA','2v8oA','2wv9A','3c6dD','3c6eC','3j42D','2z83A','3p54A','4hdgA','4hdhA','4k6mA','4mtpA',' 4mtpD','2jsfA','2r69A','3c6rA','3c6rD','3ixyD','3iyaA','3iyaD','3ixxA','2r29A','3evgA','1df9A','2qidA' ,'3j27A','3j27B','3j2pA','3j2pB','4uihA','3uzqB','1befA','1tg8A','1tgeA','3ixxD','5a1zA','1n6gA',' 1na4A','1svbA','4azxA','4azxD','4b03A','4b03D','4c2iB','4cctA','4cctD','2h0pA','3uajB','3uc0 A','3uc0B','3we1A','2j7uA','2j7wA','3j6sA','3j6sB','3j6tA','3j6tB','3j6uA','3j6uB','3vwsA','4c11A' ,“ 4hhjA”,“ 4v0qA”,“ 4v0rA”]
如您所见,某些条目的前4个字符相同(例如“1sfk”)。如果它们共享前4个字符,则表示它们属于同一个结构。我需要为每个完整的蛋白质代码(5个字符,例如1sfkA或1sfkB)存储其唯一的UniProt代码(在PDBSum数据库中找到),并将其归到对应的4字符代码之下。
为此,我编写了以下代码片段:
# For every domain, map each 4-character PDB prefix to the third
# whitespace-separated field of the PDBSum lines that contain both the full
# PDB code and the text "SWS_ID".
for domain in dDomainSeqSum.keys():# CHANGE TO COMPRESS FILE
    dDomainSeqSumSWS[domain]={}
    for pdb in dDomainSeqSum[domain]:#add sws of a pdb in a variable and later add that variable to the domain thing
        pdb1 = list(pdb)#split is not working
        # First four characters of the full code, used as the grouping key.
        pdb2 = pdb1[0]+pdb1[1]+pdb1[2]+pdb1[3]
        # NOTE(review): this assignment runs once per full code, so codes that
        # share the same first four characters each reset the list again.
        dDomainSeqSumSWS[domain][pdb2]=[]
        for i in range(len(PDBSum)): #make pdb3 search and then compare to the pdb stored
            if pdb in PDBSum[i]:
                if "SWS_ID" in PDBSum[i]:
                    line = PDBSum[i].split()
                    # NOTE(review): pdb2 is only ever stored as a key of
                    # dDomainSeqSumSWS[domain]; this test looks at the outer
                    # dict, so it presumably always takes the assignment
                    # branch (unless a domain key equals pdb2) and replaces
                    # the list instead of appending to it.
                    if pdb2 not in dDomainSeqSumSWS:
                        dDomainSeqSumSWS[domain][pdb2]=[line[2]]
                    else:
                        dDomainSeqSumSWS[domain][pdb2].append(line[2])
运行这段代码后,这是我得到的结果:
{'3.30.67.10':{'4c2i':['G3F5K5'],'2p3l':['Q9WLZ5'],'4uta':['Q68Y26'],'4utc':['Q68Y26'],'4utb ':['Q68Y26'],'1urz':['Q80E47'],'3l6p':['P17763'],'1tge':['P27914'],'3evb':['P03314'],'2vbc ':['Q2TN89'],'3eva':['P03314'],'3evf':['P03314'],'3evg':['P29991'],'3evd':['P03314'],'3eve ':['P03314'],'2p1d':['P12823'],'3j42':['Q3BCY5'],'2jlx':['Q2YHF0'],'2jly':['Q2YHF0'],'2jlz ':['Q2YHF0'],'2jlu':['Q2YHF0'],'2jlv':['Q2YHF0'],'2jlw':['Q2YHF0'],'1oke':['P12823'],'2jlq ':['Q2YHF0'],'2jlr':['Q2YHF0'],'2jls':['Q2YHF0'],'2wv9':['P05769'],'2z83':['P27395'],'4hdh ':['P27395'],'2hcn':['P14335'],'2oxt':['A0EKU1'],'1tg8':['P27914'],'4hdg':['P27395'],'4ut9 ':['Q68Y26'],'3e90':['P06935'],'4am0':['Q58HT7'],'4ut6':['Q68Y26'],'1ok8':['P12823'],'4ffy ':['Q9J7C6'],'4ffz':['Q88640'],'4b03':['G3F5K5'],'2m9p':['P14337'],'2m9q':['P14337'],'4fg0 ':['P09732'],'4azx':['G3F5K5'],'2hcs':['P14335'],'4hhj':['Q6DLV0'],'4mtp':['P27395'],'3j8 d':['P12823'],'3uc0':['P09866'],'4l5f':['Q8BE40'],'4m9t':['Q91H74'],'4m9k':['Q91H74'],' 4m9i':['Q91H74'],'2of6':['P14335'],'2px5':['P05769'],'4m9m':['Q91H74'],'4m9f':['Q91H74'],' 3j0b':['Q9Q6P4'],'5a1z':['G9FRP5'],'4r8r':['C1KBQ3'],'4r8s':['C1KBQ3'],'1l9k':['P12823'],' 1svb':['P14336'],'4r8t':['O90417'],'2hfz':['P14335'],'2v6j':['Q32ZD5'],'3zko':['P12823'],' 2ggv':['P06935'],'2v6i':['Q32ZD5'],'3u1j':['Q5UB51'],'3u1i':['Q5UB51'],'4oig':['P17763'],' 4ala':['Q7TGD1'],'3p97':['P27915'],'3p8z':['P27915'],'2pxc':['P05769'],'4gsx':['P17763'],' 2pxa':['P05769'],'4oii':['Q9Q6P4'],'1bef':['Q9Q4T1'],'3evc':['P03314'],'3j05':['Q689G3'],' 3egp':['Q9J7C6'],'2yol':['P06935'],'2v8o':['P05769'],'4r05':['C1KBQ3'],'1n6g':['P14336'],' 3lkz':['Q9Q6P4'],'4cau':['Q689G3'],'2px2':['P05769'],'2gg1':['P29837'],'4al8':['P17763'],' 2px4':['P05769'],'3lkw':['P17763'],'2r69':['P18356'],'2r6p':['Q66394'],'3j6s':['Q6DLV0'],' 3j6 u':['Q6DLV0'],'1sfk':['P14335'],'1z66':['P29837'],'3uaj':['P09866'],'3iyw':['Q9Q6P4'],' 
3j35':['E7FLK7'],'4k6m':['P27395'],'2fom':['Q91H74'],'3vws':['Q6DLV0'],'3vtt':['P27915'],' 3iya':['P18356'],'2p5p':['P06935'],'2hg0':['Q91R00'],'2jqm':['Q6DV88'],'2p41':['Q9WLZ5'],' 4v0r':['Q6DLV0'],'4tpl':['Q5SBG8'],'1yks':['P03314'],'4bz1':['Q7TGC7'],'4bz2':['Q7TGC7'],' 1thd':['P12823'],'2m0s':['Q9YKL3'],'4cbf':['E0WXI2'],'3ixx':['Q3I100'],'3ixy':['P18356'],' 2px8':['P05769'],'1ztx':['Q91KZ4'],'2fp7':['P06935'],'4uif':['E0WXJ3'],'4uih':['P14340'],' 3uzq':['P27909'],'4c11':['Q6DLV0'],'1p58':['Q9WDA7'],'4cct':['G3F5K5'],'2r29':['P29991'],' 2p40':['Q9WLZ5'],'1na4':['P14336'],'1ymf':['P03314'],'3uzv':['P07564'],'1r6r':['P12823'],' 3c5x':['Q6H1E5'],'2xbm':['C0LMU5'],'3g7t':['Q689G3'],'2g05':['P06935'],'1r6a':['P12823'],' 3uze':['P27915'],'2whx':['Q2YHF0'],'3p54':['P27395'],'1k4r':['C3V005'],'3i50':['Q9Q6P4'],' 3c6 d':['Q3BCY5'],'3c6e':['Q3BCY5'],'4o6c':['Q9Q6P4'],'4o6b':['P29990'],'4o6d':['Q9Q6P4'],' 2ijo':['P06935'],'2wa2':['Q8QL64'],'1tc7':['P06935'],'3j27':['P14340'],'2wa1':['Q8QL64'],' 3gcz':['Q7T918'],'2p3q':['Q20IJ2'],'2jsf':['P18356'],'3we1':['P09866'],'1df9':['P14340'],' 4gt0':['P17763'],'3c6r':['P18356'],'3j2p':['P14340'],'3irc':['Q9J7C6'],'2oy0':['Q9Q6P4'],' 3uyp':['Q2YHF0'],'2qeq':['P14335'],'2jv6':['Q6DV88'],'2qid':['P14340'],'1oan':['P12823'],' 1oam':['P12823'],'2b6b':['Q9WDA7'],'2bmf':['Q91H74'],'2i69':['Q80QJ9'],'2j7w':['Q6DLV0'],' 4v0q':['Q6DLV0'],'1yzo':['P29838'],'1s6n':['Q913C7'],'4oie':['Q9Q6P4'],'2bhr':['Q91H74'],' 3j6t':['Q6DLV0'],'2p3o':['Q9WLZ5'],'4ctk':['A9LIE0'],'4ctj':['A9LIE0'],'2j7u':['Q6DLV0'],' 1pjw':['Q9J0X3'],'1uzg':['P27915'],'2h0p':['P09866'],'2wzq':['Q2YHF0']}}
如您所见,1sfk被覆盖,它应该具有3个单独的UniProt代码
最佳答案
您在两个地方遇到问题(另一个答案也表明了这一点)-
首先是把dDomainSeqSumSWS[domain][pdb2]重置为空列表的地方,即 dDomainSeqSumSWS[domain][pdb2]=[] 这一行:对于共享同一4字符前缀的每个代码,它都会清空之前已经收集到的结果。
第二个问题是条件 if pdb2 not in dDomainSeqSumSWS: 始终为True(因此每次都会执行赋值分支并覆盖已有列表),因为pdb2是dDomainSeqSumSWS[domain]字典中的键,而不是dDomainSeqSumSWS字典中的键。
实际上,上述两处您都不需要,而应该使用专门为这种场景设计的 dict.setdefault。示例:
# For every domain, group under each 4-character PDB prefix the third
# whitespace-separated field of every PDBSum line that contains both the
# full PDB code and the text "SWS_ID".
for domain in dDomainSeqSum:  # CHANGE TO COMPRESS FILE
    dDomainSeqSumSWS[domain] = {}
    for pdb in dDomainSeqSum[domain]:
        # A string can be sliced directly; no need to convert it to a list.
        pdb2 = pdb[:4]
        # setdefault creates the list only the first time this prefix is
        # seen and otherwise returns the existing one, so codes sharing a
        # prefix (e.g. 1sfkA / 1sfkB / 1sfkH) accumulate into the same list
        # instead of resetting it.
        codes = dDomainSeqSumSWS[domain].setdefault(pdb2, [])
        for entry in PDBSum:
            if pdb in entry and "SWS_ID" in entry:
                line = entry.split()
                codes.append(line[2])
dict.setdefault 将键作为第一个参数,将默认值作为第二个参数:如果字典中不存在该键,则把该键设置为默认值并返回该默认值;否则,如果键已存在于字典中,则只返回该键当前对应的值。另外,我修改了为了按索引取字符而把 pdb 转换成 list() 的那一行(字符串本身就可以按索引访问),可以直接使用切片从字符串中取出前四个字符。