
chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12333   132
chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12331   132
chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12332   132
chr1    869773  870132  MSPC_Peak_37508  74.0   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12333   132

CNhs12333   2228319     4.41    CTCF
CNhs12331   6419919     0.0     HES2
CNhs12332   6579994     0.78    ZBTB48
CNhs12333   8817465     0.0     RERE

chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12333   132   4.41
chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  HES2   ENCSR000AKB CNhs12331   132   0.0
chr1    778704  778912  MSPC_Peak_37509  8.43   cell_line   GM12878  CTCF   ENCSR000AKB CNhs12332   132   0.78
chr1    869773  870132  MSPC_Peak_37508  74.0   cell_line   GM12878  RERE   ENCSR000AKB CNhs12333   132   0.0

def read_file(file):
    """Read a whitespace-delimited text file into a list of rows.

    Args:
        file: Path of the text file to read.

    Returns:
        A list with one entry per line of the file; each entry is the list
        of whitespace-separated fields on that line (an empty list for a
        blank line).
    """
    with open(file) as f:
        # The rows must be returned: without it the caller receives None
        # and cannot iterate over the table.
        return [line.split() for line in f]

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed" # 2.7GB text file
outpufile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

file_in = read_file("/home/lside/Desktop/tf_TPM2.csv") # 22.5MB text file

# Index the reference rows once by (column 0, column 3) so that each input
# line needs a single dictionary lookup instead of a scan over the whole
# table. Values are kept in a list, in file order, so that duplicate keys
# still produce one output line per matching reference row.
values_by_key = {}
for j in file_in:
    values_by_key.setdefault((j[0], j[3]), []).append(j[2])

new_line = ""
with open(inputfile, 'r') as infile:
    with open(outpufile, 'w') as outfile:
        for line in infile:
            line = line.split("\t")
            for value in values_by_key.get((line[9], line[7]), ()):
                # Same 12 tab-separated columns as before: the first eleven
                # input fields (newline stripped from the last) plus the
                # matched value.
                new_line = '\t'.join(line[:10] + [line[10].rstrip(), value]) + '\n'
                # Write each row immediately; previously the rows were only
                # accumulated in memory and never reached the output file.
                outfile.write(new_line)



def read_file(filename):
    """Load fields 0, 2 and 3 of every line of a whitespace-delimited file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of ``(field0, field2, field3)`` tuples, one per line, in
        file order.

    Raises:
        IndexError: If a line has fewer than four whitespace-separated
            fields.
    """
    with open(filename) as handle:
        # Split each line once and keep only the three fields the caller uses.
        split_rows = (raw.split() for raw in handle)
        return [(cols[0], cols[2], cols[3]) for cols in split_rows]

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed" # 2.7GB text file
outpufile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

file_in = read_file("/home/lside/Desktop/tf_TPM2.csv") # 22.5MB text file

with open(inputfile, 'r') as infile:
    with open(outpufile, 'w') as outfile:
        for line in infile:
            # Keep the raw text in `line` and put the split fields in a
            # separate name: the output below needs the original string,
            # and a list has no rstrip().
            fields = line.split("\t")
            for e0, e2, e3 in file_in:
                if e0 == fields[9] and e3 == fields[7]:
                    new_line = '{0}\t{1}\n'.format(line.rstrip(), e2)  # just append the column to the entire line
                    outfile.write(new_line)  # dump to file, don't linger around with an ever-growing string

def make_lookup_table(filename):
    """Build a dictionary from a whitespace-delimited file.

    Args:
        filename: Path of the text file to read.

    Returns:
        A dict mapping ``(field0, field3)`` of each line to that line's
        ``field2``. When several lines share a key, the last one wins.

    Raises:
        IndexError: If a line has fewer than four whitespace-separated
            fields.
    """
    table = {}
    with open(filename) as handle:
        for raw in handle:
            cols = raw.split()
            # Read fields in the original order (0, 2, 3) before storing.
            first, value, last = cols[0], cols[2], cols[3]
            table[(first, last)] = value
    return table

inputfile = "/home/lside/Desktop/database_files/Cell_line_final2.bed" # 2.7GB text file
outpufile = "/home/lside/Desktop/database_files/Cell_line_final3.bed"

lookup = make_lookup_table("/home/lside/Desktop/tf_TPM2.csv") # 22.5MB text file

with open(inputfile, 'r') as infile:
    with open(outpufile, 'w') as outfile:
        for line in infile:
            # Keep the raw text in `line` and put the split fields in a
            # separate name: the output below needs the original string,
            # and a list has no rstrip().
            fields = line.split("\t")
            value = lookup.get((fields[9], fields[7]))
            if value is None:
                # No entry for this (column 9, column 7) pair: skip the row
                # instead of aborting the whole run with a KeyError.
                continue
            new_line = '{0}\t{1}\n'.format(line.rstrip(), value)  # just append the column to the entire line
            outfile.write(new_line)  # dump to file, don't linger around with an ever-growing string

关于 Python - 如何并行化 Python 脚本或让它运行得更快,我们在 Stack Overflow 上找到一个类似的问题:https://stackoverflow.com/questions/57280634/

10-12 16:06