python - 使用numpy.genfromtxt进行过滤

我有一个文件，只需要将某些值读入数组即可。该文件按指定TIMESTEP值的行划分。我需要文件中最高TIMESTEP之后的数据部分。

这些文件包含200,000多行，而且我既不知道任何给定文件中所需部分从哪一行开始，也不知道最大的TIMESTEP值是多少。

我的设想是：如果能找到最大TIMESTEP所在的行号，就可以从该行开始导入。所有这些TIMESTEP行均以空格字符开头。任何关于如何实现的想法都会有所帮助。

样本文件

 headerline 1 to skip
 headerline 2 to skip
 headerline 3 to skip
 TIMESTEP =    0.00000000
0,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
1,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
2,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
2,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
 TIMESTEP =   0.119999997
0,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
1,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
2,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
3,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
 TIMESTEP =    3.00000000
0,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
1,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
1,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0
2,    1.0,   1.0,    1.0,   1.0,      1.0,   1.0

基本代码

import numpy as np

# Asker's first attempt: read the whole file in a single genfromtxt call.
with open('myfile.txt') as f_in:
  # skip_header=3 drops the three header lines. comments=" " makes genfromtxt
  # discard everything from the first space on each line, which blanks out the
  # " TIMESTEP = ..." rows (they start with a space).
  # NOTE(review): the data rows also contain spaces after each comma, so they
  # are presumably truncated as well, and no delimiter=',' is given -- this
  # call does not isolate the block that follows the highest TIMESTEP.
  data = np.genfromtxt(f_in, skip_header=3, comments=" ")

最佳答案

您可以使用自定义iterator。

这是一个工作示例：

from numpy import genfromtxt

class Iter(object):
    """Iterator over a TIMESTEP-delimited file, yielding ``(timestep, data)``.

    The wrapped file object is expected to contain marker lines of the form
    ``TIMESTEP = <float>``, each followed by comma-separated data rows.
    Every iteration step parses the rows between two markers with
    ``genfromtxt`` and returns them together with the timestep value of the
    marker that introduced them.

    Works with both the Python 2 (``next``) and Python 3 (``__next__``)
    iterator protocols.
    """

    def __init__(self, fd):
        """Wrap the open file object *fd* and skip its header.

        Every line before the first TIMESTEP marker is consumed and
        discarded, so the first iteration step starts on data rows.
        """
        self.__fd = fd
        self.__timestep = None        # timestep of the block just read
        self.__next_timestep = None   # timestep of the block about to be read
        self.__finish = False         # set once the file is exhausted
        # Skip the header: drain lines up to (and including) the first marker.
        for _ in self.to_next_timestep():
            pass

    def to_next_timestep(self):
        """Yield raw lines until the next TIMESTEP marker or end of file.

        This is a generator. When it stops, the previously pending timestep
        becomes the current one. If it stopped on a marker line, the value
        parsed from that line becomes the new pending timestep; if it
        stopped because the file ran out, the iterator is flagged as
        finished.
        """
        for line in self.__fd:
            if 'TIMESTEP' in line:
                self.__timestep = self.__next_timestep
                # Marker looks like " TIMESTEP =    0.119999997"; float()
                # tolerates the surrounding whitespace and newline.
                self.__next_timestep = float(line.split('=')[1])
                return
            yield line
        # End of file: the pending timestep belongs to the final block.
        self.__timestep = self.__next_timestep
        self.__finish = True

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return ``(timestep, data)`` for the next block.

        ``data`` is whatever ``genfromtxt`` produces from the block's rows
        with ``delimiter=','``.

        Raises:
            StopIteration: once the underlying file has been exhausted.
        """
        if self.__finish:
            raise StopIteration
        # genfromtxt drains the generator, which updates self.__timestep as
        # a side effect before we read it on the next line.
        data = genfromtxt(self.to_next_timestep(), delimiter=',')
        return self.__timestep, data

    # Python 2 looks up ``next`` instead of ``__next__``; keep both names.
    next = __next__

# Walk the file one TIMESTEP block at a time and print each block.
with open('myfile.txt') as fd:
    # Iterate the Iter instance directly instead of binding it to the name
    # `iter`, which would shadow the builtin of the same name.
    for timestep, data in Iter(fd):
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3 as the original `print timestep, data` statement.
        print("%s %s" % (timestep, data))  # data can be selected upon highest timestep