The pandas cummin/cummax functions seem really slow for my use case, which has many groups. How can I speed them up?

Update

import pandas as pd
import numpy as np

from collections import defaultdict

# Approach 1: build a DataFrame and let pandas groupby().cummax() do the work
def cummax(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').cummax()
    result = result.values
    return result


# Approach 2: groupby().transform with a per-group Python lambda calling cummax
def transform(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').transform(lambda x: x.cummax())
    result = result.values
    return result

# Approach 3: iterate rows with itertuples, keeping a running element-wise max tuple per group in a dict
def itertuples(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = tuple(max(a,b) for (a,b) in zip(d[g][-2], d[g][-1]))
        return d[g][-1]

    for row in df.itertuples(index=True):
        index = row[0]
        result[index] = d_(row[1], row[2:])

    result = np.asarray(result)
    return result

# Approach 4: skip pandas entirely and loop over the raw NumPy arrays, tracking a running np.maximum per group
def numpy(g, v):
    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = np.maximum(d[g][-2], d[g][-1])
        return d[g][-1]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    result = np.asarray(result)
    return result



LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH, 40)

%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)


1 loop, best of 3: 22.5 s per loop
1 loop, best of 3: 18.4 s per loop
1 loop, best of 3: 1.56 s per loop
1 loop, best of 3: 325 ms per loop

Do you have any further suggestions on how I could improve this code?


import pandas as pd
import numpy as np

LENGTH = 100000

df = pd.DataFrame(
    np.random.randint(low=0, high=LENGTH/2, size=(LENGTH,2)),
    columns=['group', 'value'])

df.groupby('group').cummax()

Best answer

We'll use a defaultdict whose default value is -np.inf, because I'll be taking maxima and I want a default that every value is greater than.
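As a quick illustration of the defaultdict behaviour this relies on (a minimal sketch, using only the standard library and NumPy):

from collections import defaultdict
import numpy as np

d = defaultdict(lambda: -np.inf)   # unseen group keys start at -inf
d['g1'] = max(d['g1'], 0.3)        # the first value seen for a group always wins -> 0.3
d['g1'] = max(d['g1'], 0.1)        # the running maximum stays at 0.3
print(d['g1'])                     # 0.3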

Solution

Given an array of groups g and an array of values v to take the cumulative maximum over:

def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

Demonstration
LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)

Accuracy
vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()

True
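Note that pir1 assumes a 1-D value array, while the v in the question has 40 columns. A minimal sketch of the same idea extended to 2-D values (the name pir1_2d and the plain-dict bookkeeping are mine, not part of the original answer):

import numpy as np

def pir1_2d(g, v):
    # per-group running-maximum row; element-wise np.maximum replaces the scalar max
    d = {}
    result = np.empty_like(v, dtype=float)
    for i in range(len(g)):
        prev = d.get(g[i])
        row = v[i] if prev is None else np.maximum(prev, v[i])
        d[g[i]] = row
        result[i] = row
    return result

This keeps the single pass over the rows, much like the numpy function in the question, but avoids growing per-group lists.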



Deep dive

Comparison with Divakar's answer

Headline chart

[timing comparison chart]

Code
I made a few adjustments to Divakar's functions to make them accurate.
%%cython
import numpy as np
from collections import defaultdict

# this is a cythonized version of the next function
def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

def pir2(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

def argsort_unique(idx):
    # Original idea : http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n,dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def div1(groupby, value):
    sidx = np.argsort(groupby,kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = sorted_groupby * mx

    # Shift, then take the accumulated max along the value col.
    # The shifts keep the cumulative max contained within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

def div2(groupby, value):
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    # factorize groups to integers
    sorted_groupby = np.append(
        0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = (sorted_groupby - sorted_groupby.min()) * mx

    # Shift, then take the accumulated max along the value col.
    # The shifts keep the cumulative max contained within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]
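To make the shift trick in div1/div2 concrete, here is a tiny worked example (the numbers are made up for illustration). Each group is offset by a multiple of mx, which is strictly larger than every value, so a single np.maximum.accumulate pass can never carry a maximum across group boundaries; subtracting the offsets afterwards recovers the per-group cumulative maxima. The factorization step in div2 is what makes the same trick available for float or string group labels, since the offsets have to be built from small non-negative integers.

import numpy as np

groupby = np.array([0, 1, 0, 1, 0])
value = np.array([0.2, 0.9, 0.1, 0.5, 0.7])

# stable sort by group: all of group 0 first, then group 1, original order kept
sidx = np.argsort(groupby, kind='mergesort')
sorted_groupby, sorted_value = groupby[sidx], value[sidx]   # [0 0 0 1 1], [0.2 0.1 0.7 0.9 0.5]

mx = sorted_value.max() + 1        # 1.9 -- strictly larger than any value
shifts = sorted_groupby * mx       # [0.  0.  0.  1.9 1.9]

# after shifting, each group occupies its own disjoint value range,
# so one global running maximum cannot leak from one group into the next
group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts

# undo the sort; this is the per-group cumulative max in original row order
print(group_cummaxed[np.argsort(sidx)])    # ~[0.2 0.9 0.2 0.9 0.7], up to float rounding

# for float or string labels, map them to integer codes first (what div2 does);
# np.unique(..., return_inverse=True) is one alternative way to get such codes
labels = np.array(['b', 'a', 'b', 'a', 'b'])
_, codes = np.unique(labels, return_inverse=True)           # codes == [1 0 1 0 1]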

Notes:
  • The groups in Divakar's solution needed to be factorized (mapped to integer codes) in order to generalize it

  • Accuracy

    Integer groups
    Over integer-based groups, div1 and div2 produce identical (correct) results
    np.isclose(div1(g, v), pir1(g, v)).all()
    
    True
    
    np.isclose(div2(g, v), pir1(g, v)).all()
    
    True
    

    General groups
    Over string- and float-based groups, div1 becomes inaccurate, but this is easily fixed; div2 handles it via the factorization step
    g = g / 1000000
    
    np.isclose(div1(g, v), pir1(g, v)).all()
    
    False
    
    np.isclose(div2(g, v), pir1(g, v)).all()
    
    True
    

    Timing tests
    results = pd.DataFrame(
        index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
        columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))
    
    size_map = dict(Large=100, Medium=10, Small=1)
    
    from timeit import timeit
    
    for i in results.index:
        for j in results.columns:
            # .loc assignment instead of DataFrame.set_value, which has since been removed from pandas
            results.loc[i, j] = timeit(
                '{}(g // size_map[j], v)'.format(i),
                'from __main__ import {}, size_map, g, v, j'.format(i),
                number=100
            )
    
    results
    

    [table of timing results]
    results.T.plot.barh()
    

    [grouped bar chart of the timing results]
