python - 加速 Pandas cummin/cummax

Pandas cummin 和 cummax 函数对于我有很多组的用例来说似乎真的很慢。我怎样才能加快他们的速度？

更新

import pandas as pd
import numpy as np

from collections import defaultdict

def cummax(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').cummax()
    result = result.values
    return result


def transform(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    result = df.groupby('group').transform(lambda x: x.cummax())
    result = result.values
    return result

def itertuples(g, v):
    df1 = pd.DataFrame(g, columns=['group'])
    df2 = pd.DataFrame(v)
    df = pd.concat([df1, df2], axis=1)

    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = tuple(max(a,b) for (a,b) in zip(d[g][-2], d[g][-1]))
        return d[g][-1]

    for row in df.itertuples(index=True):
        index = row[0]
        result[index] = d_(row[1], row[2:])

    result = np.asarray(result)
    return result

def numpy(g, v):
    d = defaultdict(list)

    result = [np.nan] * len(g)

    def d_(g, v):
        d[g].append(v)
        if len(d[g]) > 1:
            d[g][-1] = np.maximum(d[g][-2], d[g][-1])
        return d[g][-1]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    result = np.asarray(result)
    return result



LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH, 40)

%timeit r1 = cummax(g, v)
%timeit r2 = transform(g, v)
%timeit r3 = itertuples(g, v)
%timeit r4 = numpy(g, v)

给

1 loop, best of 3: 22.5 s per loop
1 loop, best of 3: 18.4 s per loop
1 loop, best of 3: 1.56 s per loop
1 loop, best of 3: 325 ms per loop

您对我如何改进代码有任何进一步的建议吗？

旧

import pandas as pd
import numpy as np

LENGTH = 100000

df = pd.DataFrame(
    np.random.randint(low=0, high=LENGTH/2, size=(LENGTH,2)),
    columns=['group', 'value'])

df.groupby('group').cummax()

最佳答案

我们将使用 defaultdict ，其中默认值将是 -np.inf 因为我将采用最大值并且我想要一个所有值都大于的默认值。

解决方案

给定一组组 g 和值以累积最大值 v

def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

示范

LENGTH = 100000
g = np.random.randint(low=0, high=LENGTH/2, size=LENGTH)
v = np.random.rand(LENGTH)

准确性

vm = pd.DataFrame(dict(group=g, value=v)).groupby('group').value.cummax()
vm.eq(pir1(g, v)).all()

True

深潜

与 Divakar 的回答进行比较

标题图表

代码
我对 Divakar 的函数进行了一些调整以使其准确。

%%cython
import numpy as np
from collections import defaultdict

# this is a cythonized version of the next function
def pir1(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

def pir2(g, v):
    d = defaultdict(lambda: -np.inf)
    result = np.empty(len(g))

    def d_(g, v):
        d[g] = max(d[g], v)
        return d[g]

    for i in range(len(g)):
        result[i] = d_(g[i], v[i])

    return result

def argsort_unique(idx):
    # Original idea : http://stackoverflow.com/a/41242285/3293881
    n = idx.size
    sidx = np.empty(n,dtype=int)
    sidx[idx] = np.arange(n)
    return sidx

def div1(groupby, value):
    sidx = np.argsort(groupby,kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = sorted_groupby * mx

    # Shift and get max accumlate along value col.
    # Those shifts helping out in keeping cummulative max within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

def div2(groupby, value):
    sidx = np.argsort(groupby, kind='mergesort')
    sorted_groupby, sorted_value = groupby[sidx], value[sidx]
    # factorize groups to integers
    sorted_groupby = np.append(
        0, sorted_groupby[1:] != sorted_groupby[:-1]).cumsum()

    # Get shifts to be used for shifting each group
    mx = sorted_value.max() + 1
    shifts = (sorted_groupby - sorted_groupby.min()) * mx

    # Shift and get max accumlate along value col.
    # Those shifts helping out in keeping cummulative max within each group.
    group_cummaxed = np.maximum.accumulate(shifts + sorted_value) - shifts
    return group_cummaxed[argsort_unique(sidx)]

注释:

有必要对 Divakar 解中的组进行因式分解以对其进行推广

准确性

整数组
在基于整数的组上，div1 和 div2 产生相同的结果

np.isclose(div1(g, v), pir1(g, v)).all()

True

np.isclose(div2(g, v), pir1(g, v)).all()

True

一般组
基于字符串和浮点数的组 div1 变得不准确但很容易修复

g = g / 1000000

np.isclose(div1(g, v), pir1(g, v)).all()

False

np.isclose(div2(g, v), pir1(g, v)).all()

True

时间测试

results = pd.DataFrame(
    index=pd.Index(['pir1', 'pir2', 'div1', 'div2'], name='method'),
    columns=pd.Index(['Large', 'Medium', 'Small'], name='group size'))

size_map = dict(Large=100, Medium=10, Small=1)

from timeit import timeit

for i in results.index:
    for j in results.columns:
        results.set_value(
            i, j,
            timeit(
                '{}(g // size_map[j], v)'.format(i),
                'from __main__ import {}, size_map, g, v, j'.format(i),
                number=100
            )
        )

results

results.T.plot.barh()