我有一个函数,它是一些较大问题的内在循环。因此,这将被称为数百万次。我已经尝试对其进行优化。但是,由于这是我的第一个数字项目,所以我想知道是否还有其他方法可以提高速度。

cython似乎无济于事。也许numpy已经接近c了。
否则我无法高效编写cython代码。

import numpy as np
import math
import numexpr as ne


par_mu_rho = 0.8
par_alpha_rho = 0.7
# ' the first two are mean of mus and the '
# ' last two are the mean of alphas.'
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000
np.random.seed(123)
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)


### the function to be optimized

def mktout(mean_mu_alpha, errors, par_gamma):
    mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
    mu11 = math.exp(par_gamma) * mu10  # mu with gamma
    mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
    mu21 = math.exp(par_gamma) * mu20
    alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
    alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])

    j_is_larger = (mu10 > mu20)
    #     useneither1 = (mu10 < 1/168)
    threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
    #     useboth1 = (mu21 >= threshold2)
    j_is_smaller = ~j_is_larger
    #     useneither2 = (mu20 < 1/168)
    threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)
    #     useboth2 = (mu11 >= threshold3)
    case1 = j_is_larger * (mu10 < 1 / 168)
    case2 = j_is_larger * (mu21 >= threshold2)
    #     case3 = j_is_larger * (1 - (useneither1 | useboth1))
    case3 = j_is_larger ^ (case1 | case2)
    case4 = j_is_smaller * (mu20 < 1 / 168)
    case5 = j_is_smaller * (mu11 >= threshold3)
    #     case6 = j_is_smaller * (1 - (useneither2 | useboth2))
    case6 = j_is_smaller ^ (case4 | case5)
    t0 = ne.evaluate(
        "case1*168+case2 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * 168 +case5 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3")
    # for some cases, t1 would be 0 anyway, so they are omitted here.
    t1 = ne.evaluate(
        "case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)")
    # t2 = (j_is_larger*useboth1*(t0*alpha2*mu21- alpha2) +
    #       j_is_smaller*useboth2*(t0*alpha2*mu21- alpha2) +
    #       j_is_smaller*(1- (useneither2|useboth2))*(t0*alpha2*mu20 - alpha2)
    #       )
    t2 = 168 - t0 - t1
    p12 = case2 + case5
    p1 = case3 + p12
    p2 = case6 + p12
    return t1.sum()/10000, t2.sum()/10000, p1.sum()/10000, p2.sum()/10000

timeit mktout([-6,-6,-1,-1], errors, -0.7)


在装有2.2GHz i7的旧Mac上。该功能的运行时间约为200µs。

更新:

根据@CodeSurgeon和@ GZ0的建议和代码,我决定使用以下代码

def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        double threshold2, threshold3
        double t0, t1, t2
        double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    p12_sum = 0.0

    for i in range(n) :
        mu10 = errors[i, 0] * exp[0]
    #         mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
    #         mu21 = exp_par_gamma * mu20
    #         alpha1 = errors[i, 2] * exp[2]
    #         alpha2 = errors[i, 3] * exp[3]
    #         j_is_larger = mu10 > mu20
    #         j_is_smaller = not j_is_larger

        if (mu10 >= mu20):
            if (mu10 >= 1/c) :
                mu21 = exp_par_gamma * mu20
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
                if (mu21 >= threshold2):
                    mu11 = exp_par_gamma * mu10
                    t0 =  (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    t1_sum += ((1/threshold2) * alpha1 * mu10 - alpha1)
                    p1_sum += 1
        else :
            if (mu20 >= 1/c) :
                mu11 = exp_par_gamma * mu10
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
                if (mu11 >= threshold3):
                    mu21 = exp_par_gamma * mu20
                    t0 =  (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else :
                    t2_sum += ((1/threshold3) * alpha2 * mu20 - alpha2)
                    p2_sum += 1

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n, p12_sum/n


我的原始代码运行时间为650µs。
代码外科医生的mktoutmktout_if的运行时间约为220µs和120µs。
上述mktout_full运行时间约为68 µs。
我在mktout_full中所做的是优化mktout_if中的if-else逻辑。
也许令人惊讶的是,由Codeurgeon并行化的out_loopmktout_full中的if-else逻辑相结合要慢得多(121ms)。

最佳答案

简要地看一下代码并尝试对其进行cythonize处理,仅将ndarray类型添加到所有参数和变量中并不会显着改变性能。如果您正在为此紧缩的内部循环争取微秒级的功能,那么我将考虑进行以下修改:


很难对这些代码进行cythonize的原因是您的代码是矢量化的。所有操作都通过numpynumexpr进行。虽然这些操作本身都是有效的,但它们都会增加一些python开销(如果您查看cython可以生成的带注释的.html文件,则可以看到这些开销)。
如果您多次调用此函数(根据您的注释显示),则可以通过将mktout设置为cdef函数来节省一些时间。 Python函数调用的开销很大。
轻微,但您可以尝试避免使用python math模块中的任何功能。您可以将其替换为from libc cimport math as cmath,而改为使用cmath.exp
我看到您的mktout函数包含一个python列表mean_mu_alpha。您可以考虑使用cdef class对象替换此参数,然后键入此参数。如果您选择使mktout成为cdef函数,则它可以只是一个struct或double *数组。无论哪种方式,索引到python列表(可以包含需要取消装箱成对应的c型的任意python对象)的索引都将很慢。
这可能是最重要的部分。对于每个对mktout的调用,您正在为许多数组分配内存(对于每个mualphathresholdcaset-p-数组)。然后,您可以在函数末尾(通过python的gc)释放所有这些内存,只是可能在下一次调用时再次使用所有这些空间。如果可以更改mktout的签名,则可以将所有这些数组作为参数传递,以便可以在函数调用之间重用和覆盖内存。对于这种情况,另一种更好的选择是遍历数组并一次执行一个元素的所有计算。
您可以使用cython的prange函数对代码进行多线程处理。在完成所有上述更改之后,我会解决这一问题,并且我会在mktout函数本身之外进行多线程处理。也就是说,您将对mktout进行多线程调用,而不是对mktout本身进行多线程处理。


进行上述更改将需要大量工作,并且您可能必须自己重新实现numpy和numexpr提供的许多功能,以避免每次与python相关的开销。如果有任何不清楚的地方,请告诉我。



更新#1:实施#1,#3和#5点时,我的速度提高了11倍。这是这段代码的样子。我相信,如果放弃def函数,list mean_mu_alpha输入和tuple输出,它的运行速度会更快。注意:与原始代码相比,我在最后一个小数位后得到的结果略有不同,这可能是由于某些我不理解的浮点规则所致。

from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n




更新#2:实现了cdef(#2),消除python对象(#4)和多线程(#6)的想法。单独#2和#4的好处微不足道,但是对于#6来说是必需的,因为无法在OpenMP prange循环中访问GIL。有了多线程功能,您的四核笔记本电脑将获得2.5倍的额外速度提升,相当于比原始代码快27.5倍的代码。我的outer_loop函数并不完全准确,因为它只是一遍又一遍地重新计算相同的结果,但是对于一个测试用例来说应该足够了。完整的代码如下:

from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n

ctypedef struct Vec4:
    double a
    double b
    double c
    double d

def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
    cdef:
        size_t i
        Vec4 mean_vec
        Vec4 out

    mean_vec.a = <double>(mean_mu_alpha[0])
    mean_vec.b = <double>(mean_mu_alpha[1])
    mean_vec.c = <double>(mean_mu_alpha[2])
    mean_vec.d = <double>(mean_mu_alpha[3])

    with nogil:
        for i in prange(n):
            cy_mktout(&out, &mean_vec, errors, par_gamma)
    return out

cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)

        t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
        t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
        t2 = c - t0 - t1

        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12

        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n


我使用的setup.py文件如下(具有所有优化和OpenMP标志):

from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform

libraries = {
    "Linux": [],
    "Windows": [],
}
language = "c"
args = ["-w", "-std=c11", "-O3", "-ffast-math", "-march=native", "-fopenmp"]
link_args = ["-std=c11", "-fopenmp"]

annotate = True
directives = {
    "binding": True,
    "boundscheck": False,
    "wraparound": False,
    "initializedcheck": False,
    "cdivision": True,
    "nonecheck": False,
    "language_level": "3",
    #"c_string_type": "unicode",
    #"c_string_encoding": "utf-8",
}

if __name__ == "__main__":
    system = platform.system()
    libs = libraries[system]
    extensions = []
    ext_modules = []

    #create extensions
    for path, dirs, file_names in os.walk("."):
        for file_name in file_names:
            if file_name.endswith("pyx"):
                ext_path = "{0}/{1}".format(path, file_name)
                ext_name = ext_path \
                    .replace("./", "") \
                    .replace("/", ".") \
                    .replace(".pyx", "")
                ext = Extension(
                    name=ext_name,
                    sources=[ext_path],
                    libraries=libs,
                    language=language,
                    extra_compile_args=args,
                    extra_link_args=link_args,
                    include_dirs = [np.get_include()],
                )
                extensions.append(ext)

    #setup all extensions
    ext_modules = cythonize(
        extensions,
        annotate=annotate,
        compiler_directives=directives,
    )
    setup(ext_modules=ext_modules)

    """
    #immediately remove build directory
    build_dir = "./build"
    if os.path.exists(build_dir):
        shutil.rmtree(build_dir)
    """




更新#3:根据@ GZ0的建议,在许多情况下,代码中的表达式将计算为零并被浪费地计算。我尝试使用以下代码消除这些区域(同时修复了case3case6语句之后):

cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    #compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]

        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)

        if j_is_larger:
            case1 = mu10 < 1 / c
            case2 = mu21 >= threshold2
            case3 = not (case1 | case2)

            t0 = case1*c + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case3 / threshold2
            t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
            t2 = c - t0 - t1

            t1_sum += t1
            t2_sum += t2
            p1_sum += case2 + case3
            p2_sum += case2

        else:
            case4 = mu20 < 1 / c
            case5 = mu11 >= threshold3
            case6 = not (case4 | case5)

            t0 = case4 * c + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
            t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
            t2 = c - t0 - t1

            t1_sum += t1
            t2_sum += t2
            p1_sum += case5
            p2_sum += case5 + case6

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n


对于10000次迭代,当前代码执行如下:

outer_loop: 0.5116949229995953 seconds
outer_loop_if: 0.617649456995423 seconds
mktout: 0.9221872320049442 seconds
mktout_if: 1.430276553001022 seconds
python: 10.116664300003322 seconds


我认为导致结果的条件和分支错误预测的代价出乎意料的是使该函数变慢,但是我希望能有一定帮助清除此问题。

07-24 13:56