我有一个函数,它是一些较大问题的内在循环。因此,这将被称为数百万次。我已经尝试对其进行优化。但是,由于这是我的第一个数字项目,所以我想知道是否还有其他方法可以提高速度。
cython似乎无济于事。也许numpy已经接近c了。
否则我无法高效编写cython代码。
import numpy as np
import math
import numexpr as ne
par_mu_rho = 0.8
par_alpha_rho = 0.7
# ' the first two are mean of mus and the '
# ' last two are the mean of alphas.'
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000
np.random.seed(123)
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)
### the function to be optimized
def mktout(mean_mu_alpha, errors, par_gamma):
mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
mu11 = math.exp(par_gamma) * mu10 # mu with gamma
mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
mu21 = math.exp(par_gamma) * mu20
alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])
j_is_larger = (mu10 > mu20)
# useneither1 = (mu10 < 1/168)
threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
# useboth1 = (mu21 >= threshold2)
j_is_smaller = ~j_is_larger
# useneither2 = (mu20 < 1/168)
threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)
# useboth2 = (mu11 >= threshold3)
case1 = j_is_larger * (mu10 < 1 / 168)
case2 = j_is_larger * (mu21 >= threshold2)
# case3 = j_is_larger * (1 - (useneither1 | useboth1))
case3 = j_is_larger ^ (case1 | case2)
case4 = j_is_smaller * (mu20 < 1 / 168)
case5 = j_is_smaller * (mu11 >= threshold3)
# case6 = j_is_smaller * (1 - (useneither2 | useboth2))
case6 = j_is_smaller ^ (case4 | case5)
t0 = ne.evaluate(
"case1*168+case2 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * 168 +case5 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3")
# for some cases, t1 would be 0 anyway, so they are omitted here.
t1 = ne.evaluate(
"case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)")
# t2 = (j_is_larger*useboth1*(t0*alpha2*mu21- alpha2) +
# j_is_smaller*useboth2*(t0*alpha2*mu21- alpha2) +
# j_is_smaller*(1- (useneither2|useboth2))*(t0*alpha2*mu20 - alpha2)
# )
t2 = 168 - t0 - t1
p12 = case2 + case5
p1 = case3 + p12
p2 = case6 + p12
return t1.sum()/10000, t2.sum()/10000, p1.sum()/10000, p2.sum()/10000
timeit mktout([-6,-6,-1,-1], errors, -0.7)
在装有2.2GHz i7的旧Mac上。该功能的运行时间约为200µs。
更新:
根据@CodeSurgeon和@ GZ0的建议和代码,我决定使用以下代码
def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
cdef:
size_t i, n
double[4] exp
double exp_par_gamma
double mu10, mu11, mu20, mu21
double alpha1, alpha2
double threshold2, threshold3
double t0, t1, t2
double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
double c
#compute the exp outside of the loop
n = errors.shape[0]
exp[0] = cmath.exp(<double>mean_mu_alpha[0])
exp[1] = cmath.exp(<double>mean_mu_alpha[1])
exp[2] = cmath.exp(<double>mean_mu_alpha[2])
exp[3] = cmath.exp(<double>mean_mu_alpha[3])
exp_par_gamma = cmath.exp(par_gamma)
c = 168.0
t1_sum = 0.0
t2_sum = 0.0
p1_sum = 0.0
p2_sum = 0.0
p12_sum = 0.0
for i in range(n) :
mu10 = errors[i, 0] * exp[0]
# mu11 = exp_par_gamma * mu10
mu20 = errors[i, 1] * exp[1]
# mu21 = exp_par_gamma * mu20
# alpha1 = errors[i, 2] * exp[2]
# alpha2 = errors[i, 3] * exp[3]
# j_is_larger = mu10 > mu20
# j_is_smaller = not j_is_larger
if (mu10 >= mu20):
if (mu10 >= 1/c) :
mu21 = exp_par_gamma * mu20
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
if (mu21 >= threshold2):
mu11 = exp_par_gamma * mu10
t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
t1 = (t0 * alpha1 * mu11 - alpha1)
t1_sum += t1
t2_sum += c - t0 - t1
p1_sum += 1
p2_sum += 1
p12_sum += 1
else :
t1_sum += ((1/threshold2) * alpha1 * mu10 - alpha1)
p1_sum += 1
else :
if (mu20 >= 1/c) :
mu11 = exp_par_gamma * mu10
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
if (mu11 >= threshold3):
mu21 = exp_par_gamma * mu20
t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
t1 = (t0 * alpha1 * mu11 - alpha1)
t1_sum += t1
t2_sum += c - t0 - t1
p1_sum += 1
p2_sum += 1
p12_sum += 1
else :
t2_sum += ((1/threshold3) * alpha2 * mu20 - alpha2)
p2_sum += 1
return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n, p12_sum/n
我的原始代码运行时间为650µs。
代码外科医生的
mktout
和mktout_if
的运行时间约为220µs和120µs。上述
mktout_full
运行时间约为68 µs。我在
mktout_full
中所做的是优化mktout_if
中的if-else逻辑。也许令人惊讶的是,由Codeurgeon并行化的
out_loop
与mktout_full
中的if-else逻辑相结合要慢得多(121ms)。 最佳答案
简要地看一下代码并尝试对其进行cythonize处理,仅将ndarray类型添加到所有参数和变量中并不会显着改变性能。如果您正在为此紧缩的内部循环争取微秒级的功能,那么我将考虑进行以下修改:
很难对这些代码进行cythonize的原因是您的代码是矢量化的。所有操作都通过numpy
或numexpr
进行。虽然这些操作本身都是有效的,但它们都会增加一些python开销(如果您查看cython可以生成的带注释的.html
文件,则可以看到这些开销)。
如果您多次调用此函数(根据您的注释显示),则可以通过将mktout
设置为cdef
函数来节省一些时间。 Python函数调用的开销很大。
轻微,但您可以尝试避免使用python math
模块中的任何功能。您可以将其替换为from libc cimport math as cmath
,而改为使用cmath.exp
。
我看到您的mktout
函数包含一个python列表mean_mu_alpha
。您可以考虑使用cdef class
对象替换此参数,然后键入此参数。如果您选择使mktout
成为cdef
函数,则它可以只是一个struct或double *
数组。无论哪种方式,索引到python列表(可以包含需要取消装箱成对应的c型的任意python对象)的索引都将很慢。
这可能是最重要的部分。对于每个对mktout
的调用,您正在为许多数组分配内存(对于每个mu
,alpha
,threshold
,case
,t-
和p-
数组)。然后,您可以在函数末尾(通过python的gc)释放所有这些内存,只是可能在下一次调用时再次使用所有这些空间。如果可以更改mktout
的签名,则可以将所有这些数组作为参数传递,以便可以在函数调用之间重用和覆盖内存。对于这种情况,另一种更好的选择是遍历数组并一次执行一个元素的所有计算。
您可以使用cython的prange
函数对代码进行多线程处理。在完成所有上述更改之后,我会解决这一问题,并且我会在mktout
函数本身之外进行多线程处理。也就是说,您将对mktout进行多线程调用,而不是对mktout
本身进行多线程处理。
进行上述更改将需要大量工作,并且您可能必须自己重新实现numpy和numexpr提供的许多功能,以避免每次与python相关的开销。如果有任何不清楚的地方,请告诉我。
更新#1:实施#1,#3和#5点时,我的速度提高了11倍。这是这段代码的样子。我相信,如果放弃def
函数,list mean_mu_alpha
输入和tuple
输出,它的运行速度会更快。注意:与原始代码相比,我在最后一个小数位后得到的结果略有不同,这可能是由于某些我不理解的浮点规则所致。
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
cdef:
size_t i, n
double[4] exp
double exp_par_gamma
double mu10, mu11, mu20, mu21
double alpha1, alpha2
bint j_is_larger, j_is_smaller
double threshold2, threshold3
bint case1, case2, case3, case4, case5, case6
double t0, t1, t2
double p12, p1, p2
double t1_sum, t2_sum, p1_sum, p2_sum
double c
#compute the exp outside of the loop
n = errors.shape[0]
exp[0] = cmath.exp(<double>mean_mu_alpha[0])
exp[1] = cmath.exp(<double>mean_mu_alpha[1])
exp[2] = cmath.exp(<double>mean_mu_alpha[2])
exp[3] = cmath.exp(<double>mean_mu_alpha[3])
exp_par_gamma = cmath.exp(par_gamma)
c = 168.0
t1_sum = 0.0
t2_sum = 0.0
p1_sum = 0.0
p2_sum = 0.0
for i in range(n):
mu10 = errors[i, 0] * exp[0]
mu11 = exp_par_gamma * mu10
mu20 = errors[i, 1] * exp[1]
mu21 = exp_par_gamma * mu20
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
j_is_larger = mu10 > mu20
j_is_smaller = not j_is_larger
threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
case1 = j_is_larger * (mu10 < 1 / c)
case2 = j_is_larger * (mu21 >= threshold2)
case3 = j_is_larger ^ (case1 | case2)
case4 = j_is_smaller * (mu20 < 1 / c)
case5 = j_is_smaller * (mu11 >= threshold3)
case6 = j_is_smaller ^ (case4 | case5)
t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
t2 = c - t0 - t1
p12 = case2 + case5
p1 = case3 + p12
p2 = case6 + p12
t1_sum += t1
t2_sum += t2
p1_sum += p1
p2_sum += p2
return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n
更新#2:实现了
cdef
(#2),消除python对象(#4)和多线程(#6)的想法。单独#2和#4的好处微不足道,但是对于#6来说是必需的,因为无法在OpenMP prange
循环中访问GIL。有了多线程功能,您的四核笔记本电脑将获得2.5倍的额外速度提升,相当于比原始代码快27.5倍的代码。我的outer_loop
函数并不完全准确,因为它只是一遍又一遍地重新计算相同的结果,但是对于一个测试用例来说应该足够了。完整的代码如下:from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange
def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
cdef:
size_t i, n
double[4] exp
double exp_par_gamma
double mu10, mu11, mu20, mu21
double alpha1, alpha2
bint j_is_larger, j_is_smaller
double threshold2, threshold3
bint case1, case2, case3, case4, case5, case6
double t0, t1, t2
double p12, p1, p2
double t1_sum, t2_sum, p1_sum, p2_sum
double c
#compute the exp outside of the loop
n = errors.shape[0]
exp[0] = cmath.exp(<double>mean_mu_alpha[0])
exp[1] = cmath.exp(<double>mean_mu_alpha[1])
exp[2] = cmath.exp(<double>mean_mu_alpha[2])
exp[3] = cmath.exp(<double>mean_mu_alpha[3])
exp_par_gamma = cmath.exp(par_gamma)
c = 168.0
t1_sum = 0.0
t2_sum = 0.0
p1_sum = 0.0
p2_sum = 0.0
for i in range(n):
mu10 = errors[i, 0] * exp[0]
mu11 = exp_par_gamma * mu10
mu20 = errors[i, 1] * exp[1]
mu21 = exp_par_gamma * mu20
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
j_is_larger = mu10 > mu20
j_is_smaller = not j_is_larger
threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
case1 = j_is_larger * (mu10 < 1 / c)
case2 = j_is_larger * (mu21 >= threshold2)
case3 = j_is_larger ^ (case1 | case2)
case4 = j_is_smaller * (mu20 < 1 / c)
case5 = j_is_smaller * (mu11 >= threshold3)
case6 = j_is_smaller ^ (case4 | case5)
t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
t2 = c - t0 - t1
p12 = case2 + case5
p1 = case3 + p12
p2 = case6 + p12
t1_sum += t1
t2_sum += t2
p1_sum += p1
p2_sum += p2
return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n
ctypedef struct Vec4:
double a
double b
double c
double d
def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
cdef:
size_t i
Vec4 mean_vec
Vec4 out
mean_vec.a = <double>(mean_mu_alpha[0])
mean_vec.b = <double>(mean_mu_alpha[1])
mean_vec.c = <double>(mean_mu_alpha[2])
mean_vec.d = <double>(mean_mu_alpha[3])
with nogil:
for i in prange(n):
cy_mktout(&out, &mean_vec, errors, par_gamma)
return out
cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
cdef:
size_t i, n
double[4] exp
double exp_par_gamma
double mu10, mu11, mu20, mu21
double alpha1, alpha2
bint j_is_larger, j_is_smaller
double threshold2, threshold3
bint case1, case2, case3, case4, case5, case6
double t0, t1, t2
double p12, p1, p2
double t1_sum, t2_sum, p1_sum, p2_sum
double c
#compute the exp outside of the loop
n = errors.shape[0]
exp[0] = cmath.exp(mean_mu_alpha.a)
exp[1] = cmath.exp(mean_mu_alpha.b)
exp[2] = cmath.exp(mean_mu_alpha.c)
exp[3] = cmath.exp(mean_mu_alpha.d)
exp_par_gamma = cmath.exp(par_gamma)
c = 168.0
t1_sum = 0.0
t2_sum = 0.0
p1_sum = 0.0
p2_sum = 0.0
for i in range(n):
mu10 = errors[i, 0] * exp[0]
mu11 = exp_par_gamma * mu10
mu20 = errors[i, 1] * exp[1]
mu21 = exp_par_gamma * mu20
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
j_is_larger = mu10 > mu20
j_is_smaller = not j_is_larger
threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
case1 = j_is_larger * (mu10 < 1 / c)
case2 = j_is_larger * (mu21 >= threshold2)
case3 = j_is_larger ^ (case1 | case2)
case4 = j_is_smaller * (mu20 < 1 / c)
case5 = j_is_smaller * (mu11 >= threshold3)
case6 = j_is_smaller ^ (case4 | case5)
t0 = case1*c+case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) +case3 / threshold2 +case4 * c +case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
t1 = case2 * (t0 * alpha1 * mu11 - alpha1) +case3 * (t0 * alpha1 * mu10 - alpha1) +case5 * (t0 * alpha1 * mu11 - alpha1)
t2 = c - t0 - t1
p12 = case2 + case5
p1 = case3 + p12
p2 = case6 + p12
t1_sum += t1
t2_sum += t2
p1_sum += p1
p2_sum += p2
out.a = t1_sum/n
out.b = t2_sum/n
out.c = p1_sum/n
out.d = p2_sum/n
我使用的
setup.py
文件如下(具有所有优化和OpenMP标志):from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform
libraries = {
"Linux": [],
"Windows": [],
}
language = "c"
args = ["-w", "-std=c11", "-O3", "-ffast-math", "-march=native", "-fopenmp"]
link_args = ["-std=c11", "-fopenmp"]
annotate = True
directives = {
"binding": True,
"boundscheck": False,
"wraparound": False,
"initializedcheck": False,
"cdivision": True,
"nonecheck": False,
"language_level": "3",
#"c_string_type": "unicode",
#"c_string_encoding": "utf-8",
}
if __name__ == "__main__":
system = platform.system()
libs = libraries[system]
extensions = []
ext_modules = []
#create extensions
for path, dirs, file_names in os.walk("."):
for file_name in file_names:
if file_name.endswith("pyx"):
ext_path = "{0}/{1}".format(path, file_name)
ext_name = ext_path \
.replace("./", "") \
.replace("/", ".") \
.replace(".pyx", "")
ext = Extension(
name=ext_name,
sources=[ext_path],
libraries=libs,
language=language,
extra_compile_args=args,
extra_link_args=link_args,
include_dirs = [np.get_include()],
)
extensions.append(ext)
#setup all extensions
ext_modules = cythonize(
extensions,
annotate=annotate,
compiler_directives=directives,
)
setup(ext_modules=ext_modules)
"""
#immediately remove build directory
build_dir = "./build"
if os.path.exists(build_dir):
shutil.rmtree(build_dir)
"""
更新#3:根据@ GZ0的建议,在许多情况下,代码中的表达式将计算为零并被浪费地计算。我尝试使用以下代码消除这些区域(同时修复了
case3
和case6
语句之后):cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
cdef:
size_t i, n
double[4] exp
double exp_par_gamma
double mu10, mu11, mu20, mu21
double alpha1, alpha2
bint j_is_larger
double threshold2, threshold3
bint case1, case2, case3, case4, case5, case6
double t0, t1, t2
double p12, p1, p2
double t1_sum, t2_sum, p1_sum, p2_sum
double c
#compute the exp outside of the loop
n = errors.shape[0]
exp[0] = cmath.exp(mean_mu_alpha.a)
exp[1] = cmath.exp(mean_mu_alpha.b)
exp[2] = cmath.exp(mean_mu_alpha.c)
exp[3] = cmath.exp(mean_mu_alpha.d)
exp_par_gamma = cmath.exp(par_gamma)
c = 168.0
t1_sum = 0.0
t2_sum = 0.0
p1_sum = 0.0
p2_sum = 0.0
for i in range(n):
mu10 = errors[i, 0] * exp[0]
mu11 = exp_par_gamma * mu10
mu20 = errors[i, 1] * exp[1]
mu21 = exp_par_gamma * mu20
alpha1 = errors[i, 2] * exp[2]
alpha2 = errors[i, 3] * exp[3]
j_is_larger = mu10 > mu20
j_is_smaller = not j_is_larger
threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
if j_is_larger:
case1 = mu10 < 1 / c
case2 = mu21 >= threshold2
case3 = not (case1 | case2)
t0 = case1*c + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case3 / threshold2
t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
t2 = c - t0 - t1
t1_sum += t1
t2_sum += t2
p1_sum += case2 + case3
p2_sum += case2
else:
case4 = mu20 < 1 / c
case5 = mu11 >= threshold3
case6 = not (case4 | case5)
t0 = case4 * c + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2) + case6 / threshold3
t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
t2 = c - t0 - t1
t1_sum += t1
t2_sum += t2
p1_sum += case5
p2_sum += case5 + case6
out.a = t1_sum/n
out.b = t2_sum/n
out.c = p1_sum/n
out.d = p2_sum/n
对于10000次迭代,当前代码执行如下:
outer_loop: 0.5116949229995953 seconds
outer_loop_if: 0.617649456995423 seconds
mktout: 0.9221872320049442 seconds
mktout_if: 1.430276553001022 seconds
python: 10.116664300003322 seconds
我认为导致结果的条件和分支错误预测的代价出乎意料的是使该函数变慢,但是我希望能有一定帮助清除此问题。