我尝试将NN代码移植给茱莉亚,希望能提高网络培训的速度。在我的桌面上,事实证明是这样的。
然而,在我的MacBook上,python+numpy比julia快了几英里。
在相同的参数下训练,python的速度是julia的两倍多(一个时期是4.4s对10.6s)。考虑到Julia在我的桌面上比python快(大约2秒),似乎python/numpy在mac上使用了Julia没有的一些资源。即使并行代码也只能使我降到6.6秒(尽管这可能是因为我没有写paralle的经验)。L代码)。我想问题可能是茱莉亚的blas比Mac本机使用的veclib库慢,但是尝试不同的版本似乎并没有让我更接近。我试着用use-system-blas=1构建和用mkl构建,其中mkl给出了更快的结果(上面发布的时间)。
我将发布我的笔记本电脑版本信息以及下面的Julia实现以供参考。我现在没有访问桌面的权限,但是我在Windows上运行的是同一版本的Julia,它使用OpenBlas,与使用OpenBlas的Python2.7的干净安装相比。
我这里有什么东西不见了吗?
编辑:我知道我的Julia代码在优化方面还有很多需要改进的地方,我真的很感激任何能让它更快的提示。然而,这不是茱莉亚在我的笔记本电脑上变慢的情况,而是巨蟒更快。在我的桌面上,python在大约13秒内运行一个时代,在笔记本电脑上,它只需要大约4.4秒。我最感兴趣的是这一差异的来源。我意识到这个问题可能有些不太明确。
笔记本电脑版本:
julia> versioninfo()
Julia Version 0.6.2
Commit d386e40c17 (2017-12-13 18:08 UTC)
Platform Info:
OS: macOS (x86_64-apple-darwin17.4.0)
CPU: Intel(R) Core(TM) i5-7360U CPU @ 2.30GHz
WORD_SIZE: 64
BLAS: libmkl_rt
LAPACK: libmkl_rt
LIBM: libopenlibm
LLVM: libLLVM-3.9.1 (ORCJIT, broadwell)
Python 2.7.14 (default, Mar 22 2018, 14:43:05)
[GCC 4.2.1 Compatible Apple LLVM 9.0.0 (clang-900.0.39.2)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import numpy
>>> numpy.show_config()
lapack_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
openblas_lapack_info:
NOT AVAILABLE
atlas_3_10_blas_threads_info:
NOT AVAILABLE
atlas_threads_info:
NOT AVAILABLE
openblas_clapack_info:
NOT AVAILABLE
atlas_3_10_threads_info:
NOT AVAILABLE
atlas_blas_info:
NOT AVAILABLE
atlas_3_10_blas_info:
NOT AVAILABLE
atlas_blas_threads_info:
NOT AVAILABLE
openblas_info:
NOT AVAILABLE
blas_mkl_info:
NOT AVAILABLE
blas_opt_info:
extra_link_args = ['-Wl,-framework', '-Wl,Accelerate']
extra_compile_args = ['-msse3', '-I/System/Library/Frameworks/vecLib.framework/Headers']
define_macros = [('NO_ATLAS_INFO', 3), ('HAVE_CBLAS', None)]
blis_info:
NOT AVAILABLE
atlas_info:
NOT AVAILABLE
atlas_3_10_info:
NOT AVAILABLE
lapack_mkl_info:
NOT AVAILABLE
Julia代码(顺序):
using MLDatasets
mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
network(num_layers, sizearr, biases, weights)
end
σ(z) = 1/(1+e^(-z))
σ_prime(z) = σ(z)*(1-σ(z))
function (net::network)(a)
for (w, b) in zip(net.weights, net.biases)
a = σ.(w*a + b)
end
return a
end
function SGDtrain(net::network, training_data, epochs, mini_batch_size, η, test_data=nothing)
n_test = test_data != nothing ? length(test_data):nothing
n = length(training_data)
for j in 1:epochs
training_data = shuffle(training_data)
mini_batches = [training_data[k:k+mini_batch_size-1] for k in 1:mini_batch_size:n]
@time for batch in mini_batches
update_batch(net, batch, η)
end
if test_data != nothing
println("Epoch ", j,": ", evaluate(net, test_data), "/", n_test)
else
println("Epoch ", j," complete.")
end
end
end
function update_batch(net::network, batch, η)
∇_b = net.biases .- net.biases
∇_w = net.weights .- net.weights
for (x, y) in batch
δ_∇_b, δ_∇_w = backprop(net, x, y)
∇_b += δ_∇_b
∇_w += δ_∇_w
end
net.biases -= (η/length(batch))∇_b
net.weights -= (η/length(batch))∇_w
end
function backprop(net::network, x, y)
∇_b = copy(net.biases)
∇_w = copy(net.weights)
len = length(net.sizearr)
activation = x
activations = Array{Array{Float64,1}}(len)
activations[1] = x
zs = copy(net.biases)
for i in 1:len-1
b = net.biases[i]; w = net.weights[i]
z = w*activation .+ b
zs[i] = z
activation = σ.(z)
activations[i+1] = activation[:]
end
δ = (activations[end] - y) .* σ_prime.(zs[end])
∇_b[end] = δ[:]
∇_w[end] = δ*activations[end-1]'
for l in 1:net.num_layers-2
z = zs[end-l]
δ = net.weights[end-l+1]'δ .* σ_prime.(z)
∇_b[end-l] = δ[:]
∇_w[end-l] = δ*activations[end-l-1]'
end
return (∇_b, ∇_w)
end
function evaluate(net::network, test_data)
test_results = [(findmax(net(x))[2] - 1, y) for (x, y) in test_data]
return sum(Int(x == y) for (x, y) in test_results)
end
function loaddata(rng = 1:50000)
train_x, train_y = MNIST.traindata(Float64, Vector(rng))
train_x = [train_x[:,:,x][:] for x in 1:size(train_x, 3)]
train_y = [vectorize(x) for x in train_y]
traindata = [(x, y) for (x, y) in zip(train_x, train_y)]
test_x, test_y = MNIST.testdata(Float64)
test_x = [test_x[:,:,x][:] for x in 1:size(test_x, 3)]
testdata = [(x, y) for (x, y) in zip(test_x, test_y)]
return traindata, testdata
end
function vectorize(n)
ev = zeros(10,1)
ev[n+1] = 1
return ev
end
function main()
net = network([784, 30, 10])
traindata, testdata = loaddata()
SGDtrain(net, traindata, 10, 10, 1.25, testdata)
end
最佳答案
我开始运行你的代码:
7.110379 seconds (1.37 M allocations: 20.570 GiB, 19.81%gc time)
Epoch 1: 7960/10000
6.147297 seconds (1.27 M allocations: 20.566 GiB, 18.33%gc time)
哎呀,每个时代都分配了21个gib?这是你的问题。垃圾收集受到了很大的影响,你的电脑内存越少,它所需要的内存就越多。我们来解决这个问题。
主要的想法是预先分配缓冲区,然后修改数组,而不是创建新的数组。在您的代码中,您可以使用以下命令开始
backprop
∇_b = copy(net.biases)
∇_w = copy(net.weights)
len = length(net.sizearr)
activation = x
activations = Array{Array{Float64,1}}(len)
activations[1] = x
zs = copy(net.biases)
事实上,您使用的是
copy
意味着您可能需要预先分配东西!那么让我们从zs
和activations
开始。我扩展了您的网络以保存这些缓存阵列:mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
zs::Array{Array{Float64,1},1}
activations::Array{Array{Float64,1},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
zs = [randn(y) for y in sizes[2:end]]
activations = [randn(y) for y in sizes[1:end]]
network(num_layers, sizearr, biases, weights, zs, activations)
end
然后我更改了您的
backprop
以使用这些缓存:function backprop(net::network, x, y)
∇_b = copy(net.biases)
∇_w = copy(net.weights)
len = length(net.sizearr)
activations = net.activations
activations[1] .= x
zs = net.zs
for i in 1:len-1
b = net.biases[i]; w = net.weights[i];
z = zs[i]; activation = activations[i+1]
z .= w*activations[i] .+ b
activation .= σ.(z)
end
δ = (activations[end] - y) .* σ_prime.(zs[end])
∇_b[end] = δ[:]
∇_w[end] = δ*activations[end-1]'
for l in 1:net.num_layers-2
z = zs[end-l]
δ = net.weights[end-l+1]'δ .* σ_prime.(z)
∇_b[end-l] = δ[:]
∇_w[end-l] = δ*activations[end-l-1]'
end
return (∇_b, ∇_w)
end
这导致分配的内存大大减少。但还有很多事情要做。首先,让我们将a
*
更改为aA_mul_B!
。这个函数是一个矩阵乘法,它写入一个数组而不是创建一个新的矩阵,这样可以大大减少内存分配。所以我做到了:for l in 1:net.num_layers-2
z = zs[end-l]
δ = net.weights[end-l+1]'δ .* σ_prime.(z)
∇_b[end-l] .= vec(δ)
atransp = activations[end-l-1]'
A_mul_B!(∇_w[end-l],δ,atransp)
end
但是,我不使用分配的
C
而是使用A_mul_B!(C,A,B)
因为我只需要一个视图:for l in 1:net.num_layers-2
z = zs[end-l]
δ = net.weights[end-l+1]'δ .* σ_prime.(z)
∇_b[end-l] .= vec(δ)
atransp = reshape(activations[end-l-1],1,length(activations[end-l-1]))
A_mul_B!(∇_w[end-l],δ,atransp)
end
(同时,它还实现了更快的openblas调度。不过,这可能与MKL不同)。但你仍然在复制
∇_b = copy(net.biases)
∇_w = copy(net.weights)
每一步都要分配一组δs,所以我做的下一个更改会预先分配这些δs,并将其全部就位(看起来就像以前的更改)。
然后我做了一些分析。在朱诺,这只是:
@profile main()
Juno.profiler()
或者,如果您不使用juno,您可以用ProfileView.jl替换第二部分。我得到:
所以大部分时间都花在布拉斯,但有一个问题。看到像
'
这样的操作正在创建一组矩阵!相反,我们希望通过循环并就地更新每个矩阵的变化矩阵。这就像是:function update_batch(net::network, batch, η)
∇_b = net.∇_b
∇_w = net.∇_w
for i in 1:length(∇_b)
fill!(∇_b[i],0.0)
end
for i in 1:length(∇_w)
fill!(∇_w[i],0.0)
end
for (x, y) in batch
δ_∇_b, δ_∇_w = backprop(net, x, y)
∇_b .+= δ_∇_b
for i in 1:length(∇_w)
∇_w[i] .+= δ_∇_w[i]
end
end
for i in 1:length(∇_b)
net.biases[i] .-= (η/length(batch)).*∇_b[i]
end
for i in 1:length(∇_w)
net.weights[i] .-= (η/length(batch)).*∇_w[i]
end
end
我在同一行中做了更多的更改,最终代码如下:
mutable struct network
num_layers::Int64
sizearr::Array{Int64,1}
biases::Array{Array{Float64,1},1}
weights::Array{Array{Float64,2},1}
weights_transp::Array{Array{Float64,2},1}
zs::Array{Array{Float64,1},1}
activations::Array{Array{Float64,1},1}
∇_b::Array{Array{Float64,1},1}
∇_w::Array{Array{Float64,2},1}
δ_∇_b::Array{Array{Float64,1},1}
δ_∇_w::Array{Array{Float64,2},1}
δs::Array{Array{Float64,2},1}
end
function network(sizes)
num_layers = length(sizes)
sizearr = sizes
biases = [randn(y) for y in sizes[2:end]]
weights = [randn(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
weights_transp = [randn(x, y) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
zs = [randn(y) for y in sizes[2:end]]
activations = [randn(y) for y in sizes[1:end]]
∇_b = [zeros(y) for y in sizes[2:end]]
∇_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
δ_∇_b = [zeros(y) for y in sizes[2:end]]
δ_∇_w = [zeros(y, x) for (x, y) in zip(sizes[1:end-1], sizes[2:end])]
δs = [zeros(y,1) for y in sizes[2:end]]
network(num_layers, sizearr, biases, weights, weights_transp, zs, activations,∇_b,∇_w,δ_∇_b,δ_∇_w,δs)
end
function update_batch(net::network, batch, η)
∇_b = net.∇_b
∇_w = net.∇_w
for i in 1:length(∇_b)
∇_b[i] .= 0.0
end
for i in 1:length(∇_w)
∇_w[i] .= 0.0
end
δ_∇_b = net.δ_∇_b
δ_∇_w = net.δ_∇_w
for (x, y) in batch
backprop!(net, x, y)
for i in 1:length(∇_b)
∇_b[i] .+= δ_∇_b[i]
end
for i in 1:length(∇_w)
∇_w[i] .+= δ_∇_w[i]
end
end
for i in 1:length(∇_b)
net.biases[i] .-= (η/length(batch)).*∇_b[i]
end
for i in 1:length(∇_w)
net.weights[i] .-= (η/length(batch)).*∇_w[i]
end
end
function backprop!(net::network, x, y)
∇_b = net.δ_∇_b
∇_w = net.δ_∇_w
len = length(net.sizearr)
activations = net.activations
activations[1] .= x
zs = net.zs
δs = net.δs
for i in 1:len-1
b = net.biases[i]; w = net.weights[i];
z = zs[i]; activation = activations[i+1]
A_mul_B!(z,w,activations[i])
z .+= b
activation .= σ.(z)
end
δ = δs[end]
δ .= (activations[end] .- y) .* σ_prime.(zs[end])
∇_b[end] .= vec(δ)
atransp = reshape(activations[end-1],1,length(activations[end-1]))
A_mul_B!(∇_w[end],δ,atransp)
for l in 1:net.num_layers-2
z = zs[end-l]
transpose!(net.weights_transp[end-l+1],net.weights[end-l+1])
A_mul_B!(δs[end-l],net.weights_transp[end-l+1],δ)
δ = δs[end-l]
δ .*= σ_prime.(z)
∇_b[end-l] .= vec(δ)
atransp = reshape(activations[end-l-1],1,length(activations[end-l-1]))
A_mul_B!(∇_w[end-l],δ,atransp)
end
return nothing
end
其他一切保持不变。为了看到我完成了任务,我将
reshape
添加到了∇_w += δ_∇_w
调用中,并得到:0.000070 seconds (8 allocations: 352 bytes)
0.000066 seconds (8 allocations: 352 bytes)
0.000090 seconds (8 allocations: 352 bytes)
所以这是不分配的。我在循环中添加了
@time
0.000636秒(80次分配:3.438 kib)
0.000610秒(80个分配:3.438 kib)
0.000624秒(80个分配:3.438 kib)
所以这告诉我,基本上剩下的所有分配都来自迭代器(这可以改进,但可能不会改进时间)。所以最后的时机是:
Epoch 2: 8428/10000
4.005540 seconds (586.87 k allocations: 23.925 MiB)
Epoch 1: 8858/10000
3.488674 seconds (414.49 k allocations: 17.082 MiB)
Epoch 2: 9104/10000
在我的机器上,速度快了近2倍,但是每个循环的内存分配量减少了1200倍。这意味着,在RAM越来越慢、越来越小的机器上,这种方法应该做得更好(我的桌面有相当多的内存,所以它真的不太在意!)。
最后的配置文件显示大部分时间都在调用中,因此现在几乎所有的操作都受OpenBlas速度的限制,所以我已经完成了。我可以做的一些额外的事情是多线程一些其他的循环,但是给分析带来的回报很小,所以我将把它留给您(基本上只需在循环上放上
backprop
,如@time
)。希望这不仅可以改进代码,还可以教您如何分析、预分配、使用就地操作以及考虑性能。
关于python - macOS Python比训练神经网络中的Julia更快,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/49719076/