我正在尝试修复函数中嵌套 for 循环的瓶颈。我已经尝试了三个功能,但似乎无法修复它。任何帮助,特别是 data.table 或 rcpp 解决方案,我们都非常感谢。这是一个包含 100 个像元的栅格示例,但我有几个超过 1,000,000 个像元,因此速度至关重要。
例子
考虑以下栅格:
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
这是一个小栅格,只有一百个像元。我做了以下功能来研究动物的运动,在那里我使用光栅和动物在一个时间片内可以移动的最大距离。嵌套循环中存在一个我无法解决的瓶颈。
我得到的返回是一个具有以下变量的数据框:
from 栅格中动物可以从中移动的像元的 ID
到 栅格中动物可以移动到的像元的 ID
Dist 从单元格到单元格的距离
加载必要的库
library(gdistance)
library(dplyr)
library(tidyr)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
试图用lapply修复它
但我只摆脱了一个循环
DistConect2 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
# connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
#For each pair of cells in B
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
但是使用 microbenchmarck 包来测试差异显示没有任何区别:
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)
微基准测试结果
DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428 4
DistConect2(r, Distance = 1e+06) 9.892371 10.07783 10.35453 10.41144 10.63124 10.70288 4
CLD
一种
一种
并行化
我也尝试过并行化,但实际上需要更长的时间:
DistConect2b <- function (Raster, Distance, cpus = NULL)
{
h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
h16 <- geoCorrection(h16, scl = FALSE)
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
all.cells <- function(i){
arcs <- list()
temp <- accCost(h16, B[i, ])
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
# all.index <- function(j){
for (j in index) {
arcs[[j]] <- c(i, j, temp[j])
}
# arcs <- lapply(index, all.index)
connections <- do.call("rbind", arcs)
colnames(connections) <- c("from", "to", "dist")
return(connections)
# cat(paste(i, "of", nrow(B)))
}
require(snowfall)
sfInit(parallel=TRUE, cpus=cpus)
sfLibrary(gdistance)
sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
sfStop(nostop=FALSE)
# sep.connections <- lapply(1:nrow(B), all.cells)
connections <- do.call("rbind", sep.connections)
connections <- as.data.frame(connections)
}
结果微基准
microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
DistConect2(r, Distance = 1e+06) 9.963549 9.974315 10.01547 10.01173 10.05662
DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809
答案后的更多试验
在得到很好的答案后,我尝试更进一步,添加了一个 lapply 来替换代码中的 for 循环:
DistConect4 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
all.cells <- function(i){
temp <- accCost2(h16, B[i, ])
index <- which(temp < 1000000) # all.index <- function(j){
connections <- cbind(i, index, temp[index])
return(connections)
}
connections <- lapply(1:nrow(B), all.cells)
connections <- as.data.frame(do.call("rbind", connections))
#Get everything together as a large data frame
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
使用下面定义的 acccost2 函数
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
但是当我尝试
timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)
print(timing, unit = "relative")
它并没有使过程更快
expr min lq mean median uq
DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
DistConect3(r, Distance = 1e+06) 1.000000 1.00000 1.000000 1.000000 1.000000
DistConect4(r, Distance = 1e+06) 0.997329 1.00141 1.019697 1.002112 1.005626
我认为 apply 总是比 for 快,任何想法为什么这不会使过程更快?
最佳答案
您可以通过更改来摆脱内循环
temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")
进入这个:
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
我还研究了
accCost
,这似乎是这里最慢的函数。不幸的是,它调用了一些 C 代码来寻找最短路径,这可能意味着几乎没有什么可以优化的。我创建了 accCost2
,在那里我剥离了一些代码,但我怀疑它很重要。我也不确定这里的并行化有多有效,因为运行时间没有那么长。低于一些显示出不错改进的基准。library(gdistance)
library(dplyr)
library(tidyr)
library(raster)
r <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))
extent(r) <- c(-10, 10, -10, 10)
DistConect1 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
arcs <- list()
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost(h16,B[i,])
#Make all cells with distance to origin larger than maximum dispersal distance equal NA
temp[values(temp) > Distance] = NA
#Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
#For each cell pair i,j generate a vector i,j, distance
arcs[[j]] <- c(i, j, temp[j])
}
#Gather all vectors in a data frame
connections[[i]] <- do.call("rbind", arcs)
#name columns
colnames(connections[[i]]) <- c("from", "to", "dist")
#This is just to see where I am in the function
# print(paste(i, "of", nrow(B)))
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
#return connections
return(connections)
}
DistConect3 <- function(Raster, Distance){
#First we make a transition layer with the function transition from gdistance
h16 <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
#Then geocorrect for projection
h16 <- geoCorrection(h16, scl=FALSE)
#Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
B <- xyFromCell(Raster, cell = 1:ncell(Raster))
#This nested loop is where the Bottle neck is
#Start a list
connections <- list()
#For each pair of cells in B
for (i in 1:nrow(B)){
#Create a temporal raster for each row with the distance from cell xy to all other cells
temp <- accCost2(h16,B[i,])
index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])
}
#Get everything together as a large data frame
connections <- do.call("rbind", connections)
connections <- as.data.frame(connections)
colnames(connections) <- c("from", "to", "dist")
#return connections
return(connections)
}
accCost2 <- function(x, fromCoords) {
fromCells <- cellFromXY(x, fromCoords)
tr <- transitionMatrix(x)
tr <- rBind(tr, rep(0, nrow(tr)))
tr <- cBind(tr, rep(0, nrow(tr)))
startNode <- nrow(tr)
adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
tr[adjP] <- Inf
adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}
d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)
# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE
timing1 <- microbenchmark(
DistConect1(r, Distance = 1000),
DistConect3(r, Distance = 1000),
times = 10
)
print(timing1, unit = "relative")
# expr min lq mean median uq max neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10 b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
timing2 <- microbenchmark(
DistConect1(r, Distance = 10000),
DistConect3(r, Distance = 10000),
times = 10
)
print(timing2, unit = "relative")
# expr min lq mean median uq max neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569 10 b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
关于r - 修复函数返回 R 中栅格单元之间距离数据帧的瓶颈,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/45152830/