r - 修复函数返回 R 中栅格单元之间距离数据帧的瓶颈

我正在尝试修复函数中嵌套 for 循环的瓶颈。我已经尝试了三个功能，但似乎无法修复它。任何帮助，特别是 data.table 或 rcpp 解决方案，我们都非常感谢。这是一个包含 100 个像元的栅格示例，但我有几个超过 1,000,000 个像元，因此速度至关重要。

例子

考虑以下栅格:

library(raster)


r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

这是一个小栅格，只有一百个像元。我做了以下功能来研究动物的运动，在那里我使用光栅和动物在一个时间片内可以移动的最大距离。嵌套循环中存在一个我无法解决的瓶颈。

我得到的返回是一个具有以下变量的数据框:

from 栅格中动物可以从中移动的像元的 ID

到栅格中动物可以移动到的像元的 ID

Dist 从单元格到单元格的距离

加载必要的库

library(gdistance)
library(dplyr)
library(tidyr)

DistConect1 <- function(Raster, Distance){
      #First we make a transition layer with the function transition from gdistance
      h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
      #Then geocorrect for projection
      h16   <- geoCorrection(h16, scl=FALSE)
      #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
      B <- xyFromCell(Raster, cell = 1:ncell(Raster))
      #This nested loop is where the Bottle neck is
      #Start a list
      connections <- list()
      #For each pair of cells in B
      for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
          #For each cell pair i,j generate a vector i,j, distance
          arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        print(paste(i, "of", nrow(B)))
      }
      #Get everything together as a large data frame
      connections <- do.call("rbind", connections)
      connections <- as.data.frame(connections)
      #return connections
      return(connections)
      }

试图用lapply修复它

但我只摆脱了一个循环

DistConect2 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
  #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
    for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)
    # connections <- do.call("rbind", arcs)
    colnames(connections) <- c("from", "to", "dist")
    return(connections)
   }

  connections <- lapply(1:nrow(B), all.cells)
  #For each pair of cells in B
  #Get everything together as a large data frame
  connections <- do.call("rbind", connections)
  connections <- as.data.frame(connections)
  #return connections
  return(connections)
}

但是使用 microbenchmarck 包来测试差异显示没有任何区别:

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000), times = 4)

微基准测试结果

DistConect1(r, Distance = 1e+06) 10.283309 10.40662 10.55879 10.58380 10.71097 10.78428     4
DistConect2(r, Distance = 1e+06)  9.892371 10.07783 10.35453 10.41144 10.63124 10.70288     4

CLD
一种
一种

并行化

我也尝试过并行化，但实际上需要更长的时间:

DistConect2b <- function (Raster, Distance, cpus = NULL)
{
  h16 <- transition(Raster, transitionFunction = function(x) {1}, 16, symm = FALSE)
  h16 <- geoCorrection(h16, scl = FALSE)
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))

  all.cells <- function(i){
    arcs <- list()
    temp <- accCost(h16, B[i, ])
    temp[values(temp) > Distance] = NA
    index <- c(1:ncell(temp))[!is.na(values(temp))]
    # all.index <- function(j){
     for (j in index) {
      arcs[[j]] <- c(i, j, temp[j])
    }
    # arcs <- lapply(index, all.index)
    connections <- do.call("rbind", arcs)

    colnames(connections) <- c("from", "to", "dist")
    return(connections)
    # cat(paste(i, "of", nrow(B)))
  }
  require(snowfall)
  sfInit(parallel=TRUE, cpus=cpus)
  sfLibrary(gdistance)
  sep.connections <- sfClusterApplyLB(1:nrow(B), all.cells)
  sfStop(nostop=FALSE)
  # sep.connections <- lapply(1:nrow(B), all.cells)
  connections <- do.call("rbind", sep.connections)
  connections <- as.data.frame(connections)

}

结果微基准

microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConect2b(r, Distance = 1000000, cpus = 2), times = 4)


                            expr       min        lq     mean   median       uq
             DistConect1(r, Distance = 1e+06) 10.145234 10.216611 10.35301 10.36512 10.48942
             DistConect2(r, Distance = 1e+06)  9.963549  9.974315 10.01547 10.01173 10.05662
 DistConnect2b(r, Distance = 1e+06, cpus = 2) 11.311966 11.486705 12.02240 11.81034 12.55809

答案后的更多试验

在得到很好的答案后，我尝试更进一步，添加了一个 lapply 来替换代码中的 for 循环:

DistConect4 <- function(Raster, Distance){
  #First we make a transition layer with the function transition from gdistance
  h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
   #Then geocorrect for projection
  h16   <- geoCorrection(h16, scl=FALSE)
  #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
  B <- xyFromCell(Raster, cell = 1:ncell(Raster))
  #This nested loop is where the Bottle neck is

  all.cells <- function(i){

    temp <- accCost2(h16, B[i, ])
    index <- which(temp < 1000000)  # all.index <- function(j){
    connections <- cbind(i, index, temp[index])

    return(connections)

  }

  connections <- lapply(1:nrow(B), all.cells)

  connections <- as.data.frame(do.call("rbind", connections))
  #Get everything together as a large data frame
  colnames(connections) <- c("from", "to", "dist")
  #return connections
  return(connections)
}

使用下面定义的 acccost2 函数

accCost2 <- function(x, fromCoords) {

  fromCells <- cellFromXY(x, fromCoords)
  tr <- transitionMatrix(x)
  tr <- rBind(tr, rep(0, nrow(tr)))
  tr <- cBind(tr, rep(0, nrow(tr)))
  startNode <- nrow(tr)
  adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
  tr[adjP] <- Inf
  adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
  E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
  return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}

但是当我尝试

timing <- microbenchmark::microbenchmark(DistConect1(r, Distance = 1000000), DistConect2(r, Distance = 1000000),  DistConnect2b(r, Distance = 1000000, cpus = 4), DistConect3(r, Distance = 1000000), DistConect4(r, Distance = 1000000) ,times = 20)

print(timing, unit = "relative")

它并没有使过程更快

                                         expr       min       lq      mean        median        uq
             DistConect1(r, Distance = 1e+06) 12.400299 12.43078 12.407909 12.452043 12.502665
             DistConect2(r, Distance = 1e+06) 12.238812 12.23194 12.168468 12.191067 12.155786
 DistConnect2b(r, Distance = 1e+06, cpus = 4) 13.994594 14.01760 13.909674 13.978060 13.947062
             DistConect3(r, Distance = 1e+06)  1.000000  1.00000  1.000000  1.000000  1.000000
             DistConect4(r, Distance = 1e+06)  0.997329  1.00141  1.019697  1.002112  1.005626

我认为 apply 总是比 for 快，任何想法为什么这不会使过程更快？

最佳答案

您可以通过更改来摆脱内循环

temp[values(temp) > Distance] = NA
index <- c(1:ncell(temp))[!is.na(values(temp))]
for (j in index){
  arcs[[j]] <- c(i, j, temp[j])
}
connections[[i]] <- do.call("rbind", arcs)
colnames(connections[[i]]) <- c("from", "to", "dist")

进入这个:

index <- which(temp < Distance)
connections[[i]] <- cbind(i, index, temp[index])

我还研究了 accCost ，这似乎是这里最慢的函数。不幸的是，它调用了一些 C 代码来寻找最短路径，这可能意味着几乎没有什么可以优化的。我创建了 accCost2，在那里我剥离了一些代码，但我怀疑它很重要。我也不确定这里的并行化有多有效，因为运行时间没有那么长。低于一些显示出不错改进的基准。

library(gdistance)
library(dplyr)
library(tidyr)
library(raster)

r   <- raster(nrows=10,ncols=10,xmn=-10,ymn=-10,xmx=10,ymx=10)
r[] <- rep(1, ncell(r))

extent(r) <- c(-10, 10, -10, 10)

DistConect1 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        arcs <- list()
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost(h16,B[i,])
        #Make all cells with distance to origin larger than maximum dispersal distance equal NA
        temp[values(temp) > Distance] = NA
        #Create a vector with only the ID raster values of cells that are not NA (To reduce the next loop)
        index <- c(1:ncell(temp))[!is.na(values(temp))]
        for (j in index){
            #For each cell pair i,j generate a vector i,j, distance
            arcs[[j]] <- c(i, j, temp[j])
        }
        #Gather all vectors in a data frame
        connections[[i]] <- do.call("rbind", arcs)
        #name columns
        colnames(connections[[i]]) <- c("from", "to", "dist")
        #This is just to see where I am in the function
        # print(paste(i, "of", nrow(B)))
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    #return connections
    return(connections)
}

DistConect3 <- function(Raster, Distance){
    #First we make a transition layer with the function transition from gdistance
    h16  <- transition(Raster, transitionFunction=function(x){1},16,symm=FALSE)
    #Then geocorrect for projection
    h16   <- geoCorrection(h16, scl=FALSE)
    #Since transition layers work with XY rather than IDs, get a matrix of XY coordinates
    B <- xyFromCell(Raster, cell = 1:ncell(Raster))
    #This nested loop is where the Bottle neck is
    #Start a list
    connections <- list()
    #For each pair of cells in B
    for (i in 1:nrow(B)){
        #Create a temporal raster for each row with the distance from cell xy to all other cells
        temp <- accCost2(h16,B[i,])
        index <- which(temp < Distance)
        connections[[i]] <- cbind(i, index, temp[index])
    }
    #Get everything together as a large data frame
    connections <- do.call("rbind", connections)
    connections <- as.data.frame(connections)
    colnames(connections) <- c("from", "to", "dist")
    #return connections
    return(connections)
}

accCost2 <- function(x, fromCoords) {

    fromCells <- cellFromXY(x, fromCoords)
    tr <- transitionMatrix(x)
    tr <- rBind(tr, rep(0, nrow(tr)))
    tr <- cBind(tr, rep(0, nrow(tr)))
    startNode <- nrow(tr)
    adjP <- cbind(rep(startNode, times = length(fromCells)), fromCells)
    tr[adjP] <- Inf
    adjacencyGraph <- graph.adjacency(tr, mode = "directed", weighted = TRUE)
    E(adjacencyGraph)$weight <- 1/E(adjacencyGraph)$weight
    return(shortest.paths(adjacencyGraph, v = startNode, mode = "out")[-startNode])
}


d1 <- DistConect1(r, Distance = 1000)
d3 <- DistConect3(r, Distance = 1000)

# test float equality
all.equal(d1, d3, check.attributes = FALSE)
# TRUE

timing1 <- microbenchmark(
    DistConect1(r, Distance = 1000),
    DistConect3(r, Distance = 1000),
    times = 10
)
print(timing1, unit = "relative")
#   expr                            min      lq       mean     median   uq       max      neval cld
# 1 DistConect1(r, Distance = 1000) 2.077804 1.991303 1.881478 1.933114 1.951884 1.531302 10     b
# 2 DistConect3(r, Distance = 1000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10    a

timing2 <- microbenchmark(
    DistConect1(r, Distance = 10000),
    DistConect3(r, Distance = 10000),
    times = 10
)
print(timing2, unit = "relative")
# expr                             min      lq       mean     median   uq       max         neval cld
# DistConect1(r, Distance = 10000) 2.018707 1.936773 1.966994 1.956694 1.964021 2.094569    10     b
# DistConect3(r, Distance = 10000) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10    a

关于r - 修复函数返回 R 中栅格单元之间距离数据帧的瓶颈，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/45152830/