r - 如何缩短欧式距离计算的处理时间

我正在尝试计算具有相同列数(变量)和不同行数(观察值)的两个数据框之间的加权欧式距离(平方)。

计算公式如下:

DIST[m,i] <- sum(((DATA1[m,] - DATA2[i,]) ^ 2) * lambda[1,])

我特别需要用一定的权重(lambda)乘以每一个包裹。

下面提供的代码可以正确运行，但是如果我在数百次迭代中使用它，则会花费大量的处理时间。昨天，我花了18个小时使用包含该计算的函数的多次迭代来创建图形。使用library(profvis)profvis({my code})，我看到代码的这一特定部分占用了大约80％的处理时间。

我读了很多关于如何使用并行和向量化操作减少处理时间的信息，但是由于lamb＃的重量，我不知道如何在这种特殊情况下实现它们。

有人可以帮助我减少这段代码的处理时间吗？

有关代码和数据结构的更多信息，请参见下面提供的代码中的注释。

# Data frames used to calculate the euclidean distances between each observation
#   from DATA1 and each observation from DATA2.
# The euclidean distance is between a [600x50] and a [8X50] dataframes, resulting
#   in a [600X8] dataframe.
DATA1 <- matrix(rexp(30000, rate=.1), ncol=50) #[600x50]
DATA2 <- matrix(rexp(400, rate=.1), ncol=50) #[8X50]

# Weights used for each of the 50 variables to calculate the weighted
#   euclidean distance.
# Can be a vector of different weights or a scalar of the same weight
#   for all variables.
lambda <- runif(n=50, min=0, max=10)   ## length(lambda) > 1
# lambda=1   ## length(lambda) == 1

if (length(lambda) > 1) {
  as.numeric(unlist(lambda))
  lambda <- as.matrix(lambda)
  lambda <- t(lambda)
}

nrows1 <- nrow(DATA1)
nrows2 <- nrow(DATA2)

# Euclidean Distance calculation
DIST <- matrix(NA, nrow=nrows1, ncol=nrows2 )
for (m in 1:nrows1) {
  for (i in 1:nrows2) {
    if (length(lambda) == 1) {
      DIST[m, i] <- sum((DATA1[m, ] - DATA2[i, ])^2)
    }
    if (length(lambda) > 1){
      DIST[m, i] <- sum(((DATA1[m, ] - DATA2[i, ])^2) * lambda[1, ])
    }
    next
  }
  next
}

经过所有建议后，将@MDWITT(对于length(lambda> 1)和@F。Privé(对于length(lambda == 1))的答案结合起来，最终的解决方案只用了一分钟，而原始解决方案只花了我一分钟。运行一个半小时，用一个较大的代码进行计算，对于那些感兴趣的人，此问题的最终代码是:

#Data frames used to calculate the euclidean distances between each observation from DATA1 and each observation from DATA2.
#The euclidean distance is between a [600x50] and a [8X50] dataframes, resulting in a [600X8] dataframe.
DATA1 <- matrix(rexp(30000, rate=.1), ncol=50) #[600x50]
DATA2 <- matrix(rexp(400, rate=.1), ncol=50) #[8X50]

#Weights used for each of the 50 variables to calculate the weighted euclidean distance.
#Can be a vector of different weights or a scalar of the same weight for all variables.
#lambda <- runif(n = 50, min = 0, max = 10)   ##length(lambda) > 1
lambda = 1   ##length(lambda) == 1

nrows1 <- nrow(DATA1)
nrows2 <- nrow(DATA2)

#Euclidean Distance calculation
DIST <- matrix(NA, nrow = nrows1, ncol = nrows2)

if (length(lambda) > 1){
  as.numeric(unlist(lambda))
  lambda <- as.matrix(lambda)
  lambda <- t(lambda)

  library(Rcpp)
  cppFunction('NumericMatrix weighted_distance (NumericMatrix x, NumericMatrix y, NumericVector lambda){

              int n_x = x.nrow();
              int n_y = y.nrow();


              NumericMatrix DIST(n_x, n_y);

              //begin the loop

              for (int i = 0 ; i < n_x; i++){
              for (int j = 0  ; j < n_y ; j ++) {
              double d = sum(pow(x.row(i) - y.row(j), 2)*lambda);
              DIST(i,j) = d;
              }
              }
              return (DIST) ;
  }')

    DIST <- weighted_distance(DATA1, DATA2, lambda = lambda)}


  if (length(lambda) == 1) {
    DIST <- outer(rowSums(DATA1^2), rowSums(DATA2^2), '+') - tcrossprod(DATA1, 2 * DATA2)
  }

最佳答案

在这里，使用Rcpp的另一种方法只是获取此概念文档。在其中一个名为euclidean.cpp的文件中

#include <Rcpp.h>
#include <cmath>

using namespace Rcpp;

// [[Rcpp::export]]

NumericMatrix weighted_distance (NumericMatrix x, NumericMatrix y, NumericVector lambda){

  int n_x = x.nrow();
  int n_y = y.nrow();


  NumericMatrix out(n_x, n_y);

  //begin the loop

  for (int i = 0 ; i < n_x; i++){
    for (int j = 0  ; j < n_y ; j ++) {
      double d = sum(pow(x.row(i) - y.row(j), 2)*lambda);
      out(i,j) = d;
    }
  }
  return (out) ;
}

在R中，我有

library(Rcpp)
sourceCpp("libs/euclidean.cpp")

# Generate Data
DATA1 <- matrix(rexp(30000, rate=.1), ncol=50) #[600x50]
DATA2 <- matrix(rexp(400, rate=.1), ncol=50) #[8X50]
lambda <- runif(n=50, min=0, max=10)

# Run the program

out <- weighted_distance(DATA1, DATA2, lambda = lambda)

当我使用以下命令测试速度时:

microbenchmark(
  Rcpp_way = weighted_distance(DATA1, DATA2, lambda = lambda),
other = {DIST <- matrix(NA, nrow=nrows1, ncol=ncols)
for (m in 1:nrows1) {
  for (i in 1:nrows2) {
    if (length(lambda) == 1) {
      DIST[m, i] <- sum((DATA1[m, ] - DATA2[i, ])^2)
    }
    if (length(lambda) > 1){
      DIST[m, i] <- sum(((DATA1[m, ] - DATA2[i, ])^2) * lambda[1, ])
    }
    next
  }
  next
}}, times = 100)

您可以更快地看到它是一个不错的剪辑:

Unit: microseconds
     expr       min        lq       mean    median         uq        max neval
 Rcpp_way   446.769   492.308   656.9849   562.667   846.9745   1169.231   100
    other 24688.821 30681.641 44153.5264 37511.385 50878.3585 200843.898   100

关于r - 如何缩短欧式距离计算的处理时间，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/56065994/