在 R 的 data.table 中,检查某一整列的每个值是否出现在另一个 data.table 的某一列中,最快的方法是什么?

问题示例:

创建较大的示例数据:

# Example data: dt1 has 1,000,000 rows and dt2 has 500,000 rows; each row gets
# a `code` made of 5 distinct random lowercase letters.
# vapply() pins the result to a character vector (sapply()'s return type
# depends on its input), and seq_len() replaces the redundant c(1:n).
dt1 <- data.table(
    dt1row = seq_len(1000000L),
    code = vapply(
        seq_len(1000000L),
        function(i) paste(sample(letters, 5), collapse = ""),
        character(1)
    )
)
dt2 <- data.table(
    dt2row = seq_len(500000L),
    code = vapply(
        seq_len(500000L),
        function(i) paste(sample(letters, 5), collapse = ""),
        character(1)
    )
)

我想替换的这段代码很慢(但结果是正确的):
# SLOW ON BIG DATA!
# Baseline from the question: visits dt1 one row at a time and calls `%in%`
# once per row, each time against the whole dt2$code column, instead of making
# a single vectorised call on the full dt1$code column.
dt1$in_dt2 <- sapply(c(1:nrow(dt1)),FUN=function(x){dt1$code[x] %in% dt2$code})

最佳答案

到目前为止,@thelatemail 的方法是最快的:

# Key both tables on `code` (sorts them by reference).
setkeyv(dt1, "code")
setkeyv(dt2, "code")
# Start with every dt1 row flagged FALSE ...
dt1[, in_dt2 := FALSE]
# ... then use an update join to set TRUE on the dt1 rows whose code is in dt2.
dt1[dt2, on = "code", in_dt2 := TRUE]

我认为您正在寻找连接操作,并且设置键应该可以加快速度:
# Key both tables on `code` (sorts them by reference).
setkeyv(dt1, "code")
setkeyv(dt2, "code")
# Inner join by `code`: nomatch = 0L drops dt1 rows that have no partner in dt2.
existing <- dt2[dt1, on = "code", nomatch = 0L]
# A dt1 row is in dt2 when its dt1row value appears in the join result.
dt1[, in_dt2 := dt1row %in% existing[["dt1row"]]]

另一种选择是使用 base R 的 match 函数。下面对几种方法做了基准比较:
# Base-R approach: look up every DT10 code in dt2$code with match() and add
# the result as a logical column via `$<-`.
# Returns the table with the new `in_dt2` column.
m0 <- function() {
    # Position of each code in dt2$code, or 0L when it is absent.
    pos_in_dt2 <- match(DT10$code, dt2$code, nomatch = 0L)
    DT10$in_dt2 <- pos_in_dt2 > 0L
    DT10
}

# Join approach: inner-join dt2 onto DT11 by `code`, then flag the DT11 rows
# whose dt1row appears in the join result.
# Keys DT11 and adds `in_dt2` by reference.
m1 <- function() {
    setkeyv(DT11, "code")
    # mult = "first" keeps one dt2 match per DT11 row; nomatch = 0L drops
    # DT11 rows without any match.
    matched <- dt2[DT11, on = "code", nomatch = 0L, mult = "first"]
    DT11[, in_dt2 := dt1row %in% matched[["dt1row"]]]
}

# match() evaluated inside data.table's `:=`, so `in_dt2` is added to DT12
# by reference.
m2 <- function() {
    dt2_codes <- dt2[["code"]]
    DT12[, in_dt2 := match(code, dt2_codes, nomatch = 0L) > 0L]
}

# Update-join approach: default `in_dt2` to FALSE, then set it to TRUE for the
# DT13 rows whose code is found in dt2.
# Keys DT13 and adds `in_dt2` by reference.
m_thelatemail <- function() {
    setkeyv(DT13, "code")
    DT13[, in_dt2 := FALSE]
    DT13[dt2, on = "code", in_dt2 := TRUE]
}

# Time the four approaches. check=FALSE turns off bench::mark's comparison of
# results: the returned tables differ in row order, because m1 and
# m_thelatemail key theirs by `code`.
# NOTE(review): setkey() and := act by reference on DT11/DT13, so iterations
# after the first may skip the sort -- confirm whether that skews the timings.
bench::mark(m0(), m1(), m2(), m_thelatemail(), check=FALSE)
# Sanity checks: put each table back in dt1row order and compare its flags
# with the base-R result from m0().
identical(DT11[order(dt1row), in_dt2], m0()$in_dt2)
identical(DT12[order(dt1row), in_dt2], m0()$in_dt2)
identical(DT13[order(dt1row), in_dt2], m0()$in_dt2)
# Recorded output:
#[1] TRUE

时间:
# A tibble: 4 x 13
  expression           min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result                   memory            time     gc
  <bch:expr>      <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>                   <list>            <list>   <list>
1 m0()               914ms    914ms      1.09    38.3MB     1.09     1     1      914ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [1 x 3]>
2 m1()               252ms    273ms      3.66    36.8MB     1.83     2     1      547ms <df[,3] [1,000,000 x 3]> <df[,3] [33 x 3]> <bch:tm> <tibble [2 x 3]>
3 m2()               198ms    252ms      4.14    23.1MB     2.76     3     2      724ms <df[,3] [1,000,000 x 3]> <df[,3] [10 x 3]> <bch:tm> <tibble [3 x 3]>
4 m_thelatemail()    148ms    158ms      6.38    15.4MB     0        4     0      627ms <df[,3] [1,000,000 x 3]> <df[,3] [28 x 3]> <bch:tm> <tibble [4 x 3]>
m0()的输出:
          dt1row  code in_dt2
      1:       1 nydga  FALSE
      2:       2 bwknr  FALSE
      3:       3 sauxj  FALSE
      4:       4 vnjgi  FALSE
      5:       5 ouein  FALSE
     ---
 999996:  999996 wiucs  FALSE
 999997:  999997 yqjrp  FALSE
 999998:  999998 elort  FALSE
 999999:  999999 asjyh  FALSE
1000000: 1000000 lmbjw  FALSE

数据:
library(data.table)
# Benchmark fixtures. The seed is fixed before any sample() call so the random
# codes are reproducible.
set.seed(0L)
nr <- 1e6
# One code of 5 distinct random lowercase letters per row. vapply() pins the
# result to a character vector (sapply()'s return type depends on its input),
# and seq_len() replaces c(1:n), which misbehaves when n is 0.
dt1 <- data.table(
    dt1row = seq_len(nr),
    code = vapply(
        seq_len(nr),
        function(i) paste(sample(letters, 5), collapse = ""),
        character(1)
    )
)
# dt2 is half the size of dt1 and is generated after it, so the draw order
# matches the original script.
dt2 <- data.table(
    dt2row = seq_len(nr / 2),
    code = vapply(
        seq_len(nr / 2),
        function(i) paste(sample(letters, 5), collapse = ""),
        character(1)
    )
)
# Each benchmarked function gets its own copy, because several of them modify
# their table by reference.
DT10 <- copy(dt1)
DT11 <- copy(dt1)
DT12 <- copy(dt1)
DT13 <- copy(dt1)
# Key the lookup table once, outside the timed code.
setkey(dt2, code)

建议:使用 sample 随机生成数据时,先调用 set.seed,以保证结果可复现。

08-24 12:57