r - left_join用于tbl : na_matches not working

left_join在tibble或数据帧上可以像预期的那样匹配NA值，但是在tbl上，即使使用选项na_matches = "na"，它似乎也不匹配NA。

R版本和软件包版本

> sessionInfo()
R version 3.6.1 (2019-07-05)
Platform: x86_64-apple-darwin18.6.0 (64-bit)
Running under: macOS Mojave 10.14.6
...
other attached packages:
 [1] reprex_0.3.0    dbplyr_1.4.2    lubridate_1.7.4 magrittr_1.5    forcats_0.4.0   stringr_1.4.0   dplyr_0.8.1     purrr_0.3.2     readr_1.3.1
[10] tidyr_0.8.3     tibble_2.1.3    ggplot2_3.2.0   tidyverse_1.2.1
...

以下是使用SQLite的可复现示例(reprex)，但PostgreSQL也是如此(实际上我是在PostgreSQL数据库上偶然发现这个问题的)。

最小可复现示例。

(1)我创建2个数据帧，将它们本地复制到SQLite DB，然后再次将它们作为tbl加载。

library(tidyverse)
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
df_1 <- tibble(A = c("a", "aa"), B = c("b", "bb"), D = c("d", NA))
df_2 <- tibble(A = c("a", "aa"), C = c("c", "cc"), D = c("d", NA))
copy_to(con, df_1, overwrite = T)
copy_to(con, df_2, overwrite = T)
dt_1 <- tbl(con, "df_1")
dt_2 <- tbl(con, "df_2")

df_1
#> # A tibble: 2 x 3
#>   A     B     D
#>   <chr> <chr> <chr>
#> 1 a     b     d
#> 2 aa    bb    <NA>

df_2
#> # A tibble: 2 x 3
#>   A     C     D
#>   <chr> <chr> <chr>
#> 1 a     c     d
#> 2 aa    cc    <NA>

dt_1
#> # Source:   table<df_1> [?? x 3]
#> # Database: sqlite 3.29.0 [:memory:]
#>   A     B     D
#>   <chr> <chr> <chr>
#> 1 a     b     d
#> 2 aa    bb    <NA>

dt_2
#> # Source:   table<df_2> [?? x 3]
#> # Database: sqlite 3.29.0 [:memory:]
#>   A     C     D
#>   <chr> <chr> <chr>
#> 1 a     c     d
#> 2 aa    cc    <NA>

(2)然后我首先在数据帧上使用left_join，然后在tbls上使用:

left_join(df_1, df_2)
#> Joining, by = c("A", "D")
#> # A tibble: 2 x 4
#>   A     B     D     C
#>   <chr> <chr> <chr> <chr>
#> 1 a     b     d     c
#> 2 aa    bb    <NA>  cc

left_join(dt_1, dt_2, na_matches = "na")
#> Joining, by = c("A", "D")
#> # Source:   lazy query [?? x 4]
#> # Database: sqlite 3.29.0 [:memory:]
#>   A     B     D     C
#>   <chr> <chr> <chr> <chr>
#> 1 a     b     d     c
#> 2 aa    bb    <NA>  <NA>

我们可以看到在数据帧的情况下，第二行最后一列C具有预期的cc(默认为na_matches = "na")，但在tbl情况下为<NA>，即使使用显式选项na_matches = "na"(根据文档为默认值)。 这是意外的。

编辑

注意，这与使用na_matches = "never"的数据帧的结果相同:


left_join(df_1, df_2, na_matches = "never")
#> Joining, by = c("A", "D")
#> # A tibble: 2 x 4
#>   A     B     D     C
#>   <chr> <chr> <chr> <chr>
#> 1 a     b     d     c
#> 2 aa    bb    <NA>  <NA>

顺便说一句，标题提到left_join是因为它是最常见的联接，但是inner_join也会出现相同的问题(数据表尚不支持full_join)；如果我们在两种情况下都保留na_matches = "na"，问题可能更明显:

inner_join(dt_1, dt_2, na_matches = "na")
#> Joining, by = c("A", "D")
#> # Source:   lazy query [?? x 4]
#> # Database: sqlite 3.29.0 [:memory:]
#>   A     B     D     C
#>   <chr> <chr> <chr> <chr>
#> 1 a     b     d     c
inner_join(df_1, df_2, na_matches = "na")
#> Joining, by = c("A", "D")
#> # A tibble: 2 x 4
#>   A     B     D     C
#>   <chr> <chr> <chr> <chr>
#> 1 a     b     d     c
#> 2 aa    bb    <NA>  cc

最佳答案

为了响应@philipxy在left_join进程中进一步挖掘的请求，我进入了left_join的 Debug模式，首先是在数据表上:

debug(left_join)
left_join(dt_1, dt_2, na_matches = "na")
#>  debugging in: left_join(dt_1, dt_2, na_matches = "na")
#>  debug: {
#>      UseMethod("left_join")
#>  }
Browse[2]>  n
#>  debug: UseMethod("left_join")
Browse[2]> n
#>  debugging in: left_join.tbl_lazy(dt_1, dt_2, na_matches = "na")
#>  debug: {
#>      add_op_join(x, y, "left", by = by, sql_on = sql_on, copy = copy,
#>          suffix = suffix, auto_index = auto_index, ...)
#>  }
Browse[3]>
#>  debug: add_op_join(x, y, "left", by = by, sql_on = sql_on, copy = copy,
#>      suffix = suffix, auto_index = auto_index, ...)
Browse[3]> s
#>  debugging in: add_op_join(x, y, "left", by = by, sql_on = sql_on, copy = copy,
#>      suffix = suffix, auto_index = auto_index, ...)
#>  debug: {
#>      if (!is.null(sql_on)) {
#>         by <- list(x = character(0), y = character(0), on = sql(sql_on))
#>      }
#>      else if (identical(type, "full") && identical(by, character())) {
#>          type <- "cross"
#>          by <- list(x = character(0), y = character(0))
#>      }
#>      else {
#>          by <- common_by(by, x, y)
#>      }
#>      y <- auto_copy(x, y, copy = copy, indexes = if (auto_index)
#>          list(by$y))
#>      vars <- join_vars(op_vars(x), op_vars(y), type = type, by = by,
#>          suffix = suffix)
#>      x$ops <- op_double("join", x, y, args = list(vars = vars,
#>          type = type, by = by, suffix = suffix))
#>      x
#>  }
Browse[4]> f
#>  Joining, by = c("A", "D")
#>  exiting from: add_op_join(x, y, "left", by = by, sql_on = sql_on, copy = copy,
#>      suffix = suffix, auto_index = auto_index, ...)
#>  exiting from: left_join.tbl_lazy(dt_1, dt_2, na_matches = "na")
#>  exiting from: left_join(dt_1, dt_2, na_matches = "na")
#>  # Source:   lazy query [?? x 4]
#>  # Database: sqlite 3.29.0 [:memory:]
#>    A     B     D     C
#>    <chr> <chr> <chr> <chr>
#>  1 a     b     d     c
#>  2 aa    bb    NA    NA

我们看到，在数据表上left_join会调用left_join.tbl_lazy，并带有选项na_matches = "na"。
但是，随后调用的是add_op_join，而该函数的定义中完全没有提及na_matches。

然后，相比之下，在数据帧上:

left_join(df_1, df_2)
#>  debugging in: left_join(df_1, df_2)
#>  debug: {
#>      UseMethod("left_join")
#>  }
Browse[2]> n
#>  debug: UseMethod("left_join")
Browse[2]>
#>  debugging in: left_join.tbl_df(df_1, df_2)
#>  debug: {
#>      check_valid_names(tbl_vars(x))
#>      check_valid_names(tbl_vars(y))
#>      by <- common_by(by, x, y)
#>      suffix <- check_suffix(suffix)
#>      na_matches <- check_na_matches(na_matches)
#>      y <- auto_copy(x, y, copy = copy)
#>      vars <- join_vars(tbl_vars(x), tbl_vars(y), by, suffix)
#>      by_x <- vars$idx$x$by
#>      by_y <- vars$idx$y$by
#>      aux_x <- vars$idx$x$aux
#>      aux_y <- vars$idx$y$aux
#>      out <- left_join_impl(x, y, by_x, by_y, aux_x, aux_y, na_matches,
#>          environment())
#>      names(out) <- vars$alias
#>      reconstruct_join(out, x, vars)
#>  }
Browse[3]>
#>  debug: check_valid_names(tbl_vars(x))
Browse[3]>
#>  debug: check_valid_names(tbl_vars(y))
Browse[3]>
#>  debug: by <- common_by(by, x, y)
Browse[3]>
#>  Joining, by = c("A", "D")
#>  debug: suffix <- check_suffix(suffix)
Browse[3]>
#>  debug: na_matches <- check_na_matches(na_matches)
Browse[3]>
#>  debug: y <- auto_copy(x, y, copy = copy)
Browse[3]> na_matches
#>  [1] TRUE
Browse[3]> f
#>  exiting from: left_join.tbl_df(df_1, df_2)
#>  exiting from: left_join(df_1, df_2)
#>  # A tibble: 2 x 4
#>    A     B     D     C
#>    <chr> <chr> <chr> <chr>
#>  1 a     b     d     c
#>  2 aa    bb    NA    cc

我们看到left_join在数据帧上调用left_join.tbl_df。再往下看，我们发现na_matches在作为参数传给left_join_impl之前已被设置为TRUE。所有这些都是有道理的。

键入?left_join.tbl_lazy时，会打开join.tbl_sql {dbplyr}的帮助页面，其中对未指定的参数(...)的说明是:

“其他传递给方法的参数，例如na_matches，用来控制NA值的匹配方式。有关更多信息，请参见join.tbl_df。”

点击进入join.tbl_df的文档链接后，可以看到其中明确提到了na_matches:

“使用'never'时总是将两个NA或NaN值视为不同，就像数据库源的联接那样，类似于merge(incomparables = FALSE)。默认值'na'总是将两个NA或NaN值视为相等，就像merge()那样。用户和包作者可以通过调用pkgconfig::set_config('dplyr::na_matches' = 'never')来更改默认行为。”

因此，文档和数据表的代码之间似乎存在一些不一致之处。

此外，@philipxy提到了这个NEWS链接，其中指出“要匹配NA值，请将na_matches = 'na'传递给连接动词；这仅支持数据帧”。现在，dt_1和df_1的类为:

class(df_1)
#>  [1] "tbl_df"     "tbl"        "data.frame"
class(dt_1)
#>  [1] "tbl_SQLiteConnection" "tbl_dbi"              "tbl_sql"
#>  [4] "tbl_lazy"             "tbl"

我想术语“数据帧”指的是data.frame和tbl_df类，我所谓的“数据表”是其他tbl_*，包括tbl_sql和tbl_lazy。因此，此新闻链接也回答了这个问题。

不过，我认为连接动词的当前文档令人困惑。它应明确指出:

“数据帧的默认值为na_matches = 'na'，数据表的默认值为na_matches = 'never'(没有其他选择)。”

希望在不久的将来，数据表也能支持na_matches = "na"。
关于r - left_join用于tbl : na_matches not working，我们在Stack Overflow上找到一个类似的问题：https://stackoverflow.com/questions/57734832/

Never

r - left_join用于tbl : na_matches not working