我有一个包含珊瑚测量值的数据集。在每次测量的同时,还收集了其他元数据,包括实验模块上菌落的位置或“位置”。我试图将数据框中的“位置”列分为水平和垂直两个部分。每个位置代码都是一个字母数字条目,其中字母代表列(A-D),数字部分代表行(1-4)。

在许多情况下,珊瑚位于下一行(例如A1_2)或下一列(例如A_B1)的边缘,从而条目的格式从字母和数字变为一个字母和两个字母数字或两个字母和一个数字。

d <- structure(list(`Module #` = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("111", "112", "113", "114", "115",
"116", "211", "212", "213", "214", "215", "216"), class = "factor"),
    Side = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
    ), .Label = c("N", "S", "T"), class = "factor"), TimeStep = c(4L,
    4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), Location = c("A1", "A1_2",
    "A2", "A3", "A3_4", "A4", "B_C3", "B1", "B1_2", "B2"), Date = structure(c(NA_real_,
    NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
    NA_real_, NA_real_, NA_real_), class = "Date"), Year = structure(c(1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("17", "18"
    ), class = "factor"), Site = structure(c(NA_integer_, NA_integer_,
    NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
    NA_integer_, NA_integer_, NA_integer_), .Label = c("HAN",
    "WAI"), class = "factor"), Treatment = c(NA_character_, NA_character_,
    NA_character_, NA_character_, NA_character_, NA_character_,
    NA_character_, NA_character_, NA_character_, NA_character_
    ), recruits = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Site_long = structure(c(2L,
    2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Hanauma Bay",
    "Waikiki"), class = "factor"), Shelter = structure(c(1L,
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("High", "Low"
    ), class = "factor")), row.names = c(NA, 10L), class = "data.frame")

head(d)


我想最后得到一个带有2个新列的数据框:一个名为“ Column”的列和一个名为“ Row”的列。 “列”是指“位置”中代码的字母部分,“行”是指有编号的部分。请注意,每个列的值应为1或3个字符(例如,对于A1_2,Column = A;对于A_B1,Column = A_B)。

最佳答案

我们可以使用str_extract分别提取值

library(tidyverse)
d %>%
  mutate(Column = str_extract(Location, "[A-Z]_?[A-Z]?"),
         Row = str_extract(Location, "[0-9]_?[0-9]?")) %>%
  select(Location, Column, Row)

#   Location Column Row
#1        A1      A   1
#2      A1_2      A 1_2
#3        A2      A   2
#4        A3      A   3
#5      A3_4      A 3_4
#6        A4      A   4
#7      B_C3    B_C   3
#8        B1      B   1
#9      B1_2      B 1_2
#10       B2      B   2


或者使用tidyr::extract将一个正则表达式中的列分开

d %>%
   extract(Location, into = c("Column", "Row"),
           regex = "([A-Z]_?[A-Z]?)([0-9]_?[0-9]?)")




我们可以使用基数R sub来使用类似的正则表达式提取值

d$Column <- sub("([A-Z]_?[A-Z]?).*", "\\1", d$Location)
d$Row <- sub("[A-Z]_?[A-Z]?([0-9]_?[0-9]?)", "\\1", d$Location)

08-16 07:10