问题描述
我感兴趣的是跨一系列列进行字符串检测和值比较。如果在列中找到该字符串(在本例中为ZSD
),则需要比较它们在另一列中的相应值。
输入
我的输入如下:
a.zsd a.test b.zsd b.test c.zsd c.test d.zsd d.test
'ZSD' 0.0 'ZAD' 1.0 NA 0.5 'ZAD' 1.0
'ZAD' 1.0 NA 0.0 NA 0.5 'ZSD' 0.0
NA 0.5 NA 0.5 'ZAD' 0.5 NA 0.5
'Not Achieved ZSD' 0.0 NA 0.5 'ZAD' 0.5 NA 0.5
'ZSD' 1.0 'ZSD' 0.5 NA 0.5 'ZSD' 0.0
NA 0.0 NA 0.0 NA 0.5 NA 0.0
NA 1.0 'ZSD' 0.0 'ZSD' 0.5 'ZSD' 1.0
输出
在我的输出中,我需要另外两列smallest.test
和zsd.level
:
a.zsd a.test b.zsd b.test c.zsd c.test d.zsd d.test smallest.test zsd.level
'ZSD' 0.0 'ZAD' 1.0 NA 0.5 'ZAD' 1.0 0.0 a
'ZAD' 1.0 NA 0.0 NA 0.5 'ZSD' 0.0 0.0 d
NA 0.5 NA 0.5 'ZAD' 0.5 NA 0.5 0.0 NA
'Not Achieved ZSD' 0.0 NA 0.5 'ZAD' 0.5 NA 0.5 0.0 a
'ZSD' 1.0 'ZSD' 0.5 NA 0.5 'ZSD' 0.0 0.0 d
NA 0.0 NA 0.0 NA 0.5 NA 0.0 0.0 NA
NA 1.0 'ZSD' 0.0 'ZSD' 0.5 'ZSD' 1.0 0.0 b
信息:
我的数据框有一百多列。我只对名称以字符串.zsd
结尾的一些列感兴趣。这些列可以具有NA
或以下字符串值之一ZAD
、ZSD
、Not Achieved ZSD
。具有.zsd
字符串名称的每一列都有一个关联的.test
列。
要求
我希望在输出smallest.test
和zsd.level
中有两个新列。要求如下:迭代以字符串
.zsd
结尾的列名在这些列中检测字符串
ZSD
如果仅在其中一列中找到
ZSD
字符串,则在输出列zsd.level
中返回该列的名称,并从以.test
结尾的列名中返回要返回到输出列smallest.test
的相应值。如果所有列都不包含字符串
ZSD
,则在输出列zsd.level
中返回NA,在相应的输出列smallest.test
中返回0.0。如果多个列包含字符串
ZSD
,请选择相应.test
列中值最小的列,然后在输出中返回。如果多个列包含字符串
ZSD
,并且它们的值都与相应的.test
列相同,则为输出选择最后一个列名,并为输出选择.test
的相应值。
dput()
dput(df)
structure(list(a.zsd = c("ZSD", "ZAD", NA, "Not Achieved ZSD", "ZSD", NA, NA),
a.test = c(0, 1, 0.5, 0, 1, 0, 1),
b.zsd = c("ZAD", NA, NA, NA, "ZSD", NA, "ZSD"),
b.test = c(1, 0, 0.5, 0.5, 0.5, 0, 0),
c.zsd = c(NA, NA, "ZAD", "ZAD", NA, NA, "ZSD"),
c.test = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
d.zsd = c("ZAD", "ZSD", NA, NA, "ZSD", NA, "ZSD"),
d.test = c(1, 0, 0.5, 0.5, 0, 0, 1)),
class = "data.frame", row.names = c(NA, -7L))
部分解决方案
基于以下POST:String matching over multiple columns with specific string names,此代码可以迭代并选择.zsd
列,并在输出中返回最高的列名。但它没有考虑.test
字段的对应值。如有任何帮助,我们将不胜感激。
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(across(contains("zsd"), ~case_when(str_detect(., "ZSD") ~ cur_column()), .names = 'new_{col}')) %>%
unite(zsd_level, starts_with('new'), na.rm = TRUE, sep = ' ') %>%
mutate(zsd_level = str_remove_all(zsd_level, ".zsd"),
zsd_level = str_sub(zsd_level, -1))
推荐答案
这里有一个base
解决方案,它假定相应的".Test"和".zsd"列是相邻的。
第一块代码可用于此目的,但如果给定行在最小值中有关系,它将给出第一个实例。下面是一个可用于选择最小值的第一个或最后一个实例的函数。
# sample data
df <- structure(list(a.zsd = c("ZSD", "ZAD", NA, "Not Achieved ZSD", "ZSD", NA, NA),
a.test = c(0, 1, 0.5, 0, 1, 0, 1),
b.zsd = c("ZAD", NA, NA, NA, "ZSD", NA, "ZSD"),
b.test = c(1, 0, 0.5, 0.5, 0.5, 0, 0),
c.zsd = c(NA, NA, "ZAD", "ZAD", NA, NA, "ZSD"),
c.test = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
d.zsd = c("ZAD", "ZSD", NA, NA, "ZSD", NA, "ZSD"),
d.test = c(1, 0, 0.5, 0.5, 0, 0, 1)),
class = "data.frame", row.names = c(NA, -7L))
# select .zsd columns
zsd_cols <- grep(".zsd", names(df), value = TRUE)
zsd_df <- df[, zsd_cols]
# select .test columns
test_cols <- gsub("zsd", "test",zsd_cols)
test_df <- df[, test_cols]
# convert "Not Achieved ZSD" to "ZSD"
zsd_df[zsd_df == "Not Achieved ZSD" ] <- "ZSD"
# assign NA to non "ZSD" cells
zsd_df[zsd_df != "ZSD"] <- NA
# assign 999 test_df values whose corresponding zsd_df is NA
test_df[is.na(zsd_df)] <- 999
# return cols which hold minimum
nams <- names(test_df)[apply(test_df, 1 ,which.min)]
# scrub .test suffix
nams <- gsub(".test", "", nams)
# return mins
mins <- apply(test_df, 1 ,min)
# assign values less than 999 as smallest test, or zero
df$smallest.test <- ifelse(mins < 999, mins, 0)
# assign name if corresponding min less than 999 or NA
df$zsd_level <- ifelse(mins < 999, nams, NA)
> df
a.zsd a.test b.zsd b.test c.zsd c.test d.zsd d.test smallest.test zsd_level
1 ZSD 0.0 ZAD 1.0 <NA> 0.5 ZAD 1.0 0 a
2 ZAD 1.0 <NA> 0.0 <NA> 0.5 ZSD 0.0 0 d
3 <NA> 0.5 <NA> 0.5 ZAD 0.5 <NA> 0.5 0 <NA>
4 Not Achieved ZSD 0.0 <NA> 0.5 ZAD 0.5 <NA> 0.5 0 a
5 ZSD 1.0 ZSD 0.5 <NA> 0.5 ZSD 0.0 0 d
6 <NA> 0.0 <NA> 0.0 <NA> 0.5 <NA> 0.0 0 <NA>
7 <NA> 1.0 ZSD 0.0 ZSD 0.5 ZSD 1.0 0 b
编辑
相同的概念,但该函数可以在一行中出现平局时获取第一个或最后一个最小值:
# adjusted sample data
df <- data.frame(a.zsd = c("ZSD", "ZAD", NA,
"Not Achieved ZSD", "ZSD", NA, NA),
a.test = c(0, 1, 0.5, 0, 1, 0, 1),
b.zsd = c("ZAD", NA, NA, NA, "ZSD", NA, "ZSD"),
b.test = c(1, 0, 0.5, 0.5, 0.0, 0, 0), # adjusted
c.zsd = c(NA, NA, "ZAD", "ZAD", NA, NA, "ZSD"),
c.test = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
d.zsd = c("ZAD", "ZSD", NA, NA, "ZSD", NA, "ZSD"),
d.test = c(1, 0, 0.5, 0.5, 0, 0, 1))
appendMin <- function(df, last_min = TRUE){
# select .zsd columns
zsd_cols <- grep(".zsd", names(df), value = TRUE)
zsd_df <- df[, zsd_cols]
if(last_min) { zsd_df <- rev(zsd_df) } # for last min
# select .test columns
test_cols <- gsub("zsd", "test",zsd_cols)
test_df <- df[, test_cols]
if(last_min) { test_df <- rev(test_df) } # for last min
# convert "Not Achieved ZSD" to "ZSD"
zsd_df[zsd_df == "Not Achieved ZSD" ] <- "ZSD"
# assign NA to non "ZSD" cells
zsd_df[zsd_df != "ZSD" ] <- NA
# assign 999 test_df values whose corresponding zsd_df is NA
test_df[is.na(zsd_df)] <- 999
# return cols which hold the first minimum
nams <- names(test_df)[apply(test_df, 1 ,which.min)]
# scrub .test suffix
nams <- gsub(".test", "", nams)
# return mins
mins <- apply(test_df, 1 ,min)
# assign values less than 999 as smallest test, or zero
df$smallest.test <- ifelse(mins < 999, mins, 0)
# assign name if corresponding min less than 999 or NA
df$zsd_level <- ifelse(mins < 999, nams, NA)
return(df)
}
ties_first <- appendMin(df, last_min = FALSE)
ties_last <- appendMin(df, last_min = TRUE)
这篇关于在多个列上使用特定字符串进行字符串匹配,并在关联名称中进行值比较的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!