我有一个具有以下结构的数据框:
> dftest
element seqnames start end width strand tx_id tx_name
1 1 chr19 58858172 58864865 6694 - 36769 NM_130786
2 10 chr8 18248755 18258723 9969 + 16614 NM_000015
3 100 chr20 43248163 43280376 32214 - 37719 NM_000022
4 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
5 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
6 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
1316 100302285 chr12 12264886 12264967 82 + 24050 NR_036052
1317 100302285 chr12 9392066 9392147 82 - 25034 NR_036052
1318 100302285 chr2 232578024 232578105 82 + 5491 NR_036052
1319 100302285 chr5 118310281 118310362 82 + 11128 NR_036052
作为中间步骤,我想去掉那些多次出现且“seqnames”不一致的元素,例如“100302285”。元素“10000”的所有“seqnames”都相同,因此应当保留。只出现一次的元素同样保留。这是所需的输出:
> dftest
element seqnames start end width strand tx_id tx_name
1 1 chr19 58858172 58864865 6694 - 36769 NM_130786
2 10 chr8 18248755 18258723 9969 + 16614 NM_000015
3 100 chr20 43248163 43280376 32214 - 37719 NM_000022
4 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
5 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
6 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
到目前为止,我尝试用 ddply 加一个自定义函数来保留重复项:
# Attempted helper: keeps only the rows of `df` whose seqnames value already
# appeared in an earlier row. duplicated() is FALSE for the first occurrence
# of each value, so that first row is always dropped and a group with a single
# row yields nothing.
subChr <- function(df)
{
df[duplicated(df$seqnames),]
}
# Apply subChr separately to each set of rows sharing the same `element`.
ddply(df, .(element), subChr)
但结果与预期相去甚远——是我想得太简单了,事情可能没那么容易:
element seqnames start end width strand tx_id tx_name
1 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
2 100302285 chr12 9392066 9392147 82 - 25034 NR_036052
由于这只是一个中间步骤,我也很乐意接受能直接得到如下结果的替代方案:
# One output row per element: first seqnames and strand of the group,
# smallest start and largest end across all of the group's rows.
ddply(df, .(element), summarize, chromosome=seqnames[1], gene_start=min(start), gene_end=max(end), strand=strand[1])
element chromosome gene_start gene_end strand
1 1 chr19 58858172 58864865 -
2 10 chr8 18248755 18258723 +
3 100 chr20 43248163 43280376 -
4 1000 chr18 25530930 25757445 -
5 10000 chr1 243651535 244006584 -
6 100302285 chr12 9392066 232578105 +
但希望元素“100302285”能按每个“seqnames”分别汇总,即得到:
element chromosome gene_start gene_end strand
1 1 chr19 58858172 58864865 -
2 10 chr8 18248755 18258723 +
3 100 chr20 43248163 43280376 -
4 1000 chr18 25530930 25757445 -
5 10000 chr1 243651535 244006584 -
6 100302285 chr12 9392066 12264967 +
7 100302285 chr2 232578024 232578105 +
8 100302285 chr5 118310281 118310362 +
也就是说,如果可行的话,基本上是同时按 element 和 seqnames 进行汇总。我一直在寻找答案,但进展不大。
测试数据:
# Reproducible test data: ten transcript rows covering six distinct elements.
# Identifier columns are kept as character vectors, coordinates as integers.
dftest <- data.frame(
  element = c("1", "10", "100", "1000", "10000", "10000",
              "100302285", "100302285", "100302285", "100302285"),
  seqnames = c("chr19", "chr8", "chr20", "chr18", "chr1", "chr1",
               "chr12", "chr12", "chr2", "chr5"),
  start = c(58858172L, 18248755L, 43248163L, 25530930L, 243651535L,
            243663021L, 12264886L, 9392066L, 232578024L, 118310281L),
  end = c(58864865L, 18258723L, 43280376L, 25757445L, 244006584L,
          244006584L, 12264967L, 9392147L, 232578105L, 118310362L),
  width = c(6694L, 9969L, 32214L, 226516L, 355050L, 343564L,
            82L, 82L, 82L, 82L),
  strand = c("-", "+", "-", "-", "-", "-", "+", "-", "+", "+"),
  tx_id = c(36769L, 16614L, 37719L, 33839L, 4182L, 4183L,
            24050L, 25034L, 5491L, 11128L),
  tx_name = c("NM_130786", "NM_000015", "NM_000022", "NM_001792",
              "NM_181690", "NM_005465", "NR_036052", "NR_036052",
              "NR_036052", "NR_036052"),
  stringsAsFactors = FALSE
)
最佳答案
回答您的第一个问题:如果您愿意,这是一个 data.table 解决方案:
# Keep only the elements whose rows all share one seqnames value.
# library() stops with an error when the package is missing, whereas require()
# merely returns FALSE and lets the script fail later on.
library(data.table)
dt <- data.table(dftest, key = "element")
# Per element group, the length-one logical inside .SD[...] keeps every row of
# the group when TRUE and none when FALSE.
# unique() counts the values actually present in the group; table() would also
# count unused factor levels, so a factor seqnames column would make every
# group look mixed. Note that unique() treats NA as a value of its own.
dt.out <- dt[, .SD[length(unique(seqnames)) == 1L], by = "element"]
> dt.out
# element seqnames start end width strand tx_id tx_name
# 1: 1 chr19 58858172 58864865 6694 - 36769 NM_130786
# 2: 10 chr8 18248755 18258723 9969 + 16614 NM_000015
# 3: 100 chr20 43248163 43280376 32214 - 37719 NM_000022
# 4: 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
# 5: 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
# 6: 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
如果您更喜欢 plyr 解决方案:
require(plyr)
# For each element, return its rows unchanged when they all share one seqnames
# value; otherwise return NULL so that ddply contributes no rows for the group.
out <- ddply(dftest, .(element), function(x) {
  # unique() counts the values present in this group; table() would also count
  # unused factor levels if seqnames were a factor. NA counts as its own value.
  if (length(unique(x$seqnames)) == 1L) {
    x
  } else {
    NULL
  }
})
# element seqnames start end width strand tx_id tx_name
# 1 1 chr19 58858172 58864865 6694 - 36769 NM_130786
# 2 10 chr8 18248755 18258723 9969 + 16614 NM_000015
# 3 100 chr20 43248163 43280376 32214 - 37719 NM_000022
# 4 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
# 5 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
# 6 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
编辑:对于第二个问题,基本上,除了旧的解决方案之外,您只想在不满足第一个条件时返回第一行。
plyr 解决方案(无 summarise):
out <- ddply(dftest, .(element), function(x) {
# Keep the whole group when all of its seqnames agree; otherwise keep only the
# group's first row. unique() counts the values present in the group, whereas
# table() would also count unused factor levels. NA counts as its own value.
if (length(unique(x$seqnames)) == 1L) {
x
} else {
x[1, ]
}
})
> out
# element seqnames start end width strand tx_id tx_name
# 1 1 chr19 58858172 58864865 6694 - 36769 NM_130786
# 2 10 chr8 18248755 18258723 9969 + 16614 NM_000015
# 3 100 chr20 43248163 43280376 32214 - 37719 NM_000022
# 4 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
# 5 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
# 6 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
# 7 100302285 chr12 12264886 12264967 82 + 24050 NR_036052
data.table 解决方案:
dt <- data.table(dftest, key="element")
# Per element: keep every row when the group has a single seqnames value,
# otherwise keep only the group's first row. The result is stored in dt.out so
# that printing dt.out shows this result rather than an earlier one.
# unique() counts the values present in the group; table() would also count
# unused factor levels. NA counts as its own value.
dt.out <- dt[, .SD[if (length(unique(seqnames)) == 1L) seq_len(.N) else 1L], by = element]
> dt.out
# element seqnames start end width strand tx_id tx_name
# 1: 1 chr19 58858172 58864865 6694 - 36769 NM_130786
# 2: 10 chr8 18248755 18258723 9969 + 16614 NM_000015
# 3: 100 chr20 43248163 43280376 32214 - 37719 NM_000022
# 4: 1000 chr18 25530930 25757445 226516 - 33839 NM_001792
# 5: 10000 chr1 243651535 244006584 355050 - 4182 NM_181690
# 6: 10000 chr1 243663021 244006584 343564 - 4183 NM_005465
# 7: 100302285 chr12 12264886 12264967 82 + 24050 NR_036052