问题描述
我有一个像这样的数据框:
# Example data in long format: one row per (id, expertise) pair.
# String quotes, argument names and balanced parentheses are restored so the
# definition parses; the class vector and the readr-style `spec` attribute
# mirror the dput() output this snippet was taken from.
df <- structure(
  list(
    id = c("A", "A", "A", "B", "B", "C", "C", "D", "D", "E", "E"),
    expertise = c(
      "r", "python", "julia", "python", "r", "python",
      "julia", "python", "julia", "r", "julia"
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -11L),
  .Names = c("id", "expertise"),
  spec = structure(
    list(
      cols = structure(
        list(
          id = structure(list(), class = c("collector_character", "collector")),
          expertise = structure(
            list(),
            class = c("collector_character", "collector")
          )
        ),
        .Names = c("id", "expertise")
      ),
      default = structure(list(), class = c("collector_guess", "collector"))
    ),
    .Names = c("cols", "default"),
    class = "col_spec"
  )
)
df
id expertise
1 A r
2 A python
3 A julia
4 B python
5 B r
6 C python
7 C julia
8 D python
9 D julia
10 E r
11 E julia
我可以使用以下方法获得专业知识的总数:
library(dplyr)
# Add a per-row column holding how many rows share that row's expertise value.
df %>%
  group_by(expertise) %>%
  mutate(counts_overall = n())
但是我想要的是 expertise 值组合的计数。换句话说,有多少个 "id" 具有相同的两种 expertise 组合,例如 "r" 和 "julia"?
这是所需的输出:
# Desired result: one row per unordered pair of expertise values together with
# the number of ids that have both. String quotes, argument names and balanced
# parentheses are restored so the definition parses.
df_out <- structure(
  list(
    expertise1 = c("r", "r", "python"),
    expertise2 = c("python", "julia", "julia"),
    count = c(2L, 2L, 3L)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -3L),
  .Names = c("expertise1", "expertise2", "count"),
  spec = structure(
    list(
      cols = structure(
        list(
          expertise1 = structure(
            list(),
            class = c("collector_character", "collector")
          ),
          expertise2 = structure(
            list(),
            class = c("collector_character", "collector")
          ),
          count = structure(list(), class = c("collector_integer", "collector"))
        ),
        .Names = c("expertise1", "expertise2", "count")
      ),
      default = structure(list(), class = c("collector_guess", "collector"))
    ),
    .Names = c("cols", "default"),
    class = "col_spec"
  )
)
df_out
expertise1 expertise2 count
1 r python 2
2 r julia 2
3 python julia 3
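来自 latemail 评论的链接答案创建了一个矩阵
crossprod(table(df) > 0)
而提问者期望得到的是长格式的数据框。
1)交叉连接
下面是一个使用 CJ()(交叉连接)函数的 data.table 解决方案:
library(data.table)
setDT(df)[, CJ(V1 = expertise, V2 = expertise)[V1 < V2], by = id][
  , .N, by = .(expertise1 = V1, expertise2 = V2)]
CJ(V1 = expertise, V2 = expertise)[V1 < V2] 是 data.table 中与 t(combn(df$expertise, 2)) 或 combinat::combn2(df$expertise) 等价的写法。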
2)自加入
这里是使用 self-join 的另一个变体:
library(data.table)
# Self-join df to itself on id so each id yields every ordered pair of its
# expertise values, keep one orientation of each pair (first < second), and
# count how often each pair occurs. Identifiers, `by =` and the quoted join
# column are restored so the call parses; allow.cartesian is written in full
# because the joined result is larger than the two inputs combined.
setDT(df)[df, on = "id", allow.cartesian = TRUE][
  expertise < i.expertise,
  .N,
  by = .(expertise1 = expertise, expertise2 = i.expertise)
]
I have a dataframe like so:
# Example data in long format: one row per (id, expertise) pair.
# The class vector and the readr-style `spec` attribute are reproduced so the
# object carries the same attributes as the dput() output.
df <- structure(
  list(
    id = rep(c("A", "B", "C", "D", "E"), times = c(3L, 2L, 2L, 2L, 2L)),
    expertise = c(
      "r", "python", "julia", # A
      "python", "r",          # B
      "python", "julia",      # C
      "python", "julia",      # D
      "r", "julia"            # E
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -11L),
  names = c("id", "expertise"),
  spec = structure(
    list(
      cols = structure(
        list(
          id = structure(list(), class = c("collector_character", "collector")),
          expertise = structure(
            list(),
            class = c("collector_character", "collector")
          )
        ),
        names = c("id", "expertise")
      ),
      default = structure(list(), class = c("collector_guess", "collector"))
    ),
    names = c("cols", "default"),
    class = "col_spec"
  )
)
df
id expertise
1 A r
2 A python
3 A julia
4 B python
5 B r
6 C python
7 C julia
8 D python
9 D julia
10 E r
11 E julia
I can get the overall counts of "expertise" by using:
library(dplyr)
# Add a per-row column holding how many rows share that row's expertise value.
df %>%
  group_by(expertise) %>%
  mutate(counts_overall = n())
However, what I want is the counts for combinations of expertise values. In other words, how many "id" values had the same combination of two expertise values, e.g. "r" and "julia"? Here is the desired output:
# Desired result: one row per unordered pair of expertise values together with
# the number of ids that have both. Attributes mirror the dput() output.
df_out <- structure(
  list(
    expertise1 = c("r", "r", "python"),
    expertise2 = c("python", "julia", "julia"),
    count = c(2L, 2L, 3L)
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -3L),
  names = c("expertise1", "expertise2", "count"),
  spec = structure(
    list(
      cols = structure(
        list(
          expertise1 = structure(
            list(),
            class = c("collector_character", "collector")
          ),
          expertise2 = structure(
            list(),
            class = c("collector_character", "collector")
          ),
          count = structure(list(), class = c("collector_integer", "collector"))
        ),
        names = c("expertise1", "expertise2", "count")
      ),
      default = structure(list(), class = c("collector_guess", "collector"))
    ),
    names = c("cols", "default"),
    class = "col_spec"
  )
)
df_out
expertise1 expertise2 count
1 r python 2
2 r julia 2
3 python julia 3
The linked answer from latemail's comment creates a matrix
crossprod(table(df) > 0)
while the OP expects a dataframe in long format.
1) cross join
Below is a data.table
solution which uses the CJ()
(cross join) function:
library(data.table)
# Per id, cross-join the expertise values with themselves, keep one orientation
# of each pair (V1 < V2, which also drops self-pairs), then count how many
# times each pair occurs across ids.
# The CJ() columns are named explicitly: newer data.table releases auto-name
# CJ() inputs from the supplied expressions, so the implicit V1/V2 column
# names cannot be relied on.
setDT(df)[
  , CJ(V1 = expertise, V2 = expertise)[V1 < V2],
  by = id
][
  , .N,
  by = .(expertise1 = V1, expertise2 = V2)
]
CJ(expertise, expertise)[V1 < V2]
is the data.table
equivalent for t(combn(df$expertise, 2))
or combinat::combn2(df$expertise)
.
2) self-join
Here is another variant which uses a self-join:
library(data.table)
# Self-join df to itself on id so each id yields every ordered pair of its
# expertise values, keep one orientation of each pair (first < second), and
# count how often each pair occurs.
# allow.cartesian is written in full rather than relying on partial argument
# matching; it is needed because the joined result is larger than the two
# inputs combined.
setDT(df)[df, on = "id", allow.cartesian = TRUE][
  expertise < i.expertise,
  .N,
  by = .(expertise1 = expertise, expertise2 = i.expertise)
]
这篇关于数据帧R中值的组合计数的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!