本文介绍了dplyr和多个线性模型的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!
问题描述
限时删除!!
我试图使用 dplyr 为每个个体 id(cusip)和年份(fyear)运行大量回归,但我不确定如何使用 summary 函数。我需要运行模型,提取系数,将它们相加,并用 mutate 把结果存入另一个变量 beta。下面是一些代码,它无法运行,但可以说明我的思路。
可重现的例子:
tdata <- structure(list(cusip = c("02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "01650910", "01650910", "01650910",
"01650910", "01650910", "01650910"), fyear = c(1979L, 1979L,
1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L,
1979L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1965L, 1965L, 1965L, 1965L, 1965L,
1965L), ret = c("0.000000", "0.000000", "0.111111", "-0.063636",
"0.203883", "0.032258", "0.078125", "0.000000", "-0.014493",
"-0.014706", "0.044776", "0.457143", "0.039216", "-0.009434",
"-0.200000", "-0.047619", "0.100000", "0.022727", "0.144444",
"0.067961", "-0.009091", "0.009174", "0.109091", "-0.077869",
"0.418182", "-0.089744", "0.014085", "-0.041667", "-0.086957",
"0.000000"), vwretd = c(0.049489, -0.026766, 0.065618, 0.008522,
-0.013576, 0.04685, 0.014991, 0.064728, 0.001428, -0.07266, 0.063603,
0.028212, 0.065607, 0.001015, -0.120224, 0.052288, 0.06009, 0.037714,
0.069438, 0.023553, 0.029498, 0.020093, 0.104951, -0.034409,
0.038646, 0.006946, -0.009715, 0.033652, -0.00435, -0.051868),
date = c(19790131L, 19790228L, 19790330L, 19790430L, 19790531L,
19790629L, 19790731L, 19790831L, 19790928L, 19791031L, 19791130L,
19791231L, 19800131L, 19800229L, 19800331L, 19800430L, 19800530L,
19800630L, 19800731L, 19800829L, 19800930L, 19801031L, 19801128L,
19801231L, 19650129L, 19650226L, 19650331L, 19650430L, 19650528L,
19650630L)), .Names = c("cusip", "fyear", "ret", "vwretd",
"date"), row.names = c(NA, 30L), class = "data.frame")
dplyr 代码:
test <- tdata %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date) %>%
  summary(fm <- lm(ret ~ vwretd + lag(vwretd), data = tdata)) %>%
  mutate(beta <- summary(fm)$coefficients[2,1] + summary(fm)$coefficients[3,1])
编辑:
示例数据: https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0
完整样本: https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0
解决方案
我们可以使用 do
library(dplyr)
tdata %>%
group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  do({fm <- lm(ret~vwretd+lag(vwretd), data=.)
      data.frame(., beta=summary(fm)$coefficients[2,1]+
                   summary(fm)$coefficients[3,1])})
我们还可以把 do 中的 data.frame(., beta=....) 改为
--- %>%
  do({fm <- lm(ret~vwretd+lag(vwretd), data=.)
      data.frame(., beta=sum(coef(fm)[-1]))})
更新
如果存在只有单个观测值的分组组合,这将为 'beta' 返回 NA
tdata1 <- read.csv('tdata.csv', stringsAsFactors=FALSE)
res <- tdata1 %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  mutate(n=n()) %>%
  do(data.frame(., beta=ifelse(.$n > 1, sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))
as.data.frame(res)[1:3, c('date', 'cusip', 'ret','vwretd', 'beta')]
#      date    cusip       ret    vwretd   beta
#1 19691231 00080010 -0.012594 -0.019681 0.7932
#2 19691128 00080010  0.001995 -0.032164 0.7932
#3 19691031 00080010  0.113889  0.055638 0.7932
Update2
在完整数据集上
tdata2 <- read.csv('tdatafull.csv', stringsAsFactors=FALSE)
tdata2$ret <- as.numeric(tdata2$ret)
res1 <- tdata2 %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  mutate(n=n()) %>%
  do(data.frame(., beta=ifelse(.$n > 2, sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))
head(res1)
#       X    cusip fyear       ret    vwretd     date  n     beta
#1 728188 00003210  1973  0.000000  0.011425 19731231 12 2.751094
#2 728187 00003210  1973 -0.300000 -0.120703 19731130 12 2.751094
#3 728186 00003210  1973 -0.166667 -0.000427 19731031 12 2.751094
#4 728185 00003210  1973  0.043478  0.053937 19730928 12 2.751094
#5 728184 00003210  1973 -0.258065 -0.029648 19730831 12 2.751094
#6 728183 00003210  1973  0.291667  0.056954 19730731 12 2.751094
dim(tdata2)
#[1] 898657      6
sum(is.na(res1$beta))
#[1] 461
I'm trying to run a large number of regressions using dplyr for individual id (cusip) and year (fyear) but I'm not sure how to utilize the summary function. I need to run the model, grab coefficients, add them together, and mutate the result to another variable beta.
Here's some code, that isn't working, but gets at the idea.
Reproducible example :
tdata <- structure(list(cusip = c("02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
"02136810", "02136810", "02136810", "01650910", "01650910", "01650910",
"01650910", "01650910", "01650910"), fyear = c(1979L, 1979L,
1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L,
1979L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L, 1965L, 1965L, 1965L, 1965L, 1965L,
1965L), ret = c("0.000000", "0.000000", "0.111111", "-0.063636",
"0.203883", "0.032258", "0.078125", "0.000000", "-0.014493",
"-0.014706", "0.044776", "0.457143", "0.039216", "-0.009434",
"-0.200000", "-0.047619", "0.100000", "0.022727", "0.144444",
"0.067961", "-0.009091", "0.009174", "0.109091", "-0.077869",
"0.418182", "-0.089744", "0.014085", "-0.041667", "-0.086957",
"0.000000"), vwretd = c(0.049489, -0.026766, 0.065618, 0.008522,
-0.013576, 0.04685, 0.014991, 0.064728, 0.001428, -0.07266, 0.063603,
0.028212, 0.065607, 0.001015, -0.120224, 0.052288, 0.06009, 0.037714,
0.069438, 0.023553, 0.029498, 0.020093, 0.104951, -0.034409,
0.038646, 0.006946, -0.009715, 0.033652, -0.00435, -0.051868),
date = c(19790131L, 19790228L, 19790330L, 19790430L, 19790531L,
19790629L, 19790731L, 19790831L, 19790928L, 19791031L, 19791130L,
19791231L, 19800131L, 19800229L, 19800331L, 19800430L, 19800530L,
19800630L, 19800731L, 19800829L, 19800930L, 19801031L, 19801128L,
19801231L, 19650129L, 19650226L, 19650331L, 19650430L, 19650528L,
19650630L)), .Names = c("cusip", "fyear", "ret", "vwretd",
"date"), row.names = c(NA, 30L), class = "data.frame")
dplyr code :
test <- tdata %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date) %>%
  summary(fm <- lm(ret ~ vwretd + lag(vwretd), data = tdata)) %>%
  mutate(beta <-
summary(fm)$coefficients[2,1] + summary(fm)$coefficients[3,1])
Edit :
Sample Data : https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0
Full Sample : https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0
解决方案
We could use do
library(dplyr)
tdata %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  do({fm <- lm(ret~vwretd+lag(vwretd), data=.)
      data.frame(., beta=summary(fm)$coefficients[2,1]+
                   summary(fm)$coefficients[3,1])})
We could also change the data.frame(., beta=....) in the do to
--- %>%
  do({fm <- lm(ret~vwretd+lag(vwretd), data=.)
      data.frame(., beta=sum(coef(fm)[-1]))})
Update
If there are group combinations with a single observation, this will return NA for 'beta'
tdata1 <- read.csv('tdata.csv', stringsAsFactors=FALSE)
res <- tdata1 %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  mutate(n=n()) %>%
  do(data.frame(., beta=ifelse(.$n > 1, sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))
as.data.frame(res)[1:3, c('date', 'cusip', 'ret','vwretd', 'beta')]
#      date    cusip       ret    vwretd   beta
#1 19691231 00080010 -0.012594 -0.019681 0.7932
#2 19691128 00080010  0.001995 -0.032164 0.7932
#3 19691031 00080010  0.113889  0.055638 0.7932
Update2
On the full dataset
tdata2 <- read.csv('tdatafull.csv', stringsAsFactors=FALSE)
tdata2$ret <- as.numeric(tdata2$ret)
res1 <- tdata2 %>%
  group_by(cusip, fyear) %>%
  arrange(desc(date)) %>%
  mutate(n=n()) %>%
  do(data.frame(., beta=ifelse(.$n > 2, sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))
head(res1)
#       X    cusip fyear       ret    vwretd     date  n     beta
#1 728188 00003210  1973  0.000000  0.011425 19731231 12 2.751094
#2 728187 00003210  1973 -0.300000 -0.120703 19731130 12 2.751094
#3 728186 00003210  1973 -0.166667 -0.000427 19731031 12 2.751094
#4 728185 00003210  1973  0.043478  0.053937 19730928 12 2.751094
#5 728184 00003210  1973 -0.258065 -0.029648 19730831 12 2.751094
#6 728183 00003210  1973  0.291667  0.056954 19730731 12 2.751094
dim(tdata2)
#[1] 898657      6
sum(is.na(res1$beta))
#[1] 461
这篇关于dplyr和多个线性模型的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持! 1403页,肝出来的.. 09-08 10:43