dplyr和多个线性模型

本文介绍了dplyr和多个线性模型的处理方法，对大家解决问题具有一定的参考价值，需要的朋友们下面随着小编来一起学习吧！问题描述限时删除！！我试图使用 dplyr 为个人id（cusip）和year（fyear）运行大量的回归，但我不知道如何使用总结功能。我需要运行模型，获取系数，将它们添加在一起，将 mutate 将结果转换为另一个变量 beta 。这是一些代码，这是不起作用的，但得到了这个想法。可重现的例子： tdata< - structure（list（cusip = c（02136810，02136810，02136810，02136810，02136810，02136810，02136810 02136810，02136810，02136810，02136810，02136810，02136810，02136810，02136810，02136810，02136810 02136810，02136810，02136810，02136810，02136810，02136810，02136810，01650910，01650910，01650910，01650910 ，01650910，01650910），fyear = c（1979L，1979L， 1979L，1979L，1979L，1979L，1979L，1979L，1979L，1979L，1979L， 1979L，1980L，1980L ，1980L，1980L，1980L，1980L，1980L，1980L， 1980L，1980L，1980L，1980L，1965L，1965L，1965L，1965L，1965L， 1965L），ret = c（0.000000 0.000000，0.111111，-0.063636，0.203883，0.032258，0.078125，0.000000，-0.014493，-0.014706，0.044776 ，0.457143，0.039216，-0.009434， -0.200000，-0.047619，0.100000，0.022727，0.144444，0.067961，-0.009091，0.009174，0.109091，-0.077869 ，0.418182，-0.089744，0.014085，-0.041667，-0.086957，0.000000），vwretd = c（0.049489,0.026766,0 0.065618,0.008522 ， -0.013576,0.04685,0.014991,0.064728,0.001428,0.027266,0.063603， 0.028212,0.065607,0.001015,0.120224,0.052288,0.06009,0.073144， 0.069438,0.023553,0.029498， 0.020093,0.034409， 0.038646,0.006946，-0.009715,0.033652，-0.00435，-0.051868）， date = c（19790131L，19790228L，19790330L，19790430L，19790531L， 19790629L ，19790731L，19790831L，19790928L，19791031L，19791130L， 19791231L，19800131L，19800229L，19800331L，19800430L，19800530L， 19800630L，19800731L，19800829L，19800930L，19801031L，19801128L， 19801231L，19650129L ，19650226L，19650331L，19650430L，19650528L， 19650630L）），.Names = c（cusip，fyear，ret vwretd，date），row.names = c（NA，30L），class =data.frame） dplyr代码： test< - tdata％>％ group_by（cusip，fyear）％>％ arrange（desc（date）％>％ summary（fm mutate（beta< - 摘要（fm）$系数[2,1] +汇总（fm）$系数[3,1]）$ b $ b 编辑：示例数据： https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0 完整示例： https：/ /www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0 解决方案我们可以使用 do library（dplyr） tdata％> ％ 
group_by（cusip，fyear）％>％安排（desc（date））％>％ do（{fm data.frame（。，beta = fm）$系数[2,1] + 摘要（fm）$系数[3,1]）}）我们还可以在 do data.frame（。，beta = ....） c $ c> to ---％>％ do（{fm< - lm（ret 〜vwretd + lag（vwretd），data =。） data.frame（。，beta = sum（coef（fm）[ - 1]））}） / pre> 更新如果有单独观察组合，这将返回 NA for'beta' tdata1< - read.csv（'tdata.csv '，stringsAsFactors = FALSE） res group_by（cusip，fyear）％>％ arrange（desc（date））％>％ mutate（n = n（））％>％ do（data.frame（。，beta = ifelse（。$ n> 1， sum（coef（lm（ret〜vwretd + lag（vwretd），data =。））[ - 1]），NA））） as。 data.frame（res）[1：3，c（'date'，'cusip'，'ret'，'vwretd'，'beta'）] ＃date cusip ret vwretd beta ＃ 19691231 00080010 -0.012594 -0.019681 0.7932 ＃2 19691128 00080010 0.001995 -0.032164 0.7932 ＃3 19691031 00080010 0.113889 0.055638 0.7932 Update2 在完整数据集 tdata2< - read.csv（'tdatafull.csv'，stringsAsFactors = FALSE） tdata2 $ ret< - as.numeric（tdata2 $ ret） res1 arrange（desc（date））％>％ mutate（n = n（））％>％ do 。。（），（）=（i = 1）），（n）=（b） head（res1）＃X cusip fyear ret vwretd date n beta ＃1 72 8188 00003210 1973 0.000000 0.011425 19731231 12 2.751094 ＃2 728187 00003210 1973 -0.300000 -0.120703 19731130 12 2.751094 ＃3 728186 00003210 1973 -0.166667 -0.000427 19731031 12 2.751094 ＃4 728185 00003210 1973 0.043478 0.053937 19730928 12 2.751094 ＃5 728184 00003210 1973 -0.258065 -0.029648 19730831 12 2.751094 ＃6 728183 00003210 1973 0.291667 0.056954 19730731 12 2.751094 dim（tdata2）＃[1] 898657 6 sum（is.na（res1 $ beta））＃[1] 461 I'm trying to run a large number of regressions using dplyr for individual id (cusip) and year (fyear) but I'm not sure how to utilize the summary function. I need to run the model, grab coefficients, add them together, and mutate the result to another variable beta. 
Here's some code that isn't working, but gets at the idea.

Reproducible example:

    tdata <- structure(list(
      cusip = c("02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
                "02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
                "02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
                "02136810", "02136810", "02136810", "02136810", "02136810", "02136810",
                "01650910", "01650910", "01650910", "01650910", "01650910", "01650910"),
      fyear = c(1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L, 1979L,
                1979L, 1979L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 1980L,
                1980L, 1980L, 1980L, 1980L, 1965L, 1965L, 1965L, 1965L, 1965L, 1965L),
      ret = c("0.000000", "0.000000", "0.111111", "-0.063636", "0.203883", "0.032258",
              "0.078125", "0.000000", "-0.014493", "-0.014706", "0.044776", "0.457143",
              "0.039216", "-0.009434", "-0.200000", "-0.047619", "0.100000", "0.022727",
              "0.144444", "0.067961", "-0.009091", "0.009174", "0.109091", "-0.077869",
              "0.418182", "-0.089744", "0.014085", "-0.041667", "-0.086957", "0.000000"),
      vwretd = c(0.049489, -0.026766, 0.065618, 0.008522, -0.013576, 0.04685,
                 0.014991, 0.064728, 0.001428, -0.07266, 0.063603, 0.028212,
                 0.065607, 0.001015, -0.120224, 0.052288, 0.06009, 0.037714,
                 0.069438, 0.023553, 0.029498, 0.020093, 0.104951, -0.034409,
                 0.038646, 0.006946, -0.009715, 0.033652, -0.00435, -0.051868),
      date = c(19790131L, 19790228L, 19790330L, 19790430L, 19790531L, 19790629L,
               19790731L, 19790831L, 19790928L, 19791031L, 19791130L, 19791231L,
               19800131L, 19800229L, 19800331L, 19800430L, 19800530L, 19800630L,
               19800731L, 19800829L, 19800930L, 19801031L, 19801128L, 19801231L,
               19650129L, 19650226L, 19650331L, 19650430L, 19650528L, 19650630L)),
      .Names = c("cusip", "fyear", "ret", "vwretd", "date"),
      row.names = c(NA, 30L), class = "data.frame")

dplyr code (not working):

    test <- tdata %>%
      group_by(cusip, fyear) %>%
      arrange(desc(date) %>%
      summary(fm <- lm(ret ~ vwretd + lag(vwretd), data = tdata)) %>%
      mutate(beta <- summary(fm)$coefficients[2,1] + summary(fm)$coefficients[3,1])

Edit:

Sample Data: https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0

Full Sample: https://www.dropbox.com/s/4padnsjjnt4uvy2/tdata.csv?dl=0

解决方案

We could use do:

    library(dplyr)
    tdata %>%
      group_by(cusip, fyear) %>%
      arrange(desc(date)) %>%
      do({
        fm <- lm(ret~vwretd+lag(vwretd), data=.)
        data.frame(., beta=summary(fm)$coefficients[2,1]+
                             summary(fm)$coefficients[3,1])
      })

We could also change the data.frame(., beta=....) in the do to

    --- %>%
      do({
        fm <- lm(ret~vwretd+lag(vwretd), data=.)
        data.frame(., beta=sum(coef(fm)[-1]))
      })

Update

If there are group combinations with a single observation, this will return NA for 'beta':

    tdata1 <- read.csv('tdata.csv', stringsAsFactors=FALSE)
    res <- tdata1 %>%
      group_by(cusip, fyear) %>%
      arrange(desc(date)) %>%
      mutate(n=n()) %>%
      do(data.frame(., beta=ifelse(.$n > 1,
           sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))

    as.data.frame(res)[1:3, c('date', 'cusip', 'ret','vwretd', 'beta')]
    #      date    cusip       ret    vwretd   beta
    #1 19691231 00080010 -0.012594 -0.019681 0.7932
    #2 19691128 00080010  0.001995 -0.032164 0.7932
    #3 19691031 00080010  0.113889  0.055638 0.7932

Update2

On the full dataset:

    tdata2 <- read.csv('tdatafull.csv', stringsAsFactors=FALSE)
    tdata2$ret <- as.numeric(tdata2$ret)
    res1 <- tdata2%>%
      group_by(cusip, fyear) %>%
      arrange(desc(date)) %>%
      mutate(n=n()) %>%
      do(data.frame(., beta=ifelse(.$n > 2,
           sum(coef(lm(ret~vwretd+lag(vwretd), data=.))[-1]), NA)))

    head(res1)
    #       X    cusip fyear       ret    vwretd     date  n     beta
    #1 728188 00003210  1973  0.000000  0.011425 19731231 12 2.751094
    #2 728187 00003210  1973 -0.300000 -0.120703 19731130 12 2.751094
    #3 728186 00003210  1973 -0.166667 -0.000427 19731031 12 2.751094
    #4 728185 00003210  1973  0.043478  0.053937 19730928 12 2.751094
    #5 728184 00003210  1973 -0.258065 -0.029648 19730831 12 2.751094
    #6 728183 00003210  1973  0.291667  0.056954 19730731 12 2.751094

    dim(tdata2)
    #[1] 898657 6
    sum(is.na(res1$beta))
    #[1] 461
这篇关于dplyr和多个线性模型的文章就介绍到这了，希望我们推荐的答案对大家有所帮助，也希望大家多多支持！ 1403页，肝出来的..