我想念一种以透明方式将数据添加到SO答案的方法。我的经验是o​​jit_code中的structure对象有时会使没有经验的用户感到不必要的困惑。但是,我没有耐心每次都将其复制/粘贴到一个简单的数据框中,并希望实现自动化。类似于dput(),但是是简化版本。

以复制/粘贴的方式说我,其他一些居所也有这样的数据,

Df <- data.frame(A = c(2, 2, 2, 6, 7, 8),
                 B = c("A", "G", "N", NA, "L", "L"),
                 C = c(1L, 3L, 5L, NA, NA, NA))

看起来像这样
Df
#>   A    B  C
#> 1 2    A  1
#> 2 2    G  3
#> 3 2    N  5
#> 4 6 <NA> NA
#> 5 7    L NA
#> 6 8    L NA

在一个整数,一个因子和一个数值 vector 内,
str(Df)
#> 'data.frame':    6 obs. of  3 variables:
#>  $ A: num  2 2 2 6 7 8
#>  $ B: Factor w/ 4 levels "A","G","L","N": 1 2 4 NA 3 3
#>  $ C: int  1 3 5 NA NA NA

现在,我想在SO上共享它,但是我并不总是拥有它来自的原始数据框架。我经常以SO形式对其进行dput()编码,而我知道将其发布的唯一方法是pipe()。喜欢,
dput(Df)
#> structure(list(A = c(2, 2, 2, 6, 7, 8), B = structure(c(1L, 2L,
#> 4L, NA, 3L, 3L), .Label = c("A", "G", "L", "N"), class = "factor"),
#> C = c(1L, 3L, 5L, NA, NA, NA)), .Names = c("A", "B", "C"), row.names = c(NA,
#> -6L), class = "data.frame")

但是,正如我在顶部说的那样,这些dput()看起来非常困惑。因此,我正在寻找一种以某种方式压缩structure输出的方法。我想象一个输出看起来像这样,
dput_small(Df)
#> data.frame(A = c(2, 2, 2, 6, 7, 8), B = c("A", "G", "N", NA, "L", "L"),
#> C = c(1L, 3L, 5L, NA, NA, NA))

那可能吗?我意识到还有其他类,例如dput()liststbl等。

最佳答案

dput_small1(Df)
# Df <- data.frame(
#   A = c(2, 2, 2, 6, 7, 8),
#   B = structure(c(1L, 2L, 4L, NA, 3L, 3L), .Label = c("A", "G", "L",
#     "N"), class = "factor"),
#   C = c(1L, 3L, 5L, NA, NA, NA) ,
#   stringsAsFactors=FALSE)

dput_small2(Df,stringsAsFactors=TRUE)
# Df <- read.table(sep="\t", text="
#   A   B   C
#   2   A    1
#   2   G    3
#   2   N    5
#   6   NA  NA
#   7   L   NA
#   8   L   NA", header=TRUE, stringsAsFactors=TRUE)

dput_small3(Df)
# Df <- tibble::tribble(
#   ~A, ~B, ~C,
#   2,           "A",          1L,
#   2,           "G",          3L,
#   2,           "N",          5L,
#   6, NA_character_, NA_integer_,
#   7,           "L", NA_integer_,
#   8,           "L", NA_integer_
# )
# Df$B <- factor(Df$B)
包装dput此选项的输出非常接近问题中建议的输出。这很笼统,因为它实际上包裹在dput周围,但分别应用于列。multiline的意思是“保持dput的默认输出布局成多行”。
dput_small1<- function(x,
                       name=as.character(substitute(x)),
                       multiline = TRUE,
                       n=if ('list' %in% class(x)) length(x) else nrow(x),
                       random=FALSE,
                       seed = 1){
  name
  if('tbl_df' %in% class(x)) create_fun <- "tibble::tibble" else
    if('list' %in% class(x)) create_fun <- "list" else
      if('data.table' %in% class(x)) create_fun <- "data.table::data.table" else
        create_fun <- "data.frame"

    if(random) {
      set.seed(seed)
      if(create_fun == "list") x <- x[sample(1:length(x),n)] else
        x <- x[sample(1:nrow(x),n),]
    } else {
      x <- head(x,n)
    }

    line_sep <- if (multiline) "\n    " else ""
    cat(sep='',name," <- ",create_fun,"(\n  ",
        paste0(unlist(
          Map(function(item,nm) paste0(nm,if(nm=="") "" else " = ",paste(capture.output(dput(item)),collapse=line_sep)),
              x,if(is.null(names(x))) rep("",length(x)) else names(x))),
          collapse=",\n  "),
        if(create_fun == "data.frame") ",\n  stringsAsFactors = FALSE)" else "\n)")
}

dput_small1(list(1,2,c=3,d=4),"my_list",random=TRUE,n=3)
# my_list <- list(
#   2,
#   d = 4,
#   c = 3
# )
read.table解决方案
对于data.frames,我觉得很舒服,但是可以使用更明确/表格格式的输入。
可以使用read.table来实现,然后自动重新格式化read.table无法正确处理的列的类型。不像第一个解决方案那样普遍,但是可以在SO上找到的95%的情况下顺利运行。
dput_small2 <- function(df,
                        name=as.character(substitute(df)),
                        sep='\t',
                        header=TRUE,
                        stringsAsFactors = FALSE,
                        n= nrow(df),
                        random=FALSE,
                        seed = 1){
    name
    if(random) {
      set.seed(seed)
      df <- df[sample(1:nrow(df),n),]
    } else {
      df <- head(df,n)
    }
  cat(sep='',name,' <- read.table(sep="',sub('\t','\\\\t',sep),'", text="\n  ',
      paste(colnames(df),collapse=sep))
  df <- head(df,n)
  apply(df,1,function(x) cat(sep='','\n  ',paste(x,collapse=sep)))
  cat(sep='','", header=',header,', stringsAsFactors=',stringsAsFactors,')')

  sapply(names(df), function(x){
    if(is.character(df[[x]]) & suppressWarnings(identical(as.character(as.numeric(df[[x]])),df[[x]]))){ # if it's a character column containing numbers
      cat(sep='','\n',name,'$',x,' <- as.character(', name,'$',x,')')
    } else if(is.factor(df[[x]]) & !stringsAsFactors) { # if it's a factor and conversion is not automated
      cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')')
    } else if(inherits(df[[x]], "POSIXct")){
      cat(sep='','\n',name,'$',x,' <- as.POSIXct(', name,'$',x,')')
    } else if(inherits(df[[x]], "Date")){
      cat(sep='','\n',name,'$',x,' <- as.Date(', name,'$',x,')')
    }})
  invisible(NULL)
}
最简单的情况
dput_small2(iris,n=6)
将打印:
iris <- read.table(sep="\t", text="
  Sepal.Length  Sepal.Width Petal.Length    Petal.Width Species
  5.1   3.5 1.4 0.2  setosa
  4.9   3.0 1.4 0.2  setosa
  4.7   3.2 1.3 0.2  setosa
  4.6   3.1 1.5 0.2  setosa
  5.0   3.6 1.4 0.2  setosa
  5.4   3.9 1.7 0.4  setosa", header=TRUE, stringsAsFactors=FALSE)
执行时依次返回:
#   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
# 1          5.1         3.5          1.4         0.2  setosa
# 2          4.9         3.0          1.4         0.2  setosa
# 3          4.7         3.2          1.3         0.2  setosa
# 4          4.6         3.1          1.5         0.2  setosa
# 5          5.0         3.6          1.4         0.2  setosa
# 6          5.4         3.9          1.7         0.4  setosa

str(iris)
# 'data.frame': 6 obs. of  5 variables:
# $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4
# $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9
# $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7
# $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4
# $ Species     : chr  " setosa" " setosa" " setosa" " setosa" ...
更复杂的
虚拟数据:
test <- data.frame(a=1:5,
                   b=as.character(6:10),
                   c=letters[1:5],
                   d=factor(letters[6:10]),
                   e=Sys.time()+(1:5),
                   stringsAsFactors = FALSE)
这个:
dput_small2(test,'df2')
将打印:
df2 <- read.table(sep="\t", text="
  a b   c   d   e
  1 6   a   f   2018-02-15 11:53:17
  2 7   b   g   2018-02-15 11:53:18
  3 8   c   h   2018-02-15 11:53:19
  4 9   d   i   2018-02-15 11:53:20
  5 10  e   j   2018-02-15 11:53:21", header=TRUE, stringsAsFactors=FALSE)
df2$b <- as.character(df2$b)
df2$d <- factor(df2$d)
df2$e <- as.POSIXct(df2$e)
执行时依次返回:
#   a  b c d                   e
# 1 1  6 a f 2018-02-15 11:53:17
# 2 2  7 b g 2018-02-15 11:53:18
# 3 3  8 c h 2018-02-15 11:53:19
# 4 4  9 d i 2018-02-15 11:53:20
# 5 5 10 e j 2018-02-15 11:53:21

str(df2)
# 'data.frame': 5 obs. of  5 variables:
# $ a: int  1 2 3 4 5
# $ b: chr  "6" "7" "8" "9" ...
# $ c: chr  "a" "b" "c" "d" ...
# $ d: Factor w/ 5 levels "f","g","h","i",..: 1 2 3 4 5
# $ e: POSIXct, format: "2018-02-15 11:53:17" "2018-02-15 11:53:18" "2018-02-15 11:53:19" "2018-02-15 11:53:20" ...

all.equal(df2,test)
# [1] "Component “e”: Mean absolute difference: 0.4574251" # only some rounding error
tribble解决方案read.table选项可读性强,但不是很通用。使用tribble几乎可以处理任何数据类型(尽管因素需要临时固定)。
对于OP的示例,此解决方案不是很有用,但对于列表列则非常有用(请参见下面的示例)。要使用输出,需要库tibble
就像我的第一个解决方案一样,它是dput的包装器,但是不是'dputting'列,而是'dputting'元素。
dput_small3 <- function(df,
                        name=as.character(substitute(df)),
                        n= nrow(df),
                        random=FALSE,
                        seed = 1){
  name
  if(random) {
    set.seed(seed)
    df <- df[sample(1:nrow(df),n),]
  } else {
    df <- head(df,n)
  }
  df1 <- lapply(df,function(col) if(is.factor(col)) as.character(col) else col)
  dputs   <- sapply(df1,function(col){
    col_dputs <- sapply(col,function(elt) paste(capture.output(dput(elt)),collapse=""))
    max_char <- max(nchar(unlist(col_dputs)))
    sapply(col_dputs,function(elt) paste(c(rep(" ",max_char-nchar(elt)),elt),collapse=""))
  })
  lines   <- paste(apply(dputs,1,paste,collapse=", "),collapse=",\n  ")
  output  <- paste0(name," <- tibble::tribble(\n  ",
                    paste0("~",names(df),collapse=", "),
                    ",\n  ",lines,"\n)")
  cat(output)
  sapply(names(df), function(x) if(is.factor(df[[x]])) cat(sep='','\n',name,'$',x,' <- factor(', name,'$',x,')'))
  invisible(NULL)
}

dput_small3(dplyr::starwars[c(1:3,11)],"sw",n=6,random=TRUE)
# sw <- tibble::tribble(
#   ~name, ~height, ~mass, ~films,
#   "Lando Calrissian", 177L,       79,                     c("Return of the Jedi", "The Empire Strikes Back"),
#      "Finis Valorum", 170L, NA_real_,                                                   "The Phantom Menace",
#       "Ki-Adi-Mundi", 198L,       82, c("Attack of the Clones", "The Phantom Menace", "Revenge of the Sith"),
#           "Grievous", 216L,      159,                                                  "Revenge of the Sith",
#     "Wedge Antilles", 170L,       77,       c("Return of the Jedi", "The Empire Strikes Back", "A New Hope"),
#         "Wat Tambor", 193L,       48,                                                 "Attack of the Clones"
# )

关于r - R中的简化dput(),我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/18746456/

10-12 16:32