These notes mainly cover the following functions: factor, cut, recode, summary, describe, table, levels, as.factor, as.numeric, na_if, and rbind.

Factor

factor(x = character(), levels, labels = levels,
       exclude = NA, ordered = is.ordered(x), nmax = NA)



ordered(x = character(), ...)
# 创建一个有序因子(注意:未指定 levels 时,水平按字母顺序排序,即 High < Low < Medium)
ordered_vector <- ordered(c("Low", "Medium", "High"))



is.factor(x)
is.ordered(x)
# 检查对象是否为因子
is_factor <- is.factor(factor_vector)

# 检查对象是否为有序因子
is_ordered <- is.ordered(ordered_vector)



as.factor(x)
as.ordered(x)
# 将向量转换为因子
as_factor_vector <- as.factor(c("Male", "Female"))

# 将向量转换为有序因子
as_ordered_vector <- as.ordered(c("Low", "Medium", "High"))



addNA(x, ifany = FALSE)
# 在因子中添加缺失值水平
factor_with_na <- addNA(factor_vector)



.valid.factor(object)
# 检查对象是否为有效的因子
valid_factor <- .valid.factor(factor_vector)

注意:

在R中,一个有效的因子是指满足以下要求的因子对象:

  1. 有限的水平数: 因子应该有一个有限的水平数,即水平的数量应该是有限的,而不是无穷大。

  2. 唯一性: 每个水平应该是唯一的,没有重复的水平。

  3. 有序性(如果是有序因子): 如果因子是有序的,那么水平之间应该有一定的顺序关系。

  4. 缺失值: 因子中可以包含缺失值,通过NA表示。

cut

cut(x, ...)

## Default S3 method:
cut(x, breaks, labels = NULL,
    include.lowest = FALSE, right = TRUE, dig.lab = 3,
    ordered_result = FALSE, ...)

Note: breaks is either a numeric vector of two or more unique cut points, or a single number (greater than or equal to 2) giving the number of intervals into which x is to be cut.

labels gives the labels for the levels of the resulting category. By default, labels are constructed using "(a,b]" interval notation.

dig.lab is the integer which is used when labels are not given. It determines the number of digits used in formatting the break numbers.

examples:

Z <- stats::rnorm(10000)
table(cut(Z, breaks = -6:6))


#(-6,-5] (-5,-4] (-4,-3] (-3,-2] (-2,-1]  (-1,0] 
#     0       1      10     212    1377    3391 
#  (0,1]   (1,2]   (2,3]   (3,4]   (4,5]   (5,6] 
#   3442    1345     201      19       2       0

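or, with explicit labels (note: with the default right = TRUE these breaks produce the intervals (1945,1965], (1965,1985] and (1985,1997], so the upper bounds in the hand-written labels below are off by one year):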

gfk_cleaned_eul$birthyear_cat <- cut(gfk_cleaned_eul$birthyear, breaks = c(-Inf, 1945, 1965, 1985, 1997, Inf), labels = c("(-Inf,1945]", "(1945,1964]", "(1965,1984]", "(1985,1996]", "(1997,Inf]"))

Recode

recode(.x, ..., .default = NULL, .missing = NULL)

recode_factor(.x, ..., .default = NULL, .missing = NULL, .ordered = FALSE)

like:

coffeenew$gender <- recode(coffeenew$gender,"1"="male","2"="female")

mcs$math=recode(mcs$math,"1"="Strongly Disagree", "2"="Disagree","3"="Agree", "4"="Strongly Agree")

gfk_cleaned_eul$birthyear_cat = recode (gfk_cleaned_eul$birthyear_cat,"(-Inf,1945]"= "born in 1945 or before","(1945,1964]"="Boomers","(1965,1984]"="GenX","(1985,1996]"="Millenium","(1997,Inf]"="GenZ")

gfk_cleaned_eul$mmetal_3cat<-recode(gfk_cleaned_eul$mmetal_3cat,'1'="Like or like a lot",'2'="Like or like a lot",'3'="neither",'4'="dislike or dislike a lot",'5'="dislike or dislike a lot")

summary

summary(object, nb.dec = 3, nbelements=10,
   ncp = 3, align.names=TRUE, file="", ...)

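Like (this example uses base R's summary() on a data frame; the signature above, with nb.dec/nbelements/ncp, appears to be the FactoMineR summary method rather than the base one):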

class(USArrests)
[1] "data.frame"
summary(USArrests)
     Murder          Assault     
 Min.   : 0.800   Min.   : 45.0  
 1st Qu.: 4.075   1st Qu.:109.0  
 Median : 7.250   Median :159.0  
 Mean   : 7.788   Mean   :170.8  
 3rd Qu.:11.250   3rd Qu.:249.0  
 Max.   :17.400   Max.   :337.0  
    UrbanPop          Rape      
 Min.   :32.00   Min.   : 7.30  
 1st Qu.:54.50   1st Qu.:15.07  
 Median :66.00   Median :20.10  
 Mean   :65.54   Mean   :21.23  
 3rd Qu.:77.75   3rd Qu.:26.18  
 Max.   :91.00   Max.   :46.00 

Describe:

describe(UCBAdmissions)

describe(x, na.rm = TRUE, interp=FALSE,skew = TRUE, ranges = TRUE,trim=.1,
              type=3,check=TRUE,fast=NULL,quant=NULL,IQR=FALSE,omit=FALSE,data=NULL)
describeData(x,head=4,tail=4)
describeFast(x)  
> describe(USArrests)
         vars  n   mean    sd median trimmed
Murder      1 50   7.79  4.36   7.25    7.53
Assault     2 50 170.76 83.34 159.00  168.48
UrbanPop    3 50  65.54 14.47  66.00   65.88
Rape        4 50  21.23  9.37  20.10   20.36
            mad  min   max range  skew kurtosis
Murder     5.41  0.8  17.4  16.6  0.37    -0.95
Assault  110.45 45.0 337.0 292.0  0.22    -1.15
UrbanPop  17.79 32.0  91.0  59.0 -0.21    -0.87
Rape       8.60  7.3  46.0  38.7  0.75     0.08
            se
Murder    0.62
Assault  11.79
UrbanPop  2.05
Rape      1.32

 table

table(...,
      exclude = if (useNA == "no") c(NA, NaN),
      useNA = c("no", "ifany", "always"),
      dnn = list.names(...), deparse.level = 1)

as.table(x, ...)
is.table(x)

## S3 method for class 'table'
as.data.frame(x, row.names = NULL, ...,
              responseName = "Freq", stringsAsFactors = TRUE,
              sep = "", base = list(LETTERS))

 

Levels:

levels(x)
levels(x) <- value

like:

> table(UCBAdmissions)
UCBAdmissions
  8  17  19  22  24  53  89  94 120 131 138 202 
  1   1   1   1   1   1   1   1   1   1   2   1 
205 207 244 279 299 313 317 351 353 391 512 
  1   1   1   1   1   1   1   1   1   1   1 
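# this is a mistake: UCBAdmissions is already a contingency table of counts, so table() only tabulates how often each cell value occurs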
x <- gl(2, 4, 8)
levels(x)[1] <- "low"
levels(x)[2] <- "high"
x
## combine some levels
z <- gl(3, 2, 12, labels = c("apple", "salad", "orange"))
z
levels(z) <- c("fruit", "veg", "fruit")
z

这里通过函数 gl(3, 2, 12, ...) 生成的 z 是一个长度为12的因子向量。在这个例子中,"apple"、"salad"、"orange" 这三个水平各连续重复2次,然后循环这一模式,直到总长度达到12个元素。

> z
 [1] apple  apple  salad  salad  orange orange
 [7] apple  apple  salad  salad  orange orange
Levels: apple salad orange

然后通过levels函数将三个标签转化成"fruit", "veg", "fruit",起到合并的效果

> levels(z) <- c("fruit", "veg", "fruit")
> z
 [1] fruit fruit veg   veg   fruit fruit fruit
 [8] fruit veg   veg   fruit fruit
Levels: fruit veg

或者直接调用 levels(z),输出因子的水平(标签)向量:

> levels(z)
[1] "fruit" "veg"  
02-26 07:17