R语言进行数据预处理wranging
li_volleyball
2016年3月22日
data wrangling with R
packages:tidyr dplyr
Ground rules
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
View(iris)
View(mtcars)
select(mtcars,am:1)
## am vs qsec wt drat hp disp cyl mpg
## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0
## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0
## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8
## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4
## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7
## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1
## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3
## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4
## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8
## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2
## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8
## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4
## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3
## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2
## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4
## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4
## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7
## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4
## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4
## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9
## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5
## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5
## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2
## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3
## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2
## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3
## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0
## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4
## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8
## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7
## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0
## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4
mtcars %>% select(am:1)
## am vs qsec wt drat hp disp cyl mpg
## Mazda RX4 1 0 16.46 2.620 3.90 110 160.0 6 21.0
## Mazda RX4 Wag 1 0 17.02 2.875 3.90 110 160.0 6 21.0
## Datsun 710 1 1 18.61 2.320 3.85 93 108.0 4 22.8
## Hornet 4 Drive 0 1 19.44 3.215 3.08 110 258.0 6 21.4
## Hornet Sportabout 0 0 17.02 3.440 3.15 175 360.0 8 18.7
## Valiant 0 1 20.22 3.460 2.76 105 225.0 6 18.1
## Duster 360 0 0 15.84 3.570 3.21 245 360.0 8 14.3
## Merc 240D 0 1 20.00 3.190 3.69 62 146.7 4 24.4
## Merc 230 0 1 22.90 3.150 3.92 95 140.8 4 22.8
## Merc 280 0 1 18.30 3.440 3.92 123 167.6 6 19.2
## Merc 280C 0 1 18.90 3.440 3.92 123 167.6 6 17.8
## Merc 450SE 0 0 17.40 4.070 3.07 180 275.8 8 16.4
## Merc 450SL 0 0 17.60 3.730 3.07 180 275.8 8 17.3
## Merc 450SLC 0 0 18.00 3.780 3.07 180 275.8 8 15.2
## Cadillac Fleetwood 0 0 17.98 5.250 2.93 205 472.0 8 10.4
## Lincoln Continental 0 0 17.82 5.424 3.00 215 460.0 8 10.4
## Chrysler Imperial 0 0 17.42 5.345 3.23 230 440.0 8 14.7
## Fiat 128 1 1 19.47 2.200 4.08 66 78.7 4 32.4
## Honda Civic 1 1 18.52 1.615 4.93 52 75.7 4 30.4
## Toyota Corolla 1 1 19.90 1.835 4.22 65 71.1 4 33.9
## Toyota Corona 0 1 20.01 2.465 3.70 97 120.1 4 21.5
## Dodge Challenger 0 0 16.87 3.520 2.76 150 318.0 8 15.5
## AMC Javelin 0 0 17.30 3.435 3.15 150 304.0 8 15.2
## Camaro Z28 0 0 15.41 3.840 3.73 245 350.0 8 13.3
## Pontiac Firebird 0 0 17.05 3.845 3.08 175 400.0 8 19.2
## Fiat X1-9 1 1 18.90 1.935 4.08 66 79.0 4 27.3
## Porsche 914-2 1 0 16.70 2.140 4.43 91 120.3 4 26.0
## Lotus Europa 1 1 16.90 1.513 3.77 113 95.1 4 30.4
## Ford Pantera L 1 0 14.50 3.170 4.22 264 351.0 8 15.8
## Ferrari Dino 1 0 15.50 2.770 3.62 175 145.0 6 19.7
## Maserati Bora 1 0 14.60 3.570 3.54 335 301.0 8 15.0
## Volvo 142E 1 1 18.60 2.780 4.11 109 121.0 4 21.4
example1<-data.frame(A=c(paste("x",1:6,sep = "")),
B=seq(1,11,2),
c=1:6,
date=c("2000-08-15","1998-07-15","1995-06-04","1997-07-01","1999-06-01","1996-06-25"))
example1
## A B c date
## 1 x1 1 1 2000-08-15
## 2 x2 3 2 1998-07-15
## 3 x3 5 3 1995-06-04
## 4 x4 7 4 1997-07-01
## 5 x5 9 5 1999-06-01
## 6 x6 11 6 1996-06-25
# 一个变量一列
# 一个观测值一行
#每一种观测在一个表里
#separate()
separate(example1,date,c("Y","m","d"),sep="-")
## A B c Y m d
## 1 x1 1 1 2000 08 15
## 2 x2 3 2 1998 07 15
## 3 x3 5 3 1995 06 04
## 4 x4 7 4 1997 07 01
## 5 x5 9 5 1999 06 01
## 6 x6 11 6 1996 06 25
example12<-example1 %>% separate(date,c("Y","m","d"),sep="-")
#unite()
unite(example12,"YM",Y,m,sep="-")
## A B c YM d
## 1 x1 1 1 2000-08 15
## 2 x2 3 2 1998-07 15
## 3 x3 5 3 1995-06 04
## 4 x4 7 4 1997-07 01
## 5 x5 9 5 1999-06 01
## 6 x6 11 6 1996-06 25
#select()
select(example1,A,B)
## A B
## 1 x1 1
## 2 x2 3
## 3 x3 5
## 4 x4 7
## 5 x5 9
## 6 x6 11
select(example1,-A)
## B c date
## 1 1 1 2000-08-15
## 2 3 2 1998-07-15
## 3 5 3 1995-06-04
## 4 7 4 1997-07-01
## 5 9 5 1999-06-01
## 6 11 6 1996-06-25
select(example1,B:date)
## B c date
## 1 1 1 2000-08-15
## 2 3 2 1998-07-15
## 3 5 3 1995-06-04
## 4 7 4 1997-07-01
## 5 9 5 1999-06-01
## 6 11 6 1996-06-25
select(example1,starts_with("d"))
## date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
select(example1,ends_with("e"))
## date
## 1 2000-08-15
## 2 1998-07-15
## 3 1995-06-04
## 4 1997-07-01
## 5 1999-06-01
## 6 1996-06-25
select(example1,contains("a"))
## A date
## 1 x1 2000-08-15
## 2 x2 1998-07-15
## 3 x3 1995-06-04
## 4 x4 1997-07-01
## 5 x5 1999-06-01
## 6 x6 1996-06-25
#filter()
filter(example1,B>=6)
## A B c date
## 1 x4 7 4 1997-07-01
## 2 x5 9 5 1999-06-01
## 3 x6 11 6 1996-06-25
filter(example1,B>=6,A%in%c("x1","x4","x5"))
## A B c date
## 1 x4 7 4 1997-07-01
## 2 x5 9 5 1999-06-01
#mutate()
mutate(example1,ratio=B/c)
## A B c date ratio
## 1 x1 1 1 2000-08-15 1.000000
## 2 x2 3 2 1998-07-15 1.500000
## 3 x3 5 3 1995-06-04 1.666667
## 4 x4 7 4 1997-07-01 1.750000
## 5 x5 9 5 1999-06-01 1.800000
## 6 x6 11 6 1996-06-25 1.833333
mutate(example1,ratio=B/c,inverse=ratio-1)
## A B c date ratio inverse
## 1 x1 1 1 2000-08-15 1.000000 0.0000000
## 2 x2 3 2 1998-07-15 1.500000 0.5000000
## 3 x3 5 3 1995-06-04 1.666667 0.6666667
## 4 x4 7 4 1997-07-01 1.750000 0.7500000
## 5 x5 9 5 1999-06-01 1.800000 0.8000000
## 6 x6 11 6 1996-06-25 1.833333 0.8333333
mutate(example1,cumsum(B))
## A B c date cumsum(B)
## 1 x1 1 1 2000-08-15 1
## 2 x2 3 2 1998-07-15 4
## 3 x3 5 3 1995-06-04 9
## 4 x4 7 4 1997-07-01 16
## 5 x5 9 5 1999-06-01 25
## 6 x6 11 6 1996-06-25 36
mutate(example1,cumsum(B),cummean(B),cumany(B>6),cumall(B>6))
## A B c date cumsum(B) cummean(B) cumany(B > 6) cumall(B > 6)
## 1 x1 1 1 2000-08-15 1 1 FALSE FALSE
## 2 x2 3 2 1998-07-15 4 2 FALSE FALSE
## 3 x3 5 3 1995-06-04 9 3 FALSE FALSE
## 4 x4 7 4 1997-07-01 16 4 TRUE FALSE
## 5 x5 9 5 1999-06-01 25 5 TRUE FALSE
## 6 x6 11 6 1996-06-25 36 6 TRUE FALSE
mutate(example1,cummin(B),cummax(B))
## A B c date cummin(B) cummax(B)
## 1 x1 1 1 2000-08-15 1 1
## 2 x2 3 2 1998-07-15 1 3
## 3 x3 5 3 1995-06-04 1 5
## 4 x4 7 4 1997-07-01 1 7
## 5 x5 9 5 1999-06-01 1 9
## 6 x6 11 6 1996-06-25 1 11
mutate(example1,between(B,4,8))
## A B c date between(B, 4, 8)
## 1 x1 1 1 2000-08-15 FALSE
## 2 x2 3 2 1998-07-15 FALSE
## 3 x3 5 3 1995-06-04 TRUE
## 4 x4 7 4 1997-07-01 TRUE
## 5 x5 9 5 1999-06-01 FALSE
## 6 x6 11 6 1996-06-25 FALSE
mutate(example1,cume_dist(B))
## A B c date cume_dist(B)
## 1 x1 1 1 2000-08-15 0.1666667
## 2 x2 3 2 1998-07-15 0.3333333
## 3 x3 5 3 1995-06-04 0.5000000
## 4 x4 7 4 1997-07-01 0.6666667
## 5 x5 9 5 1999-06-01 0.8333333
## 6 x6 11 6 1996-06-25 1.0000000
example1 %>% mutate(ratio=B/c) %>% select(A,ratio)
## A ratio
## 1 x1 1.000000
## 2 x2 1.500000
## 3 x3 1.666667
## 4 x4 1.750000
## 5 x5 1.800000
## 6 x6 1.833333
#cume_dist=相对位置(行序号)/绝对行数
#gather() and spread()
example2<-data.frame(country=c("FR","DE","US"),
"2011"=c(7000,5800,15000),
"2012"=c(6900,6000,14000),
"2013"=c(7000,6200,13000),check.names=F)
example2
## country 2011 2012 2013
## 1 FR 7000 6900 7000
## 2 DE 5800 6000 6200
## 3 US 15000 14000 13000
#gather()
gather(example2,"year","amount",2:4)
## country year amount
## 1 FR 2011 7000
## 2 DE 2011 5800
## 3 US 2011 15000
## 4 FR 2012 6900
## 5 DE 2012 6000
## 6 US 2012 14000
## 7 FR 2013 7000
## 8 DE 2013 6200
## 9 US 2013 13000
example2 %>% gather("year","amount",2:4)
## country year amount
## 1 FR 2011 7000
## 2 DE 2011 5800
## 3 US 2011 15000
## 4 FR 2012 6900
## 5 DE 2012 6000
## 6 US 2012 14000
## 7 FR 2013 7000
## 8 DE 2013 6200
## 9 US 2013 13000
#spread()
example3<-data.frame(city=c("NY","NY","LD","LD","BJ","BJ"),
size=rep(c("large","small"),3),
n=c(23,147,22,16,121,56))
example3
## city size n
## 1 NY large 23
## 2 NY small 147
## 3 LD large 22
## 4 LD small 16
## 5 BJ large 121
## 6 BJ small 56
spread(example3,size,n)
## city large small
## 1 BJ 121 56
## 2 LD 22 16
## 3 NY 23 147
example3 %>% spread(size,n)
## city large small
## 1 BJ 121 56
## 2 LD 22 16
## 3 NY 23 147
#summarise()
example3 %>% summarise(median=median(n),variance=var(n))
## median variance
## 1 39.5 3190.167
example3 %>% summarise(mean = mean(n), sum = sum(n), n = n())
## mean sum n
## 1 64.16667 385 6
#arrange()
arrange(example3,n)
## city size n
## 1 LD small 16
## 2 LD large 22
## 3 NY large 23
## 4 BJ small 56
## 5 BJ large 121
## 6 NY small 147
arrange(example3,desc(n))
## city size n
## 1 NY small 147
## 2 BJ large 121
## 3 BJ small 56
## 4 NY large 23
## 5 LD large 22
## 6 LD small 16
#unite of analiysis
example3 %>% group_by(city)
## Source: local data frame [6 x 3]
## Groups: city [3]
##
## city size n
## (fctr) (fctr) (dbl)
## 1 NY large 23
## 2 NY small 147
## 3 LD large 22
## 4 LD small 16
## 5 BJ large 121
## 6 BJ small 56
example3 %>% group_by(city) %>% summarise(mean=mean(n),sum=sum(n))
## Source: local data frame [3 x 3]
##
## city mean sum
## (fctr) (dbl) (dbl)
## 1 BJ 88.5 177
## 2 LD 19.0 38
## 3 NY 85.0 170
example3 %>% group_by(size) %>% summarise(mean=mean(n))
## Source: local data frame [2 x 2]
##
## size mean
## (fctr) (dbl)
## 1 large 55.33333
## 2 small 73.00000
#joining data
X<-data.frame(x1=c("A","B","C","D"),x2=1:4,stringsAsFactors = F)
X
## x1 x2
## 1 A 1
## 2 B 2
## 3 C 3
## 4 D 4
Y<-data.frame(y1=c("B","C","D","E"),y2=seq(2,8,2),stringsAsFactors = F)
Y
## y1 y2
## 1 B 2
## 2 C 4
## 3 D 6
## 4 E 8
X1<-data.frame(x1=c("B","C","D","E"),x2=2:5,stringsAsFactors = F)
bind_cols(X,Y)
## Source: local data frame [4 x 4]
##
## x1 x2 y1 y2
## (chr) (int) (chr) (dbl)
## 1 A 1 B 2
## 2 B 2 C 4
## 3 C 3 D 6
## 4 D 4 E 8
bind_rows(X,Y)
## Source: local data frame [8 x 4]
##
## x1 x2 y1 y2
## (chr) (int) (chr) (dbl)
## 1 A 1 NA NA
## 2 B 2 NA NA
## 3 C 3 NA NA
## 4 D 4 NA NA
## 5 NA NA B 2
## 6 NA NA C 4
## 7 NA NA D 6
## 8 NA NA E 8
bind_rows(X,X1)
## Source: local data frame [8 x 2]
##
## x1 x2
## (chr) (int)
## 1 A 1
## 2 B 2
## 3 C 3
## 4 D 4
## 5 B 2
## 6 C 3
## 7 D 4
## 8 E 5
left_join(X,X1,by="x1")
## x1 x2.x x2.y
## 1 A 1 NA
## 2 B 2 2
## 3 C 3 3
## 4 D 4 4
inner_join(X,X1,by="x1")
## x1 x2.x x2.y
## 1 B 2 2
## 2 C 3 3
## 3 D 4 4
semi_join(X,X1,by="x1")
## x1 x2
## 1 B 2
## 2 C 3
## 3 D 4
anti_join(X,X1,bu="x1")
## Joining by: c("x1", "x2")
## x1 x2
## 1 A 1
anti_join(X1,X,by="x1")
## x1 x2
## 1 E 5
setdiff(X,X1)
## x1 x2
## 1 A 1
setdiff(X1,X)
## x1 x2
## 1 E 5
first <- mtcars[1:20,1:5 ]
second <- mtcars[10:32,1:5 ]
intersect(first, second)
## mpg cyl disp hp drat
## 1 19.2 6 167.6 123 3.92
## 2 17.8 6 167.6 123 3.92
## 3 16.4 8 275.8 180 3.07
## 4 17.3 8 275.8 180 3.07
## 5 15.2 8 275.8 180 3.07
## 6 10.4 8 472.0 205 2.93
## 7 10.4 8 460.0 215 3.00
## 8 14.7 8 440.0 230 3.23
## 9 32.4 4 78.7 66 4.08
## 10 30.4 4 75.7 52 4.93
## 11 33.9 4 71.1 65 4.22
union(first, second)
## mpg cyl disp hp drat
## 1 33.9 4 71.1 65 4.22
## 2 19.2 6 167.6 123 3.92
## 3 30.4 4 75.7 52 4.93
## 4 18.1 6 225.0 105 2.76
## 5 19.7 6 145.0 175 3.62
## 6 15.5 8 318.0 150 2.76
## 7 26.0 4 120.3 91 4.43
## 8 10.4 8 460.0 215 3.00
## 9 14.7 8 440.0 230 3.23
## 10 13.3 8 350.0 245 3.73
## 11 21.5 4 120.1 97 3.70
## 12 19.2 8 400.0 175 3.08
## 13 15.2 8 304.0 150 3.15
## 14 14.3 8 360.0 245 3.21
## 15 32.4 4 78.7 66 4.08
## 16 27.3 4 79.0 66 4.08
## 17 17.8 6 167.6 123 3.92
## 18 30.4 4 95.1 113 3.77
## 19 24.4 4 146.7 62 3.69
## 20 18.7 8 360.0 175 3.15
## 21 21.4 6 258.0 110 3.08
## 22 15.2 8 275.8 180 3.07
## 23 17.3 8 275.8 180 3.07
## 24 22.8 4 140.8 95 3.92
## 25 21.0 6 160.0 110 3.90
## 26 15.8 8 351.0 264 4.22
## 27 16.4 8 275.8 180 3.07
## 28 21.4 4 121.0 109 4.11
## 29 15.0 8 301.0 335 3.54
## 30 22.8 4 108.0 93 3.85
## 31 10.4 8 472.0 205 2.93
setdiff(first, second)
## mpg cyl disp hp drat
## 1 21.0 6 160.0 110 3.90
## 2 22.8 4 108.0 93 3.85
## 3 21.4 6 258.0 110 3.08
## 4 18.7 8 360.0 175 3.15
## 5 18.1 6 225.0 105 2.76
## 6 14.3 8 360.0 245 3.21
## 7 24.4 4 146.7 62 3.69
## 8 22.8 4 140.8 95 3.92
setdiff(second, first)
## mpg cyl disp hp drat
## 1 21.5 4 120.1 97 3.70
## 2 15.5 8 318.0 150 2.76
## 3 15.2 8 304.0 150 3.15
## 4 13.3 8 350.0 245 3.73
## 5 19.2 8 400.0 175 3.08
## 6 27.3 4 79.0 66 4.08
## 7 26.0 4 120.3 91 4.43
## 8 30.4 4 95.1 113 3.77
## 9 15.8 8 351.0 264 4.22
## 10 19.7 6 145.0 175 3.62
## 11 15.0 8 301.0 335 3.54
## 12 21.4 4 121.0 109 4.11
setequal(mtcars, mtcars[32:1, ])
## TRUE