版权声明:本套课程材料开源,使用和分享必须遵守「创作共用许可协议 CC BY-NC-SA」(来源引用-非商业用途使用-以相同方式共享)。
- $操作符:直接赋值 ⭐️
- dplyr包:mutate()函数
- data.table包::= 象牙运算符 / let()函数 ⭐️
- bruceR包:add()与added()函数 ⭐️
Var
1 可乐
2 雪碧
3 咖啡
4 茶
Var NewVar1
1 可乐 可乐
2 雪碧 雪碧
3 咖啡 咖啡
4 茶 茶
'data.frame': 4 obs. of 2 variables:
$ Var : chr "可乐" "雪碧" "咖啡" "茶"
$ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
## Method 2: the mutate() function [dplyr package]
# mutate() is called directly on d instead of being piped into. Columns are
# created in the order listed, so each one can build on the one defined
# just before it.
d = mutate(
  d,
  NewVar2 = as.factor(Var),        # character -> factor
  NewVar3 = as.numeric(NewVar2),   # factor -> numeric level codes
  NewVar4 = as.character(NewVar3)  # numeric -> character
)
d
Var NewVar1 NewVar2 NewVar3 NewVar4
1 可乐 可乐 可乐 3 3
2 雪碧 雪碧 雪碧 4 4
3 咖啡 咖啡 咖啡 2 2
4 茶 茶 茶 1 1
'data.frame': 4 obs. of 5 variables:
$ Var : chr "可乐" "雪碧" "咖啡" "茶"
$ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
$ NewVar2: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
$ NewVar3: num 3 4 2 1
$ NewVar4: chr "3" "4" "2" "1"
Var
<char>
1: 可乐
2: 雪碧
3: 咖啡
4: 茶
Var NewVar1
<char> <fctr>
1: 可乐 可乐
2: 雪碧 雪碧
3: 咖啡 咖啡
4: 茶 茶
Var NewVar1 NewVar2
<char> <fctr> <fctr>
1: 可乐 可乐 可乐
2: 雪碧 雪碧 雪碧
3: 咖啡 咖啡 咖啡
4: 茶 茶 茶
# let() is exactly equivalent to `:=`()
d[, let(NewVar2 = as.factor(Var))]
# Several columns in a single let() call. NewVar3 and NewVar4 are both
# derived from the already existing NewVar1 column.
d[, let(
  NewVar2 = as.factor(Var),
  NewVar3 = as.numeric(NewVar1),
  NewVar4 = as.character(as.numeric(NewVar1))
)]
d
Var NewVar1 NewVar2 NewVar3 NewVar4
<char> <fctr> <fctr> <num> <char>
1: 可乐 可乐 可乐 3 3
2: 雪碧 雪碧 雪碧 4 4
3: 咖啡 咖啡 咖啡 2 2
4: 茶 茶 茶 1 1
Classes 'data.table' and 'data.frame': 4 obs. of 5 variables:
$ Var : chr "可乐" "雪碧" "咖啡" "茶"
$ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
$ NewVar2: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
$ NewVar3: num 3 4 2 1
$ NewVar4: chr "3" "4" "2" "1"
- attr(*, ".internal.selfref")=<externalptr>
## Method (4): the add() and added() functions [bruceR package]
# Start from a data.table that holds only the character column Var.
d = data.table(Var = c("可乐", "雪碧", "咖啡", "茶"))
dd = d %>% add({ # add(): does not change the original data; the result is stored in dd
NewVar1 = as.factor(Var) # no comma is needed at the end of the line
NewVar2 = as.numeric(NewVar1) # a newly created variable can be reused right away
})
dd
Var NewVar1 NewVar2
<char> <fctr> <num>
1: 可乐 可乐 3
2: 雪碧 雪碧 4
3: 咖啡 咖啡 2
4: 茶 茶 1
# No assignment is needed here: d itself receives the new columns.
d %>% added({ # added(): updates the data in place
NewVar1 = as.factor(Var) # no comma is needed at the end of the line
NewVar2 = as.numeric(NewVar1) # a newly created variable can be reused right away
})
d
Var NewVar1 NewVar2
<char> <fctr> <num>
1: 可乐 可乐 3
2: 雪碧 雪碧 4
3: 咖啡 咖啡 2
4: 茶 茶 1
word
<char>
1: a
2: able
3: about
4: absolute
5: accept
6: account
7: achieve
8: across
9: act
10: active
# Add three derived columns to d1 in place with one let() call.
d1[, let(
  # number of characters in each word
  len = nchar(word),
  # TRUE when the word starts with "a" and ends with "t"
  a_t = str_detect(string = word, pattern = "^a.*t$"),
  # every "c" replaced by "_"
  new = str_replace_all(string = word, pattern = "c", replacement = "_")
)]
d1
word len a_t new
<char> <int> <lgcl> <char>
1: a 1 FALSE a
2: able 4 FALSE able
3: about 5 TRUE about
4: absolute 8 FALSE absolute
5: accept 6 TRUE a__ept
6: account 7 TRUE a__ount
7: achieve 7 FALSE a_hieve
8: across 6 FALSE a_ross
9: act 3 TRUE a_t
10: active 6 FALSE a_tive
# Compute three derived columns with bruceR's add(); the result is stored in d3.
d2 = data.table(word=stringr::words[1:10]) # first 10 entries of stringr::words
d3 = d2 %>% add({
len = nchar(word) # number of characters in each word
a_t = str_detect(word, "^a.*t$") # TRUE when the word starts with "a" and ends with "t"
new = str_replace_all(word, "c", "_") # replace every "c" with "_"
})
d3
word len a_t new
<char> <int> <lgcl> <char>
1: a 1 FALSE a
2: able 4 FALSE able
3: about 5 TRUE about
4: absolute 8 FALSE absolute
5: accept 6 TRUE a__ept
6: account 7 TRUE a__ount
7: achieve 7 FALSE a_hieve
8: across 6 FALSE a_ross
9: act 3 TRUE a_t
10: active 6 FALSE a_tive
word
<char>
1: a
2: able
3: about
4: absolute
5: accept
6: account
7: achieve
8: across
9: act
10: active
word len
<char> <int>
1: a 1
2: able 4
3: about 5
4: absolute 8
5: accept 6
word len
<char> <int>
1: a 1
2: able 4
3: about 5
4: absolute 8
5: accept 6
## Deep copy (the copy() function)
d1 = data.table(word = stringr::words[seq_len(5)])
# copy() makes a full copy, so d1 and d2 are different objects from here on.
d2 = copy(d1)
# Add the len column to d1 in place.
d1[, let(len = nchar(word))]
d1
word len
<char> <int>
1: a 1
2: able 4
3: about 5
4: absolute 8
5: accept 6
word
<char>
1: a
2: able
3: about
4: absolute
5: accept
x
<num>
1: 0
2: 10
3: 20
4: 30
5: 40
6: 50
7: 60
8: 70
9: 80
10: 90
11: 100
[1] 50
[1] 33.17
[1] -1.5076 -1.2060 -0.9045 -0.6030 -0.3015 0.0000 0.3015 0.6030 0.9045
[10] 1.2060 1.5076
[,1]
[1,] -1.5076
[2,] -1.2060
[3,] -0.9045
[4,] -0.6030
[5,] -0.3015
[6,] 0.0000
[7,] 0.3015
[8,] 0.6030
[9,] 0.9045
[10,] 1.2060
[11,] 1.5076
attr(,"scaled:center")
[1] 50
attr(,"scaled:scale")
[1] 33.17
# Center and standardize x, each once by hand and once with scale();
# added() writes the four new columns into d. The str() output printed
# afterwards shows that the scale() results keep their "scaled:center" /
# "scaled:scale" attributes.
added(d, {
x.c1 = x - mean(x) # centering: computed by hand
x.c2 = scale(x, scale=FALSE) # centering: scale() function
x.std1 = (x - mean(x)) / sd(x) # standardizing: computed by hand
x.std2 = scale(x) # standardizing: scale() function
})
d
x x.c1 x.c2 x.std1 x.std2
<num> <num> <num> <num> <num>
1: 0 -50 -50 -1.5076 -1.5076
2: 10 -40 -40 -1.2060 -1.2060
3: 20 -30 -30 -0.9045 -0.9045
4: 30 -20 -20 -0.6030 -0.6030
5: 40 -10 -10 -0.3015 -0.3015
6: 50 0 0 0.0000 0.0000
7: 60 10 10 0.3015 0.3015
8: 70 20 20 0.6030 0.6030
9: 80 30 30 0.9045 0.9045
10: 90 40 40 1.2060 1.2060
11: 100 50 50 1.5076 1.5076
Classes 'data.table' and 'data.frame': 11 obs. of 5 variables:
$ x : num 0 10 20 30 40 50 60 70 80 90 ...
$ x.c1 : num -50 -40 -30 -20 -10 0 10 20 30 40 ...
$ x.c2 : num -50 -40 -30 -20 -10 0 10 20 30 40 ...
..- attr(*, "scaled:center")= num 50
$ x.std1: num -1.508 -1.206 -0.905 -0.603 -0.302 ...
$ x.std2: num -1.508 -1.206 -0.905 -0.603 -0.302 ...
..- attr(*, "scaled:center")= num 50
..- attr(*, "scaled:scale")= num 33.2
- attr(*, ".internal.selfref")=<externalptr>
$`scaled:center`
[1] 50
$`scaled:center`
[1] 50
$`scaled:scale`
[1] 33.17
## Example 2 (using the centering function from the bruceR package)
d = data.table(
  x = seq(from = 0, to = 100, by = 10),
  y = seq(from = 100, to = 200, by = 10)
)
# Called with the data only; in the printed result both x and y are centered.
d1 = grand_mean_center(d)
d1
x y
<num> <num>
1: -50 -50
2: -40 -40
3: -30 -30
4: -20 -20
5: -10 -10
6: 0 0
7: 10 10
8: 20 20
9: 30 30
10: 40 40
11: 50 50
x y x.c y.c
<num> <num> <num> <num>
1: 0 100 -50 -50
2: 10 110 -40 -40
3: 20 120 -30 -30
4: 30 130 -20 -20
5: 40 140 -10 -10
6: 50 150 0 0
7: 60 160 10 10
8: 70 170 20 20
9: 80 180 30 30
10: 90 190 40 40
11: 100 200 50 50
x y x.std y.std
<num> <num> <num> <num>
1: 0 100 -1.5076 -1.5076
2: 10 110 -1.2060 -1.2060
3: 20 120 -0.9045 -0.9045
4: 30 130 -0.6030 -0.6030
5: 40 140 -0.3015 -0.3015
6: 50 150 0.0000 0.0000
7: 60 160 0.3015 0.3015
8: 70 170 0.6030 0.6030
9: 80 180 0.9045 0.9045
10: 90 190 1.2060 1.2060
11: 100 200 1.5076 1.5076
# Five item columns. Note that the columns are not in numeric order
# (x1, x4, x3, x2, x5) and that x3 and x2 contain missing values.
d = data.table(
  x1 = seq_len(5),
  x4 = c(2, 2, 5, 4, 5),
  x3 = c(3, 2, NA, NA, 5),
  x2 = c(4, 4, NA, 2, 5),
  x5 = c(5, 4, 1, 4, 5)
)
d # d是数据框对象,也是数据框名称,之后需要频繁使用
x1 x4 x3 x2 x5
<int> <num> <num> <num> <num>
1: 1 2 3 4 5
2: 2 2 2 4 4
3: 3 5 NA NA 1
4: 4 4 NA 2 4
5: 5 5 5 5 5
## Upper-case functions (the data frame name must be supplied): SUM(), MEAN()
# The result is stored in d1. In the printed output, rows with NA still get a
# sum and a mean computed from the remaining items.
d1 = add(d, {
sum = SUM(d, "x", 1:5) # SUM(data=d, var="x", items=1:5)
mean = MEAN(d, "x", 1:5) # MEAN(data=d, var="x", items=1:5)
mean1 = MEAN(d, vars=c("x1", "x4")) # explicit vector of variable names
mean2 = MEAN(d, varrange="x1:x4") # matched by variable name, not by counting from 1 to 4
mean3 = MEAN(d, varrange="x1:x3") # matched by variable name, not by counting from 1 to 3
})
d1
x1 x4 x3 x2 x5 sum mean mean1 mean2 mean3
<int> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1: 1 2 3 4 5 15 3.0 1.5 1.5 2
2: 2 2 2 4 4 14 2.8 2.0 2.0 2
3: 3 5 NA NA 1 9 3.0 4.0 4.0 4
4: 4 4 NA 2 4 14 3.5 4.0 4.0 4
5: 5 5 5 5 5 25 5.0 5.0 5.0 5
## Lower-case functions (must be used together with add() or added()): .sum(), .mean()
d2 = add(d, {
sum = .sum("x", 1:5) # no data argument is passed to the lower-case version
mean = .mean("x", 1:5) # same var / items arguments as in the line above
})
d2
x1 x4 x3 x2 x5 sum mean
<int> <num> <num> <num> <num> <num> <num>
1: 1 2 3 4 5 15 3.0
2: 2 2 2 4 4 14 2.8
3: 3 5 NA NA 1 9 3.0
4: 4 4 NA 2 4 14 3.5
5: 5 5 5 5 5 25 5.0
在参数 var(变量名的共同部分)中,以大括号 {i} 占位题目序号,例如 var="XX.{i}.pre":
XX.1.pre XX.2.pre XX.3.pre
<int> <int> <int>
1: 1 6 11
2: 2 7 12
3: 3 8 13
4: 4 9 14
5: 5 10 15
XX.1.pre XX.2.pre XX.3.pre XX.mean
<int> <int> <int> <num>
1: 1 6 11 6
2: 2 7 12 7
3: 3 8 13 8
4: 4 9 14 9
5: 5 10 15 10
XX.1.pre XX.2.pre XX.3.pre XX.mean
<int> <int> <int> <num>
1: 1 6 11 6
2: 2 7 12 7
3: 3 8 13 8
4: 4 9 14 9
5: 5 10 15 10
Classes 'data.table' and 'data.frame': 2800 obs. of 28 variables:
$ A1 : int 2 2 5 4 2 6 2 4 4 2 ...
$ A2 : int 4 4 4 4 3 6 5 3 3 5 ...
$ A3 : int 3 5 5 6 3 5 5 1 6 6 ...
$ A4 : int 4 2 4 5 4 6 3 5 3 6 ...
$ A5 : int 4 5 4 5 5 5 5 1 3 5 ...
$ C1 : int 2 5 4 4 4 6 5 3 6 6 ...
$ C2 : int 3 4 5 4 4 6 4 2 6 5 ...
$ C3 : int 3 4 4 3 5 6 4 4 3 6 ...
$ C4 : int 4 3 2 5 3 1 2 2 4 2 ...
$ C5 : int 4 4 5 5 2 3 3 4 5 1 ...
$ E1 : int 3 1 2 5 2 2 4 3 5 2 ...
$ E2 : int 3 1 4 3 2 1 3 6 3 2 ...
$ E3 : int 3 6 4 4 5 6 4 4 NA 4 ...
$ E4 : int 4 4 4 4 4 5 5 2 4 5 ...
$ E5 : int 4 3 5 4 5 6 5 1 3 5 ...
$ N1 : int 3 3 4 2 2 3 1 6 5 5 ...
$ N2 : int 4 3 5 5 3 5 2 3 5 5 ...
$ N3 : int 2 3 4 2 4 2 2 2 2 5 ...
$ N4 : int 2 5 2 4 4 2 1 6 3 2 ...
$ N5 : int 3 5 3 1 3 3 1 4 3 4 ...
$ O1 : int 3 4 4 3 3 4 5 3 6 5 ...
$ O2 : int 6 2 2 3 3 3 2 2 6 1 ...
$ O3 : int 3 4 5 4 4 5 5 4 6 5 ...
$ O4 : int 4 3 5 3 3 6 6 5 6 5 ...
$ O5 : int 3 3 2 5 3 1 1 3 1 2 ...
$ gender : int 1 2 2 2 1 2 1 1 1 2 ...
$ education: int NA NA NA NA NA 3 NA 2 1 NA ...
$ age : int 16 18 17 17 17 21 18 19 19 17 ...
- attr(*, ".internal.selfref")=<externalptr>
# Recode the demographic variables and compute the five scale scores;
# added() updates d.bfi in place.
added(d.bfi, {
age = age # re-assigned unchanged; presumably so that age is kept when drop=TRUE
gender = factor(gender, levels=1:2, labels=c("Male", "Female")) # codes 1/2 -> labelled factor
education = as.factor(education) # integer codes -> factor
E = .mean("E", 1:5, rev=c(1,2), range=1:6) # mean of E1-E5; rev= presumably lists the reverse-scored items (here 1 and 2) - confirm in the bruceR docs
A = .mean("A", 1:5, rev=1, range=1:6) # mean of A1-A5; rev=1
C = .mean("C", 1:5, rev=c(4,5), range=1:6) # mean of C1-C5; rev=c(4,5)
N = .mean("N", 1:5, rev=NULL, range=1:6) # mean of N1-N5; rev=NULL
O = .mean("O", 1:5, rev=c(2,5), range=1:6) # mean of O1-O5; rev=c(2,5)
}, drop=TRUE) # drop the original variables and keep only the new ones
d.bfi
gender education age E A C N O
<fctr> <fctr> <int> <num> <num> <num> <num> <num>
1: Male <NA> 16 3.8 4.0 2.8 2.80 3.0
2: Female <NA> 18 5.0 4.2 4.0 3.80 4.0
3: Female <NA> 17 4.2 3.8 4.0 3.60 4.8
4: Female <NA> 17 3.6 4.6 3.0 2.80 3.2
5: Male <NA> 17 4.8 4.0 4.4 3.20 3.6
---
2796: Male 3 19 5.0 2.2 6.0 1.00 6.0
2797: Male 4 27 4.2 4.2 3.2 2.75 4.8
2798: Female 4 29 5.0 4.0 5.4 2.80 5.0
2799: Male 4 31 4.6 2.8 4.2 4.20 5.2
2800: Female 4 50 2.6 3.0 4.2 1.40 4.6
Classes 'data.table' and 'data.frame': 2800 obs. of 8 variables:
$ gender : Factor w/ 2 levels "Male","Female": 1 2 2 2 1 2 1 1 1 2 ...
$ education: Factor w/ 5 levels "1","2","3","4",..: NA NA NA NA NA 3 NA 2 1 NA ...
$ age : int 16 18 17 17 17 21 18 19 19 17 ...
$ E : num 3.8 5 4.2 3.6 4.8 5.6 4.2 2.4 3.25 4.8 ...
$ A : num 4 4.2 3.8 4.6 4 4.6 4.6 2.6 3.6 5.4 ...
$ C : num 2.8 4 4 3 4.4 5.6 4.4 3.4 4 5.6 ...
$ N : num 2.8 3.8 3.6 2.8 3.2 3 1.4 4.2 3.6 4.2 ...
$ O : num 3 4 4.8 3.2 3.6 5 5.4 4.2 5 5.2 ...
- attr(*, ".internal.selfref")=<externalptr>
作业要求:
- 使用 data.table 数据对象类型(而不是 data.frame)
- 使用 data.table 包的 let() 函数或 bruceR 包的 add()、added() 函数计算变量
- 平台提交: