版权声明:本套课程材料开源,使用和分享必须遵守「创作共用许可协议 CC BY-NC-SA」(来源引用-非商业用途使用-以相同方式共享)。


变量计算的总体介绍

【实践1】数据框变量的多种计算方式

  • $操作符:直接赋值⭐️
  • dplyr包:mutate()函数
  • data.table包:`:=` 象牙运算符 / let() 函数⭐️
  • bruceR包:add() / added() 函数⭐️
d = data.frame(Var = c("可乐", "雪碧", "咖啡", "茶"))
d
   Var
1 可乐
2 雪碧
3 咖啡
4   茶
## 方式 1:$ 操作符
d$NewVar1 = as.factor(d$Var)
d
   Var NewVar1
1 可乐    可乐
2 雪碧    雪碧
3 咖啡    咖啡
4   茶      茶
str(d)
'data.frame':   4 obs. of  2 variables:
 $ Var    : chr  "可乐" "雪碧" "咖啡" "茶"
 $ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
## 方式 2:mutate() 函数【dplyr包】
d = d %>% mutate(
  NewVar2 = as.factor(Var),
  NewVar3 = as.numeric(NewVar2),
  NewVar4 = as.character(NewVar3)
)
d
   Var NewVar1 NewVar2 NewVar3 NewVar4
1 可乐    可乐    可乐       3       3
2 雪碧    雪碧    雪碧       4       4
3 咖啡    咖啡    咖啡       2       2
4   茶      茶      茶       1       1
str(d)
'data.frame':   4 obs. of  5 variables:
 $ Var    : chr  "可乐" "雪碧" "咖啡" "茶"
 $ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
 $ NewVar2: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
 $ NewVar3: num  3 4 2 1
 $ NewVar4: chr  "3" "4" "2" "1"
## 方式 3:`:=` 象牙运算符 / let() 函数【data.table包】
d = data.table(Var = c("可乐", "雪碧", "咖啡", "茶"))
d
      Var
   <char>
1:   可乐
2:   雪碧
3:   咖啡
4:     茶
d[, NewVar1 := as.factor(Var)]       # 一次只能计算一个变量
d
      Var NewVar1
   <char>  <fctr>
1:   可乐    可乐
2:   雪碧    雪碧
3:   咖啡    咖啡
4:     茶      茶
d[,`:=`(NewVar2 = as.factor(Var))]   # 一次可以计算多个变量
d
      Var NewVar1 NewVar2
   <char>  <fctr>  <fctr>
1:   可乐    可乐    可乐
2:   雪碧    雪碧    雪碧
3:   咖啡    咖啡    咖啡
4:     茶      茶      茶
d[, let(NewVar2 = as.factor(Var))]   # let() 完全等于 `:=`()
d[, let(
  NewVar2 = as.factor(Var),
  NewVar3 = NewVar1 %>% as.numeric(),
  NewVar4 = NewVar1 %>% as.numeric() %>% as.character()
)]
d
      Var NewVar1 NewVar2 NewVar3 NewVar4
   <char>  <fctr>  <fctr>   <num>  <char>
1:   可乐    可乐    可乐       3       3
2:   雪碧    雪碧    雪碧       4       4
3:   咖啡    咖啡    咖啡       2       2
4:     茶      茶      茶       1       1
str(d)
Classes 'data.table' and 'data.frame':  4 obs. of  5 variables:
 $ Var    : chr  "可乐" "雪碧" "咖啡" "茶"
 $ NewVar1: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
 $ NewVar2: Factor w/ 4 levels "茶","咖啡","可乐",..: 3 4 2 1
 $ NewVar3: num  3 4 2 1
 $ NewVar4: chr  "3" "4" "2" "1"
 - attr(*, ".internal.selfref")=<externalptr> 
## 方式 4:add() 与 added() 函数【bruceR包】
d = data.table(Var = c("可乐", "雪碧", "咖啡", "茶"))
dd = d %>% add({                  # add():不改变原数据
  NewVar1 = as.factor(Var)        # 行尾无需逗号
  NewVar2 = as.numeric(NewVar1)   # 新变量可复用
})
dd
      Var NewVar1 NewVar2
   <char>  <fctr>   <num>
1:   可乐    可乐       3
2:   雪碧    雪碧       4
3:   咖啡    咖啡       2
4:     茶      茶       1
d %>% added({                     # added():原地更新数据
  NewVar1 = as.factor(Var)        # 行尾无需逗号
  NewVar2 = as.numeric(NewVar1)   # 新变量可复用
})
d
      Var NewVar1 NewVar2
   <char>  <fctr>   <num>
1:   可乐    可乐       3
2:   雪碧    雪碧       4
3:   咖啡    咖啡       2
4:     茶      茶       1

【实践2】基于data.table的字符串处理和变量计算

d1 = data.table(word=stringr::words[1:10])
d1
        word
      <char>
 1:        a
 2:     able
 3:    about
 4: absolute
 5:   accept
 6:  account
 7:  achieve
 8:   across
 9:      act
10:   active
d1[, let(
  len = nchar(word),
  a_t = str_detect(word, "^a.*t$"),
  new = str_replace_all(word, "c", "_")
)]
d1
        word   len    a_t      new
      <char> <int> <lgcl>   <char>
 1:        a     1  FALSE        a
 2:     able     4  FALSE     able
 3:    about     5   TRUE    about
 4: absolute     8  FALSE absolute
 5:   accept     6   TRUE   a__ept
 6:  account     7   TRUE  a__ount
 7:  achieve     7  FALSE  a_hieve
 8:   across     6  FALSE   a_ross
 9:      act     3   TRUE      a_t
10:   active     6  FALSE   a_tive
d2 = data.table(word=stringr::words[1:10])
d3 = d2 %>% add({
  len = nchar(word)
  a_t = str_detect(word, "^a.*t$")
  new = str_replace_all(word, "c", "_")
})
d3
        word   len    a_t      new
      <char> <int> <lgcl>   <char>
 1:        a     1  FALSE        a
 2:     able     4  FALSE     able
 3:    about     5   TRUE    about
 4: absolute     8  FALSE absolute
 5:   accept     6   TRUE   a__ept
 6:  account     7   TRUE  a__ount
 7:  achieve     7  FALSE  a_hieve
 8:   across     6  FALSE   a_ross
 9:      act     3   TRUE      a_t
10:   active     6  FALSE   a_tive
d2  # add()不改变原数据
        word
      <char>
 1:        a
 2:     able
 3:    about
 4: absolute
 5:   accept
 6:  account
 7:  achieve
 8:   across
 9:      act
10:   active

【知识点】data.table的“浅复制”与“深复制”问题

## 浅复制(=)
d1 = d2 = data.table(word=stringr::words[1:5])
d1[, let(len = nchar(word))]
d1
       word   len
     <char> <int>
1:        a     1
2:     able     4
3:    about     5
4: absolute     8
5:   accept     6
d2  # d2跟着d1一起变了!
       word   len
     <char> <int>
1:        a     1
2:     able     4
3:    about     5
4: absolute     8
5:   accept     6
## 深复制(copy函数)
d1 = data.table(word=stringr::words[1:5])
d2 = copy(d1)  # 完全复制,两者已是不同对象
d1[, let(len = nchar(word))]
d1
       word   len
     <char> <int>
1:        a     1
2:     able     4
3:    about     5
4: absolute     8
5:   accept     6
d2  # d2不会跟着d1一起变
       word
     <char>
1:        a
2:     able
3:    about
4: absolute
5:   accept

变量中心化与标准化

【知识点】变量中心化与标准化

  • 中心化:变量减去均值(mean)
  • 标准化:变量减去均值(mean),再除以标准差(sd)

【实践3】基于data.table的变量中心化与标准化

## 例1
d = data.table(x = seq(0, 100, 10))
d
        x
    <num>
 1:     0
 2:    10
 3:    20
 4:    30
 5:    40
 6:    50
 7:    60
 8:    70
 9:    80
10:    90
11:   100
mean(d$x)
[1] 50
sd(d$x)
[1] 33.17
(d$x - mean(d$x)) / sd(d$x)
 [1] -1.5076 -1.2060 -0.9045 -0.6030 -0.3015  0.0000  0.3015  0.6030  0.9045
[10]  1.2060  1.5076
scale(d$x)
         [,1]
 [1,] -1.5076
 [2,] -1.2060
 [3,] -0.9045
 [4,] -0.6030
 [5,] -0.3015
 [6,]  0.0000
 [7,]  0.3015
 [8,]  0.6030
 [9,]  0.9045
[10,]  1.2060
[11,]  1.5076
attr(,"scaled:center")
[1] 50
attr(,"scaled:scale")
[1] 33.17
added(d, {
  x.c1 = x - mean(x)              # 中心化:手动计算
  x.c2 = scale(x, scale=FALSE)    # 中心化:scale()函数
  x.std1 = (x - mean(x)) / sd(x)  # 标准化:手动计算
  x.std2 = scale(x)               # 标准化:scale()函数
})
d
        x  x.c1  x.c2  x.std1  x.std2
    <num> <num> <num>   <num>   <num>
 1:     0   -50   -50 -1.5076 -1.5076
 2:    10   -40   -40 -1.2060 -1.2060
 3:    20   -30   -30 -0.9045 -0.9045
 4:    30   -20   -20 -0.6030 -0.6030
 5:    40   -10   -10 -0.3015 -0.3015
 6:    50     0     0  0.0000  0.0000
 7:    60    10    10  0.3015  0.3015
 8:    70    20    20  0.6030  0.6030
 9:    80    30    30  0.9045  0.9045
10:    90    40    40  1.2060  1.2060
11:   100    50    50  1.5076  1.5076
str(d)  # 数据结构
Classes 'data.table' and 'data.frame':  11 obs. of  5 variables:
 $ x     : num  0 10 20 30 40 50 60 70 80 90 ...
 $ x.c1  : num  -50 -40 -30 -20 -10 0 10 20 30 40 ...
 $ x.c2  : num  -50 -40 -30 -20 -10 0 10 20 30 40 ...
  ..- attr(*, "scaled:center")= num 50
 $ x.std1: num  -1.508 -1.206 -0.905 -0.603 -0.302 ...
 $ x.std2: num  -1.508 -1.206 -0.905 -0.603 -0.302 ...
  ..- attr(*, "scaled:center")= num 50
  ..- attr(*, "scaled:scale")= num 33.2
 - attr(*, ".internal.selfref")=<externalptr> 
attributes(d$x.c2)
$`scaled:center`
[1] 50
attributes(d$x.std2)
$`scaled:center`
[1] 50

$`scaled:scale`
[1] 33.17
## 例2(使用bruceR包的中心化函数)
d = data.table(x = seq(0, 100, 10),
               y = seq(100, 200, 10))

d1 = grand_mean_center(d)
d1
        x     y
    <num> <num>
 1:   -50   -50
 2:   -40   -40
 3:   -30   -30
 4:   -20   -20
 5:   -10   -10
 6:     0     0
 7:    10    10
 8:    20    20
 9:    30    30
10:    40    40
11:    50    50
d2 = grand_mean_center(
  d,
  vars = c("x", "y"),
  add.suffix = ".c")
d2
        x     y   x.c   y.c
    <num> <num> <num> <num>
 1:     0   100   -50   -50
 2:    10   110   -40   -40
 3:    20   120   -30   -30
 4:    30   130   -20   -20
 5:    40   140   -10   -10
 6:    50   150     0     0
 7:    60   160    10    10
 8:    70   170    20    20
 9:    80   180    30    30
10:    90   190    40    40
11:   100   200    50    50
d3 = grand_mean_center(
  d,
  vars = c("x", "y"),
  std = TRUE,  # 标准化
  add.suffix = ".std")
d3
        x     y   x.std   y.std
    <num> <num>   <num>   <num>
 1:     0   100 -1.5076 -1.5076
 2:    10   110 -1.2060 -1.2060
 3:    20   120 -0.9045 -0.9045
 4:    30   130 -0.6030 -0.6030
 5:    40   140 -0.3015 -0.3015
 6:    50   150  0.0000  0.0000
 7:    60   160  0.3015  0.3015
 8:    70   170  0.6030  0.6030
 9:    80   180  0.9045  0.9045
10:    90   190  1.2060  1.2060
11:   100   200  1.5076  1.5076

总分与平均分的计算

【实践4】计算多变量的总分与平均分

d = data.table(
  x1 = 1:5,
  x4 = c(2,2,5,4,5),
  x3 = c(3,2,NA,NA,5),
  x2 = c(4,4,NA,2,5),
  x5 = c(5,4,1,4,5)
)
d  # d是数据框对象,也是数据框名称,之后需要频繁使用
      x1    x4    x3    x2    x5
   <int> <num> <num> <num> <num>
1:     1     2     3     4     5
2:     2     2     2     4     4
3:     3     5    NA    NA     1
4:     4     4    NA     2     4
5:     5     5     5     5     5
## 大写函数(需要提供数据框名称):SUM()、MEAN()
d1 = add(d, {
  sum = SUM(d, "x", 1:5)    # SUM(data=d, var="x", items=1:5)
  mean = MEAN(d, "x", 1:5)  # MEAN(data=d, var="x", items=1:5)
  mean1 = MEAN(d, vars=c("x1", "x4"))
  mean2 = MEAN(d, varrange="x1:x4")  # 找变量名,而不是从1数到4
  mean3 = MEAN(d, varrange="x1:x3")  # 找变量名,而不是从1数到3
})
d1
      x1    x4    x3    x2    x5   sum  mean mean1 mean2 mean3
   <int> <num> <num> <num> <num> <num> <num> <num> <num> <num>
1:     1     2     3     4     5    15   3.0   1.5   1.5     2
2:     2     2     2     4     4    14   2.8   2.0   2.0     2
3:     3     5    NA    NA     1     9   3.0   4.0   4.0     4
4:     4     4    NA     2     4    14   3.5   4.0   4.0     4
5:     5     5     5     5     5    25   5.0   5.0   5.0     5
## 小写函数(必须搭配add或added使用):.sum()、.mean()
d2 = add(d, {
  sum = .sum("x", 1:5)
  mean = .mean("x", 1:5)
})
d2
      x1    x4    x3    x2    x5   sum  mean
   <int> <num> <num> <num> <num> <num> <num>
1:     1     2     3     4     5    15   3.0
2:     2     2     2     4     4    14   2.8
3:     3     5    NA    NA     1     9   3.0
4:     4     4    NA     2     4    14   3.5
5:     5     5     5     5     5    25   5.0

【实践5】题目序号在变量名中间的特殊情况

  • 在var参数(变量名的共同部分)中,用大括号{i}或{items}占位题目序号
d = data.table(
  XX.1.pre = 1:5,
  XX.2.pre = 6:10,
  XX.3.pre = 11:15
)
d
   XX.1.pre XX.2.pre XX.3.pre
      <int>    <int>    <int>
1:        1        6       11
2:        2        7       12
3:        3        8       13
4:        4        9       14
5:        5       10       15
add(d, {
  XX.mean = .mean("XX.{i}.pre", 1:3)
})
   XX.1.pre XX.2.pre XX.3.pre XX.mean
      <int>    <int>    <int>   <num>
1:        1        6       11       6
2:        2        7       12       7
3:        3        8       13       8
4:        4        9       14       9
5:        5       10       15      10
# 相同效果
add(d, {
  XX.mean = .mean("XX.{items}.pre", 1:3)
})
   XX.1.pre XX.2.pre XX.3.pre XX.mean
      <int>    <int>    <int>   <num>
1:        1        6       11       6
2:        2        7       12       7
3:        3        8       13       8
4:        4        9       14       9
5:        5       10       15      10

反向计分与重新编码

【实践6】大五人格问卷BFI各维度平均分计算

  • 大五人格维度
    • Extraversion(外倾性)
    • Agreeableness(宜人性)
    • Conscientiousness(尽责性)
    • Neuroticism(神经质/情绪性)
    • Openness(开放性)
d.bfi = as.data.table(psych::bfi)
str(d.bfi)
Classes 'data.table' and 'data.frame':  2800 obs. of  28 variables:
 $ A1       : int  2 2 5 4 2 6 2 4 4 2 ...
 $ A2       : int  4 4 4 4 3 6 5 3 3 5 ...
 $ A3       : int  3 5 5 6 3 5 5 1 6 6 ...
 $ A4       : int  4 2 4 5 4 6 3 5 3 6 ...
 $ A5       : int  4 5 4 5 5 5 5 1 3 5 ...
 $ C1       : int  2 5 4 4 4 6 5 3 6 6 ...
 $ C2       : int  3 4 5 4 4 6 4 2 6 5 ...
 $ C3       : int  3 4 4 3 5 6 4 4 3 6 ...
 $ C4       : int  4 3 2 5 3 1 2 2 4 2 ...
 $ C5       : int  4 4 5 5 2 3 3 4 5 1 ...
 $ E1       : int  3 1 2 5 2 2 4 3 5 2 ...
 $ E2       : int  3 1 4 3 2 1 3 6 3 2 ...
 $ E3       : int  3 6 4 4 5 6 4 4 NA 4 ...
 $ E4       : int  4 4 4 4 4 5 5 2 4 5 ...
 $ E5       : int  4 3 5 4 5 6 5 1 3 5 ...
 $ N1       : int  3 3 4 2 2 3 1 6 5 5 ...
 $ N2       : int  4 3 5 5 3 5 2 3 5 5 ...
 $ N3       : int  2 3 4 2 4 2 2 2 2 5 ...
 $ N4       : int  2 5 2 4 4 2 1 6 3 2 ...
 $ N5       : int  3 5 3 1 3 3 1 4 3 4 ...
 $ O1       : int  3 4 4 3 3 4 5 3 6 5 ...
 $ O2       : int  6 2 2 3 3 3 2 2 6 1 ...
 $ O3       : int  3 4 5 4 4 5 5 4 6 5 ...
 $ O4       : int  4 3 5 3 3 6 6 5 6 5 ...
 $ O5       : int  3 3 2 5 3 1 1 3 1 2 ...
 $ gender   : int  1 2 2 2 1 2 1 1 1 2 ...
 $ education: int  NA NA NA NA NA 3 NA 2 1 NA ...
 $ age      : int  16 18 17 17 17 21 18 19 19 17 ...
 - attr(*, ".internal.selfref")=<externalptr> 
added(d.bfi, {
  age = age
  gender = factor(gender, levels=1:2, labels=c("Male", "Female"))
  education = as.factor(education)
  E = .mean("E", 1:5, rev=c(1,2), range=1:6)
  A = .mean("A", 1:5, rev=1, range=1:6)
  C = .mean("C", 1:5, rev=c(4,5), range=1:6)
  N = .mean("N", 1:5, rev=NULL, range=1:6)
  O = .mean("O", 1:5, rev=c(2,5), range=1:6)
}, drop=TRUE)  # 删去原有变量,只保留新变量
d.bfi
      gender education   age     E     A     C     N     O
      <fctr>    <fctr> <int> <num> <num> <num> <num> <num>
   1:   Male      <NA>    16   3.8   4.0   2.8  2.80   3.0
   2: Female      <NA>    18   5.0   4.2   4.0  3.80   4.0
   3: Female      <NA>    17   4.2   3.8   4.0  3.60   4.8
   4: Female      <NA>    17   3.6   4.6   3.0  2.80   3.2
   5:   Male      <NA>    17   4.8   4.0   4.4  3.20   3.6
  ---                                                     
2796:   Male         3    19   5.0   2.2   6.0  1.00   6.0
2797:   Male         4    27   4.2   4.2   3.2  2.75   4.8
2798: Female         4    29   5.0   4.0   5.4  2.80   5.0
2799:   Male         4    31   4.6   2.8   4.2  4.20   5.2
2800: Female         4    50   2.6   3.0   4.2  1.40   4.6
str(d.bfi)
Classes 'data.table' and 'data.frame':  2800 obs. of  8 variables:
 $ gender   : Factor w/ 2 levels "Male","Female": 1 2 2 2 1 2 1 1 1 2 ...
 $ education: Factor w/ 5 levels "1","2","3","4",..: NA NA NA NA NA 3 NA 2 1 NA ...
 $ age      : int  16 18 17 17 17 21 18 19 19 17 ...
 $ E        : num  3.8 5 4.2 3.6 4.8 5.6 4.2 2.4 3.25 4.8 ...
 $ A        : num  4 4.2 3.8 4.6 4 4.6 4.6 2.6 3.6 5.4 ...
 $ C        : num  2.8 4 4 3 4.4 5.6 4.4 3.4 4 5.6 ...
 $ N        : num  2.8 3.8 3.6 2.8 3.2 3 1.4 4.2 3.6 4.2 ...
 $ O        : num  3 4 4.8 3.2 3.6 5 5.4 4.2 5 5.2 ...
 - attr(*, ".internal.selfref")=<externalptr> 

【作业6】期末作业数据变量计算

作业要求:

  • 基于【作业4】的期末自选公开数据,运用本章所学的变量计算方法,对数据变量进行计算和预处理
    • 请使用data.table数据对象类型(而不是data.frame)
    • 可选用data.table包的let()函数或bruceR包的add() / added()函数计算变量
  • 使用R Markdown完成

平台提交:

  • 运行得到的HTML网页,及其关键部分截图
