版权声明:本套课程材料开源,使用和分享必须遵守「创作共用许可协议 CC BY-NC-SA」(来源引用-非商业用途使用-以相同方式共享)。


数据操作类R包概述与比较

【知识点】data.table核心要素

【探索发现】data.table vs. dplyr (tidyverse) 数据操作代码风格比较

【探索发现】性能大PK:data.table (R) vs. dplyr (R) vs. pandas (Python)

分组汇总速度 (0.5 GB)

(数据规模:1000万行 × 9个变量)

分组汇总速度 (5 GB)

(数据规模:1亿行 × 9个变量)

分组汇总速度 (50 GB)

(数据规模:10亿行 × 9个变量)

增删查改(data.table)

【实践1】增删查改

## 数据准备:大五人格问卷BFI(部分数据)
data = as.data.table(psych::bfi)
d = data[31:50, c("gender", "education", "age")]
d
    gender education   age
     <int>     <int> <int>
 1:      2        NA    17
 2:      2         3    41
 3:      1         5    23
 4:      2        NA    17
 5:      1         3    20
 6:      1         3    23
 7:      1         3    20
 8:      1         3    21
 9:      1        NA    30
10:      2         5    48
11:      2         3    40
12:      2         4    27
13:      1         1    18
14:      1         4    20
15:      2         5    24
16:      1         3    25
17:      1         2    22
18:      2         1    18
19:      2         1    43
20:      1         3    20
    gender education   age
     <int>     <int> <int>
str(d)
Classes 'data.table' and 'data.frame':  20 obs. of  3 variables:
 $ gender   : int  2 2 1 2 1 1 1 1 1 2 ...
 $ education: int  NA 3 5 NA 3 3 3 3 NA 5 ...
 $ age      : int  17 41 23 17 20 23 20 21 30 48 ...
 - attr(*, ".internal.selfref")=<externalptr> 
1. 增加
## 方式 1: 单变量 $ 操作符
d$Gender = factor(d$gender, levels=1:2, labels=c("Male", "Female"))
d
    gender education   age Gender
     <int>     <int> <int> <fctr>
 1:      2        NA    17 Female
 2:      2         3    41 Female
 3:      1         5    23   Male
 4:      2        NA    17 Female
 5:      1         3    20   Male
 6:      1         3    23   Male
 7:      1         3    20   Male
 8:      1         3    21   Male
 9:      1        NA    30   Male
10:      2         5    48 Female
11:      2         3    40 Female
12:      2         4    27 Female
13:      1         1    18   Male
14:      1         4    20   Male
15:      2         5    24 Female
16:      1         3    25   Male
17:      1         2    22   Male
18:      2         1    18 Female
19:      2         1    43 Female
20:      1         3    20   Male
    gender education   age Gender
     <int>     <int> <int> <fctr>
## 方式 2: 单变量 := 运算符 (原地更新)
d[, Gender := factor(gender, levels=1:2, labels=c("Male", "Female"))]
d
    gender education   age Gender
     <int>     <int> <int> <fctr>
 1:      2        NA    17 Female
 2:      2         3    41 Female
 3:      1         5    23   Male
 4:      2        NA    17 Female
 5:      1         3    20   Male
 6:      1         3    23   Male
 7:      1         3    20   Male
 8:      1         3    21   Male
 9:      1        NA    30   Male
10:      2         5    48 Female
11:      2         3    40 Female
12:      2         4    27 Female
13:      1         1    18   Male
14:      1         4    20   Male
15:      2         5    24 Female
16:      1         3    25   Male
17:      1         2    22   Male
18:      2         1    18 Female
19:      2         1    43 Female
20:      1         3    20   Male
    gender education   age Gender
     <int>     <int> <int> <fctr>
## 方式 3: 多变量 let() 函数 (原地更新)
d[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d
    gender education   age Gender   Age    Edu
     <int>     <int> <int> <fctr> <num> <fctr>
 1:      2        NA    17 Female    17   <NA>
 2:      2         3    41 Female    41      3
 3:      1         5    23   Male    23      5
 4:      2        NA    17 Female    17   <NA>
 5:      1         3    20   Male    20      3
 6:      1         3    23   Male    23      3
 7:      1         3    20   Male    20      3
 8:      1         3    21   Male    21      3
 9:      1        NA    30   Male    30   <NA>
10:      2         5    48 Female    48      5
11:      2         3    40 Female    40      3
12:      2         4    27 Female    27      4
13:      1         1    18   Male    18      1
14:      1         4    20   Male    20      4
15:      2         5    24 Female    24      5
16:      1         3    25   Male    25      3
17:      1         2    22   Male    22      2
18:      2         1    18 Female    18      1
19:      2         1    43 Female    43      1
20:      1         3    20   Male    20      3
    gender education   age Gender   Age    Edu
     <int>     <int> <int> <fctr> <num> <fctr>
str(d)
Classes 'data.table' and 'data.frame':  20 obs. of  6 variables:
 $ gender   : int  2 2 1 2 1 1 1 1 1 2 ...
 $ education: int  NA 3 5 NA 3 3 3 3 NA 5 ...
 $ age      : int  17 41 23 17 20 23 20 21 30 48 ...
 $ Gender   : Factor w/ 2 levels "Male","Female": 2 2 1 2 1 1 1 1 1 2 ...
 $ Age      : num  17 41 23 17 20 23 20 21 30 48 ...
 $ Edu      : Factor w/ 5 levels "1","2","3","4",..: NA 3 5 NA 3 3 3 3 NA 5 ...
 - attr(*, ".internal.selfref")=<externalptr> 

(本小节进度:1/4)

2. 删除
## 单变量
d$Age = d$Edu = NULL
d
    gender education   age Gender
     <int>     <int> <int> <fctr>
 1:      2        NA    17 Female
 2:      2         3    41 Female
 3:      1         5    23   Male
 4:      2        NA    17 Female
 5:      1         3    20   Male
 6:      1         3    23   Male
 7:      1         3    20   Male
 8:      1         3    21   Male
 9:      1        NA    30   Male
10:      2         5    48 Female
11:      2         3    40 Female
12:      2         4    27 Female
13:      1         1    18   Male
14:      1         4    20   Male
15:      2         5    24 Female
16:      1         3    25   Male
17:      1         2    22   Male
18:      2         1    18 Female
19:      2         1    43 Female
20:      1         3    20   Male
    gender education   age Gender
     <int>     <int> <int> <fctr>
## 多变量
d[, let(
  gender = NULL,
  education = NULL,
  age = NULL
)]
d
    Gender
    <fctr>
 1: Female
 2: Female
 3:   Male
 4: Female
 5:   Male
 6:   Male
 7:   Male
 8:   Male
 9:   Male
10: Female
11: Female
12: Female
13:   Male
14:   Male
15: Female
16:   Male
17:   Male
18: Female
19: Female
20:   Male
    Gender
    <fctr>

(本小节进度:2/4)

3. 查找
d = data[31:50, c("gender", "education", "age", paste0("A", 1:5))]
d
    gender education   age    A1    A2    A3    A4    A5
     <int>     <int> <int> <int> <int> <int> <int> <int>
 1:      2        NA    17     1     6     5     6     3
 2:      2         3    41     2     5     6     6     6
 3:      1         5    23     1     5     6     5     4
 4:      2        NA    17     2     4     5     6     5
 5:      1         3    20     4     4     4     4     4
 6:      1         3    23     5     3     5     4     2
 7:      1         3    20     1     6     4     6     4
 8:      1         3    21     1     4     4     2     3
 9:      1        NA    30     1     6     6     6     6
10:      2         5    48     1     5     4     3     5
11:      2         3    40     1     5     5     6     5
12:      2         4    27     5     4     3     6     4
13:      1         1    18     1     5     4     4     5
14:      1         4    20     5     6     4     3     5
15:      2         5    24     2     6     6     6     6
16:      1         3    25     1     6     6     6     6
17:      1         2    22     5     5     3     4     3
18:      2         1    18     2     6     4     5     5
19:      2         1    43     1     5     3     2     3
20:      1         3    20     1     6     6     6     6
    gender education   age    A1    A2    A3    A4    A5
     <int>     <int> <int> <int> <int> <int> <int> <int>
str(d)
Classes 'data.table' and 'data.frame':  20 obs. of  8 variables:
 $ gender   : int  2 2 1 2 1 1 1 1 1 2 ...
 $ education: int  NA 3 5 NA 3 3 3 3 NA 5 ...
 $ age      : int  17 41 23 17 20 23 20 21 30 48 ...
 $ A1       : int  1 2 1 2 4 5 1 1 1 1 ...
 $ A2       : int  6 5 5 4 4 3 6 4 6 5 ...
 $ A3       : int  5 6 6 5 4 5 4 4 6 4 ...
 $ A4       : int  6 6 5 6 4 4 6 2 6 3 ...
 $ A5       : int  3 6 4 5 4 2 4 3 6 5 ...
 - attr(*, ".internal.selfref")=<externalptr> 
d[1:5, "gender"]  # DT[i, j]
   gender
    <int>
1:      2
2:      2
3:      1
4:      2
5:      1
d[1:5, gender]  # DT[i, j]
[1] 2 2 1 2 1
d[1:5, c("gender", "age")]  # DT[i, j]
   gender   age
    <int> <int>
1:      2    17
2:      2    41
3:      1    23
4:      2    17
5:      1    20
d[1:5, .(gender, age)]  # DT[i, j]
   gender   age
    <int> <int>
1:      2    17
2:      2    41
3:      1    23
4:      2    17
5:      1    20
d[1:3]  # DT[i]
   gender education   age    A1    A2    A3    A4    A5
    <int>     <int> <int> <int> <int> <int> <int> <int>
1:      2        NA    17     1     6     5     6     3
2:      2         3    41     2     5     6     6     6
3:      1         5    23     1     5     6     5     4
d[1:3, ]  # DT[i, ]
   gender education   age    A1    A2    A3    A4    A5
    <int>     <int> <int> <int> <int> <int> <int> <int>
1:      2        NA    17     1     6     5     6     3
2:      2         3    41     2     5     6     6     6
3:      1         5    23     1     5     6     5     4
d[, 2:3]  # DT[i, ]
    education   age
        <int> <int>
 1:        NA    17
 2:         3    41
 3:         5    23
 4:        NA    17
 5:         3    20
 6:         3    23
 7:         3    20
 8:         3    21
 9:        NA    30
10:         5    48
11:         3    40
12:         4    27
13:         1    18
14:         4    20
15:         5    24
16:         3    25
17:         2    22
18:         1    18
19:         1    43
20:         3    20
    education   age
        <int> <int>
d[, age:A3]  # DT[, j](变量名范围,不会数数字)
      age    A1    A2    A3
    <int> <int> <int> <int>
 1:    17     1     6     5
 2:    41     2     5     6
 3:    23     1     5     6
 4:    17     2     4     5
 5:    20     4     4     4
 6:    23     5     3     5
 7:    20     1     6     4
 8:    21     1     4     4
 9:    30     1     6     6
10:    48     1     5     4
11:    40     1     5     5
12:    27     5     4     3
13:    18     1     5     4
14:    20     5     6     4
15:    24     2     6     6
16:    25     1     6     6
17:    22     5     5     3
18:    18     2     6     4
19:    43     1     5     3
20:    20     1     6     6
      age    A1    A2    A3
    <int> <int> <int> <int>

(本小节进度:3/4)

4. 修改
d = data[2536:2540, c("gender", "education", "age")]
d
   gender education   age
    <int>     <int> <int>
1:      2         5    56
2:      1         5     3
3:      1         2    65
4:      2         3    34
5:      2         4    29
d[2, "age"] = NA  # 无效年龄(3岁),替换为NA缺失值
d
   gender education   age
    <int>     <int> <int>
1:      2         5    56
2:      1         5    NA
3:      1         2    65
4:      2         3    34
5:      2         4    29
  • Q:如何自动筛选、替换无效年龄?
    • A:请看下一部分

(本小节进度:4/4)

行列筛选(data.table)

【实践2】行列筛选

## 数据准备:大五人格问卷BFI(部分数据)
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1: Female    20      3     1     6     6     6     6
 2: Female    56      5     3     5     3     5     5
 3:   Male     3      5     2     5     5     4     5
 4:   Male    65      2     5     4     2     6     4
 5: Female    34      3     2     4     3     4     4
 6: Female    29      4     1     6     5     3     5
 7: Female    21      4     4     4     5     4     5
 8: Female    32      2     3     5     4    NA     6
 9:   Male    26      2     2     6     5     4     4
10:   Male    27      4     2     2     1     2     3
11: Female    45      4     1     5     5     6     6
12:   Male    17   <NA>     2     4     4     3     4
str(d)
Classes 'data.table' and 'data.frame':  12 obs. of  8 variables:
 $ Gender: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 2 2 2 1 1 ...
 $ Age   : num  20 56 3 65 34 29 21 32 26 27 ...
 $ Edu   : Factor w/ 5 levels "1","2","3","4",..: 3 5 5 2 3 4 4 2 2 4 ...
 $ A1    : int  1 3 2 5 2 1 4 3 2 2 ...
 $ A2    : int  6 5 5 4 4 6 4 5 6 2 ...
 $ A3    : int  6 3 5 2 3 5 5 4 5 1 ...
 $ A4    : int  6 5 4 6 4 3 4 NA 4 2 ...
 $ A5    : int  6 5 5 4 4 5 5 6 4 3 ...
 - attr(*, ".internal.selfref")=<externalptr> 

1. 行筛选
d[Gender == "Female"]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1: Female    20      3     1     6     6     6     6
2: Female    56      5     3     5     3     5     5
3: Female    34      3     2     4     3     4     4
4: Female    29      4     1     6     5     3     5
5: Female    21      4     4     4     5     4     5
6: Female    32      2     3     5     4    NA     6
7: Female    45      4     1     5     5     6     6
d[Gender == "Male"]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male     3      5     2     5     5     4     5
2:   Male    65      2     5     4     2     6     4
3:   Male    26      2     2     6     5     4     4
4:   Male    27      4     2     2     1     2     3
5:   Male    17   <NA>     2     4     4     3     4
d[Gender == "Male" & Age >= 16]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male    65      2     5     4     2     6     4
2:   Male    26      2     2     6     5     4     4
3:   Male    27      4     2     2     1     2     3
4:   Male    17   <NA>     2     4     4     3     4
d[Gender == "Male" & Age >= 16 & !is.na(Edu)]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male    65      2     5     4     2     6     4
2:   Male    26      2     2     6     5     4     4
3:   Male    27      4     2     2     1     2     3
d[Gender == "Male" & Age %in% 20:45]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male    26      2     2     6     5     4     4
2:   Male    27      4     2     2     1     2     3
d[Gender == "Male" & Age %notin% 20:45]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male     3      5     2     5     5     4     5
2:   Male    65      2     5     4     2     6     4
3:   Male    17   <NA>     2     4     4     3     4
d[Gender == "Male" & (Age < 20 | Age > 45)]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male     3      5     2     5     5     4     5
2:   Male    65      2     5     4     2     6     4
3:   Male    17   <NA>     2     4     4     3     4
d[Age < 15]
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1:   Male     3      5     2     5     5     4     5
d[Age < 15, "Age"] = NA  # 自动筛选、替换无效年龄
d
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1: Female    20      3     1     6     6     6     6
 2: Female    56      5     3     5     3     5     5
 3:   Male    NA      5     2     5     5     4     5
 4:   Male    65      2     5     4     2     6     4
 5: Female    34      3     2     4     3     4     4
 6: Female    29      4     1     6     5     3     5
 7: Female    21      4     4     4     5     4     5
 8: Female    32      2     3     5     4    NA     6
 9:   Male    26      2     2     6     5     4     4
10:   Male    27      4     2     2     1     2     3
11: Female    45      4     1     5     5     6     6
12:   Male    17   <NA>     2     4     4     3     4

(本小节进度:1/3)

2. 列筛选
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]

## 直接筛选列
d[, Age:A3]  # DT[, j](变量名范围,不会数数字)
      Age    Edu    A1    A2    A3
    <num> <fctr> <int> <int> <int>
 1:    20      3     1     6     6
 2:    56      5     3     5     3
 3:     3      5     2     5     5
 4:    65      2     5     4     2
 5:    34      3     2     4     3
 6:    29      4     1     6     5
 7:    21      4     4     4     5
 8:    32      2     3     5     4
 9:    26      2     2     6     5
10:    27      4     2     2     1
11:    45      4     1     5     5
12:    17   <NA>     2     4     4
d[, .(A1, A2, A3, Edu, Age)]
       A1    A2    A3    Edu   Age
    <int> <int> <int> <fctr> <num>
 1:     1     6     6      3    20
 2:     3     5     3      5    56
 3:     2     5     5      5     3
 4:     5     4     2      2    65
 5:     2     4     3      3    34
 6:     1     6     5      4    29
 7:     4     4     5      4    21
 8:     3     5     4      2    32
 9:     2     6     5      2    26
10:     2     2     1      4    27
11:     1     5     5      4    45
12:     2     4     4   <NA>    17
## 间接筛选列
vars = c("Age", "Edu")
try({
  d[, vars]  # 报错,vars不是data.table内部环境中的变量名
})
Error in `[.data.table`(d, , vars) : 
  j(在 [...] 中的第二个参数)是单一符号,但未找到列名 'vars'。如果你打算在调用环境中使用变量选择列,请尝试 DT[, ..vars]。.. 前缀表示上一级,类似于文件系统路径。
d[, ..vars]  # ..表示从data.table内部环境返回到全局环境,使用变量选择列
      Age    Edu
    <num> <fctr>
 1:    20      3
 2:    56      5
 3:     3      5
 4:    65      2
 5:    34      3
 6:    29      4
 7:    21      4
 8:    32      2
 9:    26      2
10:    27      4
11:    45      4
12:    17   <NA>
## 还可以使用 dplyr::select() 函数,搭配 tidyselect 系列函数
select(d, starts_with("A"))
      Age    A1    A2    A3    A4    A5
    <num> <int> <int> <int> <int> <int>
 1:    20     1     6     6     6     6
 2:    56     3     5     3     5     5
 3:     3     2     5     5     4     5
 4:    65     5     4     2     6     4
 5:    34     2     4     3     4     4
 6:    29     1     6     5     3     5
 7:    21     4     4     5     4     5
 8:    32     3     5     4    NA     6
 9:    26     2     6     5     4     4
10:    27     2     2     1     2     3
11:    45     1     5     5     6     6
12:    17     2     4     4     3     4
select(d, matches("A\\d"))  # 正则表达式
       A1    A2    A3    A4    A5
    <int> <int> <int> <int> <int>
 1:     1     6     6     6     6
 2:     3     5     3     5     5
 3:     2     5     5     4     5
 4:     5     4     2     6     4
 5:     2     4     3     4     4
 6:     1     6     5     3     5
 7:     4     4     5     4     5
 8:     3     5     4    NA     6
 9:     2     6     5     4     4
10:     2     2     1     2     3
11:     1     5     5     6     6
12:     2     4     4     3     4
select(d, num_range("A", 1:5))  # 题序号范围
       A1    A2    A3    A4    A5
    <int> <int> <int> <int> <int>
 1:     1     6     6     6     6
 2:     3     5     3     5     5
 3:     2     5     5     4     5
 4:     5     4     2     6     4
 5:     2     4     3     4     4
 6:     1     6     5     3     5
 7:     4     4     5     4     5
 8:     3     5     4    NA     6
 9:     2     6     5     4     4
10:     2     2     1     2     3
11:     1     5     5     6     6
12:     2     4     4     3     4
d %>% select(num_range("A", 1:5))  # 管道操作符
       A1    A2    A3    A4    A5
    <int> <int> <int> <int> <int>
 1:     1     6     6     6     6
 2:     3     5     3     5     5
 3:     2     5     5     4     5
 4:     5     4     2     6     4
 5:     2     4     3     4     4
 6:     1     6     5     3     5
 7:     4     4     5     4     5
 8:     3     5     4    NA     6
 9:     2     6     5     4     4
10:     2     2     1     2     3
11:     1     5     5     6     6
12:     2     4     4     3     4

拓展了解:tidyselect变量选择语法

(本小节进度:2/3)

3. 综合练习
data[Age >= 18 & Edu == 5]
        A1    A2    A3    A4    A5    C1    C2    C3    C4    C5    E1    E2
     <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
  1:     1     5     6     5     6     4     3     2     4     5     2     1
  2:     2     4     4     4     3     6     5     6     1     1     2     4
  3:     1     5     6     5     4     1     5     6     4     6     6     6
  4:     1     5     4     3     5     6     5     5     2     2     3     2
  5:     2     6     6     6     6     5     4     5     3     4     2     2
 ---                                                                        
413:     1     6     6     5     6     5     5     3     2     4     1     5
414:     1     6     6     6     6     6     6     6     2     1     4     3
415:     2     5     4     5     4     6     5     5     2     4     6     5
416:     2     6     5     4     5     4     4     1     4     1     1     5
417:     3     4     4     3     5     6     6     5     2     1     2     2
        E3    E4    E5    N1    N2    N3    N4    N5    O1    O2    O3    O4
     <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
  1:     2     5     2     2     2     2     2     2     6     1     5     5
  2:     4     2     6     3     3     5     3     2     5     2     6     6
  3:     2     1     1     1     2     1     3     6     6     6     5     6
  4:     3     6     5     1     2     1     2     1     5     1     6     6
  5:     4     5     5     2     2     2     2     3     5     2     5     5
 ---                                                                        
413:     4     4     2     1     1     1     4     2     6     1     6     6
414:     6     6     6     2     6     4     5     5     6     1     6     6
415:     2     3     5     1     1     3     4     1     5     1     4     6
416:     4     5     5    NA     1     4     2     4     5     4     5     5
417:     4     5     5     4     4     3     2     2     5     4     4     3
        O5 gender education   age Gender   Age    Edu
     <int>  <int>     <int> <int> <fctr> <num> <fctr>
  1:     2      1         5    68   Male    68      5
  2:     1      2         5    51 Female    51      5
  3:     1      1         5    23   Male    23      5
  4:     1      2         5    48 Female    48      5
  5:     1      2         5    24 Female    24      5
 ---                                                 
413:     1      1         5    28   Male    28      5
414:     1      2         5    26 Female    26      5
415:     1      2         5    40 Female    40      5
416:     1      1         5    28   Male    28      5
417:     4      2         5    33 Female    33      5
data[, A1:C5]
         A1    A2    A3    A4    A5    C1    C2    C3    C4    C5
      <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
   1:     2     4     3     4     4     2     3     3     4     4
   2:     2     4     5     2     5     5     4     4     3     4
   3:     5     4     5     4     4     4     5     4     2     5
   4:     4     4     6     5     5     4     4     3     5     5
   5:     2     3     3     4     5     4     4     5     3     2
  ---                                                            
2796:     6     1     3     3     3     6     6     6     1     1
2797:     2     4     4     3     5     2     3     4     4     3
2798:     2     3     5     2     5     5     5     5     1     1
2799:     5     2     2     4     4     5     5     5     2     6
2800:     2     3     1     4     2     5     5     3     3     3
data[, Gender:Edu]
      Gender   Age    Edu
      <fctr> <num> <fctr>
   1:   Male    16   <NA>
   2: Female    18   <NA>
   3: Female    17   <NA>
   4: Female    17   <NA>
   5:   Male    17   <NA>
  ---                    
2796:   Male    19      3
2797:   Male    27      4
2798: Female    29      4
2799:   Male    31      4
2800: Female    50      4
data[, .(Gender, Edu)]
      Gender    Edu
      <fctr> <fctr>
   1:   Male   <NA>
   2: Female   <NA>
   3: Female   <NA>
   4: Female   <NA>
   5:   Male   <NA>
  ---              
2796:   Male      3
2797:   Male      4
2798: Female      4
2799:   Male      4
2800: Female      4
data[Age >= 18 & Edu == 5, .(Gender, Age, Edu)]
     Gender   Age    Edu
     <fctr> <num> <fctr>
  1:   Male    68      5
  2: Female    51      5
  3:   Male    23      5
  4: Female    48      5
  5: Female    24      5
 ---                    
413:   Male    28      5
414: Female    26      5
415: Female    40      5
416:   Male    28      5
417: Female    33      5
data %>% select(A1:C5)
         A1    A2    A3    A4    A5    C1    C2    C3    C4    C5
      <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
   1:     2     4     3     4     4     2     3     3     4     4
   2:     2     4     5     2     5     5     4     4     3     4
   3:     5     4     5     4     4     4     5     4     2     5
   4:     4     4     6     5     5     4     4     3     5     5
   5:     2     3     3     4     5     4     4     5     3     2
  ---                                                            
2796:     6     1     3     3     3     6     6     6     1     1
2797:     2     4     4     3     5     2     3     4     4     3
2798:     2     3     5     2     5     5     5     5     1     1
2799:     5     2     2     4     4     5     5     5     2     6
2800:     2     3     1     4     2     5     5     3     3     3
data %>% select(matches("^E"))
         E1    E2    E3    E4    E5 education    Edu
      <int> <int> <int> <int> <int>     <int> <fctr>
   1:     3     3     3     4     4        NA   <NA>
   2:     1     1     6     4     3        NA   <NA>
   3:     2     4     4     4     5        NA   <NA>
   4:     5     3     4     4     4        NA   <NA>
   5:     2     2     5     4     5        NA   <NA>
  ---                                               
2796:     1     4     5     5     6         3      3
2797:     2     2     4     4     3         4      4
2798:     2     2     6     3     6         4      4
2799:     2     2     4     5     4         4      4
2800:     3     3     1     2     2         4      4
data %>% select(matches("E\\d"))
         E1    E2    E3    E4    E5
      <int> <int> <int> <int> <int>
   1:     3     3     3     4     4
   2:     1     1     6     4     3
   3:     2     4     4     4     5
   4:     5     3     4     4     4
   5:     2     2     5     4     5
  ---                              
2796:     1     4     5     5     6
2797:     2     2     4     4     3
2798:     2     2     6     3     6
2799:     2     2     4     5     4
2800:     3     3     1     2     2
data %>% select(num_range("A", 1:5))
         A1    A2    A3    A4    A5
      <int> <int> <int> <int> <int>
   1:     2     4     3     4     4
   2:     2     4     5     2     5
   3:     5     4     5     4     4
   4:     4     4     6     5     5
   5:     2     3     3     4     5
  ---                              
2796:     6     1     3     3     3
2797:     2     4     4     3     5
2798:     2     3     5     2     5
2799:     5     2     2     4     4
2800:     2     3     1     4     2

(本小节进度:3/3)

排序去重(data.table)

【实践3】排序去重

## 数据准备:大五人格问卷BFI(部分数据)
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1: Female    20      3     1     6     6     6     6
 2: Female    56      5     3     5     3     5     5
 3:   Male     3      5     2     5     5     4     5
 4:   Male    65      2     5     4     2     6     4
 5: Female    34      3     2     4     3     4     4
 6: Female    29      4     1     6     5     3     5
 7: Female    21      4     4     4     5     4     5
 8: Female    32      2     3     5     4    NA     6
 9:   Male    26      2     2     6     5     4     4
10:   Male    27      4     2     2     1     2     3
11: Female    45      4     1     5     5     6     6
12:   Male    17   <NA>     2     4     4     3     4
str(d)
Classes 'data.table' and 'data.frame':  12 obs. of  8 variables:
 $ Gender: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 2 2 2 1 1 ...
 $ Age   : num  20 56 3 65 34 29 21 32 26 27 ...
 $ Edu   : Factor w/ 5 levels "1","2","3","4",..: 3 5 5 2 3 4 4 2 2 4 ...
 $ A1    : int  1 3 2 5 2 1 4 3 2 2 ...
 $ A2    : int  6 5 5 4 4 6 4 5 6 2 ...
 $ A3    : int  6 3 5 2 3 5 5 4 5 1 ...
 $ A4    : int  6 5 4 6 4 3 4 NA 4 2 ...
 $ A5    : int  6 5 5 4 4 5 5 6 4 3 ...
 - attr(*, ".internal.selfref")=<externalptr> 
1. 排序
d[order(Gender, Age, Edu)]  # 正序
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1:   Male     3      5     2     5     5     4     5
 2:   Male    17   <NA>     2     4     4     3     4
 3:   Male    26      2     2     6     5     4     4
 4:   Male    27      4     2     2     1     2     3
 5:   Male    65      2     5     4     2     6     4
 6: Female    20      3     1     6     6     6     6
 7: Female    21      4     4     4     5     4     5
 8: Female    29      4     1     6     5     3     5
 9: Female    32      2     3     5     4    NA     6
10: Female    34      3     2     4     3     4     4
11: Female    45      4     1     5     5     6     6
12: Female    56      5     3     5     3     5     5
d[order(-Gender, -Age, -Edu)]  # 倒序
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1: Female    56      5     3     5     3     5     5
 2: Female    45      4     1     5     5     6     6
 3: Female    34      3     2     4     3     4     4
 4: Female    32      2     3     5     4    NA     6
 5: Female    29      4     1     6     5     3     5
 6: Female    21      4     4     4     5     4     5
 7: Female    20      3     1     6     6     6     6
 8:   Male    65      2     5     4     2     6     4
 9:   Male    27      4     2     2     1     2     3
10:   Male    26      2     2     6     5     4     4
11:   Male    17   <NA>     2     4     4     3     4
12:   Male     3      5     2     5     5     4     5
d[order(-Gender, -Age, -Edu)][Age <= 20]  # [ ][ ][ ] 依次执行,无限叠加
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1: Female    20      3     1     6     6     6     6
2:   Male    17   <NA>     2     4     4     3     4
3:   Male     3      5     2     5     5     4     5

(本小节进度:1/2)

2. 去重
unique(d, by="Gender")  # 根据某个变量去重,重复数据取第一个
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1: Female    20      3     1     6     6     6     6
2:   Male     3      5     2     5     5     4     5
unique(d, by=c("Gender", "Edu"))  # 根据多个变量去重,重复数据取第一个
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1: Female    20      3     1     6     6     6     6
2: Female    56      5     3     5     3     5     5
3:   Male     3      5     2     5     5     4     5
4:   Male    65      2     5     4     2     6     4
5: Female    29      4     1     6     5     3     5
6: Female    32      2     3     5     4    NA     6
7:   Male    27      4     2     2     1     2     3
8:   Male    17   <NA>     2     4     4     3     4
unique(d[order(-Gender, -Age, -Edu)], by=c("Gender", "Edu"))
   Gender   Age    Edu    A1    A2    A3    A4    A5
   <fctr> <num> <fctr> <int> <int> <int> <int> <int>
1: Female    56      5     3     5     3     5     5
2: Female    45      4     1     5     5     6     6
3: Female    34      3     2     4     3     4     4
4: Female    32      2     3     5     4    NA     6
5:   Male    65      2     5     4     2     6     4
6:   Male    27      4     2     2     1     2     3
7:   Male    17   <NA>     2     4     4     3     4
8:   Male     3      5     2     5     5     4     5

(本小节进度:2/2)

【自由练习】数据基本操作

参考前三小节代码(增删查改、行列筛选、排序去重),基于psych::bfi或期末自选数据,在自己电脑上自由练习数据基本操作,为【阶段作业①】做准备。

  • 可以尝试改变上述示例代码中的数值范围、变量范围、筛选条件、排序方式等
  • 自己敲代码,掌握更扎实!(AI替代不了你的肌肉记忆!)

分组汇总(data.table)

【实践4】分组汇总

## 数据准备:大五人格问卷BFI(部分数据)
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
    Gender   Age    Edu    A1    A2    A3    A4    A5
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>
 1: Female    20      3     1     6     6     6     6
 2: Female    56      5     3     5     3     5     5
 3:   Male     3      5     2     5     5     4     5
 4:   Male    65      2     5     4     2     6     4
 5: Female    34      3     2     4     3     4     4
 6: Female    29      4     1     6     5     3     5
 7: Female    21      4     4     4     5     4     5
 8: Female    32      2     3     5     4    NA     6
 9:   Male    26      2     2     6     5     4     4
10:   Male    27      4     2     2     1     2     3
11: Female    45      4     1     5     5     6     6
12:   Male    17   <NA>     2     4     4     3     4
str(d)
Classes 'data.table' and 'data.frame':  12 obs. of  8 variables:
 $ Gender: Factor w/ 2 levels "Male","Female": 2 2 1 1 2 2 2 2 1 1 ...
 $ Age   : num  20 56 3 65 34 29 21 32 26 27 ...
 $ Edu   : Factor w/ 5 levels "1","2","3","4",..: 3 5 5 2 3 4 4 2 2 4 ...
 $ A1    : int  1 3 2 5 2 1 4 3 2 2 ...
 $ A2    : int  6 5 5 4 4 6 4 5 6 2 ...
 $ A3    : int  6 3 5 2 3 5 5 4 5 1 ...
 $ A4    : int  6 5 4 6 4 3 4 NA 4 2 ...
 $ A5    : int  6 5 5 4 4 5 5 6 4 3 ...
 - attr(*, ".internal.selfref")=<externalptr> 
1. 分组聚合

DT[i, j, by]

  • by:定义分组变量(原始顺序)
  • keyby:定义分组变量(重新排序)

## by分组,原始顺序
d[, .(.N), by=Gender]  # .N 是data.table保留关键字,用于统计样本量
   Gender     N
   <fctr> <int>
1: Female     7
2:   Male     5
## keyby分组,重新排序
d[, .(.N), keyby=Gender]  # .N 是data.table保留关键字,用于统计样本量
Key: <Gender>
   Gender     N
   <fctr> <int>
1:   Male     5
2: Female     7
d[, .(.N, Mean.Age = mean(Age)), keyby=Gender]
Key: <Gender>
   Gender     N Mean.Age
   <fctr> <int>    <num>
1:   Male     5    27.60
2: Female     7    33.86
d[, .(
  .N,
  Mean.Age = mean(Age),
  SD.Age = sd(Age)
), keyby=Gender]
Key: <Gender>
   Gender     N Mean.Age SD.Age
   <fctr> <int>    <num>  <num>
1:   Male     5    27.60  23.02
2: Female     7    33.86  12.90
d[Age >= 15, .(
  .N,
  Mean.Age = mean(Age),
  SD.Age = sd(Age)
), keyby=Gender]
Key: <Gender>
   Gender     N Mean.Age SD.Age
   <fctr> <int>    <num>  <num>
1:   Male     4    33.75  21.31
2: Female     7    33.86  12.90
## 全部数据的分组统计
# Education:
# 1 = high school
# 2 = finished high school
# 3 = some college
# 4 = college graduate
# 5 = graduate degree
data[, .(
  .N,
  Mean.Age = mean(Age)
), keyby=.(Gender, Edu)]
Key: <Gender, Edu>
    Gender    Edu     N Mean.Age
    <fctr> <fctr> <int>    <num>
 1:   Male   <NA>    81    18.88
 2:   Male      1    93    25.15
 3:   Male      2   103    31.47
 4:   Male      3   356    25.39
 5:   Male      4   134    33.17
 6:   Male      5   152    33.95
 7: Female   <NA>   142    17.42
 8: Female      1   131    25.12
 9: Female      2   189    31.54
10: Female      3   893    27.96
11: Female      4   260    32.88
12: Female      5   266    36.08

(本小节进度:1/2)

2. 分组计算
d[, Mean.Age := mean(Age)]  # 总均值
d
    Gender   Age    Edu    A1    A2    A3    A4    A5 Mean.Age
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>    <num>
 1: Female    20      3     1     6     6     6     6    31.25
 2: Female    56      5     3     5     3     5     5    31.25
 3:   Male     3      5     2     5     5     4     5    31.25
 4:   Male    65      2     5     4     2     6     4    31.25
 5: Female    34      3     2     4     3     4     4    31.25
 6: Female    29      4     1     6     5     3     5    31.25
 7: Female    21      4     4     4     5     4     5    31.25
 8: Female    32      2     3     5     4    NA     6    31.25
 9:   Male    26      2     2     6     5     4     4    31.25
10:   Male    27      4     2     2     1     2     3    31.25
11: Female    45      4     1     5     5     6     6    31.25
12:   Male    17   <NA>     2     4     4     3     4    31.25
d[, Mean.Age.G := mean(Age), keyby=Gender]  # 组均值
d
Key: <Gender>
    Gender   Age    Edu    A1    A2    A3    A4    A5 Mean.Age Mean.Age.G
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>    <num>      <num>
 1:   Male     3      5     2     5     5     4     5    31.25      27.60
 2:   Male    65      2     5     4     2     6     4    31.25      27.60
 3:   Male    26      2     2     6     5     4     4    31.25      27.60
 4:   Male    27      4     2     2     1     2     3    31.25      27.60
 5:   Male    17   <NA>     2     4     4     3     4    31.25      27.60
 6: Female    20      3     1     6     6     6     6    31.25      33.86
 7: Female    56      5     3     5     3     5     5    31.25      33.86
 8: Female    34      3     2     4     3     4     4    31.25      33.86
 9: Female    29      4     1     6     5     3     5    31.25      33.86
10: Female    21      4     4     4     5     4     5    31.25      33.86
11: Female    32      2     3     5     4    NA     6    31.25      33.86
12: Female    45      4     1     5     5     6     6    31.25      33.86
d[Gender=="Male", AgeGroup := ifelse(Age<40, "Young", "Middle")]
d
Key: <Gender>
    Gender   Age    Edu    A1    A2    A3    A4    A5 Mean.Age Mean.Age.G
    <fctr> <num> <fctr> <int> <int> <int> <int> <int>    <num>      <num>
 1:   Male     3      5     2     5     5     4     5    31.25      27.60
 2:   Male    65      2     5     4     2     6     4    31.25      27.60
 3:   Male    26      2     2     6     5     4     4    31.25      27.60
 4:   Male    27      4     2     2     1     2     3    31.25      27.60
 5:   Male    17   <NA>     2     4     4     3     4    31.25      27.60
 6: Female    20      3     1     6     6     6     6    31.25      33.86
 7: Female    56      5     3     5     3     5     5    31.25      33.86
 8: Female    34      3     2     4     3     4     4    31.25      33.86
 9: Female    29      4     1     6     5     3     5    31.25      33.86
10: Female    21      4     4     4     5     4     5    31.25      33.86
11: Female    32      2     3     5     4    NA     6    31.25      33.86
12: Female    45      4     1     5     5     6     6    31.25      33.86
    AgeGroup
      <char>
 1:    Young
 2:   Middle
 3:    Young
 4:    Young
 5:    Young
 6:     <NA>
 7:     <NA>
 8:     <NA>
 9:     <NA>
10:     <NA>
11:     <NA>
12:     <NA>
## 总均值中心化
d[, Age.Grand.C := Age - mean(Age)]
d[, .(Gender, Age, Mean.Age, Age.Grand.C)]
Key: <Gender>
    Gender   Age Mean.Age Age.Grand.C
    <fctr> <num>    <num>       <num>
 1:   Male     3    31.25      -28.25
 2:   Male    65    31.25       33.75
 3:   Male    26    31.25       -5.25
 4:   Male    27    31.25       -4.25
 5:   Male    17    31.25      -14.25
 6: Female    20    31.25      -11.25
 7: Female    56    31.25       24.75
 8: Female    34    31.25        2.75
 9: Female    29    31.25       -2.25
10: Female    21    31.25      -10.25
11: Female    32    31.25        0.75
12: Female    45    31.25       13.75
## 组均值中心化
d[, Age.Group.C := Age - mean(Age), keyby=Gender]
d[, .(Gender, Age, Mean.Age, Mean.Age.G, Age.Group.C)]
Key: <Gender>
    Gender   Age Mean.Age Mean.Age.G Age.Group.C
    <fctr> <num>    <num>      <num>       <num>
 1:   Male     3    31.25      27.60    -24.6000
 2:   Male    65    31.25      27.60     37.4000
 3:   Male    26    31.25      27.60     -1.6000
 4:   Male    27    31.25      27.60     -0.6000
 5:   Male    17    31.25      27.60    -10.6000
 6: Female    20    31.25      33.86    -13.8571
 7: Female    56    31.25      33.86     22.1429
 8: Female    34    31.25      33.86      0.1429
 9: Female    29    31.25      33.86     -4.8571
10: Female    21    31.25      33.86    -12.8571
11: Female    32    31.25      33.86     -1.8571
12: Female    45    31.25      33.86     11.1429

【知识点】data.table核心知识点总结

(本小节进度:2/2)

匹配拼接(dplyr)

【知识点】dplyr包:join系列函数

两个数据d1d2具有相同变量"var"

  • left_join(d1, d2, by="var"):向d1合并⭐️(最常用)
  • right_join(d1, d2, by="var"):向d2合并
  • inner_join(d1, d2, by="var"):取d1d2交集合并
  • full_join(d1, d2, by="var"):取d1d2并集合并

(data.table也能完成匹配拼接操作,但比dplyr难用)

【实践5】匹配拼接

1. 匹配拼接-例1
## 数据准备:两个来源的数据,至少包含一个共同变量
d1 = as.data.table(dplyr::band_members)
d1
     name    band
   <char>  <char>
1:   Mick  Stones
2:   John Beatles
3:   Paul Beatles
d2 = as.data.table(dplyr::band_instruments)
d2
     name  plays
   <char> <char>
1:   John guitar
2:   Paul   bass
3:  Keith guitar
d3 = as.data.table(dplyr::band_instruments2)
d3
   artist  plays
   <char> <char>
1:   John guitar
2:   Paul   bass
3:  Keith guitar
## 匹配拼接:相同变量名
data = left_join(d1, d2, by="name")  # 向左边数据合并
data
     name    band  plays
   <char>  <char> <char>
1:   Mick  Stones   <NA>
2:   John Beatles guitar
3:   Paul Beatles   bass
right_join(d1, d2, by="name")  # 向右边数据合并
     name    band  plays
   <char>  <char> <char>
1:   John Beatles guitar
2:   Paul Beatles   bass
3:  Keith    <NA> guitar
inner_join(d1, d2, by="name")  # 两个数据的交集
     name    band  plays
   <char>  <char> <char>
1:   John Beatles guitar
2:   Paul Beatles   bass
full_join(d1, d2, by="name")  # 两个数据的并集
     name    band  plays
   <char>  <char> <char>
1:   Mick  Stones   <NA>
2:   John Beatles guitar
3:   Paul Beatles   bass
4:  Keith    <NA> guitar
## 匹配拼接:不同变量名
names(d1)  # 人名变量为"name"
[1] "name" "band"
names(d3)  # 人名变量为"artist"
[1] "artist" "plays" 
data = left_join(d1, d3, by=c("name"="artist"))
data
     name    band  plays
   <char>  <char> <char>
1:   Mick  Stones   <NA>
2:   John Beatles guitar
3:   Paul Beatles   bass

(本小节进度:1/2)

2. 匹配拼接-例2
set.seed(1)
d = data.table(x=rnorm(9), city=rep(1:3, each=3))
d
         x  city
     <num> <int>
1: -0.6265     1
2:  0.1836     1
3: -0.8356     1
4:  1.5953     2
5:  0.3295     2
6: -0.8205     2
7:  0.4874     3
8:  0.7383     3
9:  0.5758     3
d.group = data.table(
  city = 1:3,
  label = c("北京", "上海", "广州"),
  GDP = c(521, 567, 310)
)
d.group
    city  label   GDP
   <int> <char> <num>
1:     1   北京   521
2:     2   上海   567
3:     3   广州   310
d.merge = left_join(d, d.group, by="city")
d.merge
         x  city  label   GDP
     <num> <int> <char> <num>
1: -0.6265     1   北京   521
2:  0.1836     1   北京   521
3: -0.8356     1   北京   521
4:  1.5953     2   上海   567
5:  0.3295     2   上海   567
6: -0.8205     2   上海   567
7:  0.4874     3   广州   310
8:  0.7383     3   广州   310
9:  0.5758     3   广州   310

(本小节进度:2/2)

长宽转换(tidyr)

【知识点】宽数据 vs. 长数据

  • 宽数据(wide-format data)—— “汇总表”或“透视表”
    • 每个被观测的对象只占一行,而该对象的多个属性或随时间变化的观测值作为不同的列横向展开
      • 核心特征:一主体一行,多变量多列
      • 灵活性:增加新变量需增加新列
      • 优点:对人类阅读比较友好,很像我们常见的电子表格或汇总报表,很多统计模型的输入也要求是宽格式
      • 缺点:不利于进行某些分析和可视化,因为数据维度(变量)被锁定在列结构中
↑ 宽数据示例
↑ 宽数据示例
  • 长数据(long-format data)—— “流水记录”或“规范化”的数据库表
    • 每个被观测的对象会占据多行,每一行通常只包含一个变量在某一个时间点或条件下的观测值
      • 核心特征:一观测一行,键值对形式
      • 灵活性:增加新观测或变量只需增加新行,结构稳定
      • 优点:结构规整,是整洁数据的标准格式,特别适合用ggplot2等工具进行数据可视化,也便于进行分组、聚合和复杂变换,数据库存储也常采用类似结构
      • 缺点:对人类阅读不太直观,显得冗长
↑ 长数据示例
↑ 长数据示例

【知识点】tidyr包:pivot系列函数

(data.table也能完成长宽转换操作,但比tidyr难用)

【实践6】长宽转换

1. 宽变长
bruceR::within.1  # 被试内重复测量(宽数据)
  ID A1 A2 A3 A4
1 S1  3  4  8  9
2 S2  6  6  9  8
3 S3  4  4  8  8
4 S4  3  2  7  7
5 S5  5  4  5 12
6 S6  7  5  6 13
7 S7  5  3  7 12
8 S8  2  3  6 11
d.long.1 = pivot_longer(
  data = within.1,
  cols = A1:A4,
  names_to = "Condition",
  values_to = "Y"
) %>% as.data.table()
d.long.1  # 长数据
        ID Condition     Y
    <char>    <char> <num>
 1:     S1        A1     3
 2:     S1        A2     4
 3:     S1        A3     8
 4:     S1        A4     9
 5:     S2        A1     6
 6:     S2        A2     6
 7:     S2        A3     9
 8:     S2        A4     8
 9:     S3        A1     4
10:     S3        A2     4
11:     S3        A3     8
12:     S3        A4     8
13:     S4        A1     3
14:     S4        A2     2
15:     S4        A3     7
16:     S4        A4     7
17:     S5        A1     5
18:     S5        A2     4
19:     S5        A3     5
20:     S5        A4    12
21:     S6        A1     7
22:     S6        A2     5
23:     S6        A3     6
24:     S6        A4    13
25:     S7        A1     5
26:     S7        A2     3
27:     S7        A3     7
28:     S7        A4    12
29:     S8        A1     2
30:     S8        A2     3
31:     S8        A3     6
32:     S8        A4    11
        ID Condition     Y
    <char>    <char> <num>
bruceR::within.2  # 被试内重复测量(宽数据)
  ID A1B1 A1B2 A1B3 A2B1 A2B2 A2B3
1 S1    3    4    5    4    8   12
2 S2    6    6    7    5    9   13
3 S3    4    4    5    3    8   12
4 S4    3    2    2    3    7   11
d.long.2 = pivot_longer(
  data = within.2,
  cols = A1B1:A2B3,
  names_to = c("Cond.A", "Cond.B"),
  names_pattern = "A(.)B(.)",
  values_to = "Y"
) %>% as.data.table()
d.long.2  # 长数据
        ID Cond.A Cond.B     Y
    <char> <char> <char> <num>
 1:     S1      1      1     3
 2:     S1      1      2     4
 3:     S1      1      3     5
 4:     S1      2      1     4
 5:     S1      2      2     8
 6:     S1      2      3    12
 7:     S2      1      1     6
 8:     S2      1      2     6
 9:     S2      1      3     7
10:     S2      2      1     5
11:     S2      2      2     9
12:     S2      2      3    13
13:     S3      1      1     4
14:     S3      1      2     4
15:     S3      1      3     5
16:     S3      2      1     3
17:     S3      2      2     8
18:     S3      2      3    12
19:     S4      1      1     3
20:     S4      1      2     2
21:     S4      1      3     2
22:     S4      2      1     3
23:     S4      2      2     7
24:     S4      2      3    11
        ID Cond.A Cond.B     Y
    <char> <char> <char> <num>

(本小节进度:1/3)

2. 长变宽
d.long.1  # 长数据(见“宽变长”部分代码)
        ID Condition     Y
    <char>    <char> <num>
 1:     S1        A1     3
 2:     S1        A2     4
 3:     S1        A3     8
 4:     S1        A4     9
 5:     S2        A1     6
 6:     S2        A2     6
 7:     S2        A3     9
 8:     S2        A4     8
 9:     S3        A1     4
10:     S3        A2     4
11:     S3        A3     8
12:     S3        A4     8
13:     S4        A1     3
14:     S4        A2     2
15:     S4        A3     7
16:     S4        A4     7
17:     S5        A1     5
18:     S5        A2     4
19:     S5        A3     5
20:     S5        A4    12
21:     S6        A1     7
22:     S6        A2     5
23:     S6        A3     6
24:     S6        A4    13
25:     S7        A1     5
26:     S7        A2     3
27:     S7        A3     7
28:     S7        A4    12
29:     S8        A1     2
30:     S8        A2     3
31:     S8        A3     6
32:     S8        A4    11
        ID Condition     Y
    <char>    <char> <num>
d.wide.1 = pivot_wider(
  data = d.long.1,
  id_cols = ID,
  names_from = "Condition",
  values_from = "Y"
) %>% as.data.table()
d.wide.1  # 宽数据
       ID    A1    A2    A3    A4
   <char> <num> <num> <num> <num>
1:     S1     3     4     8     9
2:     S2     6     6     9     8
3:     S3     4     4     8     8
4:     S4     3     2     7     7
5:     S5     5     4     5    12
6:     S6     7     5     6    13
7:     S7     5     3     7    12
8:     S8     2     3     6    11
d.long.2  # 长数据(见“宽变长”部分代码)
        ID Cond.A Cond.B     Y
    <char> <char> <char> <num>
 1:     S1      1      1     3
 2:     S1      1      2     4
 3:     S1      1      3     5
 4:     S1      2      1     4
 5:     S1      2      2     8
 6:     S1      2      3    12
 7:     S2      1      1     6
 8:     S2      1      2     6
 9:     S2      1      3     7
10:     S2      2      1     5
11:     S2      2      2     9
12:     S2      2      3    13
13:     S3      1      1     4
14:     S3      1      2     4
15:     S3      1      3     5
16:     S3      2      1     3
17:     S3      2      2     8
18:     S3      2      3    12
19:     S4      1      1     3
20:     S4      1      2     2
21:     S4      1      3     2
22:     S4      2      1     3
23:     S4      2      2     7
24:     S4      2      3    11
        ID Cond.A Cond.B     Y
    <char> <char> <char> <num>
d.wide.2 = pivot_wider(
  data = d.long.2,
  id_cols = ID,
  names_from = c("Cond.A", "Cond.B"),
  names_prefix = "A",
  names_sep = "_B",
  values_from = "Y"
) %>% as.data.table()
d.wide.2  # 宽数据
       ID A1_B1 A1_B2 A1_B3 A2_B1 A2_B2 A2_B3
   <char> <num> <num> <num> <num> <num> <num>
1:     S1     3     4     5     4     8    12
2:     S2     6     6     7     5     9    13
3:     S3     4     4     5     3     8    12
4:     S4     3     2     2     3     7    11

(本小节进度:2/3)

3. 更复杂情况
d.wide.2  # 宽数据(见“长变宽”部分代码)
       ID A1_B1 A1_B2 A1_B3 A2_B1 A2_B2 A2_B3
   <char> <num> <num> <num> <num> <num> <num>
1:     S1     3     4     5     4     8    12
2:     S2     6     6     7     5     9    13
3:     S3     4     4     5     3     8    12
4:     S4     3     2     2     3     7    11
d.long.3 = pivot_longer(
  data = d.wide.2,
  cols = starts_with("A"),
  names_to = c("VariableA", ".value"),
  names_pattern = "A(.)_(.+)"
) %>% as.data.table()
d.long.3  # A变长,B保持宽
       ID VariableA    B1    B2    B3
   <char>    <char> <num> <num> <num>
1:     S1         1     3     4     5
2:     S1         2     4     8    12
3:     S2         1     6     6     7
4:     S2         2     5     9    13
5:     S3         1     4     4     5
6:     S3         2     3     8    12
7:     S4         1     3     2     2
8:     S4         2     3     7    11
d.wide.3 = pivot_wider(
  data = d.long.3,
  names_from = "VariableA",
  names_glue = "A{VariableA}{.value}",
  values_from = c("B1", "B2", "B3")
) %>% as.data.table()
d.wide.3
       ID  A1B1  A2B1  A1B2  A2B2  A1B3  A2B3
   <char> <num> <num> <num> <num> <num> <num>
1:     S1     3     4     4     8     5    12
2:     S2     6     5     6     9     7    13
3:     S3     4     3     4     8     5    12
4:     S4     3     3     2     7     2    11

(本小节进度:3/3)

【阶段作业①】数据处理综合

作业要求:

  • 基于【作业4】【作业6】的期末自选数据和代码积累,综合运用本章所学的各种数据操作方法和代码,对数据进行增删查改、行列筛选、排序去重、分组汇总、匹配拼接、长宽转换等操作,体现对目前所学全部内容的掌握和迁移应用
  • 使用R Markdown完成,对关键代码及结果要有注释说明

平台提交:

  • 运行得到的HTML网页
    • 提交文件命名格式:学号-姓名-R阶段作业1.html
---
title: "《R语言》第6章：数据操作"
subtitle: <a href="https://psychbruce.github.io/RCourse/">返回课程主页</a>
author: "授课教师：包寒吴霜"
# date: "`r Sys.Date()`"
output:
  html_document:
    toc: true
    toc_depth: 3
    toc_float:
      collapsed: false
      smooth_scroll: false
    code_download: true
    anchor_sections: true
    highlight: pygments
    css: RmdCSS.css
---

```{=html}
<p style="font-size: 12px">版权声明：本套课程材料开源，使用和分享必须遵守「创作共用许可协议 CC BY-NC-SA」（来源引用-非商业用途使用-以相同方式共享）。<img src="img/CC-BY-NC-SA.jpg" width="120px" height="42px" style="float: right" /></p>
```

```{r Config, include=FALSE}
options(
  knitr.kable.NA = "",
  digits = 4
)
knitr::opts_chunk$set(
  comment = "",
  fig.width = 6,
  fig.height = 4,
  dpi = 300
)
```

------------------------------------------------------------------------

# Chap06：数据操作

#### 往期要点回顾

-   [Chap05 \# 变量计算的多种方式](https://psychbruce.github.io/RCourse/Chap05#%E5%8F%98%E9%87%8F%E8%AE%A1%E7%AE%97%E7%9A%84%E6%80%BB%E4%BD%93%E4%BB%8B%E7%BB%8D){.uri}

#### 本章要点目录

-   [【知识点】data.table核心要素](#知识点data.table核心要素)
-   [【探索发现】data.table vs. dplyr (tidyverse) 数据操作代码风格比较](#探索发现data.table-vs.-dplyr-tidyverse-数据操作代码风格比较)
-   [【探索发现】性能大PK：data.table (R) vs. dplyr (R) vs. pandas (Python)](#探索发现性能大pkdata.table-r-vs.-dplyr-r-vs.-pandas-python)
-   [【实践1】增删查改](#实践1增删查改)（重点）
-   [【实践2】行列筛选](#实践2行列筛选)（重点）
-   [【实践3】排序去重](#实践3排序去重)
-   [【自由练习】数据基本操作](#自由练习数据基本操作)
-   [【实践4】分组汇总](#实践4分组汇总)（重点）
-   [【知识点】data.table核心知识点总结](#知识点data.table核心知识点总结)
-   [【知识点】dplyr包：join系列函数](#知识点dplyr包join系列函数)
-   [【实践5】匹配拼接](#实践5匹配拼接)（重点）
-   [【知识点】宽数据 vs. 长数据](#知识点宽数据-vs.-长数据)
-   [【知识点】tidyr包：pivot系列函数](#知识点tidyr包pivot系列函数)
-   [【实践6】长宽转换](#实践6长宽转换)

```{r, message=FALSE, warning=FALSE}
## 本章所需R包
library(bruceR)
# library(dplyr)       # 加载bruceR时已默认加载dplyr
# library(tidyr)       # 加载bruceR时已默认加载tidyr
# library(data.table)  # 加载bruceR时已默认加载data.table
```

# 数据操作类R包概述与比较

#### 【知识点】data.table核心要素 {#知识点data.table核心要素}

-   [一图学会data.table包（官方速查表）](https://psychbruce.github.io/RCourse/cheatsheet/data.table.pdf)
-   极简代码风格：`data[i, j, by]` —— 只需要一对中括号，就能完成所有操作！
    -   `i`：行操作
    -   `j`：列操作
    -   `by`：分组变量

![](images/clipboard-809273248.png)

#### 【探索发现】data.table vs. dplyr (tidyverse) 数据操作代码风格比较 {#探索发现data.table-vs.-dplyr-tidyverse-数据操作代码风格比较}

-   [A data.table and dplyr tour (Atrebas, 2019)](https://atrebas.github.io/post/2019-03-03-datatable-dplyr/){.uri}

![](images/clipboard-2585080813.png)

![](images/clipboard-3176903319.png)

#### 【探索发现】性能大PK：data.table (R) vs. dplyr (R) vs. pandas (Python) {#探索发现性能大pkdata.table-r-vs.-dplyr-r-vs.-pandas-python .tabset .tabset-fade .table-pill}

-   <https://h2oai.github.io/db-benchmark/>

##### **分组汇总速度 (0.5 GB)**

（数据规模：1000万行 × 9个变量）

![](images/clipboard-2500389434.png)

##### **分组汇总速度 (5 GB)**

（数据规模：1亿行 × 9个变量）

![](images/clipboard-3761485613.png)

##### **分组汇总速度 (50 GB)**

（数据规模：10亿行 × 9个变量）

![](images/clipboard-3773107617.png)

# 增删查改（data.table）

#### 【实践1】增删查改 {#实践1增删查改 .tabset .tabset-fade .table-pill}

```{r}
## 数据准备：大五人格问卷BFI（部分数据）
data = as.data.table(psych::bfi)
d = data[31:50, c("gender", "education", "age")]
d
str(d)
```

##### **1. 增加**

```{r}
## 方式 1: 单变量 $ 操作符
d$Gender = factor(d$gender, levels=1:2, labels=c("Male", "Female"))
d

## 方式 2: 单变量 := 运算符 (原地更新)
d[, Gender := factor(gender, levels=1:2, labels=c("Male", "Female"))]
d

## 方式 3: 多变量 let() 函数 (原地更新)
d[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d
str(d)
```

（本小节进度：1/4）

##### **2. 删除**

```{r}
## 单变量
d$Age = d$Edu = NULL
d

## 多变量
d[, let(
  gender = NULL,
  education = NULL,
  age = NULL
)]
d
```

（本小节进度：2/4）

##### **3. 查找**

```{r}
d = data[31:50, c("gender", "education", "age", paste0("A", 1:5))]
d
str(d)

d[1:5, "gender"]  # DT[i, j]
d[1:5, gender]  # DT[i, j]
d[1:5, c("gender", "age")]  # DT[i, j]
d[1:5, .(gender, age)]  # DT[i, j]

d[1:3]  # DT[i]
d[1:3, ]  # DT[i, ]
d[, 2:3]  # DT[i, ]
d[, age:A3]  # DT[, j]（变量名范围，不会数数字）
```

（本小节进度：3/4）

##### **4. 修改**

```{r}
d = data[2536:2540, c("gender", "education", "age")]
d

d[2, "age"] = NA  # 无效年龄（3岁），替换为NA缺失值
d
```

-   Q：如何自动筛选、替换无效年龄？
    -   A：请看下一部分

（本小节进度：4/4）

# 行列筛选（data.table）

#### 【实践2】行列筛选 {#实践2行列筛选 .tabset .tabset-fade .table-pill}

```{r}
## 数据准备：大五人格问卷BFI（部分数据）
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
str(d)
```

![](images/clipboard-2273592458.png)

##### **1. 行筛选**

```{r}
d[Gender == "Female"]

d[Gender == "Male"]

d[Gender == "Male" & Age >= 16]

d[Gender == "Male" & Age >= 16 & !is.na(Edu)]

d[Gender == "Male" & Age %in% 20:45]

d[Gender == "Male" & Age %notin% 20:45]

d[Gender == "Male" & (Age < 20 | Age > 45)]

d[Age < 15]
d[Age < 15, "Age"] = NA  # 自动筛选、替换无效年龄
d
```

（本小节进度：1/3）

##### **2. 列筛选**

```{r}
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]

## 直接筛选列
d[, Age:A3]  # DT[, j]（变量名范围，不会数数字）
d[, .(A1, A2, A3, Edu, Age)]

## 间接筛选列
vars = c("Age", "Edu")
try({
  d[, vars]  # 报错，vars不是data.table内部环境中的变量名
})
d[, ..vars]  # ..表示从data.table内部环境返回到全局环境，使用变量选择列

## 还可以使用 dplyr::select() 函数，搭配 tidyselect 系列函数
select(d, starts_with("A"))
select(d, matches("A\\d"))  # 正则表达式
select(d, num_range("A", 1:5))  # 题序号范围
d %>% select(num_range("A", 1:5))  # 管道操作符
```

![](images/clipboard-3642455372.png)

拓展了解：[tidyselect变量选择语法](https://tidyselect.r-lib.org/reference/language.html){.uri}

![](images/clipboard-2435644075.png)

（本小节进度：2/3）

##### **3. 综合练习**

```{r}
data[Age >= 18 & Edu == 5]
data[, A1:C5]
data[, Gender:Edu]
data[, .(Gender, Edu)]
data[Age >= 18 & Edu == 5, .(Gender, Age, Edu)]

data %>% select(A1:C5)
data %>% select(matches("^E"))
data %>% select(matches("E\\d"))
data %>% select(num_range("A", 1:5))
```

（本小节进度：3/3）

# 排序去重（data.table）

#### 【实践3】排序去重 {#实践3排序去重 .tabset .tabset-fade .table-pill}

```{r}
## 数据准备：大五人格问卷BFI（部分数据）
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
str(d)
```

##### **1. 排序**

```{r}
d[order(Gender, Age, Edu)]  # 正序

d[order(-Gender, -Age, -Edu)]  # 倒序

d[order(-Gender, -Age, -Edu)][Age <= 20]  # [ ][ ][ ] 依次执行，无限叠加
```

![](images/clipboard-955042676.png)

（本小节进度：1/2）

##### **2. 去重**

```{r}
unique(d, by="Gender")  # 根据某个变量去重，重复数据取第一个

unique(d, by=c("Gender", "Edu"))  # 根据多个变量去重，重复数据取第一个

unique(d[order(-Gender, -Age, -Edu)], by=c("Gender", "Edu"))
```

（本小节进度：2/2）

# 【自由练习】数据基本操作 {#自由练习数据基本操作}

参考前三小节代码（增删查改、行列筛选、排序去重），基于`psych::bfi`或期末自选数据，在自己电脑上自由练习数据基本操作，为【阶段作业①】做准备。

-   可以尝试改变上述示例代码中的数值范围、变量范围、筛选条件、排序方式等
-   **自己敲代码，掌握更扎实！（AI替代不了你的肌肉记忆！）**

# 分组汇总（data.table）

#### 【实践4】分组汇总 {#实践4分组汇总 .tabset .tabset-fade .table-pill}

```{r}
## 数据准备：大五人格问卷BFI（部分数据）
data = as.data.table(psych::bfi)
data[, let(
  Gender = factor(gender, levels=1:2, labels=c("Male", "Female")),
  Age = as.numeric(age),
  Edu = as.factor(education)
)]
d = data[2535:2546, c("Gender", "Age", "Edu", paste0("A", 1:5))]
d
str(d)
```

##### **1. 分组聚合**

`DT[i, j, by]`

-   `by`：定义分组变量（原始顺序）
-   `keyby`：定义分组变量（重新排序）

![](images/clipboard-431264484.png)

```{r}
## by分组，原始顺序
d[, .(.N), by=Gender]  # .N 是data.table保留关键字，用于统计样本量

## keyby分组，重新排序
d[, .(.N), keyby=Gender]  # .N 是data.table保留关键字，用于统计样本量

d[, .(.N, Mean.Age = mean(Age)), keyby=Gender]

d[, .(
  .N,
  Mean.Age = mean(Age),
  SD.Age = sd(Age)
), keyby=Gender]

d[Age >= 15, .(
  .N,
  Mean.Age = mean(Age),
  SD.Age = sd(Age)
), keyby=Gender]

## 全部数据的分组统计
# Education:
# 1 = high school
# 2 = finished high school
# 3 = some college
# 4 = college graduate
# 5 = graduate degree
data[, .(
  .N,
  Mean.Age = mean(Age)
), keyby=.(Gender, Edu)]
```

（本小节进度：1/2）

##### **2. 分组计算**

```{r}
d[, Mean.Age := mean(Age)]  # 总均值
d

d[, Mean.Age.G := mean(Age), keyby=Gender]  # 组均值
d

d[Gender=="Male", AgeGroup := ifelse(Age<40, "Young", "Middle")]
d

## 总均值中心化
d[, Age.Grand.C := Age - mean(Age)]
d[, .(Gender, Age, Mean.Age, Age.Grand.C)]

## 组均值中心化
d[, Age.Group.C := Age - mean(Age), keyby=Gender]
d[, .(Gender, Age, Mean.Age, Mean.Age.G, Age.Group.C)]
```

#### 【知识点】data.table核心知识点总结 {#知识点data.table核心知识点总结}

![](images/clipboard-2006248056.png)

（本小节进度：2/2）

# 匹配拼接（dplyr）

#### 【知识点】dplyr包：join系列函数 {#知识点dplyr包join系列函数}

两个数据`d1`和`d2`具有相同变量`"var"`：

-   `left_join(d1, d2, by="var")`：向`d1`合并⭐️（最常用）
-   `right_join(d1, d2, by="var")`：向`d2`合并
-   `inner_join(d1, d2, by="var")`：取`d1`和`d2`交集合并
-   `full_join(d1, d2, by="var")`：取`d1`和`d2`并集合并

（data.table也能完成匹配拼接操作，但比dplyr难用）

![](images/clipboard-1747331443.png)

![](images/clipboard-1531735304.png)

#### 【实践5】匹配拼接 {#实践5匹配拼接 .tabset .tabset-fade .table-pill}

##### **1. 匹配拼接-例1**

```{r}
## 数据准备：两个来源的数据，至少包含一个共同变量
d1 = as.data.table(dplyr::band_members)
d1
d2 = as.data.table(dplyr::band_instruments)
d2
d3 = as.data.table(dplyr::band_instruments2)
d3

## 匹配拼接：相同变量名
data = left_join(d1, d2, by="name")  # 向左边数据合并
data

right_join(d1, d2, by="name")  # 向右边数据合并
inner_join(d1, d2, by="name")  # 两个数据的交集
full_join(d1, d2, by="name")  # 两个数据的并集

## 匹配拼接：不同变量名
names(d1)  # 人名变量为"name"
names(d3)  # 人名变量为"artist"
data = left_join(d1, d3, by=c("name"="artist"))
data
```

（本小节进度：1/2）

##### **2. 匹配拼接-例2**

```{r}
set.seed(1)
d = data.table(x=rnorm(9), city=rep(1:3, each=3))
d

d.group = data.table(
  city = 1:3,
  label = c("北京", "上海", "广州"),
  GDP = c(521, 567, 310)
)
d.group

d.merge = left_join(d, d.group, by="city")
d.merge
```

（本小节进度：2/2）

# 长宽转换（tidyr）

#### 【知识点】宽数据 vs. 长数据 {#知识点宽数据-vs.-长数据}

-   **宽数据（wide-format data）**—— “汇总表”或“透视表”
    -   每个被观测的对象只占一行，而该对象的多个属性或随时间变化的观测值作为不同的列横向展开
        -   核心特征：一主体一行，多变量多列
        -   灵活性：增加新变量需增加新列
        -   优点：对人类阅读比较友好，很像我们常见的电子表格或汇总报表，很多统计模型的输入也要求是宽格式
        -   缺点：不利于进行某些分析和可视化，因为数据维度（变量）被锁定在列结构中

![↑ 宽数据示例](images/clipboard-2390278354.png)

-   **长数据（long-format data）**—— “流水记录”或“规范化”的数据库表
    -   每个被观测的对象会占据多行，每一行通常只包含一个变量在某一个时间点或条件下的观测值
        -   核心特征：一观测一行，键值对形式
        -   灵活性：增加新观测或变量只需增加新行，结构稳定
        -   优点：结构规整，是整洁数据的标准格式，特别适合用`ggplot2`等工具进行数据可视化，也便于进行分组、聚合和复杂变换，数据库存储也常采用类似结构
        -   缺点：对人类阅读不太直观，显得冗长

![↑ 长数据示例](images/clipboard-2872093314.png)

#### 【知识点】tidyr包：pivot系列函数 {#知识点tidyr包pivot系列函数}

（data.table也能完成长宽转换操作，但比tidyr难用）

![](images/clipboard-3630598939.png)

#### 【实践6】长宽转换 {#实践6长宽转换 .tabset .tabset-fade .table-pill}

##### **1. 宽变长**

```{r}
bruceR::within.1  # 被试内重复测量（宽数据）

d.long.1 = pivot_longer(
  data = within.1,
  cols = A1:A4,
  names_to = "Condition",
  values_to = "Y"
) %>% as.data.table()
d.long.1  # 长数据
```

```{r}
bruceR::within.2  # 被试内重复测量（宽数据）

d.long.2 = pivot_longer(
  data = within.2,
  cols = A1B1:A2B3,
  names_to = c("Cond.A", "Cond.B"),
  names_pattern = "A(.)B(.)",
  values_to = "Y"
) %>% as.data.table()
d.long.2  # 长数据
```

（本小节进度：1/3）

##### **2. 长变宽**

```{r}
d.long.1  # 长数据（见“宽变长”部分代码）

d.wide.1 = pivot_wider(
  data = d.long.1,
  id_cols = ID,
  names_from = "Condition",
  values_from = "Y"
) %>% as.data.table()
d.wide.1  # 宽数据
```

```{r}
d.long.2  # 长数据（见“宽变长”部分代码）

d.wide.2 = pivot_wider(
  data = d.long.2,
  id_cols = ID,
  names_from = c("Cond.A", "Cond.B"),
  names_prefix = "A",
  names_sep = "_B",
  values_from = "Y"
) %>% as.data.table()
d.wide.2  # 宽数据
```

（本小节进度：2/3）

##### **3. 更复杂情况**

```{r}
d.wide.2  # 宽数据（见“长变宽”部分代码）

d.long.3 = pivot_longer(
  data = d.wide.2,
  cols = starts_with("A"),
  names_to = c("VariableA", ".value"),
  names_pattern = "A(.)_(.+)"
) %>% as.data.table()
d.long.3  # A变长，B保持宽
```

```{r}
d.wide.3 = pivot_wider(
  data = d.long.3,
  names_from = "VariableA",
  names_glue = "A{VariableA}{.value}",
  values_from = c("B1", "B2", "B3")
) %>% as.data.table()
d.wide.3
```

（本小节进度：3/3）

# 【阶段作业①】数据处理综合

作业要求：

-   基于[【作业4】](https://psychbruce.github.io/RCourse/Chap03#%E4%BD%9C%E4%B8%9A4%E6%9C%9F%E6%9C%AB%E8%87%AA%E9%80%89%E5%85%AC%E5%BC%80%E6%95%B0%E6%8D%AE%E5%AF%BC%E5%85%A5)和[【作业6】](https://psychbruce.github.io/RCourse/Chap05#%E4%BD%9C%E4%B8%9A6%E6%9C%9F%E6%9C%AB%E4%BD%9C%E4%B8%9A%E6%95%B0%E6%8D%AE%E5%8F%98%E9%87%8F%E8%AE%A1%E7%AE%97){.uri}的期末自选数据和代码积累，综合运用本章所学的各种数据操作方法和代码，对数据进行**增删查改、行列筛选、排序去重、分组汇总、匹配拼接、长宽转换**等操作，体现对目前所学全部内容的掌握和迁移应用
-   使用R Markdown完成，对关键代码及结果要有注释说明

平台提交：

-   运行得到的HTML网页
    -   提交文件命名格式：`学号-姓名-R阶段作业1.html`
