版权声明:本套课程材料开源,使用和分享必须遵守「创作共用许可协议 CC BY-NC-SA」(来源引用-非商业用途使用-以相同方式共享)。


数据可视化的总体思路

【探索发现】数据有了,图怎么画?

  • R Graph Gallery R绘图美术馆
    • 绘制变量分布
      • 直方图(Histogram)
      • 密度图(Density)
      • 箱线图(Boxplot)
      • 提琴图(Violin)
      • 山脊图(Ridgeline)
    • 绘制变量大小
      • 柱形图(Barplot)
      • 点线图(Dotted line)
    • 绘制变量关系
      • 散点图(Scatterplot)
      • 气泡图(Bubble)
      • 热力图(Heatmap)
    • 绘制变量趋势
      • 折线图(Line chart)
      • 面积图(Area chart)

【知识点】ggplot2“搭积木”式绘图语法

ggplot2绘图语法的逻辑:

  1. 准备数据(data),将变量映射到美学属性(aes),定义XY轴、颜色、大小等
  2. 添加几何对象(geom),绘制点、线、面等,多个图层可通过加号(+)依次叠加
  3. 可做分面小图(facet),调整标度(scale),投影到不同坐标系(coord)
  4. 设置标题标签(labs),调整图例(guides),根据喜好选择主题方案(theme)

绘制变量分布

直方图

【实践1】直方图

## Data preparation: start from the `airquality` data set
data = airquality
data$Month = as.factor(data$Month)  # treat month as a categorical variable
data$Temp.C = (data$Temp - 32) / 1.8  # Celsius = (Fahrenheit - 32) / 1.8
str(data)  # print the structure of the prepared data frame
'data.frame':   153 obs. of  7 variables:
 $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
 $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
 $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
 $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
 $ Month  : Factor w/ 5 levels "5","6","7","8",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ Temp.C : num  19.4 22.2 23.3 16.7 13.3 ...
## Base R plotting function: histogram
hist(data$Temp.C)

## Building a plot block by block with ggplot2
ggplot()  # empty canvas

ggplot(data, aes(x=Temp.C))  # data and x mapping supplied: a coordinate system appears

ggplot(data, aes(x=Temp.C)) +
  geom_histogram()  # histogram layer with default binning
`stat_bin()` using `bins = 30`. Pick better value `binwidth`.

## Histogram with an explicit number of bins
ggplot(data, aes(x=Temp.C)) +
  geom_histogram(bins=10,
                 color="black",
                 fill="grey")

## Histogram with an explicit bin width (in units of Temp.C)
ggplot(data, aes(x=Temp.C)) +
  geom_histogram(binwidth=5,
                 color="black",
                 fill="grey")

## Put a computed density on the y axis instead of raw counts
ggplot(data, aes(x=Temp.C)) +
  geom_histogram(
    aes(y=after_stat(density)),
    ## density: scaled so that the bar areas integrate to 1
    binwidth=5,
    color="black",
    fill="grey")

ggplot(data, aes(x=Temp.C)) +
  geom_histogram(
    aes(y=after_stat(ndensity)),
    ## ndensity: density rescaled so that the maximum equals 1
    binwidth=5,
    color="black",
    fill="grey")

密度图

【实践2】密度图

## Kernel density estimate with the default bandwidth
ggplot(data, aes(x=Temp.C)) +
  geom_density()

## `adjust` multiplies the default bandwidth: < 1 gives a wigglier curve
ggplot(data, aes(x=Temp.C)) +
  geom_density(adjust=0.5)

## ... and > 1 gives a smoother curve
ggplot(data, aes(x=Temp.C)) +
  geom_density(adjust=2)

## Constant styling of the outline and the filled area
ggplot(data, aes(x=Temp.C)) +
  geom_density(adjust=2,
               linewidth=2,
               color="darkblue",
               fill="lightblue")

## Histogram and density overlaid; y is mapped to `ndensity` at the plot
## level so both layers inherit the same "maximum = 1" vertical scale
ggplot(data, aes(x=Temp.C, y=after_stat(ndensity))) +
  geom_histogram(binwidth=5) +
  geom_density(adjust=2,
               linewidth=2,
               color="darkblue",
               fill="lightblue",
               alpha=0.5)  # semi-transparent so the histogram stays visible

【探索发现】颜色的RGB十六进制码

箱线图

【知识点】箱线图的统计含义

  • 四分位数(quartile):把数据四等分的界限
    • Q1:0~25%(第25百分位数,下四分位数)
    • Q2:25~50%(第50百分位数,即中位数)
    • Q3:50~75%(第75百分位数,上四分位数)
    • Q4:75~100%(最后一个四分之一区间,上界为最大值)
  • IQR(interquartile range):四分位距(分布中Q1和Q3之间的变量值范围)
    • IQR = Q3 – Q1

【实践3】箱线图

## Base R plotting function: box plot
boxplot(data$Temp.C)  # one box for all observations

boxplot(Temp.C ~ Month, data=data)  # formula interface: one box per month

## Building the plot block by block with ggplot2
ggplot(data, aes(y=Temp.C)) +
  geom_boxplot()  # only y is mapped: a single box

ggplot(data, aes(x=Month, y=Temp.C)) +
  geom_boxplot()  # one box per level of the Month factor

## Constant (non-mapped) styling of the boxes and the outlier points
ggplot(data, aes(x=Month, y=Temp.C)) +
  geom_boxplot(
    fill="blue",
    alpha=0.3,
    outlier.color="red",
    outlier.size=2)

## Fill mapped to Month, explicit y-axis limits, custom axis/legend titles
ggplot(data, aes(x=Month, y=Temp.C)) +
  geom_boxplot(aes(fill=Month)) +
  scale_y_continuous(
    limits=c(10, 40)) +
  labs(x="Month",
       y="Temperature",
       fill="Month")

提琴图

【实践4】提琴图

## Violin plot: one violin per month, filled by month
ggplot(data, aes(x=Month, y=Temp.C)) +
  geom_violin(aes(fill=Month)) +
  scale_y_continuous(limits=c(10, 40)) +
  labs(x="Month",
       y="Temperature",
       fill="Month")

## Violin plus a narrow white box plot; the box plot layer is added after
## the violin layer, so it is drawn on top of it
ggplot(data, aes(x=Month, y=Temp.C)) +
  geom_violin(aes(fill=Month)) +
  geom_boxplot(fill="white", width=0.3) +
  scale_y_continuous(limits=c(10, 40)) +
  labs(x="Month",
       y="Temperature",
       fill="Month")

山脊图

【探索发现】山脊图

## Install once if needed:
## install.packages("ggridges")
library(ggridges)

## Basic ridgeline plot: one density ridge per month along the y axis
ggplot(data, aes(x=Temp.C, y=Month)) +
  geom_density_ridges() +
  theme_ridges()
Picking joint bandwidth of 1.47

## Gradient-filled ridges: fill is mapped to the x value computed by the stat,
## so the fill colour varies along the temperature axis
ggplot(data, aes(x=Temp.C, y=Month, fill=after_stat(x))) +
  geom_density_ridges_gradient() +
  labs(x="Temperature", y="Month") +
  theme_ridges()
Picking joint bandwidth of 1.47

## Gradient ridges with a viridis colour scale and no legend
ggplot(data, aes(x=Temp.C, y=Month, fill=after_stat(x))) +
  geom_density_ridges_gradient(
    scale=0.95,        # scale the height of the tallest ridge to 95%
    show.legend=FALSE  # hide the legend
  ) +
  scale_fill_viridis_c(option="C") +  # viridis colour scheme
  labs(x="Temperature", y="Month") +
  theme_ridges()
Picking joint bandwidth of 1.47

## Ridgeline drawn from binned counts (histogram-style) instead of densities
ggplot(data, aes(x=Temp.C, y=Month, fill=after_stat(x))) +
  geom_density_ridges_gradient(
    stat="binline",    # histogram statistical transformation
    bins=20,           # 20 histogram bins
    scale=0.8,         # scale the height of the tallest ridge to 80%
    show.legend=FALSE  # hide the legend
  ) +
  scale_x_continuous(limits=c(10, 40)) +
  scale_fill_viridis_c(option="C") +
  labs(x="Temperature", y="Month") +
  theme_ridges()

绘制变量大小

柱形图

【实践5】柱形图

## Data preparation: estimated marginal means (emmeans) of temperature per month
model = lm(Temp.C ~ Month, data)
# Equivalent without the pipe:
# summary(emmeans(model, "Month"))
means = model %>% emmeans("Month") %>% summary()
means  # print the summary table (emmean, SE, df, confidence limits)
 Month emmean    SE  df lower.CL upper.CL
 5       18.6 0.664 148     17.3     19.9
 6       26.2 0.675 148     24.8     27.5
 7       28.8 0.664 148     27.5     30.1
 8       28.9 0.664 148     27.6     30.2
 9       24.9 0.675 148     23.6     26.3

Confidence level used: 0.95 
## Bar chart of the monthly means (geom_col uses the y values as bar heights)
ggplot(means, aes(x=Month, y=emmean)) +
  geom_col(color="black",
           fill="grey",
           width=0.6)

## Same bars plus error bars spanning lower.CL..upper.CL from the emmeans summary
ggplot(means, aes(x=Month, y=emmean)) +
  geom_col(color="black",
           fill="grey",
           width=0.6) +
  geom_errorbar(aes(ymin=lower.CL,
                    ymax=upper.CL),
                width=0.1) +
  labs(y="Mean Temperature",
       title="Air Quality Data") +
  theme_classic()

## Data preparation: estimated marginal means (emmeans) for a two-factor
## between-subjects ANOVA
## NOTE(review): `between.2` is not defined in this section -- presumably a
## demo data set provided by a package loaded earlier; confirm.
d = between.2
d$A = as.factor(d$A)
d$B = as.factor(d$B)
model = lm(SCORE ~ A * B, data=d)  # main effects of A and B plus their interaction
means = model %>% emmeans("A", by="B") %>% summary()  # means of A within each level of B
means
B = 1:
 A emmean    SE df lower.CL upper.CL
 1   4.00 0.682 18     2.57     5.43
 2   3.75 0.682 18     2.32     5.18

B = 2:
 A emmean    SE df lower.CL upper.CL
 1   4.00 0.682 18     2.57     5.43
 2   8.00 0.682 18     6.57     9.43

B = 3:
 A emmean    SE df lower.CL upper.CL
 1   4.75 0.682 18     3.32     6.18
 2  12.00 0.682 18    10.57    13.43

Confidence level used: 0.95 
## Grouped bar chart: levels of B shown side by side within each level of A
ggplot(means, aes(x=A, y=emmean, fill=B)) +
  geom_col(position="dodge",  # shift bars horizontally so they do not overlap
           width=0.6)

## Add error bars, dodged by the same amount as the bar width (0.6)
ggplot(means, aes(x=A, y=emmean, fill=B)) +
  geom_col(position="dodge",  # shift bars horizontally so they do not overlap
           width=0.6) +
  geom_errorbar(aes(ymin=lower.CL,
                    ymax=upper.CL),
                position=position_dodge(0.6),
                width=0.15,
                color="black")

## Polished version: fixed y axis without padding, ColorBrewer fill, classic theme
ggplot(means, aes(x=A, y=emmean, fill=B)) +
  geom_col(position="dodge", width=0.6) +
  geom_errorbar(aes(ymin=lower.CL,
                    ymax=upper.CL),
                position=position_dodge(0.6),
                width=0.15,
                color="black") +
  scale_y_continuous(expand=expansion(add=0),  # no extra space beyond the limits
                     limits=c(0, 15),
                     breaks=seq(0, 15, 3)) +
  scale_fill_brewer(palette="Set1") +
  labs(x="A", y="SCORE") +
  theme_classic()

点线图

【实践6】点线图

## Data preparation: estimated marginal means (emmeans) of temperature per month
model = lm(Temp.C ~ Month, data)
# Equivalent without the pipe:
# summary(emmeans(model, "Month"))
means = model %>% emmeans("Month") %>% summary()
means  # print the summary table (emmean, SE, df, confidence limits)
 Month emmean    SE  df lower.CL upper.CL
 5       18.6 0.664 148     17.3     19.9
 6       26.2 0.675 148     24.8     27.5
 7       28.8 0.664 148     27.5     30.1
 8       28.9 0.664 148     27.6     30.2
 9       24.9 0.675 148     23.6     26.3

Confidence level used: 0.95 
## Dot-and-line plot of the estimated means with confidence intervals
emmip(model, ~ Month, CIs=TRUE)  # returns a ggplot object

## Because the result is a ggplot object, further components can be added with `+`
emmip(model, ~ Month, CIs=TRUE) +
  labs(x="Month",
       y="Mean Temperature",
       title="Air Quality Data",
       subtitle="Daily Temperature",
       caption="* Error bar = 95% CI") +
  theme_classic()

绘制变量关系

散点图

【实践7】散点图

## Base R plotting function: scatter plot
plot(x=data$Wind, y=data$Temp.C)

## Building the plot block by block with ggplot2
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_point()

## Points first, then the smoother
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_point(color="grey") +
  geom_smooth()  # layers added later are drawn on top!
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Same layers in the opposite order: the points now cover the smoother
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_smooth() +
  geom_point(color="grey")  # layers added later are drawn on top!
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Linear regression line (method="lm") in place of the default smoother
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_point(color="grey") +
  geom_smooth(method="lm", color="black")
`geom_smooth()` using formula = 'y ~ x'

## Horizontal reference line at y = 22, added last so it is drawn on top
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_point(color="grey") +
  geom_smooth(method="lm", color="black") +
  geom_hline(yintercept=22, linetype="dashed", color="red")
`geom_smooth()` using formula = 'y ~ x'

## Reference line added first so that points and regression line are drawn over it
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_hline(yintercept=22, linetype="dashed", color="red") +
  geom_point(color="grey") +
  geom_smooth(method="lm", color="black")
`geom_smooth()` using formula = 'y ~ x'

## Horizontal (y = 22) and vertical (x = 9) reference lines beneath the data layers
ggplot(data, aes(x=Wind, y=Temp.C)) +
  geom_hline(yintercept=22, linetype="dashed", color="red") +
  geom_vline(xintercept=9, linetype="dashed", color="blue") +
  geom_point(color="grey") +
  geom_smooth(method="lm", color="black")
`geom_smooth()` using formula = 'y ~ x'

## Point colour mapped to the Month factor (default discrete palette)
ggplot(data, aes(x=Wind, y=Temp.C, color=Month)) +
  geom_point()

## Same plot with the ColorBrewer "Set1" palette
ggplot(data, aes(x=Wind, y=Temp.C, color=Month)) +
  geom_point() +
  scale_color_brewer(palette="Set1")

## color=Month is set at the plot level, so geom_smooth inherits it and
## fits one regression line per month; se=FALSE hides the confidence bands
ggplot(data, aes(x=Wind, y=Temp.C, color=Month)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  scale_color_brewer(palette="Set1") +
  labs(title="Temperature & Wind Speed")
`geom_smooth()` using formula = 'y ~ x'

## Per-month regression lines plus one overall regression line.
## In the second geom_smooth, color is set to a constant, which overrides the
## inherited color=Month mapping for that layer, so the layer is no longer
## split by month and a single black line (with confidence band) is fitted.
ggplot(data, aes(x=Wind, y=Temp.C, color=Month)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  geom_smooth(method="lm", color="black") +
  scale_x_continuous(limits=c(0, 21)) +
  scale_y_continuous(limits=c(10, 40)) +
  scale_color_brewer(palette="Set1") +
  labs(title="Temperature & Wind Speed")
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'

气泡图

【实践8】气泡图

## Bubble chart: point size mapped to Solar.R, outline colour mapped to Month.
## Solar.R contains NA values, hence the "Removed ... rows" warning.
ggplot(data, aes(x=Wind, y=Temp.C, color=Month, size=Solar.R)) +
  geom_point(shape=21) +  # shape 21: circle with separate outline and fill
  scale_color_brewer(palette="Set1")
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).

## Same bubble chart with explicit size-legend breaks (50, 100, ..., 300)
ggplot(data, aes(x=Wind, y=Temp.C, color=Month, size=Solar.R)) +
  geom_point(shape=21) +
  scale_color_brewer(palette="Set1") +
  scale_size_continuous(breaks=seq(50, 300, 50))
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).

## Two point layers: the first draws translucent (alpha=0.2) bubbles filled by
## month, the second redraws the bubbles without alpha so the outlines stay
## crisp. Each layer drops the NA rows, hence the warning appears twice.
ggplot(data, aes(x=Wind, y=Temp.C, color=Month, size=Solar.R)) +
  geom_point(aes(fill=Month), alpha=0.2, shape=21) +
  geom_point(shape=21) +
  scale_color_brewer(palette="Set1") +
  scale_fill_brewer(palette="Set1") +  # same palette for fill and outline
  scale_size_continuous(breaks=seq(50, 300, 50))
Warning: Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 7 rows containing missing values or values outside the scale range
(`geom_point()`).

热力图

【探索发现】热力图

## Correlations among all airquality variables; the call prints the table
## below, and the returned object has a `plot` element that is used afterwards.
## NOTE(review): `Corr` is not defined in this section -- presumably provided
## by a package loaded earlier; confirm. The variable name `cor` is also the
## name of the base correlation function.
cor = Corr(airquality)
Pearson's r and 95% confidence intervals:
─────────────────────────────────────────────────
                   r       [95% CI]     p       N
─────────────────────────────────────────────────
Ozone-Solar.R   0.35 [ 0.17,  0.50] <.001 *** 111
Ozone-Wind     -0.60 [-0.71, -0.47] <.001 *** 116
Ozone-Temp      0.70 [ 0.59,  0.78] <.001 *** 116
Ozone-Month     0.16 [-0.02,  0.34]  .078 .   116
Ozone-Day      -0.01 [-0.20,  0.17]  .888     116
Solar.R-Wind   -0.06 [-0.22,  0.11]  .496     146
Solar.R-Temp    0.28 [ 0.12,  0.42] <.001 *** 146
Solar.R-Month  -0.08 [-0.23,  0.09]  .366     146
Solar.R-Day    -0.15 [-0.31,  0.01]  .070 .   146
Wind-Temp      -0.46 [-0.57, -0.32] <.001 *** 153
Wind-Month     -0.18 [-0.33, -0.02]  .027 *   153
Wind-Day        0.03 [-0.13,  0.19]  .739     153
Temp-Month      0.42 [ 0.28,  0.54] <.001 *** 153
Temp-Day       -0.13 [-0.28,  0.03]  .108     153
Month-Day      -0.01 [-0.17,  0.15]  .922     153
─────────────────────────────────────────────────

## The stored plot accepts further ggplot2 components: add a title
cor$plot + labs(title="Correlation Plot")

## Replace the fill scale with a binned ColorBrewer "RdBu" scale over [-1, 1];
## adding a second fill scale triggers the "Scale for fill is already present" message
cor$plot +
  labs(title="Correlation Plot") +
  scale_fill_fermenter(
    palette="RdBu",
    direction=1,
    limits=c(-1, 1),
    breaks=seq(-1, 1, 0.2),
    guide=guide_colorsteps(  # stepped colour-bar legend and its dimensions
      barwidth=0.5,
      barheight=10))
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

## Same binned fill scale using the ColorBrewer "Spectral" palette
cor$plot +
  labs(title="Correlation Plot") +
  scale_fill_fermenter(
    palette="Spectral",
    direction=1,
    limits=c(-1, 1),
    breaks=seq(-1, 1, 0.2),
    guide=guide_colorsteps(  # stepped colour-bar legend and its dimensions
      barwidth=0.5,
      barheight=10))
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.

绘制变量趋势

折线图

【实践9】折线图

## Data preparation for time series: `data` is rebuilt from the raw data set
## as a data.table (Month is not converted to a factor here)
data = as.data.table(airquality)
## Build a Date column from Month and Day; the year 1973 is hard-coded
data[, Date := as.Date(sprintf("1973-%02d-%02d", Month, Day))]
data[, Temp.C := (Temp - 32) / 1.8]  # Celsius = (Fahrenheit - 32) / 1.8
data
     Ozone Solar.R  Wind  Temp Month   Day       Date Temp.C
     <int>   <int> <num> <int> <int> <int>     <Date>  <num>
  1:    41     190   7.4    67     5     1 1973-05-01  19.44
  2:    36     118   8.0    72     5     2 1973-05-02  22.22
  3:    12     149  12.6    74     5     3 1973-05-03  23.33
  4:    18     313  11.5    62     5     4 1973-05-04  16.67
  5:    NA      NA  14.3    56     5     5 1973-05-05  13.33
 ---                                                        
149:    30     193   6.9    70     9    26 1973-09-26  21.11
150:    NA     145  13.2    77     9    27 1973-09-27  25.00
151:    14     191  14.3    75     9    28 1973-09-28  23.89
152:    18     131   8.0    76     9    29 1973-09-29  24.44
153:    20     223  11.5    68     9    30 1973-09-30  20.00
## Line chart of daily temperature over time
ggplot(data, aes(x=Date, y=Temp.C)) +
  geom_line()

## Add a smoothed trend on top of the raw line
ggplot(data, aes(x=Date, y=Temp.C)) +
  geom_line() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Line, one point per observation, and the smoothed trend
ggplot(data, aes(x=Date, y=Temp.C)) +
  geom_line() +
  geom_point() +
  geom_smooth()
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Point colour mapped to temperature with a continuous ColorBrewer "RdYlBu"
## scale; the line and the trend use constant colours
ggplot(data, aes(x=Date, y=Temp.C)) +
  geom_line(color="grey") +
  geom_point(aes(color=Temp.C)) +
  geom_smooth(color="black") +
  scale_color_distiller(palette="RdYlBu")
`geom_smooth()` using method = 'loess' and formula = 'y ~ x'

## Polished version: filled points (shape 21), formatted date axis, fixed y
## range, and custom labels
ggplot(data, aes(x=Date, y=Temp.C)) +
  geom_line(linewidth=1) +
  geom_point(aes(fill=Temp.C), shape=21) +
  scale_x_date(date_labels="%m-%d",  # month-day labels; use "%Y-%m-%d" to include the year
               date_breaks="1 month",
               date_minor_breaks="7 days") +
  scale_y_continuous(limits=c(10, 40)) +
  scale_fill_distiller(palette="RdYlBu") +
  labs(x=NULL,  # NULL removes the x-axis title
       y="Temperature",
       title="Daily Temperature")

【作业9】基础绘图练习

作业要求:

  • 基于期末自选数据,运用本章所学的ggplot2绘图代码,练习绘制变量的分布(直方图)、大小(柱形图)、关系(散点图)、趋势(折线图),每种图绘制一个即可
  • 使用R Markdown完成,对关键代码及结果要有注释说明

平台提交:

  • 运行得到的HTML网页,及其关键部分截图
