关于ggplot2:使用ggplot将直方图转换为r中的小提琴图

transform histogram to violin plot in r with ggplot

我目前正试图借助 Hadley Wickham 的丰富资源(《R for Data Science》和《ggplot2: Elegant Graphics for Data Analysis》)学习 R。到目前为止,我遇到的所有问题都能在其中找到答案(非常感谢,Hadley!),但这次没有。

目前,我正在处理一台仪器的数据,该仪器通过粒子散射的光来估算粒子的大小(DLS,Zetasizer Nano,Malvern Instruments)。从该设备导出的数据包括一些汇总统计量(例如平均粒径)以及直方图数据:x = 粒径(按区间分箱),y = 强度 [%]。
下面是我其中一次测量数据的 tibble:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
   # A tibble: 70 x 3
   sample_name        intensities      bins
   <chr>                    <dbl>     <dbl>
 1 core formulation 1         0       0.4  
 2 core formulation 1         0       0.463
 3 core formulation 1         0       0.536
 4 core formulation 1         0       0.621
 5 core formulation 1         0       0.720
 6 core formulation 1         0       0.833
 7 core formulation 1         0       0.965
 8 core formulation 1         0       1.12
 9 core formulation 1         0       1.29
10 core formulation 1         0       1.50
11 core formulation 1         0       1.74
12 core formulation 1         0       2.01
13 core formulation 1         0       2.33
14 core formulation 1         0       2.70
15 core formulation 1         0       3.12
16 core formulation 1         0       3.62
17 core formulation 1         0       4.19
18 core formulation 1         0       4.85
19 core formulation 1         0       5.62
20 core formulation 1         0       6.50
21 core formulation 1         0       7.53
22 core formulation 1         0       8.72
23 core formulation 1         0      10.1  
24 core formulation 1         0      11.7  
25 core formulation 1         0      13.5  
26 core formulation 1         0      15.7  
27 core formulation 1         0      18.2  
28 core formulation 1         0      21.0  
29 core formulation 1         0      24.4  
30 core formulation 1         0      28.2  
31 core formulation 1         0      32.7  
32 core formulation 1         0      37.8  
33 core formulation 1         0      43.8  
34 core formulation 1         0.2    50.8  
35 core formulation 1         1.4    58.8  
36 core formulation 1         3.7    68.1  
37 core formulation 1         6.9    78.8  
38 core formulation 1        10.2    91.3  
39 core formulation 1        12.9   106.  
40 core formulation 1        14.4   122.  
41 core formulation 1        14.4   142.  
42 core formulation 1        13     164.  
43 core formulation 1        10.3   190.  
44 core formulation 1         7.1   220.  
45 core formulation 1         3.9   255    
46 core formulation 1         1.5   295.  
47 core formulation 1         0.2   342    
48 core formulation 1         0     396.  
49 core formulation 1         0     459.  
50 core formulation 1         0     531.  
51 core formulation 1         0     615.  
52 core formulation 1         0     712.  
53 core formulation 1         0     825    
54 core formulation 1         0     955.  
55 core formulation 1         0    1106    
56 core formulation 1         0    1281    
57 core formulation 1         0    1484    
58 core formulation 1         0    1718    
59 core formulation 1         0    1990    
60 core formulation 1         0    2305    
61 core formulation 1         0    2669    
62 core formulation 1         0    3091    
63 core formulation 1         0    3580    
64 core formulation 1         0    4145    
65 core formulation 1         0    4801    
66 core formulation 1         0    5560    
67 core formulation 1         0    6439    
68 core formulation 1         0    7456    
69 core formulation 1         0    8635    
70 core formulation 1         0   10000

以下是通过dput()命令生成的数据:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
# dput() dump of the measurement table: 70 rows with the columns
# `sample_name` (character), `intensities` (double) and `bins` (double),
# classed as a tibble. According to the surrounding description, `bins` is the
# particle-size bin and `intensities` the intensity in percent.
# NOTE(review): the expression is not assigned to a name here; the plotting
# snippets refer to this data as `DLS_intensities_core`, so assign the result
# to that name before running them.
structure(list(sample_name = c("core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1","core formulation 1",
"core formulation 1","core formulation 1"), intensities = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 1.4, 3.7, 6.9, 10.2, 12.9,
14.4, 14.4, 13, 10.3, 7.1, 3.9, 1.5, 0.2, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), bins = c(0.4,
0.4632, 0.5365, 0.6213, 0.7195, 0.8332, 0.9649, 1.117, 1.294,
1.499, 1.736, 2.01, 2.328, 2.696, 3.122, 3.615, 4.187, 4.849,
5.615, 6.503, 7.531, 8.721, 10.1, 11.7, 13.54, 15.69, 18.17,
21.04, 24.36, 28.21, 32.67, 37.84, 43.82, 50.75, 58.77, 68.06,
78.82, 91.28, 105.7, 122.4, 141.8, 164.2, 190.1, 220.2, 255,
295.3, 342, 396.1, 458.7, 531.2, 615.1, 712.4, 825, 955.4, 1106,
1281, 1484, 1718, 1990, 2305, 2669, 3091, 3580, 4145, 4801, 5560,
6439, 7456, 8635, 10000)), class = c("tbl_df","tbl","data.frame"
), row.names = c(NA, -70L))

我可以从此数据中毫无问题地生成直方图:

1
2
3
4
library(tidyverse)

# Intensity per size bin, drawn as a line. The x axis is log10-scaled because
# the bin values range from 0.4 up to 10000.
ggplot(
  data = DLS_intensities_core,
  mapping = aes(x = bins, y = intensities)
) +
  geom_line() +
  scale_x_log10()

line

为了显示我的粒径的总体分布,我想将此数据转换为小提琴图,并在我的图的第二层中使用设备提供的汇总统计信息。

因此,我想对这些数据进行转换,以便能够从中创建一个小提琴图。

我已经尝试将其输入到小提琴图的stat_density()参数,但到目前为止没有成功。

您知道如何根据此数据创建小提琴图吗?

非常感谢!

祝好,

多米尼克


在您回复第二条评论后,我会(在需要时)更新这个回答。您可以用下面的代码得到 `bins` 和 `intensities` 的小提琴图:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
library(hrbrthemes)

# Reshape to long form (one row per sample/measure pair) so that `bins` and
# `intensities` can each get their own facet. `pivot_longer()` replaces the
# superseded `gather()`; the resulting columns (`measure`, `value`) are the
# same, only the row order differs, which does not affect the plot.
DLS_intensities_core %>%
  pivot_longer(
    cols = -sample_name,
    names_to = "measure",
    values_to = "value"
  ) %>%
  ggplot(aes(measure, value)) +
  geom_violin(scale = "count") +
  scale_y_comma() +
  # The two measures live on very different ranges, hence free scales.
  facet_wrap(~measure, scales = "free") +
  labs(
    x = NULL, y = "A better label than this",
    title = "A better title than this",
    caption = "NOTE: Free Y scales"
  ) +
  theme_ipsum_rc(grid = "Y") +
  # The facet strip already names the measure, so the x tick label is dropped.
  theme(axis.text.x = element_blank())

enter

1
2
3
4
5
6
7
8
9
10
11
12
13
# Same faceted violin plot of `bins` and `intensities`, with the individual
# observations overlaid as quasirandom-jittered points.
# `pivot_longer()` replaces the superseded `gather()`; the resulting columns
# (`measure`, `value`) are the same, only the row order differs.
DLS_intensities_core %>%
  pivot_longer(
    cols = -sample_name,
    names_to = "measure",
    values_to = "value"
  ) %>%
  ggplot(aes(measure, value)) +
  geom_violin(scale = "count") +
  ggbeeswarm::geom_quasirandom() +
  scale_y_comma() +
  # The two measures live on very different ranges, hence free scales.
  facet_wrap(~measure, scales = "free") +
  labs(
    x = NULL, y = "A better label than this",
    title = "A better title than this",
    caption = "NOTE: Free Y scales"
  ) +
  theme_ipsum_rc(grid = "Y") +
  # The facet strip already names the measure, so the x tick label is dropped.
  theme(axis.text.x = element_blank())

enter

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
library(hrbrthemes)
library(tidyverse)

# A single violin of the size bins on a log10 axis. Every bin is overlaid as a
# point whose size and fill colour both encode that bin's intensity.
ggplot(data = DLS_intensities_core, mapping = aes(x = "", y = bins)) +
  geom_violin(scale = "count") +
  ggbeeswarm::geom_quasirandom(
    mapping = aes(fill = intensities, size = intensities),
    shape = 21 # filled circle, so the `fill` aesthetic takes effect
  ) +
  scale_y_comma(trans = "log10") +
  # log1p keeps the zero-intensity bins defined while compressing the range.
  viridis::scale_fill_viridis(trans = "log1p", direction = -1) +
  scale_size_continuous(range = c(2, 10), trans = "log1p") +
  # Show fill as a discrete legend instead of the default colour bar.
  guides(fill = guide_legend()) +
  labs(
    title = "A better title than this",
    x = NULL,
    y = "A better label than this"
  ) +
  theme_ipsum_rc(grid = "Y")

enter

您必须做一些其他的自定义转换,以尝试使小提琴的形状随强度而变化(这实际上并不能反映该点的分布)。


我找到了解决问题的方法,它可能不是很优雅:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
library(tidyverse)

# Turn the binned intensities into pseudo-observations so that geom_violin()
# can estimate a density from them.
#
# The intensity values in the data carry one decimal place, so multiplying by
# 10 yields whole-number pseudo-counts. round() is required because the
# floating-point product can land just below the intended integer and row
# replication would otherwise truncate it (or reject it as non-integer).
DLS_intensities_core <- DLS_intensities_core %>%
  mutate(counts = round(intensities * 10))

# Repeat each bin `counts` times. uncount() keeps `sample_name` from the data
# instead of hard-coding it, so the same code also works when the table holds
# several samples. The result has the columns `value` (bin size) and
# `sample_name`.
violin_DLSdata <- DLS_intensities_core %>%
  filter(counts > 0) %>%
  uncount(counts) %>%
  select(value = bins, sample_name)

ggplot(violin_DLSdata, aes(sample_name, value)) +
  geom_violin() +
  labs(
    x = NULL, y = "size"
  ) +
  # Log-scaled size axis restricted to the 1-1000 range.
  scale_y_log10(limits = c(1, 1000))

violin_plot