2 METABRIC methylation data statistics
source(here::here("scripts/init.R"))
2.1 Breakdown of samples
2.1.0.1 Figure 1A
options(repr.plot.width = 7, repr.plot.height = 7)
p_donut <- samp_data %>%
count(IHC) %>%
filter(!is.na(IHC)) %>%
left_join(
tibble(
IHC = c("ER+HER2-", "ER+HER2+", "ER-HER2+", "TNBC", "ADJNORMAL"),
group = c("ER+", "ER+HER2+", "ER-HER2+", "Triple negative", "Adjacent normal")
), by = "IHC") %>%
mutate(
IHC = factor(IHC, levels = c("ER+HER2-", "ER+HER2+", "ER-HER2+", "TNBC", "ADJNORMAL")),
lab = glue("{group}\n({n})")
) %>%
ggpubr::ggdonutchart("n", "lab", fill = "IHC", lab.pos = "out", palette = annot_colors$IHC, font.family = "Arial", lab.font = c(6, "bold", "black"), ggtheme = ggpubr::theme_pubr(base_size = 6, base_family = "Arial")) + guides(fill = "none")
p_donut1 <- samp_data %>%
count(ER1) %>%
rename(ER = ER1) %>%
filter(!is.na(ER)) %>%
left_join(
tibble(
ER = c("ER+", "ER-", "normal"),
group = c("ER+", "ER-", "Adjacent normal")
), by = "ER") %>%
mutate(
ER = factor(ER, levels = c("ER+", "ER-", "normal")),
lab = glue("{group}\n({n})")
) %>%
ggpubr::ggdonutchart("n", "lab", fill = "ER", lab.pos = "out", palette = annot_colors$ER1, font.family = "Arial", lab.font = c(6, "bold", "black"), ggtheme = ggpubr::theme_pubr(base_size = 6, base_family = "Arial")) + guides(fill = "none")
p_donut
p_donut1
2.2 Number of samples
samp_data %>% mutate(ER1 = forcats::fct_explicit_na(ER1)) %>% count(ER1)
## # A tibble: 4 x 2
## ER1 n
## 1 ER- 350
## 2 ER+ 1179
## 3 normal 244
## 4 (Missing) 9
2.3 Total number of reads
tot_reads <- fread(here("data/sample_qc.csv")) %>%
inner_join(samp_data %>%
select(track), by = "track") %>%
pull(total_reads) %>%
sum()
glue("Overall, we used {round(tot_reads / 1e9, digits = 1)}B reads to cover {scales::comma(nrow(samp_data))} breast tumor and normal samples")
## Overall, we used 30.4B reads to cover 1,782 breast tumor and normal samples
2.4 Distribution of CpG coverage per sample
We calculate the distribution of every sample's CpG coverage:
samp_covs <- get_samples_coverage_dist(
tracks = samp_data$track,
breaks = c(0:100, 1e5),
intervals = gintervals.all()) %cache_df% here("data/sample_coverage_dist.tsv") %>% as_tibble()
samp_covs <- samp_covs %>% left_join(samp_data %>% select(samp, ER = ER1), by = "samp")
2.4.0.1 Figure 1B
options(repr.plot.width = 7, repr.plot.height = 7)
p_comulative <- samp_covs %>%
group_by(samp) %>%
summarise(
`>=10` = sum(cpgs[cov >= 10]),
`>=20` = sum(cpgs[cov >= 20]),
`>=30` = sum(cpgs[cov >= 30]),
`>=50` = sum(cpgs[cov >= 50]),
n_tot = sum(cpgs)
) %>%
gather("type", "n", -samp, -n_tot) %>%
ggplot(aes(x = n, color = type, y = 1 - ..y..)) +
stat_ecdf() +
scale_x_log10(labels = scales::scientific) +
scale_color_viridis_d("CpG coverage") +
scale_y_continuous(labels = function(x) round(x * length(unique(samp_covs$samp)))) +
ylab("Samples with #CpGs >= x") +
xlab("# of CpGs (log10)") +
theme(aspect.ratio = 0.8)
p_comulative + theme_bw() + theme(aspect.ratio = 0.8)
cov_above_10 <- samp_covs %>%
filter(cov >= 10) %>%
group_by(samp) %>%
summarise(cpgs = sum(cpgs))
df <- cov_above_10 %>%
summarise(n_threshold = sum(cpgs >= 1e6), n_tot = n()) %>%
mutate(p = scales::percent(n_threshold / n_tot))
glue("Number of samples (Y axis) with a given number of CpGs (X axis) covered with at least 10, 20, 30 or 50 reads. For example, in all samples 449,710 CpGs are covered with over 10 reads.", n_cpgs = scales::comma(min(cov_above_10$cpgs)))
## Number of samples (Y axis) with a given number of CpGs (X axis) covered with at least 10, 20, 30 or 50 reads. For example, in all samples 449,710 CpGs are covered with over 10 reads.
glue("Using our version of the RRBS protocol {perc} of the samples were covered by more than 10 reads for more than 1M CpGs", perc = df$p)
## Using our version of the RRBS protocol 93% of the samples were covered by more than 10 reads for more than 1M CpGs
2.5 Distribution of covered CpGs
We screen for CpGs that are covered by at least 5 reads in half or more of the samples:
min_samples <- round(0.5 * nrow(samp_data))
min_samples
## [1] 891
cov_cpgs <- gscreen_coverage(samp_data$track, 5, min_samples) %>% annotate_loci() %cache_df% here("data/cov_cpgs.tsv") %>% as_tibble()
2.5.0.1 Figure 1C
options(repr.plot.width = 7, repr.plot.height = 3)
p_tss <- cov_cpgs %>%
mutate(tss_d = abs(tss_d) + 1) %>%
ggplot(aes(x = tss_d)) +
geom_density() +
scale_x_log10() +
xlab("Distance from tss (bp)") +
ylab("Density")
p_cg_cont <- cov_cpgs %>%
ggplot(aes(x = cg_cont)) +
geom_density() +
xlab("CpG content (500 bp)") +
ylab("Density")
(p_tss + theme_bw()) + (p_cg_cont + theme_bw())
## Warning: Removed 147 rows containing non-finite values (stat_density).
2.6 Promoter coverage
Distribution of mean promoter coverage over all METABRIC samples, considering 13,282 active promoters. Active promoters were defined as promoters that had log expression >7 in at least one of the METABRIC samples used in this paper.
2.6.0.1 Figure 1D
options(repr.plot.width = 4.6, repr.plot.height = 2.7)
prom_expr_cov <- get_promoter_expr_covs()
min_n_expr <- 0
max_expr <- 7
prom_expr_cov_active <- prom_expr_cov %>%
filter(n_expr > min_n_expr, max_expr > !! max_expr)
p_prom_cov <- prom_expr_cov_active %>%
ggplot(aes(x = cov)) +
geom_histogram(binwidth = 0.1) +
scale_x_log10(breaks = c(1, 10, 50, 1000), labels = function(x) round(x)) +
coord_cartesian(xlim = c(0.01,1500)) +
ylab("# of promoters") +
xlab("Mean promoter coverage")
p_prom_cov + theme_bw()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 119 rows containing non-finite values (stat_bin).
nrow(prom_expr_cov_active)
## [1] 13198
n_cov <- prom_expr_cov_active %>% filter(cov >= 20) %>% nrow()
glue("{perc} of the promoters were nevertheless covered with over 20 reads on average (mean coverage {cov}), facilitating quantitative analysis downstream.", perc = scales::percent(n_cov / nrow(prom_expr_cov_active)), cov = round(mean(prom_expr_cov_active$cov) ))
## 75% of the promoters were nevertheless covered with over 20 reads on average (mean coverage 246), facilitating quantitative analysis downstream.
samp_tot_calls <- get_sample_tot_meth_calls() %cache_df% here("data/sample_tot_meth_calls.tsv") %>% as_tibble()
samp_tot_calls_promoters <- get_sample_tot_meth_calls(promoter_intervs) %cache_df% here("data/sample_tot_meth_calls_promoters.tsv") %>% as_tibble()
tot_reads <- sum(samp_tot_calls$Sum)
tot_promoter_reads <- sum(samp_tot_calls_promoters$Sum)
glue("{perc} of the reads mapped to bona-fide promoters", perc = scales::percent(tot_promoter_reads / tot_reads))
## 9% of the reads mapped to bona-fide promoters
We can also calculate the % of MSP1 fragments that is covering a promoter:
well_covered_loci <- {
msp1_meth <- get_msp1_meth()
msp1_meth_f <- filter_avg_meth(msp1_meth$avg, normal_fraction = 0.7, tumor_fraction = 0.7)
msp1_meth_f %>% select(chrom, start, end)
} %cache_df% here("data/well_covered_msp1_frags.tsv") %>% as_tibble()
n_loci <- nrow(well_covered_loci)
n_promoters <- well_covered_loci %>% gintervals.neighbors1(promoter_intervs) %>% filter(dist == 0) %>% nrow()
scales::percent(n_promoters / n_loci)
## [1] "10%"
2.7 Coverage per clinical annotations
samp_covs <- get_sample_tot_meth_calls() %cache_df% here("data/sample_tot_meth_calls.tsv") %>% as_tibble()
samp_covs <- samp_covs %>%
select(samp, tot_cov = Sum) %>%
left_join(samp_data %>% select(samp, stage, grade, iC10, ER = ER1), by = "samp") %>%
mutate(stage = ifelse(stage %in% c(0, "DCIS", 1), "0-1", stage)) %>%
mutate(stage = ifelse(ER == "normal", "N", stage)) %>%
mutate(grade = ifelse(ER == "normal", "N", grade)) %>%
mutate(stage = factor(stage, levels = c("N", "0-1", "2", "3", "4"))) %>%
mutate(grade = factor(grade, levels = c("N", "1", "2", "3")))
2.7.0.1 Extended Data Figure 1a
options(repr.plot.width = 10, repr.plot.height = 3)
p_stage <- samp_covs %>%
filter(!is.na(stage), !is.na(ER)) %>%
arrange(stage, tot_cov) %>%
mutate(samp = forcats::fct_inorder(samp)) %>%
ggplot(aes(x = samp, y = tot_cov, fill = stage)) +
geom_col() +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
scale_x_discrete(breaks = FALSE) +
scale_fill_manual(name = "Stage", values = c("N" = "gray", "0-1" = "black", "2" = "blue", "3" = "red", "4" = "orange")) +
xlab("") +
ylab("# of methylation calls")
p_stage + theme_bw() + theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
2.7.0.2 Extended Data Figure 1b
options(repr.plot.width = 10, repr.plot.height = 3)
p_iC10 <- samp_covs %>%
filter(!is.na(iC10), !is.na(ER)) %>%
mutate(iC10 = factor(iC10, levels = names(annot_colors$iC10))) %>%
arrange(iC10, tot_cov) %>%
mutate(samp = forcats::fct_inorder(samp)) %>%
ggplot(aes(x = samp, y = tot_cov, fill = iC10)) +
geom_col() +
theme(axis.ticks.x = element_blank(), axis.text.x = element_blank()) +
scale_y_continuous(labels = scales::unit_format(unit = "M", scale = 1e-6)) +
scale_x_discrete(breaks = FALSE) +
scale_fill_manual(values = annot_colors$iC10) +
xlab("") +
ylab("# of methylation calls")
p_iC10 + theme_bw() + theme(axis.ticks.x = element_blank(), axis.text.x = element_blank())
2.8 Marginal CpG coverage
cpg_mars <- covs_marginal(samp_data$track) %cache_df% here("data/cpg_cov_marginal.tsv") %>% as_tibble()
cpg_mars <- cpg_mars %>% mutate(mean_cov = cov / nrow(samp_data))
cov_cdf <- map_df(c(0:100, seq(200, 1000, 20)), ~ tibble(cov = .x, n = sum(cpg_mars$mean_cov >= .x))) %cache_df% here("data/cpg_cov_cdf.tsv") %>% as_tibble()
2.8.0.1 Extended Data Figure 1b
options(repr.plot.width = 4.4, repr.plot.height = 3.5)
p_cov_cdf <- cov_cdf %>%
filter(cov >= 1) %>%
ggplot(aes(x = cov, y = n)) +
geom_line() +
scale_y_log10(labels = scales::scientific) +
scale_x_log10() +
xlab("Average CpG coverage") +
ylab("# of CpGs")
p_cov_cdf + theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
2.8.1 Marginal CpG coverage of k4me1 hotspots
conf <- main_config$genomic_regions$enhancers
conf$H3K4me1_tracks
## [1] "Roadmap.Breast_Luminal_Epithelial_Cells.H3K4me1"
## [2] "Roadmap.Breast_Myoepithelial_Cells.H3K4me1_1"
## [3] "Roadmap.Breast_Myoepithelial_Cells.H3K4me1_2"
## [4] "encode.Tfbs.HMECwgEncodeBroadHistoneHmecH3k4me1StdSig"
conf$H3K4me1_thresh
## [1] 0.97
conf$H3K4me1_size
## [1] 200
intervs <- gpatterns.putative_enhancers(conf$H3K4me1_tracks, quant_thresh = conf$H3K4me1_thresh, normalize = conf$H3K4me1_size, min_tss_dist = conf$H3K4me1_tss_dist)
k4me1_mars <- cpg_mars %>%
gintervals.neighbors1(intervs) %>%
filter(dist == 0) %>%
group_by(chrom1, start1, end1) %>%
summarise(cov = sum(cov), mean_cov = cov / nrow(samp_data))
k4me1_mars <- k4me1_mars %>%
ungroup() %>%
select(chrom = chrom1, start = start1, end = end1, cov, mean_cov)
2.8.1.1 Extended Data Figure 1c
options(repr.plot.width = 4.4, repr.plot.height = 3.5)
p_k4me1 <- k4me1_mars %>%
ggplot(aes(x = mean_cov, y = (1 - ..y..) * nrow(k4me1_mars))) +
stat_ecdf() +
scale_y_log10(labels = scales::scientific, breaks = c(10, 1e3, 1e4, 1e5)) +
scale_x_log10(breaks = c(1, 10, 30, 100, 1000)) +
xlab("Mean coverage") +
ylab("# of k4me1 hotsplots") +
vertical_labs()
p_k4me1 + theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
k4me1_mars <- cpg_mars %>%
gintervals.neighbors1(enh_intervs_tumors) %>%
filter(dist == 0) %>%
group_by(chrom1, start1, end1) %>%
summarise(cov = sum(cov), mean_cov = cov / nrow(samp_data))
k4me1_mars <- k4me1_mars %>%
ungroup() %>%
select(chrom = chrom1, start = start1, end = end1, cov, mean_cov)
options(repr.plot.width = 4.4, repr.plot.height = 3.5)
p_k4me1_tumors <- k4me1_mars %>%
ggplot(aes(x = mean_cov, y = (1 - ..y..) * nrow(k4me1_mars))) +
stat_ecdf() +
scale_y_log10(labels = scales::scientific, breaks = c(10, 1e3, 1e4, 1e5)) +
scale_x_log10(breaks = c(1, 10, 30, 100, 1000)) +
xlab("Mean coverage") +
ylab("# of k4me1 hotsplots") +
vertical_labs()
p_k4me1_tumors + theme_bw()
## Warning: Transformation introduced infinite values in continuous y-axis
2.9 Sample cellularity
2.9.0.1 Extended Data Figure 1d
p_cellularity_cov <- samp_covs %>%
filter(!is.na(ER)) %>%
left_join(samp_data %>% select(samp, cna_cellularity)) %>%
slice(sample(1:n())) %>%
filter(ER != "normal") %>%
ggplot(aes(x = tot_cov, y = cna_cellularity, color = ER)) +
geom_point(size=0.1) +
scale_color_manual(values = annot_colors$ER1) +
scale_x_log10() +
theme(aspect.ratio = 1) +
guides(color = FALSE) +
xlab("Sample Coverage") +
ylab("Cellularity") +
vertical_labs() +
facet_grid(.~ER)
## Joining, by = "samp"
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p_cellularity_cov
## Warning: Removed 279 rows containing missing values (geom_point).
samp_genomic_meth <- get_genomic_avg_meth() %>% intervs_to_mat() %>% colMeans(na.rm=TRUE) %>% enframe("samp", "meth") %cache_df% here("data/samp_genomic_meth.tsv") %>% as_tibble()
p_cellularity_meth <- samp_genomic_meth %>%
left_join(samp_data %>% select(samp, cna_cellularity, ER = ER1)) %>%
filter(!is.na(ER)) %>%
slice(sample(1:n())) %>%
filter(ER != "normal") %>%
ggplot(aes(x = meth, y = cna_cellularity, color = ER)) +
geom_point(size=0.1) +
scale_color_manual(values = annot_colors$ER1) +
theme(aspect.ratio = 1) +
guides(color = FALSE) +
xlab("Sample genomic methylation") +
ylab("Cellularity") +
facet_grid(.~ER)
## Joining, by = "samp"
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p_cellularity_meth
## Warning: Removed 279 rows containing missing values (geom_point).
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4497725 240.3 7606011 406.3 7606011 406.3
## Vcells 211991261 1617.4 562383716 4290.7 490804511 3744.6