5 Loss clock
5.1 Introduction
We continue to characterize the highly correlated group of CpGs (see Epigenomic-scores
notebook) we termed loss clock.
5.2 Initialize
source(here::here("scripts/init.R"))
5.3 Plot methylation distribution of the clock
df_sum <- fread(here("data/epigenomic_features_raw_meth.tsv")) %>% filter(!is.na(ER)) %>% as_tibble()
5.3.0.1 Figure 1i
options(repr.plot.width = 4, repr.plot.height = 4)
p_avg_clock <- df_sum %>%
ggplot(aes(x=clock, color=ER)) +
geom_density() +
scale_color_manual(values = annot_colors$ER1) +
theme(aspect.ratio = 1) +
ylab("Density") +
xlab("Clock avg. methylation") +
coord_cartesian(xlim=c(0.4, 1))
p_avg_clock + theme_bw() + theme(aspect.ratio = 0.9)
ks.test(df_sum[df_sum$ER == "ER+", ]$clock, df_sum[df_sum$ER == "normal", ]$clock)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: df_sum[df_sum$ER == "ER+", ]$clock and df_sum[df_sum$ER == "normal", ]$clock
## D = 0.73803, p-value < 2.2e-16
## alternative hypothesis: two-sided
ks.test(df_sum[df_sum$ER == "ER-", ]$clock, df_sum[df_sum$ER == "normal", ]$clock)
##
## Two-sample Kolmogorov-Smirnov test
##
## data: df_sum[df_sum$ER == "ER-", ]$clock and df_sum[df_sum$ER == "normal", ]$clock
## D = 0.68317, p-value < 2.2e-16
## alternative hypothesis: two-sided
5.4 Annotate "clock" score
5.4.1 Loci annotation
loci_annot <- fread(here("data/loci_annot_epigenomic_features.tsv")) %>% as_tibble()
cor_thresh <- 0.6
5.4.1.1 Figure 1k
options(repr.plot.width = 4, repr.plot.height = 4)
p_tss_d <- loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
ggplot(aes(x=abs(tss_d), color=type, linetype = type)) +
geom_density() +
xlab("Distance from TSS") +
ylab("Density") +
scale_color_manual(name = "", values=c(clock = "red", other = "gray")) +
scale_x_log10(label=scales::scientific) +
scale_linetype_manual(name = "", values=c(clock = "solid", other = "dashed")) +
theme(aspect.ratio = 0.8)
p_tss_d + theme_bw() + theme(aspect.ratio = 0.8)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 21 rows containing non-finite values (stat_density).
loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
summarise(pval = ks.test(abs(tss_d)[type == "clock"], abs(tss_d)[type == "other"])$p.value)
## Warning in ks.test(abs(tss_d)[type == "clock"], abs(tss_d)[type == "other"]): p-
## value will be approximate in the presence of ties
## # A tibble: 1 x 1
## pval
## 1 0
5.4.1.2 Figure 1l
options(repr.plot.width = 4, repr.plot.height = 4)
p_tor <- loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
ggplot(aes(x=-tor, color=type, linetype = type)) +
geom_density() +
xlab("Time of replication") +
ylab("Density") +
scale_color_manual(name = "", values=c(clock = "red", other = "gray")) +
scale_x_continuous(breaks = c(-80, 0), labels = c("Early", "Late")) +
scale_linetype_manual(name = "", values=c(clock = "solid", other = "dashed")) +
theme(aspect.ratio = 0.8)
p_tor + theme_bw() + theme(aspect.ratio = 0.8)
## Warning: Removed 28 rows containing non-finite values (stat_density).
loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
summarise(pval = ks.test(-tor[type == "clock"], -tor[type == "other"])$p.value)
## Warning in ks.test(-tor[type == "clock"], -tor[type == "other"]): p-value will
## be approximate in the presence of ties
## # A tibble: 1 x 1
## pval
## 1 0
5.4.1.3 Extended Data Figure 4d
options(repr.plot.width = 4, repr.plot.height = 4)
p_cg_cont_d <- loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
ggplot(aes(x=cg_cont, color=type, linetype = type)) +
geom_density() +
xlab("CpG content (500bp)") +
ylab("Density") +
scale_color_manual(name = "", values=c(clock = "red", other = "gray")) +
scale_linetype_manual(name = "", values=c(clock = "solid", other = "dashed")) +
theme(aspect.ratio = 0.8)
p_cg_cont_d + theme_bw() + theme(aspect.ratio = 0.8)
## Warning: Removed 8 rows containing non-finite values (stat_density).
loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
summarise(pval = ks.test(cg_cont[type == "clock"], cg_cont[type == "other"])$p.value)
## Warning in ks.test(cg_cont[type == "clock"], cg_cont[type == "other"]): p-value
## will be approximate in the presence of ties
## # A tibble: 1 x 1
## pval
## 1 0
5.4.1.4 Extended Data Figure 4c
options(repr.plot.width = 4, repr.plot.height = 4)
k4me1_names <- grep("k4me1", colnames(loci_annot))
clock_loci_annot <- loci_annot %>%
mutate(type = ifelse(clock >= cor_thresh, "clock", "other")) %>%
mutate(
enh = matrixStats::rowAnys((loci_annot[, k4me1_names] > 0.97), na.rm = TRUE),
polycomb = k27me3 > 0.97
) %>%
mutate(context = case_when(polycomb ~ "K27me3 peaks", enh ~ "K4me1 peaks", TRUE ~ "Background"))
p_enh_polycomb <- clock_loci_annot %>%
count(type, context) %>%
group_by(type) %>%
mutate(frac = n / sum(n)) %>%
ggplot(aes(x = context, y = frac, fill = type)) +
geom_col(position = "dodge") +
scale_fill_manual("", values = c(other = "gray", clock = "red")) +
scale_y_continuous(label = function(x) scales::percent(x, accuracy = 1)) +
ylab("Percent of sites") +
xlab("") +
guides(fill = FALSE) +
vertical_labs() +
theme(aspect.ratio = 0.8)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p_enh_polycomb + theme_bw() + theme(aspect.ratio = 0.8)
chisq.test(clock_loci_annot$type, clock_loci_annot$context)
##
## Pearson's Chi-squared test
##
## data: clock_loci_annot$type and clock_loci_annot$context
## X-squared = 4794, df = 2, p-value < 2.2e-16
5.4.2 Clinical annotation
feats <- fread(here("data/epigenomic_features.tsv")) %>%
mutate(ML = -ML, clock = -clock, immune.meth = -immune.meth, caf.meth = -caf.meth) %>% as_tibble()
nbins <- 5
df <- feats %>%
mutate(
clock = cut(clock, quantile(clock, 0:nbins/nbins, na.rm=TRUE), include.lowest=TRUE, labels=1:nbins)) %>%
left_join(samp_data %>% select(samp, stage, grade), by = "samp") %>%
mutate(stage = ifelse(stage %in% c(0, "DCIS", 1), "0-1", stage)) %>%
mutate(stage = ifelse(ER == "normal", "N", stage)) %>%
mutate(grade = ifelse(ER == "normal", "N", grade))
df_pval <- df %>% filter(ER %in% c("ER+", "ER-")) %>% gather("feat", "bin", -samp, -ER, -stage, -grade) %>% filter(feat == "clock") %>% group_by(ER, feat) %>% summarise(grade_pval = chisq.test(bin, grade)$p.value, stage_pval = chisq.test(bin, stage)$p.value)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## Warning in chisq.test(bin, grade): Chi-squared approximation may be incorrect
## Warning in chisq.test(bin, stage): Chi-squared approximation may be incorrect
## Warning in chisq.test(bin, stage): Chi-squared approximation may be incorrect
as.data.frame(df_pval)
## ER feat grade_pval stage_pval
## 1 ER- clock 0.6348596 0.07094066
## 2 ER+ clock 0.1471181 0.57227178
df_pval %>% filter(grade_pval <= 0.05)
## # A tibble: 0 x 4
## # groups: ER
## [1] ER feat grade_pval stage_pval
## <0 rows> (or 0-length row.names)
df_pval %>% filter(stage_pval <= 0.05)
## # A tibble: 0 x 4
## # groups: ER
## [1] ER feat grade_pval stage_pval
## <0 rows> (or 0-length row.names)
options(repr.plot.width = 6, repr.plot.height = 4)
p_stage_clock <- df %>%
filter(ER == "ER+") %>%
mutate(stage = factor(stage, levels = c("N", "0-1", "2", "3", "4"))) %>%
filter(!is.na(stage)) %>%
count(stage, clock) %>%
group_by(clock) %>%
mutate(p = n / sum(n)) %>%
ggplot(aes(x=clock, y=p, fill=stage)) +
geom_col() +
scale_fill_manual(name = "Stage", values = c("N" = "gray", "0-1" = "black", "2" = "blue", "3" = "red", "4" = "orange")) +
scale_y_continuous(labels=scales::percent) +
ylab("% of samples")
p_stage_clock + theme_bw()
5.4.2.1 Figure 1j
options(repr.plot.width = 6, repr.plot.height = 4)
p_grade_clock_positive <- df %>%
filter(ER == "ER+") %>%
mutate(grade = factor(grade, levels = c("N", "1", "2", "3"))) %>%
filter(!is.na(grade)) %>%
count(grade, clock) %>%
group_by(clock) %>%
mutate(p = n / sum(n)) %>%
ggplot(aes(x=clock, y=p, fill=grade)) +
geom_col() +
scale_fill_manual(name = "Grade", values = c("N" = "gray", "1" = "darkblue", "2" = "red", "3" = "orange")) +
scale_y_continuous(labels=scales::percent) +
ylab("% of samples")
p_grade_clock_negative <- df %>%
filter(ER == "ER-") %>%
mutate(grade = factor(grade, levels = c("N", "1", "2", "3"))) %>%
filter(!is.na(grade)) %>%
count(grade, clock) %>%
group_by(clock) %>%
mutate(p = n / sum(n)) %>%
ggplot(aes(x=clock, y=p, fill=grade)) +
geom_col() +
scale_fill_manual(name = "Grade", values = c("N" = "gray", "1" = "darkblue", "2" = "red", "3" = "orange")) +
scale_y_continuous(labels=scales::percent) +
ylab("% of samples")
p_grade_clock_positive + theme_bw()
p_grade_clock_negative + theme_bw()
5.5 Plot chromosomal traces of clock score
We separate the samples to tumors with high and low clock score (top and bottom 30%). We then look at average methylation in bins of 10K along the chromosome.
We smooth the methylation traces with rolling average of 50 bins.
5.5.0.1 Figure 1m
options(repr.plot.width = 14, repr.plot.height = 5)
p_trace <- plot_tor_clock_chrom_track("chr1", "ER+", iterator=1e4)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: Removed 147 row(s) containing missing values (geom_path).
p_trace$p + coord_cartesian(ylim = c(0.25, 0.77))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
5.5.0.2 Extended Data Figure 4f
options(repr.plot.width = 14, repr.plot.height = 5)
p_trace_chr10 <- plot_tor_clock_chrom_track("chr10", "ER+", iterator=1e4)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
## Warning: Removed 147 row(s) containing missing values (geom_path).
p_trace_chr10$p + coord_cartesian(ylim = c(0.1, 0.77))
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.
5.6 Correlation of gene expression to clock
We calculate the correlation of all the genes to the clock score.
feat_gene_cors <- get_expression_features_cors()
5.6.0.1 Extended Data Figure 5a
options(repr.plot.width = 4, repr.plot.height = 7)
top_genes <- feat_gene_cors %>%
filter(ER == "ER+") %>%
arrange(-clock) %>%
slice(1:25)
p_top_genes <- top_genes %>%
ggplot(aes(x = reorder(name, clock), y = clock, fill = ER)) +
geom_col() +
scale_fill_manual("", values = annot_colors$ER1) +
guides(fill = "none") +
ylim(0, 0.5) +
xlab("") +
ylab("ER+") +
coord_flip() +
ggtitle("Correlation of\nexpression with\nclock score") +
theme(plot.title = element_text(hjust = 0.5))
p_top_genes + theme_bw() + theme(plot.title = element_text(hjust = 0.5))
5.6.0.2 Extended Data Figure 5b
gene_feat_df <- get_gene_features_df(c("MAGEC2", "PAGE5")) %>% filter(!is.na(clock))
options(repr.plot.width = 7, repr.plot.height = 4)
p_mage_page <- gene_feat_df %>%
filter(ER != "normal") %>%
mutate(ER = factor(ER, levels=c("ER+", "ER-"))) %>%
ggplot(aes(x=clock, y=expr, color=ER)) +
geom_point(size=0.2) +
xlab("Clock") +
scale_color_manual(values=annot_colors$ER1) +
ylab("Gene expression") +
facet_wrap(ER~name, scales="free_y", nrow=1) +
theme(aspect.ratio = 1) +
guides(color="none")
p_mage_page + theme_bw() + theme(aspect.ratio = 1)
5.7 Plot score in different TOR regimes
all_mat_raw <- get_all_meth() %>% intervs_to_mat()
tor_strata <- loci_annot %>% mutate(tor_strata = cut(tor, breaks = main_config$genomic_regions$tor_low_mid_high, labels=c("late", "intermediate", "early"))) %>% pull(tor_strata) %>% forcats::fct_explicit_na()
samp_meth_tor <- tgs_matrix_tapply(t(all_mat_raw), tor_strata, mean, na.rm=TRUE) %>% t() %>% as.data.frame() %>% rownames_to_column("samp") %>% add_ER() %cache_df% here("data/samp_meth_tor.tsv") %>% as_tibble()
5.7.0.1 Extended Data Figure 4e
options(repr.plot.width = 9, repr.plot.height = 4)
p_early_late <- samp_meth_tor %>%
filter(ER != "normal") %>%
ggplot(aes(x=early, y=late, color=ER)) +
geom_abline(linetype = "dashed") +
geom_point(size=0.1) +
xlim(0.1, 0.9) +
ylim(0.1, 0.9) +
scale_color_manual(values=annot_colors$ER1) +
theme(aspect.ratio = 1)
p_mid_early <- samp_meth_tor %>%
filter(ER != "normal") %>%
ggplot(aes(x=early, y=intermediate, color=ER)) +
geom_abline(linetype = "dashed") +
geom_point(size=0.1) +
xlim(0.1, 0.9) +
ylim(0.1, 0.9) +
scale_color_manual(values=annot_colors$ER1) +
theme(aspect.ratio = 1)
samp_meth_tor %>% filter(ER != "normal") %>% summarise(cor_early_late = cor(early, late, method = "spearman", use = "pairwise.complete.obs"), cor_early_mid = cor(early, intermediate, method = "spearman", use = "pairwise.complete.obs"))
## # A tibble: 1 x 2
## cor_early_late cor_early_mid
## 1 0.5723737 0.7680809
(p_early_late + theme_bw() + theme(aspect.ratio = 1) ) + (p_mid_early + theme_bw() + theme(aspect.ratio = 1) )
5.8 Plot High-Low clock tumors vs TOR
5.8.0.1 Extended Data Figure 4g
options(repr.plot.width = 4, repr.plot.height = 4)
df <- get_tor_clock_chrom_trace("chr1", "ER+", 1e5) %>%
filter(high_loss_n >= 50, low_loss_n >= 50) %>%
filter(!is.na(tor), !is.na(high_loss), !is.na(low_loss))
cor(-df$tor, df$high_loss - df$low_loss, method = "spearman")
## [1] -0.6187326
p_tor_high_low <- df %>%
ggplot(aes(x = -tor, y = high_loss - low_loss)) +
geom_point(size=0.05, alpha=0.5) +
theme(aspect.ratio = 1) +
ylim(-0.4, 0.025) +
xlab("Time of replication\n(Early -> Late)") +
ylab("High Clock - Low Clock")
p_tor_high_low + theme_bw() + theme(aspect.ratio = 1)
5.9 Plot correlation of loci to clock methylation
5.9.0.1 Extended Data Figure 4h
options(repr.plot.width = 8, repr.plot.height = 4)
p_loci_cor <- loci_annot %>%
left_join(get_all_summary_meth()) %>%
mutate(normal_meth = cut(normal, c(0,0.3,0.7,1), include.lowest=TRUE)) %>%
ggplot(aes(x=clock)) +
geom_density() +
facet_grid(.~normal_meth) +
ylab("Density") +
xlab("Correlation of loci to clock methylation")
## Joining, by = c("chrom", "start", "end")
p_loci_cor + theme_bw()
5.10 Plot distribution of methylation of clock in normal samples
normal_mat <- get_all_meth() %>%
select(chrom:end, any_of(normal_samples)) %>%
intervs_to_mat()
loci_annot_f <- loci_annot %>%
filter(cg_cont <= 0.02) %>%
mutate(tor = cut(tor, breaks=main_config$genomic_regions$tor_low_mid_high, labels=c("late", "mid", "early"), include.lowest=TRUE)) %>% mutate(tor = forcats::fct_explicit_na(tor)) %>%
intervs_to_mat()
normal_clock <- tgs_matrix_tapply(t(normal_mat[rownames(loci_annot_f), ]), loci_annot_f[, "tor"], mean, na.rm=TRUE) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("samp") %>%
as_tibble()
nrow(loci_annot_f)
## [1] 36420
5.10.0.1 Extended Data Figure 5b
options(repr.plot.width = 4, repr.plot.height = 4)
df <- normal_clock %>%
gather("tor", "meth", -samp) %>%
mutate(tor = factor(tor, levels = c("early", "mid", "late"))) %>%
filter(!is.na(tor)) %>%
mutate(ER = "normal")
p_clock_normal <- df %>%
ggplot(aes(x = tor, y = meth, fill = ER)) +
geom_boxplot(outlier.size=0.1, lwd = 0.5) +
scale_fill_manual(values = annot_colors$ER1) +
theme(aspect.ratio = 1) +
guides(fill = FALSE) +
xlab("Time of replication") +
vertical_labs() +
ylab("Methylation in normal samples\n(CpG content <= 2%)")
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
p_clock_normal + theme_bw() + theme(aspect.ratio = 1)
5.11 Compare clock layer to biological age
5.11.0.1 Extended Data Figure 5c
options(repr.plot.width = 12, repr.plot.height = 4)
df <- feats %>%
left_join(samp_data %>% select(samp, age))
## Joining, by = "samp"
df %>% summarise(cor = cor(age, clock, method = "spearman", use = "pairwise.complete.obs"))
## # A tibble: 1 x 1
## cor
## 1 0.06042711
p_age <- df %>%
ggplot(aes(x=age, y=clock, color=ER)) +
geom_point(size=0.5) +
scale_color_manual(values = annot_colors$ER1) +
theme(aspect.ratio = 1) +
xlab("Biological age") +
ylab("Clock") +
facet_grid(.~ER)
p_age + theme_bw() + theme(aspect.ratio = 1)
## Warning: Removed 30 rows containing missing values (geom_point).
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 4907303 262.1 8028385 428.8 8028385 428.8
## Vcells 834169869 6364.3 1673781729 12770.0 1576695863 12029.3