12 Finding specific dosage compensation examples

autosome_cands_3n <- get_autosome_dosage_comp_cands("3N")
head(autosome_cands_3n)
## # A tibble: 6 x 17
##   chrom  start    end  ER          1N          2N        >=3N n_>=3N n_1N n_2N
## 1  chr1 762469 763020 ER- 0.008341878 0.007857689 0.007852706     15   60  210
## 2  chr1 762469 763020 ER+ 0.008089050 0.008125402 0.006506879     12  211  816
## 3  chr1 762676 763227 ER- 0.021358769 0.026941446 0.028412915     16   67  220
## 4  chr1 762676 763227 ER+ 0.070967669 0.042346185 0.028590166     16  228  846
## 5  chr1 762851 763402 ER- 0.022340914 0.029069381 0.029766663     16   67  220
## 6  chr1 762851 763402 ER+ 0.078736553 0.047128295 0.032413643     15  228  845
##   expr_>=3N expr_1N expr_2N n_expr_>=3N n_expr_1N n_expr_2N      name
## 1        NA      NA      NA          NA        NA        NA LINC01128
## 2        NA      NA      NA          NA        NA        NA LINC01128
## 3        NA      NA      NA          NA        NA        NA      <NA>
## 4        NA      NA      NA          NA        NA        NA      <NA>
## 5        NA      NA      NA          NA        NA        NA LINC00115
## 6        NA      NA      NA          NA        NA        NA LINC00115
thresh <- 0.1
dosage_cands_gain_3n <- autosome_cands_3n %>% 
        filter(`>=3N` >= (`2N` + thresh) ) %>%         
        filter(`n_>=3N` >= 5, `n_2N` >= 5) %>% #,  !is.na(name))  %>%
        select(chrom, start, end, name, ER, `>=3N`, `2N`, `n_>=3N`, n_2N, `expr_>=3N`, `expr_2N`, `n_expr_>=3N`, `n_expr_2N`) %>%
        arrange(-`>=3N`, ER, name) 
dosage_cands_gain_3n %>% count(ER)
## # A tibble: 2 x 2
##    ER   n
## 1 ER- 197
## 2 ER+ 279
autosome_cands_4n <- get_autosome_dosage_comp_cands("4N")
head(autosome_cands_4n)
## # A tibble: 6 x 21
##   chrom  start    end  ER          1N          2N          3N        >=4N
## 1  chr1 762469 763020 ER- 0.008341878 0.007857689 0.007852706          NA
## 2  chr1 762469 763020 ER+ 0.008089050 0.008125402 0.006234499 0.007324019
## 3  chr1 762676 763227 ER- 0.021358769 0.026941446 0.028412915          NA
## 4  chr1 762676 763227 ER+ 0.070967669 0.042346185 0.032404898 0.012059659
## 5  chr1 762851 763402 ER- 0.022340914 0.029069381 0.029766663          NA
## 6  chr1 762851 763402 ER+ 0.078736553 0.047128295 0.037778446 0.010954431
##   n_>=4N n_1N n_2N n_3N expr_>=4N expr_1N expr_2N expr_3N n_expr_>=4N n_expr_1N
## 1     NA   60  210   15        NA      NA      NA      NA          NA        NA
## 2      3  211  816    9        NA      NA      NA      NA          NA        NA
## 3     NA   67  220   16        NA      NA      NA      NA          NA        NA
## 4      3  228  846   13        NA      NA      NA      NA          NA        NA
## 5     NA   67  220   16        NA      NA      NA      NA          NA        NA
## 6      3  228  845   12        NA      NA      NA      NA          NA        NA
##   n_expr_2N n_expr_3N      name
## 1        NA        NA LINC01128
## 2        NA        NA LINC01128
## 3        NA        NA      <NA>
## 4        NA        NA      <NA>
## 5        NA        NA LINC00115
## 6        NA        NA LINC00115
thresh <- 0.1
dosage_cands_gain_4n <- autosome_cands_4n %>% 
        filter(`>=4N` >= (`2N` + thresh) ) %>%         
        filter(`n_>=4N` >= 5, `n_2N` >= 5) %>% #,  !is.na(name))  %>%
        select(chrom, start, end, name, ER, `>=4N`, `2N`, `n_>=4N`, n_2N, `expr_>=4N`, `expr_2N`, `n_expr_>=4N`, `n_expr_2N`) %>%
        arrange(-`>=4N`, ER, name) 
dosage_cands_gain_4n %>% count(ER)
## # A tibble: 2 x 2
##    ER   n
## 1 ER- 261
## 2 ER+ 522
dosage_cands_loss <- autosome_cna_meth_expr %>% 
        filter(`1N` <= (`2N` - thresh) ) %>%         
        filter(`n_1N` >= 5, `n_2N` >= 5)  %>%
        select(chrom, start, end, name, ER, `1N`, `2N`, `n_1N`, n_2N, `expr_1N`, `expr_2N`, `n_expr_1N`, `n_expr_2N`) %>%
        arrange(-`n_1N`, ER, name)

dosage_cands_loss %>% count(ER)
##    ER  n
## 1 ER- 23
## 2 ER+ 25
df <- get_autosome_meth_cna()

Validating that we have enough samples for each locus:

auto_cna <- get_autosome_cna() %>% inner_join(get_autosome_loci_meth())
## Joining, by = c("chrom", "start", "end")
## Joining, by = c("chrom", "start", "end", "samp")
num_cna_meth <- auto_cna %>% filter(!is.na(meth)) %>% count(chrom, start, end, ER, cna_grp) %>% mutate(cna_grp = paste0("n_", cna_grp)) %>% spread(cna_grp, n)
num_cna_meth %>% gather('type', 'num', -(chrom:ER)) %>% group_by(chrom, start, end, ER) %>% mutate(p = num / sum(num))  %>% select(-num) %>% spread(type, p) %>% ggplot(aes(x=`n_1N`, y=`n_2N`, color=ER)) + geom_point() + scale_color_manual(values=annot_colors$ER1, guide = FALSE) + theme(aspect.ratio=1)
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

num_cna_meth %>% gather('type', 'num', -(chrom:ER)) %>% group_by(chrom, start, end, ER) %>% mutate(p = num / sum(num))  %>% select(-num) %>% spread(type, p) %>% ggplot(aes(x=`n_2N`, y=`n_>=3N`, color=ER)) + geom_point() + scale_color_manual(values=annot_colors$ER1, guide = FALSE) + theme(aspect.ratio=1)
## Warning: Removed 5 rows containing missing values (geom_point).

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

df <- df %>% left_join(num_cna_meth)
## Joining, by = c("chrom", "start", "end", "ER")
df_expr <- get_autosome_expr_cna() %>% gather("cna", "expr", -(chrom:ER)) %>% mutate(cna = paste0("expr_", cna)) %>% spread(cna, expr)
df <- df %>% left_join(df_expr)
## Joining, by = c("chrom", "start", "end", "ER")
num_cna_expr <- get_autosome_cna() %>% inner_join(get_autosome_loci_expr()) %>% filter(!is.na(expr)) %>% count(chrom, start, end, ER, cna_grp) %>% mutate(cna_grp = paste0("n_expr_", cna_grp)) %>% spread(cna_grp, n)
## Joining, by = c("chrom", "start", "end")
## Joining, by = c("chrom", "start", "end", "samp")
df <- df %>% left_join(num_cna_expr)
## Joining, by = c("chrom", "start", "end", "ER")
df <- df %>% left_join(get_gene_expression_mat() %>% select(chrom:end, name) %>% mutate(start = start + 1, end = end + 1))
## Joining, by = c("chrom", "start", "end")
fwrite(df, here("data/autosome_cna_meth_expr.tsv"), sep="\t")
thresh <- 0.1
options(repr.plot.width = 4, repr.plot.height = 4)

df %>% ggplot(aes(x = `2N`, y = `>=3N`, color = ER)) +geom_point(size=0.2)  + geom_abline(color = "black", linetype = "dashed") + scale_color_manual(values = annot_colors$ER1, guide = FALSE) + theme(aspect.ratio = 1) + geom_line(data = tibble(`2N` = seq(0,1,0.001), `>=3N`= `2N` - thresh), color="red", linetype="dashed") +  geom_line(data = tibble(`2N` = seq(0,1,0.001), `>=3N`= `2N` + thresh), color="red", linetype = "dashed") + xlim(0,1) + ylim(0,1)
## Warning: Removed 100 row(s) containing missing values (geom_path).

## Warning: Removed 100 row(s) containing missing values (geom_path).
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

df %>% filter(`>=3N` >= (`2N` + thresh) ) %>% select(chrom, start, end, ER, `>=3N`, `2N`, `n_>=3N`, n_2N, `expr_>=3N`, `expr_2N`, `n_expr_>=3N`, `n_expr_2N`, name) %>% filter(`n_>=3N` >= 5, `n_2N` >= 5, !is.na(name)) %>% arrange(-`n_>=3N`, ER, name) %>% ggplot(aes(x=(`expr_>=3N` - `expr_2N`))) + geom_density() + xlim(-0.5, 1)

df %>% filter(`>=3N` >= (`2N` + thresh) ) %>% select(chrom, start, end, ER, `>=3N`, `2N`, `n_>=3N`, n_2N, `expr_>=3N`, `expr_2N`, `n_expr_>=3N`, `n_expr_2N`, name) %>% filter(`n_>=3N` >= 5, `n_2N` >= 5, !is.na(name)) %>% arrange(-`n_>=3N`, ER, name)
## # A tibble: 0 x 13
##  [1] chrom       start       end         ER          >=3N        2N         
##  [7] n_>=3N      n_2N        expr_>=3N   expr_2N     n_expr_>=3N n_expr_2N  
## [13] name       
## <0 rows> (or 0-length row.names)
df %>% filter(`>=3N` <= (`2N` - thresh) ) %>% select(chrom, start, end, ER, `>=3N`, `2N`, `n_>=3N`, n_2N, `expr_>=3N`, `expr_2N`, `n_expr_>=3N`, `n_expr_2N`, name) %>% filter(`n_>=3N` >= 5, `n_2N` >= 5, !is.na(name))
## # A tibble: 0 x 13
##  [1] chrom       start       end         ER          >=3N        2N         
##  [7] n_>=3N      n_2N        expr_>=3N   expr_2N     n_expr_>=3N n_expr_2N  
## [13] name       
## <0 rows> (or 0-length row.names)
dosage_cands <- df %>% filter(`>=3N` >= (`2N` + thresh) ) %>% select(chrom, start, end, ER, `>=3N`, `2N`, `n_>=3N`, n_2N, `expr_>=3N`, `expr_2N`, `n_expr_>=3N`, `n_expr_2N`, name) %>% filter(`n_>=3N` >= 5, `n_2N` >= 5, !is.na(name)) %>% arrange(-`n_>=3N`, ER, name)
dosage_cands_mean_expr <- dosage_cands %>% select(chrom, start, end, ER) %>% inner_join(get_gene_expression_mat() %>% select(-name3.chr) %>% mutate(start = start + 1, end = end + 1))  %>% gather("samp", "expr", -(chrom:name)) %>% left_join(samp_data %>% select(samp, ER1)) %>% filter(ER == ER1) %>% group_by(chrom, start, end, name, ER) %>% summarise(mean_expr = mean(expr, na.rm=TRUE)) %>% ungroup()
## Joining, by = c("chrom", "start", "end")
## Joining, by = "samp"
all_mean_expr <- get_gene_expression_mat() %>% select(-name3.chr) %>% mutate(start = start + 1, end = end + 1)  %>% gather("samp", "expr", -(chrom:name)) %>% left_join(samp_data %>% select(samp, ER1)) %>% group_by(chrom, start, end, name, ER1) %>% summarise(mean_expr = mean(expr, na.rm=TRUE)) %>% ungroup()
## Joining, by = "samp"
dosage_cands_mean_expr %>% mutate(type = "dosage") %>% bind_rows(all_mean_expr %>% mutate(type = "all")) %>% ggplot(aes(x=mean_expr, color=type)) + geom_density()
## Warning: Removed 34216 rows containing non-finite values (stat_density).

get_xist_cna() %>%
        left_join(get_xist_loci_expr()) %>%
        group_by(cna_grp, samp, ER, chrom, start, end) %>%
        summarise(expr = mean(expr, na.rm = TRUE)) %>%
        ggplot(aes(x = cna_grp, y = expr, fill = ER, group = cna_grp)) +
        geom_boxplot(outlier.size = 0.1) + 
        scale_fill_manual(values = annot_colors$ER1, guide = FALSE) +
        xlab("") +
        ylab("Expression in Xist\nassociated promoters") +
        facet_grid(. ~ ER) +        
        ggpubr::stat_compare_means(label = "p.signif", comparisons = list(c("1N", "2N"), c("2N", ">=3N")))
## Joining, by = "samp"
## Joining, by = c("chrom", "start", "end")
## Joining, by = c("chrom", "start", "end", "samp")
## Warning: Removed 569263 rows containing non-finite values (stat_boxplot).
## Warning: Removed 569263 rows containing non-finite values (stat_signif).
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

gc()
##              used    (Mb) gc trigger    (Mb)   max used    (Mb)
## Ncells    5486807   293.1   38886436  2076.8   60760054  3245.0
## Vcells 1399051102 10674.0 2838607215 21656.9 2825762492 21558.9