%%time
import anndata as ad             # For reading/writing AnnData files
import matplotlib.pyplot as plt  # For plotting
import metacells as mc           # The Metacells package
import numpy as np               # For array/matrix operations
import pandas as pd              # For data frames
import os                        # For filesystem operations
import seaborn as sb             # For plotting
import scipy.sparse as sp        # For sparse matrices
import shutil                    # for filesystem operations
from math import hypot           # For plotting

CPU times: user 6.74 s, sys: 10.5 s, total: 17.3 s
Wall time: 5.71 s


%%time

# Use SVG for scalable low-element-count diagrams.
%config InlineBackend.figure_formats = ["svg"]

# Plain white plot background; purely a matter of personal preference.
sb.set_style("white")

# Running operations on an inefficient layout can make code **much** slower
# (for example, summing the columns of a row-major matrix).
# By default this will just be a warning.
# It is made an error here to make sure the vignette does not lead you astray.
#
# Note that this only affects the Metacells package.
# Numpy will happily and silently take 100x longer for running such inefficient operations
# (as far as the author can tell, there is no way to get a warning or an error for this),
# and its implementation of the "inefficient" operations could also be *much* faster.
#
# The workaround in either case is to explicitly re-layout the 2D matrix before the operations.
# This turns out to be much faster, especially when the matrix can be reused.
# Note that numpy is also very slow when doing matrix re-layout,
# so the metacells package provides a function for doing it more efficiently.
#
# The value returned by this call is what the notebook displays for the cell.
mc.ut.allow_inefficient_layout(False)

CPU times: user 5.91 ms, sys: 0 ns, total: 5.91 ms
Wall time: 5.82 ms

True


%%time
# Start from a clean slate: drop whatever a previous run left behind
# (a missing directory is not an error), then recreate the directories
# that the rest of the notebook writes into.
for stale_directory in ("../output/one-pass", "../mcview/one-pass"):
    shutil.rmtree(stale_directory, ignore_errors=True)

for needed_directory in (
    "../output/one-pass/preliminary/figures",
    "../output/one-pass/final",
):
    os.makedirs(needed_directory, exist_ok=True)

CPU times: user 0 ns, sys: 15 s, total: 15 s
Wall time: 18.6 s


%%time
# Load the full data set from disk.
full = ad.read_h5ad("../blobs/hca_bm.full.h5ad")
# NOTE(review): presumably registers this AnnData as a top-level data set for Metacells — confirm.
mc.ut.top_level(full)
# Name the data; the name prefixes the "set hca_bm.full..." log lines of later cells.
mc.ut.set_name(full, "hca_bm.full")
print(f"Full: {full.n_obs} cells, {full.n_vars} genes")

Full: 378000 cells, 33694 genes
CPU times: user 374 ms, sys: 27.5 s, total: 27.9 s
Wall time: 47.7 s


%%time
# The captured CSV file has no header line and lists the names of the doublet cells.
doublet_names_frame = pd.read_csv("../captured/one-pass.doublets.csv", header=None)
doublet_cell_names = mc.ut.to_numpy_vector(doublet_names_frame)

# Build a per-cell boolean mask (indexed by cell name) that is true only for the
# listed cells, and store it as the "doublet_cell" per-cell annotation.
doublet_cells_mask = pd.Series(False, index=full.obs_names)
doublet_cells_mask[doublet_cell_names] = True
mc.ut.set_o_data(full, "doublet_cell", doublet_cells_mask)

set hca_bm.full.obs[doublet_cell]: 1197 true (0.3167%) out of 378000 bools

CPU times: user 10.9 ms, sys: 4.55 ms, total: 15.5 ms
Wall time: 14.5 ms


# Cells with fewer total UMIs than this are reported (and later passed to
# mc.pl.exclude_cells) as too small.
PROPERLY_SAMPLED_MIN_CELL_TOTAL = 800
# Cells with more total UMIs than this are reported (and later passed to
# mc.pl.exclude_cells) as too large.
PROPERLY_SAMPLED_MAX_CELL_TOTAL = 20000


%%time
# Per-cell sum of the "__x__" data, that is, the total UMIs of each cell.
# This is computed once here and reused by later cells of the notebook.
total_umis_per_cell = mc.ut.get_o_numpy(full, "__x__", sum=True)

# Distribution of the total UMIs per cell on a log scale, with the minimal (green)
# and maximal (red) thresholds marked; also saved as an SVG figure.
plot = sb.displot(total_umis_per_cell, log_scale=(10, None))
plot.set(xlabel="UMIs", ylabel="Density", yticks=[])
plot.refline(x=PROPERLY_SAMPLED_MIN_CELL_TOTAL, color="darkgreen")
plot.refline(x=PROPERLY_SAMPLED_MAX_CELL_TOTAL, color="crimson")
plt.savefig("../output/one-pass/preliminary/figures/cell_total_umis.svg")

# How many cells fall outside the thresholds, as a count and as a percent of all cells.
too_small_cells_count = np.sum(total_umis_per_cell < PROPERLY_SAMPLED_MIN_CELL_TOTAL)
too_large_cells_count = np.sum(total_umis_per_cell > PROPERLY_SAMPLED_MAX_CELL_TOTAL)
too_small_cells_percent = 100.0 * too_small_cells_count / full.n_obs
too_large_cells_percent = 100.0 * too_large_cells_count / full.n_obs

# In an f-string only braces need escaping; a doubled "%%" would be printed verbatim.
print(
    f"Will exclude {too_small_cells_count} ({too_small_cells_percent:.2f}%) cells"
    f" with less than {PROPERLY_SAMPLED_MIN_CELL_TOTAL} UMIs"
)
print(
    f"Will exclude {too_large_cells_count} ({too_large_cells_percent:.2f}%) cells"
    f" with more than {PROPERLY_SAMPLED_MAX_CELL_TOTAL} UMIs"
)

Will exclude 66232 (17.52%%) cells with less than 800 UMIs
Will exclude 8672 (2.29%%) cells with more than 20000 UMIs
CPU times: user 5.19 s, sys: 591 ms, total: 5.78 s
Wall time: 5.16 s


# Genes to exclude by exact name (passed to mc.pl.exclude_genes).
# NOTE(review): MALAT1 shares the "Sex-specific" comment with XIST although it is
# commonly described as a non-coding transcript — confirm the intended grouping.
EXCLUDED_GENE_NAMES = [
    "XIST", "MALAT1",   # Sex-specific genes.
    "NEAT1"             # Non-coding.
]
# Genes to exclude by name pattern (passed to mc.pl.exclude_genes).
EXCLUDED_GENE_PATTERNS = ["MT-.*"]  # Mitochondrial.


%%time
# Mark the genes to exclude, by the names and patterns chosen above. Per the log
# printed by this cell, it sets the "bursty_lonely_gene", "properly_sampled_gene"
# and "excluded_gene" per-gene masks of the full data.
mc.pl.exclude_genes(
    full,
    excluded_gene_names=EXCLUDED_GENE_NAMES, 
    excluded_gene_patterns=EXCLUDED_GENE_PATTERNS,
    random_seed=123456,
)

set hca_bm.full.var[bursty_lonely_gene]: 0 true (0%) out of 33694 bools
set hca_bm.full.var[properly_sampled_gene]: 27277 true (80.96%) out of 33694 bools
set hca_bm.full.var[excluded_gene]: 6433 true (19.09%) out of 33694 bools

CPU times: user 34.9 s, sys: 16.2 s, total: 51.1 s
Wall time: 36.2 s


%%time
# Compute the "excluded_umis" per-cell annotation of the full data (see the log
# printed by this cell); it is read back when plotting the excluded UMIs fraction.
mc.tl.compute_excluded_gene_umis(full)

set hca_bm.full.obs[excluded_umis]: 378000 float32s

CPU times: user 3.78 s, sys: 4.92 s, total: 8.7 s
Wall time: 8.7 s


# Cells whose fraction of excluded gene UMIs (out of their total UMIs) is above
# this are reported (and later passed to mc.pl.exclude_cells) as to be excluded.
PROPERLY_SAMPLED_MAX_EXCLUDED_GENES_FRACTION = 0.25


%%time
excluded_umis_fraction_regularization = 1e-3  # Avoid 0 values in log scale plot.

# Fraction of each cell's UMIs that belong to excluded genes.
# `total_umis_per_cell` was computed when plotting the total UMIs distribution.
excluded_umis_per_cell = mc.ut.get_o_numpy(full, "excluded_umis")
excluded_umis_fraction_per_cell = excluded_umis_per_cell / total_umis_per_cell

# Plot a regularized *copy* of the fractions. Adding the regularization in place and
# subtracting it afterwards does not restore the exact floating point values, and
# these values are compared against the threshold below.
plot = sb.displot(
    excluded_umis_fraction_per_cell + excluded_umis_fraction_regularization,
    log_scale=(10, None),
)
plot.set(xlabel="Fraction of excluded gene UMIs", ylabel="Density", yticks=[])
plot.refline(x=PROPERLY_SAMPLED_MAX_EXCLUDED_GENES_FRACTION, color="crimson")

plt.savefig("../output/one-pass/preliminary/figures/cell_excluded_umis_fraction.svg")

# How many cells are above the threshold, as a count and as a fraction of all cells.
too_excluded_cells_count = np.sum(
    excluded_umis_fraction_per_cell > PROPERLY_SAMPLED_MAX_EXCLUDED_GENES_FRACTION
)
too_excluded_cells_fraction = too_excluded_cells_count / full.n_obs

print(
    f"Will exclude {too_excluded_cells_count} ({100 * too_excluded_cells_fraction:.2f}%) cells"
    f" with more than {100 * PROPERLY_SAMPLED_MAX_EXCLUDED_GENES_FRACTION:.2f}% excluded gene UMIs"
)

Will exclude 36458 (9.64%) cells with more than 25.00% excluded gene UMIs
CPU times: user 1.27 s, sys: 708 ms, total: 1.97 s
Wall time: 1.54 s


%%time
# Mark the cells to exclude, using the total UMIs and excluded genes fraction
# thresholds chosen above together with the "doublet_cell" mask. Per the log printed
# by this cell, it sets the "properly_sampled_cell" and "excluded_cell" per-cell masks.
# NOTE(review): the leading "|" presumably means the mask is OR-ed into the excluded
# cells — confirm against the Metacells documentation.
mc.pl.exclude_cells(
    full,
    properly_sampled_min_cell_total=PROPERLY_SAMPLED_MIN_CELL_TOTAL,
    properly_sampled_max_cell_total=PROPERLY_SAMPLED_MAX_CELL_TOTAL,
    properly_sampled_max_excluded_genes_fraction=PROPERLY_SAMPLED_MAX_EXCLUDED_GENES_FRACTION,
    additional_cells_masks=["|doublet_cell"]
)

set hca_bm.full.obs[properly_sampled_cell]: 297810 true (78.79%) out of 378000 bools
set hca_bm.full.obs[excluded_cell]: 81387 true (21.53%) out of 378000 bools

CPU times: user 4.46 ms, sys: 1.24 ms, total: 5.7 ms
Wall time: 4.71 ms


%%time
# Extract the clean data out of the full data into a separate, named data set.
# Per the log printed by this cell, the result keeps the index of each cell and
# gene in the full data ("full_cell_index", "full_gene_index").
clean = mc.pl.extract_clean_data(full, name="hca_bm.one-pass.clean")
mc.ut.top_level(clean)
print(f"Clean: {clean.n_obs} cells, {clean.n_vars} genes")

set hca_bm.one-pass.clean.obs[full_cell_index]: 296613 int32s
set hca_bm.one-pass.clean.var[full_gene_index]: 27261 int32s

Clean: 296613 cells, 27261 genes
CPU times: user 26.4 s, sys: 16.9 s, total: 43.3 s
Wall time: 43.3 s


%%time
# Save the full data, now carrying the exclusion masks computed above.
full.write_h5ad("../output/one-pass/preliminary/hca_bm.full.h5ad")
full = None  # Drop the reference; the full data is not used again below.

CPU times: user 529 ms, sys: 29.7 s, total: 30.3 s
Wall time: 1min 11s


%%time
# Save the clean data before it is further annotated below.
clean.write_h5ad("../output/one-pass/preliminary/hca_bm.clean.h5ad")

CPU times: user 506 ms, sys: 19.9 s, total: 20.4 s
Wall time: 52.3 s


%%time
# From here on the clean data serves as the cells the metacells are computed from,
# so bind it to a new variable and rename the data accordingly.
cells = clean
clean = None  # Drop the old variable; the data itself stays alive as `cells`.
mc.ut.set_name(cells, "hca_bm.one-pass.preliminary.cells")
print(f"Input: {cells.n_obs} cells, {cells.n_vars} genes")

Input: 296613 cells, 27261 genes
CPU times: user 109 µs, sys: 0 ns, total: 109 µs
Wall time: 96.3 µs


# Genes to mark as lateral by exact name (passed to mc.pl.mark_lateral_genes).
# Names that do not exist in the data are presumably ignored — the call site says
# genes are marked "if they exist in the clean dataset".
LATERAL_GENE_NAMES = [
    "ACSM3", "ANP32B", "APOE", "AURKA", "B2M", "BIRC5", "BTG2", "CALM1", "CD63", "CD69", "CDK4",
    "CENPF", "CENPU", "CENPW", "CH17-373J23.1", "CKS1B", "CKS2", "COX4I1", "CXCR4", "DNAJB1",
    "DONSON", "DUSP1", "DUT", "EEF1A1", "EEF1B2", "EIF3E", "EMP3", "FKBP4", "FOS", "FOSB", "FTH1",
    "G0S2", "GGH", "GLTSCR2", "GMNN", "GNB2L1", "GPR183", "H2AFZ", "H3F3B", "HBM", "HIST1H1C",
    "HIST1H2AC", "HIST1H2BG", "HIST1H4C", "HLA-A", "HLA-B", "HLA-C", "HLA-DMA", "HLA-DMB",
    "HLA-DPA1", "HLA-DPB1", "HLA-DQA1", "HLA-DQB1", "HLA-DRA", "HLA-DRB1", "HLA-E", "HLA-F", "HMGA1",
    "HMGB1", "HMGB2", "HMGB3", "HMGN2", "HNRNPAB", "HSP90AA1", "HSP90AB1", "HSPA1A", "HSPA1B",
    "HSPA6", "HSPD1", "HSPE1", "HSPH1", "ID2", "IER2", "IGHA1", "IGHA2", "IGHD", "IGHG1", "IGHG2",
    "IGHG3", "IGHG4", "IGHM", "IGKC", "IGKV1-12", "IGKV1-39", "IGKV1-5", "IGKV3-15", "IGKV4-1",
    "IGLC2", "IGLC3", "IGLC6", "IGLC7", "IGLL1", "IGLL5", "IGLV2-34", "JUN", "JUNB", "KIAA0101",
    "LEPROTL1", "LGALS1", "LINC01206", "LTB", "MCM3", "MCM4", "MCM7", "MKI67", "MT2A", "MYL12A",
    "MYL6", "NASP", "NFKBIA", "NUSAP1", "PA2G4", "PCNA", "PDLIM1", "PLK3", "PPP1R15A", "PTMA",
    "PTTG1", "RAN", "RANBP1", "RGCC", "RGS1", "RGS2", "RGS3", "RP11-1143G9.4", "RP11-160E2.6",
    "RP11-53B5.1", "RP11-620J15.3", "RP5-1025A1.3", "RP5-1171I10.5", "RPS10", "RPS10-NUDT3", "RPS11",
    "RPS12", "RPS13", "RPS14", "RPS15", "RPS15A", "RPS16", "RPS17", "RPS18", "RPS19", "RPS19BP1",
    "RPS2", "RPS20", "RPS21", "RPS23", "RPS24", "RPS25", "RPS26", "RPS27", "RPS27A", "RPS27L",
    "RPS28", "RPS29", "RPS3", "RPS3A", "RPS4X", "RPS4Y1", "RPS4Y2", "RPS5", "RPS6", "RPS6KA1",
    "RPS6KA2", "RPS6KA2-AS1", "RPS6KA3", "RPS6KA4", "RPS6KA5", "RPS6KA6", "RPS6KB1", "RPS6KB2",
    "RPS6KC1", "RPS6KL1", "RPS7", "RPS8", "RPS9", "RPSA", "RRM2", "SMC4", "SRGN", "SRSF7", "STMN1",
    "TK1", "TMSB4X", "TOP2A", "TPX2", "TSC22D3", "TUBA1A", "TUBA1B", "TUBB", "TUBB4B", "TXN", "TYMS",
    "UBA52", "UBC", "UBE2C", "UHRF1", "YBX1", "YPEL5", "ZFP36", "ZWINT"
]
# Genes to mark as lateral by name pattern. Every "RPS..." name listed above also
# matches this pattern, so those entries are redundant but harmless.
LATERAL_GENE_PATTERNS = ["RP[LS].*"]  # Ribosomal


%%time
# Flag as "lateral_gene" every gene of the clean dataset that is either listed by
# name or matches one of the patterns above.
mc.pl.mark_lateral_genes(
    cells,
    lateral_gene_names=LATERAL_GENE_NAMES,
    lateral_gene_patterns=LATERAL_GENE_PATTERNS,
)

lateral_gene_mask = mc.ut.get_v_numpy(cells, "lateral_gene")
lateral_gene_names = set(cells.var_names[lateral_gene_mask])

# Print the names of the marked genes, except for the (many) RPL*/RPS* genes,
# which are only reported as a count.
ribosomal_prefixes = ("RPL", "RPS")
ribosomal_lateral_names = [
    gene_name for gene_name in lateral_gene_names if gene_name.startswith(ribosomal_prefixes)
]
other_lateral_names = sorted(
    gene_name for gene_name in lateral_gene_names if not gene_name.startswith(ribosomal_prefixes)
)
print(other_lateral_names)
print(f"and {len(ribosomal_lateral_names)} RP[LS].* genes")

set hca_bm.one-pass.preliminary.cells.var[lateral_gene]: 257 true (0.9427%) out of 27261 bools

['ACSM3', 'ANP32B', 'APOE', 'AURKA', 'B2M', 'BIRC5', 'BTG2', 'CALM1', 'CD63', 'CD69', 'CDK4', 'CENPF', 'CENPU', 'CENPW', 'CH17-373J23.1', 'CKS1B', 'CKS2', 'COX4I1', 'CXCR4', 'DNAJB1', 'DONSON', 'DUSP1', 'DUT', 'EEF1A1', 'EEF1B2', 'EIF3E', 'EMP3', 'FKBP4', 'FOS', 'FOSB', 'FTH1', 'G0S2', 'GGH', 'GLTSCR2', 'GMNN', 'GNB2L1', 'GPR183', 'H2AFZ', 'H3F3B', 'HBM', 'HIST1H1C', 'HIST1H2AC', 'HIST1H2BG', 'HIST1H4C', 'HLA-A', 'HLA-B', 'HLA-C', 'HLA-DMA', 'HLA-DMB', 'HLA-DPA1', 'HLA-DPB1', 'HLA-DQA1', 'HLA-DQB1', 'HLA-DRA', 'HLA-DRB1', 'HLA-E', 'HLA-F', 'HMGA1', 'HMGB1', 'HMGB2', 'HMGB3', 'HMGN2', 'HNRNPAB', 'HSP90AA1', 'HSP90AB1', 'HSPA1A', 'HSPA1B', 'HSPA6', 'HSPD1', 'HSPE1', 'HSPH1', 'ID2', 'IER2', 'IGHA1', 'IGHA2', 'IGHD', 'IGHG1', 'IGHG2', 'IGHG3', 'IGHG4', 'IGHM', 'IGKC', 'IGKV1-12', 'IGKV1-39', 'IGKV1-5', 'IGKV3-15', 'IGKV4-1', 'IGLC2', 'IGLC3', 'IGLC6', 'IGLC7', 'IGLL1', 'IGLL5', 'IGLV2-34', 'JUN', 'JUNB', 'KIAA0101', 'LEPROTL1', 'LGALS1', 'LINC01206', 'LTB', 'MCM3', 'MCM4', 'MCM7', 'MKI67', 'MT2A', 'MYL12A', 'MYL6', 'NASP', 'NFKBIA', 'NUSAP1', 'PA2G4', 'PCNA', 'PDLIM1', 'PLK3', 'PPP1R15A', 'PTMA', 'PTTG1', 'RAN', 'RANBP1', 'RGCC', 'RGS1', 'RGS2', 'RGS3', 'RP11-1143G9.4', 'RP11-160E2.6', 'RP11-53B5.1', 'RP11-620J15.3', 'RP5-1025A1.3', 'RP5-1171I10.5', 'RRM2', 'SMC4', 'SRGN', 'SRSF7', 'STMN1', 'TK1', 'TMSB4X', 'TOP2A', 'TPX2', 'TSC22D3', 'TUBA1A', 'TUBA1B', 'TUBB', 'TUBB4B', 'TXN', 'TYMS', 'UBA52', 'UBC', 'UBE2C', 'UHRF1', 'YBX1', 'YPEL5', 'ZFP36', 'ZWINT']
and 103 RP[LS].* genes
CPU times: user 18.5 ms, sys: 543 µs, total: 19 ms
Wall time: 18.5 ms


# Genes to mark as noisy by exact name (passed to mc.pl.mark_noisy_genes).
NOISY_GENE_NAMES = [
    "CCL3", "CCL4", "CCL5", "CXCL8", "DUSP1", "FOS", "G0S2", "HBB", "HIST1H4C", "IER2", "IGKC",
    "IGLC2", "JUN", "JUNB", "KLRB1", "MT2A", "RPS26", "RPS4Y1", "TRBC1", "TUBA1B", "TUBB"
]


%%time
# Mark as "noisy_gene" every gene listed above, if it exists in the clean dataset.
# The log printed by this cell reports how many genes were actually marked.
mc.pl.mark_noisy_genes(cells, noisy_gene_names=NOISY_GENE_NAMES)

set hca_bm.one-pass.preliminary.cells.var[noisy_gene]: 21 true (0.07703%) out of 27261 bools

CPU times: user 7.14 ms, sys: 0 ns, total: 7.14 ms
Wall time: 6.95 ms


%%time
# Either use the guesstimator:
max_parallel_piles = mc.pl.guess_max_parallel_piles(cells)
# Or, if running out of memory, manually override:
# max_parallel_piles = ...
print(max_parallel_piles)
# Apply the chosen value before running the pipeline.
mc.pl.set_max_parallel_piles(max_parallel_piles)

498
CPU times: user 8.53 s, sys: 4.13 s, total: 12.7 s
Wall time: 12.7 s


%%time
# Run the divide-and-conquer pipeline on the cells while displaying a progress bar.
# A fixed random seed is passed, presumably so that the results are reproducible.
with mc.ut.progress_bar():
    mc.pl.divide_and_conquer_pipeline(cells, random_seed=123456)

Detect rare gene modules...
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉[05:34]

CPU times: user 3min 43s, sys: 1min 53s, total: 5min 36s
Wall time: 6min 18s


%%time
# Collect the computed metacells into a data set of their own. Per the log printed
# by this cell, this also sets the "metacell_name" of each cell in the cells data.
metacells = mc.pl.collect_metacells(
    cells,
    name="hca_bm.one-pass.preliminary.metacells",
    random_seed=123456,
)
print(f"Preliminary: {metacells.n_obs} metacells, {metacells.n_vars} genes")

set hca_bm.one-pass.preliminary.metacells.obs[grouped]: 3144 int64s
set hca_bm.one-pass.preliminary.metacells.obs[total_umis]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.layers[total_umis]: ndarray 3144 X 27261 float32s
set hca_bm.one-pass.preliminary.metacells.obs[__zeros_downsample_umis]: 3144 int64s
set hca_bm.one-pass.preliminary.metacells.layers[zeros]: ndarray 3144 X 27261 int32s
set hca_bm.one-pass.preliminary.cells.obs[metacell_name]: 296613 <U8s
set hca_bm.one-pass.preliminary.metacells.var[gene_ids]: 27261 objects
set hca_bm.one-pass.preliminary.metacells.var[bursty_lonely_gene]: 0 true (0%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[properly_sampled_gene]: 27261 true (100%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[excluded_gene]: 0 true (0%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[full_gene_index]: 27261 int32s
set hca_bm.one-pass.preliminary.metacells.var[lateral_gene]: 257 true (0.9427%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[noisy_gene]: 21 true (0.07703%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[selected_gene]: 3530 true (12.95%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[rare_gene]: 73 true (0.2678%) out of 27261 bools
set hca_bm.one-pass.preliminary.metacells.var[rare_gene_module]: 27261 int32s
set hca_bm.one-pass.preliminary.metacells.obs[metacells_rare_gene_module]: 3144 int32s
set hca_bm.one-pass.preliminary.metacells.obs[rare_metacell]: 17 true (0.5407%) out of 3144 bools
set hca_bm.one-pass.preliminary.metacells.uns[outliers]: 147
set hca_bm.one-pass.preliminary.metacells.uns[metacells_algorithm]: metacells.0.9.0-dev.1

Preliminary: 3144 metacells, 27261 genes
CPU times: user 8.33 s, sys: 28.4 s, total: 36.7 s
Wall time: 44.4 s


%%time
# Assign a single value for each metacell based on the cells.
# The "sex" of a metacell must be conveyed from the cells' sex property
# ("donor_organism.sex", the same property the fractions below are computed from),
# not from their age.
mc.tl.convey_obs_to_group(
    adata=cells, gdata=metacells,
    property_name="donor_organism.sex", to_property_name="sex",
    method=mc.ut.most_frequent  # This is the default, for categorical data
)
mc.tl.convey_obs_to_group(
    adata=cells, gdata=metacells,
    property_name="donor_organism.organism_age", to_property_name="age",
    method=np.mean
)
# Compute the fraction of cells with each possible value in each metacell:
mc.tl.convey_obs_fractions_to_group(
    adata=cells, gdata=metacells,
    property_name="donor_organism.sex", to_property_name="sex"
)
mc.tl.convey_obs_fractions_to_group(  # Age has just a few possible values so treat it as categorical.
    adata=cells, gdata=metacells,
    property_name="donor_organism.organism_age", to_property_name="age"
)
mc.tl.convey_obs_fractions_to_group(adata=cells, gdata=metacells, property_name="donor")
mc.tl.convey_obs_fractions_to_group(adata=cells, gdata=metacells, property_name="batch")

set hca_bm.one-pass.preliminary.metacells.obs[sex]: 3144 float32s
set hca_bm.one-pass.preliminary.metacells.obs[age]: 3144 float32s
set hca_bm.one-pass.preliminary.metacells.obs[sex_fraction_of_female]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[sex_fraction_of_male]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_26.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_29.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_32.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_36.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_39.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_50.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[age_fraction_of_52.0]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[donor_fraction_of_MantonBM8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM1_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM2_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM3_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM4_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM5_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM6_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM7_HiSeq_8]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_1]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_2]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_3]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_4]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_5]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_6]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_7]: 3144 float64s
set hca_bm.one-pass.preliminary.metacells.obs[batch_fraction_of_MantonBM8_HiSeq_8]: 3144 float64s

CPU times: user 25 s, sys: 162 ms, total: 25.1 s
Wall time: 25.1 s


%%time
# Run the computations for MCView on the cells and metacells while displaying a
# progress bar. The UMAP plot below reads the "x", "y" and "obs_outgoing_weights"
# data of the metacells, which is presumably computed here — confirm.
with mc.ut.progress_bar():
    mc.pl.compute_for_mcview(adata=cells, gdata=metacells, random_seed=123456)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉[04:12]

CPU times: user 1h 37min 15s, sys: 2min 21s, total: 1h 39min 37s
Wall time: 4min 12s


%%time
# Scatter plot of the 2D coordinates of the metacells, overlaid with the "long"
# edges of the metacells graph (the ones whose 2D length is at least this size).
min_long_edge_size = 4

umap_x = mc.ut.get_o_numpy(metacells, "x")
umap_y = mc.ut.get_o_numpy(metacells, "y")
# COO format gives direct access to the (row, column, value) triplet of each edge.
umap_edges = sp.coo_matrix(mc.ut.get_oo_proper(metacells, "obs_outgoing_weights"))

sb.set()
plot = sb.scatterplot(x=umap_x, y=umap_y, s=10)

for source_index, target_index, weight in zip(umap_edges.row, umap_edges.col, umap_edges.data):
    source_x, source_y = umap_x[source_index], umap_y[source_index]
    target_x, target_y = umap_x[target_index], umap_y[target_index]
    edge_length = hypot(target_x - source_x, target_y - source_y)
    if edge_length >= min_long_edge_size:
        # The heavier the edge, the thicker the line.
        plt.plot(
            [source_x, target_x],
            [source_y, target_y],
            linewidth=weight * 2,
            color='indigo',
        )

plt.show()

CPU times: user 1.11 s, sys: 4.77 s, total: 5.88 s
Wall time: 263 ms


%%time
# Save the preliminary cells data, including the annotations added since the clean data was saved.
cells.write_h5ad("../output/one-pass/preliminary/hca_bm.cells.h5ad")

CPU times: user 755 ms, sys: 19.4 s, total: 20.2 s
Wall time: 53.1 s


%%time
# Save the preliminary metacells data; the R import script run next reads this file
# (see its "Reading ..." log line).
metacells.write_h5ad("../output/one-pass/preliminary/hca_bm.metacells.h5ad")

CPU times: user 251 ms, sys: 602 ms, total: 853 ms
Wall time: 2.98 s


%%time
# Run the R import script in a shell to create the preliminary MCView project.
# The value displayed for the cell is the return value of os.system
# (0 in the recorded run).
os.system("Rscript ../scripts/import_dataset.r hca_bm one-pass/preliminary 'HCABM 1P|PRE'")

ℹ creating ../mcview/one-pass/preliminary
→ You can edit the app configuration at '../mcview/one-pass/preliminary/config/config.yaml'
ℹ Importing hca_bm-one-pass-preliminary
ℹ Reading '../output/one-pass/preliminary/hca_bm.metacells.h5ad'
ℹ Processing metacell matrix
ℹ Processing 2d projection
ℹ Calculating top genes per metacell (marker genes)
ℹ Calculating metacell correlations of default marker genes
ℹ Processing inner-folds matrix
ℹ Calculating top inner-fold genes
→ Added the Inner-fold tab to the config file. To change the tab order or remove it - edit the tabs section at: '../mcview/one-pass/preliminary/config/config.yaml'
ℹ Processing inner-stdev matrix
ℹ Calculating top inner-stdev genes
→ Added the Stdev-fold tab to the config file. To change the tab order or remove it - edit the tabs section at: '../mcview/one-pass/preliminary/config/config.yaml'
ℹ Clustering in order to get initial annotation.
ℹ using 168 genes
ℹ clustering k = 64
ℹ number of features = 168
ℹ Generating cell type colors using chameleon package.
ℹ Coloring using pre-calculated 3D umap
ℹ Clustering in order to get gene modules. k = 31
ℹ Number of genes considered = 1000
ℹ Loading previously calculated 30 correlated and anti-correlated genes for each gene
✔ hca_bm-one-pass-preliminary dataset imported succesfully to '../mcview/one-pass/preliminary' project
• You can now run the app using: run_app("../mcview/one-pass/preliminary")
• or create a bundle using: create_bundle("../mcview/one-pass/preliminary",
name = "name_of_bundle")

CPU times: user 13.3 ms, sys: 606 ms, total: 619 ms
Wall time: 1min 19s

0


%%time
# Run the R script that applies the captured type annotations to the preliminary
# MCView project. The value displayed for the cell is the return value of os.system
# (0 in the recorded run).
os.system(
    "Rscript ../scripts/update_types.r one-pass/preliminary "
    "../captured/one-pass.preliminary.types.csv"
)

CPU times: user 2.45 ms, sys: 595 ms, total: 598 ms
Wall time: 3.39 s

✔ Succesfully changed metacell cell type assignments
ℹ File has a field named 'color', updating also cell type colors.
✔ Succesfully changed cell type color assignments

0


%%time
# Load the captured type annotation of each metacell.
metacell_types_csv = pd.read_csv("../captured/one-pass.preliminary.types.csv")
# Sanity check: the CSV rows must list exactly the current metacells, in the same
# order, since the types are applied to the metacells by position below.
assert np.all(metacell_types_csv["metacell"] == metacells.obs_names)

CPU times: user 7.84 ms, sys: 3.07 ms, total: 10.9 ms
Wall time: 9.95 ms


%%time
# The type of each metacell, in the order of the CSV rows (which was asserted to be
# the order of the metacells when the CSV was loaded).
type_of_metacell = np.array(metacell_types_csv["cell_type"])
mc.ut.set_o_data(metacells, "type", type_of_metacell)

# Lookup table from a metacell name to its type, extended with an "Outliers" entry
# whose type is also "Outliers".
extended_type_of_metacell = pd.Series(
    list(type_of_metacell) + ["Outliers"],
    index=list(metacell_types_csv["metacell"]) + ["Outliers"]
)

# Look up the type of each cell by the name of the metacell it belongs to.
# NOTE(review): presumably cells that belong to no metacell carry the "Outliers"
# metacell name, which is why the extra entry is needed — confirm.
metacell_of_cell = cells.obs["metacell_name"]
type_of_cell = np.array(extended_type_of_metacell[metacell_of_cell])
mc.ut.set_o_data(cells, "type", type_of_cell)

set hca_bm.one-pass.preliminary.metacells.obs[type]: 3144 objects
set hca_bm.one-pass.preliminary.cells.obs[type]: 296613 objects

CPU times: user 51.5 ms, sys: 2.89 ms, total: 54.4 ms
Wall time: 53.5 ms


%%time
# Keep only the metacells whose type is not "doublets", as a new data set named
# "hca_bm.metacells". The mask is reused below to renumber the metacell of each cell.
doublets_metacells_mask = type_of_metacell == "doublets"
print(f"Remove {np.sum(doublets_metacells_mask)} metacells as doublets")
metacells = mc.ut.slice(metacells, obs=~doublets_metacells_mask, name="hca_bm.metacells")
print(f"Final: {metacells.n_obs} metacells, {metacells.n_vars} genes")

Remove 10 metacells as doublets
Final: 3134 metacells, 27261 genes
CPU times: user 679 ms, sys: 173 ms, total: 852 ms
Wall time: 851 ms


%%time
# Keep only the cells whose (conveyed) type is not "doublets", as a new data set
# named "hca_bm.cells".
doublets_cells_mask = type_of_cell == "doublets"
print(f"Remove {np.sum(doublets_cells_mask)} cells as doublets")
cells = mc.ut.slice(cells, obs=~doublets_cells_mask, name="hca_bm.cells")
print(f"Final (~doublets): {cells.n_obs} cells, {cells.n_vars} genes")

Remove 633 cells as doublets
Final (~doublets): 295980 cells, 27261 genes
CPU times: user 28.9 s, sys: 23.2 s, total: 52.1 s
Wall time: 52.1 s


%%time
# Renumber the metacell index of each cell now that the doublet metacells are gone.
# The new index of a surviving metacell is the number of surviving metacells before it.
new_metacell_of_old_metacell = np.cumsum(~doublets_metacells_mask) - 1
old_metacell_of_cells = mc.ut.get_o_numpy(cells, "metacell")

# Cells that belong to no metacell (outliers) carry a negative metacell index.
# Using it as-is to index the translation table would silently wrap around and
# assign these cells to the *last* metacell, so negative indices are kept unchanged
# and only the proper ones are translated.
new_metacell_of_cells = np.where(
    old_metacell_of_cells < 0,
    old_metacell_of_cells,
    new_metacell_of_old_metacell[old_metacell_of_cells],
)
mc.ut.set_o_data(cells, "metacell", new_metacell_of_cells)

set hca_bm.cells.obs[metacell]: 295980 int64s

CPU times: user 4.01 ms, sys: 3.77 ms, total: 7.78 ms
Wall time: 6.15 ms


#%%time
#mc.ut.mark_essential_genes(
#    metacells,
#    essential_gene_names_of_types={"CD8 T-cell": ["CD8", ...], ...}
#)


%%time
# Recompute the 2D projection for the final metacells. Per the log printed by this
# cell, it sets (among others) the "obs_outgoing_weights" graph and the "x" and "y"
# coordinates of each metacell, which the plot below reads.
mc.pl.compute_umap_by_markers(metacells, random_seed=123456)

set hca_bm.metacells.obsp[obs_balanced_ranks]: 38127 nonzero (0.3882%) out of 9821956 elements
set hca_bm.metacells.obsp[obs_pruned_ranks]: 12509 nonzero (0.1274%) out of 9821956 elements
set hca_bm.metacells.obsp[obs_outgoing_weights]: 12509 nonzero (0.1274%) out of 9821956 elements
set hca_bm.metacells.obsp[umap_distances]: csr_matrix 3134 X 3134 float32s (9818822 > 0, 99.97%)
set hca_bm.metacells.obs[x]: 3134 float32s
set hca_bm.metacells.obs[y]: 3134 float32s

CPU times: user 2min 5s, sys: 14.9 s, total: 2min 20s
Wall time: 16.7 s


%%time
# The colour of each type, with magenta for the "Outliers" and "(Missing)" pseudo-types.
type_color_csv = pd.read_csv("../captured/type_colors.csv")
color_of_type = pd.Series(
    [*type_color_csv["color"], "magenta", "magenta"],
    index=[*type_color_csv["cell_type"], "Outliers", "(Missing)"],
)

# The colour of each metacell, looked up by its type.
type_of_metacell = mc.ut.get_o_numpy(metacells, "type")
color_of_metacell = np.array(color_of_type[type_of_metacell])

# Scatter plot of the 2D coordinates of the metacells, coloured by type and overlaid
# with the "long" edges of the metacells graph (2D length of at least this size).
min_long_edge_size = 4

umap_x = mc.ut.get_o_numpy(metacells, "x")
umap_y = mc.ut.get_o_numpy(metacells, "y")
# COO format gives direct access to the (row, column, value) triplet of each edge.
umap_edges = sp.coo_matrix(mc.ut.get_oo_proper(metacells, "obs_outgoing_weights"))

sb.set()
plot = sb.scatterplot(x=umap_x, y=umap_y, color=color_of_metacell, s=10)

for source_index, target_index, weight in zip(umap_edges.row, umap_edges.col, umap_edges.data):
    source_x, source_y = umap_x[source_index], umap_y[source_index]
    target_x, target_y = umap_x[target_index], umap_y[target_index]
    edge_length = hypot(target_x - source_x, target_y - source_y)
    if edge_length >= min_long_edge_size:
        # The heavier the edge, the thicker the line.
        plt.plot(
            [source_x, target_x],
            [source_y, target_y],
            linewidth=weight * 2,
            color='indigo',
        )

plt.show()

CPU times: user 410 ms, sys: 174 ms, total: 584 ms
Wall time: 391 ms


%%time
# Save the final metacells data; the R import script run below reads this file
# (see its "Reading ..." log line).
metacells.write_h5ad("../output/one-pass/final/hca_bm.metacells.h5ad")

CPU times: user 240 ms, sys: 666 ms, total: 905 ms
Wall time: 2.97 s


%%time
# Save the final cells data (without the doublets, and with the renumbered metacells).
cells.write_h5ad("../output/one-pass/final/hca_bm.cells.h5ad")

CPU times: user 841 ms, sys: 18.6 s, total: 19.5 s
Wall time: 58.4 s


%%time
# Run the R import script in a shell to create the final MCView project. Per the
# script's log, the extra "type" argument makes it take the cell type annotations
# from the "type" field of the data. The value displayed for the cell is the return
# value of os.system (0 in the recorded run).
os.system("Rscript ../scripts/import_dataset.r hca_bm one-pass/final 'HCABM 1P|FIN' type")

ℹ creating ../mcview/one-pass/final
→ You can edit the app configuration at '../mcview/one-pass/final/config/config.yaml'
ℹ Importing hca_bm-one-pass-final
ℹ Reading '../output/one-pass/final/hca_bm.metacells.h5ad'
ℹ Processing metacell matrix
ℹ Processing 2d projection
ℹ Calculating top genes per metacell (marker genes)
ℹ Calculating metacell correlations of default marker genes
ℹ Processing inner-folds matrix
ℹ Calculating top inner-fold genes
→ Added the Inner-fold tab to the config file. To change the tab order or remove it - edit the tabs section at: '../mcview/one-pass/final/config/config.yaml'
ℹ Processing inner-stdev matrix
ℹ Calculating top inner-stdev genes
→ Added the Stdev-fold tab to the config file. To change the tab order or remove it - edit the tabs section at: '../mcview/one-pass/final/config/config.yaml'
ℹ Taking cell type annotations from type field in the anndata object
ℹ Loading cell type color annotations from '../captured/type_colors.csv'
ℹ Clustering in order to get gene modules. k = 31
ℹ Number of genes considered = 1000
ℹ Loading previously calculated 30 correlated and anti-correlated genes for each gene
✔ hca_bm-one-pass-final dataset imported succesfully to '../mcview/one-pass/final' project
• You can now run the app using: run_app("../mcview/one-pass/final")
• or create a bundle using: create_bundle("../mcview/one-pass/final", name =
"name_of_bundle")

CPU times: user 11.2 ms, sys: 286 ms, total: 297 ms
Wall time: 1min 14s

0

Computing Metacells - One-Pass Process¶

1. Setup¶

2. Reading the data¶

3. Cleaning the data¶

3.1 Excluding doublet cells¶

3.2 Decisions¶

3.2.1 Excluding cells by UMIs count¶

3.2.2 Excluding genes by name¶

3.2.3 Excluding cells by high excluded gene UMIs¶

3.3 Extract the clean data¶

3.4 Save the data¶

4. Compute the metacells¶

4.1 Decisions¶

4.1.1 Lateral genes¶

4.1.2 Noisy genes¶

4.1.3 Parallelization¶

4.2 Computation¶

4.2.1 Hyper-parameters¶

4.2.2 Assigning cells to metacells¶

4.2.3 Collecting the metacells¶

4.3 Computing for MCView¶

4.4 Saving the data¶

5. Importing into MCView¶

5.1 Installing MCView¶

5.2 Importing data set¶

5.3 Running MCView¶

5.4 Annotating types in MCView¶

5.5 Updating types in MCView¶

6. Applying type annotations from MCView¶

6.1 Reading the data¶

6.2 Conveying type annotations¶

6.3 Removing doublet meta/cells¶

7. Finalizing the data¶

7.1 Marking essential genes¶

7.2 Recomputing for MCView¶

7.4 Save the data¶

8. Importing into MCView¶