Coverage for metacell/storage/genes.py: 86%

Hot-keys on this page

r m x p toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

"""

Manage storage of per-gene data.

"""

from abc import abstractmethod

from metacell.storage.helpers import file_uuid

from metacell.storage.metadata import YamlMetadata

from tgutils.application import * # pylint: disable=wildcard-import,unused-wildcard-import

from tgutils.cache import Cache

from typing import Any

from typing import Optional

from typing import Type

from uuid import UUID

import hashlib

import os

import tgutils.numpy as np

class GenesMetadata(YamlMetadata):  # pylint: disable=too-few-public-methods
    """
    Meta-data describing a set of genes.

    Extends the keys required by ``YamlMetadata`` with the number of genes and
    the organism the genes belong to.
    """

    # The base class required keys, plus the genes-specific ones.
    required_keys = dict(
        **YamlMetadata.required_keys,
        genes_count=int,
        organism=str,
    )

    def __init__(self, **kwargs: Any) -> None:
        """
        Create the meta-data of some genes.

        All keyword arguments are forwarded, as-is, to the base class.
        """
        super().__init__(**kwargs)

        # The annotations below only declare the attribute types; nothing is
        # assigned to these attributes in this method.

        #: The number of genes.
        self.genes_count: int

        #: The organism these are genes of.
        self.organism: str

class Genes:
    """
    A collection of genes with some associated data.

    This must be consistent between all profiles that are processed together.

    NOTE(review): this class does not derive from ``abc.ABC``, so the
    ``@abstractmethod`` markers below are not enforced when instantiating it;
    sub-classes are nevertheless expected to implement these methods.
    """

    def __init__(self, full: 'GenesSet', names: np.ArrayStr) -> None:
        """
        Initialize the genes.

        The ``uuid`` is computed from the MD5 digest of the names, where each
        name is followed by a newline and encoded as UTF-8.
        """
        #: Convenient access to the number of genes.
        self.count = len(names)

        #: The sorted upper-case names of genes in the set.
        self.names = names

        #: The full gene set this is derived from.
        self.full = full

        md5 = hashlib.md5()
        for name in self.names:
            md5.update((name + '\n').encode('utf8'))

        #: The unique identifier of this list of gene names.
        self.uuid = UUID(bytes=md5.digest())

    @abstractmethod
    def available_data(self) -> List[str]:
        """
        Return a list of the available data.
        """

    @abstractmethod
    def has_array(self, name: str) -> bool:
        """
        Whether there exists some per-gene data.
        """

    @abstractmethod
    def any_array(self, name: str) -> np.ndarray:
        """
        Load a per-gene data array.
        """

    def array(self, cls: Type[np.A], name: str) -> np.A:
        """
        Load a per-gene data array.

        The array returned by ``any_array`` is passed through ``cls.am``.
        """
        return cls.am(self.any_array(name))

    @staticmethod
    def load(path: str, organism: Optional[str] = None) -> 'Genes':
        """
        Load genes from a directory.

        If ``organism`` is specified, it must be identical to the organism
        listed in the ``genes.yaml`` file of the directory; otherwise, a
        ``RuntimeError`` is raised.
        """
        metadata = GenesMetadata.load(os.path.join(path, 'genes.yaml'))

        if organism is not None and organism != metadata.organism:
            raise RuntimeError('The genes set metadata file: %s '
                               'specifies the wrong organism: %s '
                               'instead of the expected: %s'
                               % (metadata.yaml_path,
                                  metadata.organism,
                                  organism))

        # Reuse the meta-data loaded above rather than loading the same
        # ``genes.yaml`` file a second time.
        return GenesSet(metadata)

    @staticmethod
    def created(path: str, organism: str) -> None:
        """
        Write the ``genes.yaml`` file after creating a genes directory.

        At minimum, the directory should contain the gene names file.
        """
        names_path = os.path.join(path, 'g.name.txt')

        # The names are read using the path without its ``.txt`` suffix.
        names = np.ArrayStr.read(names_path[:-4])

        with open(os.path.join(path, 'genes.yaml'), 'w') as file:
            file.write('uuid: %s\n' % file_uuid(names_path))
            file.write('organism: %s\n' % organism)
            file.write('genes_count: %s\n' % len(names))

class GenesSet(Genes):
    """
    A set of genes with some associated data.

    This must be consistent between all batches that are processed together.

    The per-gene data is stored in files named ``g.<name>.<type>`` inside the
    directory given by the meta-data.
    """

    # NOTE(review): not referenced anywhere else in this class; presumably
    # used by other code to share loaded sets - confirm before removing.
    _cache: Cache[str, 'GenesSet'] = Cache()

    def __init__(self, metadata: GenesMetadata) -> None:
        """
        Open a genes set directory for access.

        Loads the gene names and verifies them against the meta-data. Raises
        ``RuntimeError`` if the names contain lower-case characters, are not
        strictly increasing, or do not match the UUID or the genes count
        recorded in the meta-data.
        """
        #: The meta-data describing the genes.
        self.metadata = metadata

        self._array_by_name: Cache[str, np.ndarray] = Cache()

        # The two attributes above must be set before the base class is
        # initialized, since they are needed for loading the names.
        names = self.array(np.ArrayStr, 'name')
        super().__init__(self, names)

        self._verify_names()
        self._verify_uuid()
        self._verify_genes_count()

    def _verify_names(self) -> None:
        # Ensure the names are upper-case and strictly increasing (which also
        # rejects empty and duplicate names).
        prev_name = ''
        for index, name in enumerate(self.names):
            if name != name.upper():
                raise RuntimeError('Lower case gene name: %s '
                                   'in line: %s '
                                   'of the genes set file: %s/g.name.txt'
                                   % (name, index + 1, self.metadata.path))
            if name <= prev_name:
                raise RuntimeError('Unsorted genes set file: %s/g.name.txt at lines: %s - %s'
                                   % (self.metadata.path, index, index + 1))
            prev_name = name

    def _verify_uuid(self) -> None:
        # Ensure the UUID computed from the names is the one in the meta-data.
        if self.uuid != self.metadata.uuid:
            raise RuntimeError('The genes metadata file: %s '
                               'specifies a different UUID: %s '
                               'than the md5sum: %s '
                               'of the gene names file: %s/g.name.txt'
                               % (self.metadata.yaml_path,
                                  self.metadata.uuid,
                                  self.uuid,
                                  self.metadata.path))

    def _verify_genes_count(self) -> None:
        # Ensure the number of names is the one recorded in the meta-data.
        if len(self.names) != self.metadata.genes_count:
            raise RuntimeError('The genes metadata file: %s '
                               'specifies a different genes count: %s '
                               'than the number of genes: %s '
                               'in the genes names file: %s/g.name.txt'
                               % (self.metadata.yaml_path,
                                  self.metadata.genes_count,
                                  len(self.names),
                                  self.metadata.path))

    def available_data(self) -> List[str]:
        """
        Return the sorted names of the data files found in the genes directory.
        """
        pattern = self.data_path('g.{*name}.{*type}')
        return sorted(data['name'] for data in glob_extract(optional(pattern)))

    def data_path(self, path: str) -> str:
        """
        Return the path of a data file in the genes directory.
        """
        return os.path.join(self.metadata.path, path)

    def has_array(self, name: str) -> bool:
        """
        Whether a ``.txt`` or a ``.npy`` file exists for the named per-gene data.
        """
        return Stat.exists(self.data_path('g.%s.txt' % name)) \
            or Stat.exists(self.data_path('g.%s.npy' % name))

    def any_array(self, name: str) -> np.ndarray:
        """
        Load a per-gene data array.

        Each array is read through the per-instance cache, keyed by its name.
        """
        return self._array_by_name.lookup(
            name,
            lambda: np.BaseArray.read_array(self.data_path('g.' + name)))

class GenesSubset(Genes):
    """
    A subset of some genes.

    All data access is delegated to the superset; loaded arrays are restricted
    to the included indices.
    """

    def __init__(self, superset: Genes, included_indices: np.ArrayInt32) -> None:
        """
        Create a subset of some genes.

        The subset shares the full gene set of its superset, and its names are
        the superset names at the included indices.
        """
        subset_names = superset.names[included_indices]
        super().__init__(superset.full, subset_names)

        #: The genes this is a subset of.
        self.superset = superset

        #: The indices of the superset genes which are included in the subset.
        self.included_indices = included_indices

    def available_data(self) -> List[str]:
        """
        Return the list of data available for the superset.
        """
        superset = self.superset
        return superset.available_data()

    def has_array(self, name: str) -> bool:
        """
        Whether the superset has the named per-gene data.
        """
        superset = self.superset
        return superset.has_array(name)

    def any_array(self, name: str) -> np.ndarray:
        """
        Load the named superset array, restricted to the included genes.
        """
        superset_array = self.superset.any_array(name)
        return superset_array[self.included_indices]

Coverage for metacell/storage/genes.py : 86%

92 statements 80 run 12 missing 0 excluded 1 partial