Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

""" 

Manage storage of per-gene data. 

""" 

 

from abc import abstractmethod 

from metacell.storage.helpers import file_uuid 

from metacell.storage.metadata import YamlMetadata 

from tgutils.application import * # pylint: disable=wildcard-import,unused-wildcard-import 

from tgutils.cache import Cache 

from typing import Any 

from typing import Optional 

from typing import Type 

from uuid import UUID 

 

import hashlib 

import os 

import tgutils.numpy as np 

 

 

class GenesMetadata(YamlMetadata):  # pylint: disable=too-few-public-methods
    """
    Meta-data describing a collection of genes.

    In addition to the keys required by ``YamlMetadata``, the meta-data must
    contain an integer ``genes_count`` and a string ``organism``.
    """

    required_keys = dict(**YamlMetadata.required_keys,
                         genes_count=int,
                         organism=str)

    def __init__(self, **kwargs: Any) -> None:
        """
        Create the metadata for some genes.

        All the keyword arguments are forwarded, unchanged, to the
        ``YamlMetadata`` constructor.
        """
        super().__init__(**kwargs)

        # The annotations below only declare the attributes (for documentation
        # and type checking); no value is assigned to them here.

        #: The number of genes.
        self.genes_count: int

        #: The organism these are genes of.
        self.organism: str

 

 

class Genes:
    """
    A collection of genes with some associated data.

    This must be consistent between all profiles that are processed together.

    NOTE(review): the data access methods are marked with ``@abstractmethod``
    but this class does not derive from ``abc.ABC``, so Python does not
    actually prevent instantiating it (or a sub-class that fails to override
    them). Confirm whether this is intentional before changing it.
    """

    def __init__(self, full: 'GenesSet', names: np.ArrayStr) -> None:
        """
        Initialize the genes.

        ``full`` is the complete genes set these genes are derived from, and
        ``names`` is the array of the names of the genes in this collection.
        """
        #: Convenient access to the number of genes.
        self.count = len(names)

        #: The sorted upper-case names of genes in the set.
        self.names = names

        #: The full gene set this is derived from.
        self.full = full

        # Digest the names, one per line, so the identifier depends only on
        # the content and order of the list of names.
        md5 = hashlib.md5()
        for name in self.names:
            md5.update((name + '\n').encode('utf8'))

        #: The unique identifier of this list of gene names.
        self.uuid = UUID(bytes=md5.digest())

    @abstractmethod
    def available_data(self) -> List[str]:
        """
        Return a list of the available data.
        """

    @abstractmethod
    def has_array(self, name: str) -> bool:
        """
        Whether there exists some per-gene data.
        """

    @abstractmethod
    def any_array(self, name: str) -> np.ndarray:
        """
        Load a per-gene data array.
        """

    def array(self, cls: Type[np.A], name: str) -> np.A:
        """
        Load a per-gene data array.

        The array obtained from ``any_array`` is passed through ``cls.am``
        before it is returned.
        """
        return cls.am(self.any_array(name))

    @staticmethod
    def load(path: str, organism: Optional[str] = None) -> 'Genes':
        """
        Load genes from a directory.

        The meta-data is read from the ``genes.yaml`` file inside ``path``. If
        an ``organism`` is given, it must match the one recorded in the
        meta-data, otherwise a ``RuntimeError`` is raised.
        """
        metadata = GenesMetadata.load(os.path.join(path, 'genes.yaml'))
        if organism is not None and organism != metadata.organism:
            raise RuntimeError('The genes set metadata file: %s '
                               'specifies the wrong organism: %s '
                               'instead of the expected: %s'
                               % (metadata.yaml_path,
                                  metadata.organism,
                                  organism))
        # Reuse the meta-data validated above instead of parsing the same
        # ``genes.yaml`` file a second time.
        return GenesSet(metadata)

    @staticmethod
    def created(path: str, organism: str) -> None:
        """
        Write the ``genes.yaml`` file after creating a genes directory.

        At minimum, the directory should contain the gene names file.
        """
        names_path = os.path.join(path, 'g.name.txt')
        # The names are read using the path without its ``.txt`` suffix.
        names = np.ArrayStr.read(names_path[:-4])
        with open(os.path.join(path, 'genes.yaml'), 'w') as file:
            file.write('uuid: %s\n' % file_uuid(names_path))
            file.write('organism: %s\n' % organism)
            file.write('genes_count: %s\n' % len(names))

 

 

class GenesSet(Genes):
    """
    A set of genes with some associated data.

    This must be consistent between all batches that are processed together.

    The per-gene data is stored in files named ``g.<name>.<type>`` inside the
    directory given by the meta-data's ``path``.
    """

    # NOTE(review): nothing in the code visible here reads or writes this
    # class-level cache; presumably it is used elsewhere - verify.
    _cache: Cache[str, 'GenesSet'] = Cache()

    def __init__(self, metadata: GenesMetadata) -> None:
        """
        Open a genes set directory for access.

        The gene names are loaded and verified against the meta-data; a
        ``RuntimeError`` is raised if the names are not upper-case, are not
        sorted in strictly increasing order, or do not match the UUID or the
        genes count recorded in the meta-data.
        """
        #: The meta-data describing the genes.
        self.metadata = metadata

        # The meta-data and the arrays cache must be set up before loading
        # the names, since ``array`` goes through ``any_array``.
        self._array_by_name: Cache[str, np.ndarray] = Cache()
        names = self.array(np.ArrayStr, 'name')
        super().__init__(self, names)

        self._verify_names()
        self._verify_uuid()
        self._verify_genes_count()

    def _verify_names(self) -> None:
        # Ensure every name is upper-case and the names are strictly sorted
        # (which also rules out duplicate and empty names).
        prev_name = ''
        for index, name in enumerate(self.names):
            if name != name.upper():
                raise RuntimeError('Lower case gene name: %s '
                                   'in line: %s '
                                   'of the genes set file: %s/g.name.txt'
                                   % (name, index + 1, self.metadata.path))
            if name <= prev_name:
                # The offending pair is at the 1-based lines index, index + 1.
                raise RuntimeError('Unsorted genes set file: %s/g.name.txt at lines: %s - %s'
                                   % (self.metadata.path, index, index + 1))
            prev_name = name

    def _verify_uuid(self) -> None:
        # Ensure the UUID computed from the names matches the meta-data.
        if self.uuid != self.metadata.uuid:
            raise RuntimeError('The genes metadata file: %s '
                               'specifies a different UUID: %s '
                               'than the md5sum: %s '
                               'of the gene names file: %s/g.name.txt'
                               % (self.metadata.yaml_path,
                                  self.metadata.uuid,
                                  self.uuid,
                                  self.metadata.path))

    def _verify_genes_count(self) -> None:
        # Ensure the number of names matches the count in the meta-data.
        if len(self.names) != self.metadata.genes_count:
            raise RuntimeError('The genes metadata file: %s '
                               'specifies a different genes count: %s '
                               'than the number of genes: %s '
                               'in the genes names file: %s/g.name.txt'
                               % (self.metadata.yaml_path,
                                  self.metadata.genes_count,
                                  len(self.names),
                                  self.metadata.path))

    def available_data(self) -> List[str]:
        """
        Return the sorted names of the data found in ``g.<name>.<type>`` files
        in the genes directory.
        """
        pattern = os.path.join(self.metadata.path, 'g.{*name}.{*type}')
        return sorted(data['name'] for data in glob_extract(optional(pattern)))

    def data_path(self, path: str) -> str:
        """
        Return the path of a data file in the genes directory.
        """
        return os.path.join(self.metadata.path, path)

    def has_array(self, name: str) -> bool:
        """
        Whether there exists a ``g.<name>.txt`` or a ``g.<name>.npy`` file in
        the genes directory.
        """
        return (Stat.exists(os.path.join(self.metadata.path, 'g.%s.txt' % name))
                or Stat.exists(os.path.join(self.metadata.path, 'g.%s.npy' % name)))

    def any_array(self, name: str) -> np.ndarray:
        """
        Load a per-gene data array from the ``g.<name>`` data in the genes
        directory, going through the per-instance arrays cache.
        """
        return self._array_by_name.lookup(
            name,
            lambda: np.BaseArray.read_array(os.path.join(self.metadata.path, 'g.' + name)))

 

 

class GenesSubset(Genes):
    """
    A subset of some genes.

    The subset shares the full genes set of its superset, and serves all the
    per-gene data by slicing the data of the superset.
    """

    def __init__(self, superset: Genes, included_indices: np.ArrayInt32) -> None:
        """
        Create a subset of some genes.

        The ``included_indices`` select which of the ``superset`` genes are
        included in the subset.
        """
        subset_names = superset.names[included_indices]
        super().__init__(superset.full, subset_names)

        #: The genes this is a subset of.
        self.superset = superset

        #: The indices of the superset genes which are included in the subset.
        self.included_indices = included_indices

    def available_data(self) -> List[str]:
        """
        Return the list of data available in the superset.
        """
        superset = self.superset
        return superset.available_data()

    def has_array(self, name: str) -> bool:
        """
        Whether the superset has the named per-gene data.
        """
        superset = self.superset
        return superset.has_array(name)

    def any_array(self, name: str) -> np.ndarray:
        """
        Load the superset's per-gene data array, restricted to the entries of
        the genes included in the subset.
        """
        superset_array = self.superset.any_array(name)
        return superset_array[self.included_indices]