# -*- coding: utf-8 -*-
"""
CoCoMiCo SBML reader.
"""
import gzip
import logging
import re
import xml.etree.ElementTree as etree
from pathlib import Path
from typing import Tuple
from xml.etree.ElementTree import Element # for type hinting
from cocomico.base import Biomolecule, Metabolite, Reaction, Taxon
from cocomico.model import Model
# SbmlReader class
[docs]
class SbmlReader:
"""
SBML reader builds Models from the bipartite metabolite-reaction
relations extracted from SBML files. The extraction is memoized:
if a file or taxon is requested that has already been analyzed,
the existing model is shared rather than being recomputed.
:param sbml_dir: optional directory where model files are found
:type sbml_dir: Path
:raises ValueError: if the model is empty or invalid
"""
def __init__(self, sbml_dir: Path | None = None) -> None:
"""
Initialize an SBML reader, optionally with an SBML directory.
"""
# self.sbml_dir is None or a Path specified by keyword
self.sbml_dir: None | Path = None
if sbml_dir:
self.sbml_dir = sbml_dir
assert self.sbml_dir.is_dir()
# Memoization maps
self.files: dict[Path, Taxon] = {}
self.models: dict[Taxon, Model] = {}
[docs]
def from_file(self, path: Path) -> Tuple[Taxon, Model]:
"""
Parse an SBML file and convert its relations to a Model.
"""
# If we have already seen this file, use previous result
if path in self.files:
taxon = self.files[path]
return taxon, self.models[taxon]
# Read XML from the file, retrieve the taxon id and the model.
taxon, sbml_model = self.read_sbml_file(path)
logging.debug("Read SBML model %s from %s", taxon, path.name)
# If we have already seen this taxon, use previous result.
if taxon in self.models:
return taxon, self.models[taxon]
# Decode the relations in the SBML model
self.models[taxon] = self.decode_relations(taxon, sbml_model)
self.files[path] = taxon
# Return both the taxon and the model
return taxon, self.models[taxon]
[docs]
@staticmethod
def read_sbml_file(candidate: Path):
"""
RetrieveExtract the taxon id and the model from the XML tree.
"""
# Automatically check for file suffixes, w/ w/o compression.
# Try foo, foo.gz, foo.sbml, foo.sbml.gz, foo.xml, foo.xml.gz.
path: Path | None = next(
(
path_suf
for ext in ["", ".sbml", ".xml"]
for suffix in [ext, ext + ".gz"]
if (path_suf := candidate.with_suffix(suffix)).is_file()
),
None,
)
if path is None:
logging.exception("SbmlReader: Invalid SBML path %s", path)
raise ValueError("Invalid SBML model")
# Read SBML file: try to uncompress first, read directly otherwise.
with gzip.open(path, "rt") as fh:
try:
logging.debug("SbmlReader: %s try to read as compressed", path)
xml_tree = etree.parse(fh).getroot()
except gzip.BadGzipFile:
logging.debug("SbmlReader: %s is not compressed, read directly", path)
try:
xml_tree = etree.parse(str(path)).getroot()
except SyntaxError as etree_exc:
logging.exception("SbmlReader: Invalid SBML model in %s", path)
raise ValueError("Invalid SBML model") from etree_exc
sbml_model = None
taxon: None | Taxon = None
for e in xml_tree:
taxon = Taxon(e.attrib.get("id") or path.stem)
if not e.attrib.get("id"):
logging.warning("No SBML id in %s, using %s", path.name, taxon)
if e.tag[0] == "{":
# ignore uri
_, tag = e.tag[1:].split("}")
else:
tag = e.tag
if tag == "model":
sbml_model = e
break
if sbml_model is None:
logging.exception("SbmlReader: Invalid SBML model in %s", path)
raise ValueError("Invalid SBML model")
return taxon, sbml_model
[docs]
@staticmethod
def decode_relations(taxon: Taxon, sbml_model: Element) -> Model:
"""
Search for reaction-metabolite relations, also including reversed relations
for those reactions that are reversible.
"""
# Get namespace from this SBML model.
ns = re.search(r"\{.*\}|$", sbml_model.tag).group() # type: ignore[union-attr]
# XPath search terms for reactions, metabolites wrt reactions, species
# using namespace from this SBML model.
xpath_reaction = [
f".//{ ns }reaction{ filter }" for filter in ["", '[@reversible="true"]']
]
xpath_role = [
f"./{ ns }listOf{ role }/{ ns }speciesReference"
for role in ["Products", "Reactants"]
]
xpath_species = f".//{ ns }species"
reac_suffix = ["", "rev"]
# Decode species-taxon relations.
biomolecule = {
Metabolite(
provenance=taxon, biomolecule=Biomolecule(str(elt.attrib.get("id")))
)
for elt in sbml_model.findall(xpath_species)
}
# Decode species-reaction-taxon relations.
cases = {
"product": [(0, 0), (1, 1)],
"reactant": [(0, 1), (1, 0)],
}
relations = {}
for role, role_directions in cases.items():
relations[role] = {
(
Metabolite(
biomolecule=Biomolecule(str(biomolecule.attrib.get("species"))),
provenance=taxon,
),
Reaction(
name=str(reaction.attrib.get("id"))
+ reac_suffix[reac_direction],
taxon=taxon,
),
taxon,
)
for reac_direction, role_invert in role_directions
for reaction in sbml_model.findall(xpath_reaction[reac_direction])
for biomolecule in reaction.findall(xpath_role[role_invert])
}
# Return the sbml_mModel
return Model(biomolecule=biomolecule, relations=relations)
[docs]
def normalize(self, path: Path, sbml_dir: Path | None = None):
"""
Try to relative this SBML file name to a sbml_dir, either the one
specified as a parameter or the default one encapsulated in the reader.
"""
if root := sbml_dir or self.sbml_dir:
try:
return path.relative_to(root)
except ValueError:
return path
return path
[docs]
def __str__(self) -> str:
"""String representation of class SbmlReader."""
return f"Model<{hex(id(self))}>"