Source code for mgkit.kegg

"""
Module containing classes and functions to access Kegg data
"""
from builtins import object
from future.utils import viewitems
import sys
import logging
import pickle
import random
import re
import itertools
from .utils import dictionary as dict_utils
from .net import uniprot, url_read
from .io import open_file


# Module-level logger, named after this module
LOG = logging.getLogger(__name__)
# Base URL of the Kegg REST API (note the trailing slash); it is the default
# value of `KeggClientRest.api_url`
KEGG_REST_URL = 'http://rest.kegg.jp/'


class KeggCompound(object):
    """
    Kegg compound

    Stores the compound ID and its description. Two instances compare equal
    (and hash the same) when their `cp_id` is the same.
    """

    def __init__(self, cp_id=None, description=''):
        self.description = description
        self.cp_id = cp_id

    def __eq__(self, other):
        """
        Equality is based on `cp_id`; other types are never equal

        >>> KeggCompound('test') == KeggCompound('test')
        True
        >>> KeggCompound('test') == 1
        False
        """
        return isinstance(other, KeggCompound) and self.cp_id == other.cp_id

    def __ne__(self, other):
        """
        Negation of the equality test

        >>> KeggCompound('test') != KeggCompound('test1')
        True
        >>> KeggCompound('test') != 1
        True
        """
        is_equal = self == other
        return not is_equal

    def __hash__(self):
        # same key used by __eq__, so equal compounds share a hash
        return hash(self.cp_id)

    def __str__(self):
        template = "{0}: {1}"
        return template.format(self.cp_id, self.description)

    def __repr__(self):
        return str(self)
class KeggReaction(object):
    """
    .. versionchanged:: 0.3.1
        reworked, only stores the equation

    Kegg Reaction, used for parsing the equation line

    Attributes:
        rn_id: reaction ID, second whitespace separated token of the first
            line of the entry
        left_cp: left side of the equation, as returned by
            :func:`parse_reaction`
        right_cp: right side of the equation, as returned by
            :func:`parse_reaction`
    """
    rn_id = None
    left_cp = None
    right_cp = None

    def __init__(self, entry):
        """
        Arguments:
            entry (str): text of a Kegg reaction entry

        Raises:
            ValueError: if no EQUATION line is found
        """
        entry = entry.splitlines()
        self.rn_id = entry[0].split()[1]
        for line in entry:
            if line.startswith('EQUATION'):
                break
            if line.startswith('///'):
                raise ValueError('No Equation in Entry')
        else:
            # the entry ended without an EQUATION line and without the '///'
            # terminator: without this branch the last line of the entry,
            # whatever it is, would be parsed as the equation
            raise ValueError('No Equation in Entry')
        self.left_cp, self.right_cp = parse_reaction(line)
class KeggOrtholog(object):
    """
    Kegg Ortholog gene

    Behaves like a small mapping over its `reactions` dictionary (item
    access, length and iteration), while equality and hashing are based on
    `ko_id`. Semicolons are removed from the description.
    """

    def __init__(self, ko_id=None, description='', reactions=None):
        if reactions is None:
            # a fresh dictionary per instance
            reactions = {}
        self.reactions = reactions
        self.description = description.replace(';', '')
        self.ko_id = ko_id

    def __eq__(self, other):
        """
        Equality is based on `ko_id`; other types are never equal

        >>> KeggOrtholog('test') == KeggOrtholog('test')
        True
        >>> KeggOrtholog('test') == 1
        False
        """
        return isinstance(other, KeggOrtholog) and self.ko_id == other.ko_id

    def __ne__(self, other):
        """
        Negation of the equality test

        >>> KeggOrtholog('test') != KeggOrtholog('test1')
        True
        >>> KeggOrtholog('test') != 1
        True
        """
        is_equal = self == other
        return not is_equal

    def __getitem__(self, key):
        return self.reactions[key]

    def __setitem__(self, key, value):
        self.reactions[key] = value

    def __hash__(self):
        # same key used by __eq__, so equal orthologs share a hash
        return hash(self.ko_id)

    def __len__(self):
        return len(self.reactions)

    def __iter__(self):
        for reaction_id in self.reactions:
            yield reaction_id

    def __str__(self):
        template = "{0}: ({1}) {2}"
        return template.format(self.ko_id, len(self), self.description)

    def __repr__(self):
        return str(self)
class KeggPathway(object):
    """
    Kegg Pathway

    Behaves like a small mapping over its `genes` dictionary (membership,
    item access, length and iteration), while equality and hashing are based
    on `path_id`.
    """

    def __init__(self, path_id=None, description=None, genes=None):
        if genes is None:
            # a fresh dictionary per instance
            genes = {}
        self.genes = genes
        self.description = description
        self.path_id = path_id

    def __eq__(self, other):
        """
        Equality is based on `path_id`; other types are never equal

        >>> KeggPathway('test') == KeggPathway('test')
        True
        >>> KeggPathway('test') == 1
        False
        """
        return isinstance(other, KeggPathway) and self.path_id == other.path_id

    def __ne__(self, other):
        """
        Negation of the equality test

        >>> KeggPathway('test') != KeggPathway('test1')
        True
        >>> KeggPathway('test') != 1
        True
        """
        is_equal = self == other
        return not is_equal

    def __hash__(self):
        # same key used by __eq__, so equal pathways share a hash
        return hash(self.path_id)

    def __contains__(self, item):
        return item in self.genes

    def __getitem__(self, key):
        return self.genes[key]

    def __setitem__(self, key, value):
        self.genes[key] = value

    def __str__(self):
        template = "{0} ({2}): {1}"
        return template.format(self.path_id, self.description, len(self))

    def __repr__(self):
        return str(self)

    def __len__(self):
        return len(self.genes)

    def __iter__(self):
        for gene_id in self.genes:
            yield gene_id
class KeggClientRest(object):
    """
    .. versionchanged:: 0.3.1
        added a *cache* attribute for some methods

    Kegg REST client

    The class includes methods and data to use the REST API provided by Kegg.
    At the moment it provides methods for the 'list', 'get', 'find' and
    'conv' operations,

    `Kegg REST API <http://www.kegg.jp/kegg/rest/keggapi.html>`_

    Attributes:
        contact: value passed as the *agent* argument of `url_read`
        api_url: base URL of the REST API, used to build every request
        cache: dictionary of per-method caches (see :meth:`empty_cache`)
    """
    contact = None
    api_url = KEGG_REST_URL
    cpd_re = re.compile(
        r"ENTRY\s+(C\d{5})\s+Compound\nNAME\s+([,.\w+ ()-]+);?"
    )
    rn_name_re = re.compile(r"R\d{5}")
    rn_eq_re = re.compile(r'C\d{5}')
    ko_desc_re = re.compile(
        r"ko:(K\d{5})\t.+?;\s+([\w+, ()/:'\[\]-]+)( \[EC:)?\n?"
    )
    cpd_desc_re = re.compile(
        r"cpd:(C\d{5})\t([\w+, ()\[\]'*.-]+);?\n?"
    )
    id_prefix = {'C': 'cpd', 'k': 'map', 'K': 'ko', 'R': 'rn', 'm': 'path'}
    cache = None

    def __init__(self, cache=None):
        """
        .. versionadded:: 0.3.1

        The "cache" parameter is a file name for the cached data wrote using
        :meth:`KeggClientRest.write_cache`.
        """
        if cache is None:
            self.empty_cache()
        else:
            self.load_cache(cache)

    def empty_cache(self, methods=None):
        """
        .. versionadded:: 0.3.1

        Empties the cache completely or for a specific method(s)

        Arguments:
            methods (iterable, str): string or iterable of strings that are
                part of the cache. If None the cache is fully emptied
        """
        if methods is None:
            methods = ('link_ids', 'get_entry', 'get_ids_names')
        elif isinstance(methods, str):
            methods = [methods]
        if self.cache is None:
            self.cache = {}
        for method in methods:
            self.cache[method] = {}

    def load_cache(self, file_handle):
        """
        .. versionadded:: 0.3.1

        Loads the cache from file

        Arguments:
            file_handle: passed as is to `open_file`, in 'rb' mode
        """
        # NOTE(review): pickle must only be used on trusted files, loading a
        # crafted cache file can execute arbitrary code.
        # NOTE(review): the object returned by `open_file` is not closed
        # here; it may be a handle owned by the caller - confirm before
        # adding a `with` block
        self.cache = pickle.load(open_file(file_handle, 'rb'))

    def write_cache(self, file_handle):
        """
        .. versionadded:: 0.3.1

        Write the cache to file

        Arguments:
            file_handle: passed as is to `open_file`, in 'wb' mode
        """
        # NOTE(review): the object returned by `open_file` is not closed
        # here; it may be a handle owned by the caller - confirm
        pickle.dump(self.cache, open_file(file_handle, 'wb'))

    # Kegg primitives #

    def list_ids(self, k_id):
        """
        The method abstract the use of the 'list' operation in the Kegg API

        The k_id parameter can be one of the following::

            pathway | brite | module | disease | drug | environ | ko |
            genome | <org> | compound | glycan | reaction | rpair | rclass |
            enzyme

            <org> = KEGG organism code or T number

        Arguments:
            k_id (str): kegg database to get list of ids

        Returns:
            str: the text returned by the API, without its last character
        """
        url = "{0}list/{1}".format(self.api_url, k_id)
        data = url_read(url, agent=self.contact)
        # leave out the last \n
        return data[:-1]

    def get_entry(self, k_id, option=None):
        """
        .. versionchanged:: 0.3.1
            this is now cached

        The method abstract the use of the 'get' operation in the Kegg API

        Arguments:
            k_id (str): kegg id of the resource to get
            option (str): optional, to specify a format

        Returns:
            the data returned by `url_read`, cached by (k_id, option)
        """
        key = (k_id, option)
        try:
            data = self.cache['get_entry'][key]
        except KeyError:
            url = "{0}get/{1}/{2}".format(
                self.api_url,
                k_id,
                '' if option is None else option
            )
            data = url_read(url, agent=self.contact)
            self.cache['get_entry'][key] = data
        return data

    def find(self, query, database, options=None, strip=True):
        """
        .. versionadded:: 0.3.1

        Kegg Help:

        http://rest.kegg.jp/find/<database>/<query>

        <database> = pathway | module | ko | genome | <org> | compound |
        glycan | reaction | rclass | enzyme | disease | drug | dgroup |
        environ | genes | ligand

        <org> = KEGG organism code or T number

        http://rest.kegg.jp/find/<database>/<query>/<option>

        <database> = compound | drug
        <option> = formula | exact_mass | mol_weight

        Arguments:
            query (str): text to search for
            database (str): database to search
            options (str): optional search option
            strip (bool): if True the database prefix (the part before ':')
                is removed from the returned IDs

        Returns:
            dict: ID -> description, as returned by the API

        Examples:
            >>> kc = KeggClientRest()
            >>> kc.find('CH4', 'compound')
            {'C01438': 'Methane; CH4'}
            >>> kc.find('K00844', 'genes', strip=False)
            {'tped:TPE_0072': 'hexokinase; K00844 hexokinase [EC:2.7.1.1]',
            ...
            >>> kc.find('174.05', 'compound', options='exact_mass')
            {'C00493': '174.052823',
             'C04236': '174.052823',
             'C16588': '174.052823',
             'C17696': '174.052823',
             'C18307': '174.052823',
             'C18312': '174.052823',
             'C21281': '174.052823'}
        """
        # built from api_url like the other primitives, instead of a
        # hard-coded address
        url = '{0}find/{1}/{2}/{3}'.format(
            self.api_url,
            database,
            query,
            '' if options is None else options
        )
        LOG.debug(url)
        data = url_read(url, agent=self.contact)
        mappings = {}
        for line in data.splitlines():
            line = line.rstrip()
            if not line:
                # blank lines cannot be unpacked into (id, description)
                continue
            target_id, description = line.split('\t')
            if strip:
                target_id = target_id.split(':')[1]
            mappings[target_id] = description
        return mappings

    def conv(self, target_db, source_db, strip=True):
        """
        .. versionadded:: 0.3.1

        Kegg Help:

        http://rest.kegg.jp/conv/<target_db>/<source_db>

        (<target_db> <source_db>) = (<kegg_db> <outside_db>) |
        (<outside_db> <kegg_db>)

        For gene identifiers:
        <kegg_db> = <org>
        <org> = KEGG organism code or T number
        <outside_db> = ncbi-proteinid | ncbi-geneid | uniprot

        For chemical substance identifiers:
        <kegg_db> = drug | compound | glycan
        <outside_db> = pubchem | chebi

        http://rest.kegg.jp/conv/<target_db>/<dbentries>

        For gene identifiers:
        <dbentries> = database entries involving the following <database>
        <database> = <org> | genes | ncbi-proteinid | ncbi-geneid | uniprot
        <org> = KEGG organism code or T number

        For chemical substance identifiers:
        <dbentries> = database entries involving the following <database>
        <database> = drug | compound | glycan | pubchem | chebi

        Arguments:
            target_db (str): database to convert to
            source_db (str): database (or entries) to convert from
            strip (bool): if True the database prefix (the part before ':')
                is removed from both source and target IDs

        Returns:
            dict: source ID -> set of target IDs

        Examples:
            >>> kc = KeggClientRest()
            >>> kc.conv('ncbi-geneid', 'eco')
            {'eco:b0217': {'ncbi-geneid:949009'},
             'eco:b0216': {'ncbi-geneid:947541'},
             'eco:b0215': {'ncbi-geneid:946441'},
             'eco:b0214': {'ncbi-geneid:946955'},
             'eco:b0213': {'ncbi-geneid:944903'},
            ...
            >>> kc.conv('ncbi-proteinid', 'hsa:10458+ece:Z5100')
            {'10458': {'NP_059345'}, 'Z5100': {'AAG58814'}}
        """
        # built from api_url like the other primitives, instead of a
        # hard-coded address
        url = '{0}conv/{1}/{2}/'.format(self.api_url, target_db, source_db)
        LOG.debug(url)
        data = url_read(url, agent=self.contact)
        mappings = {}
        for line in data.splitlines():
            line = line.rstrip()
            if not line:
                # blank lines cannot be unpacked into (source, target)
                continue
            source_id, target_id = line.split('\t')
            if strip:
                target_id = target_id.split(':')[1]
                source_id = source_id.split(':')[1]
            mappings.setdefault(source_id, set()).add(target_id)
        return mappings

    # names #

    def get_ids_names(self, target='ko', strip=True):
        """
        .. versionadded:: 0.1.13

        .. versionchanged:: 0.3.1
            the call is now cached

        Returns a dictionary with the names/description of all the id of a
        specific target, (ko, path, cpd, etc.)

        If strip=True the id will stripped of the module abbreviation (e.g.
        md:M00002->M00002)

        Only the stripped version (strip=True) is read from and stored in
        the cache; a copy of the cached dictionary is returned.
        """
        if strip:
            try:
                return self.cache['get_ids_names'][target].copy()
            except KeyError:
                LOG.debug('No cached values for "%s"', target)

        id_names = {}
        for line in self.list_ids(target).splitlines():
            kegg_id, name = line.strip().split('\t')
            if strip:
                kegg_id = kegg_id.split(':')[1]
            id_names[kegg_id] = name

        if strip:
            self.cache['get_ids_names'][target] = id_names.copy()
        return id_names

    def get_ortholog_pathways(self):
        """
        Gets ortholog pathways, replace 'map' with 'ko' in the id

        Returns:
            dict: pathway ID ('ko' prefixed) -> pathway name
        """
        data = self.get_ids_names('pathway')
        return dict(
            (kegg_id.replace('map', 'ko'), name)
            for kegg_id, name in viewitems(data)
        )

    # end names #

    def get_reaction_equations(self, ids, max_len=10):
        """
        Get the equation for the reactions

        Arguments:
            ids (str, iterable): one reaction ID or an iterable of them
            max_len (int): number of IDs requested with each call to the API

        Returns:
            dict: reaction ID -> {'in': [...], 'out': [...]}, where the
            lists contain the compound IDs found on the left and right side
            of the EQUATION line. Entries without both an ENTRY and an
            EQUATION line are left out.
        """
        if isinstance(ids, str):
            ids = [ids]
        else:
            # dictionary views and other iterables cannot be sliced
            ids = list(ids)
        data = {}
        for idx in range(0, len(ids), max_len):
            if len(ids) > max_len:
                LOG.info(
                    "Downloading reactions equations - range %d-%d",
                    idx + 1,
                    idx + max_len
                )
            url = "{0}get/{1}".format(
                self.api_url,
                '+'.join(ids[idx:idx + max_len])
            )
            t_data = url_read(url, agent=self.contact)
            for entry in t_data.split('///'):
                # reset for each entry, so values are never carried over
                # from the previous one (or used before being assigned)
                name = None
                equation = None
                for line in entry.split('\n'):
                    if line.startswith('EQUATION'):
                        left, right = line.split('<=>')
                        equation = {
                            'in': self.rn_eq_re.findall(left),
                            'out': self.rn_eq_re.findall(right),
                        }
                    elif line.startswith('ENTRY'):
                        name = self.rn_name_re.search(line).group(0)
                if name is None or equation is None:
                    # e.g. the empty chunk after the last '///'
                    continue
                data[name] = equation
        return data
class KeggData(object):
    """
    .. deprecated:: 0.3.4

    Container for pathways, indexed by pathway ID. It can be iterated
    (pathway IDs), indexed (`kdata[path_id]`) and its length is the number
    of pathways.

    Attributes:
        pathways (dict): pathway ID -> pathway object
        _ko_map (dict): KO ID -> set of pathway IDs, see :meth:`gen_ko_map`
        maps (dict): None until :meth:`gen_maps` is called
    """
    pathways = None
    _ko_map = None
    maps = None

    def __init__(self, fname=None, gen_maps=True):
        """
        Arguments:
            fname (str): if supplied, file to load the pathways from
            gen_maps (bool): if True and *fname* was supplied, the KO and
                ID maps are generated after loading
        """
        self.pathways = {}
        self._ko_map = {}
        self.maps = None
        if fname:
            self.load_data(fname)
            if gen_maps:
                self.gen_ko_map()
                self.gen_maps()

    def get_ko_names(self):
        "Returns a dictionary KO ID -> description, for all pathways"
        ko_names = {}
        for path_id in self:
            pathway = self[path_id]
            for ko_id in pathway:
                if ko_id in ko_names:
                    continue
                ko_names[ko_id] = pathway[ko_id].description
        return ko_names

    def get_rn_names(self):
        "Returns a dictionary reaction ID -> description, for all pathways"
        rn_names = {}
        for path_id in self:
            pathway = self[path_id]
            for ko_id in pathway:
                reactions = pathway[ko_id].reactions
                for rn_id in reactions:
                    if rn_id in rn_names:
                        continue
                    rn_names[rn_id] = reactions[rn_id].description
        return rn_names

    def get_cp_names(self):
        """
        Returns a dictionary compound ID -> description, generating the maps
        first if they are not available
        """
        if self.maps is None:
            self.gen_maps()
        return dict(
            (cp_id, cp.description)
            for cp_id, cp in viewitems(self.maps['cp'])
        )

    def gen_ko_map(self):
        "Fills the KO ID -> set of pathway IDs dictionary"
        for path_id in self:
            for ko_id in self[path_id]:
                self._ko_map.setdefault(ko_id, set()).add(path_id)

    def gen_maps(self):
        """
        Builds the `maps` attribute, a dictionary with keys 'cp', 'ko' and
        'rn', each an ID -> object dictionary. The first object found for an
        ID is kept.
        """
        ko_maps = {}
        rn_maps = {}
        cp_maps = {}
        for path_id in self:
            # .values() instead of .itervalues(), which is Python 2 only
            for ko in self[path_id].genes.values():
                if ko.ko_id in ko_maps:
                    continue
                ko_maps[ko.ko_id] = ko
                for rn in ko.reactions.values():
                    if rn.rn_id in rn_maps:
                        continue
                    rn_maps[rn.rn_id] = rn
                    compounds = itertools.chain(
                        rn.cp_in.values(),
                        rn.cp_out.values()
                    )
                    for cp in compounds:
                        if cp.cp_id in cp_maps:
                            continue
                        cp_maps[cp.cp_id] = cp
        self.maps = {'cp': cp_maps, 'ko': ko_maps, 'rn': rn_maps}

    def get_pathway_ko_map(self, black_list=None):
        """
        Returns a dictionary pathway ID -> KO IDs of the pathway

        Arguments:
            black_list (iterable): pathway IDs to leave out, if None all
                pathways are included
        """
        if black_list is None:
            allowed = self.pathways.keys()
        else:
            # keeps only path_ids that are not in the black_list
            allowed = set(self.pathways.keys()) - set(black_list)
        return dict(
            (path_id, self[path_id].genes.keys())
            for path_id in self
            if path_id in allowed
        )

    def get_ko_pathway_map(self, black_list=None):
        """
        Returns the reverse of :meth:`get_pathway_ko_map`, computed with
        `dict_utils.reverse_mapping`
        """
        return dict_utils.reverse_mapping(
            self.get_pathway_ko_map(black_list=black_list)
        )

    def get_ko_pathways(self, ko_id):
        """
        Returns the sorted list of pathway IDs of a KO, from the map filled
        by :meth:`gen_ko_map`

        Raises:
            KeyError: if the KO is not in the map
        """
        return sorted(self._ko_map[ko_id])

    def save_data(self, fname):
        "Pickles the pathways dictionary to the file *fname*"
        LOG.info("Saving data to file %s", fname)
        # pickle needs a binary file on Python 3; the handle is closed as
        # soon as the data is written
        with open(fname, 'wb') as handle:
            pickle.dump(self.pathways, handle)

    def load_data(self, fname):
        "Loads the pathways dictionary from the pickle file *fname*"
        LOG.info("Loading data from file %s", fname)
        # NOTE(review): pickle must only be used on trusted files
        with open(fname, 'rb') as handle:
            self.pathways = pickle.load(handle)

    def __getitem__(self, key):
        return self.pathways[key]

    def __len__(self):
        return len(self.pathways)

    def __iter__(self):
        for path_id in self.pathways:
            yield path_id
class KeggMapperBase(object):
    """
    .. deprecated:: 0.3.4

    Base object for Kegg mapping classes

    The instance behaves like a mapping over the KO->mapping dictionary
    (length, item access/assignment/deletion, membership and iteration).
    """
    _ko_map = None
    _not_found = None
    _id_names = None

    def __init__(self, fname=None):
        """
        Arguments:
            fname (str): if supplied, file to load the mapping data from
        """
        self._ko_map = {}
        self._not_found = []
        self._id_names = {}
        if fname:
            self.load_data(fname)

    ko_to_mapping = staticmethod(uniprot.ko_to_mapping)

    def save_data(self, fname):
        """
        Saves mapping data to disk
        """
        LOG.info("Saving data to %s", fname)
        # pickle needs a binary file on Python 3; the handle is closed as
        # soon as the data is written
        with open(fname, 'wb') as handle:
            pickle.dump(
                (self._ko_map, self._not_found, self._id_names),
                handle
            )

    def load_data(self, fname):
        """
        Loads mapping data from disk

        Both the current format (3 elements) and the old one (2 elements,
        without the names) are accepted.
        """
        LOG.info("Loading data from %s", fname)
        # NOTE(review): pickle must only be used on trusted files
        with open(fname, 'rb') as handle:
            data = pickle.load(handle)
        try:
            self._ko_map, self._not_found, self._id_names = data
        except ValueError:
            LOG.warning("Pickled data in old format")
            self._ko_map, self._not_found = data

    def get_ko_map(self):
        """
        Returns a copy of the KO->mapping dictionary
        """
        return self._ko_map.copy()

    def get_id_map(self):
        """
        Returns a mapping->KOs dictionary (a reverse mapping to get_ko_map)
        """
        return dict_utils.reverse_mapping(self.get_ko_map())

    def get_id_names(self):
        """
        Returns a copy of the mapping names
        """
        return self._id_names.copy()

    def __len__(self):
        return len(self._ko_map)

    def __getitem__(self, key):
        return self._ko_map[key]

    def __setitem__(self, key, value):
        self._ko_map[key] = value

    def __delitem__(self, key):
        del self._ko_map[key]

    def __contains__(self, key):
        return key in self._ko_map

    def __iter__(self):
        return iter(self._ko_map)
# List of ortholog pathway IDs ('ko' prefix), each with the pathway name as a
# trailing comment.
# NOTE(review): presumably meant as the `black_list` argument of
# KeggData.get_pathway_ko_map / get_ko_pathway_map - confirm against callers
# NOTE(review): 'ko04622' is listed twice
BLACK_LIST = [
    'ko05164',  # Influenza A
    'ko05166',  # HTLV-I infection
    'ko05161',  # Hepatitis B
    'ko05160',  # Hepatitis C
    'ko05162',  # Measles
    'ko05169',  # Epstein-Barr virus infection
    'ko05168',  # Herpes simplex infection
    'ko04520',  # Adherens junction
    'ko05014',  # Amyotrophic lateral sclerosis (ALS)
    'ko05016',  # Huntington's disease
    'ko05310',  # Asthma
    'ko00908',  # Zeatin biosynthesis
    'ko04930',  # Type II diabetes mellitus
    'ko04622',  # RIG-I-like receptor signaling pathway
    'ko04626',  # Plant-pathogen interaction
    'ko05222',  # Small cell lung cancer
    'ko05223',  # Non-small cell lung cancer
    'ko05220',  # Chronic myeloid leukemia
    'ko05221',  # Acute myeloid leukemia
    'ko04612',  # Antigen processing and presentation
    'ko04621',  # NOD-like receptor signaling pathway
    'ko04620',  # Toll-like receptor signaling pathway
    'ko04623',  # Cytosolic DNA-sensing pathway
    'ko04622',  # RIG-I-like receptor signaling pathway
    'ko00351',  # DDT degradation
    'ko04270',  # Vascular smooth muscle contraction
    'ko04115',  # p53 signaling pathway
    'ko04114',  # Oocyte meiosis
    'ko05120',  # Epithelial cell signaling in Helicobacter pylori infection
    'ko00590',  # Arachidonic acid metabolism
    'ko04370',  # VEGF signaling pathway
    'ko04976',  # Bile secretion
    'ko04971',  # Gastric acid secretion
    'ko04970',  # Salivary secretion
    'ko04973',  # Carbohydrate digestion and absorption
    'ko04972',  # Pancreatic secretion
    'ko04975',  # Fat digestion and absorption
    'ko04974',  # Protein digestion and absorption
    'ko04977',  # Vitamin digestion and absorption
    'ko04711',  # Circadian rhythm - fly
    'ko04710',  # Circadian rhythm
    'ko04713',  # Circadian entrainment
    'ko04712',  # Circadian rhythm - plant
    'ko04650',  # Natural killer cell mediated cytotoxicity
    'ko04391',  # Hippo signaling pathway - fly
    'ko04151',  # PI3K-Akt signaling pathway
    'ko04150',  # mTOR signaling pathway
    'ko04510',  # Focal adhesion
    'ko04012',  # ErbB signaling pathway
    'ko04013',  # MAPK signaling pathway - fly
    'ko04010',  # MAPK signaling pathway
    'ko03320',  # PPAR signaling pathway
    'ko04530',  # Tight junction
    'ko00981',  # Insect hormone biosynthesis
    'ko00982',  # Drug metabolism - cytochrome P450
    'ko00983',  # Drug metabolism - other enzymes
    'ko05150',  # Staphylococcus aureus infection
    'ko05152',  # Tuberculosis
    'ko04614',  # Renin-angiotensin system
    'ko04610',  # Complement and coagulation cascades
    'ko04340',  # Hedgehog signaling pathway
    'ko00984',  # Steroid degradation
    'ko05010',  # Alzheimer's disease
    'ko05012',  # Parkinson's disease
    'ko04260',  # Cardiac muscle contraction
    'ko04540',  # Gap junction
    'ko05110',  # Vibrio cholerae infection
    'ko05111',  # Vibrio cholerae pathogenic cycle
    'ko04640',  # Hematopoietic cell lineage
    'ko04310',  # Wnt signaling pathway
    'ko04728',  # Dopaminergic synapse
    'ko04724',  # Glutamatergic synapse
    'ko04725',  # Cholinergic synapse
    'ko04726',  # Serotonergic synapse
    'ko04727',  # GABAergic synapse
    'ko04720',  # Long-term potentiation
    'ko04721',  # Synaptic vesicle cycle
    'ko04722',  # Neurotrophin signaling pathway
    'ko04723',  # Retrograde endocannabinoid signaling
    'ko05340',  # Primary immunodeficiency
    'ko05414',  # Dilated cardiomyopathy
    'ko05416',  # Viral myocarditis
    'ko05410',  # Hypertrophic cardiomyopathy (HCM)
    'ko05412',  # Arrhythmogenic right ventricular cardiomyopathy (ARVC)
    'ko04130',  # SNARE interactions in vesicular transport
    'ko05145',  # Toxoplasmosis
    'ko05144',  # Malaria
    'ko04350',  # TGF-beta signaling pathway
    'ko04210',  # Apoptosis
    'ko05200',  # Pathways in cancer
    'ko05202',  # Transcriptional misregulation in cancer
    'ko05203',  # Viral carcinogenesis
    'ko05020',  # Prion diseases
    'ko05143',  # African trypanosomiasis
    'ko05142',  # Chagas disease (American trypanosomiasis)
    'ko05140',  # Leishmaniasis
    'ko00072',  # Synthesis and degradation of ketone bodies
    'ko00073',  # Cutin, suberine and wax biosynthesis
    'ko05330',  # Allograft rejection
    'ko04912',  # GnRH signaling pathway
    'ko05332',  # Graft-versus-host disease
    'ko04910',  # Insulin signaling pathway
    'ko04916',  # Melanogenesis
    'ko04914',  # Progesterone-mediated oocyte maturation
    'ko04672',  # Intestinal immune network for IgA production
    'ko04670',  # Leukocyte transendothelial migration
    'ko04660',  # T cell receptor signaling pathway
    'ko00603',  # Glycosphingolipid biosynthesis - globo series
    'ko00601',  # Glycosphingolipid biosynthesis - lacto and neolacto series
    'ko00600',  # Sphingolipid metabolism
    'ko00604',  # Glycosphingolipid biosynthesis - ganglio series
    'ko04070',  # Phosphatidylinositol signaling system
    'ko04075',  # Plant hormone signal transduction
    'ko05100',  # Bacterial invasion of epithelial cells
    'ko05204',  # Chemical carcinogenesis
    'ko04920',  # Adipocytokine signaling pathway
    'ko04080',  # Neuroactive ligand-receptor interaction
    'ko04320',  # Dorso-ventral axis formation
    'ko04730',  # Long-term depression
    'ko04950',  # Maturity onset diabetes of the young
    'ko04630',  # Jak-STAT signaling pathway
    'ko04742',  # Taste transduction
    'ko04740',  # Olfactory transduction
    'ko04744',  # Phototransduction
    'ko04745',  # Phototransduction - fly
    'ko04512',  # ECM-receptor interaction
    'ko03460',  # Fanconi anemia pathway
    'ko04360',  # Axon guidance
    'ko05219',  # Bladder cancer
    'ko05218',  # Melanoma
    'ko05217',  # Basal cell carcinoma
    'ko05216',  # Thyroid cancer
    'ko05215',  # Prostate cancer
    'ko05214',  # Glioma
    'ko05213',  # Endometrial cancer
    'ko05212',  # Pancreatic cancer
    'ko05211',  # Renal cell carcinoma
    'ko05210',  # Colorectal cancer
    'ko05034',  # Alcoholism
    'ko05033',  # Nicotine addiction
    'ko05032',  # Morphine addiction
    'ko05031',  # Amphetamine addiction
    'ko05030',  # Cocaine addiction
    'ko05134',  # Legionellosis
    'ko05132',  # Salmonella infection
    'ko05133',  # Pertussis
    'ko05130',  # Pathogenic Escherichia coli infection
    'ko05131',  # Shigellosis
    # Too big
    'ko01100',  # Metabolic pathways
    'ko01110',  # Biosynthesis of secondary metabolites
    'ko01120',  # Microbial metabolism in diverse environments
    'ko04662',  # B cell receptor signaling pathway
    'ko04966',  # Collecting duct acid secretion
    'ko05322',  # Systemic lupus erythematosus
    'ko04964',  # Proximal tubule bicarbonate reclamation
    'ko05320',  # Autoimmune thyroid disease
    'ko04962',  # Vasopressin-regulated water reabsorption
    'ko04960',  # Aldosterone-regulated sodium reabsorption
    'ko04961',  # Endocrine and other factor-regulated calcium reabsorption
    'ko04380',  # Osteoclast differentiation
    'ko04664',  # Fc epsilon RI signaling pathway
    'ko04666',  # Fc gamma R-mediated phagocytosis
    'ko04062',  # Chemokine signaling pathway
    'ko04060',  # Cytokine-cytokine receptor interaction
    'ko04066',  # HIF-1 signaling pathway
    'ko04064',  # NF-kappa B signaling pathway
    'ko05323',  # Rheumatoid arthritis
    'ko00140',  # Steroid hormone biosynthesis
]
def download_data(fname='kegg.pickle', contact=None):
    """
    .. deprecated:: 0.3.4

    Downloads pathways, KOs, reactions and compounds from Kegg, links them
    together in a :class:`KeggData` instance and saves it to disk.

    Arguments:
        fname (str): name of the file the data is saved to
        contact: set as the `contact` attribute of the REST client

    .. note::

        NOTE(review): this function calls `get_kos_descriptions`,
        `get_reactions_descriptions`, `get_compounds_descriptions` and
        `link_ids` on the client and builds reactions with
        `KeggReaction(rn_id, name)`, using their `cp_in`/`cp_out`
        attributes. None of these match the classes visible in this
        module, confirm before relying on it.
    """
    kclient = KeggClientRest()
    kclient.contact = contact
    kdata = KeggData()

    LOG.info("Downloading pathway list")
    path_names = kclient.get_ortholog_pathways()
    LOG.info("Found %d pathways", len(path_names))

    LOG.info("Downloading KO descriptions")
    ko_names = kclient.get_kos_descriptions()
    LOG.info("Found %d KOs", len(ko_names))

    LOG.info("Downloading reactions descriptions")
    rn_names = kclient.get_reactions_descriptions()
    LOG.info("Found %d reactions", len(rn_names))

    LOG.info("Downloading compounds descriptions")
    cpd_names = kclient.get_compounds_descriptions()
    LOG.info("Found %d compounds", len(cpd_names))

    # objects are shared between pathways/KOs/reactions, one per ID
    kos = {}
    rns = {}
    cps = {}

    LOG.info("Downloading links pathway-ko (%d)", len(path_names))
    path_links = kclient.link_ids('ko', path_names.keys())
    for path_id, ko_list in viewitems(path_links):
        path = KeggPathway(path_id, path_names[path_id])
        for ko_id in ko_list:
            try:
                name = ko_names[ko_id]
            except KeyError:
                # in this case the actual entry for this gene doesn't exists
                LOG.warning(
                    "KO %s not found in the descriptions, skipping", ko_id
                )
                continue
            try:
                ko = kos[ko_id]
            except KeyError:
                ko = KeggOrtholog(ko_id, name)
                kos[ko_id] = ko
            path[ko_id] = ko
        kdata.pathways[path_id] = path

    LOG.info("Downloading links ko-reactions (%d)", len(ko_names))
    ko_links = kclient.link_ids('rn', ko_names.keys())
    for ko_id, rn_list in viewitems(ko_links):
        try:
            ko = kos[ko_id]
        except KeyError:
            LOG.warning("KO %s is not found in a pathway, skipping", ko_id)
            continue
        for rn_id in rn_list:
            try:
                name = rn_names[rn_id]
            except KeyError:
                LOG.warning(
                    "Reaction %s not found in the descriptions, skipping",
                    rn_id
                )
                # actually skip it, otherwise the name of the previous
                # reaction (or an unassigned variable) would be used
                continue
            try:
                rn = rns[rn_id]
            except KeyError:
                rn = KeggReaction(rn_id, name)
                rns[rn_id] = rn
            ko[rn_id] = rn

    LOG.info("Downloading links reactions-compounds (%d)", len(rn_names))
    # a list, because the IDs are sliced in chunks by the client
    cp_links = kclient.get_reaction_equations(list(rn_names))
    for rn_id, cp_dict in viewitems(cp_links):
        try:
            rn = rns[rn_id]
        except KeyError:
            LOG.warning(
                "Reaction %s is not found in a pathway, skipping", rn_id
            )
            continue
        for cp_id in cp_dict['in'] + cp_dict['out']:
            try:
                name = cpd_names[cp_id]
            except KeyError:
                # the compound ID is the one that is missing, not the
                # reaction one
                LOG.warning(
                    "Compound %s not found in the descriptions, skipping",
                    cp_id
                )
                continue
            try:
                cp = cps[cp_id]
            except KeyError:
                cp = KeggCompound(cp_id, name)
                cps[cp_id] = cp
            if (cp_id in cp_dict['in']) and (cp_id in cp_dict['out']):
                LOG.debug(
                    "Compound %s in both side of reaction %s", cp_id, rn_id
                )
            if cp_id in cp_dict['in']:
                rn.cp_in[cp_id] = cp
            if cp_id in cp_dict['out']:
                rn.cp_out[cp_id] = cp

    kdata.save_data(fname)
class KeggModule(object):
    """
    .. versionadded:: 0.1.13

    Used to extract information from a pathway module entry in Kegg

    The entry, as a string, can be either passed at instance creation or with
    :meth:`KeggModule.parse_entry`

    Attributes:
        entry (str): module ID (M followed by 5 digits)
        name (str): first line of the NAME field
        classes (list): first line of the CLASS field, split on '; '
        compounds (list): compound IDs from the COMPOUND field
        reactions (list): one element per line of the REACTION field, as
            returned by :meth:`parse_reaction`
        _orthologs (list): first line of the DEFINITION field, split on
            spaces
        _reactions (list): unparsed lines of the REACTION field
    """
    entry = ''
    name = ''
    classes = None
    compounds = None
    _orthologs = None
    _reactions = None
    reactions = None

    def __init__(self, entry=None, old=False):
        """
        .. versionchanged:: 0.3.0
            added *old* parameter, to use the old parser

        Arguments:
            entry (str): module entry, if None nothing is parsed
            old (bool): if True :meth:`parse_entry` is used instead of
                :meth:`parse_entry2`
        """
        if entry is None:
            return
        if old:
            self.parse_entry(entry)
        else:
            self.parse_entry2(entry)

    @staticmethod
    def _split_fields(entry):
        "Returns a dictionary field name -> list of (stripped) field lines"
        fields = {}
        curr_field = ''
        for line in entry.splitlines():
            if line.startswith(' '):
                # continuation of the current field
                fields[curr_field].append(line.strip())
            elif line.startswith('///'):
                continue
            else:
                curr_field = line.split(' ')[0]
                # only the leading field name is removed; str.replace would
                # also remove the same word from the value
                fields[curr_field] = [line[len(curr_field):].strip()]
        return fields

    def _set_common(self, fields):
        "Sets the attributes that are parsed in the same way by both parsers"
        self.entry = re.search(r"(M\d{5})\s+.+", fields['ENTRY'][0]).group(1)
        self.name = fields['NAME'][0]
        self.classes = fields['CLASS'][0].split('; ')
        self.compounds = [
            re.search(r"(C\d{5})\s+.+", line).group(1)
            for line in fields['COMPOUND']
        ]

    def parse_entry(self, entry):
        """
        Parses a Kegg module entry and change the instance values. By default
        the reactions IDs are substituted with the KO IDs

        Raises:
            KeyError: if one of the ENTRY, NAME, CLASS, COMPOUND, DEFINITION
                or REACTION fields is missing
        """
        fields = self._split_fields(entry)
        self._set_common(fields)
        orthologs = fields['DEFINITION'][0].split(' ')
        # each reaction line is paired with one element of the definition
        self.reactions = [
            self.parse_reaction(reaction, ko_ids)
            for ko_ids, reaction in zip(orthologs, fields['REACTION'])
        ]
        self._orthologs = orthologs
        self._reactions = fields['REACTION']

    def parse_entry2(self, entry):
        """
        .. versionadded:: 0.3.0

        Parses a Kegg module entry and change the instance values. By default
        the reactions IDs are NOT substituted with the KO IDs.

        Raises:
            KeyError: if one of the ENTRY, NAME, CLASS, COMPOUND, DEFINITION
                or REACTION fields is missing
        """
        fields = self._split_fields(entry)
        self._set_common(fields)
        self.reactions = [
            self.parse_reaction(reaction, ko_ids=None)
            for reaction in fields['REACTION']
        ]
        self._orthologs = fields['DEFINITION'][0].split(' ')
        self._reactions = fields['REACTION']

    @staticmethod
    def parse_reaction(line, ko_ids=None):
        """
        .. versionchanged:: 0.3.0
            cleaned the parsing

        parses the lines with the reactions and substitute reaction IDs with
        the corresponding KO IDs if provided

        Arguments:
            line (str): reaction line, reaction IDs followed by
                'compounds -> compounds'
            ko_ids (str): if not None, the IDs extracted from it are
                returned instead of the reaction IDs (as a list)

        Returns:
            tuple: (IDs, (left compounds, right compounds)), compounds are
            tuples of strings
        """
        # line = 'R00294,R02492,R09446,R09808,R09809  C00533 -> C00887'
        # ko_ids = '(K00370+K00371+K00374+K00373,K02567+K02568)'
        # IDs and reaction may be separated by one or more spaces
        rn_ids, reaction = line.split(None, 1)
        rn_ids = tuple(x.strip() for x in rn_ids.replace('+', ',').split(','))
        comp1, comp2 = reaction.split(' -> ')
        comp1 = comp1.replace('(spontaneous)', '')
        comp2 = comp2.replace('(spontaneous)', '')
        comp1 = tuple(x.strip() for x in comp1.replace('+', ',').split(','))
        comp2 = tuple(x.strip() for x in comp2.replace('+', ',').split(','))
        if ko_ids is not None:
            for old, new in (('+', ','), ('-', ','), ('(', ''), (')', '')):
                ko_ids = ko_ids.replace(old, new)
            rn_ids = ko_ids.split(',')
        return rn_ids, (comp1, comp2)

    @property
    def first_cp(self):
        "Returns the first compound in the module"
        return self.reactions[0][1][0][0]

    @property
    def last_cp(self):
        """
        Returns the last compound in the module (first compound on the right
        side of the last reaction)
        """
        return self.reactions[-1][-1][-1][0]

    def to_edges(self, id_only=None):
        """
        .. versionchanged:: 0.3.0
            added id_only and changed to reflect changes in :attr:`reactions`

        Returns the reactions as edges that can be supplied to make graph.

        Arguments:
            id_only (None, iterable): if None the returned edges are for the
                whole module, if an iterable (converted to a :class:`set`),
                only edges for those reactions are returned

        Yield:
            tuple: the elements are the compounds and reactions in the module
        """
        if id_only is not None:
            id_only = set(id_only)
        # no edges if nothing was parsed
        for rn_ids, (comp1s, comp2s) in self.reactions or ():
            for rn_id in rn_ids:
                if (id_only is not None) and (rn_id not in id_only):
                    continue
                for comp1 in comp1s:
                    yield (comp1, rn_id)
                for comp2 in comp2s:
                    yield (rn_id, comp2)

    def find_submodules(self):
        """
        .. versionadded:: 0.3.0

        Returns the possible submodules, as a list of tuples where the
        elements are the first and last compounds in a submodule. The list
        is empty if the module has no reactions.
        """
        sub_modules = []
        # [left compounds of the first reaction, right compounds of the last]
        sub_module = None
        for rn_ids, (left_cpds, right_cpds) in self.reactions or ():
            if sub_module is not None:
                if set(sub_module[1]) & set(left_cpds):
                    # the reaction continues the chain, extend it
                    sub_module[1] = right_cpds
                    continue
                sub_modules.append((sub_module[0][0], sub_module[-1][-1]))
            sub_module = [left_cpds, right_cpds]
        if sub_module is not None:
            sub_modules.append((sub_module[0][0], sub_module[-1][-1]))
        return sub_modules
def parse_reaction(line, prefix=('C', 'G')):
    """
    .. versionadded:: 0.3.1

    Parses a reaction equation from Kegg, returning the left and right
    components. Needs testing

    Only the IDs (one of the prefixes followed by 5 digits) are returned,
    anything else in the equation, like coefficients, is ignored.

    Arguments:
        line (str): reaction string, with or without the leading 'EQUATION'
        prefix (iterable): prefixes of the IDs to extract from the equation

    Returns:
        tuple: left and right components as `sets`

    Raises:
        ValueError: if the equation does not contain '<=>'. The message is
            '>>>' if '=>' is found, '<<<' if '<=' is found and '???'
            otherwise
    """
    line = line.replace('EQUATION', '').strip()
    if '<=>' not in line:
        if '=>' in line:
            raise ValueError('>>>')
        if '<=' in line:
            raise ValueError('<<<')
        raise ValueError('???')

    # matching the IDs, instead of dropping the first character of the
    # elements that don't start with 'C', keeps IDs with other prefixes
    # intact and ignores coefficients of any length
    id_re = re.compile(
        r'(?:{0})\d{{5}}'.format('|'.join(re.escape(x) for x in prefix))
    )
    sides = line.split('<=>')
    left = set(id_re.findall(sides[0]))
    right = set(id_re.findall(sides[1]))
    return left, right