Source code for indra_db.client.statements

import json
import logging
import warnings
from sqlalchemy import or_
from collections import defaultdict

from indra_db import get_ro
from indra_db.client import HasAgent
from indra_db.client.readonly.query import EmptyQuery, HasType, Query, \
    HasHash, FromPapers

# Module-level logger shared by every function in this module.
# NOTE(review): the logger is keyed on the file path (`__file__`) rather than
# the conventional dotted module name (`__name__`) -- confirm whether any
# logging configuration relies on this name before changing it.
logger = logging.getLogger(__file__)

from indra.databases import hgnc_client
from indra.util import batch_iter, clockit
from indra.statements import Unresolved, Evidence, Statement

from indra_db.util import get_db, get_raw_stmts_frm_db_list, \
    get_statement_object


def get_statements_by_gene_role_type(agent_id=None, agent_ns='HGNC-SYMBOL',
                                     role=None, stmt_type=None, count=1000,
                                     db=None, do_stmt_count=False,
                                     preassembled=True, fix_refs=True,
                                     with_evidence=True, with_support=False,
                                     essentials_only=False):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and
    indra_db.client.principal.raw_statements for alternatives.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC-SYMBOL', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
    agent_ns : str
        Namespace for the identifier given in `agent_id`. Default is
        'HGNC-SYMBOL'.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of
        `Complex`, `SelfModification`, and `ActiveForm` Statements).
    stmt_type : str
        Name of the Statement class.
    count : int
        (DEPRECATED) Not used by this implementation.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
        If None, the 'primary' readonly database is used.
    do_stmt_count : bool
        (DEPRECATED) Not used by this implementation.
    preassembled : bool
        (DEPRECATED) Must be True (the default). Passing False raises a
        DeprecationWarning, as raw statement lookup is no longer supported
        here.
    fix_refs : bool
        (DEPRECATED) Not used by this implementation.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. If False, the query is run with an evidence limit of 0.
    with_support : bool
        (DEPRECATED) Must be False (the default). Passing True raises a
        DeprecationWarning.
    essentials_only : bool
        (DEPRECATED) Must be False (the default). Passing True raises a
        DeprecationWarning.

    Returns
    -------
    list of Statements from the database corresponding to the query. An
    empty list is returned if `agent_id` is not a valid HGNC symbol.

    Raises
    ------
    DeprecationWarning
        If `preassembled` is False, or `essentials_only` or `with_support`
        is True.
    ValueError
        If none of `agent_id`, `role`, or `stmt_type` is given, or if the
        given arguments do not produce a usable query.
    """
    # stacklevel=2 attributes the warning to the caller, so that it is
    # reported against the code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)

    # Validate the arguments before touching the database, so that an
    # unsupported call never opens a connection.
    if not preassembled:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    if not (agent_id or role or stmt_type):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    # Translate an HGNC gene symbol into its HGNC ID.
    if agent_id and agent_ns == 'HGNC-SYMBOL':
        hgnc_symbol = agent_id
        agent_id = hgnc_client.get_hgnc_id(hgnc_symbol)
        if not agent_id:
            logger.warning('Invalid gene name: %s', hgnc_symbol)
            return []
        agent_ns = 'HGNC'

    # Build up the query from whichever constraints were given.
    query = EmptyQuery()
    if agent_id:
        query &= HasAgent(agent_id, agent_ns, role)
    if stmt_type:
        query &= HasType([stmt_type])

    # NOTE(review): presumably this rejects the case where only `role` was
    # given and `query` is still the EmptyQuery placeholder -- confirm that
    # EmptyQuery is not a Query subclass.
    if not isinstance(query, Query):
        raise ValueError("Either agent_id or stmt_type must be given.")

    if essentials_only:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "Similar features are available in "
                                 "indra_db.client.readonly.query, especially "
                                 "the `get_interactions` methods.")

    # An evidence limit of None applies no limit; 0 omits evidence.
    ev_lim = None if with_evidence else 0

    if with_support:
        raise DeprecationWarning("This feature is not supported at this "
                                 "time, and was never truly supported.")

    if db is None:
        db = get_ro('primary')

    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()
def get_statements_by_paper(id_list, id_type='pmid', db=None,
                            preassembled=True):
    """Look up the pre-assembled statements drawn from a set of papers.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and
    indra_db.client.principal.raw_statements for alternatives.

    Parameters
    ----------
    id_list : list or set
        The paper ids (ints or strs), all of the type given by `id_type`.
    id_type : str
        The kind of id in `id_list` (default is pmid). Options include pmid,
        pmcid, doi, pii, url, or manuscript_id. Note that pmid is generally
        the best means of getting a paper.
    db : :py:class:`DatabaseManager`
        Optional database manager to use in place of the 'primary' readonly
        database, for example a local database instance.
    preassembled : bool
        Must be True (the default); passing False raises a
        DeprecationWarning.

    Returns
    -------
    stmt_dict : dict
        Lists of Statements keyed by the `id_type` entry of the text refs of
        each of their evidence. A statement is appended once per evidence,
        so it appears under every paper it references. The key is None for
        evidence whose text refs have no entry for `id_type`, and keys may
        include ids that were not in `id_list`.
    """
    deprecation_msg = (
        'This module is being taken out of service, as the tools '
        'have become deprecated. Moreover, the service has been '
        're-implemented to use newer tools as best as possible, '
        'but some results may be subtly different.'
    )
    warnings.warn(deprecation_msg, DeprecationWarning)

    if not preassembled:
        raise DeprecationWarning("This functionality is not longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements by paper.")

    if not db:
        db = get_ro('primary')

    paper_refs = [(id_type, paper_id) for paper_id in id_list]
    result = FromPapers(paper_refs).get_statements(db)

    # Group the statements by the paper id found on each piece of evidence.
    stmts_by_paper = {}
    for stmt in result.statements():
        for evidence in stmt.evidence:
            paper_key = evidence.text_refs.get(id_type)
            stmts_by_paper.setdefault(paper_key, []).append(stmt)
    return stmts_by_paper
@clockit
def get_statements(clauses, count=1000, do_stmt_count=False, db=None,
                   preassembled=True, with_support=False, fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query. The
        list is not modified.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages. Only used when `preassembled` is
        False.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. Generally results in slower
        queries. Only used when `preassembled` is True.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer. Only used when `preassembled` is True.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set to the correct PMIDs, or None if no PMID is
        available. If False, the `pmid` field defaults to the value populated
        by the reading system.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_db('primary')

    if not preassembled:
        stmts = []
        q = db.filter_query('raw_statements', *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements", num_stmts)
        db_stmts = q.yield_per(count)
        for subset in batch_iter(db_stmts, count):
            stmts.extend(get_raw_stmts_frm_db_list(db, subset, with_sids=False,
                                                   fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements", len(stmts), num_stmts)
            else:
                logger.info("%d statements", len(stmts))
    else:
        logger.info("Getting preassembled statements.")
        if with_evidence:
            # Get pairs of pa statements with their linked raw statements.
            # A new list is built here rather than using `+=`, which would
            # extend the caller's `clauses` list in place.
            link_clauses = list(clauses) + [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *link_clauses, yield_per=count)
            stmt_dict = _process_pa_statement_res_wev(db, pa_raw_stmt_pairs,
                                                      count=count,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw
            # statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses,
                                     yield_per=count)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=count)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements.", len(stmts))

    return stmts
@clockit
def _process_pa_statement_res_wev(db, stmt_iterable, count=1000,
                                  fix_refs=True):
    """Build pa Statements, with evidence, from (pa, raw) db object pairs.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        The database manager handed on to `get_raw_stmts_frm_db_list`.
    stmt_iterable : iterable
        Yields pairs of a pa statement db object and a raw statement db
        object linked to it.
    count : int
        The number of pairs handled in each batch.
    fix_refs : bool
        Handed on to `get_raw_stmts_frm_db_list`.

    Returns
    -------
    dict of Statements keyed by the `mk_hash` of their pa statement db
    object. The evidence list of each is replaced by the first evidence of
    each of its linked raw statements.
    """
    deprecation_msg = (
        'This module is being taken out of service, as the tools '
        'have become deprecated. Moreover, the service has been '
        're-implemented to use newer tools as best as possible, '
        'but some results may be subtly different.'
    )
    warnings.warn(deprecation_msg, DeprecationWarning)

    pa_stmts_by_hash = {}
    raw_ids_by_hash = {}
    raw_stmts_by_id = {}
    num_evidence = 0
    for pair_batch in batch_iter(stmt_iterable, count):
        # Create each pa Statement the first time its hash is seen, and
        # record which raw statement ids are linked to it.
        batch_raw_objs = []
        for pa_db_obj, raw_db_obj in pair_batch:
            mk_hash = pa_db_obj.mk_hash
            if mk_hash not in pa_stmts_by_hash:
                pa_stmts_by_hash[mk_hash] = get_statement_object(pa_db_obj)
                raw_ids_by_hash[mk_hash] = []
            raw_ids_by_hash[mk_hash].append(raw_db_obj.id)
            batch_raw_objs.append(raw_db_obj)
            num_evidence += 1

        logger.info("Up to %d pa statements, with %d pieces of "
                    "evidence in all." % (len(pa_stmts_by_hash),
                                          num_evidence))

        # Create the raw Statements for this batch, keyed by statement id.
        sid_stmt_pairs = get_raw_stmts_frm_db_list(db, batch_raw_objs,
                                                   fix_refs, with_sids=True)
        raw_stmts_by_id.update({sid: stmt for sid, stmt in sid_stmt_pairs})
        logger.info("Processed %d raw statements." % len(sid_stmt_pairs))

    # Give each pa Statement the evidence of its raw statements.
    logger.info("Inserting evidence.")
    for mk_hash, raw_ids in raw_ids_by_hash.items():
        evidence = []
        for raw_id in raw_ids:
            evidence.append(raw_stmts_by_id[raw_id].evidence[0])
        pa_stmts_by_hash[mk_hash].evidence = evidence
    return pa_stmts_by_hash


@clockit
def _process_pa_statement_res_nev(stmt_iterable, count=1000):
    """Build pa Statements, without evidence, from pa statement db objects.

    Parameters
    ----------
    stmt_iterable : iterable
        Yields pa statement db objects.
    count : int
        The number of db objects handled in each batch.

    Returns
    -------
    dict of Statements keyed by the `mk_hash` of their db object. Only the
    first db object seen for a given hash is converted.
    """
    deprecation_msg = (
        'This module is being taken out of service, as the tools '
        'have become deprecated. Moreover, the service has been '
        're-implemented to use newer tools as best as possible, '
        'but some results may be subtly different.'
    )
    warnings.warn(deprecation_msg, DeprecationWarning)

    pa_stmts_by_hash = {}
    for db_obj_batch in batch_iter(stmt_iterable, count):
        for pa_db_obj in db_obj_batch:
            mk_hash = pa_db_obj.mk_hash
            if mk_hash in pa_stmts_by_hash:
                continue
            pa_stmts_by_hash[mk_hash] = get_statement_object(pa_db_obj)

        logger.info("Up to %d pa statements in all." % len(pa_stmts_by_hash))
    return pa_stmts_by_hash
@clockit
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statements`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the "primary"
        database, as defined in the db_config.ini file in .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the
        database. If True (default), the `pmid` field of each Statement
        Evidence object is set from the database. If False, the `pmid` field
        defaults to the value populated by the reading system.
    use_views : bool
        If True (default), the evidence is loaded from the raw JSON in
        `db.FastRawPaLink`. If False, it is loaded from the raw statements
        linked through `db.RawUniqueLinks`.

    Returns
    -------
    None - modifications are made to the Statements "in-place".

    Raises
    ------
    AssertionError
        If a raw statement JSON does not have exactly one evidence.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)

    # Turn the list into a dict.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}
    if not stmt_dict:
        # Nothing to fill in, so there is no reason to touch the database.
        return

    if db is None:
        db = get_db('primary')

    if use_views:
        # The reading id is only needed if the pmids are to be fixed.
        link_cols = [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json]
        if fix_refs:
            link_cols.append(db.FastRawPaLink.reading_id)
        raw_links = db.select_all(
            link_cols,
            db.FastRawPaLink.mk_hash.in_(stmt_dict.keys())
        )

        # Number of distinct unresolved reading ids to accumulate before
        # looking up their pmids in a single query.
        ref_batch_size = 1000
        pmid_by_rid = {}
        pending_evs_by_rid = defaultdict(list)
        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info

            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            # Raised explicitly (rather than with `assert`) so that the check
            # still runs when Python is started with -O.
            if len(ev_json) != 1:
                raise AssertionError("Raw statements must have one evidence, "
                                     "got %d." % len(ev_json))
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)

            if not fix_refs:
                continue

            if rid in pmid_by_rid:
                # This reading id was already looked up (its pmid may
                # legitimately be None).
                ev.pmid = pmid_by_rid[rid]
                continue

            pending_evs_by_rid[rid].append(ev)
            if len(pending_evs_by_rid) >= ref_batch_size:
                _resolve_pending_pmids(db, pending_evs_by_rid, pmid_by_rid)

        # Resolve the final partial batch; without this, the evidence left
        # pending at the end of the loop would never get its pmid set.
        if pending_evs_by_rid:
            _resolve_pending_pmids(db, pending_evs_by_rid, pmid_by_rid)
    else:
        # Get the data from the database
        raw_list = list(db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id
        ))
        if not raw_list:
            # `zip(*[])` yields nothing to unpack, so stop here.
            return

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db, raw_stmt_objs, fix_refs,
                                              with_sids=False)

        # Now attach the evidence
        for mk_hash, raw_stmt in zip(mk_hashes, raw_stmts):
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])

    return


def _resolve_pending_pmids(db, pending_evs_by_rid, pmid_by_rid):
    """Set the pmid of all pending evidence from `db.ReadingRefLink`.

    Parameters
    ----------
    db : DatabaseManager instance
        The database manager used to run the lookup.
    pending_evs_by_rid : dict
        Lists of Evidence objects keyed by reading id. It is emptied by this
        function; evidence whose reading id is not found keeps its pmid.
    pmid_by_rid : dict
        Cache of pmids keyed by reading id, updated with the rows found.
    """
    ref_rows = db.select_all(
        [db.ReadingRefLink.pmid, db.ReadingRefLink.rid],
        db.ReadingRefLink.rid.in_(pending_evs_by_rid.keys())
    )
    for pmid, rid in ref_rows:
        pmid_by_rid[rid] = pmid
        for ev in pending_evs_by_rid.get(rid, ()):
            ev.pmid = pmid
    pending_evs_by_rid.clear()
def get_statements_from_hashes(statement_hashes, preassembled=True, db=None,
                               **kwargs):
    """Retrieve statement objects given only statement hashes.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and
    indra_db.client.principal.raw_statements for alternatives.

    Parameters
    ----------
    statement_hashes : list
        The hashes of the statements to retrieve, passed to `HasHash`.
    preassembled : bool
        Must be True (the default). Passing False raises a
        DeprecationWarning, as raw statement lookup is no longer supported
        here.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager to use. If None, the 'primary'
        readonly database is used.
    **kwargs
        Only `with_evidence` is examined: if it is given as exactly False,
        the statements are retrieved with an evidence limit of 0. Any other
        keyword arguments are ignored.

    Returns
    -------
    The statements produced by the query for the given hashes.

    Raises
    ------
    DeprecationWarning
        If `preassembled` is False.
    """
    # stacklevel=2 attributes the warning to the caller, so that it is
    # reported against the code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)

    # Reject the unsupported mode before opening a database connection.
    if not preassembled:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    if db is None:
        db = get_ro('primary')

    query = HasHash(statement_hashes)
    # An evidence limit of None applies no limit; 0 omits evidence.
    ev_lim = 0 if kwargs.get('with_evidence') is False else None
    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()
def get_support(statements, db=None, recursive=False):
    """Populate the supports and supported_by lists of the given statements.

    Parameters
    ----------
    statements : list[Statement] or dict
        The statements to populate, either as an iterable of Statements or
        as a dict of Statements keyed by their shallow hash.
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the "primary"
        database.
    recursive : bool
        Currently ignored; recursive mode is not implemented.

    Returns
    -------
    None - modifications are made to the Statements "in-place". A statement
    on the other end of a support link that is not among `statements` is
    represented by an `Unresolved` statement carrying its shallow hash.

    Raises
    ------
    AssertionError
        If a statement is found to support itself.
    """
    # stacklevel=2 attributes the warning to the caller, so that it is
    # reported against the code that actually needs to migrate.
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning, stacklevel=2)
    # TODO: Allow recursive mode (argument should probably be an integer level)

    if isinstance(statements, dict):
        stmt_dict = statements
    else:
        stmt_dict = {s.get_hash(shallow=True): s for s in statements}

    if not stmt_dict:
        # Nothing to populate, so there is no reason to touch the database.
        return

    if db is None:
        db = get_db('primary')

    logger.info("Populating support links.")
    support_links = db.select_all(
        [db.PASupportLinks.supported_mk_hash,
         db.PASupportLinks.supporting_mk_hash],
        or_(db.PASupportLinks.supported_mk_hash.in_(stmt_dict.keys()),
            db.PASupportLinks.supporting_mk_hash.in_(stmt_dict.keys()))
    )

    # A link is selected twice if both of its ends are among the given
    # statements, hence the de-duplication with `set`.
    for supped_hash, supping_hash in set(support_links):
        # Raised explicitly (rather than with `assert False`) so that the
        # check still runs when Python is started with -O.
        if supped_hash == supping_hash:
            raise AssertionError('Self-support found on-load.')

        supped_stmt = stmt_dict.get(supped_hash)
        if supped_stmt is None:
            supped_stmt = Unresolved(shallow_hash=supped_hash)

        supping_stmt = stmt_dict.get(supping_hash)
        if supping_stmt is None:
            supping_stmt = Unresolved(shallow_hash=supping_hash)

        supped_stmt.supported_by.append(supping_stmt)
        supping_stmt.supports.append(supped_stmt)
    return