Source code for indra_db.client.statements

import json
import logging
import warnings
from sqlalchemy import or_
from collections import defaultdict

from indra_db import get_ro
from indra_db.client import HasAgent
from indra_db.client.readonly.query import EmptyQuery, HasType, Query, \
    HasHash, FromPapers

logger = logging.getLogger(__file__)

from indra.databases import hgnc_client
from indra.util import batch_iter, clockit
from indra.statements import Unresolved, Evidence, Statement

from indra_db.util import get_db, get_raw_stmts_frm_db_list, \
    get_statement_object


[docs]def get_statements_by_gene_role_type(agent_id=None, agent_ns='HGNC-SYMBOL',
                                     role=None, stmt_type=None, count=1000,
                                     db=None, do_stmt_count=False,
                                     preassembled=True, fix_refs=True,
                                     with_evidence=True, with_support=False,
                                     essentials_only=False):
    """Get statements from the DB by stmt type, agent, and/or agent role.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.

    Parameters
    ----------
    agent_id : str
        String representing the identifier of the agent from the given
        namespace. Note: if the agent namespace argument, `agent_ns`, is set
        to 'HGNC-SYMBOL', this function will treat `agent_id` as an HGNC gene
        symbol and perform an internal lookup of the corresponding HGNC ID.
        Default is 'HGNC-SYMBOL'.
    agent_ns : str
        Namespace for the identifier given in `agent_id`.
    role : str
        String corresponding to the role of the agent in the statement.
        Options are 'SUBJECT', 'OBJECT', or 'OTHER' (in the case of `Complex`,
        `SelfModification`, and `ActiveForm` Statements).
    stmt_type : str
        Name of the Statement class.
    count : int (DEPRECATED)
        Number of statements to retrieve in each batch (passed to
        :py:func:`get_statements`).
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local databse instance.
    do_stmt_count : bool (DEPRECATED)
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    preassembled : bool (DEPRECATED)
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool (DEPRECATED)
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. Generally results in slower
        queries. DEFAULT IS CURRENTLY False.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer.
    fix_refs : bool (DEPRECATED)
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.
    essentials_only : bool (DEPRECATED)
        Default is False. If True, retrieve only some metadata regarding the
        statements. Implicitly `with_support`, `with_evidence`, `fix_refs`, and
        `do_stmt_count` are all False, as none of the relevant features apply.

    Returns
    -------
    if essentials_only is False:
        list of Statements from the database corresponding to the query.
    else:
        list of tuples containing basic data from the statements.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_ro('primary')

    if not preassembled:
        raise DeprecationWarning("This functionality is not longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    if not (agent_id or role or stmt_type):
        raise ValueError('At least one of agent_id, role, or stmt_type '
                         'must be specified.')

    if agent_id and agent_ns == 'HGNC-SYMBOL':
        hgnc_symbol = agent_id
        agent_id = hgnc_client.get_hgnc_id(hgnc_symbol)
        if not agent_id:
            logger.warning('Invalid gene name: %s' % hgnc_symbol)
            return []
        agent_ns = 'HGNC'

    query = EmptyQuery()
    if agent_id:
        query &= HasAgent(agent_id, agent_ns, role)
    if stmt_type:
        query &= HasType([stmt_type])

    if not isinstance(query, Query):
        raise ValueError("Either agent_id or stmt_type must be given.")

    if essentials_only:
        raise DeprecationWarning("This functionality is no longer supported. "
                                 "Similar features are available in"
                                 "indra_db.client.readonly.query, especially "
                                 "the `get_interactions` methods.")
    if with_evidence:
        ev_lim = None
    else:
        ev_lim = 0

    if with_support:
        raise DeprecationWarning("This feature is not supported at this "
                                 "time, and was never truly supported.")

    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()


[docs]def get_statements_by_paper(id_list, id_type='pmid', db=None,
                            preassembled=True):
    """Get the statements from a list of paper ids.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.

    Parameters
    ----------
    id_list : list or set
        A list of ints or strs that are ids of papers of type `id_type`.
    id_type : str
        The type of id used (default is pmid). Options include pmid, pmcid,
        doi, pii, url, or manuscript_id. Note that pmid is generally the
        best means of getting a paper.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local databse instance.
    preassembled : bool
        If True, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.

    Returns
    -------
    stmt_dict : dict
        A dict of Statements from the database keyed the paper id given. Papers
        that yielded no statements are not included. If `preassembled` is True,
        there may be ids which were not present in the original dataset, and
        there may be a key None for statements that has evidence from refs that
        did not have that id_type of reference.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if not preassembled:
        raise DeprecationWarning("This functionality is not longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements by paper.")
    if not db:
        db = get_ro('primary')

    query = FromPapers([(id_type, pid) for pid in id_list])
    result = query.get_statements(db)

    # Get the Statement object from the jsons. A statement shows up for
    # all papers that it references.
    result_dict = defaultdict(list)
    for stmt in result.statements():
        for ev in stmt.evidence:
            result_dict[ev.text_refs.get(id_type)].append(stmt)

    # Convert from defaultdict to ordinary dict.
    result_dict = dict(result_dict)
    return result_dict


[docs]@clockit
def get_statements(clauses, count=1000, do_stmt_count=False, db=None,
                   preassembled=True, with_support=False, fix_refs=True,
                   with_evidence=True):
    """Select statements according to a given set of clauses.

    Parameters
    ----------
    clauses : list
        list of sqlalchemy WHERE clauses to pass to the filter query.
    count : int
        Number of statements to retrieve and process in each batch.
    do_stmt_count : bool
        Whether or not to perform an initial statement counting step to give
        more meaningful progress messages.
    db : :py:class:`DatabaseManager`
        Optionally specify a database manager that attaches to something
        besides the primary database, for example a local database instance.
    preassembled : bool
        If true, statements will be selected from the table of pre-assembled
        statements. Otherwise, they will be selected from the raw statements.
        Default is True.
    with_support : bool
        Choose whether to populate the supports and supported_by list
        attributes of the Statement objects. General results in slower queries.
    with_evidence : bool
        Choose whether or not to populate the evidence list attribute of the
        Statements. As with `with_support`, setting this to True will take
        longer.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.

    Returns
    -------
    list of Statements from the database corresponding to the query.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    cnt = count
    if db is None:
        db = get_db('primary')

    stmts_tblname = 'pa_statements' if preassembled else 'raw_statements'

    if not preassembled:
        stmts = []
        q = db.filter_query(stmts_tblname, *clauses)
        if do_stmt_count:
            logger.info("Counting statements...")
            num_stmts = q.count()
            logger.info("Total of %d statements" % num_stmts)
        db_stmts = q.yield_per(cnt)
        for subset in batch_iter(db_stmts, cnt):
            stmts.extend(get_raw_stmts_frm_db_list(db, subset, with_sids=False,
                                                   fix_refs=fix_refs))
            if do_stmt_count:
                logger.info("%d of %d statements" % (len(stmts), num_stmts))
            else:
                logger.info("%d statements" % len(stmts))
    else:
        logger.info("Getting preassembled statements.")
        if with_evidence:
            logger.info("Getting preassembled statements.")
            # Get pairs of pa statements with their linked raw statements
            clauses += [
                db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
                db.RawStatements.id == db.RawUniqueLinks.raw_stmt_id
            ]
            pa_raw_stmt_pairs = \
                db.select_all([db.PAStatements, db.RawStatements],
                              *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_wev(db, pa_raw_stmt_pairs,
                                                      count=cnt,
                                                      fix_refs=fix_refs)
        else:
            # Get just pa statements without their supporting raw statement(s).
            pa_stmts = db.select_all(db.PAStatements, *clauses, yield_per=cnt)
            stmt_dict = _process_pa_statement_res_nev(pa_stmts, count=cnt)

        # Populate the supports/supported by fields.
        if with_support:
            get_support(stmt_dict, db=db)

        stmts = list(stmt_dict.values())
        logger.info("In all, there are %d pa statements." % len(stmts))

    return stmts


@clockit
def _process_pa_statement_res_wev(db, stmt_iterable, count=1000,
                                  fix_refs=True):
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    # Iterate over the batches to create the statement objects.
    stmt_dict = {}
    ev_dict = {}
    raw_stmt_dict = {}
    total_ev = 0
    for stmt_pair_batch in batch_iter(stmt_iterable, count):
        # Instantiate the PA statement objects, and record the uuid
        # evidence (raw statement) links.
        raw_stmt_objs = []
        for pa_stmt_db_obj, raw_stmt_db_obj in stmt_pair_batch:
            k = pa_stmt_db_obj.mk_hash
            if k not in stmt_dict.keys():
                stmt_dict[k] = get_statement_object(pa_stmt_db_obj)
                ev_dict[k] = [raw_stmt_db_obj.id,]
            else:
                ev_dict[k].append(raw_stmt_db_obj.id)
            raw_stmt_objs.append(raw_stmt_db_obj)
            total_ev += 1

        logger.info("Up to %d pa statements, with %d pieces of "
                    "evidence in all." % (len(stmt_dict), total_ev))

        # Instantiate the raw statements.
        raw_stmt_sid_tpls = get_raw_stmts_frm_db_list(db, raw_stmt_objs,
                                                      fix_refs,
                                                      with_sids=True)
        raw_stmt_dict.update({sid: s for sid, s in raw_stmt_sid_tpls})
        logger.info("Processed %d raw statements."
                    % len(raw_stmt_sid_tpls))

    # Attach the evidence
    logger.info("Inserting evidence.")
    for k, sid_list in ev_dict.items():
        stmt_dict[k].evidence = [raw_stmt_dict[sid].evidence[0]
                                 for sid in sid_list]
    return stmt_dict


@clockit
def _process_pa_statement_res_nev(stmt_iterable, count=1000):
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    # Iterate over the batches to create the statement objects.
    stmt_dict = {}
    for stmt_pair_batch in batch_iter(stmt_iterable, count):
        # Instantiate the PA statement objects.
        for pa_stmt_db_obj in stmt_pair_batch:
            k = pa_stmt_db_obj.mk_hash
            if k not in stmt_dict.keys():
                stmt_dict[k] = get_statement_object(pa_stmt_db_obj)

        logger.info("Up to %d pa statements in all." % len(stmt_dict))
    return stmt_dict


[docs]@clockit
def get_evidence(pa_stmt_list, db=None, fix_refs=True, use_views=True):
    """Fill in the evidence for a list of pre-assembled statements.

    Parameters
    ----------
    pa_stmt_list : list[Statement]
        A list of unique statements, generally drawn from the database
        pa_statement table (via `get_statemetns`).
    db : DatabaseManager instance or None
        An instance of a database manager. If None, defaults to the "primary"
        database, as defined in the db_config.ini file in .config/indra.
    fix_refs : bool
        The paper refs within the evidence objects are not populated in the
        database, and thus must be filled using the relations in the database.
        If True (default), the `pmid` field of each Statement Evidence object
        is set to the correct PMIDs, or None if no PMID is available. If False,
        the `pmid` field defaults to the value populated by the reading
        system.

    Returns
    -------
    None - modifications are made to the Statements "in-place".
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_db('primary')

    # Turn the list into a dict.
    stmt_dict = {s.get_hash(shallow=True): s for s in pa_stmt_list}

    if use_views:
        if fix_refs:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json,
                 db.FastRawPaLink.reading_id],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys())
            )
            rel_refs = ['pmid', 'rid']
            ref_cols = [getattr(db.ReadingRefLink, k) for k in rel_refs]
        else:
            raw_links = db.select_all(
                [db.FastRawPaLink.mk_hash, db.FastRawPaLink.raw_json],
                db.FastRawPaLink.mk_hash.in_(stmt_dict.keys())
            )
        rid_ref_dict = {}
        myst_rid_rs_dict = defaultdict(list)
        for info in raw_links:
            if fix_refs:
                mk_hash, raw_json, rid = info
            else:
                mk_hash, raw_json = info
                rid = None
            json_dict = json.loads(raw_json.decode('utf-8'))
            ev_json = json_dict.get('evidence', [])
            assert len(ev_json) == 1, \
                "Raw statements must have one evidence, got %d." % len(ev_json)
            ev = Evidence._from_json(ev_json[0])
            stmt_dict[mk_hash].evidence.append(ev)
            if fix_refs:
                ref_dict = rid_ref_dict.get(rid)
                if ref_dict is None:
                    myst_rid_rs_dict[rid].append(ev)
                    if len(myst_rid_rs_dict) >= 1000:
                        ref_data_list = db.select_all(
                            ref_cols,
                            db.ReadingRefLink.rid.in_(myst_rid_rs_dict.keys())
                        )
                        for pmid, rid in ref_data_list:
                            rid_ref_dict[rid] = pmid
                            for ev in myst_rid_rs_dict[rid]:
                                ev.pmid = pmid
                        myst_rid_rs_dict.clear()
                else:
                    ev.pmid = rid_ref_dict[rid]
    else:
        # Get the data from the database
        raw_list = db.select_all(
            [db.PAStatements.mk_hash, db.RawStatements],
            db.PAStatements.mk_hash.in_(stmt_dict.keys()),
            db.PAStatements.mk_hash == db.RawUniqueLinks.pa_stmt_mk_hash,
            db.RawUniqueLinks.raw_stmt_id == db.RawStatements.id
        )

        # Note that this step depends on the ordering being maintained.
        mk_hashes, raw_stmt_objs = zip(*raw_list)
        raw_stmts = get_raw_stmts_frm_db_list(db, raw_stmt_objs, fix_refs,
                                              with_sids=False)
        raw_stmt_mk_pairs = zip(mk_hashes, raw_stmts)

        # Now attach the evidence
        for mk_hash, raw_stmt in raw_stmt_mk_pairs:
            # Each raw statement can have just one piece of evidence.
            stmt_dict[mk_hash].evidence.append(raw_stmt.evidence[0])

    return


[docs]def get_statements_from_hashes(statement_hashes, preassembled=True, db=None,
                               **kwargs):
    """Retrieve statement objects given only statement hashes.

    WARNING: This function will be removed in the future. Please look to
    indra_db.client.readonly.query and indra_db.client.principal.raw_statements
    for alternatives.
    """
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    if db is None:
        db = get_ro('primary')

    if not preassembled:
        raise DeprecationWarning("This functionality is not longer supported. "
                                 "indra_db.client.principal.raw_statements "
                                 "has more functional features to search for "
                                 "raw statements.")

    query = HasHash(statement_hashes)
    ev_lim = None
    if kwargs.get('with_evidence') is False:
        ev_lim = 0

    result = query.get_statements(db, ev_limit=ev_lim)
    return result.statements()


[docs]def get_support(statements, db=None, recursive=False):
    """Populate the supports and supported_by lists of the given statements."""
    warnings.warn(('This module is being taken out of service, as the tools '
                   'have become deprecated. Moreover, the service has been '
                   're-implemented to use newer tools as best as possible, '
                   'but some results may be subtly different.'),
                  DeprecationWarning)
    # TODO: Allow recursive mode (argument should probably be an integer level)
    if db is None:
        db = get_db('primary')

    if not isinstance(statements, dict):
        stmt_dict = {s.get_hash(shallow=True): s for s in statements}
    else:
        stmt_dict = statements

    logger.info("Populating support links.")
    support_links = db.select_all(
        [db.PASupportLinks.supported_mk_hash,
         db.PASupportLinks.supporting_mk_hash],
        or_(db.PASupportLinks.supported_mk_hash.in_(stmt_dict.keys()),
            db.PASupportLinks.supporting_mk_hash.in_(stmt_dict.keys()))
    )
    for supped_hash, supping_hash in set(support_links):
        if supped_hash == supping_hash:
            assert False, 'Self-support found on-load.'
        supped_stmt = stmt_dict.get(supped_hash)
        if supped_stmt is None:
            supped_stmt = Unresolved(shallow_hash=supped_hash)
        supping_stmt = stmt_dict.get(supping_hash)
        if supping_stmt is None:
            supping_stmt = Unresolved(shallow_hash=supping_hash)
        supped_stmt.supported_by.append(supping_stmt)
        supping_stmt.supports.append(supped_stmt)
    return