# Source code for indra_db.util.content_scripts

# Names exported by ``from <module> import *``. The remaining helpers defined
# in this module stay importable by explicit name.
__all__ = ['get_stmts_with_agent_text_like', 'get_text_content_from_stmt_ids']

from sqlalchemy import text
from collections import defaultdict
from cachetools.keys import hashkey
from cachetools import cached, LRUCache

from .constructors import get_db
from .helpers import unpack, _get_trids


def get_stmts_with_agent_text_like(pattern, filter_genes=False,
                                   db=None):
    """Get statement ids with agent with rawtext matching pattern

    Parameters
    ----------
    pattern : str
        a pattern understood by sqlalchemy's like operator.
        For example '__' for two letter agents

    filter_genes : Optional[bool]
        if True, only agent occurrences that also carry an HGNC grounding
        (same statement id and agent number) are kept. Agent texts with no
        HGNC-grounded occurrence are therefore absent from the result.
        Default: False

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        dict mapping agent texts to statement ids. agent text are those
        matching the input pattern. Each agent text maps to the list of
        statement ids (without duplicates, in the order the rows were
        returned) for statements containing an agent with that TEXT
        in its db_refs
    """
    if db is None:
        db = get_db('primary')

    # Query Raw agents table for agents with TEXT db_ref matching pattern
    # Selects agent texts, statement ids and agent numbers. The agent number
    # corresponds to the agents index into the agent list
    agents = db.select_all([db.RawAgents.db_id,
                            db.RawAgents.stmt_id,
                            db.RawAgents.ag_num],
                           db.RawAgents.db_name.like('TEXT'),
                           db.RawAgents.db_id.like(pattern),
                           db.RawAgents.stmt_id.isnot(None))
    if filter_genes:
        # Collect (statement id, agent number) pairs of every agent that is
        # grounded to HGNC, then keep only the TEXT rows whose agent slot
        # appears among them.
        hgnc_agents = set(db.select_all([db.RawAgents.stmt_id,
                                         db.RawAgents.ag_num],
                                        db.RawAgents.db_name.like('HGNC'),
                                        db.RawAgents.stmt_id.isnot(None)))
        agents = [(agent_text, stmt_id, ag_num)
                  for agent_text, stmt_id, ag_num in agents
                  if (stmt_id, ag_num) in hgnc_agents]

    # Group statement ids by agent text. A set of already emitted
    # (text, stmt_id) pairs keeps de-duplication constant time per row
    # instead of scanning the growing list each time.
    output = defaultdict(list)
    seen = set()
    for agent_text, stmt_id, _ in agents:
        key = (agent_text, stmt_id)
        if key not in seen:
            seen.add(key)
            output[agent_text].append(stmt_id)
    return dict(output)


def get_stmts_with_agent_text_in(agent_texts, filter_genes=False, db=None):
    """Get statement ids with agent with rawtext in list

    Parameters
    ----------
    agent_texts : list of str
        a list of agent texts

    filter_genes : Optional[bool]
        if True, only agent occurrences that also carry an HGNC grounding
        (same statement id and agent number) are kept. Agent texts with no
        HGNC-grounded occurrence are therefore absent from the result.
        Default: False

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        dict mapping agent texts to lists of statement ids (without
        duplicates) for statements containing an agent with that TEXT in
        its db_refs. Texts with no matching statement are not included.
    """
    # De-duplicate the requested texts; nothing requested means nothing can
    # match, so skip the database round trip entirely.
    wanted = set(agent_texts)
    if not wanted:
        return {}

    if db is None:
        db = get_db('primary')

    # Query Raw agents table for agents whose TEXT db_ref is one of the
    # requested texts. The membership test is part of the query so that only
    # matching rows are transferred, rather than every TEXT agent.
    # Selects agent texts, statement ids and agent numbers. The agent number
    # corresponds to the agents index into the agent list
    agents = db.select_all([db.RawAgents.db_id,
                            db.RawAgents.stmt_id,
                            db.RawAgents.ag_num],
                           db.RawAgents.db_name.like('TEXT'),
                           db.RawAgents.db_id.in_(list(wanted)),
                           db.RawAgents.stmt_id.isnot(None))
    if filter_genes:
        # Collect (statement id, agent number) pairs of every agent that is
        # grounded to HGNC, then keep only the TEXT rows whose agent slot
        # appears among them.
        hgnc_agents = set(db.select_all([db.RawAgents.stmt_id,
                                         db.RawAgents.ag_num],
                                        db.RawAgents.db_name.like('HGNC'),
                                        db.RawAgents.stmt_id.isnot(None)))
        agents = [(agent_text, stmt_id, ag_num)
                  for agent_text, stmt_id, ag_num in agents
                  if (stmt_id, ag_num) in hgnc_agents]

    # Group statement ids by agent text. A set of already emitted
    # (text, stmt_id) pairs keeps de-duplication constant time per row.
    output = defaultdict(list)
    seen = set()
    for agent_text, stmt_id, _ in agents:
        key = (agent_text, stmt_id)
        if key not in seen:
            seen.add(key)
            output[agent_text].append(stmt_id)
    return dict(output)


def get_text_content_from_stmt_ids(stmt_ids, db=None):
    """Get text content for statements from a list of ids

    Gets the fulltext if it is available, even if the statement came from an
    abstract.

    Parameters
    ----------
    stmt_ids : list of str

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping statement ids to identifiers for pieces of content.
        These identifiers are tuples of the form
        (text_ref_id, source, format, text_type).
        No entries exist for statements with no associated text content
        (these typically come from databases)

    text_dict: dict
        dict mapping content identifiers used as values in the ref_dict
        to best available text content. The order of preference is
        fulltext > abstract > title
    """
    if db is None:
        db = get_db('primary')
    # Hand the resolved manager to both helpers so a caller-supplied database
    # is actually the one that gets queried.
    identifiers = get_content_identifiers_from_stmt_ids(stmt_ids, db=db)
    if not identifiers:
        # Nothing to look up; avoid issuing a content query with no rows.
        return identifiers, {}
    content = _get_text_content(identifiers.values(), db=db)
    return identifiers, content


def get_text_content_from_pmids(pmids, db=None):
    """Get best available text content for list of pmids

    For each pmid, gets the best piece of text content with the priority
    fulltext > abstract > title.

    Parameters
    ----------
    pmids : list of str

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    identifiers : dict
        dict mapping pmids to identifiers for pieces of content.
        These identifiers are tuples of the form
        (text_ref_id, source, format, text_type).
        No entries exist for pmids with no associated text content.

    content : dict
        dict mapping content identifiers used as values in identifiers
        to the best available text content.
    """
    if db is None:
        db = get_db('primary')
    # Hand the resolved manager to both helpers so a caller-supplied database
    # is actually the one that gets queried.
    identifiers = get_content_identifiers_from_pmids(pmids, db=db)
    if not identifiers:
        # Nothing to look up; avoid issuing a content query with no rows.
        return identifiers, {}
    content = _get_text_content(identifiers.values(), db=db)
    return identifiers, content


def get_content_identifiers_from_stmt_ids(stmt_ids, db=None):
    """Get content identifiers for statements from a list of ids

    An identifier is a tuple containing a text_ref_id, source, format and
    text_type. Gets the identifier for best piece of text content with
    priority fulltext > abstract > title

    Parameters
    ----------
    stmt_ids : list of str

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping statement ids to identifiers for pieces of content.
        These identifiers are tuples of the form
        (text_ref_id, source, format, text_type).
        No entries exist for statements with no associated text content
        (these typically come from databases). An empty input yields an
        empty dict without touching the database.
    """
    # The ids are bound as a single tuple parameter of the IN clause, so
    # duplicates are dropped first.
    stmt_ids = tuple(set(stmt_ids))
    if not stmt_ids:
        # An empty tuple would leave the IN clause without any values.
        return {}
    if db is None:
        db = get_db('primary')
    query = """SELECT
                   sub.stmt_id, tc.text_ref_id, tc.source,
                   tc.format, tc.text_type
               FROM
                   text_content tc,
                   (SELECT
                        stmt_id, text_ref_id
                    FROM
                        raw_stmt_ref_link
                    WHERE
                        stmt_id IN :stmt_ids) sub
                WHERE
                    tc.text_ref_id = sub.text_ref_id
            """
    res = db.session.execute(text(query), {'stmt_ids': stmt_ids})
    return _collect_content_identifiers(res)


def get_content_identifiers_from_pmids(pmids, db=None):
    """Get content identifiers from list of pmids

    An identifier is a tuple containing a text_ref_id, source, format and
    text_type. Gets the identifier for best piece of text content with
    priority fulltext > abstract > title

    Parameters
    ----------
    pmids : list of str

    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping pmids to identifiers for pieces of content.
        These identifiers are tuples of the form
        (text_ref_id, source, format, text_type).
        No entries exist for pmids with no associated text content.
        An empty input yields an empty dict without touching the database.
    """
    # The pmids are bound as a single tuple parameter of the IN clause, so
    # duplicates are dropped first.
    pmids = tuple(set(pmids))
    if not pmids:
        # An empty tuple would leave the IN clause without any values.
        return {}
    if db is None:
        db = get_db('primary')
    query = """SELECT
                   tr.pmid, tr.id, tc.source, tc.format, tc.text_type
               FROM
                   text_content AS tc
               JOIN
                   text_ref as tr
               ON
                   tr.id = tc.text_ref_id
               WHERE
                   tr.pmid IN :pmids
            """
    res = db.session.execute(text(query), {'pmids': pmids})
    return _collect_content_identifiers(res)


def _collect_content_identifiers(res):
    priority = {'fulltext': 2, 'abstract': 1, 'title': 0}
    seen_text_refs = {}
    ref_dict = {}
    for id_, text_ref_id, source, format_, text_type in res.fetchall():
        new_identifier = (text_ref_id, source, format_, text_type)
        if (id_, text_ref_id) not in seen_text_refs:
            seen_text_refs[(id_, text_ref_id)] = new_identifier
            ref_dict[id_] = new_identifier
        else:
            # update if we find text_type with higher priority for
            # a given text_ref
            old_identifier = seen_text_refs[(id_, text_ref_id)]
            old_text_type = old_identifier[3]
            if priority[text_type] > priority[old_text_type]:
                seen_text_refs[(id_, text_ref_id)] = new_identifier
                ref_dict[id_] = new_identifier
    return ref_dict


def _get_text_content(content_identifiers, db=None):
    """Return text_content associated to a list of content identifiers

    Parameters
    ----------
    content_identifiers : iterable of tuple
        A content identifier is a triple with three elements, text_ref_id,
        source, and text_type. These three pieces of information uniquely
        specify a piece of content in the database. content_identifiers
        is a list of these triples

     db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        A dictionary mapping content identifiers to pieces of
        text content. content identifiers for which no content
        exists in the database are excluded as keys.
    """
    if db is None:
        db = get_db('primary')
    # Remove duplicate identifiers
    content_identifiers = set(content_identifiers)
    # Query finds content associated to each identifier by joining
    # the text_content table with a virtual table containing the
    # input identifiers. The query string is generated programmatically
    id_str = ', '.join('(:trid%d, :source%d, :format%d, :text_type%d)'
                       % (i, i, i, i)
                       for i in range(len(content_identifiers)))
    params = {}
    for i, (trid, source,
            format_, text_type) in enumerate(content_identifiers):
        params.update({'trid%s' % i: trid,
                       'source%i' % i: source,
                       'format%i' % i: format_,
                       'text_type%i' % i: text_type})
    query = """SELECT
                   tc.text_ref_id, tc.source, tc.format, tc.text_type, content
               FROM
                   text_content AS tc
               JOIN (VALUES %s)
               AS
                  ids (text_ref_id, source, format, text_type)
               ON
                   tc.text_ref_id = ids.text_ref_id
                   AND tc.source = ids.source
                   AND tc.format = ids.format
                   AND tc.text_type = ids.text_type
            """ % id_str

    res = db.session.execute(text(query), params)
    return {(trid, source, format, text_type): unpack(content)
            for trid, source, format, text_type, content in res}


class TextContentSessionHandler(object):
    """Look up text content for text_refs through a held database manager

    The database manager is stored on a name-mangled attribute and is not
    exposed through the public methods.

    Parameters
    ----------
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None
    """
    def __init__(self, db=None):
        # Remember whether the primary database was picked implicitly;
        # cached lookups are only used in that case.
        use_default = db is None
        self.__db = get_db('primary') if use_default else db
        self.default = use_default

    def close(self):
        """Roll back and then close the session of the held manager"""
        self.__db.session.rollback()
        self.__db.session.close()

    def get_text_content_from_text_refs(self, text_refs, use_cache=True):
        """Get text_content from an evidence object's text_refs attribute

        Parameters
        ----------
        text_refs : dict of str: str
            text_refs dictionary as contained in an evidence object
            The dictionary should be keyed on id_types. The keys that are
            consulted are 'TRID', 'PMID', 'PMCID', 'DOI', 'PII', 'URL' and
            'MANUSCRIPT_ID'.

        use_cache : Optional[bool]
            Whether or not to use cached results. The cache is only
            consulted when the handler was created without an explicit
            db argument (or with db=None). Default: True

        Returns
        -------
        text : str
            fulltext corresponding to the text_refs if it exists in the
            database, otherwise the abstract, otherwise the title.
            Returns None if no content exists for the text_refs in the
            database
        """
        if self.default and use_cache:
            # dicts are not hashable, so freeze the items for the cache key
            frozen_text_refs = frozenset(text_refs.items())
            return self._get_text_content_from_text_refs_cached(
                frozen_text_refs)
        trid = self._get_text_ref_id_from_text_refs(text_refs)
        if trid is None:
            return None
        return self._get_text_content_from_trid(trid)

    @cached(cache=LRUCache(maxsize=10000),
            key=lambda self, frozen_text_refs: hashkey(frozen_text_refs))
    def _get_text_content_from_text_refs_cached(self, frozen_text_refs):
        # The cache key is built from the frozen text_refs alone (self is
        # left out), so entries are shared between handler instances.
        trid = self._get_text_ref_id_from_text_refs(dict(frozen_text_refs))
        if trid is None:
            return None
        return self._get_text_content_from_trid(trid)

    def _get_text_ref_id_from_text_refs(self, text_refs):
        # In some cases the TRID is already there so we can just
        # return it
        if 'TRID' in text_refs:
            return text_refs['TRID']
        # Otherwise try each id type in turn and take the first text ref
        # id found for the first id type that yields any.
        for id_type in ('pmid', 'pmcid', 'doi',
                        'pii', 'url', 'manuscript_id'):
            try:
                id_val = text_refs[id_type.upper()]
                trids = _get_trids(self.__db, id_val, id_type)
                if trids:
                    return trids[0]
            except KeyError:
                # This id type is not available; move on to the next
                continue
        return None

    def _get_text_content_from_trid(self, text_ref_id):
        rows = self.__db.select_all([self.__db.TextContent.content,
                                     self.__db.TextContent.text_type],
                                    self.__db.TextContent.text_ref_id ==
                                    text_ref_id)
        # Bucket the content rows by their text type
        by_type = {}
        for content, text_type in rows:
            by_type.setdefault(text_type, []).append(content)
        # Look at text types in order of priority
        for text_type in ('fulltext', 'abstract', 'title'):
            # The same text type can come back several times, with some
            # entries empty, so take the first non-empty one
            for content in by_type.get(text_type, []):
                if content:
                    return unpack(content)
        return None


def _extract_db_refs(stmt_json):
    agent_types = ['sub', 'subj', 'obj', 'enz', 'agent', 'gef;', 'ras',
                   'gap', 'obj_from', 'obj_to']
    db_ref_list = []

    for agent_type in agent_types:
        try:
            agent = stmt_json[agent_type]
        except KeyError:
            continue
        try:
            db_refs = agent['db_refs']
        except KeyError:
            continue
        db_ref_list.append(db_refs)

    members = stmt_json.get('members')
    if members is not None:
        for member in members:
            try:
                db_refs = member['db_refs']
            except KeyError:
                continue
            db_ref_list.append(db_refs)
    return db_ref_list