Source code for indra_db.client.principal.content

__all__ = ['get_reader_output', 'get_content_by_refs', 'get_text']

import logging
from collections import defaultdict

from indra_db.util import unpack, _get_trids

logger = logging.getLogger(__name__)



[docs]
def get_reader_output(db, ref_id, ref_type='tcid', reader=None,
                      reader_version=None):
    """Return reader output for a given text content.

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    ref_id : int or str
        The text reference ID whose reader output should be returned
    ref_type : Optional[str]
        The type of ID to look for, options include
        'tcid' for the database's internal unique text content ID,
        or 'pmid', 'pmcid', 'doi', 'pii', 'manuscript_id'
        Default: 'tcid'
    reader : Optional[str]
        The name of the reader whose output is of interest. The name is
        upper-cased before it is compared against the database.
    reader_version : Optional[str]
        The specific version of the reader

    Returns
    -------
    reading_results : dict{dict{list[str]}}
        A dict of reader outputs that match the query criteria, indexed first
        by text content id, then by reader. A reading stored with no content
        is represented by None in the list. The dict is empty if nothing
        matches, including when `ref_id` resolves to no text refs.
    """
    reading_dict = defaultdict(lambda: defaultdict(list))

    if ref_type == 'tcid':
        clauses = [db.Reading.text_content_id == ref_id]
    else:
        trids = _get_trids(db, ref_id, ref_type)
        if not trids:
            # Return the same (empty) mapping type as the normal path so that
            # callers never have to handle a list here.
            return reading_dict
        logger.debug("Found %d text ref ids.", len(trids))
        clauses = [db.TextContent.text_ref_id.in_(trids),
                   db.Reading.text_content_id == db.TextContent.id]
    if reader:
        clauses.append(db.Reading.reader == reader.upper())
    if reader_version:
        clauses.append(db.Reading.reader_version == reader_version)

    res = db.select_all([db.Reading.text_content_id, db.Reading.reader,
                         db.Reading.bytes], *clauses)

    # Use a distinct loop name so the `reader` argument is not shadowed.
    for tcid, reader_name, result in res:
        if result:
            unpacked_result = unpack(result)
        else:
            logger.warning("Got reading result with zero content.")
            unpacked_result = None
        reading_dict[tcid][reader_name].append(unpacked_result)
    return reading_dict




[docs]
def get_content_by_refs(db, pmid_list=None, trid_list=None, sources=None,
                        formats=None, content_type='abstract', unzip=True):
    """Return content from the database given a list of PMIDs or text ref ids.

    Note that either pmid_list OR trid_list must be set, and only one can be
    set at a time. An empty list counts as "not set".

    Parameters
    ----------
    db : :py:class:`DatabaseManager`
        Reference to the DB to query
    pmid_list : list[str] or None
        A list of pmids. Default is None, in which case trid_list must be
        given.
    trid_list : list[int] or None
        A list of text ref ids. Default is None, in which case pmid list must
        be given.
    sources : list[str] or None
        A list of sources to include (e.g. 'pmc_oa', or 'pubmed'). Default is
        None, indicating that all sources will be included.
    formats : list[str]
        A list of the formats to be included ('xml', 'text'). Default is None,
        indicating that all formats will be included.
    content_type : str
        Select the type of content to load ('abstract' or 'fulltext'). Note
        that not all refs will have any, or both, types of content.
    unzip : Optional[bool]
        If True, each content value is passed through `unpack` before being
        returned; otherwise the stored value is returned unchanged.
        Default: True

    Returns
    -------
    content_dict : dict
        A dictionary keyed by pmid (when `pmid_list` is given) or by text ref
        id (when `trid_list` is given), with each value being the
        corresponding content. If several content rows match the same key,
        only the last one returned by the query is kept.

    Raises
    ------
    ValueError
        If neither or both of `pmid_list` and `trid_list` are given, or if
        `content_type` is not 'abstract' or 'fulltext'.
    """
    # Make sure we only get one type of list.
    if not (pmid_list or trid_list):
        raise ValueError("One of `pmid_list` or `trid_list` must be defined.")
    if pmid_list and trid_list:
        raise ValueError("Only one of `pmid_list` or `trid_list` may be used.")

    # Put together the clauses for the general constraints.
    clauses = []
    if sources is not None:
        clauses.append(db.TextContent.source.in_(sources))
    if formats is not None:
        clauses.append(db.TextContent.format.in_(formats))
    if content_type not in ['abstract', 'fulltext']:
        raise ValueError("Unrecognized content type: %s" % content_type)
    clauses.append(db.TextContent.text_type == content_type)

    # Pick the key column and id filter. Truthiness is used (matching the
    # validation above) so that an empty pmid_list combined with a real
    # trid_list takes the trid branch instead of querying on no PMIDs.
    if pmid_list:
        id_column = db.TextRef.pmid
        id_clause = db.TextRef.pmid.in_(pmid_list)
    else:
        id_column = db.TextRef.id
        id_clause = db.TextContent.text_ref_id.in_(trid_list)

    # Both tables appear in the selection, so they are always linked
    # explicitly; without this clause the trid query would pair every text
    # ref with every matching content row.
    content_list = db.select_all(
        [id_column, db.TextContent.content],
        db.TextRef.id == db.TextContent.text_ref_id,
        id_clause,
        *clauses
    )

    if unzip:
        return {id_val: unpack(content) for id_val, content in content_list}
    return dict(content_list)




[docs]
def get_text(db, pmids, text_type):
    """Return text content of a given type for a list of PMIDs.

    The result is a dict keyed by pmid. Each value is the unpacked content
    with tabs replaced by spaces and newlines then replaced by tabs.
    """
    # Build the query step by step: select, restrict to the given PMIDs,
    # join in the content table, and keep only the requested text type.
    query = db.session.query(db.TextRef.pmid, db.TextContent.text_type,
                             db.TextContent.content)
    query = query.filter(db.TextRef.pmid_in(pmids))
    query = query.join(db.TextContent)
    query = query.filter(db.TextContent.text_type == text_type)

    text_by_pmid = {}
    for row in query.all():
        pmid = row.pmid
        raw_text = unpack(row.content)
        # Tabs are flattened first so the only tabs left afterwards are the
        # ones standing in for the original newlines.
        text_by_pmid[pmid] = raw_text.replace("\t", " ").replace("\n", "\t")
    return text_by_pmid