# Names exported by a star-import of this module.
# NOTE(review): other public callables defined in this file (for example
# get_stmts_with_agent_text_in, get_text_content_from_pmids and
# TextContentSessionHandler) are not listed here -- confirm that is intended.
__all__ = ['get_stmts_with_agent_text_like', 'get_text_content_from_stmt_ids']
from sqlalchemy import text
from collections import defaultdict
from cachetools.keys import hashkey
from cachetools import cached, LRUCache
from .constructors import get_db
from .helpers import unpack, _get_trids
def get_stmts_with_agent_text_like(pattern, filter_genes=False,
                                   db=None):
    """Get statement ids with agent with rawtext matching pattern

    Parameters
    ----------
    pattern : str
        A pattern understood by sqlalchemy's like operator.
        For example '__' for two letter agents
    filter_genes : Optional[bool]
        If True, only agent occurrences that also have an HGNC entry in the
        raw agents table (same statement id and agent number) are kept.
        Default: False
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        dict mapping agent texts to statement ids. Agent texts are those
        matching the input pattern. Each agent text maps to the list of
        unique statement ids (in first-seen order) for statements
        containing an agent with that TEXT in its db_refs
    """
    if db is None:
        db = get_db('primary')

    # Query Raw agents table for agents with TEXT db_ref matching pattern.
    # Selects agent texts, statement ids and agent numbers. The agent number
    # corresponds to the agent's index into the agent list.
    agents = db.select_all([db.RawAgents.db_id,
                            db.RawAgents.stmt_id,
                            db.RawAgents.ag_num],
                           db.RawAgents.db_name.like('TEXT'),
                           db.RawAgents.db_id.like(pattern),
                           db.RawAgents.stmt_id.isnot(None))
    if filter_genes:
        # Collect (statement id, agent number) pairs for every agent grounded
        # to HGNC and keep only the TEXT rows that refer to one of them.
        hgnc_agents = set(db.select_all([db.RawAgents.stmt_id,
                                         db.RawAgents.ag_num],
                                        db.RawAgents.db_name.like('HGNC'),
                                        db.RawAgents.stmt_id.isnot(None)))
        agents = [(agent_text, stmt_id, ag_num)
                  for agent_text, stmt_id, ag_num in agents
                  if (stmt_id, ag_num) in hgnc_agents]

    # Deduplicate with a set so each membership test is O(1) rather than a
    # scan of the growing per-text list; list order stays first-seen.
    output = defaultdict(list)
    seen = set()
    for agent_text, stmt_id, _ in agents:
        pair = (agent_text, stmt_id)
        if pair not in seen:
            seen.add(pair)
            output[agent_text].append(stmt_id)
    return dict(output)
def get_stmts_with_agent_text_in(agent_texts, filter_genes=False, db=None):
    """Get statement ids with agent with rawtext in list

    Parameters
    ----------
    agent_texts : list of str
        A list of agent texts
    filter_genes : Optional[bool]
        If True, only agent occurrences that also have an HGNC entry in the
        raw agents table (same statement id and agent number) are kept.
        Default: False
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        dict mapping agent texts to lists of unique statement ids (in
        first-seen order) for statements containing an agent with that TEXT
        in its db_refs. Requested texts with no match are absent.
    """
    # A set makes each membership test O(1); nothing to look up if empty.
    wanted = set(agent_texts)
    if not wanted:
        return {}

    if db is None:
        db = get_db('primary')

    # Query Raw agents table for all agents with a TEXT db_ref. Selects agent
    # texts, statement ids and agent numbers. The agent number corresponds to
    # the agent's index into the agent list.
    # NOTE(review): the restriction to the requested texts is applied
    # client-side below; it could likely be pushed into the query.
    agents = db.select_all([db.RawAgents.db_id,
                            db.RawAgents.stmt_id,
                            db.RawAgents.ag_num],
                           db.RawAgents.db_name.like('TEXT'),
                           db.RawAgents.stmt_id.isnot(None))
    agents = [(agent_text, stmt_id, ag_num)
              for agent_text, stmt_id, ag_num in agents
              if agent_text in wanted]
    if filter_genes:
        # Collect (statement id, agent number) pairs for every agent grounded
        # to HGNC and keep only the TEXT rows that refer to one of them.
        hgnc_agents = set(db.select_all([db.RawAgents.stmt_id,
                                         db.RawAgents.ag_num],
                                        db.RawAgents.db_name.like('HGNC'),
                                        db.RawAgents.stmt_id.isnot(None)))
        agents = [(agent_text, stmt_id, ag_num)
                  for agent_text, stmt_id, ag_num in agents
                  if (stmt_id, ag_num) in hgnc_agents]

    # Deduplicate with a set so each membership test is O(1) rather than a
    # scan of the growing per-text list; list order stays first-seen.
    output = defaultdict(list)
    seen = set()
    for agent_text, stmt_id, _ in agents:
        pair = (agent_text, stmt_id)
        if pair not in seen:
            seen.add(pair)
            output[agent_text].append(stmt_id)
    return dict(output)
def get_text_content_from_stmt_ids(stmt_ids, db=None):
    """Get text content for statements from a list of ids

    Gets the fulltext if it is available, even if the statement came from an
    abstract.

    Parameters
    ----------
    stmt_ids : list of str
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping statement ids to identifiers for pieces of content.
        Each identifier is a tuple
        (text_ref_id, source, format, text_type).
        No entries exist for statements with no associated text content
        (these typically come from databases)
    text_dict: dict
        dict mapping content identifiers used as values in the ref_dict
        to best available text content. The order of preference is
        fulltext > abstract > title
    """
    if db is None:
        db = get_db('primary')
    # Forward the resolved manager so both lookups hit the database the
    # caller asked for instead of each helper falling back to the primary.
    identifiers = get_content_identifiers_from_stmt_ids(stmt_ids, db=db)
    content = _get_text_content(identifiers.values(), db=db)
    return identifiers, content
def get_text_content_from_pmids(pmids, db=None):
    """Get best available text content for list of pmids

    For each pmid, gets the best piece of text content with the priority
    fulltext > abstract > title.

    Parameters
    ----------
    pmids : list of str
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    identifiers : dict
        dict mapping pmids to identifiers for pieces of content.
        These identifiers are tuples of the form
        (text_ref_id, source, format, text_type).
        No entries exist for pmids with no associated text content.
    content : dict
        dict mapping content identifiers used as values in the identifiers
        dict to the best available text content.
    """
    if db is None:
        db = get_db('primary')
    # Forward the resolved manager so both lookups hit the database the
    # caller asked for instead of each helper falling back to the primary.
    identifiers = get_content_identifiers_from_pmids(pmids, db=db)
    content = _get_text_content(identifiers.values(), db=db)
    return identifiers, content
def get_content_identifiers_from_stmt_ids(stmt_ids, db=None):
    """Get content identifiers for statements from a list of ids

    An identifier is a tuple (text_ref_id, source, format, text_type).
    Gets the identifier for best piece of text content with priority
    fulltext > abstract > title

    Parameters
    ----------
    stmt_ids : list of str
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping statement ids to identifiers for pieces of content.
        No entries exist for statements with no associated text content
        (these typically come from databases). Empty input yields an
        empty dict without touching the database.
    """
    # Deduplicate; the tuple is bound as the right-hand side of ``IN``.
    stmt_ids = tuple(set(stmt_ids))
    if not stmt_ids:
        # An empty tuple would render as ``IN ()``, which is not valid SQL
        # on common backends (e.g. PostgreSQL), so answer directly.
        return {}
    if db is None:
        db = get_db('primary')
    query = """SELECT
                   sub.stmt_id, tc.text_ref_id, tc.source,
                   tc.format, tc.text_type
               FROM
                   text_content tc,
                   (SELECT
                        stmt_id, text_ref_id
                    FROM
                        raw_stmt_ref_link
                    WHERE
                        stmt_id IN :stmt_ids) sub
               WHERE
                   tc.text_ref_id = sub.text_ref_id
            """
    res = db.session.execute(text(query), {'stmt_ids': stmt_ids})
    return _collect_content_identifiers(res)
def get_content_identifiers_from_pmids(pmids, db=None):
    """Get content identifiers from list of pmids

    An identifier is a tuple (text_ref_id, source, format, text_type).
    Gets the identifier for best piece of text content with priority
    fulltext > abstract > title

    Parameters
    ----------
    pmids : list of str
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    ref_dict: dict
        dict mapping pmids to identifiers for pieces of content.
        No entries exist for pmids with no associated text content.
        Empty input yields an empty dict without touching the database.
    """
    # Deduplicate; the tuple is bound as the right-hand side of ``IN``.
    pmids = tuple(set(pmids))
    if not pmids:
        # An empty tuple would render as ``IN ()``, which is not valid SQL
        # on common backends (e.g. PostgreSQL), so answer directly.
        return {}
    if db is None:
        db = get_db('primary')
    query = """SELECT
                   tr.pmid, tr.id, tc.source, tc.format, tc.text_type
               FROM
                   text_content AS tc
               JOIN
                   text_ref as tr
               ON
                   tr.id = tc.text_ref_id
               WHERE
                   tr.pmid IN :pmids
            """
    res = db.session.execute(text(query), {'pmids': pmids})
    return _collect_content_identifiers(res)
def _collect_content_identifiers(res):
    """Pick the best content identifier for each id in a query result

    Parameters
    ----------
    res : query result
        Object with a ``fetchall()`` method yielding rows of the form
        (id, text_ref_id, source, format, text_type).

    Returns
    -------
    dict
        dict mapping each id to the identifier tuple
        (text_ref_id, source, format, text_type) with the highest priority
        text_type, where fulltext > abstract > title. Among rows of equal
        priority the first one encountered is kept. Text types outside
        these three rank below all of them.
    """
    priority = {'fulltext': 2, 'abstract': 1, 'title': 0}
    ref_dict = {}
    for id_, text_ref_id, source, format_, text_type in res.fetchall():
        candidate = (text_ref_id, source, format_, text_type)
        current = ref_dict.get(id_)
        # Compare against whatever is currently chosen for this id, even if
        # it came from a different text_ref, so a lower-priority row can
        # never displace a better identifier that was already found.
        if current is None or \
                priority.get(text_type, -1) > priority.get(current[3], -1):
            ref_dict[id_] = candidate
    return ref_dict
def _get_text_content(content_identifiers, db=None):
    """Return text_content associated to a list of content identifiers

    Parameters
    ----------
    content_identifiers : iterable of tuple
        A content identifier is a tuple with four elements: text_ref_id,
        source, format, and text_type. content_identifiers is an iterable
        of these tuples; duplicates are ignored.
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None

    Returns
    -------
    dict
        A dictionary mapping content identifiers to pieces of
        text content. Content identifiers for which no content
        exists in the database are excluded as keys. An empty input
        yields an empty dict without touching the database.
    """
    # Remove duplicate identifiers
    content_identifiers = set(content_identifiers)
    if not content_identifiers:
        # With nothing to look up the generated ``VALUES`` list would be
        # empty, which is not a valid query; answer directly instead.
        return {}
    if db is None:
        db = get_db('primary')

    # Query finds content associated to each identifier by joining
    # the text_content table with a virtual table containing the
    # input identifiers. Only placeholder names are interpolated into the
    # query string; the values themselves are passed as bound parameters.
    placeholders = []
    params = {}
    for i, (trid, source,
            format_, text_type) in enumerate(content_identifiers):
        placeholders.append('(:trid%d, :source%d, :format%d, :text_type%d)'
                            % (i, i, i, i))
        params.update({'trid%d' % i: trid,
                       'source%d' % i: source,
                       'format%d' % i: format_,
                       'text_type%d' % i: text_type})
    id_str = ', '.join(placeholders)
    query = """SELECT
                   tc.text_ref_id, tc.source, tc.format, tc.text_type, content
               FROM
                   text_content AS tc
               JOIN (VALUES %s)
               AS
                   ids (text_ref_id, source, format, text_type)
               ON
                   tc.text_ref_id = ids.text_ref_id
                   AND tc.source = ids.source
                   AND tc.format = ids.format
                   AND tc.text_type = ids.text_type
            """ % id_str
    res = db.session.execute(text(query), params)
    return {(trid, source, format_, text_type): unpack(content)
            for trid, source, format_, text_type, content in res}
class TextContentSessionHandler(object):
    """Allows querying of text content from text_refs

    Doesn't directly expose the db.

    Parameters
    ----------
    db : Optional[:py:class:`DatabaseManager`]
        User has the option to pass in a database manager. If None
        the primary database is used. Default: None
    """
    def __init__(self, db=None):
        default = False
        if db is None:
            db = get_db('primary')
            default = True
        self.__db = db
        # True only when the handler resolved the primary database itself;
        # cached lookups are restricted to that case.
        self.default = default

    def close(self):
        """Roll back and close the underlying database session."""
        try:
            self.__db.session.rollback()
        finally:
            # Release the session even if the rollback itself fails.
            self.__db.session.close()

    def get_text_content_from_text_refs(self, text_refs, use_cache=True):
        """Get text_content from an evidence object's text_refs attribute

        Parameters
        ----------
        text_refs : dict of str: str
            text_refs dictionary as contained in an evidence object
            The dictionary should be keyed on id_types. The valid keys
            are 'PMID', 'PMCID', 'DOI', 'PII', 'URL', 'MANUSCRIPT_ID'.
            A 'TRID' key, if present, is used directly as the text ref id.
        use_cache : Optional[bool]
            Whether or not to use cached results. Only relevant when
            querying the primary database. Will not work if primary
            database is passed in with keyword argument. Only if
            keyword db argument is absent or set to None.
            Default: True

        Returns
        -------
        text : str
            fulltext corresponding to the text_refs if it exists in the
            database, otherwise the abstract, otherwise the title. Returns
            None if no content exists for the text_refs in the database
        """
        if self.default and use_cache:
            # dicts are unhashable; freeze the items to build a cache key.
            frozen_text_refs = frozenset(text_refs.items())
            return self._get_text_content_from_text_refs_cached(
                frozen_text_refs)
        return self._lookup_text_content(text_refs)

    # The LRUCache is created once at class definition and the key ignores
    # ``self``, so cached results are shared by every handler instance.
    @cached(cache=LRUCache(maxsize=10000),
            key=lambda self, frozen_text_refs: hashkey(frozen_text_refs))
    def _get_text_content_from_text_refs_cached(self, frozen_text_refs):
        return self._lookup_text_content(dict(frozen_text_refs))

    def _lookup_text_content(self, text_refs):
        # Resolve text_refs to a text ref id, then fetch its best content.
        text_ref_id = self._get_text_ref_id_from_text_refs(text_refs)
        if text_ref_id is None:
            return None
        return self._get_text_content_from_trid(text_ref_id)

    def _get_text_ref_id_from_text_refs(self, text_refs):
        # In some cases the TRID is already there so we can just
        # return it
        if 'TRID' in text_refs:
            return text_refs['TRID']
        # Try each id type in order and stop at the first one that resolves.
        for id_type in ['pmid', 'pmcid', 'doi',
                        'pii', 'url', 'manuscript_id']:
            # Only a missing key is an expected condition here; keep the
            # database call outside the try so its errors are not hidden.
            try:
                id_val = text_refs[id_type.upper()]
            except KeyError:
                continue
            trids = _get_trids(self.__db, id_val, id_type)
            if trids:
                return trids[0]
        return None

    def _get_text_content_from_trid(self, text_ref_id):
        texts = self.__db.select_all([self.__db.TextContent.content,
                                      self.__db.TextContent.text_type],
                                     self.__db.TextContent.text_ref_id ==
                                     text_ref_id)
        contents = defaultdict(list)
        for content, text_type in texts:
            contents[text_type].append(content)
        # Look at text types in order of priority
        for text_type in ('fulltext', 'abstract', 'title'):
            # There are cases when we get a list of results for the same
            # content type with some that are None and some actual content,
            # so we iterate to find a non-empty content to return
            for content in contents.get(text_type, []):
                if content:
                    return unpack(content)
        return None
def _extract_db_refs(stmt_json):
    """Collect the db_refs of every agent found in a statement JSON dict

    Parameters
    ----------
    stmt_json : dict
        Statement JSON. Agents are looked up under the keys 'sub', 'subj',
        'obj', 'enz', 'agent', 'gef', 'ras', 'gap', 'obj_from', 'obj_to'
        and 'members', in that order. A value may be a single agent dict
        or a list of agent dicts.

    Returns
    -------
    list of dict
        The 'db_refs' value of each agent that has one. Missing keys,
        agents without 'db_refs' and non-dict entries are skipped.
    """
    # 'gef' was previously misspelled as 'gef;', so that agent was skipped.
    agent_keys = ['sub', 'subj', 'obj', 'enz', 'agent', 'gef', 'ras',
                  'gap', 'obj_from', 'obj_to', 'members']
    db_ref_list = []
    for agent_key in agent_keys:
        entry = stmt_json.get(agent_key)
        # Normalise to a list so single agents and agent lists (such as
        # 'members') share one code path.
        candidates = entry if isinstance(entry, list) else [entry]
        for agent in candidates:
            if isinstance(agent, dict) and 'db_refs' in agent:
                db_ref_list.append(agent['db_refs'])
    return db_ref_list