# Source code for indra_db.schemas.principal_schema
"""The Principal Schema

The Principal database is the core representation of our data, the ultimate
authority on what we know. It is heavily optimized for the *input* and
maintenance of our data.
"""

__all__ = ['PrincipalSchema', 'foreign_key_map']

import logging

from sqlalchemy import Column, Integer, String, UniqueConstraint, ForeignKey, \
     Boolean, DateTime, func, BigInteger, or_, tuple_
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import BYTEA, INET, JSONB

from indra_db.schemas.mixins import IndraDBTable, IndraDBRefTable, Schema
from indra_db.schemas.indexes import StringIndex, BtreeIndex

# Pairs of table names that are linked by a foreign key. For the tables
# defined in this module the first name is the referencing table and the
# second is the table it points to (e.g. ``reading`` -> ``text_content``).
foreign_key_map = [
    ('pa_agents', 'pa_statements'),
    ('raw_unique_links', 'pa_statements'),
    ('raw_unique_links', 'raw_statements'),
    ('raw_statements', 'reading'),
    ('raw_agents', 'raw_statements'),
    ('raw_statements', 'db_info'),
    ('reading', 'text_content'),
    ('text_content', 'text_ref'),
]

logger = logging.getLogger(__name__)

class PrincipalSchema(Schema):
    """The Principal schema class organizes the table constructors.

    The tables can be divided into various groups, with a clear order of
    creation for many of them.

    **Core Tables**

    First are the core tables representing our knowledge:

    1. :func:`text_ref <text_ref>`
    2. :func:`text_content <text_content>`
    3. :func:`reading <reading>`
    4. :func:`db_info <db_info>`
    5. :func:`raw_statements <raw_statements>`
    6. :func:`raw_unique_links <raw_unique_links>`
    7. :func:`pa_statements <pa_statements>`
    8. :func:`pa_support_links <pa_support_links>`

    **Statement Attribute Tables**

    Then there are the tables that represent attributes of statements. The
    set of tables is identical for the raw statements:

    - :func:`raw_activity <raw_activity>`
    - :func:`raw_agents <raw_agents>`
    - :func:`raw_muts <raw_muts>`
    - :func:`raw_mods <raw_mods>`

    and the preassembled statements:

    - :func:`pa_activity <pa_activity>`
    - :func:`pa_agents <pa_agents>`
    - :func:`pa_muts <pa_muts>`
    - :func:`pa_mods <pa_mods>`

    **Curation Table**

    This table is where we record the curations submitted by ourselves and
    our users, which we use to improve our results.

    - :func:`curations <curations>`

    **Ancillary Tables**

    We also have several tables that we use to keep track of processing
    metadata, and some artifacts useful in that processing.

    - :func:`updates <updates>`
    - :func:`source_file <source_file>`
    - :func:`reading_updates <reading_updates>`
    - :func:`preassembly_updates <preassembly_updates>`
    - :func:`xdd_updates <xdd_updates>`
    - :func:`rejected_statements <rejected_statements>`
    - :func:`discarded_statements <discarded_statements>`
    """
[docs] def text_ref(self): """Represent a piece of text, as per its identifiers. Each piece of text will be made available in different forms through different services, most commonly abstracts through pubmed and full text through pubmed central. However they are from the same *paper*, which has various different identifiers, such as pmids, pmcids, and dois. We do our best to merge the different identifiers and for the most part each paper has exactly one text ref. Where that is not the case it is mostly impossible to automatically reconcile the different identifiers (this often has to do with inconsistent versioning of a paper and mixups over what is IDed). **Size**: medium **Basic Columns** These are the core columns representing the different IDs we use to represent a paper. - **id** ``integer PRIMARY KEY``: The primary key of the TextRef entry. Elsewhere this is often referred to as a "text ref ID" or "trid" for short. - **pmid** ``varchar(20)``: The identifier from pubmed. - **pmcid** ``varchar(20)``: The identifier from PubMed Central (e.g. "PMC12345") - **doi** ``varchar(100)``: The ideally universal identifier. - **pii** ``varchar(250)``: The identifier used by Springer. - **url** ``varchar UNIQUE``: For sources found exclusively online (e.g. wikipedia) use their URL. - **manuscript_id** ``varchar(100) UNIQUE``: The ID assigned documents given to PMC author manuscripts. **Metadata Columns** In addition we also track some basic metadata about the entry and updates to the data in the table. - **create_date** ``timestamp without time zone``: The date the record was added. - **last_updated** ``timestamp without time zone``: The most recent time the record was edited. - **pub_year** ``integer``: The year the article was published, based on the first report we find (in order of PubMed, PMC, then PMC Manuscripts). **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. 
- **pmid-doi**: ``UNIQUE(pmid, doi)`` - **pmid-pmcid**: ``UNIQUE(pmid, pmcid)`` - **pmcid-doi**: ``UNIQUE(pmcid, doi)`` **Lookup Columns** Some columns are hard to look up when they are in their native string format, so they are processed and broken down into integer parts, as far as possible. - **pmid_num** ``integer``: the int-ified pmid, faster for lookup. - **pmcid_num** ``integer``: the int-portion of the PMCID, so "PMC12345" would here be 12345. - **pmcid_version** ``integer``: although rarely used, occasionally a PMC ID will have a version, indicated by a dot, e.g. PMC12345.3, in which case the "3" would be stored in this column. - **doi_ns** ``integer``: The DOI system works by assigning organizations (such as a journal) namespace IDs, and that organization is then responsible for maintaining a unique ID system internally. These namespaces are always numbers, and are stored here as such. - **doi_id** ``varchar``: The custom ID given by the publishing organization. """ class TextRef(self.base, IndraDBTable, IndraDBRefTable): """The class representing the TextRef table. The preferred constructor for this class is the ``new`` classmethod. This is because many of the IDs are processed in addition to the standard SQLAlchemy handling. To modify a column value, please use the ``update`` method, again to ensure that IDs are properly processed. The best way to construct a clause with a :py:class:`TextRef` is to use the ``has_ref`` method, inherited from :py:class:`IndraDBRefTable`, or else one of ``pmid_in``, ``pmcid_in``, or ``doi_in`` for those specific IDs. A dictionary of references can be easily generated using the ``get_ref_dict`` method. 
""" __tablename__ = 'text_ref' _ref_cols = ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id'] _always_disp = ['id', 'pmid', 'pmcid'] _indices = [StringIndex('text_ref_pmid_idx', 'pmid'), StringIndex('text_ref_pmcid_idx', 'pmcid'), BtreeIndex('text_ref_pmid_num_idx', 'pmid_num'), BtreeIndex('text_ref_pmcid_num_idx', 'pmcid_num'), BtreeIndex('text_ref_doi_ns_idx', 'doi_ns'), BtreeIndex('text_ref_doi_id_idx', 'doi_id'), BtreeIndex('text_ref_pub_year', 'pub_year'), StringIndex('text_ref_doi_idx', 'doi')] id = Column(Integer, primary_key=True) pmid = Column(String(20)) pmid_num = Column(Integer) pmcid = Column(String(20)) pmcid_num = Column(Integer) pmcid_version = Column(Integer) doi = Column(String(100)) doi_ns = Column(Integer) doi_id = Column(String) pub_year = Column(Integer) pii = Column(String(250)) url = Column(String, unique=True) manuscript_id = Column(String(100), unique=True) create_date = Column(DateTime, last_updated = Column(DateTime, __table_args__ = ( UniqueConstraint('pmid', 'doi', name='pmid-doi'), UniqueConstraint('pmid', 'pmcid', name='pmid-pmcid'), UniqueConstraint('pmcid', 'doi', name='pmcid-doi') ) def __repr__(self): terms = [f'id={}'] for col in ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id']: if getattr(self, col) is not None: terms.append(f'{col}={getattr(self, col)}') if len(terms) > 2: break return f'{self.__class__.__name__}({", ".join(terms)})' @classmethod def new(cls, pmid=None, pmcid=None, doi=None, pii=None, url=None, manuscript_id=None): """The preferred TextRef constructor: create a new TextRef.""" pmid, pmid_num = cls.process_pmid(pmid) pmcid, pmcid_num, pmcid_version = cls.process_pmcid(pmcid) doi, doi_ns, doi_id = cls.process_doi(doi) return cls(pmid=pmid, pmid_num=pmid_num, pmcid=pmcid, pmcid_num=pmcid_num, pmcid_version=pmcid_version, doi=doi, doi_ns=doi_ns, doi_id=doi_id, pii=pii, url=url, manuscript_id=manuscript_id) def update(self, **updates): """Update the value of an ID, processing it as necessary.""" for 
id_type, id_val in updates.items(): if not hasattr(self, id_type): raise ValueError(f"Invalid ID type: {id_type}") if id_type == 'pmid': self.pmid, self.pmid_num = self.process_pmid(id_val) elif id_type == 'pmcid': self.pmcid, self.pmcid_num, self.pmcid_version = \ self.process_pmcid(id_val) elif id_type == 'doi': self.doi, self.doi_ns, self.doi_id = \ self.process_doi(id_val) else: setattr(self, id_type, id_val) return return TextRef
[docs] def mesh_ref_annotations(self): """Represent the MeSH annotations of papers provided by PubMed. Each abstract/entry in PubMed is accompanied by human-curated MeSH IDs indicating the topics of the paper. Each paper will have many IDs in general, so a separate table is used, liked to the :func:`text_ref <text_ref>` table by an un-constrained PMID. This make insertion of the data easier because the custom TRIDs need not be retrieved to dump the mesh refs. **Size**: large **Columns** - **id** ``integer PRIMARY KEY``: The primary database-assigned ID of the row. - **pmid_num** ``integer NOT NULL``: The int-ified pmid that is used to link entries in this table with those in the :func:`text_ref <text_ref>` table. - **mesh_num** ```integer NOT NULL``: The intified MeSH ID (with the prefix removed). The **is_concept** column indicates whether the prefix was D (False) or C (True). - **qual_num** ``integer``: The qualifier number that is sometimes included with the annotation (Prefix Q). - **major_topic** ``boolean DEFAULT false``: The major topic flag indicates whether the ID describes a primary purpose of the paper. - **is_concept** ``boolean DEFAUL false``: Indicate whether the prefix was C (true) or D (false). **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. 
- **mesh-uniqueness**: ``UNIQUE(pmid_num, mesh_num, qual_num, is_concept)`` """ class MeshRefAnnotations(self.base, IndraDBTable): __tablename__ = 'mesh_ref_annotations' _always_disp = ['pmid_num', 'mesh_num', 'qual_num'] _indices = [BtreeIndex('mesh_ref_annotations_pmid_idx', 'pmid_num'), BtreeIndex('mesh_ref_annotations_mesh_id_idx', 'mesh_num'), BtreeIndex('mesh_ref_annotations_qual_id_idx', 'qual_num')] id = Column(Integer, primary_key=True) pmid_num = Column(Integer, nullable=False) mesh_num = Column(Integer, nullable=False) qual_num = Column(Integer) major_topic = Column(Boolean, default=False) is_concept = Column(Boolean, default=False) __table_args__ = ( UniqueConstraint('pmid_num', 'mesh_num', 'qual_num', 'is_concept', name='mesh-uniqueness'), ) return MeshRefAnnotations
[docs] def mti_ref_annotaions_test(self): """Represent the MeSH annotations of abstracts as inferred by MTI. MTI is a machine learned model that attempts to predict MeSH annotations on new un-annotated abstracts after training on the existing annotations. **Size**: medium **Columns** - **id** ``integer PRIMARY KEY``: The primary database-assigned ID of the row. - **pmid_num** ``integer NOT NULL``: The int-ified pmid that is used to link entries in this table with those in the :func:`text_ref <text_ref>` table. - **mesh_num** ```integer NOT NULL``: The intified MeSH ID (with the prefix removed). The **is_concept** column indicates whether the prefix was D (False) or C (True). - **qual_num** ``integer``: The qualifier number that is sometimes included with the annotation (Prefix Q). - **major_topic** ``boolean DEFAULT false``: The major topic flag indicates whether the ID describes a primary purpose of the paper. - **is_concept** ``boolean DEFAUL false``: Indicate whether the prefix was C (true) or D (false). **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. - **mesh-uniqueness**: ``UNIQUE(pmid_num, mesh_num, qual_num, is_concept)`` """ class MtiRefAnnotationsTest(self.base, IndraDBTable): __tablename__ = 'mti_ref_annotations_test' _always_disp = ['pmid_num', 'mesh_num', 'qual_num'] _indices = [BtreeIndex('mti_ref_annotations_test_pmid_idx', 'pmid_num'), BtreeIndex('mti_ref_annotations_test_mesh_id_idx', 'mesh_num'), BtreeIndex('mti_ref_annotations_test_qual_id_idx', 'qual_num')] id = Column(Integer, primary_key=True) pmid_num = Column(Integer, nullable=False) mesh_num = Column(Integer, nullable=False) qual_num = Column(Integer) major_topic = Column(Boolean, default=False) is_concept = Column(Boolean, default=False) __table_args__ = ( UniqueConstraint('pmid_num', 'mesh_num', 'qual_num', 'is_concept', name='mti-uniqueness'), ) return MtiRefAnnotationsTest
[docs] def text_content(self): """Represent the content of a text retrieved from a particular source. For each paper as a logical entity, there are many places where you can acquire the actual article or parts of it. For example you can get an abstract from PubMed for most content, and for a minority subset you can get full text from PubMed Central, either their Open-Access corpus or their author's Manuscripts. Both the text itself and the metadata for the source of the text are represented in this table. **Size**: large **Basic Columns** - **id** ``integer PRIMARY KEY``: The auto-generated primary key of the table. These are elsewhere called Text Content IDs, or TCIDs. - **text_ref_id** ``integer NOT NULL``: A foreign-key constrained reference to the appropriate entry in the :func:`text_ref <text_ref>` table. - **source** ``varchar(250) NOT NULL``: The name of the source, e.g. "pubmed" or "pmc_oa". The list of content names can be found in the class attributes in content managers. - **format** ``varchar(250) NOT NULL``: The file format of the content, e.g. "XML" or "TEXT". - **text_type** ``varchar(250) NOT NULL``: The type of the text, e.g. "abstract" of "fulltext". - **preprint** ``boolean``: Indicate whether the content is from a preprint. - **license** [``varchar``]: Record the license that applies to the content. - **content** ``bytea``: The raw compressed bytes of the content. **Metadata Columns** - **insert_data** ``timestamp without time zone``: The date the record was added. - **last_updated** ``timestamp without time zone``: The most recent time the record was edited. **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. 
- **content-uniqueness**: ``UNIQUE(text_ref_id, source, format, text_type)`` """ class TextContent(self.base, IndraDBTable): __tablename__ = 'text_content' _skip_disp = ['content'] _always_disp = ['id', 'text_ref_id', 'source', 'format', 'text_type'] id = Column(Integer, primary_key=True) text_ref_id = Column(Integer, ForeignKey(''), nullable=False) text_ref = relationship(self.table_dict['text_ref']) source = Column(String(250), nullable=False) format = Column(String(250), nullable=False) text_type = Column(String(250), nullable=False) content = Column(BYTEA) insert_date = Column(DateTime, last_updated = Column(DateTime, preprint = Column(Boolean) license = Column(String) __table_args__ = ( UniqueConstraint('text_ref_id', 'source', 'format', 'text_type', name='content-uniqueness'), ) return TextContent
[docs] def reading(self): """Represent a reading of a piece of text. We have multiple readers and of course many thousands of pieces of text content. Each entry in this table applies to a given reader applied to a given pieces of content. As such, the primary ID is a hash constructed from the text content ID prepended with integers that are assigned to each reader-reader version pair. The function :func:`generate_reading_id <indra_db.reading.read_db.generate_reading_id>` implements the particular process used. The reader numbers are assigned in the :data:`readers <indra_db.databases.readers>` global, and the reader version number is the index of the version listed for the given reader in the :data:`reader_versions <indra_db.databases.reader_versions>` dictionary in the same module. **Size**: very large **Basic Columns** - **id** ``bigint PRIMARY KEY``: A hash ID constructed from a reader number, reader version number, and the text content ID of the content that was read. - **text_content_id** ``integer NOT NULL``: A foreign-key constrained reference to the appropriate entry in the :func:`text_content <text_content>` table. - **batch_id** ``integer NOT NULL``: A simple random integer (not unique) that is assigned each batch of inserted readings. It is used in the moments after the insert to easily retrieve the content that was just added, potentially plus some extra. - **reader** ``varchar(20) NOT NULL``: The name of the reader, e.g. "REACH" or "SPARSER". - **reader_version** ``varchar(20) NOT NULL``: The version of the reader, which may be any arbitrary string in principle. This allows each reader to define its own versioning scheme. - **format** ``varchar(20) NOT NULL``: The file format of the reading result, e.g. "XML" or "JSON". - **bytes** ``bytea``: The raw compressed bytes of the reading result. **Metadata Columns** - **create_date** ``timestamp without time zone``: The date the record was added. 
- **last_updated** ``timestamp without time zone``: The most recent time the record was edited. **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. - **reading-uniqeness**: ``UNIQUE(text_content_id, reader, reader_version)`` """ class Reading(self.base, IndraDBTable): __tablename__ = 'reading' _skip_disp = ['bytes'] _always_disp = ['id', 'text_content_id', 'reader', 'reader_version'] id = Column(BigInteger, primary_key=True, default=None) text_content_id = Column(Integer, ForeignKey(''), nullable=False) batch_id = Column(Integer, nullable=False) text_content = relationship(self.table_dict['text_content']) reader = Column(String(20), nullable=False) reader_version = Column(String(20), nullable=False) format = Column(String(20), nullable=False) # xml, json, etc. bytes = Column(BYTEA) create_date = Column(DateTime, last_updated = Column(DateTime, __table_args__ = ( UniqueConstraint('text_content_id', 'reader', 'reader_version', name='reading-uniqueness'), ) return Reading
[docs] def db_info(self): """Represent the provenance and metadata for an external knowledge base. INDRA DB takes content not just from our own readings but also merges that with many pre-existing knowledge bases, many of them human curated. These knowledge bases are defined and managed by classes contained in :mod:`knowledgebase_manager <indra_db.managers.knowledgebase_manager>`. No real data is contained in this column, simply records of which knowledge bases have been added, updated, and when. **Size**: very small **Basic Columns** - **id** ``integer PRIMARY KEY``: A database-assigned integer unique ID for each database entry. These are elsewhere referred to as `db_info_ids` or `dbids`. - **db_name** ``varchar NOT NULL``: A short lowercase string that is used internally to identify the knowledge base, e.g. "pc" for Pathway Commons. - **db_full_name** ``varchar NOT NULL``: The full name of the knowledge base, neatly formatted, e.g. "Pathway Commons". - **source_api** ``varchar NOT NULL``: The indra source API that was used to extract Statements from the knowledge base, e.g. "biopax". **Metadata Columns** - **create_date** ``timestamp without time zone``: The date the record was added. - **last_updated** ``timestamp without time zone``: The most recent time the record was edited. """ class DBInfo(self.base, IndraDBTable): __tablename__ = 'db_info' _always_disp = ['id', 'db_name', 'source_api'] id = Column(Integer, primary_key=True) db_name = Column(String, nullable=False) db_full_name = Column(String, nullable=False) source_api = Column(String, nullable=False) create_date = Column(DateTime, last_updated = Column(DateTime, return DBInfo
[docs] def raw_statements(self): """Represent Statements exactly as extracted by their source apis. INDRA Defines several source APIs for different file types from which we can extract INDRA Statements. The goal of these APIs is primarily to accurately convey the contents of the files, and minimal fixes are made at this stage (e.g. grounding is saved for preassembly). Thus this table contains statements that are considered "messy" in two key ways: - they have a lot of repetition of information, and - they have whatever grounding the original source gave them. However these Statements also have the Evidence object JSON contained in their **json** column, and this Evidence information is **NOT** copied into the :func:`pa_statements <pa_statements>` table, which allows for a flexible incremental updates. A "lateral join" on this table can be used to get the first N evidence associated with each PA Statement. **Size**: very large **Basic Columns** - **id** ``integer PRIMARY KEY``: A database-assigned integer unique ID for each database entry. These are elsewhere referred to as "Statement ID"s, or "sid"s. - **uuid** ``varchar UNIQUE NOT NULL``: A UUID generated when a Statement object is first created. This can be used for tracking particular objects through the code. - **batch_id** ``integer NOT NULL``: A simple random integer (not unique) that is assigned each batch of inserted Statements. It is used in the moments after the insert to easily retrieve the content that was just added, potentially plus some extra. - **mk_hash** ``bigint NOT NULL``: A hash of the ``matches_key`` of a Statement. This should be unique for any statement containing the same information. - **text_hash** ``bigint``: A hash of a the evidence text, used to detect exact duplicate Statements (same information from the same exact source, right down to the text) that sometimes occur due to bugs - **source_hash** ``bigint NOT NULL``: A hash of the source information. 
- **db_info_id** ``integer``: A foreign key into the :func:`db_info <db_info>` table, for those statements that come from knowledge bases. - **reading_id** ``bigint``: A foreign key into the :func:`reading <reading>` table, for those statements that come from a reading. - **type** ``varchar(100) NOT NULL``: The type of the Statement, e.g. "Phosphorylation". - **indra_version** ``varchar(100) NOT NULL``: The version of INDRA that was used to generate this Statement, specifically as returned by ``indra.util.get_version.get_version()``. - **json** ``bytea NOT NULL``: The bytes of the Statement JSON (including exactly **one** Evidence JSON) **Metadata Columns** - **create_date** ``timestamp without time zone``: The date the Statement was added. **Constraints** Postgres is extremely efficient at detecting conflicts, and we use this to help ensure our entries do not have any duplicates. - **reading_raw_statement_uniqueness**: ``UNIQUE(mk_hash, text_hash, reading_id)`` - **db_info_raw_statement_uniqueness**: ``UNIQUE(mk_hash, source_hash, db_info_id)`` """ class RawStatements(self.base, IndraDBTable): __tablename__ = 'raw_statements' _skip_disp = ['json'] _always_disp = ['id', 'db_info_id', 'reading_id', 'type'] id = Column(Integer, primary_key=True) uuid = Column(String(40), unique=True, nullable=False) batch_id = Column(Integer, nullable=False) mk_hash = Column(BigInteger, nullable=False) text_hash = Column(BigInteger) source_hash = Column(BigInteger, nullable=False) db_info_id = Column(Integer, ForeignKey('')) db_info = relationship(self.table_dict['db_info']) reading_id = Column(BigInteger, ForeignKey('')) reading = relationship(self.table_dict['reading']) type = Column(String(100), nullable=False) indra_version = Column(String(100), nullable=False) json = Column(BYTEA, nullable=False) create_date = Column(DateTime, __table_args__ = ( UniqueConstraint('mk_hash', 'text_hash', 'reading_id', name='reading_raw_statement_uniqueness'), UniqueConstraint('mk_hash', 
'source_hash', 'db_info_id', name='db_info_raw_statement_uniqueness'), ) return RawStatements
[docs] def raw_activity(self): """Represent the activity of a raw statement (an ActiveForm).""" class RawActivity(self.base, IndraDBTable): __tablename__ = 'raw_activity' _always_disp = ['stmt_id', 'activity', 'is_active'] id = Column(Integer, primary_key=True) stmt_id = Column(Integer, ForeignKey(''), nullable=False) statements = relationship(self.table_dict['raw_statements']) activity = Column(String) is_active = Column(Boolean) return RawActivity
[docs] def raw_agents(self): """Represent an identifier for an agent of a raw statement.""" class RawAgents(self.base, IndraDBTable): __tablename__ = 'raw_agents' _always_disp = ['stmt_id', 'db_name', 'db_id', 'ag_num'] id = Column(Integer, primary_key=True) stmt_id = Column(Integer, ForeignKey(''), nullable=False) statements = relationship(self.table_dict['raw_statements']) db_name = Column(String(40), nullable=False) db_id = Column(String, nullable=False) ag_num = Column(Integer, nullable=False) role = Column(String(20), nullable=False) return RawAgents
[docs] def raw_mods(self): """Represent a modification of an agent of a raw statement.""" class RawMods(self.base, IndraDBTable): __tablename__ = 'raw_mods' _always_disp = ['stmt_id', 'type', 'position', 'residue', 'modified', 'ag_num'] id = Column(Integer, primary_key=True) stmt_id = Column(Integer, ForeignKey(''), nullable=False) statements = relationship(self.table_dict['raw_statements']) type = Column(String, nullable=False) position = Column(String(10)) residue = Column(String(5)) modified = Column(Boolean) ag_num = Column(Integer, nullable=False) return RawMods
[docs] def raw_muts(self): """Represent a mutation of an agent of a raw statement.""" class RawMuts(self.base, IndraDBTable): __tablename__ = 'raw_muts' _always_disp = ['stmt_id', 'position', 'residue_from', 'residue_to', 'ag_num'] id = Column(Integer, primary_key=True) stmt_id = Column(Integer, ForeignKey(''), nullable=False) statements = relationship(self.table_dict['raw_statements']) position = Column(String(10)) residue_from = Column(String(5)) residue_to = Column(String(5)) ag_num = Column(Integer, nullable=False) return RawMuts
[docs] def pa_statements(self): """Represent preassembled statements. Preassmebled Statements are generated from Raw Statements using INDRA's preassembly tools. Specifically: - agents are grounded, - agent groundings are disambiguated (using adeft), - sites are fixed (using protmapper), - and finally, repeated information is consolidated, for example ``Phosphorylation(MEK(), ERK())`` is represented only once in this corpus, with links to the many instances that information was extracted, which are stored in the :func:`raw_statements <raw_statements>` table. Each entry is linked back to the (in general multiple) raw statements it was derived from in the :func:`raw_unique_links <raw_unique_links>` table. **Size**: medium large **Basic Columns** - **mk_hash** ``bigint PRIMARY KEY``: a hash of the statement matches key, which is unique for the _knowledge_ of the Statement. - **matches_key** ``varchar NOT NULL``: The matches-key that was hashed. - **uuid** ``varchar UNIQUE NOT NULL``: A UUID generated when a Statement object is first created. This can be used for tracking particular objects through the code. The UUID is distinct from any of the raw statement UUIDs that compose this Statement. - **type** ``varchar(100) NOT NULL``: The type of the Statement, e.g. "Phosphorylation". - **indra_version** ``varchar(100) NOT NULL``: The version of INDRA that was used to generate this Statement, specifically as returned by ``indra.util.get_version.get_version()``. - **json** ``bytea NOT NULL``: The bytes of the Statement JSON (including exactly **one** Evidence JSON) **Metadata Columns** - **create_date** ``timestamp without time zone``: The date the Statement was added. 
""" class PAStatements(self.base, IndraDBTable): __tablename__ = 'pa_statements' _skip_disp = ['json'] _always_disp = ['mk_hash', 'type'] _default_order_by = 'create_date' mk_hash = Column(BigInteger, primary_key=True) matches_key = Column(String, nullable=False) uuid = Column(String(40), unique=True, nullable=False) type = Column(String(100), nullable=False) indra_version = Column(String(100), nullable=False) json = Column(BYTEA, nullable=False) create_date = Column(DateTime, return PAStatements
[docs] def pa_activity(self): """Represent the activity of a preassembled Statement.""" class PAActivity(self.base, IndraDBTable): __tablename__ = 'pa_activity' __always_disp__ = ['stmt_mk_hash', 'activity', 'is_active'] id = Column(Integer, primary_key=True) stmt_mk_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'), nullable=False) statements = relationship(self.table_dict['pa_statements']) activity = Column(String) is_active = Column(Boolean) return PAActivity
[docs] def pa_agents(self): """Represent an identifier for an agent of a preassembled statement.""" class PAAgents(self.base, IndraDBTable): __tablename__ = 'pa_agents' _always_disp = ['stmt_mk_hash', 'db_name', 'db_id', 'ag_num'] id = Column(Integer, primary_key=True) stmt_mk_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'), nullable=False) statements = relationship(self.table_dict['pa_statements']) db_name = Column(String(40), nullable=False) db_id = Column(String, nullable=False) role = Column(String(20), nullable=False) ag_num = Column(Integer, nullable=False) agent_ref_hash = Column(BigInteger, unique=True, nullable=False) return PAAgents
[docs] def pa_mods(self): """Represent a modification of an agent of a preassembled statement.""" class PAMods(self.base, IndraDBTable): __tablename__ = 'pa_mods' _always_disp = ['stmt_mk_hash', 'type', 'position', 'residue', 'modified', 'ag_num'] id = Column(Integer, primary_key=True) stmt_mk_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'), nullable=False) statements = relationship(self.table_dict['pa_statements']) type = Column(String, nullable=False) position = Column(String(10)) residue = Column(String(5)) modified = Column(Boolean) ag_num = Column(Integer, nullable=False) return PAMods
[docs] def pa_muts(self): """Represent a mutation of an agent of a preassembled statement.""" class PAMuts(self.base, IndraDBTable): __tablename__ = 'pa_muts' _always_disp = ['stmt_mk_hash', 'position', 'residue_from', 'residue_to', 'ag_num'] id = Column(Integer, primary_key=True) stmt_mk_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'), nullable=False) statements = relationship(self.table_dict['pa_statements']) position = Column(String(10)) residue_from = Column(String(5)) residue_to = Column(String(5)) ag_num = Column(Integer, nullable=False) return PAMuts
def curations(self):
    """Represent the curations of our content.

    At various points in our APIs and UIs it is possible to curate the
    content we have extracted, recording whether it is an accurate
    extraction from the source text, and if not the reason why.

    **Size**: small

    **Basic Columns**

    - **id** ``integer PRIMARY KEY``: A database-assigned integer unique ID
      for each database entry.
    - **pa_hash** ``bigint REFERENCES pa_statements(mk_hash)``: A reference
      into the :func:`pa_statements <pa_statements>` table to the pa
      statement whose evidence was curated.
    - **source_hash** ``bigint``: A hash that represents the source of this
      Statement (e.g. reader and piece of content).
    - **tag** ``varchar``: A text code indicating the type of error
      curated. The domain of these strings is regulated in code elsewhere.
    - **text** ``varchar``: A free-form text description by the curator of
      what they think went wrong (or right).
    - **curator** ``varchar NOT NULL``: The identity of the curator. This
      has elsewhere been standardized to be their email.
    - **auth_id** ``integer``: [deprecated]
    - **source** ``varchar``: A string indicating where this curation
      originated, e.g. "DB REST API" for the INDRA Database REST service.
    - **ip** ``inet``: The IP address from which the curation was
      submitted.
    - **date** ``timestamp without time zone``: The date the curation was
      added.
    - **pa_json** ``jsonb``: the preassembled Statement JSON that was
      curated.
    - **ev_json** ``jsonb``: the Evidence JSON that was curated (including
      the text).

    Returns
    -------
    Curation
        The table class for ``curation``, built on ``self.base``. It also
        provides a ``to_json`` method.
    """
    class Curation(self.base, IndraDBTable):
        __tablename__ = 'curation'
        _always_disp = ['pa_hash', 'source_hash', 'tag', 'curator', 'date']

        id = Column(Integer, primary_key=True)
        pa_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'))
        pa_statements = relationship(self.table_dict['pa_statements'])
        source_hash = Column(BigInteger)
        tag = Column(String)
        text = Column(String)
        curator = Column(String, nullable=False)
        auth_id = Column(Integer)
        source = Column(String)
        ip = Column(INET)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        date = Column(DateTime, default=func.now())
        pa_json = Column(JSONB)
        ev_json = Column(JSONB)

        def to_json(self):
            """Return a dict of this row's main attributes.

            ``auth_id`` and ``ip`` are not included in the result.
            """
            return {attr: getattr(self, attr)
                    for attr in ['id', 'pa_hash', 'source_hash', 'tag',
                                 'text', 'date', 'curator', 'source',
                                 'pa_json', 'ev_json']}

    return Curation
def source_file(self):
    """Record the pubmed source file that was processed.

    Returns
    -------
    SourceFile
        The table class for ``source_file``, built on ``self.base``.
    """
    class SourceFile(self.base, IndraDBTable):
        __tablename__ = 'source_file'
        _always_disp = ['source', 'name']

        id = Column(Integer, primary_key=True)
        source = Column(String(250), nullable=False)
        name = Column(String(250), nullable=False)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        load_date = Column(DateTime, default=func.now())

        # A given (source, name) pair may only be recorded once.
        __table_args__ = (
            UniqueConstraint('source', 'name', name='source-name'),
        )

    return SourceFile
def updates(self):
    """Record when text ref and content updates were performed.

    Returns
    -------
    Updates
        The table class for ``updates``, built on ``self.base``.
    """
    class Updates(self.base, IndraDBTable):
        __tablename__ = 'updates'
        _skip_disp = ['unresolved_conflicts_file']
        _always_disp = ['source', 'datetime']

        id = Column(Integer, primary_key=True)
        init_upload = Column(Boolean, nullable=False)
        source = Column(String(250), nullable=False)
        unresolved_conflicts_file = Column(BYTEA)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        datetime = Column(DateTime, default=func.now())

    return Updates
def reading_updates(self):
    """Record runs of the readers on the content we have found.

    Returns
    -------
    ReadingUpdates
        The table class for ``reading_updates``, built on ``self.base``.
    """
    class ReadingUpdates(self.base, IndraDBTable):
        __tablename__ = 'reading_updates'
        _always_disp = ['reader', 'reader_version', 'run_datetime']

        id = Column(Integer, primary_key=True)
        complete_read = Column(Boolean, nullable=False)
        reader = Column(String(250), nullable=False)
        reader_version = Column(String(250), nullable=False)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        run_datetime = Column(DateTime, default=func.now())
        earliest_datetime = Column(DateTime)
        latest_datetime = Column(DateTime, nullable=False)

    return ReadingUpdates
def xdd_updates(self):
    """Record the times we process dumps from xDD.

    Returns
    -------
    XddUpdates
        The table class for ``xdd_updates``, built on ``self.base``.
    """
    class XddUpdates(self.base, IndraDBTable):
        __tablename__ = 'xdd_updates'
        _always_disp = ['day_str']

        id = Column(Integer, primary_key=True)
        reader_versions = Column(JSONB)
        indra_version = Column(String)
        # The unique constraint allows at most one row per day string.
        day_str = Column(String, nullable=False, unique=True)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        processed_date = Column(DateTime, default=func.now())

    return XddUpdates
def rejected_statements(self):
    """Represent raw statements that were rejected.

    The ``db_info`` and ``reading`` tables must already be present in
    ``self.table_dict``, because they are looked up to build the
    relationships.

    Returns
    -------
    RejectedStatements
        The table class for ``rejected_statements``, built on
        ``self.base``.
    """
    class RejectedStatements(self.base, IndraDBTable):
        __tablename__ = 'rejected_statements'
        _skip_disp = ['json']
        _always_disp = ['id', 'db_info_id', 'reading_id', 'type']

        id = Column(Integer, primary_key=True)
        uuid = Column(String(40), unique=True, nullable=False)
        batch_id = Column(Integer, nullable=False)
        mk_hash = Column(BigInteger, nullable=False)
        text_hash = Column(BigInteger)
        source_hash = Column(BigInteger, nullable=False)

        # NOTE(review): both foreign key targets were empty strings in this
        # copy of the file. They are restored from the relationship targets
        # on the following lines, assuming each table's key column is
        # ``id`` -- confirm against the upstream module.
        db_info_id = Column(Integer, ForeignKey('db_info.id'))
        db_info = relationship(self.table_dict['db_info'])
        reading_id = Column(BigInteger, ForeignKey('reading.id'))
        reading = relationship(self.table_dict['reading'])

        type = Column(String(100), nullable=False)
        indra_version = Column(String(100), nullable=False)
        json = Column(BYTEA, nullable=False)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        create_date = Column(DateTime, default=func.now())

    return RejectedStatements
def discarded_statements(self):
    """Record the reasons for which some statements were discarded.

    Returns
    -------
    DiscardedStatements
        The table class for ``discarded_statements``, built on
        ``self.base``.
    """
    class DiscardedStatements(self.base, IndraDBTable):
        __tablename__ = 'discarded_statements'
        _always_disp = ['stmt_id', 'reason']

        id = Column(Integer, primary_key=True)
        # NOTE(review): the foreign key target was an empty string in this
        # copy of the file. 'raw_statements.id' is presumed to be the
        # intended target -- confirm against the upstream module.
        stmt_id = Column(Integer, ForeignKey('raw_statements.id'),
                         nullable=False)
        reason = Column(String, nullable=False)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        insert_date = Column(DateTime, default=func.now())

    return DiscardedStatements
def preassembly_updates(self):
    """Record updates of the preassembled corpus.

    Returns
    -------
    PreassemblyUpdates
        The table class for ``preassembly_updates``, built on
        ``self.base``.
    """
    class PreassemblyUpdates(self.base, IndraDBTable):
        __tablename__ = 'preassembly_updates'
        _always_disp = ['corpus_init', 'run_datetime']

        id = Column(Integer, primary_key=True)
        corpus_init = Column(Boolean, nullable=False)
        # NOTE(review): the argument after DateTime was truncated in this
        # copy of the file; default=func.now() is the presumed lost
        # argument -- confirm against the upstream module.
        run_datetime = Column(DateTime, default=func.now())
        stmt_type = Column(String)

    return PreassemblyUpdates