Source code for indra_db.schemas.principal_schema

"""
The Principal Schema
====================

The Principal database is the core representation of our data, the ultimate
authority on what we know. It is heavily optimized for the *input* and
maintenance of our data.
"""

__all__ = ['PrincipalSchema', 'foreign_key_map']

import logging

from sqlalchemy import Column, Integer, String, UniqueConstraint, ForeignKey, \
     Boolean, DateTime, func, BigInteger, or_, tuple_
from sqlalchemy.orm import relationship
from sqlalchemy.dialects.postgresql import BYTEA, INET, JSONB

from indra_db.schemas.mixins import IndraDBTable, IndraDBRefTable, Schema
from indra_db.schemas.indexes import StringIndex, BtreeIndex

foreign_key_map = [
    ('pa_agents', 'pa_statements'),
    ('raw_unique_links', 'pa_statements'),
    ('raw_unique_links', 'raw_statements'),
    ('raw_statements', 'reading'),
    ('raw_agents', 'raw_statements'),
    ('raw_statements', 'db_info'),
    ('reading', 'text_content'),
    ('text_content', 'text_ref')
]

logger = logging.getLogger(__name__)


[docs]class PrincipalSchema(Schema):
    """The Principal schema class organizes the table constructors.

    The tables can be divided into various groups, with a clear order of
    creation for many of them.

    **Core Tables**

    First are the core tables representing our knowledge:

      1. :func:`text_ref <text_ref>`
      2. :func:`text_content <text_content>`
      3. :func:`reading <reading>`
      4. :func:`db_info <db_info>`
      5. :func:`raw_statements <raw_statements>`
      6. :func:`raw_unique_links <raw_unique_links>`
      7. :func:`pa_statements <pa_statements>`
      8. :func:`pa_support_links <pa_support_links>`

    **Statement Attribute Tables**

    Then there are the tables that represent attributes of statements. The set
    of tables is identical for the raw statements:

      - :func:`raw_activity <raw_activity>`
      - :func:`raw_agents <raw_agents>`
      - :func:`raw_muts <raw_muts>`
      - :func:`raw_mods <raw_mods>`

    and the preassembled statements:

      - :func:`pa_activity <pa_activity>`
      - :func:`pa_agents <pa_agents>`
      - :func:`pa_muts <pa_muts>`
      - :func:`pa_mods <pa_mods>`

    **Curation Table**

    This table is where we record the curations submitted by ourselves and our
    users, which we use to improve our results.

      - :func:`curations <curations>`

    **Ancillary Tables**

    We also have several tables that we use to keep track of processing
    metadata, and some artifacts useful in that processing.

      - :func:`updates <updates>`
      - :func:`source_file <source_file>`
      - :func:`reading_updates <reading_updates>`
      - :func:`preassembly_updates <preassembly_updates>`
      - :func:`xdd_updates <xdd_updates>`
      - :func:`rejected_statements <rejected_statements>`
      - :func:`discarded_statements <discarded_statements>`
    """

[docs]    def text_ref(self):
        """Represent a piece of text, as per its identifiers.

        Each piece of text will be made available in different forms through
        different services, most commonly abstracts through pubmed and full text
        through pubmed central. However they are from the same *paper*, which
        has various different identifiers, such as pmids, pmcids, and dois.

        We do our best to merge the different identifiers and for the most part
        each paper has exactly one text ref. Where that is not the case it is
        mostly impossible to automatically reconcile the different identifiers
        (this often has to do with inconsistent versioning of a paper and mixups
        over what is IDed).

        **Size**: medium

        **Basic Columns**

        These are the core columns representing the different IDs we use to
        represent a paper.

        - **id** ``integer PRIMARY KEY``: The primary key of the TextRef
          entry. Elsewhere this is often referred to as a "text ref ID" or
          "trid" for short.
        - **pmid** ``varchar(20)``: The identifier from pubmed.
        - **pmcid** ``varchar(20)``: The identifier from PubMed Central (e.g.
          "PMC12345")
        - **doi** ``varchar(100)``: The ideally universal identifier.
        - **pii** ``varchar(250)``: The identifier used by Springer.
        - **url** ``varchar UNIQUE``: For sources found exclusively online
          (e.g. wikipedia) use their URL.
        - **manuscript_id** ``varchar(100) UNIQUE``: The ID assigned documents
          given to PMC author manuscripts.

        **Metadata Columns**

        In addition we also track some basic metadata about the entry and
        updates to the data in the table.

        - **create_date** ``timestamp without time zone``: The date the record
          was added.
        - **last_updated** ``timestamp without time zone``: The most recent
          time the record was edited.
        - **pub_year** ``integer``: The year the article was published, based
          on the first report we find (in order of PubMed, PMC, then PMC
          Manuscripts).

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **pmid-doi**: ``UNIQUE(pmid, doi)``
        - **pmid-pmcid**: ``UNIQUE(pmid, pmcid)``
        - **pmcid-doi**: ``UNIQUE(pmcid, doi)``

        **Lookup Columns**

        Some columns are hard to look up when they are in their native string
        format, so they are processed and broken down into integer parts, as
        far as possible.

        - **pmid_num** ``integer``: the int-ified pmid, faster for lookup.
        - **pmcid_num** ``integer``: the int-portion of the PMCID, so
          "PMC12345" would here be 12345.
        - **pmcid_version** ``integer``: although rarely used, occasionally a
          PMC ID will have a version, indicated by a dot, e.g. PMC12345.3, in
          which case the "3" would be stored in this column.
        - **doi_ns** ``integer``: The DOI system works by assigning
          organizations (such as a journal) namespace IDs, and that organization
          is then responsible for maintaining a unique ID system internally.
          These namespaces are always numbers, and are stored here as such.
        - **doi_id** ``varchar``: The custom ID given by the publishing
          organization.
        """
        class TextRef(self.base, IndraDBTable, IndraDBRefTable):
            """The class representing the TextRef table.

            The preferred constructor for this class is the ``new`` classmethod.
            This is because many of the IDs are processed in addition to the
            standard SQLAlchemy handling.

            To modify a column value, please use the ``update`` method, again to
            ensure that IDs are properly processed.

            The best way to construct a clause with a :py:class:`TextRef` is to
            use the ``has_ref`` method, inherited from
            :py:class:`IndraDBRefTable`, or else one of ``pmid_in``,
            ``pmcid_in``, or ``doi_in`` for those specific IDs.

            A dictionary of references can be easily generated using the
            ``get_ref_dict`` method.
            """
            __tablename__ = 'text_ref'
            _ref_cols = ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id']
            _always_disp = ['id', 'pmid', 'pmcid']
            _indices = [StringIndex('text_ref_pmid_idx', 'pmid'),
                        StringIndex('text_ref_pmcid_idx', 'pmcid'),
                        BtreeIndex('text_ref_pmid_num_idx', 'pmid_num'),
                        BtreeIndex('text_ref_pmcid_num_idx', 'pmcid_num'),
                        BtreeIndex('text_ref_doi_ns_idx', 'doi_ns'),
                        BtreeIndex('text_ref_doi_id_idx', 'doi_id'),
                        BtreeIndex('text_ref_pub_year', 'pub_year'),
                        StringIndex('text_ref_doi_idx', 'doi')]

            id = Column(Integer, primary_key=True)
            pmid = Column(String(20))
            pmid_num = Column(Integer)
            pmcid = Column(String(20))
            pmcid_num = Column(Integer)
            pmcid_version = Column(Integer)
            doi = Column(String(100))
            doi_ns = Column(Integer)
            doi_id = Column(String)
            pub_year = Column(Integer)
            pii = Column(String(250))
            url = Column(String, unique=True)
            manuscript_id = Column(String(100), unique=True)
            create_date = Column(DateTime, default=func.now())
            last_updated = Column(DateTime, onupdate=func.now())

            __table_args__ = (
                UniqueConstraint('pmid', 'doi', name='pmid-doi'),
                UniqueConstraint('pmid', 'pmcid', name='pmid-pmcid'),
                UniqueConstraint('pmcid', 'doi', name='pmcid-doi')
            )

            def __repr__(self):
                terms = [f'id={self.id}']
                for col in ['pmid', 'pmcid', 'doi', 'pii', 'url', 'manuscript_id']:
                    if getattr(self, col) is not None:
                        terms.append(f'{col}={getattr(self, col)}')
                    if len(terms) > 2:
                        break
                return f'{self.__class__.__name__}({", ".join(terms)})'

            @classmethod
            def new(cls, pmid=None, pmcid=None, doi=None, pii=None, url=None,
                    manuscript_id=None):
                """The preferred TextRef constructor: create a new TextRef."""
                pmid, pmid_num = cls.process_pmid(pmid)
                pmcid, pmcid_num, pmcid_version = cls.process_pmcid(pmcid)
                doi, doi_ns, doi_id = cls.process_doi(doi)
                return cls(pmid=pmid, pmid_num=pmid_num, pmcid=pmcid,
                           pmcid_num=pmcid_num, pmcid_version=pmcid_version,
                           doi=doi, doi_ns=doi_ns, doi_id=doi_id, pii=pii,
                           url=url, manuscript_id=manuscript_id)

            def update(self, **updates):
                """Update the value of an ID, processing it as necessary."""
                for id_type, id_val in updates.items():
                    if not hasattr(self, id_type):
                        raise ValueError(f"Invalid ID type: {id_type}")
                    if id_type == 'pmid':
                        self.pmid, self.pmid_num = self.process_pmid(id_val)
                    elif id_type == 'pmcid':
                        self.pmcid, self.pmcid_num, self.pmcid_version = \
                            self.process_pmcid(id_val)
                    elif id_type == 'doi':
                        self.doi, self.doi_ns, self.doi_id = \
                            self.process_doi(id_val)
                    else:
                        setattr(self, id_type, id_val)
                return
        return TextRef

[docs]    def mesh_ref_annotations(self):
        """Represent the MeSH annotations of papers provided by PubMed.

        Each abstract/entry in PubMed is accompanied by human-curated
        MeSH IDs indicating the topics of the paper. Each paper will have many
        IDs in general, so a separate table is used, liked to the
        :func:`text_ref <text_ref>` table by an un-constrained PMID. This make
        insertion of the data easier because the custom TRIDs need not be
        retrieved to dump the mesh refs.

        **Size**: large

        **Columns**

        - **id** ``integer PRIMARY KEY``: The primary database-assigned ID of
          the row.
        - **pmid_num** ``integer NOT NULL``: The int-ified pmid that is used
          to link entries in this table with those in the
          :func:`text_ref <text_ref>` table.
        - **mesh_num** ```integer NOT NULL``: The intified MeSH ID (with the
          prefix removed). The **is_concept** column indicates whether the
          prefix was D (False) or C (True).
        - **qual_num** ``integer``: The qualifier number that is sometimes
          included with the annotation (Prefix Q).
        - **major_topic** ``boolean DEFAULT false``: The major topic flag
          indicates whether the ID describes a primary purpose of the paper.
        - **is_concept** ``boolean DEFAUL false``: Indicate whether the prefix
          was C (true) or D (false).

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **mesh-uniqueness**: ``UNIQUE(pmid_num, mesh_num, qual_num,
          is_concept)``
        """
        class MeshRefAnnotations(self.base, IndraDBTable):
            __tablename__ = 'mesh_ref_annotations'
            _always_disp = ['pmid_num', 'mesh_num', 'qual_num']
            _indices = [BtreeIndex('mesh_ref_annotations_pmid_idx', 'pmid_num'),
                        BtreeIndex('mesh_ref_annotations_mesh_id_idx',
                                   'mesh_num'),
                        BtreeIndex('mesh_ref_annotations_qual_id_idx',
                                   'qual_num')]
            id = Column(Integer, primary_key=True)
            pmid_num = Column(Integer, nullable=False)
            mesh_num = Column(Integer, nullable=False)
            qual_num = Column(Integer)
            major_topic = Column(Boolean, default=False)
            is_concept = Column(Boolean, default=False)
            __table_args__ = (
                UniqueConstraint('pmid_num', 'mesh_num', 'qual_num',
                                 'is_concept', name='mesh-uniqueness'),
            )
        return MeshRefAnnotations

[docs]    def mti_ref_annotaions_test(self):
        """Represent the MeSH annotations of abstracts as inferred by MTI.

        MTI is a machine learned model that attempts to predict MeSH annotations
        on new un-annotated abstracts after training on the existing
        annotations.

        **Size**: medium

        **Columns**

        - **id** ``integer PRIMARY KEY``: The primary database-assigned ID of
          the row.
        - **pmid_num** ``integer NOT NULL``: The int-ified pmid that is used
          to link entries in this table with those in the
          :func:`text_ref <text_ref>` table.
        - **mesh_num** ```integer NOT NULL``: The intified MeSH ID (with the
          prefix removed). The **is_concept** column indicates whether the
          prefix was D (False) or C (True).
        - **qual_num** ``integer``: The qualifier number that is sometimes
          included with the annotation (Prefix Q).
        - **major_topic** ``boolean DEFAULT false``: The major topic flag
          indicates whether the ID describes a primary purpose of the paper.
        - **is_concept** ``boolean DEFAUL false``: Indicate whether the prefix
          was C (true) or D (false).

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **mesh-uniqueness**: ``UNIQUE(pmid_num, mesh_num, qual_num,
          is_concept)``
        """
        class MtiRefAnnotationsTest(self.base, IndraDBTable):
            __tablename__ = 'mti_ref_annotations_test'
            _always_disp = ['pmid_num', 'mesh_num', 'qual_num']
            _indices = [BtreeIndex('mti_ref_annotations_test_pmid_idx', 'pmid_num'),
                        BtreeIndex('mti_ref_annotations_test_mesh_id_idx', 'mesh_num'),
                        BtreeIndex('mti_ref_annotations_test_qual_id_idx', 'qual_num')]
            id = Column(Integer, primary_key=True)
            pmid_num = Column(Integer, nullable=False)
            mesh_num = Column(Integer, nullable=False)
            qual_num = Column(Integer)
            major_topic = Column(Boolean, default=False)
            is_concept = Column(Boolean, default=False)
            __table_args__ = (
                UniqueConstraint('pmid_num', 'mesh_num', 'qual_num', 'is_concept',
                                    name='mti-uniqueness'),
            )
        return MtiRefAnnotationsTest

[docs]    def text_content(self):
        """Represent the content of a text retrieved from a particular source.

        For each paper as a logical entity, there are many places where you can
        acquire the actual article or parts of it. For example you can get an
        abstract from PubMed for most content, and for a minority subset you can
        get full text from PubMed Central, either their Open-Access corpus or
        their author's Manuscripts.

        Both the text itself and the metadata for the source of the text are
        represented in this table.

        **Size**: large

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: The auto-generated primary key of
          the table. These are elsewhere called Text Content IDs, or TCIDs.
        - **text_ref_id** ``integer NOT NULL``: A foreign-key constrained
          reference to the appropriate entry in the :func:`text_ref <text_ref>`
          table.
        - **source** ``varchar(250) NOT NULL``: The name of the source, e.g.
          "pubmed" or "pmc_oa". The list of content names can be found in the
          class attributes in content managers.
        - **format** ``varchar(250) NOT NULL``: The file format of the
          content, e.g. "XML" or "TEXT".
        - **text_type** ``varchar(250) NOT NULL``: The type of the text, e.g.
          "abstract" of "fulltext".
        - **preprint** ``boolean``: Indicate whether the content is from
          a preprint.
        - **license** [``varchar``]: Record the license that applies to the
          content.
        - **content** ``bytea``: The raw compressed bytes of the content.

        **Metadata Columns**

        - **insert_data** ``timestamp without time zone``: The date the record
          was added.
        - **last_updated** ``timestamp without time zone``: The most recent
          time the record was edited.

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **content-uniqueness**: ``UNIQUE(text_ref_id, source, format,
          text_type)``
        """
        class TextContent(self.base, IndraDBTable):
            __tablename__ = 'text_content'
            _skip_disp = ['content']
            _always_disp = ['id', 'text_ref_id', 'source', 'format', 'text_type']
            id = Column(Integer, primary_key=True)
            text_ref_id = Column(Integer, ForeignKey('text_ref.id'),
                                 nullable=False)
            text_ref = relationship(self.table_dict['text_ref'])
            source = Column(String(250), nullable=False)
            format = Column(String(250), nullable=False)
            text_type = Column(String(250), nullable=False)
            content = Column(BYTEA)
            insert_date = Column(DateTime, default=func.now())
            last_updated = Column(DateTime, onupdate=func.now())
            preprint = Column(Boolean)
            license = Column(String)
            __table_args__ = (
                UniqueConstraint('text_ref_id', 'source', 'format',
                                 'text_type', name='content-uniqueness'),
            )
        return TextContent

[docs]    def reading(self):
        """Represent a reading of a piece of text.

        We have multiple readers and of course many thousands of pieces of text
        content. Each entry in this table applies to a given reader applied to
        a given pieces of content.

        As such, the primary ID is a hash constructed from the text content ID
        prepended with integers that are assigned to each reader-reader version
        pair. The function
        :func:`generate_reading_id <indra_db.reading.read_db.generate_reading_id>`
        implements the particular process used. The reader numbers are assigned
        in the :data:`readers <indra_db.databases.readers>` global, and the
        reader version number is the index of the version listed for the given
        reader in the :data:`reader_versions <indra_db.databases.reader_versions>`
        dictionary in the same module.

        **Size**: very large

        **Basic Columns**

        - **id** ``bigint PRIMARY KEY``: A hash ID constructed from a reader
          number, reader version number, and the text content ID of the content
          that was read.
        - **text_content_id** ``integer NOT NULL``: A foreign-key constrained
          reference to the appropriate entry in the :func:`text_content
          <text_content>` table.
        - **batch_id** ``integer NOT NULL``: A simple random integer (not
          unique) that is assigned each batch of inserted readings. It is used
          in the moments after the insert to easily retrieve the content that
          was just added, potentially plus some extra.
        - **reader** ``varchar(20) NOT NULL``: The name of the reader, e.g.
          "REACH" or "SPARSER".
        - **reader_version** ``varchar(20) NOT NULL``: The version of the
          reader, which may be any arbitrary string in principle. This allows
          each reader to define its own versioning scheme.
        - **format** ``varchar(20) NOT NULL``: The file format of the
          reading result, e.g. "XML" or "JSON".
        - **bytes** ``bytea``: The raw compressed bytes of the reading result.

        **Metadata Columns**

        - **create_date** ``timestamp without time zone``: The date the record
          was added.
        - **last_updated** ``timestamp without time zone``: The most recent
          time the record was edited.

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **reading-uniqeness**: ``UNIQUE(text_content_id, reader,
          reader_version)``
        """
        class Reading(self.base, IndraDBTable):
            __tablename__ = 'reading'
            _skip_disp = ['bytes']
            _always_disp = ['id', 'text_content_id', 'reader', 'reader_version']
            id = Column(BigInteger, primary_key=True, default=None)
            text_content_id = Column(Integer,
                                     ForeignKey('text_content.id'),
                                     nullable=False)
            batch_id = Column(Integer, nullable=False)
            text_content = relationship(self.table_dict['text_content'])
            reader = Column(String(20), nullable=False)
            reader_version = Column(String(20), nullable=False)
            format = Column(String(20), nullable=False)  # xml, json, etc.
            bytes = Column(BYTEA)
            create_date = Column(DateTime, default=func.now())
            last_updated = Column(DateTime, onupdate=func.now())
            __table_args__ = (
                UniqueConstraint('text_content_id', 'reader', 'reader_version',
                                 name='reading-uniqueness'),
            )
        return Reading

[docs]    def db_info(self):
        """Represent the provenance and metadata for an external knowledge base.

        INDRA DB takes content not just from our own readings but also merges
        that with many pre-existing knowledge bases, many of them human
        curated. These knowledge bases are defined and managed by classes
        contained in :mod:`knowledgebase_manager
        <indra_db.managers.knowledgebase_manager>`.

        No real data is contained in this column, simply records of which
        knowledge bases have been added, updated, and when.

        **Size**: very small

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: A database-assigned integer unique
          ID for each database entry. These are elsewhere referred to as
          `db_info_ids` or `dbids`.
        - **db_name** ``varchar NOT NULL``: A short lowercase string that is
          used internally to identify the knowledge base, e.g. "pc" for Pathway
          Commons.
        - **db_full_name** ``varchar NOT NULL``: The full name of the
          knowledge base, neatly formatted, e.g. "Pathway Commons".
        - **source_api** ``varchar NOT NULL``: The indra source API that was
          used to extract Statements from the knowledge base, e.g. "biopax".

        **Metadata Columns**

        - **create_date** ``timestamp without time zone``: The date the record
          was added.
        - **last_updated** ``timestamp without time zone``: The most recent
          time the record was edited.
        """
        class DBInfo(self.base, IndraDBTable):
            __tablename__ = 'db_info'
            _always_disp = ['id', 'db_name', 'source_api']
            id = Column(Integer, primary_key=True)
            db_name = Column(String, nullable=False)
            db_full_name = Column(String, nullable=False)
            source_api = Column(String, nullable=False)
            create_date = Column(DateTime, default=func.now())
            last_updated = Column(DateTime, onupdate=func.now())
        return DBInfo

[docs]    def raw_statements(self):
        """Represent Statements exactly as extracted by their source apis.

        INDRA Defines several source APIs for different file types from which we
        can extract INDRA Statements. The goal of these APIs is primarily to
        accurately convey the contents of the files, and minimal fixes are made
        at this stage (e.g. grounding is saved for preassembly).

        Thus this table contains statements that are considered "messy" in two
        key ways:

        - they have a lot of repetition of information, and
        - they have whatever grounding the original source gave them.

        However these Statements also have the Evidence object JSON contained in
        their **json** column, and this Evidence information is **NOT** copied
        into the :func:`pa_statements <pa_statements>` table, which allows for
        a flexible incremental updates. A "lateral join" on this table can be
        used to get the first N evidence associated with each PA Statement.

        **Size**: very large

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: A database-assigned integer unique
          ID for each database entry. These are elsewhere referred to as
          "Statement ID"s, or "sid"s.
        - **uuid** ``varchar UNIQUE NOT NULL``: A UUID generated when a
          Statement object is first created. This can be used for tracking
          particular objects through the code.
        - **batch_id** ``integer NOT NULL``: A simple random integer (not
          unique) that is assigned each batch of inserted Statements. It is used
          in the moments after the insert to easily retrieve the content that
          was just added, potentially plus some extra.
        - **mk_hash** ``bigint NOT NULL``: A hash of the ``matches_key`` of a
          Statement. This should be unique for any statement containing the same
          information.
        - **text_hash** ``bigint``: A hash of a the evidence text, used to
          detect exact duplicate Statements (same information from the same
          exact source, right down to the text) that sometimes occur due to bugs
        - **source_hash** ``bigint NOT NULL``: A hash of the source
          information.
        - **db_info_id** ``integer``: A foreign key into the
          :func:`db_info <db_info>` table, for those statements that come from
          knowledge bases.
        - **reading_id** ``bigint``: A foreign key into the
          :func:`reading <reading>` table, for those statements that come from a
          reading.
        - **type** ``varchar(100) NOT NULL``: The type of the Statement, e.g.
          "Phosphorylation".
        - **indra_version** ``varchar(100) NOT NULL``: The version of INDRA
          that was used to generate this Statement, specifically as returned by
          ``indra.util.get_version.get_version()``.
        - **json** ``bytea NOT NULL``: The bytes of the Statement JSON
          (including exactly **one** Evidence JSON)

        **Metadata Columns**

        - **create_date** ``timestamp without time zone``: The date the
          Statement was added.

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **reading_raw_statement_uniqueness**: ``UNIQUE(mk_hash, text_hash,
          reading_id)``
        - **db_info_raw_statement_uniqueness**: ``UNIQUE(mk_hash, source_hash,
          db_info_id)``
        """
        class RawStatements(self.base, IndraDBTable):
            __tablename__ = 'raw_statements'
            _skip_disp = ['json']
            _always_disp = ['id', 'db_info_id', 'reading_id', 'type']
            id = Column(Integer, primary_key=True)
            uuid = Column(String(40), unique=True, nullable=False)
            batch_id = Column(Integer, nullable=False)
            mk_hash = Column(BigInteger, nullable=False)
            text_hash = Column(BigInteger)
            source_hash = Column(BigInteger, nullable=False)
            db_info_id = Column(Integer, ForeignKey('db_info.id'))
            db_info = relationship(self.table_dict['db_info'])
            reading_id = Column(BigInteger, ForeignKey('reading.id'))
            reading = relationship(self.table_dict['reading'])
            type = Column(String(100), nullable=False)
            indra_version = Column(String(100), nullable=False)
            json = Column(BYTEA, nullable=False)
            create_date = Column(DateTime, default=func.now())
            __table_args__ = (
                UniqueConstraint('mk_hash', 'text_hash', 'reading_id',
                                 name='reading_raw_statement_uniqueness'),
                UniqueConstraint('mk_hash', 'source_hash', 'db_info_id',
                                 name='db_info_raw_statement_uniqueness'),
            )
        return RawStatements

[docs]    def raw_activity(self):
        """Represent the activity of a raw statement (an ActiveForm)."""
        class RawActivity(self.base, IndraDBTable):
            __tablename__ = 'raw_activity'
            _always_disp = ['stmt_id', 'activity', 'is_active']
            id = Column(Integer, primary_key=True)
            stmt_id = Column(Integer,
                             ForeignKey('raw_statements.id'),
                             nullable=False)
            statements = relationship(self.table_dict['raw_statements'])
            activity = Column(String)
            is_active = Column(Boolean)
        return RawActivity

[docs]    def raw_agents(self):
        """Represent an identifier for an agent of a raw statement."""
        class RawAgents(self.base, IndraDBTable):
            __tablename__ = 'raw_agents'
            _always_disp = ['stmt_id', 'db_name', 'db_id', 'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_id = Column(Integer,
                             ForeignKey('raw_statements.id'),
                             nullable=False)
            statements = relationship(self.table_dict['raw_statements'])
            db_name = Column(String(40), nullable=False)
            db_id = Column(String, nullable=False)
            ag_num = Column(Integer, nullable=False)
            role = Column(String(20), nullable=False)
        return RawAgents

[docs]    def raw_mods(self):
        """Represent a modification of an agent of a raw statement."""
        class RawMods(self.base, IndraDBTable):
            __tablename__ = 'raw_mods'
            _always_disp = ['stmt_id', 'type', 'position', 'residue', 'modified',
                            'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_id = Column(Integer, ForeignKey('raw_statements.id'),
                             nullable=False)
            statements = relationship(self.table_dict['raw_statements'])
            type = Column(String, nullable=False)
            position = Column(String(10))
            residue = Column(String(5))
            modified = Column(Boolean)
            ag_num = Column(Integer, nullable=False)
        return RawMods

[docs]    def raw_muts(self):
        """Represent a mutation of an agent of a raw statement."""
        class RawMuts(self.base, IndraDBTable):
            __tablename__ = 'raw_muts'
            _always_disp = ['stmt_id', 'position', 'residue_from', 'residue_to',
                            'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_id = Column(Integer, ForeignKey('raw_statements.id'),
                             nullable=False)
            statements = relationship(self.table_dict['raw_statements'])
            position = Column(String(10))
            residue_from = Column(String(5))
            residue_to = Column(String(5))
            ag_num = Column(Integer, nullable=False)
        return RawMuts

[docs]    def raw_unique_links(self):
        """Represent links between raw statements and preassembled statements.

        Each preassembled statement is constructed from multiple raw statements,
        in general. This maps each :func:`pa_statement <pa_statements>` to the
        :func:`raw statements <raw_statements>` that were merged to form it. It
        is through this table that evidence can be gathered for pa_statements.

        The astute reader may note that the
        :func:`raw_statements <raw_statements>`-to
        -:func:`pa_statement <pa_statements>` relationship is many-to-one, which
        can be represented simply using a foreign-key in the "many" table, in
        this case :func:`raw_statements <raw_statements>`. This is not done
        because the :func:`pa_statement <pa_statements>` does not, in general,
        exist when the :func:`raw_statement <raw_statements>` is added to the
        database.

        Constructed as it is, these links can be copied in bulk during
        preassembly, as opposed to having to modify as many as a million entries
        with a newly created foreign-key map.

        **Size**: large

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: A database-assigned integer unique
          ID for each database entry.
        - **raw_stmt_id** ``integer NOT NULL REFERENCES raw_statements(id)``:
          The Raw Statement ID foreign key to the :func:`raw_statements
          <raw_statements>` table.
        - **pa_stmt_mk_hash** ``bigint NOT NULL REFERENCES
          pa_statements(mk_hash)``: The PA Statement matches-key hash foreign
          key to the :func:`pa_statements <pa_statements>` table.

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **stmt-link-uniqueness**: ``UNIQUE(raw_stmt_id, pa_stmt_mk_hash)``
        """
        class RawUniqueLinks(self.base, IndraDBTable):
            __tablename__ = 'raw_unique_links'
            _always_disp = ['raw_stmt_id', 'pa_stmt_mk_hash']
            _indices = [BtreeIndex('raw_unique_links_raw_stmt_id_idx',
                                   'raw_stmt_id'),
                        BtreeIndex('raw_unique_links_pa_stmt_mk_hash_idx',
                                   'pa_stmt_mk_hash')]
            id = Column(Integer, primary_key=True)
            raw_stmt_id = Column(Integer, ForeignKey('raw_statements.id'),
                                 nullable=False)
            pa_stmt_mk_hash = Column(BigInteger,
                                     ForeignKey('pa_statements.mk_hash'),
                                     nullable=False)
            __table_args = (
                UniqueConstraint('raw_stmt_id', 'pa_stmt_mk_hash',
                                 name='stmt-link-uniqueness'),
            )
        return RawUniqueLinks

[docs]    def pa_statements(self):
        """Represent preassembled statements.

         Preassmebled Statements are generated from Raw Statements using INDRA's
         preassembly tools. Specifically:

         - agents are grounded,
         - agent groundings are disambiguated (using adeft),
         - sites are fixed (using protmapper),
         - and finally, repeated information is consolidated, for example
           ``Phosphorylation(MEK(), ERK())`` is represented only once in this
           corpus, with links to the many instances that information was
           extracted, which are stored in the :func:`raw_statements
           <raw_statements>` table.

        Each entry is linked back to the (in general multiple) raw statements
        it was derived from in the :func:`raw_unique_links <raw_unique_links>`
        table.

        **Size**: medium large

        **Basic Columns**

        - **mk_hash** ``bigint PRIMARY KEY``: a hash of the statement matches
          key, which is unique for the _knowledge_ of the Statement.
        - **matches_key** ``varchar NOT NULL``: The matches-key that was hashed.
        - **uuid** ``varchar UNIQUE NOT NULL``: A UUID generated when a
          Statement object is first created. This can be used for tracking
          particular objects through the code. The UUID is distinct from any
          of the raw statement UUIDs that compose this Statement.
        - **type** ``varchar(100) NOT NULL``: The type of the Statement, e.g.
          "Phosphorylation".
        - **indra_version** ``varchar(100) NOT NULL``: The version of INDRA
          that was used to generate this Statement, specifically as returned by
          ``indra.util.get_version.get_version()``.
        - **json** ``bytea NOT NULL``: The bytes of the Statement JSON
          (including exactly **one** Evidence JSON)

        **Metadata Columns**

        - **create_date** ``timestamp without time zone``: The date the
          Statement was added.
        """
        class PAStatements(self.base, IndraDBTable):
            __tablename__ = 'pa_statements'
            _skip_disp = ['json']
            _always_disp = ['mk_hash', 'type']
            _default_order_by = 'create_date'

            mk_hash = Column(BigInteger, primary_key=True)
            matches_key = Column(String, nullable=False)
            uuid = Column(String(40), unique=True, nullable=False)
            type = Column(String(100), nullable=False)
            indra_version = Column(String(100), nullable=False)
            json = Column(BYTEA, nullable=False)
            create_date = Column(DateTime, default=func.now())
        return PAStatements

[docs]    def pa_support_links(self):
        """Represent the links of support calculated during preassembly.

        In INDRA, we look for cases where more specific Statements may lend
        support to more general Statements, and potentially vice versa, to
        better gauge whether an extraction is reliable.

        **Size**: large

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: A database-assigned integer unique
          ID for each database entry.
        - **supporting_mk_hash** ``bigint NOT NULL REFERENCES
          pa_statements(mk_hash)``: A foreign key to the PA Statement that is
          giving the support (that is, the more specific Statement).
        - **supported_mk_hash** ``bigint NOT NULL REFERENCES
          pa_statements(mk_hash)``: A foreign key to the PA Statement that is
          given the support (that is, the more generic Statement).

        **Constraints**

        Postgres is extremely efficient at detecting conflicts, and we use this
        to help ensure our entries do not have any duplicates.

        - **pa_support_links_link_uniqueness**: ``UNIQUE(supporting_mk_hash,
          supported_mk_hash)``
        """
        class PASupportLinks(self.base, IndraDBTable):
            __tablename__ = 'pa_support_links'
            _always_disp = ['supporting_mk_hash', 'supported_mk_hash']
            id = Column(Integer, primary_key=True)
            supporting_mk_hash = Column(BigInteger,
                                        ForeignKey('pa_statements.mk_hash'),
                                        nullable=False)
            supported_mk_hash = Column(BigInteger,
                                       ForeignKey('pa_statements.mk_hash'),
                                       nullable=False)
            __table_args__ = (
                UniqueConstraint('supporting_mk_hash', 'supported_mk_hash',
                                 name='pa_support_links_link_uniqueness'),
            )
        return PASupportLinks

[docs]    def pa_activity(self):
        """Represent the activity of a preassembled Statement."""
        class PAActivity(self.base, IndraDBTable):
            __tablename__ = 'pa_activity'
            __always_disp__ = ['stmt_mk_hash', 'activity', 'is_active']
            id = Column(Integer, primary_key=True)
            stmt_mk_hash = Column(BigInteger,
                                  ForeignKey('pa_statements.mk_hash'),
                                  nullable=False)
            statements = relationship(self.table_dict['pa_statements'])
            activity = Column(String)
            is_active = Column(Boolean)
        return PAActivity

[docs]    def pa_agents(self):
        """Represent an identifier for an agent of a preassembled statement."""
        class PAAgents(self.base, IndraDBTable):
            __tablename__ = 'pa_agents'
            _always_disp = ['stmt_mk_hash', 'db_name', 'db_id', 'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_mk_hash = Column(BigInteger,
                                  ForeignKey('pa_statements.mk_hash'),
                                  nullable=False)
            statements = relationship(self.table_dict['pa_statements'])
            db_name = Column(String(40), nullable=False)
            db_id = Column(String, nullable=False)
            role = Column(String(20), nullable=False)
            ag_num = Column(Integer, nullable=False)
            agent_ref_hash = Column(BigInteger, unique=True, nullable=False)
        return PAAgents

[docs]    def pa_mods(self):
        """Represent a modification of an agent of a preassembled statement."""
        class PAMods(self.base, IndraDBTable):
            __tablename__ = 'pa_mods'
            _always_disp = ['stmt_mk_hash', 'type', 'position', 'residue',
                            'modified', 'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_mk_hash = Column(BigInteger,
                                  ForeignKey('pa_statements.mk_hash'),
                                  nullable=False)
            statements = relationship(self.table_dict['pa_statements'])
            type = Column(String, nullable=False)
            position = Column(String(10))
            residue = Column(String(5))
            modified = Column(Boolean)
            ag_num = Column(Integer, nullable=False)
        return PAMods

[docs]    def pa_muts(self):
        """Represent a mutation of an agent of a preassembled statement."""
        class PAMuts(self.base, IndraDBTable):
            __tablename__ = 'pa_muts'
            _always_disp = ['stmt_mk_hash', 'position', 'residue_from',
                            'residue_to', 'ag_num']
            id = Column(Integer, primary_key=True)
            stmt_mk_hash = Column(BigInteger,
                                  ForeignKey('pa_statements.mk_hash'),
                                  nullable=False)
            statements = relationship(self.table_dict['pa_statements'])
            position = Column(String(10))
            residue_from = Column(String(5))
            residue_to = Column(String(5))
            ag_num = Column(Integer, nullable=False)
        return PAMuts

[docs]    def curations(self):
        """Represent the curations of our content.

        At various points in our APIs and UIs it is possible to curate the
        content we have extracted, recording whether it is an accurate
        extraction from the source text, and if not the reason why.

        **Size**: small

        **Basic Columns**

        - **id** ``integer PRIMARY KEY``: A database-assigned integer unique
          ID for each database entry.
        - **pa_hash** ``bigint REFERENCES pa_statements(mk_hash)``: A reference
          into the :func:`pa_statements <pa_statements>` table to the the
          pa statement whose evidence was curated.
        - **source_hash** ``bigint``: A hash that represents the source of this
          Statement (e.g. reader and piece of content).
        - **tag** ``varchar``: A text code indicating the type of error curated.
          The domain of these strings is regulated in code elsewhere.
        - **text** ``varchar``: A free-form text description by the curator of
          what they think went wrong (or right).
        - **curator** ``varchar NOT NULL``: The identity of the curator. This
          has elsewhere been standardized to be their email.
        - **auth_id** ``varchar``: [deprecated]
        - **source** ``varchar``: A string indicating where this curation
          originated, e.g. "DB REST API" for the INDRA Database REST service.
        - **ip** ``inet``: The IP address from which the curation was submitted.
        - **date** ``timestamp without time zone``: The date the curation was
          added.
        - **pa_json** ``jsonb``: the preassembled Statement JSON that was
          curated.
        - **ev_json** ``jsonb``: the Evidence JSON that was curated (including
          the text).
        """
        class Curation(self.base, IndraDBTable):
            __tablename__ = 'curation'
            _always_disp = ['pa_hash', 'source_hash', 'tag', 'curator', 'date']
            id = Column(Integer, primary_key=True)
            pa_hash = Column(BigInteger, ForeignKey('pa_statements.mk_hash'))
            pa_statements = relationship(self.table_dict['pa_statements'])
            source_hash = Column(BigInteger)
            tag = Column(String)
            text = Column(String)
            curator = Column(String, nullable=False)
            auth_id = Column(Integer)
            source = Column(String)
            ip = Column(INET)
            date = Column(DateTime, default=func.now())
            pa_json = Column(JSONB)
            ev_json = Column(JSONB)

            def to_json(self):
                return {attr: getattr(self, attr)
                        for attr in ['id', 'pa_hash', 'source_hash', 'tag', 'text',
                                     'date', 'curator', 'source', 'pa_json',
                                     'ev_json']}
        return Curation

[docs]    def source_file(self):
        """Record the pubmed source file that was processed."""
        class SourceFile(self.base, IndraDBTable):
            __tablename__ = 'source_file'
            _always_disp = ['source', 'name']
            id = Column(Integer, primary_key=True)
            source = Column(String(250), nullable=False)
            name = Column(String(250), nullable=False)
            load_date = Column(DateTime, default=func.now())
            __table_args__ = (
                UniqueConstraint('source', 'name', name='source-name'),
            )
        return SourceFile

[docs]    def updates(self):
        """Record when text ref and content updates were performed."""
        class Updates(self.base, IndraDBTable):
            __tablename__ = 'updates'
            _skip_disp = ['unresolved_conflicts_file']
            _always_disp = ['source', 'datetime']
            id = Column(Integer, primary_key=True)
            init_upload = Column(Boolean, nullable=False)
            source = Column(String(250), nullable=False)
            unresolved_conflicts_file = Column(BYTEA)
            datetime = Column(DateTime, default=func.now())
        return Updates

[docs]    def reading_updates(self):
        """Record runs of the readers on the content we have found."""
        class ReadingUpdates(self.base, IndraDBTable):
            __tablename__ = 'reading_updates'
            _always_disp = ['reader', 'reader_version', 'run_datetime']
            id = Column(Integer, primary_key=True)
            complete_read = Column(Boolean, nullable=False)
            reader = Column(String(250), nullable=False)
            reader_version = Column(String(250), nullable=False)
            run_datetime = Column(DateTime, default=func.now())
            earliest_datetime = Column(DateTime)
            latest_datetime = Column(DateTime, nullable=False)
        return ReadingUpdates

[docs]    def xdd_updates(self):
        """Record the times we process dumps from xDD."""
        class XddUpdates(self.base, IndraDBTable):
            __tablename__ = 'xdd_updates'
            _always_disp = ['day_str']
            id = Column(Integer, primary_key=True)
            reader_versions = Column(JSONB)
            indra_version = Column(String)
            day_str = Column(String, nullable=False, unique=True)
            processed_date = Column(DateTime, default=func.now())
        return XddUpdates

[docs]    def rejected_statements(self):
        """Represent raw statements that were rejected."""
        class RejectedStatements(self.base, IndraDBTable):
            __tablename__ = 'rejected_statements'
            _skip_disp = ['json']
            _always_disp = ['id', 'db_info_id', 'reading_id', 'type']
            id = Column(Integer, primary_key=True)
            uuid = Column(String(40), unique=True, nullable=False)
            batch_id = Column(Integer, nullable=False)
            mk_hash = Column(BigInteger, nullable=False)
            text_hash = Column(BigInteger)
            source_hash = Column(BigInteger, nullable=False)
            db_info_id = Column(Integer, ForeignKey('db_info.id'))
            db_info = relationship(self.table_dict['db_info'])
            reading_id = Column(BigInteger, ForeignKey('reading.id'))
            reading = relationship(self.table_dict['reading'])
            type = Column(String(100), nullable=False)
            indra_version = Column(String(100), nullable=False)
            json = Column(BYTEA, nullable=False)
            create_date = Column(DateTime, default=func.now())
        return RejectedStatements

[docs]    def discarded_statements(self):
        """Record the reasons for which some statements were discarded."""
        class DiscardedStatements(self.base, IndraDBTable):
            __tablename__ = 'discarded_statements'
            _always_disp = ['stmt_id', 'reason']
            id = Column(Integer, primary_key=True)
            stmt_id = Column(Integer, ForeignKey('raw_statements.id'),
                             nullable=False)
            reason = Column(String, nullable=False)
            insert_date = Column(DateTime, default=func.now())
        return DiscardedStatements

[docs]    def preassembly_updates(self):
        """Record updates of the preassembled corpus."""
        class PreassemblyUpdates(self.base, IndraDBTable):
            __tablename__ = 'preassembly_updates'
            _always_disp = ['corpus_init', 'run_datetime']
            id = Column(Integer, primary_key=True)
            corpus_init = Column(Boolean, nullable=False)
            run_datetime = Column(DateTime, default=func.now())
            stmt_type = Column(String)
        return PreassemblyUpdates