diff --git a/alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py b/alembic/versions/a1b2c3d4e5f6_add_alleles_and_mapping_records_tables.py similarity index 100% rename from alembic/versions/a1b2c3d4e5f6_add_vrs_allele_closure_tables.py rename to alembic/versions/a1b2c3d4e5f6_add_alleles_and_mapping_records_tables.py diff --git a/alembic/versions/a7c4e9d2f1b8_add_vep_allele_consequences.py b/alembic/versions/a7c4e9d2f1b8_add_vep_allele_consequences.py new file mode 100644 index 00000000..8b655f62 --- /dev/null +++ b/alembic/versions/a7c4e9d2f1b8_add_vep_allele_consequences.py @@ -0,0 +1,65 @@ +"""add vep_allele_consequences table + +Revision ID: a7c4e9d2f1b8 +Revises: e5f7a9c1b3d4 +Create Date: 2026-06-22 + +New valid-time table holding a deduplicated allele's VEP functional consequence, replacing the frozen +vep_functional_consequence/vep_access_date columns on mapped_variants for new-model writes (Step 2 of +the annotation infrastructure migration, docs/design/annotation-infrastructure-migration.md). A row is +live while valid_to is NULL; the partial unique index enforces a single live consequence per allele +(VEP's most-severe consequence is one current value, so a change supersedes rather than accumulates). +functional_consequence is nullable (reserved for a future negative cache). source_version is the +Ensembl release the consequence was resolved under (coordinated software + transcript set + vocabulary), +which version-keys the refresh skip like gnomAD's db_version; access_date is retained as a "last +confirmed" audit stamp. The VEP columns on mapped_variants are left untouched (frozen serving). +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "a7c4e9d2f1b8" +down_revision = "e5f7a9c1b3d4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "vep_allele_consequences", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("allele_id", sa.Integer(), nullable=False), + sa.Column("functional_consequence", sa.String(), nullable=True), + sa.Column("source_version", sa.String(), nullable=False), + sa.Column("access_date", sa.Date(), nullable=False), + sa.Column("valid_from", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint( + ["allele_id"], + ["alleles.id"], + name="fk_vep_allele_consequences_allele_id", + ondelete="RESTRICT", + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_vep_allele_consequences_allele_id", + "vep_allele_consequences", + ["allele_id"], + ) + # One live consequence per allele: VEP's most-severe consequence is a single current value, so a + # changed result supersedes the prior row rather than accumulating one live row per access. 
+ op.create_index( + "uq_vep_allele_consequences_live", + "vep_allele_consequences", + ["allele_id"], + unique=True, + postgresql_where=sa.text("valid_to IS NULL"), + ) + + +def downgrade() -> None: + op.drop_index("uq_vep_allele_consequences_live", table_name="vep_allele_consequences") + op.drop_index("ix_vep_allele_consequences_allele_id", table_name="vep_allele_consequences") + op.drop_table("vep_allele_consequences") diff --git a/alembic/versions/a7e1c4f9b3d2_add_annotation_event.py b/alembic/versions/a7e1c4f9b3d2_add_annotation_event.py new file mode 100644 index 00000000..cf8d77e3 --- /dev/null +++ b/alembic/versions/a7e1c4f9b3d2_add_annotation_event.py @@ -0,0 +1,79 @@ +"""add annotation_event log + +Revision ID: a7e1c4f9b3d2 +Revises: d4e5f6a7b8c9 +Create Date: 2026-06-25 16:00:00.000000 + +""" + +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +from alembic import op + +# revision identifiers, used by Alembic. +revision = "a7e1c4f9b3d2" +down_revision = "d4e5f6a7b8c9" +branch_labels = None +depends_on = None + +_VARIANT_SUBJECT_TYPES = "'vrs_mapping', 'cross_level_translation', 'variant_translation', 'ldh_submission'" +_ALLELE_SUBJECT_TYPES = ( + "'clingen_allele_id', 'gnomad_allele_frequency', 'vep_functional_consequence', 'clinvar_control', 'mapped_hgvs'" +) + + +def upgrade(): + op.create_table( + "annotation_event", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("annotation_type", sa.String(length=50), nullable=False), + sa.Column("variant_id", sa.Integer(), nullable=True), + sa.Column("allele_id", sa.Integer(), nullable=True), + sa.Column("disposition", sa.String(length=50), nullable=False), + sa.Column("reason", sa.String(length=50), nullable=False), + sa.Column("source_version", sa.String(length=50), nullable=True), + sa.Column("metadata", postgresql.JSONB(astext_type=sa.Text()), nullable=True), + sa.Column("job_run_id", sa.Integer(), nullable=True), + sa.Column("score_set_id", sa.Integer(), 
nullable=True), + sa.Column("created_at", sa.DateTime(timezone=True), server_default=sa.text("now()"), nullable=False), + sa.CheckConstraint( + f"(annotation_type IN ({_VARIANT_SUBJECT_TYPES}) " + "AND variant_id IS NOT NULL AND allele_id IS NULL) " + f"OR (annotation_type IN ({_ALLELE_SUBJECT_TYPES}) " + "AND allele_id IS NOT NULL AND variant_id IS NULL)", + name="ck_annotation_event_subject", + ), + sa.ForeignKeyConstraint(["variant_id"], ["variants.id"], ondelete="RESTRICT"), + sa.ForeignKeyConstraint(["allele_id"], ["alleles.id"], ondelete="RESTRICT"), + sa.ForeignKeyConstraint(["job_run_id"], ["job_runs.id"], ondelete="SET NULL"), + sa.ForeignKeyConstraint(["score_set_id"], ["scoresets.id"], ondelete="SET NULL"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_annotation_event_allele_type_id", + "annotation_event", + ["allele_id", "annotation_type", sa.text("id DESC")], + unique=False, + ) + op.create_index( + "ix_annotation_event_variant_type_id", + "annotation_event", + ["variant_id", "annotation_type", sa.text("id DESC")], + unique=False, + ) + op.create_index( + "ix_annotation_event_allele_type_version", + "annotation_event", + ["allele_id", "annotation_type", "source_version"], + unique=False, + ) + op.create_index("ix_annotation_event_job_run_id", "annotation_event", ["job_run_id"], unique=False) + + +def downgrade(): + op.drop_index("ix_annotation_event_job_run_id", table_name="annotation_event") + op.drop_index("ix_annotation_event_allele_type_version", table_name="annotation_event") + op.drop_index("ix_annotation_event_variant_type_id", table_name="annotation_event") + op.drop_index("ix_annotation_event_allele_type_id", table_name="annotation_event") + op.drop_table("annotation_event") diff --git a/alembic/versions/b2c3d4e5f6a7_rename_clinical_controls_to_clinvar_controls.py b/alembic/versions/b2c3d4e5f6a7_rename_clinical_controls_to_clinvar_controls.py new file mode 100644 index 00000000..19cca078 --- /dev/null +++ 
b/alembic/versions/b2c3d4e5f6a7_rename_clinical_controls_to_clinvar_controls.py @@ -0,0 +1,39 @@ +"""rename clinical_controls to clinvar_controls + +Revision ID: b2c3d4e5f6a7 +Revises: a7c4e9d2f1b8 +Create Date: 2026-06-22 + +Renames the clinical_controls entity table to clinvar_controls, and renames the unique +constraint to match. The frozen association table (mapped_variants_clinical_controls) +and its FK to the renamed table are left structurally intact — PostgreSQL updates the FK +target automatically on table rename. The Python model is renamed ClinicalControl → +ClinvarControl in the same changeset (no data migration). +""" + +from alembic import op + +revision = "b2c3d4e5f6a7" +down_revision = "a7c4e9d2f1b8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.rename_table("clinical_controls", "clinvar_controls") + # PostgreSQL does not auto-rename constraints on table rename; rename explicitly so the + # on_conflict_do_update(constraint=...) in the job references the correct name. 
+ op.execute( + "ALTER TABLE clinvar_controls RENAME CONSTRAINT " + "uq_clinical_controls_db_name_identifier_version " + "TO uq_clinvar_controls_db_name_identifier_version" + ) + + +def downgrade() -> None: + op.execute( + "ALTER TABLE clinvar_controls RENAME CONSTRAINT " + "uq_clinvar_controls_db_name_identifier_version " + "TO uq_clinical_controls_db_name_identifier_version" + ) + op.rename_table("clinvar_controls", "clinical_controls") diff --git a/alembic/versions/b8f2c5a1d3e4_add_v_current_annotation_events_view.py b/alembic/versions/b8f2c5a1d3e4_add_v_current_annotation_events_view.py new file mode 100644 index 00000000..73843866 --- /dev/null +++ b/alembic/versions/b8f2c5a1d3e4_add_v_current_annotation_events_view.py @@ -0,0 +1,30 @@ +"""add v_current_annotation_events view + +Revision ID: b8f2c5a1d3e4 +Revises: a7e1c4f9b3d2 +Create Date: 2026-06-26 + +Creates the v_current_annotation_events view: the latest AnnotationEvent per (subject, annotation_type), +with ClinVar partitioned additionally by source_version (multi-live, one current row per release). +Replaces the per-variant v_variant_annotations as the current-state projection over the new +allele-model annotation log. Intended for operator queries, BI, and the annotation CLI scripts. +""" + +from alembic import op + +from mavedb.db.view import CreateView, DropView +from mavedb.models.annotation_event_view import definition, signature + +# revision identifiers, used by Alembic. 
+revision = "b8f2c5a1d3e4" +down_revision = "a7e1c4f9b3d2" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.execute(CreateView(signature, definition, materialized=False)) + + +def downgrade() -> None: + op.execute(DropView(signature, materialized=False)) diff --git a/alembic/versions/c3d4e5f6a7b8_add_clinvar_allele_links.py b/alembic/versions/c3d4e5f6a7b8_add_clinvar_allele_links.py new file mode 100644 index 00000000..4a20f008 --- /dev/null +++ b/alembic/versions/c3d4e5f6a7b8_add_clinvar_allele_links.py @@ -0,0 +1,73 @@ +"""add clinvar_allele_links table + +Revision ID: c3d4e5f6a7b8 +Revises: b2c3d4e5f6a7 +Create Date: 2026-06-22 + +New valid-time link table connecting deduplicated alleles to ClinvarControl rows, replacing +the frozen mapped_variants_clinical_controls association for new-model writes. + +ClinVar's link shape is deliberately multi-live: the partial unique index is +(allele_id, clinvar_control_id) WHERE valid_to IS NULL, so an allele accumulates one live +link per ClinVar release rather than superseding as in gnomAD/VEP. Each ClinVar release is a +distinct ClinvarControl row, so different releases stack as independent live links. A link is +retired (valid_to closed) only if ClinVar later removes the variant from a release, which +would surface as a re-run finding no data for that release and retiring the corresponding +link — archival data never changes, so this path is theoretical. + +The existing mapped_variants_clinical_controls association table is left untouched (frozen +for serving existing data). 
+""" + +import sqlalchemy as sa + +from alembic import op + +revision = "c3d4e5f6a7b8" +down_revision = "b2c3d4e5f6a7" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "clinvar_allele_links", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("allele_id", sa.Integer(), nullable=False), + sa.Column("clinvar_control_id", sa.Integer(), nullable=False), + sa.Column("valid_from", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint( + ["allele_id"], + ["alleles.id"], + name="fk_clinvar_allele_links_allele_id", + ondelete="RESTRICT", + ), + sa.ForeignKeyConstraint( + ["clinvar_control_id"], + ["clinvar_controls.id"], + name="fk_clinvar_allele_links_clinvar_control_id", + ondelete="RESTRICT", + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index("ix_clinvar_allele_links_allele_id", "clinvar_allele_links", ["allele_id"]) + op.create_index("ix_clinvar_allele_links_clinvar_control_id", "clinvar_allele_links", ["clinvar_control_id"]) + # Multi-live: one live link per (allele, release). An allele accumulates one live link per + # ClinVar release rather than superseding — unlike gnomAD/VEP which enforce one live link + # per allele across all versions. Superseded rows (valid_to IS NOT NULL) are preserved for + # point-in-time queries. 
+ op.create_index( + "uq_clinvar_allele_links_live", + "clinvar_allele_links", + ["allele_id", "clinvar_control_id"], + unique=True, + postgresql_where=sa.text("valid_to IS NULL"), + ) + + +def downgrade() -> None: + op.drop_index("uq_clinvar_allele_links_live", table_name="clinvar_allele_links") + op.drop_index("ix_clinvar_allele_links_clinvar_control_id", table_name="clinvar_allele_links") + op.drop_index("ix_clinvar_allele_links_allele_id", table_name="clinvar_allele_links") + op.drop_table("clinvar_allele_links") diff --git a/alembic/versions/c9b2a4f8e1d3_index_annotation_event_score_set_id.py b/alembic/versions/c9b2a4f8e1d3_index_annotation_event_score_set_id.py new file mode 100644 index 00000000..13c20693 --- /dev/null +++ b/alembic/versions/c9b2a4f8e1d3_index_annotation_event_score_set_id.py @@ -0,0 +1,27 @@ +"""index annotation_event.score_set_id + +Revision ID: c9b2a4f8e1d3 +Revises: b8f2c5a1d3e4 +Create Date: 2026-06-30 + +Adds the missing foreign-key index on annotation_event.score_set_id. Every other FK on this table +is indexed; score_set_id was not. The index backs the ON DELETE SET NULL cascade fired when a score +set is deleted (an unindexed FK forces a sequential scan of the event log per deleted score set) and +any operator/BI query that filters the log by score set. +""" + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "c9b2a4f8e1d3" +down_revision = "b8f2c5a1d3e4" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_index("ix_annotation_event_score_set_id", "annotation_event", ["score_set_id"], unique=False) + + +def downgrade() -> None: + op.drop_index("ix_annotation_event_score_set_id", table_name="annotation_event") diff --git a/alembic/versions/d4e5f6a7b8c9_add_clinvar_variation_id.py b/alembic/versions/d4e5f6a7b8c9_add_clinvar_variation_id.py new file mode 100644 index 00000000..de5758b4 --- /dev/null +++ b/alembic/versions/d4e5f6a7b8c9_add_clinvar_variation_id.py @@ -0,0 +1,29 @@ +"""add clinvar_variation_id to clinvar_controls + +Revision ID: d4e5f6a7b8c9 +Revises: c3d4e5f6a7b8 +Create Date: 2026-06-22 + +Additive, non-breaking column for ClinVar's canonical public identifier (VariationID), captured forward +from the variant_summary TSV. db_identifier continues to hold the AlleleID (the allele-level handle used +for gnomAD cross-references); this carries the VariationID beside it for eventual external ClinVar links +(clinvar/variation/{id}). Nullable and not yet served — the dedicated clinvar_variants remodel (explicit +fields replacing the generic db_* shape, the serving/UI cutover, and backfill of existing rows) is +deferred. 
+""" + +import sqlalchemy as sa +from alembic import op + +revision = "d4e5f6a7b8c9" +down_revision = "c3d4e5f6a7b8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column("clinvar_controls", sa.Column("clinvar_variation_id", sa.String(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("clinvar_controls", "clinvar_variation_id") diff --git a/alembic/versions/e5f7a9c1b3d4_add_gnomad_allele_links.py b/alembic/versions/e5f7a9c1b3d4_add_gnomad_allele_links.py new file mode 100644 index 00000000..54f3e3d5 --- /dev/null +++ b/alembic/versions/e5f7a9c1b3d4_add_gnomad_allele_links.py @@ -0,0 +1,71 @@ +"""add gnomad_allele_links table + +Revision ID: e5f7a9c1b3d4 +Revises: d4e6f8a0b2c3 +Create Date: 2026-06-18 + +New valid-time link table connecting deduplicated alleles to gnomAD variants, replacing the frozen +gnomad_variants_mapped_variants association for new-model writes (Step 1 of the annotation +infrastructure migration, docs/design/annotation-infrastructure-migration.md). A link is live while +valid_to is NULL; the partial unique index enforces a single live link per allele (a version bump +supersedes the prior link). The existing gnomad_variants_mapped_variants table is left untouched (frozen serving). +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. 
+revision = "e5f7a9c1b3d4" +down_revision = "d4e6f8a0b2c3" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "gnomad_allele_links", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("allele_id", sa.Integer(), nullable=False), + sa.Column("gnomad_variant_id", sa.Integer(), nullable=False), + sa.Column("valid_from", sa.DateTime(timezone=True), nullable=False, server_default=sa.func.now()), + sa.Column("valid_to", sa.DateTime(timezone=True), nullable=True), + sa.ForeignKeyConstraint( + ["allele_id"], + ["alleles.id"], + name="fk_gnomad_allele_links_allele_id", + ondelete="RESTRICT", + ), + sa.ForeignKeyConstraint( + ["gnomad_variant_id"], + ["gnomad_variants.id"], + name="fk_gnomad_allele_links_gnomad_variant_id", + ondelete="RESTRICT", + ), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + "ix_gnomad_allele_links_allele_id", + "gnomad_allele_links", + ["allele_id"], + ) + op.create_index( + "ix_gnomad_allele_links_gnomad_variant_id", + "gnomad_allele_links", + ["gnomad_variant_id"], + ) + # One live link per allele: gnomAD frequency is a single current value, so a version bump + # supersedes the prior link rather than accumulating one live link per version. + op.create_index( + "uq_gnomad_allele_links_live", + "gnomad_allele_links", + ["allele_id"], + unique=True, + postgresql_where=sa.text("valid_to IS NULL"), + ) + + +def downgrade() -> None: + op.drop_index("uq_gnomad_allele_links_live", table_name="gnomad_allele_links") + op.drop_index("ix_gnomad_allele_links_gnomad_variant_id", table_name="gnomad_allele_links") + op.drop_index("ix_gnomad_allele_links_allele_id", table_name="gnomad_allele_links") + op.drop_table("gnomad_allele_links") diff --git a/src/mavedb/lib/alleles.py b/src/mavedb/lib/alleles.py new file mode 100644 index 00000000..ddf51565 --- /dev/null +++ b/src/mavedb/lib/alleles.py @@ -0,0 +1,54 @@ +"""Allele-graph queries over the deduplicated allele model. 
+ +These traverse the ``MappingRecordAllele`` link graph rather than any external identifier, because an +allele's cross-layer equivalence (its genomic / coding / protein representations) is established by the +mapping + reverse-translation process and materialized as co-membership in a ``MappingRecord``'s allele +set. No single identifier spans the layers: ClinGen's canonical allele id (CAID) covers only the +nucleotide layers and the protein allele carries a distinct PA, so the link graph is the only thing that +ties all three together. +""" + +from datetime import datetime +from typing import Optional + +from sqlalchemy import select +from sqlalchemy.orm import Session + +from mavedb.models.allele import Allele +from mavedb.models.mapping_record_allele import MappingRecordAllele + + +def get_allele_translations(db: Session, allele_id: int, *, as_of: Optional[datetime] = None) -> list[Allele]: + """Return the cross-layer equivalence set of ``allele_id``: every allele co-linked to a + ``MappingRecord`` that links it (the anchor allele itself included), spanning the genomic, coding, + and protein layers. + + The relation is co-membership in a record's allele set, not a shared identifier — see the module + docstring. Because alleles are deduplicated by ``vrs_digest`` and shared across variants/score sets, + the anchor may belong to several ``MappingRecord``s; the result is the union of their allele sets + (normally the same biological equivalence class). Scope by record or score set upstream if a single + context is required. + + Temporal: by default returns the currently-live set (``valid_to IS NULL``). Pass ``as_of`` to + reconstruct the set as it stood at a past instant — both the anchor hop and the fan-out hop apply + the same half-open ``[valid_from, valid_to)`` predicate, so the whole set is evaluated at one + instant. The retire-cascade invariant (a live link implies a live record) holds under ``as_of`` too, + so filtering the links alone is sufficient. 
+ """ + link_live = MappingRecordAllele.as_of(as_of) if as_of is not None else MappingRecordAllele.current + + record_ids = db.scalars( + select(MappingRecordAllele.mapping_record_id).where(MappingRecordAllele.allele_id == allele_id).where(link_live) + ).all() + if not record_ids: + return [] + + return list( + db.scalars( + select(Allele) + .join(MappingRecordAllele, MappingRecordAllele.allele_id == Allele.id) + .where(MappingRecordAllele.mapping_record_id.in_(record_ids)) + .where(link_live) + .distinct() + ).all() + ) diff --git a/src/mavedb/lib/annotation_status_manager.py b/src/mavedb/lib/annotation_status_manager.py index 6598bfab..4022cb4d 100644 --- a/src/mavedb/lib/annotation_status_manager.py +++ b/src/mavedb/lib/annotation_status_manager.py @@ -1,207 +1,170 @@ -"""Manage annotation statuses for variants. - -This module provides functionality to insert and retrieve annotation statuses -for genetic variants, ensuring that only one current status exists per -(variant, annotation type, version) combination. +"""Append-only writer for the annotation event log. + +Buffers :class:`AnnotationEvent` rows and flushes them in batches. This is an +**append-only** log, not a state table: there is no ``current`` flag to +maintain and no retire-on-write step. "Current" is derived at read time +(``DISTINCT ON (subject, annotation_type) … ORDER BY id DESC``) — exposed as +the ``v_current_annotation_events`` view (``mavedb.models.annotation_event_view``). + +Each event's *subject* is either a variant or an allele, chosen by +``annotation_type``. The writer validates the subject/type pairing up front so +a mis-subjected event fails with a clear ``ValueError`` rather than a deferred +DB ``CHECK`` violation at flush. 
""" import logging from typing import Optional -from sqlalchemy import select, update +from sqlalchemy import select from sqlalchemy.orm import Session from sqlalchemy.sql import desc from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.enums.disposition import Disposition +from mavedb.models.annotation_event import ALLELE_SUBJECT_TYPES, VARIANT_SUBJECT_TYPES, AnnotationEvent logger = logging.getLogger(__name__) -# Default number of pending annotations to accumulate before auto-flushing. +# Default number of pending events to accumulate before auto-flushing. DEFAULT_BATCH_SIZE = 500 class AnnotationStatusManager: - """ - Manager for handling variant annotation statuses with batched writes. + """Buffered, append-only writer for :class:`AnnotationEvent` rows. - Annotations are accumulated in memory and flushed to the database in - batches (default 500) to reduce round-trips. Callers **must** call - :meth:`flush` after the last ``add_annotation`` to persist any remainder. + Events are accumulated in memory and flushed in batches (default 500) to + reduce round-trips. Callers **must** call :meth:`flush` after the last + :meth:`record_event` to persist any remainder. 
""" - def __init__(self, session: Session, job_run_id: Optional[int] = None, *, batch_size: int = DEFAULT_BATCH_SIZE): + def __init__( + self, + session: Session, + job_run_id: Optional[int] = None, + *, + score_set_id: Optional[int] = None, + batch_size: int = DEFAULT_BATCH_SIZE, + ): self.session = session self.job_run_id = job_run_id + self.score_set_id = score_set_id self.batch_size = batch_size - self._pending: list[VariantAnnotationStatus] = [] - self._retirement_filters: list[dict] = [] + self._pending: list[AnnotationEvent] = [] - def add_annotation( + def record_event( self, - variant_id: int, annotation_type: AnnotationType, - status: AnnotationStatus, - version: Optional[str] = None, - failure_category: Optional[AnnotationFailureCategory] = None, - annotation_data: dict = {}, - current: bool = True, - replace_all_versions: bool = True, + *, + disposition: Disposition, + reason: str, + variant_id: Optional[int] = None, + allele_id: Optional[int] = None, + source_version: Optional[str] = None, + metadata: Optional[dict] = None, + score_set_id: Optional[int] = None, ) -> None: - """ - Stage a new annotation and schedule retirement of previous current rows. - - By default (``replace_all_versions=True``), all existing current annotations for - (variant, type) are retired regardless of version. + """Buffer one terminal observation about a variant or an allele. - When ``replace_all_versions=False``, only existing current annotations matching - (variant, type, version) are retired. + Exactly one of ``variant_id`` / ``allele_id`` must be set, and it must + match the subject the ``annotation_type`` keys on. ``reason`` is the + single "what happened" axis across all dispositions (see EventReason). - Writes are accumulated in memory and flushed to the database when - ``batch_size`` is reached. Call :meth:`flush` after the last add to - persist any remaining annotations. - - NOTE: - This method does not commit the session. 
The caller is responsible - for persisting changes (e.g., via ``session.commit()``). + Writes are accumulated in memory and flushed when ``batch_size`` is + reached. Call :meth:`flush` after the last call to persist the + remainder. Does not commit — the caller owns the transaction. """ - self._retirement_filters.append( - { - "variant_id": variant_id, - "annotation_type": annotation_type, - "replace_all_versions": replace_all_versions, - "version": version, - } - ) + self._validate_subject(annotation_type, variant_id, allele_id) self._pending.append( - VariantAnnotationStatus( - variant_id=variant_id, + AnnotationEvent( annotation_type=annotation_type, - status=status, - version=version, - failure_category=failure_category, - current=current, + variant_id=variant_id, + allele_id=allele_id, + disposition=disposition, + reason=reason, + source_version=source_version, + event_metadata=metadata, job_run_id=self.job_run_id, - **annotation_data, + score_set_id=score_set_id if score_set_id is not None else self.score_set_id, ) # type: ignore[call-arg] ) if len(self._pending) >= self.batch_size: self.flush() - def flush(self) -> None: - """Flush all pending annotations to the database. + @staticmethod + def _validate_subject(annotation_type: AnnotationType, variant_id: Optional[int], allele_id: Optional[int]) -> None: + if (variant_id is None) == (allele_id is None): + raise ValueError("Exactly one of variant_id or allele_id must be set") - Retires old ``current=True`` rows in bulk, then inserts all pending - new rows in a single ``add_all`` + ``flush``. This replaces the - previous pattern of 2 flushes per ``add_annotation`` call. 
- """ + type_value = annotation_type.value if isinstance(annotation_type, AnnotationType) else annotation_type + if variant_id is not None and type_value not in VARIANT_SUBJECT_TYPES: + raise ValueError(f"annotation_type {type_value!r} is allele-subject; pass allele_id, not variant_id") + if allele_id is not None and type_value not in ALLELE_SUBJECT_TYPES: + raise ValueError(f"annotation_type {type_value!r} is variant-subject; pass variant_id, not allele_id") + + def flush(self) -> None: + """Insert all pending events in a single ``add_all`` + ``flush``.""" if not self._pending: return - self._retire_existing() self.session.add_all(self._pending) self.session.flush() - logger.debug(f"Flushed {len(self._pending)} annotation statuses") + logger.debug(f"Flushed {len(self._pending)} variant events") self._pending.clear() - self._retirement_filters.clear() - - def _retire_existing(self) -> None: - """Bulk-retire existing current annotations for all pending writes. - - Groups retirement filters by (annotation_type, replace_all_versions, version) - and issues one UPDATE per group, minimizing round-trips. - """ - # Group filters to minimize UPDATE statements. 
- # Key: (annotation_type, replace_all_versions, version) -> list of variant_ids - groups: dict[tuple, list[int]] = {} - for f in self._retirement_filters: - key = (f["annotation_type"], f["replace_all_versions"], f["version"]) - groups.setdefault(key, []).append(f["variant_id"]) - - for (annotation_type, replace_all_versions, version), variant_ids in groups.items(): - conditions = [ - VariantAnnotationStatus.variant_id.in_(variant_ids), - VariantAnnotationStatus.annotation_type == annotation_type, - VariantAnnotationStatus.current.is_(True), - ] - if not replace_all_versions: - conditions.append(VariantAnnotationStatus.version == version) - - stmt = update(VariantAnnotationStatus).where(*conditions).values(current=False) - self.session.execute(stmt) def get_current_annotation( - self, variant_id: int, annotation_type: AnnotationType, version: Optional[str] = None - ) -> Optional[VariantAnnotationStatus]: - """ - Retrieve the current annotation for a given variant/type/version. - - Flushes pending annotations first to ensure the result is up to date. - """ - self.flush() - - stmt = select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == variant_id, - VariantAnnotationStatus.annotation_type == annotation_type, - VariantAnnotationStatus.current.is_(True), - ) - - if version is not None: - stmt = stmt.where(VariantAnnotationStatus.version == version) - - result = self.session.execute(stmt) - return result.scalar_one_or_none() - - def get_annotation_history( self, - variant_id: int, annotation_type: AnnotationType, - version: Optional[str] = None, - ) -> list[VariantAnnotationStatus]: - """ - Return the full annotation timeline for a variant/type, newest first. - - Includes both current and retired rows — useful for debugging and - support investigations. 
+ *, + variant_id: Optional[int] = None, + allele_id: Optional[int] = None, + source_version: Optional[str] = None, + ) -> Optional[AnnotationEvent]: + """Latest event for a single ``(subject, annotation_type)`` key. + + Current status is the newest event by ``id`` — there is no ``current`` + flag to filter on. Flushes pending events first so the result reflects + buffered writes. """ self.flush() + self._validate_subject(annotation_type, variant_id, allele_id) - stmt = ( - select(VariantAnnotationStatus) - .where( - VariantAnnotationStatus.variant_id == variant_id, - VariantAnnotationStatus.annotation_type == annotation_type, - ) - .order_by(desc(VariantAnnotationStatus.id)) - ) - - if version is not None: - stmt = stmt.where(VariantAnnotationStatus.version == version) + stmt = select(AnnotationEvent).where(AnnotationEvent.annotation_type == annotation_type) + if variant_id is not None: + stmt = stmt.where(AnnotationEvent.variant_id == variant_id) + else: + stmt = stmt.where(AnnotationEvent.allele_id == allele_id) + if source_version is not None: + stmt = stmt.where(AnnotationEvent.source_version == source_version) - return list(self.session.scalars(stmt).all()) + stmt = stmt.order_by(desc(AnnotationEvent.id)).limit(1) + return self.session.scalars(stmt).first() - def get_all_current_annotations( + def get_event_history( self, - variant_id: int, - ) -> list[VariantAnnotationStatus]: - """ - Return all current annotations for a variant, across all types and versions. - - Useful for a quick overview of what annotations are active for a given variant. + annotation_type: AnnotationType, + *, + variant_id: Optional[int] = None, + allele_id: Optional[int] = None, + source_version: Optional[str] = None, + ) -> list[AnnotationEvent]: + """Full event timeline for a ``(subject, annotation_type)`` key, newest first. + + The append-only log retains every observation — skips, reconfirms, + no-ops — so this is the complete audit trail, not just the current row. 
""" self.flush() + self._validate_subject(annotation_type, variant_id, allele_id) - stmt = ( - select(VariantAnnotationStatus) - .where( - VariantAnnotationStatus.variant_id == variant_id, - VariantAnnotationStatus.current.is_(True), - ) - .order_by(VariantAnnotationStatus.annotation_type, VariantAnnotationStatus.version) - ) + stmt = select(AnnotationEvent).where(AnnotationEvent.annotation_type == annotation_type) + if variant_id is not None: + stmt = stmt.where(AnnotationEvent.variant_id == variant_id) + else: + stmt = stmt.where(AnnotationEvent.allele_id == allele_id) + if source_version is not None: + stmt = stmt.where(AnnotationEvent.source_version == source_version) + stmt = stmt.order_by(desc(AnnotationEvent.id)) return list(self.session.scalars(stmt).all()) diff --git a/src/mavedb/lib/clingen/allele_registry.py b/src/mavedb/lib/clingen/allele_registry.py index b773e689..b28a184d 100644 --- a/src/mavedb/lib/clingen/allele_registry.py +++ b/src/mavedb/lib/clingen/allele_registry.py @@ -136,80 +136,6 @@ async def get_associated_clinvar_allele_id(clingen_allele_id: str) -> str: return "" -def extract_hgvs_from_ca_allele_data( - data: dict, - target_is_coding: bool, - transcript_accession: Optional[str], -) -> tuple[Optional[str], Optional[str], Optional[str]]: - """Extract HGVS strings from ClinGen allele data for a CA (canonical allele) ID. - - Parses the ClinGen API response to find GRCh38 genomic HGVS, coding HGVS - matching the target transcript (or MANE fallback), and protein HGVS. - - Args: - data: Parsed JSON response from the ClinGen Allele Registry API. - target_is_coding: Whether the score set target is protein-coding. - transcript_accession: Specific transcript accession to match, or None to use MANE. - - Returns: - Tuple of (hgvs_g, hgvs_c, hgvs_p), any of which may be None. 
- """ - hgvs_g: Optional[str] = None - hgvs_c: Optional[str] = None - hgvs_p: Optional[str] = None - - if data.get("genomicAlleles"): - for allele in data["genomicAlleles"]: - if allele.get("referenceGenome") == "GRCh38" and allele.get("hgvs"): - hgvs_g = allele["hgvs"][0] - break - - if target_is_coding and data.get("transcriptAlleles"): - if transcript_accession: - for allele in data["transcriptAlleles"]: - if allele.get("hgvs"): - for hgvs_string in allele["hgvs"]: - hgvs_reference_sequence = hgvs_string.split(":")[0] - if transcript_accession == hgvs_reference_sequence: - hgvs_c = hgvs_string - break - if hgvs_c: - if allele.get("proteinEffect"): - hgvs_p = allele["proteinEffect"].get("hgvs") - break - else: - # No transcript specified; use MANE if available - for allele in data["transcriptAlleles"]: - if allele.get("MANE"): - hgvs_c = allele["MANE"].get("nucleotide", {}).get("RefSeq", {}).get("hgvs") - hgvs_p = allele["MANE"].get("protein", {}).get("RefSeq", {}).get("hgvs") - break - - return hgvs_g, hgvs_c, hgvs_p - - -def extract_hgvs_from_pa_allele_data(data: dict) -> tuple[Optional[str], Optional[str], Optional[str]]: - """Extract HGVS strings from ClinGen allele data for a PA (protein allele) ID. - - For PA alleles, only hgvs_p is extracted from aminoAcidAlleles. - - Args: - data: Parsed JSON response from the ClinGen Allele Registry API. - - Returns: - Tuple of (None, None, hgvs_p), where hgvs_p may be None. - """ - hgvs_p: Optional[str] = None - - if data.get("aminoAcidAlleles"): - for allele in data["aminoAcidAlleles"]: - if allele.get("hgvs"): - hgvs_p = allele["hgvs"][0] - break - - return None, None, hgvs_p - - def expand_allele_ids(clingen_allele_ids: list[Optional[str]]) -> set[str]: """Expand comma-separated multi-variant ClinGen allele IDs into individual IDs. 
diff --git a/src/mavedb/lib/clingen/alleles.py b/src/mavedb/lib/clingen/alleles.py index 52757223..58cb6d7d 100644 --- a/src/mavedb/lib/clingen/alleles.py +++ b/src/mavedb/lib/clingen/alleles.py @@ -5,7 +5,7 @@ score set. A single definition here prevents the two jobs from drifting apart. """ -from typing import NamedTuple +from typing import Callable, Iterable, NamedTuple, Optional, TypeVar from sqlalchemy import select from sqlalchemy.orm import Session @@ -15,20 +15,26 @@ from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.variant import Variant +P = TypeVar("P") + class ScoreSetAlleleRow(NamedTuple): - """One (allele, variant) link for a score set. An allele shared by multiple variants - appears once per variant so callers can fan annotation statuses out correctly. + """One (allele, variant) link for a score set. An allele shared by multiple variants appears once + per variant; :func:`group_alleles_for_annotation` collapses those duplicates into one work-unit + per allele. - ``is_authoritative`` is a property of the link, not the allele: the same VRS allele can be - the authoritative measurement for one variant and an RT-derived equivalence for another. + ``hgvs_g``/``hgvs_c``/``hgvs_p`` are allele-level (stable by construction), carried here so the + VEP job can build its HGVS payload without a second query. They are optional with a ``None`` + default so payloads keying only on the CAID (gnomAD/ClinVar) need not name them. 
""" allele_id: int post_mapped: dict | None clingen_allele_id: str | None variant_id: int - is_authoritative: bool + hgvs_g: str | None = None + hgvs_c: str | None = None + hgvs_p: str | None = None def get_alleles_for_score_set(db: Session, score_set_id: int) -> list[ScoreSetAlleleRow]: @@ -47,7 +53,9 @@ def get_alleles_for_score_set(db: Session, score_set_id: int) -> list[ScoreSetAl Allele.post_mapped, Allele.clingen_allele_id, Variant.id.label("variant_id"), - MappingRecordAllele.is_authoritative, + Allele.hgvs_g, + Allele.hgvs_c, + Allele.hgvs_p, ) .join(MappingRecordAllele, MappingRecordAllele.allele_id == Allele.id) .join(MappingRecord, MappingRecord.id == MappingRecordAllele.mapping_record_id) @@ -58,4 +66,42 @@ def get_alleles_for_score_set(db: Session, score_set_id: int) -> list[ScoreSetAl .where(Allele.post_mapped.is_not(None)) ).all() - return [ScoreSetAlleleRow(r.id, r.post_mapped, r.clingen_allele_id, r.variant_id, r.is_authoritative) for r in rows] + return [ + ScoreSetAlleleRow(r.id, r.post_mapped, r.clingen_allele_id, r.variant_id, r.hgvs_g, r.hgvs_c, r.hgvs_p) + for r in rows + ] + + +def group_alleles_for_annotation( + rows: Iterable[ScoreSetAlleleRow], + payload: Callable[[ScoreSetAlleleRow], Optional[P]], +) -> dict[int, P]: + """Collapse the per-(allele, variant) rows from :func:`get_alleles_for_score_set` into one + job-specific payload per allele, keyed by ``allele_id``. + + The same allele recurs once per variant that links it, so this dedups those rows down to one + entry per allele — the shape every allele-keyed annotation job wants now that annotation events + are allele-keyed (one event per allele, never fanned per-variant). + + ``payload`` builds the job-specific payload from the first row seen for an allele — the CAID for + gnomAD/ClinVar, the HGVS for VEP, etc. Returning ``None`` skips the allele entirely (e.g. it + carries no CAID), replacing each job's ad-hoc ``if row.x is None: continue``. 
``payload`` must be + a pure function of allele-level fields so its result is stable across an allele's rows. + + Grouping on ``allele_id`` rather than ``vrs_digest`` is intentional: the two are 1:1 (content + addressing makes ``vrs_digest`` unique), so the groups are identical either way, and ``allele_id`` + is the permanent, never-reused surrogate. If annotation storage later keys on the digest, carry + it on the row and store against it — the grouping contract here does not change. + """ + groups: dict[int, P] = {} + for row in rows: + if row.allele_id in groups: + continue + + built = payload(row) + if built is None: + continue + + groups[row.allele_id] = built + + return groups diff --git a/src/mavedb/lib/clinvar/constants.py b/src/mavedb/lib/clinvar/constants.py index e70c4fee..fd80212b 100644 --- a/src/mavedb/lib/clinvar/constants.py +++ b/src/mavedb/lib/clinvar/constants.py @@ -26,6 +26,10 @@ backoff for throttling is unnecessary — a modest retry with short backoff suffices. """ -CLINVAR_FIELDS_TO_KEEP = ("GeneSymbol", "ClinicalSignificance", "ReviewStatus") +CLINVAR_FIELDS_TO_KEEP = ("GeneSymbol", "ClinicalSignificance", "ReviewStatus", "VariationID") """Only these fields are extracted from each ClinVar TSV row and cached. The full TSV has ~30 columns; trimming to only what we need shrinks the cached pickle from hundreds of MB to tens of MB and speeds up load times. + +VariationID is ClinVar's canonical public identifier (anchors the web UI / variation links); we keep it +alongside the AlleleID (the row key) so the link record can carry both. A row missing the column on an +older archival TSV degrades to None rather than failing the whole version's parse. """ diff --git a/src/mavedb/lib/clinvar/utils.py b/src/mavedb/lib/clinvar/utils.py index 689e369e..d72968da 100644 --- a/src/mavedb/lib/clinvar/utils.py +++ b/src/mavedb/lib/clinvar/utils.py @@ -175,8 +175,11 @@ def _fetch_parse_and_cache( # as a list (which would be 1.5–2 GB for a modern TSV). 
with gzip.open(filename=buf, mode="rt") as f: reader = csv.DictReader(f, delimiter="\t") # type: ignore + # row.get (not row[field]) so a field absent from an older archival TSV schema yields + # None for that row rather than raising and discarding the whole version's parse. data: Dict[str, Dict[str, str]] = { - str(row["#AlleleID"]): {field: row[field] for field in CLINVAR_FIELDS_TO_KEEP} for row in reader + str(row["#AlleleID"]): {field: row.get(field) for field in CLINVAR_FIELDS_TO_KEEP} # type: ignore[misc] + for row in reader } finally: csv.field_size_limit(default_csv_field_size_limit) diff --git a/src/mavedb/lib/gnomad.py b/src/mavedb/lib/gnomad.py index 9bfa0fec..5fd4d888 100644 --- a/src/mavedb/lib/gnomad.py +++ b/src/mavedb/lib/gnomad.py @@ -1,24 +1,43 @@ import logging import os import re +from enum import Enum from typing import Any, Sequence, Union -from sqlalchemy import Connection, Row, select, text +from sqlalchemy import Connection, Row, func, select, text from sqlalchemy.orm import Session -from mavedb.lib.annotation_status_manager import AnnotationStatusManager from mavedb.lib.logging.context import logging_context, save_to_logging_context from mavedb.lib.utils import batched -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus +from mavedb.models.allele import Allele +from mavedb.models.gnomad_allele_link import GnomadAlleleLink from mavedb.models.gnomad_variant import GnomADVariant -from mavedb.models.mapped_variant import MappedVariant -GNOMAD_DB_NAME = "gnomAD" -GNOMAD_DATA_VERSION = os.getenv("GNOMAD_DATA_VERSION", "v4.1") # e.g., "v4.1" logger = logging.getLogger(__name__) +GNOMAD_DB_NAME = "gnomAD" +GNOMAD_DATA_VERSION = os.getenv("GNOMAD_DATA_VERSION", "v4.1") +_CAID_LEADING_ZERO_RE = r"^(CA)0+([0-9])" +""" +Strip leading zeros from a CAID's numeric portion, keeping at least one digit. 
+Kept byte-for-byte in sync with the SQL form used to normalize Allele.clingen_allele_id +in link_gnomad_variants_to_alleles. +""" + + +class GnomadLinkVerdict(str, Enum): + """Per-allele outcome of a gnomAD linking run, returned for every allele the linker touched. + + The single source of truth for what happened to an allele's link this run — the caller derives + annotation status directly from this, never by re-querying link state (which would be a second, + drift-prone source of truth). + """ + + CREATED = "created" # link created or superseded this run (a new/changed live link) + UNCHANGED = "unchanged" # a live link already pointed at the resolved variant; left untouched + + def gnomad_identifier(contig: str, position: Union[str, int], alleles: list[str]) -> str: """ Generate a gnomAD variant identifier based on contig, position, and alleles. @@ -46,6 +65,18 @@ def gnomad_table_name() -> str: return table_name +def normalize_caid(caid: str) -> str: + """Normalize a ClinGen CAID by stripping leading zeros from its numeric portion. + + The gnomAD Hail/Athena dump drops leading zeros from CAIDs — MaveDB's ``CA025094`` is recorded as + ``CA25094`` — so an exact-string join silently misses every zero-padded CAID (issue #722). Both + sides of the join are normalized to the unpadded form to repair the match. ``CA025094`` and + ``CA25094`` denote the same ClinGen allele, so this can never collide distinct alleles. A value + that is not a recognizable CAID (no ``CA`` prefix + digits) is returned unchanged. + """ + return re.sub(_CAID_LEADING_ZERO_RE, r"\1\2", caid) + + def allele_list_from_list_like_string(alleles_string: str) -> list[str]: """ Convert a list-like string representation of alleles into a Python list. @@ -94,8 +125,9 @@ def gnomad_variant_data_for_caids( Raises: sqlalchemy.exc.SQLAlchemyError: If there is an error executing the query. """ + # Normalize to the unpadded form the dump stores so the IN-list matches zero-padded CAIDs (see issue #722). 
chunked_caids = batched(caids, 16250) - caid_strs = [",".join(f"'{caid}'" for caid in chunk) for chunk in chunked_caids] + caid_strs = [",".join(f"'{normalize_caid(caid)}'" for caid in chunk) for chunk in chunked_caids] save_to_logging_context({"num_caids": len(caids), "num_chunks": len(caid_strs)}) result_rows: list[Row[Any]] = [] @@ -132,31 +164,46 @@ def gnomad_variant_data_for_caids( return result_rows -def link_gnomad_variants_to_mapped_variants( - db: Session, gnomad_variant_data: Sequence[Row[Any]], only_current: bool = True -) -> int: - """ - Links gnomAD variants to mapped variants in the database based on CAIDs. Note that this function does - not commit this data to the database; it only prepares the relationships. - - Args: - caids (list[str]): A list of CAIDs to link with gnomAD variants. +def link_gnomad_variants_to_alleles( + db: Session, gnomad_variant_data: Sequence[Row[Any]] +) -> dict[int, GnomadLinkVerdict]: + """Link gnomAD variants to deduplicated alleles by CAID, superseding only on change. + + Every ``Allele`` carrying the row's ``clingen_allele_id`` (populated by CAR) is linked through a + valid-time :class:`GnomadAlleleLink`, so one gnomAD variant fans out to every allele sharing the + CAID (cross-score-set dedup included). Each allele holds at most one live link, superseded **only + on change**: a live link already pointing to the resolved variant is left untouched (an unchanged + re-run writes no spurious valid-time boundary); a new/different/older-version target retires it and + inserts a successor, keyed on ``allele_id`` so a version bump replaces rather than accumulates. The + guard is load-bearing despite the job's upstream skip — shared CAIDs and ``force`` runs still reach + it. A current-version link to a *different* identifier (a CAID re-resolved within one release) is + logged and superseded newest-wins, not raised. + + Does not commit. 
Returns a verdict per allele *touched* this run (matched a CAID-bearing row): + :attr:`GnomadLinkVerdict.CREATED` for a created/superseded link, :attr:`~GnomadLinkVerdict.UNCHANGED` + for a live link left in place. Alleles absent from the map were matched by no row — the caller reads + those as "gnomAD had no record". This is the single source of truth for per-allele status; callers + must not re-derive it by re-querying link state. """ save_to_logging_context({"num_gnomad_variant_rows": len(gnomad_variant_data)}) - save_to_logging_context({"only_current": only_current}) - logger.debug(msg="Linking gnomAD variants to mapped variants", extra=logging_context()) + logger.debug(msg="Linking gnomAD variants to alleles", extra=logging_context()) - linked_gnomad_variants = 0 - annotation_manager = AnnotationStatusManager(db) + verdicts: dict[int, GnomadLinkVerdict] = {} for index, row in enumerate(gnomad_variant_data, start=1): logger.info( msg=f"Processing gnomAD variant row {index}/{len(gnomad_variant_data)}: {row.caid}", extra=logging_context() ) - mapped_variants_with_caids_query = select(MappedVariant).where(MappedVariant.clingen_allele_id == row.caid) - if only_current: - mapped_variants_with_caids_query = mapped_variants_with_caids_query.where(MappedVariant.current.is_(True)) - mapped_variants_with_caids = db.scalars(mapped_variants_with_caids_query).all() + # Match on the unpadded CAID: the dump's caid is already stripped, while the stored CAID may + # be zero-padded, so normalize both sides (issue #722). regexp_replace mirrors normalize_caid. 
+ alleles_with_caid = db.scalars( + select(Allele).where( + func.regexp_replace(Allele.clingen_allele_id, _CAID_LEADING_ZERO_RE, r"\1\2") + == normalize_caid(row.caid) + ) + ).all() + if not alleles_with_caid: + continue gnomad_identifier_for_variant = gnomad_identifier( row.__getattribute__("locus.contig"), @@ -172,80 +219,88 @@ def link_gnomad_variants_to_mapped_variants( if faf95_max is not None: faf95_max = float(faf95_max) - for mapped_variant in mapped_variants_with_caids: - # Remove any existing gnomAD variants for this mapped variant that match the current gnomAD data version to avoid data duplication. - # There should only be one gnomAD variant per mapped variant per gnomAD data version, since each gnomAD variant can only match to one - # CAID. - for linked_gnomad_variant in mapped_variant.gnomad_variants: - if linked_gnomad_variant.db_version == GNOMAD_DATA_VERSION: - logger.debug( - msg=f"Removing existing gnomAD variant {linked_gnomad_variant.db_identifier} from mapped variant {mapped_variant.id} ({mapped_variant.clingen_allele_id})", - extra=logging_context(), - ) - mapped_variant.gnomad_variants.remove(linked_gnomad_variant) - - existing_gnomad_variant = db.scalar( - select(GnomADVariant).where( - GnomADVariant.db_name == "gnomAD", - GnomADVariant.db_identifier == gnomad_identifier_for_variant, - GnomADVariant.db_version == GNOMAD_DATA_VERSION, - ) + # One gnomAD variant per (identifier, version): get-or-create so repeated CAIDs and re-runs + # reuse the same row. Flush so a freshly created variant has an id for the link below. 
+ gnomad_variant = db.scalar( + select(GnomADVariant).where( + GnomADVariant.db_name == GNOMAD_DB_NAME, + GnomADVariant.db_identifier == gnomad_identifier_for_variant, + GnomADVariant.db_version == GNOMAD_DATA_VERSION, + ) + ) + if gnomad_variant is None: + logger.debug( + msg=f"Creating new gnomAD variant for identifier {gnomad_identifier_for_variant}", + extra=logging_context(), + ) + gnomad_variant = GnomADVariant( + db_name=GNOMAD_DB_NAME, + db_identifier=gnomad_identifier_for_variant, + db_version=GNOMAD_DATA_VERSION, + allele_count=allele_count, + allele_number=allele_number, + allele_frequency=allele_frequency, # type: ignore + faf95_max_ancestry=faf95_max_ancestry, + faf95_max=faf95_max, # type: ignore + ) + db.add(gnomad_variant) + db.flush() + else: + logger.debug( + msg=f"Found existing gnomAD variant for identifier {gnomad_identifier_for_variant}", + extra=logging_context(), ) - if existing_gnomad_variant is None: - logger.debug( - msg=f"Creating new gnomAD variant for identifier {gnomad_identifier_for_variant}", - extra=logging_context(), + for allele in alleles_with_caid: + live_link = db.scalar( + select(GnomadAlleleLink).where( + GnomadAlleleLink.allele_id == allele.id, + GnomadAlleleLink.current, ) - gnomad_variant = GnomADVariant( - db_name=GNOMAD_DB_NAME, - db_identifier=gnomad_identifier_for_variant, - db_version=GNOMAD_DATA_VERSION, - allele_count=allele_count, - allele_number=allele_number, - allele_frequency=allele_frequency, # type: ignore - faf95_max_ancestry=faf95_max_ancestry, - faf95_max=faf95_max, # type: ignore - ) - else: - logger.debug( - msg=f"Found existing gnomAD variant for identifier {gnomad_identifier_for_variant}", + ) + + # No change: live link already points here — leave it untouched (no spurious boundary). 
+ if live_link is not None and live_link.gnomad_variant_id == gnomad_variant.id: + verdicts.setdefault(allele.id, GnomadLinkVerdict.UNCHANGED) + continue + + if ( + live_link is not None + and live_link.gnomad_variant.db_version == GNOMAD_DATA_VERSION + and live_link.gnomad_variant.db_identifier != gnomad_identifier_for_variant + ): + logger.warning( + msg=( + f"CAID {allele.clingen_allele_id} for allele {allele.id} resolved to " + f"{gnomad_identifier_for_variant} at version {GNOMAD_DATA_VERSION}, but a live link " + f"already points to {live_link.gnomad_variant.db_identifier} at the same version. " + "Superseding (newest wins); investigate the gnomAD source for a re-resolved CAID." + ), extra=logging_context(), ) - gnomad_variant = existing_gnomad_variant - - if gnomad_variant not in mapped_variant.gnomad_variants: - mapped_variant.gnomad_variants.append(gnomad_variant) - linked_gnomad_variants += 1 - db.add(gnomad_variant) - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY, - version=GNOMAD_DATA_VERSION, - status=AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": { - "gnomad_db_identifier": gnomad_variant.db_identifier, - } - }, - current=True, + # Change: retire any live link for the allele, insert the successor (allele-keyed, so a + # version bump replaces rather than accumulates). 
+ GnomadAlleleLink.supersede_live_where( + db, + [GnomadAlleleLink(allele_id=allele.id, gnomad_variant_id=gnomad_variant.id)], + GnomadAlleleLink.allele_id == allele.id, ) + verdicts[allele.id] = GnomadLinkVerdict.CREATED # created always wins over a same-run unchanged logger.debug( - msg=f"Linked gnomAD variant {gnomad_variant.db_identifier} to mapped variant {mapped_variant.id} ({mapped_variant.clingen_allele_id})", + msg=f"Linked gnomAD variant {gnomad_variant.db_identifier} to allele {allele.id} ({allele.clingen_allele_id})", extra=logging_context(), ) logger.info( - f"Linked {len(mapped_variants_with_caids)} mapped variants with CAID {row.caid} to gnomAD variant {gnomad_identifier_for_variant}. ({index}/{len(gnomad_variant_data)})" + f"Processed {len(alleles_with_caid)} alleles with CAID {row.caid} for gnomAD variant {gnomad_identifier_for_variant}. ({index}/{len(gnomad_variant_data)})" ) - annotation_manager.flush() - - save_to_logging_context({"linked_gnomad_variants": linked_gnomad_variants}) + changed_allele_count = sum(1 for v in verdicts.values() if v is GnomadLinkVerdict.CREATED) + save_to_logging_context({"changed_allele_count": changed_allele_count}) logger.info( - msg=f"Linked a total of {linked_gnomad_variants} gnomAD variants to mapped variants.", + msg=f"Created or superseded {changed_allele_count} allele links this run.", extra=logging_context(), ) - return linked_gnomad_variants + return verdicts diff --git a/src/mavedb/lib/hgvs.py b/src/mavedb/lib/hgvs.py index 5826e67c..0a03445a 100644 --- a/src/mavedb/lib/hgvs.py +++ b/src/mavedb/lib/hgvs.py @@ -1,10 +1,24 @@ import re +import sys from typing import Optional # Coordinate prefix of an HGVS variant description: a single type letter plus a dot # (g. c. n. m. r. p. ...), capturing the prefix and the remaining description separately. 
_HGVS_COORD_PREFIX = re.compile(r"^([a-z]\.)(.+)$") +_FIRST_INT = re.compile(r"\d+") + + +def _cis_phased_sort_key(description: str) -> tuple[int, str]: + """Order key for a cis-phased component description by its first integer position. + + The first run of digits is the coordinate for both genomic (``123A>G``) and protein + (``Arg123Gly``) forms. Descriptions with no digit sort last, and the raw string breaks ties so + the order is total and stable. + """ + match = _FIRST_INT.search(description) + return (int(match.group()) if match else sys.maxsize, description) + def extract_accession(hgvs_string: str) -> str: """Extract the reference accession from an HGVS string, or return empty string if it cannot @@ -52,10 +66,13 @@ def split_cis_phased_hgvs(hgvs_string: str) -> list[str]: Unlike a bare mavehgvs split, the accession is preserved: the components feed straight into VRS translation, which requires a reference accession to resolve positions. """ - if "[" not in hgvs_string: + accession, separator, remainder = hgvs_string.partition(":") + # Only an accession-qualified, bracketed expression is a cis-phased multivariant we split here; + # anything else (bare, unbracketed, or accession-less) is returned unchanged so the caller can + # treat both cases uniformly without a ValueError on the missing ":" / "[". + if not separator or "[" not in remainder: return [hgvs_string] - accession, _, remainder = hgvs_string.partition(":") prefix = remainder[: remainder.index("[")] # e.g. "g." / "c." inner = remainder[remainder.index("[") + 1 : remainder.rindex("]")] return [f"{accession}:{prefix}{component}" for component in inner.split(";") if component] @@ -91,4 +108,8 @@ def join_cis_phased_hgvs(components: list[str]) -> Optional[str]: if len(accessions) != 1 or len(prefixes) != 1: return None + # Emit components in coordinate order so the combined string is deterministic regardless of + # member ordering. 
The VRS block digest is order-independent (so dedup is unaffected), but this + # string is surfaced in CSV export, where a stable, spec-conventional ordering is useful. + descriptions.sort(key=_cis_phased_sort_key) return f"{accessions.pop()}:{prefixes.pop()}[{';'.join(descriptions)}]" diff --git a/src/mavedb/lib/score_sets.py b/src/mavedb/lib/score_sets.py index 93b7c67e..800f1a80 100644 --- a/src/mavedb/lib/score_sets.py +++ b/src/mavedb/lib/score_sets.py @@ -37,7 +37,7 @@ from mavedb.models.experiment_controlled_keyword import ExperimentControlledKeywordAssociation from mavedb.models.experiment_publication_identifier import ExperimentPublicationIdentifierAssociation from mavedb.models.experiment_set import ExperimentSet -from mavedb.models.clinical_control import ClinicalControl +from mavedb.models.clinical_control import ClinvarControl from mavedb.models.clinical_control_mapped_variant import mapped_variants_clinical_controls_association_table from mavedb.models.gnomad_variant import GnomADVariant from mavedb.models.mapped_variant import MappedVariant @@ -700,14 +700,14 @@ def get_score_set_variants_as_csv( idx = 2 if need_mappings else 1 gnomad_data.append(row[idx]) - # For each ClinVar namespace, fetch a mapping from mapped_variant_id to ClinicalControl. - clinvar_data_map: dict[str, dict[int, Optional[ClinicalControl]]] = {} + # For each ClinVar namespace, fetch a mapping from mapped_variant_id to ClinvarControl. 
+ clinvar_data_map: dict[str, dict[int, Optional[ClinvarControl]]] = {} if clinvar_namespaces and mappings is not None: mv_ids = [m.id for m in mappings if m is not None] for ns, db_version in clinvar_namespaces.items(): - mv_to_cc: dict[int, Optional[ClinicalControl]] = {} + mv_to_cc: dict[int, Optional[ClinvarControl]] = {} if mv_ids: - aliased_cc = aliased(ClinicalControl) + aliased_cc = aliased(ClinvarControl) cc_query = ( select( mapped_variants_clinical_controls_association_table.c.mapped_variant_id, @@ -730,11 +730,11 @@ def get_score_set_variants_as_csv( clinvar_data_map[ns] = mv_to_cc # Build per-variant ClinVar lookup (list indexed in parallel with variants). - clinvar_per_variant: Optional[list[Optional[dict[str, Optional[ClinicalControl]]]]] = None + clinvar_per_variant: Optional[list[Optional[dict[str, Optional[ClinvarControl]]]]] = None if clinvar_namespaces and mappings is not None: clinvar_per_variant = [] for mapping in mappings: - row_clinvar: dict[str, Optional[ClinicalControl]] = {} + row_clinvar: dict[str, Optional[ClinvarControl]] = {} for ns, mv_to_cc in clinvar_data_map.items(): if mapping is not None and mapping.id is not None: row_clinvar[ns] = mv_to_cc.get(mapping.id) @@ -810,7 +810,7 @@ def variant_to_csv_row( columns: dict[str, list[str]], mapping: Optional[MappedVariant] = None, gnomad_data: Optional[GnomADVariant] = None, - clinvar_data_by_ns: Optional[dict[str, Optional[ClinicalControl]]] = None, + clinvar_data_by_ns: Optional[dict[str, Optional[ClinvarControl]]] = None, namespaced: Optional[bool] = None, na_rep="NA", ) -> dict[str, Any]: @@ -829,7 +829,7 @@ def variant_to_csv_row( Mapped variant corresponding to the variant. gnomad_data : variant.models.GnomADVariant, optional gnomAD variant data corresponding to the variant. - clinvar_data_by_ns : dict[str, Optional[ClinicalControl]], optional + clinvar_data_by_ns : dict[str, Optional[ClinvarControl]], optional Per-variant ClinVar data keyed by namespace (e.g. "clinvar.2024_01"). 
na_rep : str String to represent null values. @@ -958,7 +958,7 @@ def variants_to_csv_rows( columns: dict[str, list[str]], mappings: Optional[Sequence[Optional[MappedVariant]]] = None, gnomad_data: Optional[Sequence[Optional[GnomADVariant]]] = None, - clinvar_data_by_ns: Optional[Sequence[Optional[dict[str, Optional[ClinicalControl]]]]] = None, + clinvar_data_by_ns: Optional[Sequence[Optional[dict[str, Optional[ClinvarControl]]]]] = None, namespaced: Optional[bool] = None, na_rep="NA", ) -> Iterable[dict[str, Any]]: @@ -977,7 +977,7 @@ def variants_to_csv_rows( List of mapped variants corresponding to the variants. gnomad_data : list[Optional[variant.models.GnomADVariant]], optional List of gnomAD variant data corresponding to the variants. - clinvar_data_by_ns : list[Optional[dict[str, Optional[ClinicalControl]]]], optional + clinvar_data_by_ns : list[Optional[dict[str, Optional[ClinvarControl]]]], optional Per-variant ClinVar data keyed by namespace (e.g. "clinvar.2024_01"). na_rep : str String to represent null values. @@ -989,7 +989,7 @@ def variants_to_csv_rows( n = len(variants) _mappings: Sequence[Optional[MappedVariant]] = mappings if mappings is not None else [None] * n _gnomad: Sequence[Optional[GnomADVariant]] = gnomad_data if gnomad_data is not None else [None] * n - _clinvar: Sequence[Optional[dict[str, Optional[ClinicalControl]]]] = ( + _clinvar: Sequence[Optional[dict[str, Optional[ClinvarControl]]]] = ( clinvar_data_by_ns if clinvar_data_by_ns is not None else [None] * n ) return map( diff --git a/src/mavedb/lib/variant_translations.py b/src/mavedb/lib/variant_translations.py index 701cc17d..31d89e6d 100644 --- a/src/mavedb/lib/variant_translations.py +++ b/src/mavedb/lib/variant_translations.py @@ -3,6 +3,12 @@ This module provides database operations for the variant_translations table, which stores relationships between protein allele (PA) and nucleotide allele (CA) ClinGen IDs. + +FROZEN (serving-only). 
The populate_variant_translations_for_score_set job that wrote this table was +retired in the #742 migration: the reverse-translation allele equivalence space (genomic/coding/protein +VRS alleles per variant, linked via MappingRecordAllele with HGVS on Allele) now covers PA<->CA +relationships without querying ClinGen. These helpers and the variant_translations table remain only to +serve existing old-model data; they are never written for new score sets and are dropped at read-cutover. """ from typing import cast diff --git a/src/mavedb/lib/vep.py b/src/mavedb/lib/vep.py index a7d4e7b3..25c0732f 100644 --- a/src/mavedb/lib/vep.py +++ b/src/mavedb/lib/vep.py @@ -4,17 +4,22 @@ import functools import logging import os -from typing import Optional, Sequence +from datetime import date +from enum import Enum +from typing import Mapping, NamedTuple, Optional, Sequence -import requests +from sqlalchemy import select +from sqlalchemy.orm import Session +from mavedb.lib.logging.context import logging_context, save_to_logging_context from mavedb.lib.utils import request_with_backoff +from mavedb.models.vep_allele_consequence import VepAlleleConsequence logger = logging.getLogger(__name__) + ENSEMBL_API_URL = os.environ.get("ENSEMBL_API_URL", "https://rest.ensembl.org") -# List of all possible VEP consequences, in order from most to least severe VEP_CONSEQUENCES = [ "transcript_ablation", "splice_acceptor_variant", @@ -72,6 +77,40 @@ "intron_variant", "intergenic_variant", ] +""" +List of all functional consequences VEP can return, in order of severity (most severe first). +""" + + +class VepLinkVerdict(str, Enum): + """Per-allele outcome of a VEP linking run, returned for every allele whose status is decided. + + The single source of truth for what happened to an allele's consequence this run — the caller + derives annotation status from this, never by re-querying consequence state. 
An allele absent from + the map had no live consequence and resolved none this run (the caller reads that as "no result"). + + - ``CREATED`` — a new or changed consequence was created/superseded this run. + - ``UNCHANGED`` — a live consequence was retained (value matched, or held against a null run). + """ + + CREATED = "created" + UNCHANGED = "unchanged" + + +class VepResolution(NamedTuple): + """Outcome of resolving a set of HGVS strings, splitting the two kinds of "no consequence". + + This outcome allows us to differentiate between a genuine empty (VEP found nothing) and an + unknown (VEP failed to answer). + + - ``consequences`` — HGVS that resolved to a most-severe consequence (the hits). + - ``errored`` — HGVS whose VEP/Recoder request *failed* (HTTP/transport error after retries); the + result is unknown and the allele should be retried, not treated as a negative. + - Any queried HGVS in neither set was answered (HTTP 200) with no consequence — a genuine **empty**. + """ + + consequences: dict[str, str] + errored: set[str] async def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str]]: @@ -81,35 +120,30 @@ async def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str missing_hgvs (Sequence[str]): List of HGVS strings to recode. Returns: - dict[str, list[str]]: Mapping of input HGVS to list of genomic HGVS strings (hgvsg). - Returns an empty dict if Ensembl rejects the batch (e.g. 400 for - unrecognised identifiers) — callers treat missing entries as failures. + dict[str, list[str]]: Mapping of input HGVS to list of genomic HGVS strings (hgvsg). An input + with no recodable genomic mapping is simply absent (a genuine empty). + + Raises: + requests.exceptions.RequestException: if the Recoder request fails (HTTP/transport error after + retries). The caller attributes the failure to this batch's inputs so they are reported as + errored (unknown, retry) rather than silently conflated with a genuine empty. 
""" headers = {"Content-Type": "application/json", "Accept": "application/json"} + # request_with_backoff is synchronous (requests lib + time.sleep backoff); run_in_executor # keeps the event loop free during the full request + any retry wait time. loop = asyncio.get_running_loop() - try: - response = await loop.run_in_executor( - None, - functools.partial( - request_with_backoff, - method="POST", - url=f"{ENSEMBL_API_URL}/variant_recoder/human", - headers=headers, - json={"ids": list(missing_hgvs)}, - timeout=600, # Variant Recoder can be very slow for large batches and 504s are common; generous timeout and backoff retries are needed - ), - ) - except requests.exceptions.HTTPError as exc: - # A 4xx from Ensembl (e.g. 400 for an unrecognised identifier format) means the batch - # cannot be recoded. Return empty so callers can handle these missing entries. - logger.warning( - f"Variant Recoder returned {exc.response.status_code if exc.response is not None else 'unknown'} " - f"for batch of {len(missing_hgvs)} HGVS strings — treating as no results.", - exc_info=exc, - ) - return {} + response = await loop.run_in_executor( + None, + functools.partial( + request_with_backoff, + method="POST", + url=f"{ENSEMBL_API_URL}/variant_recoder/human", + headers=headers, + json={"ids": list(missing_hgvs)}, + timeout=600, # Variant Recoder can be very slow for large batches and 504s are common; generous timeout and backoff retries are needed + ), + ) data = response.json() # request_with_backoff handles http errors, so no need to check response status @@ -119,11 +153,13 @@ async def run_variant_recoder(missing_hgvs: Sequence[str]) -> dict[str, list[str hgvs_string = variant_data.get("input") if isinstance(variant_data, dict) else None if variant_str == "input" or not hgvs_string: continue + genomic_strings = variant_data.get("hgvsg") if isinstance(variant_data, dict) else None if genomic_strings: for genomic_hgvs in genomic_strings: if genomic_hgvs.startswith("NC_"): 
hgvs_to_genomic.setdefault(hgvs_string, []).append(genomic_hgvs) + return hgvs_to_genomic @@ -138,11 +174,14 @@ async def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, O hgvs_strings (Sequence[str]): List of HGVS strings to process (max 200 per call). Returns: - dict[str, Optional[str]]: Mapping of HGVS string to functional consequence. - If no consequence found, maps to None. Returns an empty dict - if Ensembl rejects the batch (e.g. 400 for unrecognised - identifiers) — callers treat missing entries as needing Recoder - fallback or as failures. + dict[str, Optional[str]]: Mapping of HGVS string to functional consequence. An HGVS the + successful response carried no consequence for maps to None (a genuine + miss — the caller may try Recoder, else treats it as empty). + + Raises: + requests.exceptions.RequestException: if the VEP request fails (HTTP/transport error after + retries). The caller attributes the failure to this batch's inputs so they are reported as + errored (unknown, retry) rather than silently conflated with a genuine empty. """ if len(hgvs_strings) > 200: raise ValueError( @@ -155,27 +194,17 @@ async def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, O # request_with_backoff is synchronous (requests lib + time.sleep backoff); run_in_executor # keeps the event loop free during the full request + any retry wait time. loop = asyncio.get_running_loop() - try: - response = await loop.run_in_executor( - None, - functools.partial( - request_with_backoff, - method="POST", - url=f"{ENSEMBL_API_URL}/vep/human/hgvs", - headers=headers, - json={"hgvs_notations": list(hgvs_strings)}, - timeout=60, # VEP can be slow for large batches. - ), - ) - except requests.exceptions.HTTPError as exc: - # A 4xx from Ensembl (e.g. 400 for an unrecognised identifier) means the batch cannot - # be resolved. Return empty so the callers can handle these missing entries. 
- logger.warning( - f"VEP returned {exc.response.status_code if exc.response is not None else 'unknown'} " - f"for batch of {len(hgvs_strings)} HGVS strings — treating as no results.", - exc_info=exc, - ) - return result + response = await loop.run_in_executor( + None, + functools.partial( + request_with_backoff, + method="POST", + url=f"{ENSEMBL_API_URL}/vep/human/hgvs", + headers=headers, + json={"hgvs_notations": list(hgvs_strings)}, + timeout=60, # VEP can be slow for large batches. + ), + ) data = response.json() for entry in data: @@ -185,3 +214,113 @@ async def get_functional_consequence(hgvs_strings: Sequence[str]) -> dict[str, O result[hgvs] = most_severe_consequence return result + + +async def get_ensembl_release() -> str: + """Return the current Ensembl release the REST API is serving, e.g. ``"116"`` (``/info/software``). + + An Ensembl release is coordinated — software, transcript set, and consequence vocabulary all bump + together under one number — so this single value version-keys VEP results the way gnomAD keys on its + data version. The job stamps it on each consequence and skips re-querying alleles already live at the + current release. Raises on failure: the version is load-bearing for the skip, so a job that cannot + determine it must abort rather than mis-version its writes. + """ + headers = {"Content-Type": "application/json", "Accept": "application/json"} + loop = asyncio.get_running_loop() + response = await loop.run_in_executor( + None, + functools.partial( + request_with_backoff, + method="GET", + url=f"{ENSEMBL_API_URL}/info/software", + headers=headers, + timeout=30, + ), + ) + return str(response.json()["release"]) + + +def link_vep_consequences_to_alleles( + db: Session, + consequence_by_allele_id: Mapping[int, Optional[str]], + *, + source_version: str, + access_date: date, +) -> dict[int, VepLinkVerdict]: + """Store VEP consequences against deduplicated alleles, superseding only on change. 
+ + ``consequence_by_allele_id`` maps each queried allele to the consequence VEP resolved this run + (``None`` when VEP + Variant Recoder found nothing). ``source_version`` is the Ensembl release the + run resolved against. Each allele holds at most one live :class:`VepAlleleConsequence`, handled per + allele: + + - **unchanged** (live row already carries this consequence): advance ``source_version`` and + ``access_date`` in place — no supersede. Supersede is value-keyed, not version-keyed: a new + release that resolves the same categorical consequence must not fabricate a transaction-time + boundary, which would churn history every release. + - **new or changed** (no live row, or a different consequence): supersede keyed on ``allele_id`` + (retire the old, insert the successor stamped with this ``source_version``/``access_date``). + - **None this run**: leave any live row in place — do not overwrite a held consequence with a null result. + Log a warning if VEP found no consequence for an allele which previously had a live consequence. + + Does not commit. Returns a verdict per allele whose status is decided this run: + :attr:`VepLinkVerdict.CREATED` for a created/superseded consequence, :attr:`~VepLinkVerdict.UNCHANGED` + for a live consequence retained (value matched, or held against a null run). An allele absent from + the map had no live row and resolved nothing — the caller reads that as "no result". This is the + single source of truth for per-allele status; callers must not re-derive it from consequence state. 
+ """ + save_to_logging_context({"num_alleles_to_link_vep": len(consequence_by_allele_id)}) + logger.debug(msg="Linking VEP consequences to alleles", extra=logging_context()) + + verdicts: dict[int, VepLinkVerdict] = {} + for allele_id, consequence in consequence_by_allele_id.items(): + live = db.scalar( + select(VepAlleleConsequence).where( + VepAlleleConsequence.allele_id == allele_id, + VepAlleleConsequence.current, + ) + ) + + # TODO#780 - VEP found nothing this run. Do not overwrite a held consequence with a null result; a retained + # consequence is UNCHANGED (status preexisting), while no live row at all leaves the allele out of + # the map (the caller reads that as a no-result). + if consequence is None: + if live is not None: + logger.warning( + f"VEP found no consequence for allele {allele_id} this run; leaving prior consequence " + f"'{live.functional_consequence}' in place.", + extra=logging_context(), + ) + verdicts[allele_id] = VepLinkVerdict.UNCHANGED + + continue + + # Unchanged: advance version/freshness in place. + if live is not None and live.functional_consequence == consequence: + live.source_version = source_version + live.access_date = access_date + verdicts[allele_id] = VepLinkVerdict.UNCHANGED + continue + + # New or changed consequence: retire any live row, insert the successor. 
+ VepAlleleConsequence.supersede_live_where( + db, + [ + VepAlleleConsequence( + allele_id=allele_id, + functional_consequence=consequence, + source_version=source_version, + access_date=access_date, + ) + ], + VepAlleleConsequence.allele_id == allele_id, + ) + verdicts[allele_id] = VepLinkVerdict.CREATED + + changed_allele_count = sum(1 for v in verdicts.values() if v is VepLinkVerdict.CREATED) + save_to_logging_context({"changed_allele_count": changed_allele_count}) + logger.info( + msg=f"Created or superseded {changed_allele_count} VEP allele consequences this run.", + extra=logging_context(), + ) + return verdicts diff --git a/src/mavedb/lib/vrs_utils.py b/src/mavedb/lib/vrs_utils.py index d5a6432e..7979cdae 100644 --- a/src/mavedb/lib/vrs_utils.py +++ b/src/mavedb/lib/vrs_utils.py @@ -21,6 +21,7 @@ LiteralSequenceExpression, ReferenceLengthExpression, SequenceLocation, + SequenceReference, Syntax, ) from ga4gh.vrs.normalize import normalize @@ -188,7 +189,10 @@ def normalize_and_identify(allele: Allele, data_proxy: Any) -> Allele: """ allele = normalize(allele, data_proxy=data_proxy) if isinstance(allele.state, ReferenceLengthExpression): + # Normalization yields an inlined SequenceLocation here, never an IRI reference. + assert isinstance(allele.location, SequenceLocation) allele.state = _rle_to_lse(allele.state, allele.location, data_proxy) + allele.id = identify_allele(allele) return allele @@ -202,8 +206,14 @@ def _rle_to_lse( hashes identically to the mapper's authoritative allele for the same variant. Derives the literal sequence by tiling the repeat subunit out to ``rle.length``. """ + # A normalized indel location has an inlined SequenceReference and integer bounds; + # the IRI-reference and Range branches of these unions should never reach this helper. 
+ assert isinstance(location.sequenceReference, SequenceReference) + assert isinstance(location.start, int) + assert isinstance(rle.length, int) + sequence_id = location.sequenceReference.refgetAccession - start: int = location.start + start = location.start end = start + rle.repeatSubunitLength subsequence = data_proxy.get_sequence(f"ga4gh:{sequence_id}", start, end) c = cycle(subsequence) diff --git a/src/mavedb/lib/workflow/definitions.py b/src/mavedb/lib/workflow/definitions.py index 3d3b7051..93b4b6de 100644 --- a/src/mavedb/lib/workflow/definitions.py +++ b/src/mavedb/lib/workflow/definitions.py @@ -96,16 +96,6 @@ def annotation_pipeline_job_definitions( }, "dependencies": [("warm_clingen_cache", DependencyType.SUCCESS_REQUIRED)], }, - { - "key": "populate_hgvs_for_score_set", - "function": "populate_hgvs_for_score_set", - "type": JobType.MAPPED_VARIANT_ANNOTATION, - "params": { - "correlation_id": None, # Required param to be filled in at runtime - "score_set_id": None, # Required param to be filled in at runtime - }, - "dependencies": [("warm_clingen_cache", DependencyType.SUCCESS_REQUIRED)], - }, { "key": "populate_vep_for_score_set", "function": "populate_vep_for_score_set", @@ -116,16 +106,6 @@ def annotation_pipeline_job_definitions( }, "dependencies": [("submit_score_set_mappings_to_car", DependencyType.SUCCESS_REQUIRED)], }, - { - "key": "populate_variant_translations_for_score_set", - "function": "populate_variant_translations_for_score_set", - "type": JobType.MAPPED_VARIANT_ANNOTATION, - "params": { - "correlation_id": None, # Required param to be filled in at runtime - "score_set_id": None, # Required param to be filled in at runtime - }, - "dependencies": [("warm_clingen_cache", DependencyType.SUCCESS_REQUIRED)], - }, ] diff --git a/src/mavedb/models/__init__.py b/src/mavedb/models/__init__.py index 35c1ba0a..1cec88e9 100644 --- a/src/mavedb/models/__init__.py +++ b/src/mavedb/models/__init__.py @@ -4,6 +4,7 @@ "allele", "collection", 
"clinical_control", + "clinvar_allele_link", "controlled_keyword", "doi_identifier", "ensembl_identifier", @@ -11,6 +12,7 @@ "experiment", "experiment_set", "genome_identifier", + "gnomad_allele_link", "gnomad_variant", "job_dependency", "job_run", @@ -40,6 +42,9 @@ "uniprot_offset", "user", "variant_annotation_status", + "annotation_event", + "annotation_event_view", "variant", "variant_translation", + "vep_allele_consequence", ] diff --git a/src/mavedb/models/allele.py b/src/mavedb/models/allele.py index 54ce07e5..a57c4c80 100644 --- a/src/mavedb/models/allele.py +++ b/src/mavedb/models/allele.py @@ -51,6 +51,10 @@ def _transcript_expression(cls): back_populates="allele", ) + # Annotation links (VEP, gnomAD, ClinVar) deliberately carry no reverse collection here — they are + # one-directional annotation->Allele, navigated set-wise from the link tables, not from an Allele + # instance. Keep new annotation links one-directional unless a read path needs the navigation. + __table_args__ = ( UniqueConstraint("vrs_digest", name="uq_alleles_vrs_digest"), Index("ix_alleles_vrs_digest", "vrs_digest"), diff --git a/src/mavedb/models/annotation_event.py b/src/mavedb/models/annotation_event.py new file mode 100644 index 00000000..7b5f69b7 --- /dev/null +++ b/src/mavedb/models/annotation_event.py @@ -0,0 +1,140 @@ +""" +SQLAlchemy model for the annotation event log. + +One append-only log spanning the whole pipeline (mapping, reverse translation, +annotation). Its subjects are exactly two persistent entities — ``Variant`` and +``Allele`` — selected by ``annotation_type`` via the polymorphic-subject CHECK. +Everything else the pipeline touches (``MappingRecord``, ``MappingRecordAllele``, +the external value tables) is a vehicle or resolution path, never a status subject. + +This is deliberately **not** a ``ValidTime`` table. A SCD-2 state table is a lossy +projection of an event log — it discards the skip/reconfirm/no-op events that are +the audit point. 
"Current" is derived (``DISTINCT ON (subject, annotation_type) +… id DESC``), never a stored ``current`` flag. +""" + +from datetime import datetime +from typing import TYPE_CHECKING, Any, Dict, Optional + +from sqlalchemy import CheckConstraint, DateTime, ForeignKey, Index, Integer, String, func, text +from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.ext.mutable import MutableDict +from sqlalchemy.orm import Mapped, mapped_column, relationship + +from mavedb.db.base import Base +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason + +if TYPE_CHECKING: + from mavedb.models.allele import Allele + from mavedb.models.job_run import JobRun + from mavedb.models.score_set import ScoreSet + from mavedb.models.variant import Variant + +VARIANT_SUBJECT_TYPES = ( + AnnotationType.VRS_MAPPING.value, + AnnotationType.CROSS_LEVEL_TRANSLATION.value, + AnnotationType.VARIANT_TRANSLATION.value, + AnnotationType.LDH_SUBMISSION.value, +) +"""annotation_type values whose subject is the variant (variant_id set, allele_id null)""" + +ALLELE_SUBJECT_TYPES = ( + AnnotationType.CLINGEN_ALLELE_ID.value, + AnnotationType.GNOMAD_ALLELE_FREQUENCY.value, + AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE.value, + AnnotationType.CLINVAR_CONTROL.value, + AnnotationType.MAPPED_HGVS.value, +) +"""annotation_type values whose subject is the allele (allele_id set, variant_id null)""" + + +def _sql_in_list(values: tuple[str, ...]) -> str: + return ", ".join(f"'{v}'" for v in values) + + +class AnnotationEvent(Base): + """An append-only event recording the *status* (not value) of one pipeline + observation about a ``Variant`` or an ``Allele``. + + The value (frequency, consequence, CAID, control) lives in the domain + ValidTime tables; this log records disposition + why + when. 
Reading the + domain tables alone cannot distinguish confirmed-absence from never-checked — + that gap is the whole reason the log exists. + + NOTE: JSONB ``event_metadata`` is tracked as a mutable object via MutableDict, + which only catches top-level mutations. Mutating a nested object + requires ``flag_modified(instance, "event_metadata")``. + """ + + __tablename__ = "annotation_event" + + id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True) + + annotation_type: Mapped[AnnotationType] = mapped_column(String(50), nullable=False) + + # Exactly one is set, per ck_annotation_event_subject. + variant_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("variants.id", ondelete="RESTRICT"), nullable=True + ) + allele_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("alleles.id", ondelete="RESTRICT"), nullable=True + ) + + disposition: Mapped[Disposition] = mapped_column(String(50), nullable=False) + + # Domain-specific code reusing in-code vocabularies (EventReason, plus MappingOutcome for the + # mapping job); disposition is the public axis. + reason: Mapped[EventReason] = mapped_column(String(50), nullable=False) + + # gnomAD db_version / Ensembl release / ClinVar release / mapper version. + source_version: Mapped[Optional[str]] = mapped_column(String(50), nullable=True) + + # DB column is "metadata"; the attribute avoids the reserved Declarative name.
+ event_metadata: Mapped[Optional[Dict[str, Any]]] = mapped_column( + "metadata", MutableDict.as_mutable(JSONB), nullable=True + ) + + job_run_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("job_runs.id", ondelete="SET NULL"), nullable=True + ) + score_set_id: Mapped[Optional[int]] = mapped_column( + Integer, ForeignKey("scoresets.id", ondelete="SET NULL"), nullable=True + ) + + created_at: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False, server_default=func.now()) + + # One-directional (no back-ref) — match the annotation-link convention. + variant: Mapped[Optional["Variant"]] = relationship("Variant") + allele: Mapped[Optional["Allele"]] = relationship("Allele") + job_run: Mapped[Optional["JobRun"]] = relationship("JobRun") + score_set: Mapped[Optional["ScoreSet"]] = relationship("ScoreSet") + + __table_args__ = ( + # Polymorphic subject: the type picks exactly one subject column. + CheckConstraint( + f"(annotation_type IN ({_sql_in_list(VARIANT_SUBJECT_TYPES)}) " + "AND variant_id IS NOT NULL AND allele_id IS NULL) " + f"OR (annotation_type IN ({_sql_in_list(ALLELE_SUBJECT_TYPES)}) " + "AND allele_id IS NOT NULL AND variant_id IS NULL)", + name="ck_annotation_event_subject", + ), + # latest-per-allele / latest-per-variant (the DISTINCT ON … id DESC projections) + Index("ix_annotation_event_allele_type_id", "allele_id", "annotation_type", text("id DESC")), + Index("ix_annotation_event_variant_type_id", "variant_id", "annotation_type", text("id DESC")), + # version-keyed skip + Index("ix_annotation_event_allele_type_version", "allele_id", "annotation_type", "source_version"), + # audit by run + Index("ix_annotation_event_job_run_id", "job_run_id"), + # backs the score-set ON DELETE SET NULL cascade and score-set-scoped audit queries + Index("ix_annotation_event_score_set_id", "score_set_id"), + ) + + def __repr__(self) -> str: + subject = f"variant_id={self.variant_id}" if self.variant_id is not None else 
f"allele_id={self.allele_id}" + return ( + f"AnnotationEvent(id={self.id}, annotation_type={self.annotation_type}, {subject}, disposition={self.disposition}, reason={self.reason})" + ) diff --git a/src/mavedb/models/annotation_event_view.py b/src/mavedb/models/annotation_event_view.py new file mode 100644 index 00000000..5c854780 --- /dev/null +++ b/src/mavedb/models/annotation_event_view.py @@ -0,0 +1,96 @@ +"""``v_current_annotation_events`` — the current-state projection over the AnnotationEvent log. + +The log is append-only; "current" is derived, never stored. This view exposes the latest event per +``(subject, annotation_type)`` — one row per allele/variant per annotation type — so operators, BI, +and app code can read current status with a plain ``SELECT`` instead of re-deriving the +``DISTINCT ON`` everywhere. It mirrors the existing ``v_variant_annotations`` pattern. + +ClinVar is **multi-live**: an allele accumulates one live link per archival release, so its current +status is one row *per release*. The window partition folds ``source_version`` in **only** for +``clinvar_control`` (via a CASE that is constant-NULL for every other type, collapsing those back to +one row per subject+type). + +The subject is polymorphic — exactly one of ``variant_id`` / ``allele_id`` is set per row (enforced by +the log's CHECK), and the other is constant-NULL within a partition, so partitioning by both keys each +row to its real subject. + +There is deliberately **no ``score_set_id`` axis**: an allele-subject status (CAID, gnomAD, ClinVar, +VEP) is a *shared* allele-level fact, not a property of any one score set. A consumer that wants a +score set's annotation status resolves the score set's current alleles (and variants) through the live +mapping links — e.g. ``lib.clingen.alleles.get_alleles_for_score_set`` — and then looks those subjects +up here. Filtering by the *run's* score set would wrongly drop an allele last (re-)annotated by another +score set's run.
The derived "current-for-variant" walk (resolving an allele-subject fact down to a +variant at the level a type keys on) is intentionally **not** built here — that is the deferred +consumer surface; see ``docs/design/allele-annotation-status.md``. +""" + +from sqlalchemy import case, func, select + +from mavedb.db.base import Base +from mavedb.db.view import view +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.enums.annotation_type import AnnotationType + +signature = "v_current_annotation_events" + +# ClinVar is the lone multi-live type: split "current" by release. For every other type this is NULL, +# so the partition collapses to (allele_id, variant_id, annotation_type) — one current row per subject. +_clinvar_release_key = case( + (AnnotationEvent.annotation_type == AnnotationType.CLINVAR_CONTROL.value, AnnotationEvent.source_version), + else_=None, +) + +_ranked = select( + AnnotationEvent.id.label("id"), + AnnotationEvent.annotation_type.label("annotation_type"), + AnnotationEvent.variant_id.label("variant_id"), + AnnotationEvent.allele_id.label("allele_id"), + AnnotationEvent.disposition.label("disposition"), + AnnotationEvent.reason.label("reason"), + AnnotationEvent.source_version.label("source_version"), + AnnotationEvent.event_metadata.label("event_metadata"), + AnnotationEvent.job_run_id.label("job_run_id"), + AnnotationEvent.created_at.label("created_at"), + func.row_number() + .over( + partition_by=[ + AnnotationEvent.allele_id, + AnnotationEvent.variant_id, + AnnotationEvent.annotation_type, + _clinvar_release_key, + ], + order_by=AnnotationEvent.id.desc(), + ) + .label("row_number"), +).subquery("ranked_annotation_events") + +definition = select( + _ranked.c.id, + _ranked.c.annotation_type, + _ranked.c.variant_id, + _ranked.c.allele_id, + _ranked.c.disposition, + _ranked.c.reason, + _ranked.c.source_version, + _ranked.c.event_metadata, + _ranked.c.job_run_id, + _ranked.c.created_at, +).where(_ranked.c.row_number == 1) 
+ + +class CurrentAnnotationEventView(Base): + __table__ = view(signature, definition, materialized=False) + # Each surviving event id is unique across the view, so it is a valid mapping key for the + # otherwise-PK-less view (standard SQLAlchemy view-mapping idiom). + __mapper_args__ = {"primary_key": [__table__.c.id]} + + id = __table__.c.id + annotation_type = __table__.c.annotation_type + variant_id = __table__.c.variant_id + allele_id = __table__.c.allele_id + disposition = __table__.c.disposition + reason = __table__.c.reason + source_version = __table__.c.source_version + event_metadata = __table__.c.event_metadata + job_run_id = __table__.c.job_run_id + created_at = __table__.c.created_at diff --git a/src/mavedb/models/clinical_control.py b/src/mavedb/models/clinical_control.py index 0b989cb2..694aad58 100644 --- a/src/mavedb/models/clinical_control.py +++ b/src/mavedb/models/clinical_control.py @@ -8,18 +8,19 @@ from mavedb.models.clinical_control_mapped_variant import mapped_variants_clinical_controls_association_table if TYPE_CHECKING: + from mavedb.models.clinvar_allele_link import ClinvarAlleleLink from mavedb.models.mapped_variant import MappedVariant -class ClinicalControl(Base): - __tablename__ = "clinical_controls" +class ClinvarControl(Base): + __tablename__ = "clinvar_controls" __table_args__ = ( UniqueConstraint( - "db_name", "db_identifier", "db_version", name="uq_clinical_controls_db_name_identifier_version" + "db_name", "db_identifier", "db_version", name="uq_clinvar_controls_db_name_identifier_version" ), ) - id = Column(Integer, primary_key=True) + id: Mapped[int] = Column(Integer, primary_key=True) gene_symbol = Column(String, nullable=False, index=True) @@ -27,14 +28,25 @@ class ClinicalControl(Base): clinical_review_status = Column(String, nullable=False) db_name = Column(String, nullable=False, index=True) + # ClinVar Allele ID (row level link). 
db_identifier = Column(String, nullable=False, index=True) db_version = Column(String, nullable=False, index=True) + # ClinVar Variation ID (variation level link). + clinvar_variation_id = Column(String, nullable=True) + creation_date = Column(Date, nullable=False, default=date.today) modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) + # Frozen serving path: links to MappedVariant via the old association table (never written for new data). mapped_variants: Mapped[list["MappedVariant"]] = relationship( "MappedVariant", secondary=mapped_variants_clinical_controls_association_table, back_populates="clinical_controls", ) + + # New-model annotation links (one live link per allele per ClinVar release). + allele_links: Mapped[list["ClinvarAlleleLink"]] = relationship( + "ClinvarAlleleLink", + back_populates="clinvar_control", + ) diff --git a/src/mavedb/models/clinical_control_mapped_variant.py b/src/mavedb/models/clinical_control_mapped_variant.py index eabb7689..b7fc5d84 100644 --- a/src/mavedb/models/clinical_control_mapped_variant.py +++ b/src/mavedb/models/clinical_control_mapped_variant.py @@ -7,5 +7,5 @@ "mapped_variants_clinical_controls", Base.metadata, Column("mapped_variant_id", ForeignKey("mapped_variants.id"), primary_key=True), - Column("clinical_control_id", ForeignKey("clinical_controls.id"), primary_key=True), + Column("clinical_control_id", ForeignKey("clinvar_controls.id"), primary_key=True), ) diff --git a/src/mavedb/models/clinvar_allele_link.py b/src/mavedb/models/clinvar_allele_link.py new file mode 100644 index 00000000..941ee2f2 --- /dev/null +++ b/src/mavedb/models/clinvar_allele_link.py @@ -0,0 +1,65 @@ +from typing import TYPE_CHECKING + +from sqlalchemy import Column, ForeignKey, Index, Integer, text +from sqlalchemy.orm import Mapped, relationship + +from mavedb.db.base import Base +from mavedb.db.mixins import ValidTime + +if TYPE_CHECKING: + from .allele import Allele + from .clinical_control import 
ClinvarControl + + +class ClinvarAlleleLink(ValidTime, Base): + """Valid-time link between an :class:`Allele` and a :class:`ClinvarControl` release. + + Replaces the frozen ``mapped_variants_clinical_controls`` association table for new-model writes. + A link is live while ``valid_to`` is NULL. Unlike gnomAD/VEP (one live result per allele), the partial + unique index is ``(allele_id, clinvar_control_id) WHERE valid_to IS NULL`` — **multi-live**: an allele + accumulates one live link per ClinVar release, because each release is a distinct, versioned + ``ClinvarControl`` assertion that stacks rather than supersedes. A link retires only on two theoretical + paths (archival data does not change): ClinVar drops the variant from a release (a re-run finds no data + for it), or the allele re-resolves to a *different* control within the same release — the job supersedes + that newest-wins to preserve one live link per (allele, release), since this index only enforces one live + link per (allele, control). + """ + + __tablename__ = "clinvar_allele_links" + + id: Mapped[int] = Column(Integer, primary_key=True) + allele_id: Mapped[int] = Column( + Integer, + ForeignKey("alleles.id", ondelete="RESTRICT"), + nullable=False, + ) + clinvar_control_id: Mapped[int] = Column( + Integer, + ForeignKey("clinvar_controls.id", ondelete="RESTRICT"), + nullable=False, + ) + + # One-directional to Allele (no reverse collection there); back-ref on the ClinvarControl entity only. + allele: Mapped["Allele"] = relationship("Allele") + clinvar_control: Mapped["ClinvarControl"] = relationship("ClinvarControl", back_populates="allele_links") + + __table_args__ = ( + Index( + "ix_clinvar_allele_links_allele_id", + "allele_id", + ), + Index( + "ix_clinvar_allele_links_clinvar_control_id", + "clinvar_control_id", + ), + # Multi-live: one live link per (allele, control); per-(allele, release) uniqueness is kept by the job, not this index.
Each ClinVar release is a distinct + # ClinvarControl row, so different releases stack as independent live links rather than + # superseding. Only live rows participate in this constraint. + Index( + "uq_clinvar_allele_links_live", + "allele_id", + "clinvar_control_id", + unique=True, + postgresql_where=text("valid_to IS NULL"), + ), + ) diff --git a/src/mavedb/models/enums/disposition.py b/src/mavedb/models/enums/disposition.py new file mode 100644 index 00000000..94438f29 --- /dev/null +++ b/src/mavedb/models/enums/disposition.py @@ -0,0 +1,19 @@ +from enum import Enum + + +class Disposition(str, Enum): + """The stable, consumer-facing status axis of a variant event. + + Defined by *what the consumer may conclude* — not by the domain-specific + operation that produced it (that lives in ``reason``). + + - ``present`` — we hold the result / the step succeeded + - ``absent`` — the source or biology has nothing — an informative negative + - ``not_applicable`` — we could not ask — a pipeline/structural gap, not a statement about the source + - ``failed`` — errored, might retry; failure_category carries transient vs permanent + """ + + PRESENT = "present" + ABSENT = "absent" + NOT_APPLICABLE = "not_applicable" + FAILED = "failed" diff --git a/src/mavedb/models/enums/event_reason.py b/src/mavedb/models/enums/event_reason.py new file mode 100644 index 00000000..f7fd4b56 --- /dev/null +++ b/src/mavedb/models/enums/event_reason.py @@ -0,0 +1,42 @@ +from enum import Enum + + +class EventReason(str, Enum): + """Pipeline-wide vocabulary for a variant event's ``reason`` — the single field that says + *what happened*, spanning present / absent / not_applicable / failed. + + Reasons are shared across jobs wherever they mean the same thing (the ``annotation_type`` + already says *which* job), and job-specific only where a case is genuinely unique. 
One job + contributes its own pre-existing domain enum instead of duplicating here: ``mapping`` uses + ``MappingOutcome`` (mapped / intronic / no_protein_consequence / failed), which mirrors the + external dcd-mapping vocabulary. The full ``reason`` vocabulary is this enum plus that one. + """ + + # present — we hold the result / the step succeeded + CREATED = "created" # linked/registered this run (gnomAD, ClinVar, CAR) + PREEXISTING = "preexisting" # already held before this run (gnomAD, ClinVar, CAR) + RECONFIRMED = "reconfirmed" # re-verified unchanged (gnomAD, CAR force) + SKIPPED = "skipped" # version-skip: already current at this source version (gnomAD, VEP) + SUPERSEDED = "superseded" # re-resolved within a release, newest wins (ClinVar) + SUBMITTED = "submitted" # LDH + TRANSLATED = "translated" # reverse translation + RECONFIRMATION_SKIPPED = "reconfirmation_skipped" # HGVS no longer buildable, existing CAID kept (CAR) + + # absent — the source or biology has nothing (informative negative) + NO_RECORD = "no_record" # source queried, returned nothing (gnomAD, ClinVar, VEP) + NO_CODING_TRANSCRIPT = "no_coding_transcript" # non-coding target has no protein consequence (RT) + + # not_applicable — we could not ask (structural gap) + NO_CAID = "no_caid" # no ClinGen allele id to key on (gnomAD, ClinVar) + NO_HGVS = "no_hgvs" # no HGVS to submit/resolve (CAR, VEP) + MULTI_VARIANT_CAID = "multi_variant_caid" # cis-block CAID cannot be used (ClinVar) + NO_ASSAY_LEVEL_HGVS = "no_assay_level_hgvs" # no assay-level HGVS to translate (RT) + + # failed — errored + API_ERROR = "api_error" # network/timeout/upstream error (ClinVar, VEP, LDH, CAR no-response) + SERVICE_REJECTED = "service_rejected" # external service refused the input (CAR) + MALFORMED_RESPONSE = "malformed_response" # unparseable/contractless response (CAR) + CAID_CONFLICT = "caid_conflict" # returned identifier conflicts with the stored one (CAR) + TRANSLATION_FAILED = "translation_failed" # all 
class GnomadAlleleLink(ValidTime, Base):
    """Valid-time link between an :class:`Allele` and a gnomAD variant.

    Replaces the frozen ``gnomad_variants_mapped_variants`` association table for new-model writes.
    A link is live while ``valid_to`` is NULL; a gnomAD version bump retires the live row and inserts
    a successor rather than deleting, so prior-version frequency links remain queryable point-in-time.
    The partial unique index enforces **a single live link per allele** — gnomAD frequency is one
    current value, so a new version supersedes the old (unlike ClinVar, which keeps one live link per
    release). This matches the VEP consequence shape, not the ClinVar control shape.

    NOTE(review): ``valid_to`` is not declared in this class — presumably it is supplied by the
    ``ValidTime`` mixin; confirm against ``mavedb.db.mixins``.
    """

    __tablename__ = "gnomad_allele_links"

    # Surrogate primary key.
    id: Mapped[int] = Column(Integer, primary_key=True)
    # The allele being annotated. ondelete="RESTRICT" refuses to delete an allele that still has
    # link rows (live or superseded), which keeps the history intact.
    allele_id: Mapped[int] = Column(
        Integer,
        ForeignKey("alleles.id", ondelete="RESTRICT"),
        nullable=False,
    )
    # The gnomAD variant record the allele is linked to; same RESTRICT protection as above.
    gnomad_variant_id: Mapped[int] = Column(
        Integer,
        ForeignKey("gnomad_variants.id", ondelete="RESTRICT"),
        nullable=False,
    )

    # One-directional relationship to the allele (no back_populates is configured here).
    allele: Mapped["Allele"] = relationship("Allele")
    # Bidirectional with GnomADVariant.allele_links.
    gnomad_variant: Mapped["GnomADVariant"] = relationship("GnomADVariant", back_populates="allele_links")

    __table_args__ = (
        # Plain lookup index over all rows (live and superseded) for an allele.
        Index(
            "ix_gnomad_allele_links_allele_id",
            "allele_id",
        ),
        # Plain lookup index for finding the alleles linked to a gnomAD variant.
        Index(
            "ix_gnomad_allele_links_gnomad_variant_id",
            "gnomad_variant_id",
        ),
        # At most one live link per allele. A version bump supersedes (retires the old, inserts the
        # new) rather than accumulating per-version live links; superseded rows stay for point-in-time
        # queries. Only the live row participates in this constraint.
        Index(
            "uq_gnomad_allele_links_live",
            "allele_id",
            unique=True,
            postgresql_where=text("valid_to IS NULL"),
        ),
    )
db_identifier = Column(String, nullable=False, index=True) @@ -30,8 +31,16 @@ class GnomADVariant(Base): creation_date = Column(Date, nullable=False, default=date.today) modification_date = Column(Date, nullable=False, default=date.today, onupdate=date.today) + # Frozen association to the old MappedVariant model — read by serving for existing data, never + # written for new score sets (which link through ``allele_links`` instead). mapped_variants: Mapped[list["MappedVariant"]] = relationship( "MappedVariant", secondary=gnomad_variants_mapped_variants_association_table, back_populates="gnomad_variants", ) + + # Valid-time links to deduplicated alleles (new-model writes). + allele_links: Mapped[list["GnomadAlleleLink"]] = relationship( + "GnomadAlleleLink", + back_populates="gnomad_variant", + ) diff --git a/src/mavedb/models/mapped_variant.py b/src/mavedb/models/mapped_variant.py index b35f1bbb..40325cf1 100644 --- a/src/mavedb/models/mapped_variant.py +++ b/src/mavedb/models/mapped_variant.py @@ -11,7 +11,7 @@ from mavedb.models.gnomad_variant_mapped_variant import gnomad_variants_mapped_variants_association_table if TYPE_CHECKING: - from .clinical_control import ClinicalControl + from .clinical_control import ClinvarControl from .gnomad_variant import GnomADVariant from .target_gene_mapping import TargetGeneMapping from .variant import Variant @@ -61,8 +61,8 @@ class MappedVariant(Base): hgvs_c = Column(String, nullable=True) hgvs_p = Column(String, nullable=True) - clinical_controls: Mapped[list["ClinicalControl"]] = relationship( - "ClinicalControl", + clinical_controls: Mapped[list["ClinvarControl"]] = relationship( + "ClinvarControl", secondary=mapped_variants_clinical_controls_association_table, back_populates="mapped_variants", ) diff --git a/src/mavedb/models/variant_translation.py b/src/mavedb/models/variant_translation.py index a50d36ce..36aa0dfd 100644 --- a/src/mavedb/models/variant_translation.py +++ b/src/mavedb/models/variant_translation.py @@ -6,6 
class VepAlleleConsequence(ValidTime, Base):
    """Valid-time VEP functional-consequence result for a deduplicated :class:`Allele`.

    Replaces the frozen ``vep_functional_consequence``/``vep_access_date`` columns on
    ``MappedVariant`` for new-model writes (Step 2 of the annotation infrastructure migration,
    docs/design/annotation-infrastructure-migration.md). A row is live while ``valid_to`` is NULL;
    the partial unique index enforces **a single live consequence per allele** — VEP's most-severe
    consequence is one current value, so a changed result supersedes the prior row rather than
    accumulating. This matches the gnomAD link shape, not ClinVar's multi-live shape.

    ``source_version`` is the Ensembl release the consequence was resolved under (e.g. ``"116"``,
    from ``/info/software``). An Ensembl release is coordinated — software + transcript set +
    consequence vocabulary all bump together under one number — so this single value version-keys the
    upstream result exactly like gnomAD's ``db_version``. The job skips re-querying any allele already
    live at the current release. What it does **not** capture is our own ``VEP_CONSEQUENCES`` severity
    ordering (the list we pick "most severe" from); a change to that is a manual ``force`` re-run, not
    an automatic supersede.

    Supersede is deliberately **value-keyed, not version-keyed** (the one divergence from gnomAD): a VEP
    consequence is categorical and usually identical across releases, so superseding on every release
    bump would churn history every quarter with rows recording "still missense, still missense." Instead
    a new release that resolves the *same* consequence advances ``source_version``/``access_date`` in
    place — no supersede — and only a *changed* consequence retires the old row and inserts a successor.
    The trade-off: the live row's ``source_version`` is the latest release that confirmed the value, not
    the release it first appeared; acceptable because it describes the currently-held value's
    provenance, not when it became true. ``access_date`` is retained as a human-facing "last confirmed"
    audit stamp; it is no longer load-bearing for the skip.

    ``functional_consequence`` is nullable to leave room for a future negative cache (NULL = "VEP ran
    and found nothing"); the current job writes only non-null consequences and re-queries no-result
    alleles each run, mirroring gnomAD's no-match handling.

    NOTE(review): ``valid_to`` is not declared in this class — presumably it is supplied by the
    ``ValidTime`` mixin; confirm against ``mavedb.db.mixins``.
    """

    __tablename__ = "vep_allele_consequences"

    # Surrogate primary key.
    id: Mapped[int] = Column(Integer, primary_key=True)
    # The allele this consequence describes. ondelete="RESTRICT" refuses to delete an allele that
    # still has consequence rows (live or superseded).
    allele_id: Mapped[int] = Column(
        Integer,
        ForeignKey("alleles.id", ondelete="RESTRICT"),
        nullable=False,
    )
    # Nullable by design (reserved for a future negative cache); left unannotated as in the original.
    functional_consequence = Column(String, nullable=True)
    # Ensembl release the consequence was resolved under.
    source_version: Mapped[str] = Column(String, nullable=False)
    # "Last confirmed" audit stamp.
    access_date: Mapped[date] = Column(Date, nullable=False)

    # One-directional relationship to the allele (no back_populates is configured here).
    allele: Mapped["Allele"] = relationship("Allele")

    __table_args__ = (
        # Plain lookup index over all rows (live and superseded) for an allele.
        Index(
            "ix_vep_allele_consequences_allele_id",
            "allele_id",
        ),
        # At most one live consequence per allele. A changed result supersedes (retires the old,
        # inserts the new) rather than accumulating; superseded rows stay for point-in-time queries.
        # Only the live row participates in this constraint.
        Index(
            "uq_vep_allele_consequences_live",
            "allele_id",
            unique=True,
            postgresql_where=text("valid_to IS NULL"),
        ),
    )
""" @@ -2462,23 +2462,23 @@ async def get_clinical_controls_for_score_set( assert_permission(user_data, item, Action.READ) clinical_controls_query = ( - select(ClinicalControl) - .join(ClinicalControl.mapped_variants) + select(ClinvarControl) + .join(ClinvarControl.mapped_variants) .join(MappedVariant.variant) - .options(contains_eager(ClinicalControl.mapped_variants).contains_eager(MappedVariant.variant)) + .options(contains_eager(ClinvarControl.mapped_variants).contains_eager(MappedVariant.variant)) .filter(MappedVariant.current.is_(True)) .filter(Variant.score_set_id == item.id) ) if db_name is not None: save_to_logging_context({"db_name": db_name}) - clinical_controls_query = clinical_controls_query.filter(ClinicalControl.db_name == db_name) + clinical_controls_query = clinical_controls_query.filter(ClinvarControl.db_name == db_name) if db_version is not None: save_to_logging_context({"db_version": db_version}) - clinical_controls_query = clinical_controls_query.filter(ClinicalControl.db_version == db_version) + clinical_controls_query = clinical_controls_query.filter(ClinvarControl.db_version == db_version) - clinical_controls: Sequence[ClinicalControl] = _db.scalars(clinical_controls_query).unique().all() + clinical_controls: Sequence[ClinvarControl] = _db.scalars(clinical_controls_query).unique().all() if not clinical_controls: logger.info( @@ -2526,8 +2526,8 @@ async def get_clinical_controls_options_for_score_set( assert_permission(user_data, item, Action.READ) clinical_controls_query = ( - select(ClinicalControl.db_name, ClinicalControl.db_version) - .join(MappedVariant, ClinicalControl.mapped_variants) + select(ClinvarControl.db_name, ClinvarControl.db_version) + .join(MappedVariant, ClinvarControl.mapped_variants) .join(Variant) .where(MappedVariant.current.is_(True)) .where(Variant.score_set_id == item.id) diff --git a/src/mavedb/scripts/pipeline_tracking.py b/src/mavedb/scripts/pipeline_tracking.py new file mode 100644 index 00000000..15f7c444 --- 
def _format_dt(dt: Optional[datetime]) -> str:
    """Render *dt* as an ISO-8601 string, or ``"-"`` when there is no timestamp."""
    if dt:
        return dt.isoformat()
    return "-"


def _needs_rerun(last_pipeline_finished: Optional[datetime], cutoff: Optional[datetime]) -> bool:
    """Decide whether a score set must be re-processed relative to *cutoff*.

    - No cutoff: nothing is ever flagged (``False``).
    - Cutoff given but the pipeline never finished: always flagged (``True``).
    - Otherwise: flagged only when the finish time is strictly earlier than the cutoff.
    """
    if cutoff is None:
        return False
    if last_pipeline_finished is None:
        return True

    def _as_aware(moment: datetime) -> datetime:
        # Naive timestamps are interpreted as UTC; aware ones keep their own offset.
        # This makes the two operands comparable without raising on naive-vs-aware.
        return moment if moment.tzinfo else moment.replace(tzinfo=timezone.utc)

    return _as_aware(last_pipeline_finished) < _as_aware(cutoff)
def _last_pipeline_subquery(db: Session, score_set_id: int) -> Optional[Pipeline]:
    """Return the newest pipeline (by ``created_at``) that has a job run for *score_set_id*.

    Returns ``None`` when no job run attached to a pipeline references the score set.
    """
    # The score set id lives inside the job_params JSON document; it is read out as text and cast
    # to an integer so it can be compared with the integer id.
    job_score_set_id = cast(JobRun.job_params["score_set_id"].astext, Integer)
    pipeline_ids = (
        select(JobRun.pipeline_id)
        .where(job_score_set_id == score_set_id, JobRun.pipeline_id.is_not(None))
        .distinct()
        .subquery()
    )

    newest_first = (
        select(Pipeline)
        .where(Pipeline.id.in_(select(pipeline_ids)))
        .order_by(Pipeline.created_at.desc())
        .limit(1)
    )
    return db.scalars(newest_first).one_or_none()


def _build_score_set_rows(
    db: Session,
    score_sets: list[ScoreSet],
    cutoff: Optional[datetime],
) -> list[dict]:
    """Build one report row (a plain dict) per score set, in the order given.

    Each row carries the score set's own state plus the name, status and timestamps of its latest
    pipeline, and a ``needs_rerun`` flag computed against *cutoff*. Missing values are rendered as
    ``"-"``. Note that this issues one pipeline lookup per score set.
    """

    def _row_for(ss: ScoreSet) -> dict:
        pipeline = _last_pipeline_subquery(db, ss.id)
        if pipeline:
            name = pipeline.name
            status = str(pipeline.status)
            finished = pipeline.finished_at
            created = pipeline.created_at
        else:
            name = status = finished = created = None

        return {
            "score_set_urn": ss.urn or "(no urn)",
            "processing_state": str(ss.processing_state) if ss.processing_state else "-",
            "mapping_state": str(ss.mapping_state) if ss.mapping_state else "-",
            "num_variants": ss.num_variants,
            "private": ss.private,
            "last_pipeline_name": name or "-",
            "last_pipeline_status": status or "-",
            "last_pipeline_created_at": _format_dt(created),
            "last_pipeline_finished_at": _format_dt(finished),
            "needs_rerun": _needs_rerun(finished, cutoff),
        }

    return [_row_for(ss) for ss in score_sets]
@script_environment.command(name="list-score-sets")
@with_database_session
@click.option(
    "--needs-rerun-since",
    "needs_rerun_since",
    default=None,
    help=(
        "ISO date/datetime of a deployment cutoff (e.g. 2026-05-01 or 2026-05-01T12:00:00). "
        "Score sets whose last pipeline finished before this timestamp (or have never run) are "
        "flagged as needing a re-run."
    ),
)
@click.option("--include-private", is_flag=True, default=False, help="Include private (unpublished) score sets.")
@click.option("--needs-rerun-only", is_flag=True, default=False, help="Only show score sets that need a re-run.")
@click.option("--limit", type=int, default=None, help="Cap the number of rows returned (applied after all filtering).")
@click.option("--json", "as_json", is_flag=True, help="Emit results as JSON.")
def list_score_sets(
    db: Session,
    needs_rerun_since: Optional[str],
    include_private: bool,
    needs_rerun_only: bool,
    limit: Optional[int],
    as_json: bool,
) -> None:
    """List all score sets with their last pipeline run and re-run status."""
    # NOTE: the docstring above is the command's --help text, so implementation notes live in
    # comments instead.
    #
    # Flow: validate options -> load score sets (public only unless --include-private) -> build
    # one row per score set -> apply --needs-rerun-only, then --limit -> emit JSON or a table.
    # Invalid option values are reported on stderr and exit with status 1.
    cutoff: Optional[datetime] = None
    if needs_rerun_since:
        try:
            cutoff = datetime.fromisoformat(needs_rerun_since)
        except ValueError:
            click.echo(
                f"Invalid --needs-rerun-since value: {needs_rerun_since!r}. Use ISO format e.g. 2026-05-01.", err=True
            )
            raise SystemExit(1)

    # Without a cutoff every row's needs_rerun flag is False, so --needs-rerun-only would filter
    # everything out and report "no matches" — easily misread as "everything is up to date".
    if needs_rerun_only and cutoff is None:
        click.echo("--needs-rerun-only requires --needs-rerun-since.", err=True)
        raise SystemExit(1)

    # A negative limit would slice rows off the END of the result (rows[:-n]) rather than cap it.
    if limit is not None and limit < 0:
        click.echo(f"Invalid --limit value: {limit}. Must be zero or greater.", err=True)
        raise SystemExit(1)

    query = select(ScoreSet).where(ScoreSet.urn.is_not(None))
    if not include_private:
        query = query.where(ScoreSet.private == False)  # noqa: E712
    query = query.order_by(ScoreSet.urn)

    score_sets = db.scalars(query).all()
    rows = _build_score_set_rows(db, list(score_sets), cutoff)

    if needs_rerun_only:
        rows = [r for r in rows if r["needs_rerun"]]

    # The cap is applied last, so it counts rows that survived the filters above.
    if limit is not None:
        rows = rows[:limit]

    if as_json:
        # default=str stringifies any value json cannot encode natively.
        click.echo(json.dumps(rows, indent=2, default=str))
        return

    if not rows:
        click.echo("No score sets match the given filters.")
        return

    needs_rerun_count = sum(1 for r in rows if r["needs_rerun"])
    click.echo(f"Total: {len(rows)} score set(s)" + (f", {needs_rerun_count} need re-run" if cutoff else ""))
    if cutoff:
        click.echo(f"Cutoff: {cutoff.isoformat()}")
    click.echo()

    # Fixed column widths for the plain-text table; the RERUN? column only exists with a cutoff.
    col_w = {"urn": 28, "ps": 28, "ms": 26, "nv": 8, "pipe": 28, "pipe_status": 12, "finished": 12}
    header = (
        f"{'URN':<{col_w['urn']}} {'PROC_STATE':<{col_w['ps']}} {'MAP_STATE':<{col_w['ms']}} "
        f"{'VARIANTS':>{col_w['nv']}} {'LAST_PIPELINE':<{col_w['pipe']}} "
        f"{'PIPE_STATUS':<{col_w['pipe_status']}} {'LAST_FINISHED':<{col_w['finished']}}"
        + (" RERUN?" if cutoff else "")
    )
    click.echo(header)
    click.echo("-" * len(header))

    for r in rows:
        rerun_flag = ("YES" if r["needs_rerun"] else "no") if cutoff else ""
        click.echo(
            f"{r['score_set_urn']:<{col_w['urn']}} "
            f"{r['processing_state']:<{col_w['ps']}} "
            f"{r['mapping_state']:<{col_w['ms']}} "
            f"{r['num_variants']:>{col_w['nv']}} "
            f"{r['last_pipeline_name']:<{col_w['pipe']}} "
            f"{r['last_pipeline_status']:<{col_w['pipe_status']}} "
            f"{r['last_pipeline_finished_at']:<{col_w['finished']}}" + (f" {rerun_flag}" if cutoff else "")
        )
def current_annotation_summary(db: Session, score_set: ScoreSet) -> list[dict]:
    """Current annotation disposition counts for a score set, by annotation type.

    Resolves the score set's subjects — its current alleles (via the live mapping links) for
    allele-subject types, and its variants for variant-subject types (mapping/RT/LDH) — then counts the
    current status of each in ``v_current_annotation_events``. Allele-subject counts are per allele
    (shared alleles counted once), reflecting the true current status regardless of which run produced
    it; variant-subject counts are per variant.

    Returns a list of ``{"annotation_type", "disposition", "count"}`` dicts ordered by annotation
    type then disposition, or an empty list when the score set has neither alleles nor variants.
    """
    # Sets de-duplicate: the allele lookup may return the same allele on more than one row.
    allele_ids = {row.allele_id for row in get_alleles_for_score_set(db, score_set.id)}
    variant_ids = set(db.scalars(select(Variant.id).where(Variant.score_set_id == score_set.id)).all())

    # Nothing to look up: skip the query entirely.
    if not allele_ids and not variant_ids:
        return []

    rows = db.execute(
        select(
            CurrentAnnotationEventView.annotation_type,
            CurrentAnnotationEventView.disposition,
            func.count().label("count"),
        )
        .where(
            or_(
                CurrentAnnotationEventView.allele_id.in_(allele_ids),
                CurrentAnnotationEventView.variant_id.in_(variant_ids),
            )
        )
        .group_by(CurrentAnnotationEventView.annotation_type, CurrentAnnotationEventView.disposition)
        .order_by(CurrentAnnotationEventView.annotation_type, CurrentAnnotationEventView.disposition)
    ).all()

    # Unpack positionally instead of reading ``row.count``: a result Row is tuple-like, so the
    # attribute ``count`` resolves to the sequence method ``count()`` rather than the column
    # labelled "count" — which would put a bound method into the summary (and break JSON output).
    return [
        {"annotation_type": annotation_type, "disposition": disposition, "count": count}
        for annotation_type, disposition, count in rows
    ]
VariantAnnotationStatus.annotation_type, - VariantAnnotationStatus.status, - func.count().label("count"), - ) - .join(Variant, Variant.id == VariantAnnotationStatus.variant_id) - .where( - Variant.score_set_id == score_set.id, - VariantAnnotationStatus.current == True, # noqa: E712 - ) - .group_by(VariantAnnotationStatus.annotation_type, VariantAnnotationStatus.status) - .order_by(VariantAnnotationStatus.annotation_type, VariantAnnotationStatus.status) - ).all() - - # Total variant count for the score set + summary = current_annotation_summary(db, score_set) total_variants = db.scalar(select(func.count()).where(Variant.score_set_id == score_set.id)) or 0 if as_json: @@ -67,9 +89,7 @@ def show_score_set(db: Session, score_set_urn: str, as_json: bool) -> None: { "score_set_urn": score_set_urn, "total_variants": total_variants, - "annotation_summary": [ - {"annotation_type": r.annotation_type, "status": r.status, "count": r.count} for r in rows - ], + "annotation_summary": summary, }, indent=2, ) @@ -77,12 +97,12 @@ def show_score_set(db: Session, score_set_urn: str, as_json: bool) -> None: return click.echo(f"Score set: {score_set_urn} ({total_variants} variants)") - click.echo(f"\n{'ANNOTATION TYPE':<32} {'STATUS':<10} COUNT") - for r in rows: - click.echo(f"{r.annotation_type:<32} {str(r.status):<10} {r.count}") + click.echo(f"\n{'ANNOTATION TYPE':<32} {'DISPOSITION':<16} COUNT") + for r in summary: + click.echo(f"{r['annotation_type']:<32} {str(r['disposition']):<16} {r['count']}") - if not rows: - click.echo("No annotation status records found.") + if not summary: + click.echo("No current annotation events found for this score set.") if __name__ == "__main__": diff --git a/src/mavedb/worker/jobs/external_services/__init__.py b/src/mavedb/worker/jobs/external_services/__init__.py index 4537c0ed..78af9251 100644 --- a/src/mavedb/worker/jobs/external_services/__init__.py +++ b/src/mavedb/worker/jobs/external_services/__init__.py @@ -5,8 +5,6 @@ - ClinGen cache 
pre-warming to prevent stampede on downstream annotation jobs - UniProt for protein sequence annotation and ID mapping - gnomAD for population frequency and genomic context data -- HGVS for standardized variant nomenclature population -- Variant Translation for PA<->CA allele relationship mapping - VEP for functional consequence annotation """ @@ -18,12 +16,10 @@ from .clingen_cache import warm_clingen_cache from .clinvar import refresh_clinvar_controls from .gnomad import link_gnomad_variants -from .hgvs import populate_hgvs_for_score_set from .uniprot import ( poll_uniprot_mapping_jobs_for_score_set, submit_uniprot_mapping_jobs_for_score_set, ) -from .variant_translation import populate_variant_translations_for_score_set from .vep import populate_vep_for_score_set __all__ = [ @@ -32,8 +28,6 @@ "warm_clingen_cache", "refresh_clinvar_controls", "link_gnomad_variants", - "populate_hgvs_for_score_set", - "populate_variant_translations_for_score_set", "poll_uniprot_mapping_jobs_for_score_set", "submit_uniprot_mapping_jobs_for_score_set", "populate_vep_for_score_set", diff --git a/src/mavedb/worker/jobs/external_services/clingen.py b/src/mavedb/worker/jobs/external_services/clingen.py index 27aa1b96..3e8b6a3a 100644 --- a/src/mavedb/worker/jobs/external_services/clingen.py +++ b/src/mavedb/worker/jobs/external_services/clingen.py @@ -11,7 +11,8 @@ import asyncio import functools import logging -from dataclasses import dataclass, field +from collections.abc import Sequence +from dataclasses import dataclass from sqlalchemy import select @@ -32,7 +33,9 @@ from mavedb.lib.variants import get_hgvs_from_post_mapped from mavedb.models.allele import Allele as AlleleModel from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason +from 
def _annotate_caid(
    annotation_manager: AnnotationStatusManager,
    allele_id: int,
    disposition: Disposition,
    reason: EventReason,
    *,
    error_message: str | None = None,
    metadata: dict | None = None,
) -> None:
    """Write a single CLINGEN_ALLELE_ID event against one allele.

    This is the one place CAR status events are emitted from. The event is keyed to the allele
    (the CAID is an allele-level fact); no per-variant rows are written here. When an
    ``error_message`` is supplied it is stored inside the event metadata under the
    ``"error_message"`` key. An empty metadata dict is passed on as ``None``.
    """
    # Work on a copy so the caller's dict is never mutated.
    event_metadata: dict = {**metadata} if metadata else {}
    if error_message is not None:
        event_metadata["error_message"] = error_message

    payload = event_metadata if event_metadata else None
    annotation_manager.record_event(
        AnnotationType.CLINGEN_ALLELE_ID,
        allele_id=allele_id,
        disposition=disposition,
        reason=reason,
        metadata=payload,
    )
Each allele has + # exactly one outcome (submitted once → one response), so these sets are disjoint by construction. linked_allele_ids: set[int] = set() preexisting_allele_ids: set[int] = set() failed_allele_ids: set[int] = set() + preexisting_alleles = [] + pending_alleles = [] + for aid, entry in allele_data.items(): + if entry.existing_caid and not force_reregister: + preexisting_alleles.append(aid) + else: + pending_alleles.append(aid) + # Pre-existing CAIDs: record success without re-submitting unless force_reregister is set. - preexisting = [aid for aid, entry in allele_data.items() if entry.existing_caid] if not force_reregister else [] + preexisting = preexisting_alleles if not force_reregister else [] for allele_id in preexisting: entry = allele_data[allele_id] preexisting_allele_ids.add(allele_id) _annotate_caid( annotation_manager, - entry.authoritative_variant_ids, - AnnotationStatus.SUCCESS, - metadata={"clingen_allele_id": entry.existing_caid, "registration_source": "preexisting"}, + allele_id, + Disposition.PRESENT, + EventReason.PREEXISTING, + metadata={"clingen_allele_id": entry.existing_caid}, ) - # Alleles that need CAR submission: new ones, or all when force_reregister=True. - pending_allele_ids = [aid for aid, entry in allele_data.items() if force_reregister or not entry.existing_caid] - job_manager.update_progress(10, 100, f"Preparing {len(pending_allele_ids)} alleles for CAR submission.") + job_manager.update_progress(10, 100, f"Preparing {len(pending_alleles)} alleles for CAR submission.") # Build HGVS → [allele_ids] map. Multi-variant cis-phased blocks produce no HGVS # (combine_cis defaults to False); those alleles are annotated as failures immediately. 
hgvs_to_allele_ids: dict[str, list[int]] = {} - for allele_id in pending_allele_ids: + for allele_id in pending_alleles: entry = allele_data[allele_id] hgvs = get_hgvs_from_post_mapped(entry.post_mapped) if hgvs: hgvs_to_allele_ids.setdefault(hgvs, []).append(allele_id) - # Allele is registered but post_mapped can no longer produce HGVS — data - # regression worth surfacing, but the CAID is still valid so treat it as - # preexisting rather than failing the variant. + # TODO#780 - Allele is registered but post_mapped can no longer produce HGVS — data + # regression worth surfacing, but the CAID is still valid so treat it as preexisting + # rather than failing the variant. elif entry.existing_caid: - preexisting_allele_ids.add(allele_id) + preexisting_alleles.append(allele_id) logger.warning( msg=( f"Could not construct HGVS for allele {allele_id} during force re-registration " @@ -230,12 +226,14 @@ async def submit_score_set_mappings_to_car(ctx: dict, job_id: int, job_manager: ) _annotate_caid( annotation_manager, - entry.authoritative_variant_ids, - AnnotationStatus.SUCCESS, - metadata={"clingen_allele_id": entry.existing_caid, "registration_source": "reconfirmation_skipped"}, + allele_id, + Disposition.PRESENT, + EventReason.RECONFIRMATION_SKIPPED, + metadata={"clingen_allele_id": entry.existing_caid}, ) - # No HGVS-- un-submittable. + # No HGVS-- un-submittable. The allele cannot produce an identifier to register, + # so treat this as not_applicable rather than a failure. 
else: failed_allele_ids.add(allele_id) logger.warning( @@ -244,9 +242,9 @@ async def submit_score_set_mappings_to_car(ctx: dict, job_id: int, job_manager: ) _annotate_caid( annotation_manager, - entry.authoritative_variant_ids, - AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.MISSING_IDENTIFIER, + allele_id, + Disposition.NOT_APPLICABLE, + EventReason.NO_HGVS, error_message="Could not extract a valid HGVS string from post-mapped allele data.", ) @@ -317,9 +315,9 @@ def _outcome_data() -> dict[str, int]: failed_allele_ids.add(allele_id) _annotate_caid( annotation_manager, - allele_data[allele_id].authoritative_variant_ids, - AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED, + allele_id, + Disposition.FAILED, + EventReason.SERVICE_REJECTED, error_message="Failed to register allele with ClinGen Allele Registry.", metadata={ "submitted_hgvs": hgvs_string, @@ -341,9 +339,9 @@ def _outcome_data() -> dict[str, int]: failed_allele_ids.add(allele_id) _annotate_caid( annotation_manager, - allele_data[allele_id].authoritative_variant_ids, - AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED, + allele_id, + Disposition.FAILED, + EventReason.MALFORMED_RESPONSE, error_message="ClinGen Allele Registry returned a malformed response with no allele identifier.", metadata={"submitted_hgvs": hgvs_string}, ) @@ -355,7 +353,7 @@ def _outcome_data() -> dict[str, int]: entry = allele_data[allele_id] prior_caid = entry.existing_caid - # CAID is immutable — a different value returned by CAR is a hard invariant + # TODO#780 - CAID is immutable — a different value returned by CAR is a hard invariant # violation. Do not overwrite; record a failure with full audit context. 
if prior_caid and prior_caid != caid: logger.error( @@ -370,9 +368,9 @@ def _outcome_data() -> dict[str, int]: failed_allele_ids.add(allele_id) _annotate_caid( annotation_manager, - entry.authoritative_variant_ids, - AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED, + allele_id, + Disposition.FAILED, + EventReason.CAID_CONFLICT, error_message="CAR returned a CAID that conflicts with the stored value.", metadata={ "clingen_allele_id": prior_caid, @@ -387,7 +385,7 @@ def _outcome_data() -> dict[str, int]: allele = alleles_by_id[allele_id] allele.clingen_allele_id = caid - registration_source = "reconfirmed" if prior_caid else "this_run" + reason = EventReason.RECONFIRMED if prior_caid else EventReason.CREATED if prior_caid: logger.info( msg=f"Force re-registration confirmed same CAID {caid!r} for allele {allele_id}.", @@ -396,9 +394,10 @@ def _outcome_data() -> dict[str, int]: _annotate_caid( annotation_manager, - entry.authoritative_variant_ids, - AnnotationStatus.SUCCESS, - metadata={"clingen_allele_id": caid, "registration_source": registration_source}, + allele_id, + Disposition.PRESENT, + reason, + metadata={"clingen_allele_id": caid}, ) # Submitted HGVS with no trustworthy response: the truncated tail when the counts line up @@ -410,9 +409,9 @@ def _outcome_data() -> dict[str, int]: failed_allele_ids.add(allele_id) _annotate_caid( annotation_manager, - allele_data[allele_id].authoritative_variant_ids, - AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, + allele_id, + Disposition.FAILED, + EventReason.API_ERROR, error_message="Failed to register allele with ClinGen Allele Registry.", metadata={"submitted_hgvs": hgvs_string}, ) @@ -500,17 +499,21 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: # MappingRecord. 
RT-derived equivalence alleles are intentionally excluded — LDH links each # MaveDB score to its canonical mapped variant, not to every equivalent allele (unlike CAR, # which registers a CAID per allele). - variant_objects = job_manager.db.execute( - select(Variant, MappingRecord, AlleleModel) - .join(MappingRecord, MappingRecord.variant_id == Variant.id) - .join(MappingRecordAllele, MappingRecordAllele.mapping_record_id == MappingRecord.id) - .join(AlleleModel, AlleleModel.id == MappingRecordAllele.allele_id) - .where(Variant.score_set_id == score_set.id) - .where(MappingRecord.current) - .where(MappingRecordAllele.current) - .where(MappingRecordAllele.is_authoritative.is_(True)) - .where(AlleleModel.post_mapped.is_not(None)) - ).all() + variant_objects: Sequence[tuple[Variant, MappingRecord, AlleleModel]] = ( + job_manager.db.execute( + select(Variant, MappingRecord, AlleleModel) + .join(MappingRecord, MappingRecord.variant_id == Variant.id) + .join(MappingRecordAllele, MappingRecordAllele.mapping_record_id == MappingRecord.id) + .join(AlleleModel, AlleleModel.id == MappingRecordAllele.allele_id) + .where(Variant.score_set_id == score_set.id) + .where(MappingRecord.current) + .where(MappingRecordAllele.current) + .where(MappingRecordAllele.is_authoritative.is_(True)) + .where(AlleleModel.post_mapped.is_not(None)) + ) + .tuples() + .all() + ) # Track total variants to submit job_manager.save_to_context({"total_variants_to_submit_ldh": len(variant_objects)}) @@ -525,11 +528,10 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: job_manager.update_progress(10, 100, f"Submitting {len(variant_objects)} mapped variants to LDH.") # Build submission content - variant_content = [] - variant_for_urn = {} + variant_content: list[tuple[str, Variant, MappingRecord, AlleleModel]] = [] + variant_for_urn: dict[str, Variant] = {} for variant, mapping_record, allele in variant_objects: - # See the note above: cis-phased blocks are skipped here 
pending ClinGen guidance - # (https://github.com/VariantEffect/mavedb-api/issues/764). + # cis-phased blocks are skipped here pending ClinGen guidance TODO#764 variation = get_hgvs_from_post_mapped(allele.post_mapped) if not variation: @@ -540,7 +542,8 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: continue variant_content.append((variation, variant, mapping_record, allele)) - variant_for_urn[variant.urn] = variant + # TODO#372: nullable URNs + variant_for_urn[variant.urn] = variant # type: ignore if not variant_content: logger.warning( @@ -567,8 +570,10 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: } ) - # TODO prior to finalizing: Verify typing of ClinGen submission responses. See https://reg.clinicalgenome.org/doc/AlleleRegistry_1.01.xx_api_v1.pdf - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) + # See https://reg.clinicalgenome.org/doc/AlleleRegistry_1.01.xx_api_v1.pdf + annotation_manager = AnnotationStatusManager( + job_manager.db, job_run_id=job_manager.job_id, score_set_id=score_set.id + ) submitted_variant_urns = set() for success in submission_successes: logger.debug( @@ -578,24 +583,21 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: submitted_urn = success["data"]["entId"] submitted_variant = variant_for_urn.get(submitted_urn) + # LDH echoed back an entId we never submitted — record it for investigation rather + # than crashing the whole job mid-batch. if submitted_variant is None: - # LDH echoed back an entId we never submitted — record it for investigation rather - # than crashing the whole job mid-batch. 
logger.warning( msg=f"LDH returned an unrecognized entId not in this submission: {submitted_urn!r}.", extra=job_manager.logging_context(), ) continue - annotation_manager.add_annotation( + annotation_manager.record_event( + AnnotationType.LDH_SUBMISSION, variant_id=submitted_variant.id, - annotation_type=AnnotationType.LDH_SUBMISSION, - version=None, - status=AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": {"ldh_iri": success["data"]["ldhIri"], "ldh_id": success["data"]["ldhId"]}, - }, - current=True, + disposition=Disposition.PRESENT, + reason=EventReason.SUBMITTED, + metadata={"ldh_iri": success["data"]["ldhIri"], "ldh_id": success["data"]["ldhId"]}, ) submitted_variant_urns.add(submitted_urn) @@ -612,16 +614,12 @@ async def submit_score_set_mappings_to_ldh(ctx: dict, job_id: int, job_manager: failed_variant = variant_for_urn[failure_urn] - annotation_manager.add_annotation( + annotation_manager.record_event( + AnnotationType.LDH_SUBMISSION, variant_id=failed_variant.id, - annotation_type=AnnotationType.LDH_SUBMISSION, - version=None, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, - annotation_data={ - "error_message": "Failed to submit variant to ClinGen Linked Data Hub.", - }, - current=True, + disposition=Disposition.FAILED, + reason=EventReason.API_ERROR, + metadata={"error_message": "Failed to submit variant to ClinGen Linked Data Hub."}, ) annotation_manager.flush() diff --git a/src/mavedb/worker/jobs/external_services/clinvar.py b/src/mavedb/worker/jobs/external_services/clinvar.py index 68a34a04..19a9718f 100644 --- a/src/mavedb/worker/jobs/external_services/clinvar.py +++ b/src/mavedb/worker/jobs/external_services/clinvar.py @@ -1,19 +1,18 @@ -"""ClinVar integration jobs for variant annotation +"""ClinVar integration jobs for variant annotation. -This module contains job definitions and utility functions for integrating ClinVar -variant data into MaveDB. 
It includes functions to fetch and parse ClinVar variant -summary data, and update MaveDB records with the latest ClinVar annotations. +Links deduplicated alleles to ClinVar clinical-control data across every archival ClinVar +release. Each release is a distinct, versioned ``ClinvarControl`` entity, and an allele +accumulates one live ``ClinvarAlleleLink`` per release it appears in (multi-live, unlike +gnomAD/VEP which hold one live result per allele). Both ClinGen API calls and ClinVar TSV data fetches are automatically cached using aiocache with Redis backend: - ClinGen API calls: 24-hour TTL - ClinVar TSV files: 90-day TTL (archival data doesn't change) - -This significantly reduces redundant network requests when refreshing ClinVar -controls across multiple months/years. """ import logging +from collections import Counter from datetime import datetime import requests @@ -22,14 +21,16 @@ from mavedb.lib.annotation_status_manager import AnnotationStatusManager from mavedb.lib.clingen.allele_registry import get_associated_clinvar_allele_id +from mavedb.lib.clingen.alleles import get_alleles_for_score_set, group_alleles_for_annotation from mavedb.lib.clinvar.utils import fetch_clinvar_variant_data from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.clinical_control import ClinicalControl +from mavedb.models.clinical_control import ClinvarControl +from mavedb.models.clinvar_allele_link import ClinvarAlleleLink from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory -from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason +from mavedb.models.enums.job_pipeline import FailureCategory from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant from mavedb.worker.jobs.utils.setup import 
validate_job_params from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management from mavedb.worker.lib.managers.job_manager import JobManager @@ -42,28 +43,71 @@ CLINVAR_START_MONTH = 2 -def generate_clinvar_versions() -> list[tuple[int, int]]: +def _generate_clinvar_versions() -> list[tuple[int, int]]: """Generate all ClinVar version (year, month) pairs from Feb 2015 to current Jan. Returns a list of (year, month) tuples representing each ClinVar archival snapshot that should be processed. """ current_year = datetime.now().year - versions = [(CLINVAR_START_YEAR, CLINVAR_START_MONTH)] - for year in range(CLINVAR_START_YEAR + 1, current_year + 1): - versions.append((year, 1)) - return versions + first_version = (CLINVAR_START_YEAR, CLINVAR_START_MONTH) + archival_versions = [(year, 1) for year in range(CLINVAR_START_YEAR + 1, current_year + 1)] + return [first_version, *archival_versions] + + +def _annotate_clinvar( + annotation_manager: AnnotationStatusManager, + allele_id: int, + disposition: Disposition, + reason: EventReason, + *, + source_version: str, + error_message: str | None = None, + metadata: dict | None = None, +) -> None: + """Record one CLINVAR_CONTROL event for an allele at a given ClinVar release. + + The single choke point for ClinVar's status writes. One event per (allele, release); provenance + (which variants drove the linkage) is derived from the live links as-of the event, not fanned out + here. Unlike gnomAD/VEP, ClinVar is multi-version, so ``source_version`` (the ClinVar release) + distinguishes an allele's events across releases. 
+ """ + meta = dict(metadata or {}) + if error_message is not None: + meta["error_message"] = error_message + + annotation_manager.record_event( + AnnotationType.CLINVAR_CONTROL, + allele_id=allele_id, + disposition=disposition, + reason=reason, + source_version=source_version, + metadata=meta or None, + ) @with_pipeline_management async def refresh_clinvar_controls(ctx: dict, job_id: int, job_manager: JobManager) -> JobExecutionOutcome: - """Refresh ClinVar clinical control data across all archival versions. - - Iterates over every ClinVar archival snapshot (Feb 2015, then Jan of each - subsequent year through the current year), fetching TSV data and updating - clinical control records for all mapped variants in the score set. Individual - version failures are logged and skipped — the job continues processing - remaining versions. + """Link deduplicated alleles to ClinVar clinical-control data across all archival versions. + + Iterates over every ClinVar archival snapshot (Feb 2015, then Jan of each subsequent year through + the current year). For each version it resolves each allele's ClinGen Allele ID (CAID) to a ClinVar + allele id, upserts the versioned :class:`ClinvarControl`, and establishes a live + :class:`ClinvarAlleleLink`. An allele accumulates one live link per release (multi-live). Individual + version fetch failures are logged and skipped — the job continues with the remaining versions. + + Job Parameters: + - score_set_id (int): The ID of the ScoreSet whose alleles to process. + - correlation_id (str): Correlation ID for tracing requests across services. + - force (bool, optional): Bypass the per-version skip and re-resolve every allele. The link + write still get-or-creates (no duplicate live link), so a forced re-run of unchanged data + writes no new links. + + Side Effects: + - Creates ClinvarControl and ClinvarAlleleLink rows. + + Returns: + JobExecutionOutcome: outcome with version and per-link counts. 
""" job = job_manager.get_job() @@ -72,8 +116,9 @@ async def refresh_clinvar_controls(ctx: dict, job_id: int, job_manager: JobManag score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == job.job_params["score_set_id"])).one() # type: ignore correlation_id = job.job_params["correlation_id"] # type: ignore + force = bool(job.job_params.get("force", False)) # type: ignore[union-attr] - versions = generate_clinvar_versions() + versions = _generate_clinvar_versions() job_manager.save_to_context( { @@ -83,26 +128,56 @@ async def refresh_clinvar_controls(ctx: dict, job_id: int, job_manager: JobManag "correlation_id": correlation_id, "versions": versions, "total_versions": len(versions), + "force": force, } ) job_manager.update_progress(0, 100, f"Starting ClinVar refresh across {len(versions)} versions.") logger.info(f"Starting ClinVar refresh across {len(versions)} versions", extra=job_manager.logging_context()) - variants_to_refresh = job_manager.db.scalars( - select(MappedVariant) - .join(Variant) - .where( - Variant.score_set_id == score_set.id, - MappedVariant.current.is_(True), + # One work-unit per allele (payload = CAID; alleles without one are dropped). Covers ALL the score + # set's alleles — authoritative and RT-derived — since the genomic allele ClinVar keys on is often + # the RT-derived one. Events are allele-keyed, so every allele is recorded per release (no fan-out). + allele_data = group_alleles_for_annotation( + get_alleles_for_score_set(job_manager.db, score_set.id), + payload=lambda row: row.clingen_allele_id, + ) + job_manager.save_to_context({"num_alleles_with_caids": len(allele_data)}) + + # Link counts accumulate across all versions (an allele may link in every release it appears in). 
+ annotation_counts: Counter[str] = Counter( + { + "created_link_count": 0, + "preexisting_link_count": 0, + "skipped_link_count": 0, + "failed_link_count": 0, + } + ) + + if not allele_data: + logger.warning( + msg="No current alleles with CAIDs were found for this score set. Skipping ClinVar refresh.", + extra=job_manager.logging_context(), + ) + job_manager.db.flush() + return JobExecutionOutcome.succeeded( + data={"versions_completed": 0, "versions_total": len(versions), **dict(annotation_counts)} + ) + + all_allele_ids = set(allele_data.keys()) + + def alleles_linked_at_version(clinvar_version: str) -> set[int]: + """Allele ids (within the work set) holding a live ClinvarAlleleLink to a control of this version.""" + return set( + job_manager.db.scalars( + select(ClinvarAlleleLink.allele_id) + .join(ClinvarControl, ClinvarControl.id == ClinvarAlleleLink.clinvar_control_id) + .where(ClinvarAlleleLink.allele_id.in_(all_allele_ids)) + .where(ClinvarAlleleLink.current) + .where(ClinvarControl.db_version == clinvar_version) + ).all() ) - ).all() - total_variants_to_refresh = len(variants_to_refresh) - job_manager.save_to_context({"total_variants_to_refresh": total_variants_to_refresh}) - total_refreshed = 0 - total_failed = 0 versions_completed = 0 - for version_index, (year, month) in enumerate(versions): clinvar_version = f"{month:02d}_{year}" job_manager.save_to_context({"current_version": clinvar_version, "version_index": version_index}) @@ -125,163 +200,190 @@ async def refresh_clinvar_controls(ctx: dict, job_id: int, job_manager: JobManag ) continue - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) - for mapped_variant in variants_to_refresh: - clingen_id = mapped_variant.clingen_allele_id - - if clingen_id is None: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.SKIPPED, 
- failure_category=AnnotationFailureCategory.MISSING_IDENTIFIER, - annotation_data={ - "error_message": "Mapped variant does not have an associated ClinGen allele ID.", - }, - current=True, - replace_all_versions=False, + # Cost: skip alleles already linked at this version (an archival release cannot change). force + # bypasses the skip but the link write still get-or-creates, so a forced no-op writes nothing. + already_linked = set() if force else alleles_linked_at_version(clinvar_version) + + annotation_manager = AnnotationStatusManager( + job_manager.db, job_run_id=job_manager.job_id, score_set_id=score_set.id + ) + for allele_id, caid in allele_data.items(): + if allele_id in already_linked: + annotation_counts["preexisting_link_count"] += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.PRESENT, + EventReason.PREEXISTING, + source_version=clinvar_version, + metadata={"clingen_allele_id": caid}, ) continue - if "," in clingen_id: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.UNSUPPORTED_IDENTIFIER, - annotation_data={ - "error_message": "Multi-variant ClinGen allele IDs cannot be associated with ClinVar data.", - }, - current=True, - replace_all_versions=False, + # A cis-block (multi-variant) CAID structurally cannot key ClinVar — a terminal gap, not + # a statement about ClinVar's contents. 
+ if "," in caid: + annotation_counts["skipped_link_count"] += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.NOT_APPLICABLE, + EventReason.MULTI_VARIANT_CAID, + source_version=clinvar_version, + error_message="Multi-variant ClinGen allele IDs cannot be associated with ClinVar data.", + metadata={"clingen_allele_id": caid}, ) continue try: - clinvar_allele_id = await get_associated_clinvar_allele_id(clingen_id) # type: ignore + clinvar_allele_id = await get_associated_clinvar_allele_id(caid) except requests.exceptions.RequestException as exc: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, - annotation_data={ - "error_message": f"Failed to retrieve ClinVar allele ID from ClinGen API: {str(exc)}", - }, - current=True, - replace_all_versions=False, + annotation_counts["failed_link_count"] += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.FAILED, + EventReason.API_ERROR, + source_version=clinvar_version, + error_message=f"Failed to retrieve ClinVar allele ID from ClinGen API: {str(exc)}", + metadata={"clingen_allele_id": caid}, ) logger.error( - f"Failed to retrieve ClinVar allele ID from ClinGen API for ClinGen allele ID {clingen_id}.", + f"Failed to retrieve ClinVar allele ID from ClinGen API for ClinGen allele ID {caid}.", extra=job_manager.logging_context(), exc_info=exc, ) - total_failed += 1 continue + # ClinGen has no ClinVar AlleleID for this CAID — the allele is not a ClinVar control: an + # informative negative about the source, not a pipeline gap. 
if not clinvar_allele_id: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.NO_LINKED_ALLELE, - annotation_data={ - "error_message": "No ClinVar allele ID found for ClinGen allele ID.", - }, - current=True, - replace_all_versions=False, + annotation_counts["skipped_link_count"] += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.ABSENT, + EventReason.NO_RECORD, + source_version=clinvar_version, + error_message="No ClinVar allele ID found for ClinGen allele ID.", + metadata={"clingen_allele_id": caid}, ) continue + # The allele has a ClinVar AlleleID but it is absent from this release's snapshot — a + # genuine, version-scoped negative (ClinVar queried, nothing for this release). if clinvar_allele_id not in tsv_data: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND, - annotation_data={ - "error_message": "No ClinVar data found for ClinVar allele ID.", - }, - current=True, - replace_all_versions=False, + annotation_counts["skipped_link_count"] += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.ABSENT, + EventReason.NO_RECORD, + source_version=clinvar_version, + error_message="No ClinVar data found for ClinVar allele ID.", + metadata={"clingen_allele_id": caid, "clinvar_allele_id": clinvar_allele_id}, ) continue variant_data = tsv_data[clinvar_allele_id] - identifier = str(clinvar_allele_id) - # Atomic upsert — avoids a check-then-act race when two - # refresh_clinvar_controls jobs run concurrently for different - # score sets and encounter the same (db_name, db_identifier, - # db_version) tuple. 
ON CONFLICT DO UPDATE is guaranteed to - # return exactly one row regardless of concurrent inserts. + # Atomic upsert — avoids a check-then-act race when two refresh_clinvar_controls jobs run + # concurrently for different score sets and encounter the same + # (db_name, db_identifier, db_version) tuple. ON CONFLICT DO UPDATE returns exactly one row. upsert_stmt = ( - pg_insert(ClinicalControl) + pg_insert(ClinvarControl) .values( - db_identifier=identifier, + db_identifier=str(clinvar_allele_id), db_version=clinvar_version, db_name="ClinVar", gene_symbol=variant_data.get("GeneSymbol"), clinical_significance=variant_data.get("ClinicalSignificance"), clinical_review_status=variant_data.get("ReviewStatus"), + clinvar_variation_id=variant_data.get("VariationID"), ) .on_conflict_do_update( - constraint="uq_clinical_controls_db_name_identifier_version", + constraint="uq_clinvar_controls_db_name_identifier_version", set_={ "gene_symbol": variant_data.get("GeneSymbol"), "clinical_significance": variant_data.get("ClinicalSignificance"), "clinical_review_status": variant_data.get("ReviewStatus"), + "clinvar_variation_id": variant_data.get("VariationID"), }, ) - .returning(ClinicalControl) + .returning(ClinvarControl) ) - clinvar_variant = job_manager.db.scalars(upsert_stmt).one() - - job_manager.db.add(clinvar_variant) - job_manager.db.flush() - - if clinvar_variant not in mapped_variant.clinical_controls: - mapped_variant.clinical_controls.append(clinvar_variant) - job_manager.db.add(mapped_variant) - - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.CLINVAR_CONTROL, - version=clinvar_version, - status=AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": { - "clinvar_allele_id": clinvar_allele_id, - }, - }, - current=True, - replace_all_versions=False, + clinvar_control = job_manager.db.scalars(upsert_stmt).one() + + # At most one live link per (allele, release). 
Normally a release is immutable, so the + # allele's live link for this version is either a reconfirm of this same control (no-op) or + # absent (insert) — multi-live accumulates only across *different* releases, never supersedes. + # Defensive guard: if the allele already holds a live link to a *different* control of this + # same version, the release re-resolved under us (re-ingestion / upstream correction — should + # never happen for archival data). Supersede newest-wins with a shared timestamp and log. + live_link = job_manager.db.scalar( + select(ClinvarAlleleLink) + .join(ClinvarControl, ClinvarControl.id == ClinvarAlleleLink.clinvar_control_id) + .where( + ClinvarAlleleLink.allele_id == allele_id, + ClinvarAlleleLink.current, + ClinvarControl.db_version == clinvar_version, + ) ) + if live_link is None: + job_manager.db.add(ClinvarAlleleLink(allele_id=allele_id, clinvar_control_id=clinvar_control.id)) + annotation_counts["created_link_count"] += 1 + reason = EventReason.CREATED + elif live_link.clinvar_control_id == clinvar_control.id: + annotation_counts["preexisting_link_count"] += 1 + reason = EventReason.PREEXISTING + else: + live_link.supersede_with( + job_manager.db, + ClinvarAlleleLink(allele_id=allele_id, clinvar_control_id=clinvar_control.id), + ) + annotation_counts["created_link_count"] += 1 + reason = EventReason.SUPERSEDED + logger.warning( + msg=( + f"Allele {allele_id} held a live ClinVar link to control " + f"{live_link.clinvar_control_id} for version {clinvar_version}, but re-resolved to " + f"control {clinvar_control.id} (ClinVar allele {clinvar_allele_id}). Superseding " + "newest-wins; archival ClinVar data should be immutable — investigate the upstream " + "re-resolution." 
+ ), + extra=job_manager.logging_context(), + ) - total_refreshed += 1 + _annotate_clinvar( + annotation_manager, + allele_id, + Disposition.PRESENT, + reason, + source_version=clinvar_version, + metadata={"clingen_allele_id": caid, "clinvar_allele_id": clinvar_allele_id}, + ) annotation_manager.flush() + job_manager.db.flush() versions_completed += 1 logger.info( - f"Completed ClinVar version {clinvar_version} for {total_variants_to_refresh} variants.", + f"Completed ClinVar version {clinvar_version} for {len(allele_data)} alleles.", extra=job_manager.logging_context(), ) logger.info( f"ClinVar refresh complete: {versions_completed}/{len(versions)} versions, " - f"{total_refreshed} variant-version annotations.", + f"{annotation_counts['created_link_count']} new links, " + f"{annotation_counts['preexisting_link_count']} preexisting.", extra=job_manager.logging_context(), ) - if total_failed > 0 and total_refreshed == 0: - error_message = ( - f"All {total_failed} ClinVar lookups failed for score set {score_set.urn}. Possible ClinGen API outage." - ) + if ( + annotation_counts["failed_link_count"] > 0 + and annotation_counts["created_link_count"] == 0 + and annotation_counts["preexisting_link_count"] == 0 + ): + error_message = f"All {annotation_counts['failed_link_count']} ClinVar lookups failed for score set {score_set.urn}. Possible ClinGen API outage." 
logger.error(error_message, extra=job_manager.logging_context()) job_manager.db.flush() return JobExecutionOutcome.failed( @@ -289,16 +391,12 @@ async def refresh_clinvar_controls(ctx: dict, job_id: int, job_manager: JobManag data={ "versions_completed": versions_completed, "versions_total": len(versions), - "variant_annotations": 0, + **dict(annotation_counts), }, failure_category=FailureCategory.DEPENDENCY_FAILURE, ) job_manager.db.flush() return JobExecutionOutcome.succeeded( - data={ - "versions_completed": versions_completed, - "versions_total": len(versions), - "variant_annotations": total_refreshed, - } + data={"versions_completed": versions_completed, "versions_total": len(versions), **dict(annotation_counts)} ) diff --git a/src/mavedb/worker/jobs/external_services/gnomad.py b/src/mavedb/worker/jobs/external_services/gnomad.py index 969c3a20..3b7a48d1 100644 --- a/src/mavedb/worker/jobs/external_services/gnomad.py +++ b/src/mavedb/worker/jobs/external_services/gnomad.py @@ -1,29 +1,32 @@ """gnomAD variant linking jobs for population frequency annotation. -This module handles linking of mapped variants to gnomAD (Genome Aggregation Database) +This module handles linking of deduplicated alleles to gnomAD (Genome Aggregation Database) variants to provide population frequency and other genomic context information. This enrichment helps researchers understand the clinical significance and rarity of variants in their datasets. 
""" import logging -from typing import Sequence +from collections import Counter from sqlalchemy import select from mavedb.db import athena from mavedb.lib.annotation_status_manager import AnnotationStatusManager +from mavedb.lib.clingen.alleles import get_alleles_for_score_set, group_alleles_for_annotation from mavedb.lib.gnomad import ( GNOMAD_DATA_VERSION, + GnomadLinkVerdict, gnomad_variant_data_for_caids, - link_gnomad_variants_to_mapped_variants, + link_gnomad_variants_to_alleles, ) from mavedb.lib.types.workflow import JobExecutionOutcome from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus -from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason +from mavedb.models.gnomad_allele_link import GnomadAlleleLink +from mavedb.models.gnomad_variant import GnomADVariant from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant from mavedb.worker.jobs.utils.setup import validate_job_params from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management from mavedb.worker.lib.managers.job_manager import JobManager @@ -31,17 +34,48 @@ logger = logging.getLogger(__name__) +def _annotate_gnomad( + annotation_manager: AnnotationStatusManager, + allele_id: int, + disposition: Disposition, + reason: EventReason, + *, + error_message: str | None = None, + metadata: dict | None = None, +) -> None: + """Record one GNOMAD_ALLELE_FREQUENCY event for an allele (frequency is an allele-level fact). + + The single choke point for gnomAD's status writes. One event per allele, stamped at the current + gnomAD version; provenance (which variants drove the linkage) is derived from the live links + as-of the event, not fanned out here. 
+ """ + meta = dict(metadata or {}) + if error_message is not None: + meta["error_message"] = error_message + + annotation_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele_id, + disposition=disposition, + reason=reason, + source_version=GNOMAD_DATA_VERSION, + metadata=meta or None, + ) + + @with_pipeline_management async def link_gnomad_variants(ctx: dict, job_id: int, job_manager: JobManager) -> JobExecutionOutcome: """ - Link mapped variants to gnomAD variants based on ClinGen Allele IDs (CAIDs). - This job fetches mapped variants associated with a given score set that have CAIDs, - retrieves corresponding gnomAD variant data, and establishes links between them - in the database. + Link deduplicated alleles to gnomAD variants based on ClinGen Allele IDs (CAIDs). + This job fetches the current authoritative alleles of a score set that carry CAIDs, + retrieves corresponding gnomAD variant data, and establishes valid-time links between them. Job Parameters: - - score_set_id (int): The ID of the ScoreSet containing mapped variants to process. + - score_set_id (int): The ID of the ScoreSet whose alleles to process. - correlation_id (str): Correlation ID for tracing requests across services. + - force (bool, optional): Bypass the version-keyed skip and re-fetch every CAID-bearing + allele. The linker still supersedes only on change, so a forced re-run of unchanged data + writes no new links. Use for re-ingestion or to heal suspected link corruption. Args: ctx (dict): The job context dictionary. @@ -49,10 +83,10 @@ async def link_gnomad_variants(ctx: dict, job_id: int, job_manager: JobManager) job_manager (JobManager): The job manager instance for database and logging operations. Side Effects: - - Updates MappedVariant records to link to gnomAD variants. + - Creates GnomadAlleleLink rows linking alleles to gnomAD variants. 
Returns: - dict: Result indicating success and any exception details + JobExecutionOutcome: outcome with per-allele created/preexisting/skipped counts. """ # Get the job definition we are working on job = job_manager.get_job() @@ -63,6 +97,7 @@ async def link_gnomad_variants(ctx: dict, job_id: int, job_manager: JobManager) # Fetch required resources based on param inputs. Safely ignore mypy warnings here, as they were checked above. score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == job.job_params["score_set_id"])).one() # type: ignore correlation_id = job.job_params["correlation_id"] # type: ignore + force = bool(job.job_params.get("force", False)) # type: ignore[union-attr] # Setup initial context and progress job_manager.save_to_context( @@ -76,87 +111,124 @@ async def link_gnomad_variants(ctx: dict, job_id: int, job_manager: JobManager) job_manager.update_progress(0, 100, "Starting gnomAD mapped resource linkage.") logger.info(msg="Started gnomAD mapped resource linkage", extra=job_manager.logging_context()) - # We filter out mapped variants that do not have a CAID, so this query is typed # as a Sequence[str]. Ignore MyPy's type checking here. - variant_caids: Sequence[str] = job_manager.db.scalars( - select(MappedVariant.clingen_allele_id) - .join(Variant) - .join(ScoreSet) - .where( - ScoreSet.urn == score_set.urn, - MappedVariant.current.is_(True), - MappedVariant.clingen_allele_id.is_not(None), - ) - ).all() # type: ignore + # One work-unit per allele (payload = CAID; alleles without one are skipped). Covers ALL the score + # set's alleles — authoritative and RT-derived — since the genomic allele gnomAD knows is often the + # RT-derived one. Events are allele-keyed, so every linked allele is recorded. 
+ allele_data = group_alleles_for_annotation( + get_alleles_for_score_set(job_manager.db, score_set.id), + payload=lambda row: row.clingen_allele_id, + ) - num_variant_caids = len(variant_caids) - job_manager.save_to_context({"num_variants_to_link_gnomad": num_variant_caids}) + annotation_counts: Counter[str] = Counter( + { + "created_allele_count": 0, + "preexisting_allele_count": 0, + "skipped_allele_count": 0, + } + ) + + num_alleles_with_caids = len(allele_data) + job_manager.save_to_context({"num_alleles_to_link_gnomad": num_alleles_with_caids}) - if not variant_caids: + if not allele_data: logger.warning( - msg="No current mapped variants with CAIDs were found for this score set. Skipping gnomAD linkage (nothing to do).", + msg="No current alleles with CAIDs were found for this score set. Skipping gnomAD linkage (nothing to do).", extra=job_manager.logging_context(), ) job_manager.db.flush() - return JobExecutionOutcome.succeeded(data={"linked_count": 0, "skipped_count": 0}) + return JobExecutionOutcome.succeeded(data=dict(annotation_counts)) - job_manager.update_progress(10, 100, f"Found {num_variant_caids} variants with CAIDs to link to gnomAD variants.") - logger.info( - msg="Found current mapped variants with CAIDs for this score set. Attempting to link them to gnomAD variants.", - extra=job_manager.logging_context(), + job_manager.update_progress( + 10, 100, f"Found {num_alleles_with_caids} alleles with CAIDs to link to gnomAD variants." 
) - # Fetch gnomAD variant data for the CAIDs - with athena.engine.connect() as athena_session: - logger.debug("Fetching gnomAD variants from Athena.") - gnomad_variant_data = gnomad_variant_data_for_caids(athena_session, variant_caids) + def alleles_linked_at_current_version(allele_ids: set[int]) -> set[int]: + """Allele ids (within the given set) holding a live gnomAD link at the current gnomAD version.""" + if not allele_ids: + return set() + return set( + job_manager.db.scalars( + select(GnomadAlleleLink.allele_id) + .join(GnomADVariant, GnomADVariant.id == GnomadAlleleLink.gnomad_variant_id) + .where(GnomadAlleleLink.allele_id.in_(allele_ids)) + .where(GnomadAlleleLink.current) + .where(GnomADVariant.db_version == GNOMAD_DATA_VERSION) + ).all() + ) - num_gnomad_variants_with_caid_match = len(gnomad_variant_data) + # Skip alleles already linked at the current version (they can't change). force re-fetches + # all; the linker still supersedes only on change, so a forced no-op writes nothing. + already_current = set() if force else alleles_linked_at_current_version(set(allele_data.keys())) + variant_caids = sorted({allele_data[aid] for aid in allele_data if aid not in already_current}) - # NOTE: Proceed intentionally with linking even if no matches were found, to record skipped annotations. 
+ job_manager.save_to_context( + { + "num_alleles_already_current": len(already_current), + "num_caids_to_query": len(variant_caids), + "force": force, + } + ) - job_manager.save_to_context({"num_gnomad_variants_with_caid_match": num_gnomad_variants_with_caid_match}) - job_manager.update_progress(75, 100, f"Found {num_gnomad_variants_with_caid_match} gnomAD variants matching CAIDs.") + verdicts: dict[int, GnomadLinkVerdict] = {} + if variant_caids: + with athena.engine.connect() as athena_session: + logger.debug("Fetching gnomAD variants from Athena.") + gnomad_variant_data = gnomad_variant_data_for_caids(athena_session, variant_caids) - # Link mapped variants to gnomAD variants - logger.info(msg="Attempting to link mapped variants to gnomAD variants.", extra=job_manager.logging_context()) - num_linked_gnomad_variants = link_gnomad_variants_to_mapped_variants(job_manager.db, gnomad_variant_data) - job_manager.db.flush() + num_gnomad_variants_with_caid_match = len(gnomad_variant_data) + job_manager.save_to_context({"num_gnomad_variants_with_caid_match": num_gnomad_variants_with_caid_match}) + job_manager.update_progress( + 75, 100, f"Found {num_gnomad_variants_with_caid_match} gnomAD variants matching CAIDs." 
+ ) - # For variants which are not linked, create annotation status records indicating skipped linkage - mapped_variants_with_caids = job_manager.db.scalars( - select(MappedVariant) - .join(Variant) - .join(ScoreSet) - .where( - ScoreSet.urn == score_set.urn, - MappedVariant.current.is_(True), - MappedVariant.clingen_allele_id.is_not(None), + logger.info(msg="Attempting to link alleles to gnomAD variants.", extra=job_manager.logging_context()) + verdicts = link_gnomad_variants_to_alleles(job_manager.db, gnomad_variant_data) + job_manager.db.flush() + else: + logger.info( + msg="All CAID-bearing alleles are already linked at the current gnomAD version; skipping Athena query.", + extra=job_manager.logging_context(), ) - ).all() - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) - for mapped_variant in mapped_variants_with_caids: - if not mapped_variant.gnomad_variants: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY, - version=GNOMAD_DATA_VERSION, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND, - annotation_data={ - "error_message": "No gnomAD variant could be linked for this mapped variant.", - }, - current=True, + job_manager.update_progress(75, 100, "All alleles already current at this gnomAD version.") + + annotation_manager = AnnotationStatusManager( + job_manager.db, job_run_id=job_manager.job_id, score_set_id=score_set.id + ) + for allele_id, caid in allele_data.items(): + verdict = verdicts.get(allele_id) + if verdict is GnomadLinkVerdict.CREATED: + annotation_counts["created_allele_count"] += 1 + _annotate_gnomad( + annotation_manager, + allele_id, + Disposition.PRESENT, + EventReason.CREATED, + metadata={"clingen_allele_id": caid}, + ) + elif allele_id in already_current or verdict is GnomadLinkVerdict.UNCHANGED: + 
annotation_counts["preexisting_allele_count"] += 1 + _annotate_gnomad( + annotation_manager, + allele_id, + Disposition.PRESENT, + EventReason.PREEXISTING, + metadata={"clingen_allele_id": caid}, + ) + else: + annotation_counts["skipped_allele_count"] += 1 + _annotate_gnomad( + annotation_manager, + allele_id, + Disposition.ABSENT, + EventReason.NO_RECORD, + error_message="No gnomAD variant could be linked for this allele.", + metadata={"clingen_allele_id": caid}, ) annotation_manager.flush() - # Save final context and progress - job_manager.save_to_context({"num_mapped_variants_linked_to_gnomad_variants": num_linked_gnomad_variants}) - logger.info(msg="Done linking gnomAD variants to mapped variants.", extra=job_manager.logging_context()) + outcome_data = dict(annotation_counts) + job_manager.save_to_context(outcome_data) + logger.info(msg="Done linking gnomAD variants to alleles.", extra=job_manager.logging_context()) job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={ - "linked_count": num_linked_gnomad_variants, - "skipped_count": num_variant_caids - num_linked_gnomad_variants, - } - ) + return JobExecutionOutcome.succeeded(data=outcome_data) diff --git a/src/mavedb/worker/jobs/external_services/hgvs.py b/src/mavedb/worker/jobs/external_services/hgvs.py deleted file mode 100644 index 0b468739..00000000 --- a/src/mavedb/worker/jobs/external_services/hgvs.py +++ /dev/null @@ -1,298 +0,0 @@ -"""ClinGen allele HGVS population jobs for mapped variant annotation. - -This module populates mapped variants with HGVS representations (genomic, coding, -protein) by querying the ClinGen Allele Registry. It uses ClinGen allele IDs -(CAIDs) already associated with mapped variants to look up standardized HGVS -nomenclature at different levels (hgvs_g, hgvs_c, hgvs_p), plus the assay-level -HGVS derived from post-mapped VRS data. 
-""" - -import logging -from typing import Optional - -import requests -from sqlalchemy import select - -from mavedb.lib.annotation_status_manager import AnnotationStatusManager -from mavedb.lib.clingen.allele_registry import ( - extract_hgvs_from_ca_allele_data, - extract_hgvs_from_pa_allele_data, - get_clingen_allele_data, -) -from mavedb.lib.slack import log_and_send_slack_message -from mavedb.lib.target_genes import get_target_coding_info -from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant -from mavedb.worker.jobs.utils.setup import validate_job_params -from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management -from mavedb.worker.lib.managers.job_manager import JobManager - -logger = logging.getLogger(__name__) - - -@with_pipeline_management -async def populate_hgvs_for_score_set(ctx: dict, job_id: int, job_manager: JobManager) -> JobExecutionOutcome: - """Populate mapped variants with HGVS representations for a score set. - - Queries the ClinGen Allele Registry using existing ClinGen allele IDs to populate - standardized HGVS nomenclature (genomic, coding, protein) on mapped variants. - Also extracts the assay-level HGVS from post-mapped VRS data. - - Required job_params in the JobRun: - - score_set_id (int): ID of the ScoreSet to process - - correlation_id (str): Correlation ID for tracking - - Args: - ctx: Worker context containing DB and Redis connections. - job_id: The ID of the job run. - job_manager: Manager for job lifecycle and DB operations. - - Side Effects: - - Updates MappedVariant records with hgvs_assay_level, hgvs_g, hgvs_c, hgvs_p. - - Creates AnnotationStatus records for each processed variant. 
- - Returns: - JobExecutionOutcome indicating success, failure, or skip. - """ - job = job_manager.get_job() - - _job_required_params = ["score_set_id", "correlation_id"] - validate_job_params(_job_required_params, job) - - # Fetch required resources based on param inputs. Safely ignore mypy warnings here, as they were checked above. - score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == job.job_params["score_set_id"])).one() # type: ignore - correlation_id = job.job_params["correlation_id"] # type: ignore - - # Setup initial context and progress - job_manager.save_to_context( - { - "application": "mavedb-worker", - "function": "populate_hgvs_for_score_set", - "resource": score_set.urn, - "correlation_id": correlation_id, - } - ) - job_manager.update_progress(0, 100, "Starting mapped HGVS population.") - logger.info(msg="Started mapped HGVS population", extra=job_manager.logging_context()) - - # Determine target info; multi-target score sets are not yet supported - try: - target_is_coding, transcript_accession = get_target_coding_info(score_set) - except NotImplementedError: - logger.warning( - msg="Multi-target score sets not supported for HGVS population. 
Skipping.", - extra=job_manager.logging_context(), - ) - job_manager.db.flush() - return JobExecutionOutcome.skipped(data={"reason": "Multi-target score sets not supported"}) - - job_manager.save_to_context({"target_is_coding": target_is_coding, "transcript_accession": transcript_accession}) - logger.info( - msg=f"Target info resolved: coding={target_is_coding}, transcript={transcript_accession}", - extra=job_manager.logging_context(), - ) - - # Fetch current mapped variants for the score set - variant_rows = job_manager.db.execute( - select(Variant.id, MappedVariant) - .join(Variant) - .join(ScoreSet) - .where(ScoreSet.id == score_set.id) - .where(MappedVariant.current.is_(True)) - ).all() - - total_variants = len(variant_rows) - job_manager.save_to_context({"total_variants": total_variants}) - - if not variant_rows: - logger.warning( - msg="No current mapped variants found for this score set. Skipping HGVS population.", - extra=job_manager.logging_context(), - ) - job_manager.db.flush() - return JobExecutionOutcome.succeeded(data={"populated_count": 0, "skipped_count": 0, "failed_count": 0}) - - job_manager.update_progress(5, 100, f"Processing {total_variants} mapped variants for HGVS population.") - - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) - populated_count = 0 - skipped_count = 0 - failed_count = 0 - - for index, (variant_id, mapped_variant) in enumerate(variant_rows): - # Periodic progress updates - if total_variants > 0 and index % max(total_variants // 20, 1) == 0: - progress = 5 + int((index / total_variants) * 90) - job_manager.update_progress(progress, 100, f"Processing HGVS for variant {index + 1}/{total_variants}.") - logger.info( - "Processing variant %s/%s: variant_id=%s", - index + 1, - total_variants, - variant_id, - extra=job_manager.logging_context(), - ) - - hgvs_g: Optional[str] = None - hgvs_c: Optional[str] = None - hgvs_p: Optional[str] = None - - clingen_id = 
mapped_variant.clingen_allele_id - - job_manager.save_to_context( - { - "mapped_variant_id": mapped_variant.id, - "clingen_allele_id": clingen_id, - "progress_index": index, - } - ) - - if not clingen_id: - annotation_manager.add_annotation( - variant_id=variant_id, - annotation_type=AnnotationType.MAPPED_HGVS, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.MISSING_IDENTIFIER, - annotation_data={ - "error_message": "No ClinGen allele ID available for ClinGen HGVS lookup.", - }, - current=True, - ) - logger.debug( - "Skipping variant %s: no ClinGen allele ID.", - variant_id, - extra=job_manager.logging_context(), - ) - skipped_count += 1 - continue - - # Skip multi-variant allele IDs (comma-separated) - if "," in clingen_id: - annotation_manager.add_annotation( - variant_id=variant_id, - annotation_type=AnnotationType.MAPPED_HGVS, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.UNSUPPORTED_IDENTIFIER, - annotation_data={ - "error_message": "Multi-variant ClinGen allele IDs not supported for HGVS lookup.", - }, - current=True, - ) - logger.debug( - "Skipping variant %s: multi-variant ClinGen allele ID.", - variant_id, - extra=job_manager.logging_context(), - ) - skipped_count += 1 - continue - - # Query ClinGen API for allele data - try: - allele_data = await get_clingen_allele_data(clingen_id) - except requests.exceptions.RequestException as exc: - annotation_manager.add_annotation( - variant_id=variant_id, - annotation_type=AnnotationType.MAPPED_HGVS, - version=None, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, - annotation_data={ - "error_message": f"Failed to fetch ClinGen allele data: {str(exc)}", - }, - current=True, - ) - logger.error( - "ClinGen API request failed for allele %s.", - clingen_id, - extra=job_manager.logging_context(), - exc_info=exc, - ) - failed_count += 1 - continue - - if allele_data is None: 
- annotation_manager.add_annotation( - variant_id=variant_id, - annotation_type=AnnotationType.MAPPED_HGVS, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND, - annotation_data={ - "error_message": f"ClinGen allele {clingen_id} not found in the registry.", - }, - current=True, - ) - logger.debug( - "ClinGen allele %s not found in registry. Skipping variant %s.", - clingen_id, - variant_id, - extra=job_manager.logging_context(), - ) - skipped_count += 1 - continue - - # Extract HGVS based on allele type - if clingen_id.startswith("CA"): - hgvs_g, hgvs_c, hgvs_p = extract_hgvs_from_ca_allele_data( - allele_data, target_is_coding, transcript_accession - ) - elif clingen_id.startswith("PA"): - hgvs_g, hgvs_c, hgvs_p = extract_hgvs_from_pa_allele_data(allele_data) - - # Update mapped variant - mapped_variant.hgvs_g = hgvs_g - mapped_variant.hgvs_c = hgvs_c - mapped_variant.hgvs_p = hgvs_p - job_manager.db.add(mapped_variant) - - annotation_manager.add_annotation( - variant_id=variant_id, - annotation_type=AnnotationType.MAPPED_HGVS, - version=None, - status=AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": { - "hgvs_g": hgvs_g, - "hgvs_c": hgvs_c, - "hgvs_p": hgvs_p, - }, - }, - current=True, - ) - populated_count += 1 - - annotation_manager.flush() - job_manager.db.flush() - - job_manager.save_to_context( - { - "populated_count": populated_count, - "skipped_count": skipped_count, - "failed_count": failed_count, - } - ) - logger.info( - msg=f"Completed mapped HGVS population: {populated_count} populated, {skipped_count} skipped, {failed_count} failed.", - extra=job_manager.logging_context(), - ) - - if failed_count > 0 and populated_count == 0: - log_and_send_slack_message( - f"All {failed_count} variants failed HGVS population for score set {score_set.urn}. 
Possible ClinGen API outage.", - job_manager.logging_context(), - logging.ERROR, - ) - - job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={ - "populated_count": populated_count, - "skipped_count": skipped_count, - "failed_count": failed_count, - } - ) diff --git a/src/mavedb/worker/jobs/external_services/variant_translation.py b/src/mavedb/worker/jobs/external_services/variant_translation.py deleted file mode 100644 index ddec4b73..00000000 --- a/src/mavedb/worker/jobs/external_services/variant_translation.py +++ /dev/null @@ -1,371 +0,0 @@ -"""ClinGen allele variant translation jobs for mapping PA<->CA allele relationships. - -This module populates the variant_translations table with relationships between -protein allele (PA) and nucleotide allele (CA) ClinGen IDs. For CA alleles, it -looks up MANE canonical PA IDs and their matching registered transcript CA IDs. -For PA alleles, it looks up matching registered transcript CA IDs directly. -""" - -import logging - -import requests -from sqlalchemy import select - -from mavedb.lib.annotation_status_manager import AnnotationStatusManager -from mavedb.lib.clingen.allele_registry import ( - expand_allele_ids, - get_canonical_pa_ids, - get_matching_registered_ca_ids, -) -from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.lib.variant_translations import upsert_variant_translations -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant -from mavedb.worker.jobs.utils.setup import validate_job_params -from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management -from mavedb.worker.lib.managers.job_manager import JobManager - -logger = logging.getLogger(__name__) - - -@with_pipeline_management -async def 
populate_variant_translations_for_score_set( - ctx: dict, job_id: int, job_manager: JobManager -) -> JobExecutionOutcome: - """Populate variant translations (PA<->CA relationships) for a score set. - - Queries the ClinGen Allele Registry to discover relationships between protein - allele (PA) and nucleotide allele (CA) ClinGen IDs, then stores them in the - variant_translations table. Each unique allele ID is processed once even if - shared across multiple mapped variants. - - Required job_params in the JobRun: - - score_set_id (int): ID of the ScoreSet to process - - correlation_id (str): Correlation ID for tracking - """ - job = job_manager.get_job() - - _job_required_params = ["score_set_id", "correlation_id"] - validate_job_params(_job_required_params, job) - - score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == job.job_params["score_set_id"])).one() # type: ignore - correlation_id = job.job_params["correlation_id"] # type: ignore - - job_manager.save_to_context( - { - "application": "mavedb-worker", - "function": "populate_variant_translations_for_score_set", - "resource": score_set.urn, - "correlation_id": correlation_id, - } - ) - job_manager.update_progress(0, 100, "Starting variant translation population.") - logger.info(msg="Started variant translation population.", extra=job_manager.logging_context()) - - # Fetch all current mapped variants with their ClinGen allele IDs - variant_rows = job_manager.db.execute( - select(Variant.id, MappedVariant.clingen_allele_id) - .join(MappedVariant, MappedVariant.variant_id == Variant.id) - .join(ScoreSet, Variant.score_set_id == ScoreSet.id) - .where(ScoreSet.id == score_set.id) - .where(MappedVariant.current.is_(True)) - ).all() - - if not variant_rows: - logger.warning( - msg="No current mapped variants found for this score set.", - extra=job_manager.logging_context(), - ) - job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={"translations_created": 0, "alleles_skipped": 0, 
"alleles_failed": 0} - ) - - # Deduplicate: multiple mapped variants can share the same allele ID, but we only - # need to query the ClinGen API once per unique ID. Track which variants map to each - # allele so we can record annotations for all of them after a single lookup. - allele_to_variants: dict[str, list[int]] = {} - for variant_id, clingen_allele_id in variant_rows: - if not clingen_allele_id: - continue - - for individual_id in expand_allele_ids([clingen_allele_id]): - allele_to_variants.setdefault(individual_id, []).append(variant_id) - - unique_allele_ids = list(allele_to_variants.keys()) - total_alleles = len(unique_allele_ids) - job_manager.save_to_context({"total_variants": len(variant_rows), "unique_allele_ids": total_alleles}) - - if not unique_allele_ids: - logger.warning( - msg="No ClinGen allele IDs found on mapped variants.", - extra=job_manager.logging_context(), - ) - job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={"translations_created": 0, "alleles_skipped": 0, "alleles_failed": 0} - ) - - job_manager.update_progress(5, 100, f"Processing {total_alleles} unique allele IDs for variant translations.") - logger.info( - "Processing %s unique allele IDs for variant translations.", - total_alleles, - extra=job_manager.logging_context(), - ) - - total_created = 0 - total_skipped = 0 - total_failed = 0 - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) - - for index, allele_id in enumerate(unique_allele_ids): - if total_alleles > 0 and index % max(total_alleles // 20, 1) == 0: - progress = 5 + int((index / total_alleles) * 90) - job_manager.update_progress(progress, 100, f"Processing allele {index + 1}/{total_alleles}.") - logger.info( - "Processing allele %s/%s: %s", - index + 1, - total_alleles, - allele_id, - extra=job_manager.logging_context(), - ) - - job_manager.save_to_context( - { - "current_allele_id": allele_id, - "progress_index": index, - } - ) - - variant_ids = 
allele_to_variants[allele_id] - - if allele_id.startswith("CA"): - # CA (nucleotide) alleles: look up the MANE canonical protein alleles (PAs) for - # this CA, then for each PA discover all registered transcript-level CAs. This - # CA -> PA -> CA expansion builds the full translation graph so we can link - # nucleotide variants to their protein equivalents and vice versa. - try: - canonical_pa_ids = await get_canonical_pa_ids(allele_id) - except requests.exceptions.RequestException as exc: - logger.error( - "ClinGen API request failed for canonical PA lookup of %s.", - allele_id, - extra=job_manager.logging_context(), - exc_info=exc, - ) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, - annotation_data={ - "error_message": f"ClinGen API error looking up PA IDs for {allele_id}: {exc}", - }, - current=True, - ) - total_failed += len(variant_ids) - continue - - if not canonical_pa_ids: - # Noncoding variants won't have protein alleles — this is expected and not an error. - logger.debug( - "No canonical PA IDs found for %s (may be noncoding).", - allele_id, - extra=job_manager.logging_context(), - ) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.NO_LINKED_ALLELE, - annotation_data={ - "error_message": f"No canonical PA IDs for {allele_id}.", - }, - current=True, - ) - total_skipped += len(variant_ids) - continue - - created = 0 - failed = 0 - translation_pairs: set[tuple[str, str]] = set() - for pa_id in canonical_pa_ids: - # Record the direct PA <-> original CA relationship. 
- translation_pairs.add((pa_id, allele_id)) - - # Then expand: find all other CAs registered under this PA so we capture - # alternate transcript-level representations of the same protein change. - try: - ca_ids = await get_matching_registered_ca_ids(pa_id) - except requests.exceptions.RequestException as exc: - logger.error( - "ClinGen API request failed for registered CA lookup of %s.", - pa_id, - extra=job_manager.logging_context(), - exc_info=exc, - ) - failed += 1 - continue - - for ca_id in ca_ids: - translation_pairs.add((pa_id, ca_id)) - - created, existing = upsert_variant_translations(job_manager.db, list(translation_pairs)) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.FAILED if failed > 0 else AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": { - "allele_id": allele_id, - "translation_pairs": [[pa, ca] for pa, ca in translation_pairs], - "translations_new": created, - "translations_existing": existing, - "pa_lookups_failed": failed, - "pa_lookups_total": len(canonical_pa_ids), - }, - }, - current=True, - ) - - total_created += created - total_failed += failed - - elif allele_id.startswith("PA"): - # PA (protein) alleles: directly look up all registered transcript-level CAs. - # This is simpler than the CA path since we already have the protein allele. 
- try: - ca_ids = await get_matching_registered_ca_ids(allele_id) - except requests.exceptions.RequestException as exc: - logger.error( - "ClinGen API request failed for registered CA lookup of %s.", - allele_id, - extra=job_manager.logging_context(), - exc_info=exc, - ) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_API_ERROR, - annotation_data={ - "error_message": f"ClinGen API error for {allele_id}: {exc}", - }, - current=True, - ) - total_failed += len(variant_ids) - continue - - if not ca_ids: - logger.warning( - "No matching registered transcript CA IDs for PA allele %s. This is unexpected.", - allele_id, - extra=job_manager.logging_context(), - ) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.NO_LINKED_ALLELE, - annotation_data={ - "error_message": f"No registered transcript CA IDs for {allele_id}.", - }, - current=True, - ) - total_skipped += len(variant_ids) - continue - - translation_pairs = set([(allele_id, ca_id) for ca_id in ca_ids]) - created, existing = upsert_variant_translations(job_manager.db, list(translation_pairs)) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.SUCCESS, - annotation_data={ - "annotation_metadata": { - "allele_id": allele_id, - "translation_pairs": [[pa, ca] for pa, ca in translation_pairs], - "translations_new": created, - "translations_existing": existing, - }, - }, - current=True, - ) - - total_created += created - - else: - logger.warning( - "Unrecognized ClinGen allele ID format: %s. 
Skipping.", - allele_id, - extra=job_manager.logging_context(), - ) - for vid in variant_ids: - annotation_manager.add_annotation( - variant_id=vid, - annotation_type=AnnotationType.VARIANT_TRANSLATION, - version=None, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.UNSUPPORTED_IDENTIFIER, - annotation_data={ - "error_message": f"Unrecognized allele ID format: {allele_id}", - }, - current=True, - ) - total_skipped += len(variant_ids) - - annotation_manager.flush() - job_manager.db.flush() - - job_manager.save_to_context( - { - "translations_created": total_created, - "alleles_skipped": total_skipped, - "alleles_failed": total_failed, - } - ) - logger.info( - "Completed variant translation population: %s created, %s skipped, %s failed.", - total_created, - total_skipped, - total_failed, - extra=job_manager.logging_context(), - ) - - if total_failed > 0 and total_created == 0: - error_message = f"All {total_failed} variant translation lookups failed for score set {score_set.urn}. Possible ClinGen API outage." - logger.error(error_message, extra=job_manager.logging_context()) - job_manager.db.flush() - return JobExecutionOutcome.failed( - reason=error_message, - data={ - "translations_created": 0, - "alleles_skipped": total_skipped, - "alleles_failed": total_failed, - }, - failure_category=FailureCategory.DEPENDENCY_FAILURE, - ) - - job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={ - "translations_created": total_created, - "alleles_skipped": total_skipped, - "alleles_failed": total_failed, - } - ) diff --git a/src/mavedb/worker/jobs/external_services/vep.py b/src/mavedb/worker/jobs/external_services/vep.py index ba577f23..0264938f 100644 --- a/src/mavedb/worker/jobs/external_services/vep.py +++ b/src/mavedb/worker/jobs/external_services/vep.py @@ -1,28 +1,37 @@ """VEP functional consequence jobs for variant effect prediction. 
-This module handles the submission and processing of variant effect predictions -using the Ensembl VEP API. - -The processing is asynchronous, requiring batch submission of HGVS strings -to the VEP API with fallback to Variant Recoder when necessary. +This module links deduplicated alleles to their Ensembl VEP functional consequence. Submission is +batched against the VEP API, with a Variant Recoder fallback for HGVS strings VEP cannot resolve +directly (notably protein HGVS). """ import asyncio import logging import os +from collections import Counter from datetime import date +import requests from sqlalchemy import select from mavedb.lib.annotation_status_manager import AnnotationStatusManager +from mavedb.lib.clingen.alleles import ScoreSetAlleleRow, get_alleles_for_score_set, group_alleles_for_annotation from mavedb.lib.types.workflow import JobExecutionOutcome from mavedb.lib.utils import batched -from mavedb.lib.vep import VEP_CONSEQUENCES, get_functional_consequence, run_variant_recoder +from mavedb.lib.vep import ( + VEP_CONSEQUENCES, + VepLinkVerdict, + VepResolution, + get_ensembl_release, + get_functional_consequence, + link_vep_consequences_to_alleles, + run_variant_recoder, +) from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus -from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant +from mavedb.models.vep_allele_consequence import VepAlleleConsequence from mavedb.worker.jobs.utils.setup import validate_job_params from mavedb.worker.lib.decorators.pipeline_management import with_pipeline_management from mavedb.worker.lib.managers.job_manager import JobManager @@ -34,17 +43,198 @@ _RECODER_CONCURRENCY = int(os.getenv("RECODER_CONCURRENCY", "5")) +def 
_annotate_vep( + annotation_manager: AnnotationStatusManager, + allele_id: int, + disposition: Disposition, + reason: EventReason, + *, + source_version: str, + error_message: str | None = None, + metadata: dict | None = None, +) -> None: + """Record one VEP_FUNCTIONAL_CONSEQUENCE event for an allele (the consequence is an allele-level fact). + + The single choke point for VEP's status writes. One event per allele, stamped with the Ensembl + release queried; provenance (which variants drove the linkage) is derived from the live links + as-of the event, not fanned out here. The consequence value itself is not embedded — it lives in + the ``VepAlleleConsequence`` value table, joinable by ``allele_id`` + ``source_version``. + """ + meta = dict(metadata or {}) + if error_message is not None: + meta["error_message"] = error_message + + annotation_manager.record_event( + AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, + allele_id=allele_id, + disposition=disposition, + reason=reason, + source_version=source_version, + metadata=meta or None, + ) + + +def _vep_hgvs_payload(row: ScoreSetAlleleRow) -> str | None: + """Build VEP's HGVS input for an allele: each allele will only have one of these, so the first + non-null is the one to submit. + """ + return row.hgvs_g or row.hgvs_c or row.hgvs_p or None + + +async def _resolve_consequences(unique_hgvs: list[str], job_manager: JobManager) -> VepResolution: + """Resolve a set of HGVS strings to their most-severe VEP consequence, splitting empty from errored. + + Phase 1 submits the HGVS strings to VEP. Phase 2 runs Variant Recoder on the misses (a VEP entry + with a null consequence is treated as a miss — VEP knew the variant but could not classify it). + Phase 3 re-submits the recoded genomic strings to VEP and maps the most-severe consequence back to + the original HGVS. 
+ + Returns a :class:`VepResolution`: ``consequences`` for the hits, and ``errored`` for HGVS whose VEP + or Recoder *request failed* (HTTP/transport error after retries) — those are unknown and must be + retried, never conflated with a genuine empty. An input in neither set was queried successfully and + yielded no consequence (a genuine empty). A failed batch does not abort the run: only that batch's + inputs are marked errored and processing continues. + """ + all_consequences: dict[str, str] = {} + errored: set[str] = set() + batches = list(batched(unique_hgvs, _VEP_BATCH_SIZE)) + + # --- Phase 1: initial VEP pass --- + all_missing_hgvs: set[str] = set() + for batch_idx, batch in enumerate(batches): + try: + consequences = await get_functional_consequence(list(batch)) + # VEP rejected the batch (e.g. 400 for protein HGVS it cannot parse). Route all + # items to Variant Recoder — the input may still be resolvable via genomic recoding. + except requests.exceptions.HTTPError: + logger.warning( + msg=f"VEP returned an HTTP error for batch {batch_idx + 1}/{len(batches)} ({len(batch)} HGVS); routing to Variant Recoder.", + extra=job_manager.logging_context(), + ) + all_missing_hgvs.update(batch) + continue + # Transport/network error — result unknown. Mark errored and do not route to Recoder. + except requests.exceptions.RequestException as exc: + logger.warning( + msg=f"VEP request failed for batch {batch_idx + 1}/{len(batches)} ({len(batch)} HGVS); marking errored.", + exc_info=exc, + extra=job_manager.logging_context(), + ) + errored.update(batch) + continue + + hit_consequences = {h: c for h, c in consequences.items() if c is not None} + all_consequences.update(hit_consequences) + all_missing_hgvs.update(set(batch) - set(hit_consequences.keys())) + + job_manager.update_progress( + int((batch_idx + 1) / len(batches) * 33), + 100, + f"Processed initial VEP batch {batch_idx + 1}/{len(batches)}", + ) + + logger.info( + msg=f"Completed initial VEP processing. 
{len(all_missing_hgvs)} HGVS strings require Variant Recoder fallback.", + extra=job_manager.logging_context(), + ) + + if not all_missing_hgvs: + return VepResolution(all_consequences, errored) + + # --- Phase 2: Variant Recoder fallback for HGVS strings VEP could not resolve --- + recoder_batch_list = list(batched(list(all_missing_hgvs), _RECODER_BATCH_SIZE)) + semaphore = asyncio.Semaphore(_RECODER_CONCURRENCY) + completed_recoder_batches = 0 + + async def _recoder_with_semaphore(batch: list[str], total: int) -> dict[str, list[str]]: + nonlocal completed_recoder_batches + async with semaphore: + result = await run_variant_recoder(batch) + completed_recoder_batches += 1 + job_manager.update_progress( + 33 + int(completed_recoder_batches / total * 33), + 100, + f"Completed Variant Recoder batch {completed_recoder_batches}/{total}", + ) + return result + + total_recoder_batches = len(recoder_batch_list) + recoder_results = await asyncio.gather( + *[_recoder_with_semaphore(list(b), total_recoder_batches) for b in recoder_batch_list], + return_exceptions=True, + ) + + # A failed Recoder batch marks its inputs errored (they were misses; we still don't know them) and + # the run continues — one bad batch must not abort every allele's annotation. + hgvs_to_genomic: dict[str, list[str]] = {} + for batch, result in zip(recoder_batch_list, recoder_results): + if isinstance(result, BaseException): + logger.warning( + msg=f"Variant Recoder request failed for a batch of {len(batch)} HGVS; marking errored.", + exc_info=result, + extra=job_manager.logging_context(), + ) + errored.update(batch) + continue + hgvs_to_genomic.update(result) + + logger.info( + msg=f"Completed Variant Recoder processing. 
{len(hgvs_to_genomic)} HGVS strings successfully recoded.", + extra=job_manager.logging_context(), + ) + + # --- Phase 3: VEP pass on the recoded genomic HGVS strings --- + all_recoded_genomic_hgvs = list({g for genomic_list in hgvs_to_genomic.values() for g in genomic_list}) + recoded_vep_batch_list = list(batched(all_recoded_genomic_hgvs, _VEP_BATCH_SIZE)) + all_recoded_consequences: dict[str, str | None] = {} + errored_genomic: set[str] = set() + + for recoded_idx, recoded_batch in enumerate(recoded_vep_batch_list): + try: + all_recoded_consequences.update(await get_functional_consequence(list(recoded_batch))) + except requests.exceptions.RequestException as exc: + logger.warning( + msg=f"VEP request failed for recoded batch {recoded_idx + 1}/{len(recoded_vep_batch_list)} " + f"({len(recoded_batch)} HGVS); marking errored.", + exc_info=exc, + extra=job_manager.logging_context(), + ) + errored_genomic.update(recoded_batch) + job_manager.update_progress( + 66 + int((recoded_idx + 1) / len(recoded_vep_batch_list) * 33), + 100, + f"Processed recoded VEP batch {recoded_idx + 1}/{len(recoded_vep_batch_list)}", + ) + + # Map the most-severe consequence from the recoded genomic strings back to the original HGVS. An + # original HGVS is a hit if any recoded form resolved; else errored if a recoded query failed and + # nothing resolved; else a genuine empty (Recoder ran, VEP found no consequence) — left out of both. 
+ for original_hgvs, recoded_hgvs_list in hgvs_to_genomic.items(): + recoded_consequences = [c for h in recoded_hgvs_list if (c := all_recoded_consequences.get(h))] + most_severe = next((c for c in VEP_CONSEQUENCES if c in recoded_consequences), None) + if most_severe: + all_consequences[original_hgvs] = most_severe + elif any(g in errored_genomic for g in recoded_hgvs_list): + errored.add(original_hgvs) + + return VepResolution(all_consequences, errored) + + @with_pipeline_management async def populate_vep_for_score_set(ctx: dict, job_id: int, job_manager: JobManager) -> JobExecutionOutcome: - """Populate VEP functional consequence predictions for all mapped variants in a ScoreSet. + """Link deduplicated alleles to their VEP functional consequence. - This function retrieves all mapped variants with a populated hgvs_assay_level field for a given - ScoreSet and submits them to the Ensembl VEP API in configurable batches. It handles fallback - to the Variant Recoder API for variants that cannot be processed by VEP directly. + Runs over the score set's current alleles (authoritative and RT-derived), submits each allele's + HGVS to VEP (with Variant Recoder fallback), and stores the most-severe consequence in a valid-time + :class:`VepAlleleConsequence`, superseding only on change. Job Parameters: - - score_set_id (int): The ID of the ScoreSet containing mapped variants. + - score_set_id (int): The ID of the ScoreSet whose alleles to process. - correlation_id (str): Correlation ID for tracing requests across services. + - force (bool, optional): Bypass the current-release skip and re-query every HGVS-bearing + allele. The linker still supersedes only on a value change, so a forced re-run of unchanged + data writes no new rows. Use for re-ingestion, to heal suspected corruption, or after editing + the VEP_CONSEQUENCES severity ordering (a change the release version cannot see). Args: ctx (dict): The job context dictionary. 
@@ -52,7 +242,7 @@ async def populate_vep_for_score_set(ctx: dict, job_id: int, job_manager: JobMan job_manager (JobManager): Manager for job lifecycle and DB operations. Returns: - JobExecutionOutcome: Outcome with counts of processed, successful, and failed variants. + JobExecutionOutcome: outcome with per-allele created/preexisting/skipped counts. """ job = job_manager.get_job() @@ -62,6 +252,7 @@ async def populate_vep_for_score_set(ctx: dict, job_id: int, job_manager: JobMan # Safely ignore mypy warnings here, as params were checked above. score_set = job_manager.db.scalars(select(ScoreSet).where(ScoreSet.id == job.job_params["score_set_id"])).one() # type: ignore correlation_id = job.job_params["correlation_id"] # type: ignore + force = bool(job.job_params.get("force", False)) # type: ignore[union-attr] job_manager.save_to_context( { @@ -71,348 +262,156 @@ async def populate_vep_for_score_set(ctx: dict, job_id: int, job_manager: JobMan "correlation_id": correlation_id, } ) - job_manager.update_progress(0, 100, "Starting VEP population.") - logger.info(msg="Started VEP population", extra=job_manager.logging_context()) - - mapped_variants = job_manager.db.scalars( - select(MappedVariant) - .join(Variant) - .where( - Variant.score_set_id == score_set.id, - MappedVariant.current.is_(True), - MappedVariant.post_mapped.isnot(None), - ) - ).all() - - if not mapped_variants: - logger.warning( - msg=f"No mapped variants found for score set {score_set.urn}. 
Skipped VEP population.", - extra=job_manager.logging_context(), - ) - job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={ - "variants_processed": 0, - "variants_with_consequences": 0, - "variants_without_consequences": 0, - "variants_recoder_failed": 0, - } - ) - - job_manager.save_to_context({"total_variants_to_process": len(mapped_variants)}) - logger.info( - msg=f"Found {len(mapped_variants)} mapped variants for VEP processing", - extra=job_manager.logging_context(), + job_manager.update_progress(0, 100, "Starting VEP consequence linkage.") + logger.info(msg="Started VEP consequence linkage", extra=job_manager.logging_context()) + + # One work-unit per allele (payload = HGVS; alleles without one are skipped). Events are allele-keyed, + # so each allele records its own event. + allele_data = group_alleles_for_annotation( + get_alleles_for_score_set(job_manager.db, score_set.id), + payload=_vep_hgvs_payload, ) - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) - - mapped_variants_by_id = {mv.id: mv for mv in mapped_variants} - - # Extract HGVS strings; skip and annotate variants that have none. 
- hgvs_and_mapped_variant_id_pairs: list[tuple[str, int]] = [] - - for mapped_variant in mapped_variants: - if not mapped_variant.hgvs_assay_level: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - status=AnnotationStatus.SKIPPED, - failure_category=AnnotationFailureCategory.MISSING_IDENTIFIER, - annotation_data={"error_message": "Mapped variant does not have an associated HGVS string."}, - ) - logger.debug("Mapped variant does not have an associated HGVS string.", extra=job_manager.logging_context()) - continue - - hgvs_and_mapped_variant_id_pairs.append((mapped_variant.hgvs_assay_level, mapped_variant.id)) # type: ignore - - batches = list(batched(hgvs_and_mapped_variant_id_pairs, _VEP_BATCH_SIZE)) - - job_manager.save_to_context({"vep_batches": len(batches)}) - logger.debug( - msg=f"Prepared {len(batches)} VEP batches ({_VEP_BATCH_SIZE} variants/batch)", - extra=job_manager.logging_context(), + annotation_counts: Counter[str] = Counter( + { + "created_allele_count": 0, + "preexisting_allele_count": 0, + "absent_allele_count": 0, + "errored_allele_count": 0, + } ) - # --- Phase 1: Initial VEP pass --- - all_consequences: dict[str, str] = {} - all_missing_hgvs: set[str] = set() - - for batch_idx, batch in enumerate(batches): - logger.debug( - msg=f"Processing VEP batch {batch_idx + 1}/{len(batches)}", - extra=job_manager.logging_context(), - ) - - hgvs_strings, mapped_variant_ids = map(list, zip(*batch)) # type: ignore + num_alleles_with_hgvs = len(allele_data) + job_manager.save_to_context({"num_alleles_to_link_vep": num_alleles_with_hgvs}) - consequences = await get_functional_consequence(hgvs_strings) - logger.debug( - msg=f"Received consequences for {len(consequences)} variants in VEP batch {batch_idx + 1}", + if not allele_data: + logger.warning( + msg="No current alleles with HGVS were found for this score set. 
Skipping VEP linkage (nothing to do).", extra=job_manager.logging_context(), ) - - # Only store variants where VEP returned an actual consequence string. A None value - # means VEP knew the variant but couldn't classify it — treat that the same as absent - # and route to Recoder so we have the best chance of getting a consequence. - hit_consequences = {h: c for h, c in consequences.items() if c is not None} - all_consequences.update(hit_consequences) - - missing_hgvs = set(hgvs_strings) - set(hit_consequences.keys()) - for hgvs, mapped_variant_id in zip(hgvs_strings, mapped_variant_ids): - if hgvs in missing_hgvs: - all_missing_hgvs.add(hgvs) - - progress_pct = int((batch_idx + 1) / len(batches) * 33) - job_manager.save_to_context( - { - "initial_vep_batches_processed": batch_idx + 1, - "missing_hgvs_count": len(all_missing_hgvs), - } - ) - job_manager.update_progress( - progress_pct, - 100, - f"Processed initial VEP batch {batch_idx + 1}/{len(batches)}", + job_manager.db.flush() + return JobExecutionOutcome.succeeded(data=dict(annotation_counts)) + + all_allele_ids = set(allele_data.keys()) + + # The Ensembl release version-keys the run (coordinated software + transcript set + vocabulary). It + # is load-bearing for the skip below, so a failure here aborts the job rather than mis-versioning + # writes (the exception propagates to the job decorators). 
+ ensembl_release = await get_ensembl_release() + job_manager.save_to_context({"ensembl_release": ensembl_release}) + + def alleles_at_current_release(allele_ids: set[int]) -> set[int]: + """Allele ids (within the given set) holding a live VEP consequence at the current Ensembl release.""" + if not allele_ids: + return set() + return set( + job_manager.db.scalars( + select(VepAlleleConsequence.allele_id) + .where(VepAlleleConsequence.allele_id.in_(allele_ids)) + .where(VepAlleleConsequence.current) + .where(VepAlleleConsequence.functional_consequence.isnot(None)) + .where(VepAlleleConsequence.source_version == ensembl_release) + ).all() ) - logger.info( - msg=f"Completed initial VEP processing. {len(all_missing_hgvs)} variants require Variant Recoder fallback.", - extra=job_manager.logging_context(), + # Skip alleles already resolved at the current Ensembl release (they cannot change without a + # release bump). force re-queries all — including alleles unchanged upstream but whose VEP_CONSEQUENCES + # severity ordering we have since edited; the linker still supersedes only on a value change, so a + # forced no-op writes nothing. 
+ already_current = set() if force else alleles_at_current_release(all_allele_ids) + hgvs_by_allele = {aid: allele_data[aid] for aid in allele_data if aid not in already_current} + unique_hgvs = sorted(set(hgvs_by_allele.values())) + job_manager.save_to_context( + { + "num_alleles_already_current": len(already_current), + "num_hgvs_to_query": len(unique_hgvs), + "force": force, + } ) - # --- Phase 2: Variant Recoder fallback for HGVS strings VEP could not resolve --- - hgvs_to_genomic: dict[str, list[str]] = {} - recoder_missing_hgvs: set[str] = set() - - if all_missing_hgvs: - logger.info( - msg=f"Running Variant Recoder for {len(all_missing_hgvs)} HGVS strings", - extra=job_manager.logging_context(), - ) - - recoder_batch_list = list(batched(list(all_missing_hgvs), _RECODER_BATCH_SIZE)) - - logger.debug( - msg=f"Running {len(recoder_batch_list)} Variant Recoder batches with concurrency {_RECODER_CONCURRENCY}", - extra=job_manager.logging_context(), - ) - - semaphore = asyncio.Semaphore(_RECODER_CONCURRENCY) - completed_recoder_batches = 0 - - async def _recoder_with_semaphore(batch: list[str], batch_idx: int, total: int) -> dict[str, list[str]]: - nonlocal completed_recoder_batches - async with semaphore: - logger.debug( - msg=f"Starting Variant Recoder batch {batch_idx + 1}/{total} ({len(batch)} HGVS strings)", - extra=job_manager.logging_context(), - ) - result = await run_variant_recoder(batch) - completed_recoder_batches += 1 - logger.debug( - msg=f"Completed Variant Recoder batch {completed_recoder_batches}/{total} ({len(result)} variants recoded)", - extra=job_manager.logging_context(), - ) - progress_pct = 33 + int(completed_recoder_batches / total * 33) - job_manager.update_progress( - progress_pct, - 100, - f"Completed Variant Recoder batch {completed_recoder_batches}/{total}", - ) - return result - - total_recoder_batches = len(recoder_batch_list) - recoder_results = await asyncio.gather( - *[ - _recoder_with_semaphore(list(recoder_batch), idx, 
total_recoder_batches) - for idx, recoder_batch in enumerate(recoder_batch_list) - ], - return_exceptions=True, - ) - - successful_batches = sum(1 for r in recoder_results if not isinstance(r, Exception)) - - first_exception = next((r for r in recoder_results if isinstance(r, Exception)), None) - if first_exception is not None: - logger.error( - msg=f"Variant Recoder error ({successful_batches}/{total_recoder_batches} batches succeeded): {str(first_exception)}", - extra=job_manager.logging_context(), - ) - raise first_exception - - for result in recoder_results: - hgvs_to_genomic.update(result) # type: ignore[arg-type] - - job_manager.save_to_context( - { - "variant_recoder_batches_processed": len(recoder_batch_list), - "recoded_variants_count": len(hgvs_to_genomic), - } + verdicts: dict[int, VepLinkVerdict] = {} + errored_allele_ids: set[int] = set() + if unique_hgvs: + job_manager.update_progress(10, 100, f"Querying VEP for {len(unique_hgvs)} HGVS strings.") + resolution = await _resolve_consequences(unique_hgvs, job_manager) + consequence_by_allele_id = {aid: resolution.consequences.get(hgvs) for aid, hgvs in hgvs_by_allele.items()} + # Alleles whose VEP/Recoder request failed: unknown, not a negative — kept distinct from empties. + errored_allele_ids = {aid for aid, hgvs in hgvs_by_allele.items() if hgvs in resolution.errored} + verdicts = link_vep_consequences_to_alleles( + job_manager.db, consequence_by_allele_id, source_version=ensembl_release, access_date=date.today() ) + job_manager.db.flush() + else: logger.info( - msg=f"Completed Variant Recoder processing. 
{len(hgvs_to_genomic)} variants successfully recoded.", + msg="All HGVS-bearing alleles are already resolved at the current Ensembl release; skipping VEP query.", extra=job_manager.logging_context(), ) + job_manager.update_progress(99, 100, "All alleles already current at this Ensembl release.") - # --- Phase 3: VEP pass on the recoded genomic HGVS strings --- - # hgvs_to_genomic maps original HGVS → list[str]; flatten to a deduplicated list of - # genomic strings before batching with VEP. - all_recoded_genomic_hgvs = list({g for genomic_list in hgvs_to_genomic.values() for g in genomic_list}) - recoded_vep_batch_list = list(batched(all_recoded_genomic_hgvs, _VEP_BATCH_SIZE)) - all_recoded_consequences: dict[str, str | None] = {} - - for recoded_vep_batch_idx, recoded_vep_batch in enumerate(recoded_vep_batch_list): - logger.debug( - msg=f"Processing recoded HGVS VEP batch {recoded_vep_batch_idx + 1}/{len(recoded_vep_batch_list)}", - extra=job_manager.logging_context(), + annotation_manager = AnnotationStatusManager( + job_manager.db, job_run_id=job_manager.job_id, score_set_id=score_set.id + ) + for allele_id, hgvs in allele_data.items(): + verdict = verdicts.get(allele_id) + if verdict is VepLinkVerdict.CREATED: + annotation_counts["created_allele_count"] += 1 + _annotate_vep( + annotation_manager, + allele_id, + Disposition.PRESENT, + EventReason.CREATED, + source_version=ensembl_release, + metadata={"hgvs": hgvs}, ) - recoded_vep_consequences = await get_functional_consequence(recoded_vep_batch) - all_recoded_consequences.update(recoded_vep_consequences) - - progress_pct = 66 + int((recoded_vep_batch_idx + 1) / len(recoded_vep_batch_list) * 33) - job_manager.save_to_context( - { - "recoded_vep_batches_processed": recoded_vep_batch_idx + 1, - "recoded_consequences_count": len(all_recoded_consequences), - } + elif allele_id in already_current or verdict is VepLinkVerdict.UNCHANGED: + annotation_counts["preexisting_allele_count"] += 1 + _annotate_vep( + 
annotation_manager, + allele_id, + Disposition.PRESENT, + EventReason.PREEXISTING, + source_version=ensembl_release, + metadata={"hgvs": hgvs}, ) - job_manager.update_progress( - progress_pct, - 100, - f"Processed recoded VEP batch {recoded_vep_batch_idx + 1}/{len(recoded_vep_batch_list)}", - ) - - logger.info( - msg=f"Completed recoded VEP processing. {len(all_recoded_consequences)} recoded consequences retrieved.", - extra=job_manager.logging_context(), - ) - - # Map most-severe consequence from recoded genomic HGVS back to the original HGVS. - for original_hgvs, recoded_hgvs_list in hgvs_to_genomic.items(): - recoded_consequences_for_variant = [ - c for recoded_hgvs in recoded_hgvs_list if (c := all_recoded_consequences.get(recoded_hgvs)) - ] - - if recoded_consequences_for_variant: - most_severe = next( - (c for c in VEP_CONSEQUENCES if c in recoded_consequences_for_variant), - None, - ) - if most_severe: - all_consequences[original_hgvs] = most_severe - logger.debug( - msg=f"Selected most severe consequence '{most_severe}' for {original_hgvs}", - extra=job_manager.logging_context(), - ) - else: - logger.debug( - msg=f"Could not retrieve functional consequences for any recoded variants of {original_hgvs}", - extra=job_manager.logging_context(), - ) - - recoder_missing_hgvs = all_missing_hgvs - set(hgvs_to_genomic.keys()) - - # --- Phase 4: Annotate outcomes and update mapped variants in a single pass --- - - # HGVS strings that went through both VEP passes but still have no consequence. 
- all_processed_hgvs = {h for h, _ in hgvs_and_mapped_variant_id_pairs} - vep_failed_hgvs = all_processed_hgvs - set(all_consequences.keys()) - recoder_missing_hgvs - - variants_processed = 0 - variants_with_consequences = 0 - variants_without_consequences = 0 - variants_recoder_failed = 0 - - for hgvs_string, mapped_variant_id in hgvs_and_mapped_variant_id_pairs: - mapped_variant = mapped_variants_by_id.get(mapped_variant_id) # type: ignore - if mapped_variant is None: - continue - consequence = all_consequences.get(hgvs_string) - if consequence: - mapped_variant.vep_functional_consequence = consequence - mapped_variant.vep_access_date = date.today() - job_manager.db.add(mapped_variant) - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - status=AnnotationStatus.SUCCESS, - annotation_data={"annotation_metadata": {"functional_consequence": consequence}}, - ) - variants_with_consequences += 1 - logger.debug( - msg=f"Set consequence '{consequence}' for mapped variant {mapped_variant_id} (HGVS: {hgvs_string})", - extra=job_manager.logging_context(), - ) - elif hgvs_string in vep_failed_hgvs: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND, - annotation_data={ - "error_message": "VEP could not determine a functional consequence for this variant, even after Variant Recoder fallback.", - }, - ) - variants_without_consequences += 1 - logger.debug( - msg=f"Recorded VEP failure for mapped_variant_id {mapped_variant_id} (HGVS: {hgvs_string})", - extra=job_manager.logging_context(), - ) - elif hgvs_string in recoder_missing_hgvs: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - 
annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND, - annotation_data={ - "error_message": "Variant Recoder could not recode this HGVS string to a genomic equivalent.", - }, - ) - variants_recoder_failed += 1 - logger.debug( - msg=f"Recorded Variant Recoder failure for mapped_variant_id {mapped_variant_id} (HGVS: {hgvs_string})", - extra=job_manager.logging_context(), + elif allele_id in errored_allele_ids: + annotation_counts["errored_allele_count"] += 1 + _annotate_vep( + annotation_manager, + allele_id, + Disposition.FAILED, + EventReason.API_ERROR, + source_version=ensembl_release, + error_message="The VEP/Variant Recoder request for this allele failed; result unknown.", + metadata={"hgvs": hgvs}, ) + else: - annotation_manager.add_annotation( - variant_id=mapped_variant.variant_id, # type: ignore - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.UNKNOWN, - annotation_data={ - "error_message": "Variant was not classified by any VEP outcome branch. 
This is a bug.", - }, - ) - variants_without_consequences += 1 - logger.warning( - msg=f"Unexpected state: mapped_variant_id {mapped_variant_id} (HGVS: {hgvs_string}) was not classified by any outcome branch.", - extra=job_manager.logging_context(), + annotation_counts["absent_allele_count"] += 1 + _annotate_vep( + annotation_manager, + allele_id, + Disposition.ABSENT, + EventReason.NO_RECORD, + source_version=ensembl_release, + error_message="VEP found no functional consequence for this allele, even after Variant Recoder fallback.", + metadata={"hgvs": hgvs}, ) - variants_processed += 1 - annotation_manager.flush() - job_manager.db.flush() + outcome_data = dict(annotation_counts) + job_manager.save_to_context(outcome_data) job_manager.update_progress( 100, 100, - f"Completed VEP functional consequence prediction for {variants_with_consequences}/{variants_processed} variants.", - ) - logger.info( - msg=f"Completed VEP prediction: {variants_with_consequences} with consequences, {variants_without_consequences} without, {variants_recoder_failed} recoder failed", - extra=job_manager.logging_context(), + ( + f"Completed VEP linkage: {annotation_counts['created_allele_count'] + annotation_counts['preexisting_allele_count']} linked, " + f"{annotation_counts['absent_allele_count']} absent (no result), " + f"{annotation_counts['errored_allele_count']} errored." 
+ ), ) - + logger.info(msg="Done linking VEP consequences to alleles.", extra=job_manager.logging_context()) job_manager.db.flush() - return JobExecutionOutcome.succeeded( - data={ - "variants_processed": variants_processed, - "variants_with_consequences": variants_with_consequences, - "variants_without_consequences": variants_without_consequences, - "variants_recoder_failed": variants_recoder_failed, - } - ) + return JobExecutionOutcome.succeeded(data=outcome_data) diff --git a/src/mavedb/worker/jobs/registry.py b/src/mavedb/worker/jobs/registry.py index c3fb8fd2..c2e3d1b5 100644 --- a/src/mavedb/worker/jobs/registry.py +++ b/src/mavedb/worker/jobs/registry.py @@ -18,8 +18,6 @@ from mavedb.worker.jobs.external_services import ( link_gnomad_variants, poll_uniprot_mapping_jobs_for_score_set, - populate_hgvs_for_score_set, - populate_variant_translations_for_score_set, populate_vep_for_score_set, refresh_clinvar_controls, submit_score_set_mappings_to_car, @@ -49,8 +47,6 @@ submit_uniprot_mapping_jobs_for_score_set, poll_uniprot_mapping_jobs_for_score_set, link_gnomad_variants, - populate_hgvs_for_score_set, - populate_variant_translations_for_score_set, populate_vep_for_score_set, # Data management jobs refresh_materialized_views, @@ -158,20 +154,6 @@ "key": "link_gnomad_variants", "type": JobType.MAPPED_VARIANT_ANNOTATION, }, - populate_hgvs_for_score_set: { - "dependencies": [], - "params": {"score_set_id": None, "correlation_id": None}, - "function": "populate_hgvs_for_score_set", - "key": "populate_hgvs_for_score_set", - "type": JobType.MAPPED_VARIANT_ANNOTATION, - }, - populate_variant_translations_for_score_set: { - "dependencies": [], - "params": {"score_set_id": None, "correlation_id": None}, - "function": "populate_variant_translations_for_score_set", - "key": "populate_variant_translations_for_score_set", - "type": JobType.MAPPED_VARIANT_ANNOTATION, - }, populate_vep_for_score_set: { "dependencies": [], "params": {"score_set_id": None, "correlation_id": 
None}, diff --git a/src/mavedb/worker/jobs/variant_processing/mapping.py b/src/mavedb/worker/jobs/variant_processing/mapping.py index 9e6a8da7..b2f588b7 100644 --- a/src/mavedb/worker/jobs/variant_processing/mapping.py +++ b/src/mavedb/worker/jobs/variant_processing/mapping.py @@ -31,7 +31,8 @@ from mavedb.models.allele import Allele as AlleleDbModel from mavedb.models.enums.annotation_layer import AnnotationLayer from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.job_pipeline import FailureCategory from mavedb.models.enums.mapping_state import MappingState from mavedb.models.mapping_record import MappingRecord from mavedb.models.mapping_record_allele import MappingRecordAllele @@ -268,7 +269,8 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan f"Processing {total_variants} mapped variants for score set {score_set.urn}.", extra=job_manager.logging_context(), ) - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job.id) + + annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job.id, score_set_id=score_set.id) for mapped_score in mapped_scores: variant_urn = mapped_score.get("mavedb_id") variant = job_manager.db.scalars(select(Variant).where(Variant.urn == variant_urn)).one() @@ -349,32 +351,28 @@ async def map_variants_for_score_set(ctx: dict, job_id: int, job_manager: JobMan else: job_manager.db.add(mapping_record) - # MAPPED -> success; benign absences -> skipped; FAILED -> failed. The raw outcome - # is preserved in annotation_metadata so the benign distinction survives. + # The mapper emits a MappingRecord for EVERY variant, so "a record exists" carries no + # signal — the signal is whether a real allele resulted. MAPPED yields an authoritative + # allele -> present. 
A benign absence (intronic, synonymous) produces a record but no + # allele: an informative biological negative -> absent. FAILED -> failed. `reason` reuses + # the MappingOutcome vocabulary so the intronic-vs-no-protein distinction survives. if outcome is MappingOutcome.MAPPED: - annotation_status = AnnotationStatus.SUCCESS - annotation_failure_category = None + disposition = Disposition.PRESENT elif outcome.is_benign_absence: - annotation_status = AnnotationStatus.SKIPPED - annotation_failure_category = None + disposition = Disposition.ABSENT else: - annotation_status = AnnotationStatus.FAILED - annotation_failure_category = AnnotationFailureCategory.EXTERNAL_SERVICE_REJECTED - - annotation_manager.add_annotation( - variant_id=variant.id, # type: ignore - annotation_type=AnnotationType.VRS_MAPPING, - version=tool_version, - status=annotation_status, - failure_category=annotation_failure_category, - annotation_data={ - "error_message": mapped_score.get("error_message", null()), - "annotation_metadata": { - "outcome": outcome.value, - "mapped_assay_level_hgvs": assay_level_hgvs, - }, + disposition = Disposition.FAILED + + annotation_manager.record_event( + AnnotationType.VRS_MAPPING, + variant_id=variant.id, + disposition=disposition, + reason=outcome.value, + source_version=tool_version, + metadata={ + "mapped_assay_level_hgvs": assay_level_hgvs, + "error_message": mapped_score.get("error_message"), }, - current=True, ) # Only variants with a post-mapped representation yield an authoritative Allele; diff --git a/src/mavedb/worker/jobs/variant_processing/reverse_translation.py b/src/mavedb/worker/jobs/variant_processing/reverse_translation.py index 07df529f..d7f17e91 100644 --- a/src/mavedb/worker/jobs/variant_processing/reverse_translation.py +++ b/src/mavedb/worker/jobs/variant_processing/reverse_translation.py @@ -11,12 +11,13 @@ import dataclasses import functools import logging +from collections import Counter from datetime import date -from enum import Enum 
from typing import Any, NamedTuple, Sequence from ga4gh.vrs.extras.translator import AlleleTranslator from sqlalchemy import select +from variant_annotation import __version__ as variant_annotation_version from variant_annotation.lib.accessions import looks_like_refseq_protein_accession from variant_annotation.lib.translation import construct_equivalent_variants from variant_annotation.lib.translation.types import TranslationConfig, VariantInput, WtCodonMode @@ -29,7 +30,9 @@ from mavedb.models.allele import Allele as AlleleDbModel from mavedb.models.enums.annotation_layer import AnnotationLayer from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, FailureCategory +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason +from mavedb.models.enums.job_pipeline import FailureCategory from mavedb.models.enums.target_category import TargetCategory from mavedb.models.mapping_record import MappingRecord from mavedb.models.mapping_record_allele import MappingRecordAllele @@ -64,36 +67,21 @@ class _TranscriptResolution(NamedTuple): target_gene_id: int | None -class _TranscriptResolutionSkipReason(Enum): - NO_ASSAY_LEVEL_HGVS = "no_assay_level_hgvs" - TRANSCRIPT_UNRESOLVED = "transcript_unresolved" - NO_CODING_TRANSCRIPT = "no_coding_transcript" - - @classmethod - def classify( - cls, resolution: _TranscriptResolution, category: TargetCategory | None - ) -> tuple["_TranscriptResolutionSkipReason", str]: - """Classify why a record was skipped. - - Protein-coding target with no transcript → recoverable (transcript_unresolved). - Non-coding/regulatory target → correct skip (no_coding_transcript). 
- """ - if not resolution.rec.hgvs_assay_level: - return ( - cls.NO_ASSAY_LEVEL_HGVS, - "No assay-level HGVS available to reverse-translate.", - ) - if category == TargetCategory.protein_coding: - return ( - cls.TRANSCRIPT_UNRESOLVED, - "Protein-coding target but no coding transcript could be resolved " - "(no cdna TargetGeneMapping and no NP_->NM_ association). Recoverable: " - "re-map or check transcript selection.", - ) - return ( - cls.NO_CODING_TRANSCRIPT, - "Non-coding/regulatory target has no protein consequence to reverse-translate.", - ) +def _classify_skip( + resolution: _TranscriptResolution, category: TargetCategory | None +) -> tuple[EventReason, Disposition]: + """Classify why a record was skipped, returning its reason and event disposition. + + ``no_assay_level_hgvs`` — no input to translate, we could not ask → ``not_applicable``. + ``transcript_unresolved`` — protein-coding target with no transcript, a recoverable pipeline + gap → ``failed``. ``no_coding_transcript`` — non-coding target has no protein consequence, a + biological negative → ``absent``. + """ + if not resolution.rec.hgvs_assay_level: + return (EventReason.NO_ASSAY_LEVEL_HGVS, Disposition.NOT_APPLICABLE) + if category == TargetCategory.protein_coding: + return (EventReason.TRANSCRIPT_UNRESOLVED, Disposition.FAILED) + return (EventReason.NO_CODING_TRANSCRIPT, Disposition.ABSENT) def _coding_transcripts_for_proteins(protein_accessions: set[str]) -> dict[str, str]: @@ -147,6 +135,33 @@ def _build_translation_config(overrides: dict[str, Any] | None) -> TranslationCo raise ValueError(f"Invalid translation_config: {exc}") from exc +def _annotate_translation( + annotation_manager: AnnotationStatusManager, + variant_id: int, + disposition: Disposition, + reason: EventReason, + *, + error_message: str | None = None, + metadata: dict | None = None, +) -> None: + """Record one CROSS_LEVEL_TRANSLATION event for a variant (the translation is a variant-level fact). 
+ + The single choke point for RT's status writes. + """ + meta = dict(metadata or {}) + if error_message is not None: + meta["error_message"] = error_message + + annotation_manager.record_event( + AnnotationType.CROSS_LEVEL_TRANSLATION, + variant_id=variant_id, + disposition=disposition, + reason=reason, + source_version=variant_annotation_version, + metadata=meta or None, + ) + + @with_pipeline_management async def reverse_translate_variants_for_score_set( ctx: dict, job_id: int, job_manager: JobManager @@ -227,13 +242,15 @@ async def reverse_translate_variants_for_score_set( .all() ) + annotation_counts: Counter[str] = Counter({"translated": 0, "failed": 0, "skipped": 0, "alleles_created": 0}) + if not rows: logger.warning( msg="No current and authoritative mapping records found for this score set.", extra=job_manager.logging_context(), ) job_manager.db.flush() - return JobExecutionOutcome.succeeded(data={"translated": 0, "failed": 0, "skipped": 0, "alleles_created": 0}) + return JobExecutionOutcome.succeeded(data=dict(annotation_counts)) # Genomic/cdna records resolve via the cdna TargetGeneMapping; protein records have no # cdna reference_accession, so collect their NP_ accessions for a batched NP_→NM_ UTA lookup. 
@@ -322,10 +339,9 @@ async def reverse_translate_variants_for_score_set( extra=job_manager.logging_context(), ) - translated = 0 - failed = 0 - alleles_created = 0 - annotation_manager = AnnotationStatusManager(job_manager.db, job_run_id=job_manager.job_id) + annotation_manager = AnnotationStatusManager( + job_manager.db, job_run_id=job_manager.job_id, score_set_id=score_set_id + ) allele_translator = AlleleTranslator(ctx["seqrepo"]) current_record_ids = ( @@ -411,11 +427,12 @@ async def reverse_translate_variants_for_score_set( ) candidate_count += 1 - alleles_created += candidate_count + annotation_counts["alleles_created"] += candidate_count annotation_metadata = { "hgvs_input": result.input.hgvs, "hgvs_c_candidates": result.hgvs_c_candidates, "hgvs_g_candidates": result.hgvs_g_candidates, + "hgvs_p": result.hgvs_p, "alleles_created": candidate_count, "failed_candidates": failed_candidates, } @@ -423,24 +440,23 @@ async def reverse_translate_variants_for_score_set( # No translatable candidates and failures mean the variant failed reverse translation. No # failures and no candidates is a success with no alleles created. 
if candidate_count == 0 and failed_candidates: - failed += 1 - annotation_manager.add_annotation( + annotation_counts["failed"] += 1 + _annotate_translation( + annotation_manager, variant_id=variant.id, - annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.UNKNOWN, - annotation_data={ - "error_message": "All candidate HGVS failed VRS translation.", - "annotation_metadata": annotation_metadata, - }, + disposition=Disposition.FAILED, + reason=EventReason.TRANSLATION_FAILED, + error_message="All candidate HGVS failed VRS translation.", + metadata=annotation_metadata, ) else: - translated += 1 - annotation_manager.add_annotation( + annotation_counts["translated"] += 1 + _annotate_translation( + annotation_manager, variant_id=variant.id, - annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, - status=AnnotationStatus.SUCCESS, - annotation_data={"annotation_metadata": annotation_metadata}, + disposition=Disposition.PRESENT, + reason=EventReason.TRANSLATED, + metadata=annotation_metadata, ) # Supersede prior live derived links atomically. @@ -453,71 +469,56 @@ async def reverse_translate_variants_for_score_set( MappingRecordAllele.mapping_record_id.in_(current_record_ids), ) - # TODO#767: non-substitution consequences (del/ins/delins/fs/ext) have no synonymous + # TODO#767: some non-substitution consequences (del/ins/delins/fs/ext) have no synonymous # equivalence class, so they arrive here as TranslationErrors and are miscounted as # FAILED. Classify by the protein consequence's edit type up front and map these to # SKIPPED instead of pattern-matching engine error strings. 
for error in errors: _rec, variant = variant_input_map[id(error.input)] - failed += 1 - annotation_manager.add_annotation( + annotation_counts["failed"] += 1 + _annotate_translation( + annotation_manager, variant_id=variant.id, - annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, - status=AnnotationStatus.FAILED, - failure_category=AnnotationFailureCategory.UNKNOWN, - annotation_data={ - "error_message": error.error, - "annotation_metadata": {"hgvs_input": error.input.hgvs}, - }, + disposition=Disposition.FAILED, + reason=EventReason.TRANSLATION_ERROR, + metadata={"hgvs_input": error.input.hgvs, "error_message": error.error}, ) - skipped = len(skipped_variants) + annotation_counts["skipped"] = len(skipped_variants) for p in skipped_variants: category = target_category_by_gene.get(p.target_gene_id) if p.target_gene_id is not None else None - skip_category, reason = _TranscriptResolutionSkipReason.classify(p, category) - annotation_manager.add_annotation( + reason, disposition = _classify_skip(p, category) + _annotate_translation( + annotation_manager, variant_id=p.variant.id, - annotation_type=AnnotationType.CROSS_LEVEL_TRANSLATION, - status=AnnotationStatus.SKIPPED, - annotation_data={ - "annotation_metadata": { - "hgvs_input": p.rec.hgvs_assay_level, - "skip_category": skip_category.value, - "reason": reason, - } - }, + disposition=disposition, + reason=reason, + metadata={"hgvs_input": p.rec.hgvs_assay_level}, ) annotation_manager.flush() - job_manager.save_to_context( - { - "translated": translated, - "failed": failed, - "skipped": skipped, - "alleles_created": alleles_created, - } - ) + outcome_data = dict(annotation_counts) + job_manager.save_to_context(outcome_data) logger.info( msg=( - f"Reverse translation complete: {translated} translated, {failed} failed, " - f"{skipped} skipped, {alleles_created} alleles created." 
+ f"Reverse translation complete: {annotation_counts['translated']} translated, " + f"{annotation_counts['failed']} failed, {annotation_counts['skipped']} skipped, " + f"{annotation_counts['alleles_created']} alleles created." ), extra=job_manager.logging_context(), ) job_manager.db.flush() - if translated == 0 and failed > 0: + if annotation_counts["translated"] == 0 and annotation_counts["failed"] > 0: logger.error( msg="All variant reverse translations failed.", extra=job_manager.logging_context(), ) return JobExecutionOutcome.failed( reason="All variant reverse translations failed.", - data={"translated": 0, "failed": failed, "skipped": skipped, "alleles_created": 0}, + data=outcome_data, failure_category=FailureCategory.DATA_ERROR, ) - return JobExecutionOutcome.succeeded( - data={"translated": translated, "failed": failed, "skipped": skipped, "alleles_created": alleles_created} - ) + return JobExecutionOutcome.succeeded(data=outcome_data) diff --git a/tests/helpers/util/score_set.py b/tests/helpers/util/score_set.py index b6d7801a..83ebcbca 100644 --- a/tests/helpers/util/score_set.py +++ b/tests/helpers/util/score_set.py @@ -8,7 +8,7 @@ from fastapi.testclient import TestClient from sqlalchemy import select -from mavedb.models.clinical_control import ClinicalControl as ClinicalControlDbModel +from mavedb.models.clinical_control import ClinvarControl as ClinicalControlDbModel from mavedb.models.gnomad_variant import GnomADVariant as GnomADVariantDbModel from mavedb.models.mapped_variant import MappedVariant as MappedVariantDbModel from mavedb.models.score_set import ScoreSet as ScoreSetDbModel diff --git a/tests/lib/clingen/test_alleles.py b/tests/lib/clingen/test_alleles.py new file mode 100644 index 00000000..44064624 --- /dev/null +++ b/tests/lib/clingen/test_alleles.py @@ -0,0 +1,53 @@ +"""Unit tests for the pure allele-grouping primitive shared by the annotation jobs.""" + +from mavedb.lib.clingen.alleles import ScoreSetAlleleRow, 
group_alleles_for_annotation + + +def _row(allele_id, variant_id, *, caid="CA1"): + return ScoreSetAlleleRow( + allele_id=allele_id, + post_mapped={"type": "Allele"}, + clingen_allele_id=caid, + variant_id=variant_id, + ) + + +def test_collapses_rows_to_one_payload_per_allele(): + rows = [ + _row(1, 10), + _row(1, 11), + _row(2, 12, caid="CA2"), + ] + + groups = group_alleles_for_annotation(rows, payload=lambda r: r.clingen_allele_id) + + assert groups == {1: "CA1", 2: "CA2"} + + +def test_shared_allele_yields_a_single_entry(): + """An allele linked by multiple variants is deduped to one work-unit (events are allele-keyed).""" + rows = [ + _row(1, 10), + _row(1, 11), + ] + + groups = group_alleles_for_annotation(rows, payload=lambda r: r.clingen_allele_id) + + assert groups == {1: "CA1"} + + +def test_payload_returning_none_skips_the_allele(): + """Returning None from payload drops the allele entirely — replacing each job's ad-hoc + 'no CAID / no HGVS -> continue' filter.""" + rows = [ + _row(1, 10, caid=None), + _row(2, 11, caid="CA2"), + ] + + groups = group_alleles_for_annotation(rows, payload=lambda r: r.clingen_allele_id) + + assert groups == {2: "CA2"} + + +def test_empty_rows_yield_empty_grouping(): + assert group_alleles_for_annotation([], payload=lambda r: r.clingen_allele_id) == {} diff --git a/tests/lib/test_annotation_status_manager.py b/tests/lib/test_annotation_status_manager.py index 52771b6b..354ed85f 100644 --- a/tests/lib/test_annotation_status_manager.py +++ b/tests/lib/test_annotation_status_manager.py @@ -5,1362 +5,278 @@ pytest.importorskip("psycopg2") from mavedb.lib.annotation_status_manager import AnnotationStatusManager +from mavedb.models.allele import Allele from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus -from mavedb.models.variant import Variant +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason 
@pytest.fixture def annotation_status_manager(session, job_run): - """Fixture to provide an AnnotationStatusManager instance.""" return AnnotationStatusManager(session, job_run_id=job_run.id) @pytest.fixture -def existing_annotation_status(session, annotation_status_manager, setup_lib_db_with_variant): - """Fixture to create an existing annotation status in the database.""" - - # Add initial annotation - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - assert annotation.id is not None - assert annotation.current is True - - return annotation - - -@pytest.fixture -def existing_unversioned_annotation_status(session, annotation_status_manager, setup_lib_db_with_variant): - """Fixture to create an existing annotation status in the database.""" - - # Add initial annotation - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=None, - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() +def allele(session): + allele = Allele(vrs_digest="asm-test-allele-digest", level="genomic") + session.add(allele) session.commit() + session.refresh(allele) + return allele - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - ) - - assert annotation.id is not None - assert annotation.current is True - - return annotation - - -@pytest.mark.unit -class TestAnnotationStatusManagerCreateAnnotationUnit: - """Unit tests for 
AnnotationStatusManager.add_annotation method.""" - - @pytest.mark.parametrize( - "annotation_type", - AnnotationType._member_map_.values(), - ) - @pytest.mark.parametrize( - "status", - AnnotationStatus._member_map_.values(), - ) - def test_add_annotation_creates_entry_with_annotation_type_version_status( - self, session, annotation_status_manager, annotation_type, status, setup_lib_db_with_variant - ): - """Test that adding an annotation creates a new entry with correct type and version.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=annotation_type, - version="v1.0", - annotation_data={}, - current=True, - status=status, - ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=annotation_type, - version="v1.0", - ) - - assert annotation is not None - assert annotation.annotation_type == annotation_type - assert annotation.status == status - assert annotation.version == "v1.0" - - def test_add_annotation_stores_job_run_id( - self, session, annotation_status_manager, job_run, setup_lib_db_with_variant - ): - """Test that every annotation is created with the job_run_id from the manager.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - status=AnnotationStatus.SUCCESS, - version="v1.0", - annotation_data={}, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1.0", - ) - - assert annotation is not None - assert annotation.job_run_id == job_run.id - - def test_add_annotation_persists_annotation_data( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """Test that adding an 
annotation persists the provided annotation data.""" - annotation_data = { - "annotation_metadata": {"some_key": "some_value"}, - "error_message": None, - } - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - status=AnnotationStatus.SUCCESS, - version="v1.0", - failure_category=None, - annotation_data=annotation_data, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1.0", - ) - - assert annotation is not None - assert annotation.failure_category is None - for key, value in annotation_data.items(): - assert getattr(annotation, key) == value - - def test_add_annotation_creates_entry_and_marks_previous_not_current( - self, session, job_run, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that adding an annotation creates a new entry and marks previous ones as not current.""" - manager = AnnotationStatusManager(session, job_run_id=job_run.id) - - # Add second annotation for same (variant, type, version) - manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - manager.flush() - session.commit() - - annotation = manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - assert annotation is not None - assert annotation.id is not None - assert annotation.current is True - - # Refresh first annotation from DB - session.refresh(existing_annotation_status) - assert existing_annotation_status.current is False - - def test_add_annotation_with_different_version_keeps_previous_current( - self, session, job_run, existing_annotation_status, 
setup_lib_db_with_variant - ): - """Test that adding an annotation with a different version keeps previous current.""" - manager = AnnotationStatusManager(session, job_run_id=job_run.id) - - # Add second annotation for same (variant, type) but different version - manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - manager.flush() - session.commit() - - annotation = manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - - assert annotation is not None - assert annotation.id is not None - assert annotation.current is True - - # Refresh first annotation from DB - session.refresh(existing_annotation_status) - assert existing_annotation_status.current is True - - def test_add_annotation_with_different_type_keeps_previous_current( - self, session, job_run, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that adding an annotation with a different type keeps previous current.""" - manager = AnnotationStatusManager(session, job_run_id=job_run.id) - - # Add second annotation for same variant but different type - manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - manager.flush() - session.commit() - - annotation = manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - version="v1", - ) - - assert annotation is not None - assert annotation.id is not None - assert annotation.current is True - - # Refresh first annotation from DB - session.refresh(existing_annotation_status) - assert existing_annotation_status.current is True - - def 
test_add_annotation_without_version(self, session, annotation_status_manager, setup_lib_db_with_variant): - """Test that adding an annotation without specifying version works correctly.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - version=None, - annotation_data={}, - status=AnnotationStatus.SKIPPED, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - - assert annotation is not None - assert annotation.id is not None - assert annotation.version is None - assert annotation.current is True - - def test_add_annotation_multiple_without_version_marks_previous_not_current( - self, session, annotation_status_manager, existing_unversioned_annotation_status, setup_lib_db_with_variant - ): - """Test that adding multiple annotations without version marks previous ones as not current.""" - - # Add second annotation without version - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=None, - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - second_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - ) - - assert second_annotation is not None - assert second_annotation.id is not None - assert second_annotation.current is True - - # Refresh first annotation from DB - session.refresh(existing_unversioned_annotation_status) - assert existing_unversioned_annotation_status.current is False - - def test_add_annotation_different_type_without_version_keeps_previous_current( - self, session, 
annotation_status_manager, existing_unversioned_annotation_status, setup_lib_db_with_variant - ): - """Test that adding an annotation of different type without version keeps previous current.""" - - # Add second annotation of different type without version - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - version=None, - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - second_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - ) - - assert second_annotation is not None - assert second_annotation.id is not None - assert second_annotation.current is True - - # Refresh first annotation from DB - session.refresh(existing_unversioned_annotation_status) - assert existing_unversioned_annotation_status.current is True - - def test_add_annotation_multiple_variants_independent_current_flags( - self, session, annotation_status_manager, setup_lib_db_with_score_set - ): - """Test that adding annotations for different variants maintains independent current flags.""" - - variant1 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.1A>G", hgvs_pro="NP_000000.1:p.Met1Val", data={}) - variant2 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.2A>T", hgvs_pro="NP_000000.1:p.Met2Val", data={}) - session.add_all([variant1, variant2]) - session.commit() - session.refresh(variant1) - session.refresh(variant2) - - # Add annotation for variant 1 - annotation_status_manager.add_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - - # Add annotation for variant 2 - annotation_status_manager.add_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", 
- annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - annotation1 = annotation_status_manager.get_current_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - annotation2 = annotation_status_manager.get_current_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - assert annotation1 is not None - assert annotation1.id is not None - assert annotation1.current is True - - assert annotation2 is not None - assert annotation2.id is not None - assert annotation2.current is True - - -class TestAnnotationStatusManagerGetCurrentAnnotationUnit: - """Unit tests for AnnotationStatusManager.get_current_annotation method.""" - - def test_get_current_annotation_returns_none_when_no_entry( - self, annotation_status_manager, setup_lib_db_with_variant - ): - """Test that getting current annotation returns None when no entry exists.""" - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert annotation is None - - def test_get_current_annotation_returns_correct_entry( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that getting current annotation returns the correct entry.""" - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert annotation.id == existing_annotation_status.id - assert annotation.current is True - - def test_get_current_annotation_returns_none_for_non_current( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that getting current annotation returns None when the entry is not current.""" - # Mark existing 
annotation as not current - existing_annotation_status.current = False - session.commit() - - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert annotation is None - - def test_get_current_annotation_with_different_version_returns_none( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that getting current annotation with different version returns None.""" - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - assert annotation is None - - def test_get_current_annotation_with_different_type_returns_none( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """Test that getting current annotation with different type returns None.""" - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - version="v1", - ) - assert annotation is None - - def test_get_current_annotation_without_version_returns_correct_entry( - self, session, annotation_status_manager, existing_unversioned_annotation_status, setup_lib_db_with_variant - ): - """Test that getting current annotation without version returns the correct entry.""" - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=None, - ) - assert annotation.id == existing_unversioned_annotation_status.id - assert annotation.current is True - - -class TestAnnotationStatusManagerIntegration: - """Integration tests for AnnotationStatusManager methods.""" - - def test_add_and_get_current_annotation_work_together( - self, session, annotation_status_manager, 
setup_lib_db_with_variant - ): - """Test that adding and getting current annotation work together correctly.""" - # Add annotation - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - # Get current annotation - retrieved_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - assert retrieved_annotation is not None - assert retrieved_annotation.current is True - assert retrieved_annotation.status == AnnotationStatus.SUCCESS - - @pytest.mark.parametrize( - "version", - ["v1.0", "v2.0", None], - ) - def test_add_multiple_and_get_current_returns_latest( - self, session, annotation_status_manager, version, setup_lib_db_with_variant - ): - """Test that adding multiple annotations and getting current returns the latest one.""" - # Add first annotation - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - # Add second annotation - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - # Get current annotation - retrieved_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - ) - - assert retrieved_annotation is not None - assert 
retrieved_annotation.current is True - assert retrieved_annotation.version == version - assert retrieved_annotation.status == AnnotationStatus.SUCCESS - - @pytest.mark.parametrize( - "version", - ["v1.0", "v2.0", None], - ) - def test_add_annotations_for_different_variants_and_get_current_independent( - self, session, annotation_status_manager, version, setup_lib_db_with_score_set - ): - """Test that adding annotations for different variants and getting current works independently.""" - - variant1 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.1A>G", hgvs_pro="NP_000000.1:p.Met1Val", data={}) - variant2 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.2A>T", hgvs_pro="NP_000000.1:p.Met2Val", data={}) - session.add_all([variant1, variant2]) - session.commit() - session.refresh(variant1) - session.refresh(variant2) - - # Add annotation for variant 1 - annotation_status_manager.add_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - - # Add annotation for variant 2 - annotation_status_manager.add_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - # Get current annotation for variant 1 - retrieved_annotation1 = annotation_status_manager.get_current_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version=version, - ) - - assert retrieved_annotation1 is not None - assert retrieved_annotation1.current is True - assert retrieved_annotation1.status == AnnotationStatus.SUCCESS - assert retrieved_annotation1.version == version - - # Get current annotation for variant 2 - retrieved_annotation2 = annotation_status_manager.get_current_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - 
version=version, - ) - - assert retrieved_annotation2 is not None - assert retrieved_annotation2.current is True - assert retrieved_annotation2.status == AnnotationStatus.FAILED - assert retrieved_annotation2.version == version - - -@pytest.mark.unit -class TestAnnotationStatusManagerReplaceAllVersionsUnit: - """Unit tests for the replace_all_versions parameter of AnnotationStatusManager.add_annotation.""" - - def test_replace_all_versions_false_keeps_different_version_current( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """Default behavior: a new annotation only retires the same version, not others.""" - # existing_annotation_status is version "v1", current=True - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.flush() - session.commit() - - new_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - assert new_annotation is not None - assert new_annotation.current is True - - session.refresh(existing_annotation_status) - assert existing_annotation_status.current is True - - def test_replace_all_versions_true_retires_all_versions( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """replace_all_versions=True retires all current records for (variant, type) regardless of version.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.flush() - - annotation_status_manager.add_annotation( - 
variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.flush() - session.commit() - - # Both v1 and v2 are current at this point (replace_all_versions=False) - v1 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - v2 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - assert v1 is not None and v1.current is True - assert v2 is not None and v2.current is True - - # Now add v3 with replace_all_versions=True — should retire both v1 and v2 - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v3", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=True, - ) - annotation_status_manager.flush() - session.commit() - - session.refresh(v1) - session.refresh(v2) - assert v1.current is False - assert v2.current is False - - v3 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v3", - ) - assert v3 is not None and v3.current is True - def test_replace_all_versions_true_only_affects_matching_type( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """replace_all_versions=True only retires records for the same annotation_type.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.add_annotation( +class TestRecordEvent: + def 
test_records_variant_subject_event(self, session, annotation_status_manager, setup_lib_db_with_variant): + annotation_status_manager.record_event( + AnnotationType.LDH_SUBMISSION, variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + disposition=Disposition.PRESENT, + reason=EventReason.SUBMITTED, ) annotation_status_manager.flush() session.commit() - vrs = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - clinvar = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v1", + event = annotation_status_manager.get_current_annotation( + AnnotationType.LDH_SUBMISSION, variant_id=setup_lib_db_with_variant.id ) + assert event is not None + assert event.disposition == Disposition.PRESENT + assert event.allele_id is None + # No current flag exists on the event log. 
+ assert not hasattr(event, "current") - # replace VRS_MAPPING only - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=True, + def test_records_allele_subject_event_with_metadata(self, session, annotation_status_manager, allele): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + source_version="4.1.0", + metadata={"gnomad_variant_id": "1-55051215-G-A"}, ) annotation_status_manager.flush() session.commit() - session.refresh(vrs) - session.refresh(clinvar) - assert vrs.current is False - assert clinvar.current is True - - new_vrs = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - assert new_vrs is not None and new_vrs.current is True - - def test_replace_all_versions_true_only_affects_matching_variant( - self, session, annotation_status_manager, setup_lib_db_with_score_set - ): - """replace_all_versions=True only retires records for the same variant_id.""" - variant1 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.1A>G", hgvs_pro="NP_000000.1:p.Met1Val", data={}) - variant2 = Variant(score_set_id=1, hgvs_nt="NM_000000.1:c.2A>T", hgvs_pro="NP_000000.1:p.Met2Val", data={}) - session.add_all([variant1, variant2]) - session.commit() - session.refresh(variant1) - session.refresh(variant2) - - annotation_status_manager.add_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.add_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - 
status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - ann1 = annotation_status_manager.get_current_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - ann2 = annotation_status_manager.get_current_annotation( - variant_id=variant2.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - # replace variant1 only - annotation_status_manager.add_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=True, + event = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) - annotation_status_manager.flush() - session.commit() + assert event.event_metadata == {"gnomad_variant_id": "1-55051215-G-A"} + assert event.source_version == "4.1.0" + assert event.variant_id is None - session.refresh(ann1) - session.refresh(ann2) - assert ann1.current is False - assert ann2.current is True # untouched - new_ann1 = annotation_status_manager.get_current_annotation( - variant_id=variant1.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - assert new_ann1 is not None and new_ann1.current is True - - def test_replace_all_versions_true_same_version_also_retired( - self, session, annotation_status_manager, existing_annotation_status, setup_lib_db_with_variant - ): - """replace_all_versions=True retires a same-version record just as replace_all_versions=False would.""" - # existing_annotation_status is version "v1" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - replace_all_versions=True, - ) +class TestAppendOnlyLatestWins: + def test_latest_event_by_id_is_current(self, 
session, annotation_status_manager, allele): + for reason in (EventReason.CREATED, EventReason.RECONFIRMED, EventReason.SKIPPED): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=reason, + source_version="4.1.0", + ) annotation_status_manager.flush() session.commit() - session.refresh(existing_annotation_status) - assert existing_annotation_status.current is False - - new_annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", + # All three events persist (append-only — no retire of prior rows). + history = annotation_status_manager.get_event_history( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) - assert new_annotation is not None - assert new_annotation.current is True - assert new_annotation.status == AnnotationStatus.FAILED - - -@pytest.mark.unit -class TestAnnotationStatusManagerBatchingUnit: - """Unit tests for batching and flush behavior.""" - - def test_flush_noop_when_empty(self, annotation_status_manager): - """flush() with no pending annotations does nothing and does not error.""" - annotation_status_manager.flush() # should not raise - - def test_auto_flush_at_batch_size(self, session, setup_lib_db_with_score_set): - """Annotations are auto-flushed to the DB when batch_size is reached.""" - variants = [ - Variant(score_set_id=1, hgvs_nt=f"NM_000000.1:c.{i}A>G", hgvs_pro=f"NP_000000.1:p.Met{i}Val", data={}) - for i in range(3) + assert [e.reason for e in history] == [ # newest first + EventReason.SKIPPED, + EventReason.RECONFIRMED, + EventReason.CREATED, ] - session.add_all(variants) - session.commit() - for v in variants: - session.refresh(v) - - manager = AnnotationStatusManager(session, batch_size=2) - - # Add first — stays pending (below threshold) - manager.add_annotation( - variant_id=variants[0].id, - 
annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - assert len(manager._pending) == 1 - - # Add second — triggers auto-flush (reaches batch_size=2) - manager.add_annotation( - variant_id=variants[1].id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - assert len(manager._pending) == 0 # flushed - - # Verify the auto-flushed rows are visible in the DB - ann = manager.get_current_annotation( - variant_id=variants[0].id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert ann is not None and ann.current is True - - # Add a third — stays pending (below threshold again) - manager.add_annotation( - variant_id=variants[2].id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - assert len(manager._pending) == 1 - # Explicit flush persists the remainder - manager.flush() - assert len(manager._pending) == 0 - - ann3 = manager.get_current_annotation( - variant_id=variants[2].id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert ann3 is not None and ann3.current is True - - def test_get_current_annotation_auto_flushes_pending( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_current_annotation() flushes pending writes before querying.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - # No explicit flush — get_current_annotation should auto-flush - annotation = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - assert annotation is not None - 
assert annotation.current is True + # Current = newest by id. + current = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id + ) + assert current.reason == EventReason.SKIPPED + + def test_source_version_scopes_current(self, session, annotation_status_manager, allele): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + source_version="4.0.0", + ) + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.ABSENT, + reason=EventReason.NO_RECORD, + source_version="4.1.0", + ) + annotation_status_manager.flush() + session.commit() + + v40 = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id, source_version="4.0.0" + ) + assert v40.disposition == Disposition.PRESENT + + +class TestSubjectValidation: + def test_variant_subject_type_with_allele_id_raises(self, annotation_status_manager, allele): + with pytest.raises(ValueError, match="variant-subject"): + annotation_status_manager.record_event( + AnnotationType.VRS_MAPPING, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + ) + + def test_allele_subject_type_with_variant_id_raises(self, annotation_status_manager, setup_lib_db_with_variant): + with pytest.raises(ValueError, match="allele-subject"): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + variant_id=setup_lib_db_with_variant.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + ) + + def test_neither_subject_raises(self, annotation_status_manager): + with pytest.raises(ValueError, match="Exactly one"): + annotation_status_manager.record_event( + AnnotationType.VRS_MAPPING, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + ) + + def 
test_both_subjects_raises(self, annotation_status_manager, setup_lib_db_with_variant, allele): + with pytest.raises(ValueError, match="Exactly one"): + annotation_status_manager.record_event( + AnnotationType.VRS_MAPPING, + variant_id=setup_lib_db_with_variant.id, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + ) + + +class TestQueryReads: + def test_get_current_annotation_returns_none_when_no_event(self, annotation_status_manager, allele): + # No event recorded for this subject yet — current status is None, not an error. + assert ( + annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id + ) + is None + ) + + def test_get_event_history_empty_for_no_records(self, annotation_status_manager, allele): + assert ( + annotation_status_manager.get_event_history(AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id) + == [] + ) + + def test_flush_is_noop_when_empty(self, annotation_status_manager): + # Nothing pending — flush must not raise or emit a write. 
assert len(annotation_status_manager._pending) == 0 - - def test_flush_clears_internal_buffers(self, session, annotation_status_manager, setup_lib_db_with_variant): - """flush() clears both _pending and _retirement_filters.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - assert len(annotation_status_manager._pending) == 1 - assert len(annotation_status_manager._retirement_filters) == 1 - annotation_status_manager.flush() assert len(annotation_status_manager._pending) == 0 - assert len(annotation_status_manager._retirement_filters) == 0 - - def test_batch_retirement_groups_by_annotation_type( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """Multiple annotation types in one batch are retired independently.""" - # Create initial annotations for two types - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - vrs_v1 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - clinvar_v1 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v1", - ) - - # Now add replacements for both types in one batch - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - 
annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v2", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.flush() - session.commit() - - session.refresh(vrs_v1) - session.refresh(clinvar_v1) - assert vrs_v1.current is False - assert clinvar_v1.current is False - - vrs_v2 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - ) - clinvar_v2 = annotation_status_manager.get_current_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="v2", - ) - assert vrs_v2 is not None and vrs_v2.current is True - assert clinvar_v2 is not None and clinvar_v2.current is True - - -@pytest.mark.unit -class TestAnnotationStatusManagerAuditHelpersUnit: - """Unit tests for audit query helpers: get_annotation_history and get_all_current_annotations.""" - - def test_get_annotation_history_returns_all_rows_newest_first( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_annotation_history returns both current and retired rows, newest first.""" - # Create two annotations for the same (variant, type, version) — first gets retired - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.flush() - - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - 
annotation_status_manager.flush() - session.commit() - - history = annotation_status_manager.get_annotation_history( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - ) - - assert len(history) == 2 - # Newest first - assert history[0].status == AnnotationStatus.FAILED - assert history[0].current is True - assert history[1].status == AnnotationStatus.SUCCESS - assert history[1].current is False - - def test_get_annotation_history_filters_by_version( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_annotation_history with version only returns matching rows.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-01", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-02", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.flush() - session.commit() - - history_jan = annotation_status_manager.get_annotation_history( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-01", - ) - assert len(history_jan) == 1 - assert history_jan[0].version == "2025-01" - - def test_get_annotation_history_without_version_returns_all_versions( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_annotation_history without version returns rows across all versions.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-01", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) 
- annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-02", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.flush() - session.commit() - history = annotation_status_manager.get_annotation_history( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, + def test_get_current_annotation_auto_flushes_pending(self, session, annotation_status_manager, allele): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, ) - assert len(history) == 2 - - def test_get_annotation_history_empty_for_no_records(self, annotation_status_manager, setup_lib_db_with_variant): - """get_annotation_history returns empty list when no records exist.""" - history = annotation_status_manager.get_annotation_history( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, + # Buffered, never explicitly flushed — the read must flush first and still see it. 
+ assert len(annotation_status_manager._pending) == 1 + event = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) - assert history == [] + assert event is not None + assert len(annotation_status_manager._pending) == 0 - def test_get_annotation_history_auto_flushes_pending( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_annotation_history flushes pending writes before querying.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + def test_get_event_history_auto_flushes_pending(self, session, annotation_status_manager, allele): + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, ) - # No explicit flush - history = annotation_status_manager.get_annotation_history( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, + assert len(annotation_status_manager._pending) == 1 + history = annotation_status_manager.get_event_history( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) assert len(history) == 1 assert len(annotation_status_manager._pending) == 0 - def test_get_all_current_annotations_returns_all_types( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_all_current_annotations returns current annotations across all types.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - 
version="2025-01", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINGEN_ALLELE_ID, - version=None, - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + def test_failed_disposition_round_trips_through_manager(self, session, annotation_status_manager, allele): + # The failure path (the case the audit log most needs to capture) survives a write/read cycle + # with its disposition, reason, and error metadata intact. + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.FAILED, + reason=EventReason.API_ERROR, + metadata={"error_message": "upstream timeout"}, ) annotation_status_manager.flush() session.commit() - all_current = annotation_status_manager.get_all_current_annotations( - variant_id=setup_lib_db_with_variant.id, - ) - assert len(all_current) == 3 - types = {a.annotation_type for a in all_current} - assert types == { - AnnotationType.VRS_MAPPING, - AnnotationType.CLINVAR_CONTROL, - AnnotationType.CLINGEN_ALLELE_ID, - } - - def test_get_all_current_annotations_excludes_retired( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_all_current_annotations does not include retired rows.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + event = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) - annotation_status_manager.flush() + assert event.disposition == Disposition.FAILED + assert event.reason == EventReason.API_ERROR + assert event.event_metadata == {"error_message": "upstream timeout"} - # Replace it — v1 becomes retired - 
annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v2", - annotation_data={}, - status=AnnotationStatus.FAILED, - current=True, - ) - annotation_status_manager.flush() + def test_current_is_isolated_per_subject(self, session, annotation_status_manager, allele): + # Recording for one allele must not bleed into another's current status. + other = Allele(vrs_digest="asm-test-allele-digest-2", level="genomic") + session.add(other) session.commit() + session.refresh(other) - all_current = annotation_status_manager.get_all_current_annotations( - variant_id=setup_lib_db_with_variant.id, - ) - assert len(all_current) == 1 - assert all_current[0].version == "v2" - - def test_get_all_current_annotations_empty_for_no_records( - self, annotation_status_manager, setup_lib_db_with_variant - ): - """get_all_current_annotations returns empty list when no records exist.""" - result = annotation_status_manager.get_all_current_annotations( - variant_id=setup_lib_db_with_variant.id, - ) - assert result == [] - - def test_get_all_current_annotations_auto_flushes_pending( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_all_current_annotations flushes pending writes before querying.""" - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - ) - # No explicit flush - result = annotation_status_manager.get_all_current_annotations( - variant_id=setup_lib_db_with_variant.id, - ) - assert len(result) == 1 - assert len(annotation_status_manager._pending) == 0 - - def test_get_all_current_annotations_ordered_by_type_then_version( - self, session, annotation_status_manager, setup_lib_db_with_variant - ): - """get_all_current_annotations returns results ordered by annotation_type, version.""" - 
annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason=EventReason.CREATED, + source_version="4.1.0", ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-02", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, - ) - annotation_status_manager.add_annotation( - variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.CLINVAR_CONTROL, - version="2025-01", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, - replace_all_versions=False, + annotation_status_manager.record_event( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=other.id, + disposition=Disposition.ABSENT, + reason=EventReason.NO_RECORD, + source_version="4.1.0", ) annotation_status_manager.flush() session.commit() - all_current = annotation_status_manager.get_all_current_annotations( - variant_id=setup_lib_db_with_variant.id, + a = annotation_status_manager.get_current_annotation( + AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=allele.id ) - assert len(all_current) == 3 - # clinvar_control < vrs_mapping alphabetically - assert all_current[0].annotation_type == AnnotationType.CLINVAR_CONTROL - assert all_current[0].version == "2025-01" - assert all_current[1].annotation_type == AnnotationType.CLINVAR_CONTROL - assert all_current[1].version == "2025-02" - assert all_current[2].annotation_type == AnnotationType.VRS_MAPPING + b = annotation_status_manager.get_current_annotation(AnnotationType.GNOMAD_ALLELE_FREQUENCY, allele_id=other.id) + assert a.disposition == Disposition.PRESENT + assert 
b.disposition == Disposition.ABSENT + assert a.allele_id != b.allele_id -@pytest.mark.unit -class TestVariantAnnotationStatusReprUnit: - """Unit tests for the VariantAnnotationStatus __repr__ method.""" - - def test_repr_includes_key_fields(self, session, annotation_status_manager, setup_lib_db_with_variant): - """__repr__ includes id, variant_id, type, version, status, current, and created_at.""" - annotation_status_manager.add_annotation( +class TestBatching: + def test_auto_flush_at_batch_size(self, session, job_run, setup_lib_db_with_variant, annotation_status_manager): + annotation_status_manager.batch_size = 2 + annotation_status_manager.record_event( + AnnotationType.LDH_SUBMISSION, variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", - annotation_data={}, - status=AnnotationStatus.SUCCESS, - current=True, + disposition=Disposition.PRESENT, + reason=EventReason.SUBMITTED, ) - annotation_status_manager.flush() - session.commit() - - annotation = annotation_status_manager.get_current_annotation( + # Not yet flushed (1 < batch_size). + assert len(annotation_status_manager._pending) == 1 + annotation_status_manager.record_event( + AnnotationType.LDH_SUBMISSION, variant_id=setup_lib_db_with_variant.id, - annotation_type=AnnotationType.VRS_MAPPING, - version="v1", + disposition=Disposition.PRESENT, + reason=EventReason.SUBMITTED, ) - repr_str = repr(annotation) - - assert "VariantAnnotationStatus" in repr_str - assert f"id={annotation.id}" in repr_str - assert f"variant_id={setup_lib_db_with_variant.id}" in repr_str - assert "type='vrs_mapping'" in repr_str - assert "version='v1'" in repr_str - assert "status='success'" in repr_str - assert "current=True" in repr_str - assert "created_at=" in repr_str + # Auto-flushed at batch_size=2. 
+ assert len(annotation_status_manager._pending) == 0 + session.commit() diff --git a/tests/lib/test_gnomad.py b/tests/lib/test_gnomad.py index 14dde952..f9c8b17a 100644 --- a/tests/lib/test_gnomad.py +++ b/tests/lib/test_gnomad.py @@ -3,24 +3,25 @@ from unittest.mock import patch import pytest - -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from sqlalchemy import select pyathena = pytest.importorskip("pyathena") fastapi = pytest.importorskip("fastapi") from mavedb.lib.gnomad import ( + GnomadLinkVerdict, allele_list_from_list_like_string, gnomad_identifier, gnomad_table_name, - link_gnomad_variants_to_mapped_variants, + link_gnomad_variants_to_alleles, + normalize_caid, ) +from mavedb.models.allele import Allele +from mavedb.models.gnomad_allele_link import GnomadAlleleLink from mavedb.models.gnomad_variant import GnomADVariant -from mavedb.models.mapped_variant import MappedVariant from tests.helpers.constants import ( TEST_GNOMAD_DATA_VERSION, TEST_GNOMAD_VARIANT, - TEST_MINIMAL_MAPPED_VARIANT, ) ### Tests for gnomad_identifier function ### @@ -77,6 +78,23 @@ def test_gnomad_table_name_raises_if_env_not_set(): gnomad_table_name() +### Tests for normalize_caid function ### + + +@pytest.mark.parametrize( + "raw, expected", + [ + ("CA025094", "CA25094"), # the #722 example: gnomAD dump drops the leading zero + ("CA000123", "CA123"), # multiple leading zeros collapse + ("CA341478553", "CA341478553"), # already unpadded — unchanged + ("CA0", "CA0"), # keep the final digit even if it is a zero + ("not-a-caid", "not-a-caid"), # unrecognized input is passed through + ], +) +def test_normalize_caid(raw, expected): + assert normalize_caid(raw) == expected + + ### Tests for allele_list_from_list_like_string function ### @@ -118,217 +136,197 @@ def test_allele_list_from_list_like_string_invalid_format_not_list(): # If the package is working correctly, this function should work as expected. 
-### Tests for link_gnomad_variants_to_mapped_variants function ### +### Tests for link_gnomad_variants_to_alleles function ### -def _verify_annotation_status(session, mapped_variants, expected_version): - annotations = session.query(VariantAnnotationStatus).all() - assert len(annotations) == len(mapped_variants) +def _make_allele(session, caid, *, vrs_digest, level="genomic"): + """Create and persist a deduplicated Allele carrying a CAID.""" + allele = Allele(vrs_digest=vrs_digest, level=level, clingen_allele_id=caid) + session.add(allele) + session.commit() + session.refresh(allele) + return allele - for mapped_variant, annotation in zip(mapped_variants, annotations): - assert annotation.variant_id == mapped_variant.variant_id - assert annotation.annotation_type == "gnomad_allele_frequency" - assert annotation.version == expected_version +def _live_links_for(session, allele_id): + return session.scalars( + select(GnomadAlleleLink).where( + GnomadAlleleLink.allele_id == allele_id, + GnomadAlleleLink.current, + ) + ).all() -def test_links_new_gnomad_variant_to_mapped_variant( - session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant -): - mapped_variant = setup_lib_db_with_mapped_variant - mapped_variant.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant) - session.commit() - with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 - session.commit() +def _assert_gnomad_variant_matches(gnomad_variant, **overrides): + expected = TEST_GNOMAD_VARIANT.copy() + expected.pop("creation_date") + expected.pop("modification_date") + expected.update(overrides) + for attr, value in expected.items(): + assert getattr(gnomad_variant, attr) == value - session.refresh(mapped_variant) - edited_saved_gnomad_variant = TEST_GNOMAD_VARIANT.copy() - edited_saved_gnomad_variant.pop("creation_date") - 
edited_saved_gnomad_variant.pop("modification_date") +def test_links_new_gnomad_variant_to_allele(session, mocked_gnomad_variant_row): + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") - assert len(mapped_variant.gnomad_variants) == 1 - for attr in edited_saved_gnomad_variant: - assert getattr(mapped_variant.gnomad_variants[0], attr) == edited_saved_gnomad_variant[attr] + with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {allele.id: GnomadLinkVerdict.CREATED} + session.commit() - _verify_annotation_status(session, [mapped_variant], TEST_GNOMAD_DATA_VERSION) + live_links = _live_links_for(session, allele.id) + assert len(live_links) == 1 + _assert_gnomad_variant_matches(live_links[0].gnomad_variant) -def test_can_link_gnomad_variants_with_none_type_faf_fields( - session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant -): - mapped_variant = setup_lib_db_with_mapped_variant - mapped_variant.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant) - session.commit() +def test_can_link_gnomad_variants_with_none_type_faf_fields(session, mocked_gnomad_variant_row): + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") mocked_gnomad_variant_row.__setattr__("joint.fafmax.faf95_max_gen_anc", None) mocked_gnomad_variant_row.__setattr__("joint.fafmax.faf95_max", None) with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {allele.id: GnomadLinkVerdict.CREATED} session.commit() - gnomad_variant_comparator = TEST_GNOMAD_VARIANT.copy() - gnomad_variant_comparator.pop("creation_date") - 
gnomad_variant_comparator.pop("modification_date") - gnomad_variant_comparator["faf95_max"] = None - gnomad_variant_comparator["faf95_max_ancestry"] = None - - assert len(mapped_variant.gnomad_variants) == 1 - for attr in gnomad_variant_comparator: - assert getattr(mapped_variant.gnomad_variants[0], attr) == gnomad_variant_comparator[attr] - - _verify_annotation_status(session, [mapped_variant], TEST_GNOMAD_DATA_VERSION) + live_links = _live_links_for(session, allele.id) + assert len(live_links) == 1 + _assert_gnomad_variant_matches(live_links[0].gnomad_variant, faf95_max=None, faf95_max_ancestry=None) -def test_links_existing_gnomad_variant(session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant): +def test_links_existing_gnomad_variant(session, mocked_gnomad_variant_row): gnomad_variant = GnomADVariant(**TEST_GNOMAD_VARIANT) - mapped_variant = setup_lib_db_with_mapped_variant - mapped_variant.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant) session.add(gnomad_variant) session.commit() + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {allele.id: GnomadLinkVerdict.CREATED} session.commit() - session.refresh(mapped_variant) - - edited_saved_gnomad_variant = TEST_GNOMAD_VARIANT.copy() - edited_saved_gnomad_variant.pop("creation_date") - edited_saved_gnomad_variant.pop("modification_date") - - assert len(mapped_variant.gnomad_variants) == 1 - for attr in edited_saved_gnomad_variant: - assert getattr(mapped_variant.gnomad_variants[0], attr) == edited_saved_gnomad_variant[attr] - - _verify_annotation_status(session, [mapped_variant], TEST_GNOMAD_DATA_VERSION) - + # Reused the existing gnomAD variant rather 
than creating a second. + assert len(session.scalars(select(GnomADVariant)).all()) == 1 + live_links = _live_links_for(session, allele.id) + assert len(live_links) == 1 + assert live_links[0].gnomad_variant_id == gnomad_variant.id -def test_adding_existing_gnomad_variant_with_same_version_does_not_result_in_duplication( - session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant -): - mapped_variant = setup_lib_db_with_mapped_variant - mapped_variant.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant) - session.commit() - with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 +def test_re_running_unchanged_data_is_idempotent(session, mocked_gnomad_variant_row): + """Supersede only on change: a second run with identical data writes nothing — one live link, + no retired rows, so the valid-time history records no spurious boundary. The second run still + *reports* the allele (verdict UNCHANGED), so the caller can mark it preexisting without re-querying + link state.""" + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 + assert link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) == { + allele.id: GnomadLinkVerdict.CREATED + } + session.commit() + # Second run sees the live link already points to this gnomAD variant → unchanged, no DB write. 
+ assert link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) == { + allele.id: GnomadLinkVerdict.UNCHANGED + } session.commit() - session.refresh(mapped_variant) - - edited_saved_gnomad_variant = TEST_GNOMAD_VARIANT.copy() - edited_saved_gnomad_variant.pop("creation_date") - edited_saved_gnomad_variant.pop("modification_date") + # One link, still live, never retired — the re-run did not churn the history. + all_links = session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.allele_id == allele.id)).all() + assert len(all_links) == 1 + assert all_links[0].valid_to is None + assert len(session.scalars(select(GnomADVariant)).all()) == 1 - assert len(mapped_variant.gnomad_variants) == 1 - for attr in edited_saved_gnomad_variant: - assert getattr(mapped_variant.gnomad_variants[0], attr) == edited_saved_gnomad_variant[attr] - _verify_annotation_status(session, [mapped_variant, mapped_variant], TEST_GNOMAD_DATA_VERSION) +def test_version_bump_supersedes_to_single_live_link(session, mocked_gnomad_variant_row): + """A new gnomAD version retires the prior link and installs the new one — exactly one live link + per allele (not one per version), with the old version preserved as a retired row.""" + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") + with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", "v1.old"): + assert link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) == { + allele.id: GnomadLinkVerdict.CREATED + } + session.commit() -def test_links_multiple_rows_and_variants(session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant): - mapped_variant1 = setup_lib_db_with_mapped_variant - mapped_variant2 = MappedVariant(**TEST_MINIMAL_MAPPED_VARIANT, variant_id=mapped_variant1.variant_id) + with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", "v2.new"): + assert link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) == { + allele.id: GnomadLinkVerdict.CREATED + } + 
session.commit() - mapped_variant1.clingen_allele_id = mocked_gnomad_variant_row.caid - mapped_variant2.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant1) - session.add(mapped_variant2) + live_links = _live_links_for(session, allele.id) + assert len(live_links) == 1 + assert live_links[0].gnomad_variant.db_version == "v2.new" + # Old-version link retired, not deleted; both gnomAD variant rows persist. + all_links = session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.allele_id == allele.id)).all() + assert len(all_links) == 2 + assert len([link for link in all_links if link.valid_to is not None]) == 1 + assert len(session.scalars(select(GnomADVariant)).all()) == 2 + + +def test_same_version_different_identifier_supersedes_newest_wins(session, mocked_gnomad_variant_row): + """A CAID re-resolving to a different identifier within the same version is an anomaly: log and + supersede newest-wins rather than raise — one odd allele must not abort the batch.""" + allele = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") + # Prior live link at the current version, but to a different identifier than the row resolves to. 
+ stale = GnomADVariant( + db_name="gnomAD", + db_identifier="9-99999-C-T", + db_version=TEST_GNOMAD_DATA_VERSION, + allele_count=1, + allele_number=2, + allele_frequency=0.5, + ) + session.add(stale) + session.commit() + session.add(GnomadAlleleLink(allele_id=allele.id, gnomad_variant_id=stale.id)) session.commit() with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 2 + assert link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) == { + allele.id: GnomadLinkVerdict.CREATED + } session.commit() - gnomad_variant_comparator = TEST_GNOMAD_VARIANT.copy() - gnomad_variant_comparator.pop("creation_date") - gnomad_variant_comparator.pop("modification_date") - - assert len(mapped_variant1.gnomad_variants) == 1 - assert len(mapped_variant2.gnomad_variants) == 1 - for mv in [mapped_variant1, mapped_variant2]: - for attr in gnomad_variant_comparator: - assert getattr(mv.gnomad_variants[0], attr) == gnomad_variant_comparator[attr] - - _verify_annotation_status(session, [mapped_variant1, mapped_variant2], TEST_GNOMAD_DATA_VERSION) - + live_links = _live_links_for(session, allele.id) + assert len(live_links) == 1 + assert live_links[0].gnomad_variant.db_identifier != "9-99999-C-T" # newest wins -def test_returns_zero_when_no_mapped_variants(session, mocked_gnomad_variant_row): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 0 - _verify_annotation_status(session, [], TEST_GNOMAD_DATA_VERSION) - - -def test_only_current_flag_filters_variants(session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant): - mapped_variant1 = setup_lib_db_with_mapped_variant - mapped_variant2 = MappedVariant(**TEST_MINIMAL_MAPPED_VARIANT, variant_id=mapped_variant1.variant_id) - - mapped_variant1.current = False - mapped_variant1.clingen_allele_id = mocked_gnomad_variant_row.caid - 
mapped_variant2.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant1) - session.add(mapped_variant2) - session.commit() +def test_links_one_gnomad_variant_to_multiple_alleles_sharing_a_caid(session, mocked_gnomad_variant_row): + """A CAID shared by multiple alleles (cross-score-set dedup) fans the gnomAD variant to each.""" + allele1 = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-1") + allele2 = _make_allele(session, mocked_gnomad_variant_row.caid, vrs_digest="vrs-2", level="cdna") with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row]) - assert result == 1 + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {allele1.id: GnomadLinkVerdict.CREATED, allele2.id: GnomadLinkVerdict.CREATED} session.commit() - gnomad_variant_comparator = TEST_GNOMAD_VARIANT.copy() - gnomad_variant_comparator.pop("creation_date") - gnomad_variant_comparator.pop("modification_date") - - assert len(mapped_variant1.gnomad_variants) == 0 - assert len(mapped_variant2.gnomad_variants) == 1 - for attr in gnomad_variant_comparator: - assert getattr(mapped_variant2.gnomad_variants[0], attr) == gnomad_variant_comparator[attr] + for allele in (allele1, allele2): + assert len(_live_links_for(session, allele.id)) == 1 + # Both links point at the single get-or-created gnomAD variant. 
+ assert len(session.scalars(select(GnomADVariant)).all()) == 1 - _verify_annotation_status(session, [mapped_variant2], TEST_GNOMAD_DATA_VERSION) - -def test_only_current_flag_is_false_operates_on_all_variants( - session, mocked_gnomad_variant_row, setup_lib_db_with_mapped_variant -): - mapped_variant1 = setup_lib_db_with_mapped_variant - mapped_variant2 = MappedVariant(**TEST_MINIMAL_MAPPED_VARIANT, variant_id=mapped_variant1.variant_id) - - mapped_variant1.current = False - mapped_variant1.clingen_allele_id = mocked_gnomad_variant_row.caid - mapped_variant2.clingen_allele_id = mocked_gnomad_variant_row.caid - session.add(mapped_variant1) - session.add(mapped_variant2) - session.commit() +def test_links_allele_when_dump_strips_leading_zero_from_caid(session, mocked_gnomad_variant_row): + """The gnomAD dump records CAIDs without leading zeros (#722). An allele stored with the + zero-padded CAID must still match the dump's stripped form across the join.""" + allele = _make_allele(session, "CA025094", vrs_digest="vrs-1") + mocked_gnomad_variant_row.caid = "CA25094" # dump form: leading zero stripped with patch("mavedb.lib.gnomad.GNOMAD_DATA_VERSION", TEST_GNOMAD_DATA_VERSION): - result = link_gnomad_variants_to_mapped_variants(session, [mocked_gnomad_variant_row], False) - assert result == 2 + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {allele.id: GnomadLinkVerdict.CREATED} session.commit() - gnomad_variant_comparator = TEST_GNOMAD_VARIANT.copy() - gnomad_variant_comparator.pop("creation_date") - gnomad_variant_comparator.pop("modification_date") - for mv in [mapped_variant1, mapped_variant2]: - assert len(mv.gnomad_variants) == 1 - for attr in gnomad_variant_comparator: - assert getattr(mv.gnomad_variants[0], attr) == gnomad_variant_comparator[attr] + assert len(_live_links_for(session, allele.id)) == 1 + - _verify_annotation_status(session, [mapped_variant1, mapped_variant2], TEST_GNOMAD_DATA_VERSION) +def 
test_returns_empty_map_when_no_alleles_match(session, mocked_gnomad_variant_row): + result = link_gnomad_variants_to_alleles(session, [mocked_gnomad_variant_row]) + assert result == {} + assert len(session.scalars(select(GnomadAlleleLink)).all()) == 0 + # No gnomAD variant is created when nothing matches the CAID. + assert len(session.scalars(select(GnomADVariant)).all()) == 0 diff --git a/tests/lib/test_hgvs.py b/tests/lib/test_hgvs.py index 45793034..e1e49978 100644 --- a/tests/lib/test_hgvs.py +++ b/tests/lib/test_hgvs.py @@ -94,3 +94,32 @@ def test_join_cis_phased_hgvs_returns_none_for_mixed_coordinate_prefixes(): def test_join_cis_phased_hgvs_returns_none_for_component_without_accession(): assert join_cis_phased_hgvs(["g.1A>G", "g.2T>C"]) is None + + +def test_split_cis_phased_hgvs_passes_through_bracketed_without_accession(): + # Bracketed but accession-less input is not a cis-phased multivariant we can qualify; it must + # degrade to a single-element list rather than raising on the missing ":". + assert split_cis_phased_hgvs("g.[1000A>G;1002T>C]") == ["g.[1000A>G;1002T>C]"] + + +def test_join_cis_phased_hgvs_orders_components_by_position(): + # Out-of-order members are emitted in coordinate order. + assert ( + join_cis_phased_hgvs(["NC_000001.11:g.1002T>C", "NC_000001.11:g.1000A>G"]) == "NC_000001.11:g.[1000A>G;1002T>C]" + ) + + +def test_join_cis_phased_hgvs_is_order_independent(): + # The same set of members yields the same string regardless of input ordering (the VRS block + # digest is order-independent; the exported HGVS string must be too). + forward = join_cis_phased_hgvs(["NC_000001.11:g.1000A>G", "NC_000001.11:g.1002T>C"]) + reverse = join_cis_phased_hgvs(["NC_000001.11:g.1002T>C", "NC_000001.11:g.1000A>G"]) + assert forward == reverse + + +def test_join_cis_phased_hgvs_orders_protein_components_by_position(): + # The first integer is the position for protein forms too (Arg123Gly), not just genomic. 
+ assert ( + join_cis_phased_hgvs(["NP_000001.1:p.Arg223Gly", "NP_000001.1:p.Ala12Val"]) + == "NP_000001.1:p.[Ala12Val;Arg223Gly]" + ) diff --git a/tests/lib/test_variant_annotations_script.py b/tests/lib/test_variant_annotations_script.py new file mode 100644 index 00000000..494bce22 --- /dev/null +++ b/tests/lib/test_variant_annotations_script.py @@ -0,0 +1,89 @@ +# ruff: noqa: E402 +"""Tests for the variant_annotations CLI's current_annotation_summary resolution. + +The score set is the entry point, but status is resolved per-allele through the live mapping links — +so a shared allele's status counts even when a *different* score set's run produced it. +""" + +import pytest + +pytest.importorskip("psycopg2") + +from mavedb.models.allele import Allele +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.disposition import Disposition +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele +from mavedb.models.variant import Variant +from mavedb.scripts.variant_annotations import current_annotation_summary +from tests.helpers.constants import TEST_MINIMAL_VARIANT + + +def _variant_mapped_to_allele(session, score_set, allele): + """A variant in the score set with a live mapping record + live authoritative link to ``allele``.""" + variant = Variant(**TEST_MINIMAL_VARIANT, urn=f"{score_set.urn}#1", score_set_id=score_set.id) + session.add(variant) + session.commit() + + record = MappingRecord(variant_id=variant.id, assay_level="genomic", mapping_api_version="test.0.0") + session.add(record) + session.commit() + session.add(MappingRecordAllele(mapping_record_id=record.id, allele_id=allele.id, is_authoritative=True)) + session.commit() + return variant + + +def test_summary_counts_allele_status_from_another_score_sets_run(session, setup_lib_db_with_score_set, job_run): + """An allele-subject status produced 
by a *different* score set's run (or none) still counts for a + score set whose variant currently maps to that shared allele.""" + score_set = setup_lib_db_with_score_set + allele = Allele( + vrs_digest="summary-shared", level="genomic", clingen_allele_id="CA1", post_mapped={"type": "Allele"} + ) + session.add(allele) + session.commit() + _variant_mapped_to_allele(session, score_set, allele) + + # Event written with no owning score set (e.g. a different score set's run touched the shared allele). + session.add( + AnnotationEvent( + annotation_type=AnnotationType.CLINGEN_ALLELE_ID, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason="created", + job_run_id=job_run.id, + score_set_id=None, + ) + ) + session.commit() + + summary = current_annotation_summary(session, score_set) + counts = {(r["annotation_type"], r["disposition"]): r["count"] for r in summary} + + assert counts.get((AnnotationType.CLINGEN_ALLELE_ID.value, Disposition.PRESENT.value)) == 1 + + +def test_summary_includes_variant_subject_status(session, setup_lib_db_with_score_set, job_run): + """Variant-subject types (mapping/RT/LDH) are counted per variant, off the score set's variants.""" + score_set = setup_lib_db_with_score_set + allele = Allele(vrs_digest="summary-vs", level="genomic", post_mapped={"type": "Allele"}) + session.add(allele) + session.commit() + variant = _variant_mapped_to_allele(session, score_set, allele) + + session.add( + AnnotationEvent( + annotation_type=AnnotationType.VRS_MAPPING, + variant_id=variant.id, + disposition=Disposition.PRESENT, + reason="mapped", + job_run_id=job_run.id, + ) + ) + session.commit() + + summary = current_annotation_summary(session, score_set) + counts = {(r["annotation_type"], r["disposition"]): r["count"] for r in summary} + + assert counts.get((AnnotationType.VRS_MAPPING.value, Disposition.PRESENT.value)) == 1 diff --git a/tests/lib/test_vep.py b/tests/lib/test_vep.py index 2e9e1fae..bea141ab 100644 --- a/tests/lib/test_vep.py +++ 
b/tests/lib/test_vep.py @@ -4,11 +4,21 @@ logic correctly handles the actual Ensembl REST API response shapes. """ +from datetime import date, timedelta from unittest.mock import MagicMock, patch import pytest +from sqlalchemy import select -from mavedb.lib.vep import get_functional_consequence, run_variant_recoder +from mavedb.lib.vep import ( + VepLinkVerdict, + get_ensembl_release, + get_functional_consequence, + link_vep_consequences_to_alleles, + run_variant_recoder, +) +from mavedb.models.allele import Allele +from mavedb.models.vep_allele_consequence import VepAlleleConsequence def _mock_response(data) -> MagicMock: @@ -240,3 +250,158 @@ async def test_raises_if_more_than_200_variants(self): """Passing more than 200 HGVS strings raises ValueError before any HTTP call.""" with pytest.raises(ValueError, match="maximum of 200"): await get_functional_consequence(["NM_007294.4:c.1A>T"] * 201) + + +### Tests for get_ensembl_release function ### + + +@pytest.mark.asyncio +async def test_get_ensembl_release_returns_release_as_string(): + """The /info/software release integer is returned as a string for use as source_version.""" + with patch("mavedb.lib.vep.request_with_backoff", return_value=_mock_response({"release": 116})): + assert await get_ensembl_release() == "116" + + +### Tests for link_vep_consequences_to_alleles function ### + + +def _make_allele(session, *, vrs_digest, level="genomic"): + """Create and persist a deduplicated Allele.""" + allele = Allele(vrs_digest=vrs_digest, level=level) + session.add(allele) + session.commit() + session.refresh(allele) + return allele + + +def _live_rows_for(session, allele_id): + return session.scalars( + select(VepAlleleConsequence).where( + VepAlleleConsequence.allele_id == allele_id, + VepAlleleConsequence.current, + ) + ).all() + + +def _all_rows_for(session, allele_id): + return session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.allele_id == allele_id)).all() + + +def 
test_link_vep_creates_new_consequence(session): + """A consequence for an allele with no live row creates a single live row and is reported changed.""" + allele = _make_allele(session, vrs_digest="vrs-1") + + verdicts = link_vep_consequences_to_alleles( + session, {allele.id: "missense_variant"}, source_version="116", access_date=date.today() + ) + session.commit() + + assert verdicts == {allele.id: VepLinkVerdict.CREATED} + live = _live_rows_for(session, allele.id) + assert len(live) == 1 + assert live[0].functional_consequence == "missense_variant" + assert live[0].source_version == "116" + assert live[0].access_date == date.today() + + +def test_link_vep_unchanged_bumps_version_and_date_in_place(session): + """Re-confirming an unchanged consequence at a new release advances source_version and access_date + in place — no supersede, no new valid-time boundary. The allele is reported UNCHANGED (status + preexisting) so the caller need not re-query consequence state.""" + allele = _make_allele(session, vrs_digest="vrs-1") + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="missense_variant", + source_version="115", + access_date=date.today() - timedelta(days=90), + ) + ) + session.commit() + + verdicts = link_vep_consequences_to_alleles( + session, {allele.id: "missense_variant"}, source_version="116", access_date=date.today() + ) + session.commit() + + assert verdicts == {allele.id: VepLinkVerdict.UNCHANGED} + # One row, still live, never retired — version and access_date advanced in place. 
+ all_rows = _all_rows_for(session, allele.id) + assert len(all_rows) == 1 + assert all_rows[0].valid_to is None + assert all_rows[0].source_version == "116" + assert all_rows[0].access_date == date.today() + + +def test_link_vep_changed_consequence_supersedes(session): + """A changed consequence retires the live row and inserts the successor — exactly one live row, + keyed on allele_id, with the old one preserved as retired history.""" + allele = _make_allele(session, vrs_digest="vrs-1") + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="synonymous_variant", + source_version="115", + access_date=date.today() - timedelta(days=90), + ) + ) + session.commit() + + verdicts = link_vep_consequences_to_alleles( + session, {allele.id: "missense_variant"}, source_version="116", access_date=date.today() + ) + session.commit() + + assert verdicts == {allele.id: VepLinkVerdict.CREATED} + live = _live_rows_for(session, allele.id) + assert len(live) == 1 + assert live[0].functional_consequence == "missense_variant" + assert live[0].source_version == "116" + + all_rows = _all_rows_for(session, allele.id) + assert len(all_rows) == 2 + assert len([r for r in all_rows if r.valid_to is not None]) == 1 + + +def test_link_vep_none_leaves_live_row_untouched(session): + """A transient None result must not overwrite a held consequence: the live row is left intact + (value, version, and date). 
The held consequence is reported UNCHANGED (status preexisting) — the + allele still has a live consequence, it just was not re-confirmed this run.""" + allele = _make_allele(session, vrs_digest="vrs-1") + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="missense_variant", + source_version="115", + access_date=date.today() - timedelta(days=90), + ) + ) + session.commit() + + verdicts = link_vep_consequences_to_alleles( + session, {allele.id: None}, source_version="116", access_date=date.today() + ) + session.commit() + + assert verdicts == {allele.id: VepLinkVerdict.UNCHANGED} + live = _live_rows_for(session, allele.id) + assert len(live) == 1 + assert live[0].functional_consequence == "missense_variant" + # Not re-confirmed -> neither version nor access_date advanced. + assert live[0].source_version == "115" + assert live[0].access_date == date.today() - timedelta(days=90) + + +def test_link_vep_none_with_no_live_row_writes_nothing(session): + """A None result for an allele with no live row writes nothing and leaves the allele out of the + verdict map (the caller reads that as a no-result and re-queries next run), mirroring gnomAD's + no-match handling.""" + allele = _make_allele(session, vrs_digest="vrs-1") + + verdicts = link_vep_consequences_to_alleles( + session, {allele.id: None}, source_version="116", access_date=date.today() + ) + session.commit() + + assert verdicts == {} + assert len(_all_rows_for(session, allele.id)) == 0 diff --git a/tests/lib/test_vrs_utils.py b/tests/lib/test_vrs_utils.py index f40dce97..3b279560 100644 --- a/tests/lib/test_vrs_utils.py +++ b/tests/lib/test_vrs_utils.py @@ -6,10 +6,12 @@ pytest.importorskip("ga4gh.vrs") +from ga4gh.core.models import iriReference from ga4gh.vrs.models import ( Allele, CisPhasedBlock, LiteralSequenceExpression, + Range, ReferenceLengthExpression, SequenceLocation, SequenceReference, @@ -192,3 +194,49 @@ def 
test_normalize_and_identify_coerces_rle_to_lse(monkeypatch): assert isinstance(result.state, LiteralSequenceExpression) assert result.id is not None and result.id.startswith("ga4gh:VA.") + + +# The unions below (location.sequenceReference, location.start, rle.length) carry IRI-reference +# and Range variants that a fully-resolved indel allele never has. _rle_to_lse and the RLE branch +# of normalize_and_identify guard the inlined-and-integer contract with asserts; these tests pin +# that the guards fire rather than letting an IRI/Range slip into the digest computation as a +# silent AttributeError or wrong sequence read. +_NEVER_READ = SimpleNamespace(get_sequence=lambda identifier, start, end: pytest.fail("proxy read despite bad input")) + + +def test_rle_to_lse_rejects_iri_sequence_reference(): + location = SequenceLocation(sequenceReference=iriReference("seqref:unresolved"), start=10, end=12) + rle = ReferenceLengthExpression(length=4, repeatSubunitLength=2) + + with pytest.raises(AssertionError): + _rle_to_lse(rle, location, _NEVER_READ) + + +def test_rle_to_lse_rejects_range_start(): + location = SequenceLocation(sequenceReference=SequenceReference(refgetAccession=_SQ), start=Range([10, 12]), end=14) + rle = ReferenceLengthExpression(length=4, repeatSubunitLength=2) + + with pytest.raises(AssertionError): + _rle_to_lse(rle, location, _NEVER_READ) + + +def test_rle_to_lse_rejects_range_length(): + location = SequenceLocation(sequenceReference=SequenceReference(refgetAccession=_SQ), start=10, end=12) + rle = ReferenceLengthExpression(length=Range([2, 4]), repeatSubunitLength=2) + + with pytest.raises(AssertionError): + _rle_to_lse(rle, location, _NEVER_READ) + + +def test_normalize_and_identify_rejects_iri_location_for_rle(monkeypatch): + # If normalization ever returned an RLE allele whose location is an unresolved IRI reference, + # _rle_to_lse could not read a sequence from it; the call-site assert must catch this before + # the digest is (mis)computed rather 
than crashing deeper with an AttributeError. + bad = Allele( + location=iriReference("loc:unresolved"), + state=ReferenceLengthExpression(length=2, repeatSubunitLength=2), + ) + monkeypatch.setattr(vrs_utils, "normalize", lambda allele, data_proxy: bad) + + with pytest.raises(AssertionError): + normalize_and_identify(bad, data_proxy=_NEVER_READ) diff --git a/tests/models/conftest.py b/tests/models/conftest.py new file mode 100644 index 00000000..5edfe06e --- /dev/null +++ b/tests/models/conftest.py @@ -0,0 +1,80 @@ +import pytest + +from mavedb.models.enums import JobStatus +from mavedb.models.experiment import Experiment +from mavedb.models.experiment_set import ExperimentSet +from mavedb.models.job_run import JobRun +from mavedb.models.score_set import ScoreSet +from mavedb.models.user import User +from mavedb.models.variant import Variant +from tests.helpers.constants import ( + TEST_EXPERIMENT, + TEST_EXPERIMENT_SET, + TEST_LICENSE, + TEST_MINIMAL_VARIANT, + TEST_SEQ_SCORESET, + TEST_USER, + VALID_EXPERIMENT_SET_URN, + VALID_EXPERIMENT_URN, + VALID_SCORE_SET_URN, +) + + +@pytest.fixture +def setup_lib_db_with_score_set(session, setup_lib_db): + """Build an experiment set, experiment, and score set on top of the base lib db (users/licenses).""" + user = session.query(User).filter(User.username == TEST_USER["username"]).first() + + experiment_set = ExperimentSet(**TEST_EXPERIMENT_SET, urn=VALID_EXPERIMENT_SET_URN) + experiment_set.created_by = user + experiment_set.modified_by = user + session.add(experiment_set) + session.commit() + session.refresh(experiment_set) + + experiment = Experiment(**TEST_EXPERIMENT, urn=VALID_EXPERIMENT_URN, experiment_set_id=experiment_set.id) + experiment.created_by = user + experiment.modified_by = user + session.add(experiment) + session.commit() + session.refresh(experiment) + + score_set_scaffold = TEST_SEQ_SCORESET.copy() + score_set_scaffold.pop("target_genes") + score_set = ScoreSet( + **score_set_scaffold, 
urn=VALID_SCORE_SET_URN, experiment_id=experiment.id, licence_id=TEST_LICENSE["id"] + ) + score_set.created_by = user + score_set.modified_by = user + session.add(score_set) + session.commit() + session.refresh(score_set) + + return score_set + + +@pytest.fixture +def setup_lib_db_with_variant(session, setup_lib_db_with_score_set): + """Add a single variant to the score set, for variant-subject event tests.""" + variant = Variant( + **TEST_MINIMAL_VARIANT, urn=f"{setup_lib_db_with_score_set.urn}#1", score_set_id=setup_lib_db_with_score_set.id + ) + session.add(variant) + session.commit() + session.refresh(variant) + + return variant + + +@pytest.fixture +def job_run(session): + """Create a persisted JobRun to anchor an event's provenance.""" + job = JobRun( + job_type="test_annotation_job", + job_function="test_function", + status=JobStatus.RUNNING, + ) + session.add(job) + session.commit() + session.refresh(job) + return job diff --git a/tests/models/test_annotation_event_model.py b/tests/models/test_annotation_event_model.py new file mode 100644 index 00000000..33653647 --- /dev/null +++ b/tests/models/test_annotation_event_model.py @@ -0,0 +1,103 @@ +# ruff: noqa: E402 + +import pytest + +pytest.importorskip("psycopg2") + +from sqlalchemy.exc import IntegrityError + +from mavedb.models.allele import Allele +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.disposition import Disposition + + +@pytest.fixture +def allele(session): + allele = Allele(vrs_digest="test-variant-event-allele-digest", level="genomic") + session.add(allele) + session.commit() + session.refresh(allele) + return allele + + +def _expect_rejected(session, event): + """Adding `event` must violate a CHECK constraint on flush.""" + session.add(event) + with pytest.raises(IntegrityError): + session.flush() + session.rollback() + + +class TestSubjectConstraint: + def 
test_variant_subject_event_inserts(self, session, setup_lib_db_with_variant, job_run): + event = AnnotationEvent( + annotation_type=AnnotationType.VRS_MAPPING, + variant_id=setup_lib_db_with_variant.id, + disposition=Disposition.PRESENT, + reason="mapped", + job_run_id=job_run.id, + score_set_id=setup_lib_db_with_variant.score_set_id, + ) + session.add(event) + session.commit() + assert event.id is not None + assert event.allele_id is None + + def test_allele_subject_event_inserts(self, session, allele, job_run): + event = AnnotationEvent( + annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY, + allele_id=allele.id, + disposition=Disposition.ABSENT, + reason="no_record", + source_version="4.1.0", + job_run_id=job_run.id, + ) + session.add(event) + session.commit() + assert event.id is not None + assert event.variant_id is None + + def test_variant_subject_type_with_allele_id_rejected(self, session, allele): + _expect_rejected( + session, + AnnotationEvent( + annotation_type=AnnotationType.VRS_MAPPING, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason="mapped", + ), + ) + + def test_allele_subject_type_with_variant_id_rejected(self, session, setup_lib_db_with_variant): + _expect_rejected( + session, + AnnotationEvent( + annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY, + variant_id=setup_lib_db_with_variant.id, + disposition=Disposition.PRESENT, + reason="created", + ), + ) + + def test_neither_subject_set_rejected(self, session): + _expect_rejected( + session, + AnnotationEvent( + annotation_type=AnnotationType.VRS_MAPPING, + disposition=Disposition.PRESENT, + reason="mapped", + ), + ) + + def test_both_subjects_set_rejected(self, session, setup_lib_db_with_variant, allele): + _expect_rejected( + session, + AnnotationEvent( + annotation_type=AnnotationType.VRS_MAPPING, + variant_id=setup_lib_db_with_variant.id, + allele_id=allele.id, + disposition=Disposition.PRESENT, + reason="mapped", + ), + ) diff --git 
a/tests/models/test_annotation_event_view.py b/tests/models/test_annotation_event_view.py new file mode 100644 index 00000000..14c74f80 --- /dev/null +++ b/tests/models/test_annotation_event_view.py @@ -0,0 +1,147 @@ +# ruff: noqa: E402 +"""Tests for the v_current_annotation_events view (current event per subject + type).""" + +import pytest + +pytest.importorskip("psycopg2") + +from sqlalchemy import select + +from mavedb.models.allele import Allele +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.annotation_event_view import CurrentAnnotationEventView +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.disposition import Disposition + + +def _allele(session, digest, level="genomic"): + allele = Allele(vrs_digest=digest, level=level, post_mapped={"type": "Allele"}) + session.add(allele) + session.commit() + session.refresh(allele) + return allele + + +def _event(session, job_run, annotation_type, disposition, reason, *, allele_id=None, variant_id=None, **kw): + event = AnnotationEvent( + annotation_type=annotation_type, + allele_id=allele_id, + variant_id=variant_id, + disposition=disposition, + reason=reason, + job_run_id=job_run.id, + **kw, + ) + session.add(event) + session.commit() + session.refresh(event) + return event + + +def _view_rows(session, **filters): + stmt = select(CurrentAnnotationEventView) + for col, val in filters.items(): + stmt = stmt.where(getattr(CurrentAnnotationEventView, col) == val) + return list(session.scalars(stmt).all()) + + +def test_returns_only_latest_event_per_subject_and_type(session, job_run): + allele = _allele(session, "view-latest") + _event( + session, job_run, AnnotationType.GNOMAD_ALLELE_FREQUENCY, Disposition.ABSENT, "no_record", allele_id=allele.id + ) + latest = _event( + session, job_run, AnnotationType.GNOMAD_ALLELE_FREQUENCY, Disposition.PRESENT, "created", allele_id=allele.id + ) + + rows = _view_rows(session, allele_id=allele.id, 
annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY.value) + + assert [r.id for r in rows] == [latest.id] + assert rows[0].disposition == Disposition.PRESENT + + +def test_clinvar_is_multi_live_one_row_per_release(session, job_run): + allele = _allele(session, "view-clinvar") + e_2025 = _event( + session, + job_run, + AnnotationType.CLINVAR_CONTROL, + Disposition.PRESENT, + "created", + allele_id=allele.id, + source_version="01_2025", + ) + _event( + session, + job_run, + AnnotationType.CLINVAR_CONTROL, + Disposition.PRESENT, + "created", + allele_id=allele.id, + source_version="01_2026", + ) + e_2026_latest = _event( + session, + job_run, + AnnotationType.CLINVAR_CONTROL, + Disposition.PRESENT, + "superseded", + allele_id=allele.id, + source_version="01_2026", + ) + + rows = _view_rows(session, allele_id=allele.id, annotation_type=AnnotationType.CLINVAR_CONTROL.value) + + by_version = {r.source_version: r.id for r in rows} + assert by_version == {"01_2025": e_2025.id, "01_2026": e_2026_latest.id} + + +def test_non_clinvar_collapses_across_versions(session, job_run): + """gnomAD/VEP supersede to a single current state: a re-fetch at a new source_version yields one + row, not one per version (the CASE folds source_version in only for ClinVar).""" + allele = _allele(session, "view-gnomad-version") + _event( + session, + job_run, + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + Disposition.PRESENT, + "created", + allele_id=allele.id, + source_version="4.0.0", + ) + latest = _event( + session, + job_run, + AnnotationType.GNOMAD_ALLELE_FREQUENCY, + Disposition.PRESENT, + "reconfirmed", + allele_id=allele.id, + source_version="4.1.0", + ) + + rows = _view_rows(session, allele_id=allele.id, annotation_type=AnnotationType.GNOMAD_ALLELE_FREQUENCY.value) + + assert [r.id for r in rows] == [latest.id] + assert rows[0].source_version == "4.1.0" + + +def test_variant_subject_events_present_and_keyed_by_variant(session, setup_lib_db_with_variant, job_run): + variant = 
setup_lib_db_with_variant + _event(session, job_run, AnnotationType.VRS_MAPPING, Disposition.FAILED, "failed", variant_id=variant.id) + latest = _event(session, job_run, AnnotationType.VRS_MAPPING, Disposition.PRESENT, "mapped", variant_id=variant.id) + + rows = _view_rows(session, variant_id=variant.id, annotation_type=AnnotationType.VRS_MAPPING.value) + + assert [r.id for r in rows] == [latest.id] + assert rows[0].allele_id is None + + +def test_distinct_subjects_yield_distinct_rows(session, job_run): + a1 = _allele(session, "view-a1") + a2 = _allele(session, "view-a2") + _event(session, job_run, AnnotationType.CLINGEN_ALLELE_ID, Disposition.PRESENT, "created", allele_id=a1.id) + _event(session, job_run, AnnotationType.CLINGEN_ALLELE_ID, Disposition.NOT_APPLICABLE, "no_hgvs", allele_id=a2.id) + + rows = _view_rows(session, annotation_type=AnnotationType.CLINGEN_ALLELE_ID.value) + + assert {r.allele_id for r in rows} == {a1.id, a2.id} diff --git a/tests/routers/conftest.py b/tests/routers/conftest.py index ba34c548..0971bde4 100644 --- a/tests/routers/conftest.py +++ b/tests/routers/conftest.py @@ -3,7 +3,7 @@ import pytest -from mavedb.models.clinical_control import ClinicalControl +from mavedb.models.clinical_control import ClinvarControl from mavedb.models.contributor import Contributor from mavedb.models.controlled_keyword import ControlledKeyword from mavedb.models.enums.user_role import UserRole @@ -52,8 +52,8 @@ def setup_router_db(session): db.add(License(**TEST_INACTIVE_LICENSE)) db.add(License(**EXTRA_LICENSE)) db.add(Contributor(**EXTRA_CONTRIBUTOR)) - db.add(ClinicalControl(**TEST_CLINVAR_CONTROL)) - db.add(ClinicalControl(**TEST_GENERIC_CLINICAL_CONTROL)) + db.add(ClinvarControl(**TEST_CLINVAR_CONTROL)) + db.add(ClinvarControl(**TEST_GENERIC_CLINICAL_CONTROL)) db.add(GnomADVariant(**TEST_GNOMAD_VARIANT)) db.bulk_save_objects([ControlledKeyword(**keyword_obj) for keyword_obj in TEST_DB_KEYWORDS]) db.commit() diff --git a/tests/worker/jobs/conftest.py 
b/tests/worker/jobs/conftest.py index eac38086..873d09ff 100644 --- a/tests/worker/jobs/conftest.py +++ b/tests/worker/jobs/conftest.py @@ -1,9 +1,13 @@ import pytest +from sqlalchemy import select +from mavedb.models.allele import Allele from mavedb.models.enums.job_pipeline import DependencyType from mavedb.models.job_dependency import JobDependency from mavedb.models.job_run import JobRun from mavedb.models.mapped_variant import MappedVariant +from mavedb.models.mapping_record import MappingRecord +from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.pipeline import Pipeline from mavedb.models.score_set import ScoreSet from mavedb.models.variant import Variant @@ -281,6 +285,87 @@ def setup_sample_variants_with_caid( return variant, mapped_variant +@pytest.fixture +def setup_sample_alleles_with_caid(session, with_populated_domain_data, sample_link_gnomad_variants_run): + """Set up new-model rows (Variant + live MappingRecord + authoritative MappingRecordAllele + Allele) + for the gnomAD linkage job. The allele carries the CAID matched by the mocked Athena row, and the + allele is the authoritative measurement for the variant so the bandaid seam writes its VAS row. 
+ """ + score_set = session.get(ScoreSet, sample_link_gnomad_variants_run.job_params["score_set_id"]) + + variant = Variant( + urn="urn:variant:test-variant-with-allele-caid", + score_set_id=score_set.id, + hgvs_nt="NM_000000.1:c.1A>G", + hgvs_pro="NP_000000.1:p.Met1Val", + data={"hgvs_c": "NM_000000.1:c.1A>G", "hgvs_p": "NP_000000.1:p.Met1Val"}, + ) + allele = Allele( + vrs_digest="test-allele-vrs-digest", + level="genomic", + clingen_allele_id=VALID_CAID, + post_mapped={"type": "Allele", "expressions": [{"value": "NM_000000.1:c.1A>G", "syntax": "hgvs.c"}]}, + ) + session.add_all([variant, allele]) + session.commit() + + mapping_record = MappingRecord( + variant_id=variant.id, + assay_level="genomic", + mapping_api_version="pytest.0.0", + ) + session.add(mapping_record) + session.commit() + + session.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=allele.id, + is_authoritative=True, + ) + ) + session.commit() + return variant, allele + + +@pytest.fixture +def setup_rt_derived_allele_with_caid(session, setup_sample_alleles_with_caid): + """Add a NON-authoritative (RT-derived) allele to the variant's current mapping record, carrying + the CAID the mocked gnomAD row matches. The authoritative allele is given a CAID with no gnomAD + match, so only the RT-derived allele can link. This isolates the requirement that gnomAD linkage + must cover the full allele set (authoritative + RT-derived), not just authoritative links — for + protein/coding score sets the genomic allele gnomAD knows is the RT-derived one. + """ + variant, authoritative_allele = setup_sample_alleles_with_caid + + # Authoritative allele's CAID intentionally has no gnomAD match, so it cannot be what links. 
+ authoritative_allele.clingen_allele_id = "CA_NO_GNOMAD_MATCH" + session.add(authoritative_allele) + + mapping_record = session.scalars( + select(MappingRecord).where(MappingRecord.variant_id == variant.id, MappingRecord.current) + ).one() + + rt_allele = Allele( + vrs_digest="test-rt-derived-allele-vrs-digest", + level="genomic", + clingen_allele_id=VALID_CAID, + post_mapped={"type": "Allele", "expressions": [{"value": "NC_000001.11:g.12345G>A", "syntax": "hgvs.g"}]}, + ) + session.add(rt_allele) + session.commit() + + session.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=rt_allele.id, + is_authoritative=False, + ) + ) + session.commit() + return variant, authoritative_allele, rt_allele + + ## Uniprot Job Fixtures ## @@ -926,198 +1011,6 @@ def with_cleanup_job(session, sample_cleanup_job_run): session.commit() -## HGVS Population Job Fixtures ## - - -@pytest.fixture -def populate_hgvs_sample_params(with_populated_domain_data, sample_score_set): - """Provide sample parameters for populate_hgvs_for_score_set job.""" - - return { - "correlation_id": "sample-correlation-id", - "score_set_id": sample_score_set.id, - } - - -@pytest.fixture -def sample_populate_hgvs_pipeline(): - """Create a pipeline instance for populate_hgvs_for_score_set job.""" - - return Pipeline( - urn="test:populate_hgvs_pipeline", - name="Populate HGVS Pipeline", - ) - - -@pytest.fixture -def sample_populate_hgvs_run(populate_hgvs_sample_params): - """Create a JobRun instance for populate_hgvs_for_score_set job.""" - - return JobRun( - urn="test:populate_hgvs_for_score_set", - job_type="populate_hgvs_for_score_set", - job_function="populate_hgvs_for_score_set", - max_retries=3, - retry_count=0, - job_params=populate_hgvs_sample_params, - ) - - -@pytest.fixture -def with_populate_hgvs_job(session, sample_populate_hgvs_run): - """Add a populate_hgvs_for_score_set job run to the session.""" - - session.add(sample_populate_hgvs_run) - session.commit() - - 
-@pytest.fixture -def with_populate_hgvs_pipeline(session, sample_populate_hgvs_pipeline): - """Add a populate_hgvs pipeline to the session.""" - - session.add(sample_populate_hgvs_pipeline) - session.commit() - - -@pytest.fixture -def sample_populate_hgvs_run_pipeline( - session, - with_populate_hgvs_job, - with_populate_hgvs_pipeline, - sample_populate_hgvs_run, - sample_populate_hgvs_pipeline, -): - """Provide a context with a populate_hgvs job run and pipeline.""" - - sample_populate_hgvs_run.pipeline_id = sample_populate_hgvs_pipeline.id - session.commit() - return sample_populate_hgvs_run - - -@pytest.fixture -def setup_sample_variants_with_caid_for_hgvs( - session, with_populated_domain_data, mock_worker_ctx, sample_populate_hgvs_run -): - """Setup variants and mapped variants in the database for HGVS population testing.""" - score_set = session.get(ScoreSet, sample_populate_hgvs_run.job_params["score_set_id"]) - - variant = Variant( - urn="urn:variant:test-variant-with-caid-hgvs", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.1A>G", - hgvs_pro="NP_000000.1:p.Met1Val", - data={"hgvs_c": "NM_000000.1:c.1A>G", "hgvs_p": "NP_000000.1:p.Met1Val"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id=VALID_CAID, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - return variant, mapped_variant - - -# --- Variant Translation Fixtures --- - - -@pytest.fixture -def populate_variant_translations_sample_params(with_populated_domain_data, sample_score_set): - """Provide sample parameters for populate_variant_translations_for_score_set job.""" - - return { - "correlation_id": "sample-correlation-id", - "score_set_id": sample_score_set.id, - } - - -@pytest.fixture -def sample_populate_variant_translations_pipeline(): - """Create a pipeline instance for populate_variant_translations_for_score_set job.""" - 
- return Pipeline( - urn="test:populate_variant_translations_pipeline", - name="Populate Variant Translations Pipeline", - ) - - -@pytest.fixture -def sample_populate_variant_translations_run(populate_variant_translations_sample_params): - """Create a JobRun instance for populate_variant_translations_for_score_set job.""" - - return JobRun( - urn="test:populate_variant_translations_for_score_set", - job_type="populate_variant_translations_for_score_set", - job_function="populate_variant_translations_for_score_set", - max_retries=3, - retry_count=0, - job_params=populate_variant_translations_sample_params, - ) - - -@pytest.fixture -def with_populate_variant_translations_job(session, sample_populate_variant_translations_run): - """Add a populate_variant_translations_for_score_set job run to the session.""" - - session.add(sample_populate_variant_translations_run) - session.commit() - - -@pytest.fixture -def with_populate_variant_translations_pipeline(session, sample_populate_variant_translations_pipeline): - """Add a populate_variant_translations pipeline to the session.""" - - session.add(sample_populate_variant_translations_pipeline) - session.commit() - - -@pytest.fixture -def sample_populate_variant_translations_run_pipeline( - session, - with_populate_variant_translations_job, - with_populate_variant_translations_pipeline, - sample_populate_variant_translations_run, - sample_populate_variant_translations_pipeline, -): - """Provide a context with a populate_variant_translations job run and pipeline.""" - - sample_populate_variant_translations_run.pipeline_id = sample_populate_variant_translations_pipeline.id - session.commit() - return sample_populate_variant_translations_run - - -@pytest.fixture -def setup_sample_variants_with_caid_for_translation( - session, with_populated_domain_data, mock_worker_ctx, sample_populate_variant_translations_run -): - """Setup variants and mapped variants in the database for variant translation testing.""" - score_set = 
session.get(ScoreSet, sample_populate_variant_translations_run.job_params["score_set_id"]) - - variant = Variant( - urn="urn:variant:test-variant-with-caid-translation", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.1A>G", - hgvs_pro="NP_000000.1:p.Met1Val", - data={"hgvs_c": "NM_000000.1:c.1A>G", "hgvs_p": "NP_000000.1:p.Met1Val"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id=VALID_CAID, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - return variant, mapped_variant - - ## ClinGen Cache Warming Job Fixtures ## @@ -1255,8 +1148,14 @@ def sample_populate_vep_run_pipeline( @pytest.fixture -def setup_sample_variants_for_vep(session, with_populated_domain_data, mock_worker_ctx, sample_populate_vep_run): - """Setup a variant and mapped variant with hgvs_assay_level for VEP testing.""" +def setup_sample_alleles_for_vep(session, with_populated_domain_data, mock_worker_ctx, sample_populate_vep_run): + """Set up new-model rows (Variant + live MappingRecord + authoritative MappingRecordAllele + Allele) + for the VEP consequence job. The allele carries an HGVS the job submits to VEP and is the + authoritative measurement for the variant, so the bandaid seam writes its per-variant VAS row. + + The HGVS lives on ``Allele.hgvs_c`` (a coding HGVS VEP resolves directly) — the new job reads its + submission string from the allele, not from ``MappedVariant.hgvs_assay_level``. 
+ """ score_set = session.get(ScoreSet, sample_populate_vep_run.job_params["score_set_id"]) variant = Variant( @@ -1266,28 +1165,42 @@ def setup_sample_variants_for_vep(session, with_populated_domain_data, mock_work hgvs_pro="NP_009225.1:p.Cys2Tyr", data={"hgvs_c": "NM_007294.4:c.5A>G", "hgvs_p": "NP_009225.1:p.Cys2Tyr"}, ) - session.add(variant) + allele = Allele( + vrs_digest="test-vep-allele-vrs-digest", + level="cdna", + hgvs_c="NM_007294.4:c.5A>G", + post_mapped={"type": "Allele", "expressions": [{"value": "NM_007294.4:c.5A>G", "syntax": "hgvs.c"}]}, + ) + session.add_all([variant, allele]) session.commit() - mapped_variant = MappedVariant( + + mapping_record = MappingRecord( variant_id=variant.id, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - post_mapped={"type": "Allele", "expressions": [{"value": "NM_007294.4:c.5A>G", "syntax": "hgvs.c"}]}, - hgvs_assay_level="NM_007294.4:c.5A>G", + assay_level="cdna", + mapping_api_version="pytest.0.0", ) - session.add(mapped_variant) + session.add(mapping_record) session.commit() - return variant, mapped_variant + + session.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=allele.id, + is_authoritative=True, + ) + ) + session.commit() + return variant, allele @pytest.fixture -def setup_sample_protein_variant_for_vep(session, with_populated_domain_data, mock_worker_ctx, sample_populate_vep_run): - """Setup a protein HGVS variant (NP_ accession) that VEP cannot resolve directly. +def setup_sample_protein_allele_for_vep(session, with_populated_domain_data, mock_worker_ctx, sample_populate_vep_run): + """Set up an allele whose only HGVS is a protein HGVS (NP_ accession) that VEP cannot resolve + directly. - VEP's /vep/human/hgvs endpoint does not return results for protein HGVS strings like - NP_009225.1:p.Val1696His, so these must be recoded via Variant Recoder first. This fixture - exercises the recoder fallback path end-to-end. 
+ VEP's /vep/human/hgvs endpoint returns no consequence for protein HGVS like + NP_009225.1:p.Val1696His, so the job must fall back to Variant Recoder. ``hgvs_g``/``hgvs_c`` are + left unset so the VEP payload resolves to ``hgvs_p``, exercising the recoder fallback path. """ score_set = session.get(ScoreSet, sample_populate_vep_run.job_params["score_set_id"]) @@ -1297,17 +1210,61 @@ def setup_sample_protein_variant_for_vep(session, with_populated_domain_data, mo hgvs_pro="NP_009225.1:p.Val1696His", data={"hgvs_p": "NP_009225.1:p.Val1696His"}, ) - session.add(variant) + allele = Allele( + vrs_digest="test-vep-protein-allele-vrs-digest", + level="protein", + hgvs_p="NP_009225.1:p.Val1696His", + post_mapped={"type": "Allele", "expressions": [{"value": "NP_009225.1:p.Val1696His", "syntax": "hgvs.p"}]}, + ) + session.add_all([variant, allele]) session.commit() - mapped_variant = MappedVariant( + mapping_record = MappingRecord( variant_id=variant.id, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - post_mapped={"type": "Allele", "expressions": [{"value": "NP_009225.1:p.Val1696His", "syntax": "hgvs.p"}]}, - hgvs_assay_level="NP_009225.1:p.Val1696His", + assay_level="protein", + mapping_api_version="pytest.0.0", ) - session.add(mapped_variant) + session.add(mapping_record) session.commit() - return variant, mapped_variant + + session.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=allele.id, + is_authoritative=True, + ) + ) + session.commit() + return variant, allele + + +@pytest.fixture +def setup_rt_derived_allele_for_vep(session, setup_sample_alleles_for_vep): + """Add a NON-authoritative (RT-derived) allele to the variant's current mapping record, carrying a + genomic HGVS of its own. Isolates the requirement that VEP linkage covers the full allele set + (authoritative + RT-derived), while the per-variant VAS fan-out stays authoritative-only. 
+ """ + variant, authoritative_allele = setup_sample_alleles_for_vep + + mapping_record = session.scalars( + select(MappingRecord).where(MappingRecord.variant_id == variant.id, MappingRecord.current) + ).one() + + rt_allele = Allele( + vrs_digest="test-rt-derived-vep-allele-vrs-digest", + level="genomic", + hgvs_g="NC_000017.11:g.43124027T>C", + post_mapped={"type": "Allele", "expressions": [{"value": "NC_000017.11:g.43124027T>C", "syntax": "hgvs.g"}]}, + ) + session.add(rt_allele) + session.commit() + + session.add( + MappingRecordAllele( + mapping_record_id=mapping_record.id, + allele_id=rt_allele.id, + is_authoritative=False, + ) + ) + session.commit() + return variant, authoritative_allele, rt_allele diff --git a/tests/worker/jobs/external_services/network/test_clinvar.py b/tests/worker/jobs/external_services/network/test_clinvar.py index 881c9394..a23da5e2 100644 --- a/tests/worker/jobs/external_services/network/test_clinvar.py +++ b/tests/worker/jobs/external_services/network/test_clinvar.py @@ -6,11 +6,12 @@ from sqlalchemy import select -from mavedb.models.clinical_control import ClinicalControl +from mavedb.models.clinical_control import ClinvarControl from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus, JobStatus -from mavedb.models.variant_annotation_status import VariantAnnotationStatus -from mavedb.worker.jobs.external_services.clinvar import generate_clinvar_versions +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.job_pipeline import JobStatus +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.worker.jobs.external_services.clinvar import _generate_clinvar_versions pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") @@ -26,7 +27,7 @@ async def test_refresh_clinvar_controls_e2e( arq_redis, arq_worker, standalone_worker_context, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, 
with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, ): @@ -37,33 +38,34 @@ async def test_refresh_clinvar_controls_e2e( # Verify that clinical controls were added successfully — one row per ClinVar version # that contains the variant, so there may be more than one. - clinical_controls = session.scalars(select(ClinicalControl)).all() + clinical_controls = session.scalars(select(ClinvarControl)).all() assert len(clinical_controls) >= 1 assert all(cc.db_identifier == "3045425" for cc in clinical_controls) - # Verify that at least one SUCCESS annotation was recorded for the variant. - # The job processes 12 ClinVar versions; versions without the variant produce - # SKIPPED annotations, so only filtering for SUCCESS gives a stable assertion. - success_annotations = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == AnnotationType.CLINVAR_CONTROL, - VariantAnnotationStatus.status == AnnotationStatus.SUCCESS, + # Verify that at least one present event was recorded for the allele. The job processes one + # event per ClinVar version; versions without the allele produce absent events, so filtering + # for present gives a stable assertion. Events are allele-keyed (no variant_id). + present_events = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.annotation_type == AnnotationType.CLINVAR_CONTROL, + AnnotationEvent.disposition == Disposition.PRESENT, ) ).all() - assert len(success_annotations) >= 1 + assert len(present_events) >= 1 + assert all(e.variant_id is None and e.allele_id is not None for e in present_events) - # Verify that SKIPPED annotations are produced for versions where the variant - # is absent — expected for any of the 12 ClinVar versions that don't contain it. 
- skipped_annotations = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == AnnotationType.CLINVAR_CONTROL, - VariantAnnotationStatus.status == AnnotationStatus.SKIPPED, + # Versions where the allele's resolved ClinVar id is absent from that release's snapshot + # produce an absent event — expected for any version that doesn't contain it. + absent_events = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.annotation_type == AnnotationType.CLINVAR_CONTROL, + AnnotationEvent.disposition == Disposition.ABSENT, ) ).all() - assert len(skipped_annotations) >= 1 + assert len(absent_events) >= 1 - # Total annotations should equal the number of ClinVar versions processed. - assert len(success_annotations) + len(skipped_annotations) == len(generate_clinvar_versions()) + # Total events should equal the number of ClinVar versions processed (one allele, one per version). + assert len(present_events) + len(absent_events) == len(_generate_clinvar_versions()) # Verify that the job run was completed successfully session.refresh(sample_refresh_clinvar_controls_job_run) diff --git a/tests/worker/jobs/external_services/network/test_hgvs.py b/tests/worker/jobs/external_services/network/test_hgvs.py deleted file mode 100644 index 56f100e0..00000000 --- a/tests/worker/jobs/external_services/network/test_hgvs.py +++ /dev/null @@ -1,54 +0,0 @@ -# ruff: noqa: E402 - -import pytest - -pytest.importorskip("arq") - -from sqlalchemy import select - -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus, JobStatus, PipelineStatus -from mavedb.models.variant_annotation_status import VariantAnnotationStatus - -pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") - - -@pytest.mark.asyncio -@pytest.mark.integration -@pytest.mark.network -@pytest.mark.slow -class TestE2EPopulateHgvsForScoreSet: - """End-to-end test for HGVS population against the real 
ClinGen API.""" - - async def test_populate_hgvs_e2e( - self, - session, - arq_redis, - arq_worker, - sample_populate_hgvs_run_pipeline, - sample_populate_hgvs_pipeline, - setup_sample_variants_with_caid_for_hgvs, - ): - """Enqueue the HGVS population job, run the worker, and verify HGVS fields are populated.""" - _, mapped_variant = setup_sample_variants_with_caid_for_hgvs - - await arq_redis.enqueue_job("populate_hgvs_for_score_set", sample_populate_hgvs_run_pipeline.id) - await arq_worker.async_run() - await arq_worker.run_check() - - session.refresh(sample_populate_hgvs_run_pipeline) - assert sample_populate_hgvs_run_pipeline.status == JobStatus.SUCCEEDED - - session.refresh(sample_populate_hgvs_pipeline) - assert sample_populate_hgvs_pipeline.status == PipelineStatus.SUCCEEDED - - session.refresh(mapped_variant) - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.MAPPED_HGVS, - VariantAnnotationStatus.current.is_(True), - ) - ).one_or_none() - assert annotation is not None - assert annotation.status in (AnnotationStatus.SUCCESS, AnnotationStatus.SKIPPED) diff --git a/tests/worker/jobs/external_services/network/test_variant_translations.py b/tests/worker/jobs/external_services/network/test_variant_translations.py deleted file mode 100644 index b45087dd..00000000 --- a/tests/worker/jobs/external_services/network/test_variant_translations.py +++ /dev/null @@ -1,56 +0,0 @@ -# ruff: noqa: E402 - -import pytest - -pytest.importorskip("arq") - -from sqlalchemy import select - -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus, JobStatus, PipelineStatus -from mavedb.models.variant_annotation_status import VariantAnnotationStatus - -pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") - - -@pytest.mark.asyncio -@pytest.mark.integration 
-@pytest.mark.network -@pytest.mark.slow -class TestE2EPopulateVariantTranslationsForScoreSet: - """End-to-end test for variant translation population against the real ClinGen API.""" - - async def test_populate_variant_translations_e2e( - self, - session, - arq_redis, - arq_worker, - sample_populate_variant_translations_run_pipeline, - sample_populate_variant_translations_pipeline, - setup_sample_variants_with_caid_for_translation, - ): - """Enqueue the variant translation job, run the worker, and verify translations are created.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - - await arq_redis.enqueue_job( - "populate_variant_translations_for_score_set", - sample_populate_variant_translations_run_pipeline.id, - ) - await arq_worker.async_run() - await arq_worker.run_check() - - session.refresh(sample_populate_variant_translations_run_pipeline) - assert sample_populate_variant_translations_run_pipeline.status == JobStatus.SUCCEEDED - - session.refresh(sample_populate_variant_translations_pipeline) - assert sample_populate_variant_translations_pipeline.status == PipelineStatus.SUCCEEDED - - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VARIANT_TRANSLATION, - VariantAnnotationStatus.current.is_(True), - ) - ).one_or_none() - assert annotation is not None - assert annotation.status in (AnnotationStatus.SUCCESS, AnnotationStatus.SKIPPED) diff --git a/tests/worker/jobs/external_services/network/test_vep.py b/tests/worker/jobs/external_services/network/test_vep.py index 61071fc4..61ba8dfd 100644 --- a/tests/worker/jobs/external_services/network/test_vep.py +++ b/tests/worker/jobs/external_services/network/test_vep.py @@ -7,8 +7,10 @@ from sqlalchemy import select from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus, 
JobStatus, PipelineStatus -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.vep_allele_consequence import VepAlleleConsequence pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") @@ -27,10 +29,10 @@ async def test_populate_vep_e2e( arq_worker, sample_populate_vep_run_pipeline, sample_populate_vep_pipeline, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """Enqueue the VEP job, run the worker, and verify consequence and annotation are populated.""" - _, mapped_variant = setup_sample_variants_for_vep + """Enqueue the VEP job, run the worker, and verify the allele's consequence and annotation.""" + variant, allele = setup_sample_alleles_for_vep await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run_pipeline.id) await arq_worker.async_run() @@ -42,18 +44,23 @@ async def test_populate_vep_e2e( session.refresh(sample_populate_vep_pipeline) assert sample_populate_vep_pipeline.status == PipelineStatus.SUCCEEDED - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence is not None - assert mapped_variant.vep_access_date is not None - - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - VariantAnnotationStatus.current.is_(True), + live = session.scalars( + select(VepAlleleConsequence).where( + VepAlleleConsequence.allele_id == allele.id, + VepAlleleConsequence.current, + ) + ).one() + assert live.functional_consequence is not None + assert live.access_date is not None + + # Events are allele-keyed now; the variant resolves its status through the live link. 
+ event = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.allele_id == allele.id, + AnnotationEvent.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, ) ).one() - assert annotation.status == AnnotationStatus.SUCCESS + assert event.disposition == Disposition.PRESENT async def test_populate_vep_e2e_with_recoder_path( self, @@ -62,7 +69,7 @@ async def test_populate_vep_e2e_with_recoder_path( arq_worker, sample_populate_vep_run_pipeline, sample_populate_vep_pipeline, - setup_sample_protein_variant_for_vep, + setup_sample_protein_allele_for_vep, ): """VEP job uses Variant Recoder for a protein HGVS (NP_ accession) that VEP cannot resolve directly. @@ -70,7 +77,7 @@ async def test_populate_vep_e2e_with_recoder_path( does not return a consequence for. The job must fall back to Variant Recoder, recode it to a genomic HGVS, and then re-query VEP with the recoded string. """ - _, mapped_variant = setup_sample_protein_variant_for_vep + variant, allele = setup_sample_protein_allele_for_vep await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run_pipeline.id) await arq_worker.async_run() @@ -82,15 +89,20 @@ async def test_populate_vep_e2e_with_recoder_path( session.refresh(sample_populate_vep_pipeline) assert sample_populate_vep_pipeline.status == PipelineStatus.SUCCEEDED - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence is not None - assert mapped_variant.vep_access_date is not None - - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - VariantAnnotationStatus.current.is_(True), + live = session.scalars( + select(VepAlleleConsequence).where( + VepAlleleConsequence.allele_id == allele.id, + VepAlleleConsequence.current, + ) + ).one() + assert live.functional_consequence is not None + assert live.access_date is not 
None + + # Events are allele-keyed now; the variant resolves its status through the live link. + event = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.allele_id == allele.id, + AnnotationEvent.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, ) ).one() - assert annotation.status == AnnotationStatus.SUCCESS + assert event.disposition == Disposition.PRESENT diff --git a/tests/worker/jobs/external_services/test_clingen.py b/tests/worker/jobs/external_services/test_clingen.py index 76b1b0b2..3b44a35c 100644 --- a/tests/worker/jobs/external_services/test_clingen.py +++ b/tests/worker/jobs/external_services/test_clingen.py @@ -16,7 +16,7 @@ from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.variant import Variant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.annotation_event import AnnotationEvent from mavedb.worker.jobs.external_services.clingen import ( submit_score_set_mappings_to_car, submit_score_set_mappings_to_ldh, @@ -153,14 +153,14 @@ async def test_submit_score_set_mappings_to_car_no_registered_alleles( alleles = session.scalars(select(Allele).where(Allele.clingen_allele_id.isnot(None))).all() assert len(alleles) == 0 - # Verify annotation statuses were rendered as failed — 4 variants, all failed - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + # 4 variants dedup to 1 allele → one allele-keyed event, failed (no CAR response). 
+ events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.annotation_type == "clingen_allele_id" + assert len(events) == 1 + for event in events: + assert event.disposition == "failed" + assert event.allele_id is not None async def test_submit_score_set_mappings_to_car_all_car_errors( self, @@ -221,16 +221,16 @@ async def test_submit_score_set_mappings_to_car_all_car_errors( alleles = session.scalars(select(Allele).where(Allele.clingen_allele_id.isnot(None))).all() assert len(alleles) == 0 - # 1 allele failed → all 4 variant annotations are failed - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + # 1 allele, rejected by CAR → one failed event (reason=service_rejected). + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.annotation_type == "clingen_allele_id" + assert len(events) == 1 + for event in events: + assert event.disposition == "failed" + assert event.reason == "service_rejected" - async def test_submit_score_set_mappings_to_car_derived_allele_no_duplicate_annotation( + async def test_submit_score_set_mappings_to_car_event_per_allele( self, mock_worker_ctx, session, @@ -243,8 +243,8 @@ async def test_submit_score_set_mappings_to_car_derived_allele_no_duplicate_anno dummy_variant_creation_job_run, dummy_variant_mapping_job_run, ): - """A variant linked to an authoritative AND a derived allele gets exactly one VAS row (from the - authoritative link), while the derived allele is still registered with a CAID.""" + """Each allele — authoritative and derived — gets exactly one allele-keyed event; 
there is no + per-variant fan-out. Both alleles are registered with a CAID.""" await create_mappings_in_score_set( session, mock_s3_client, @@ -300,18 +300,16 @@ def fake_dispatch(hgvs_list): assert result.status == JobStatus.SUCCEEDED - # Exactly one current VAS row per variant (4 total) — no duplicate from the derived allele. - current_statuses = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == "clingen_allele_id", - VariantAnnotationStatus.current.is_(True), - ) + # One event per allele (2 alleles → 2 events), each keyed on its allele, not fanned per-variant. + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(current_statuses) == 4 - assert len({s.variant_id for s in current_statuses}) == 4 + assert len(events) == 2 + assert all(e.variant_id is None for e in events) + assert {e.allele_id for e in events} == {authoritative_allele.id, derived_allele.id} + assert all(e.disposition == "present" for e in events) - # Both alleles registered: registration breadth is preserved even though the derived allele - # produced no VAS row. + # Both alleles registered: registration breadth covers the derived allele too. session.refresh(authoritative_allele) session.refresh(derived_allele) assert authoritative_allele.clingen_allele_id is not None @@ -370,13 +368,13 @@ async def test_submit_score_set_mappings_to_car_response_count_mismatch( # No CAID written — neither of the returned values is trusted. 
assert len(session.scalars(select(Allele).where(Allele.clingen_allele_id.isnot(None))).all()) == 0 - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.failure_category == "external_api_error" + assert len(events) == 1 + for event in events: + assert event.disposition == "failed" + assert event.reason == "api_error" async def test_submit_score_set_mappings_to_car_malformed_response( self, @@ -425,15 +423,15 @@ async def test_submit_score_set_mappings_to_car_malformed_response( assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.FAILED - # No CAID assigned, and all 4 variant annotations are failed as rejected. + # No CAID assigned; the one allele's event is failed (reason=malformed_response). 
assert len(session.scalars(select(Allele).where(Allele.clingen_allele_id.isnot(None))).all()) == 0 - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.failure_category == "external_service_rejected" + assert len(events) == 1 + for event in events: + assert event.disposition == "failed" + assert event.reason == "malformed_response" async def test_submit_score_set_mappings_to_car_repeated_hgvs( self, @@ -492,14 +490,14 @@ async def test_submit_score_set_mappings_to_car_repeated_hgvs( assert len(alleles) == 1 assert alleles[0].clingen_allele_id == "CA_DUPLICATE" - # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + # 1 allele → one present event. 
+ events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" - assert ann.annotation_type == "clingen_allele_id" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" + assert event.allele_id is not None async def test_submit_score_set_mappings_to_car_partial_failure( self, @@ -562,13 +560,13 @@ async def test_submit_score_set_mappings_to_car_partial_failure( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # All 4 variant annotations succeeded - success_annotations = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == "clingen_allele_id", - VariantAnnotationStatus.status == "success", + present_events = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.annotation_type == "clingen_allele_id", + AnnotationEvent.disposition == "present", ) ).all() - assert len(success_annotations) == 4 + assert len(present_events) == 1 async def test_submit_score_set_mappings_to_car_hgvs_not_found( self, @@ -614,13 +612,13 @@ async def test_submit_score_set_mappings_to_car_hgvs_not_found( assert len(alleles) == 0 # Verify annotation statuses were rendered as failed — 4 variants, all failed - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.annotation_type == "clingen_allele_id" + assert len(events) == 1 + for event in events: + assert event.disposition == "not_applicable" + assert event.reason == "no_hgvs" async def 
test_submit_score_set_mappings_to_car_propagates_exception( self, @@ -724,13 +722,13 @@ async def test_submit_score_set_mappings_to_car_success( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" - assert ann.annotation_type == "clingen_allele_id" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" + assert event.allele_id is not None async def test_submit_score_set_mappings_to_car_preexisting( self, @@ -780,14 +778,14 @@ async def test_submit_score_set_mappings_to_car_preexisting( assert result.data["already_registered_allele_count"] == 1 assert result.data["submitted_allele_count"] == 0 - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" - assert ann.annotation_metadata["registration_source"] == "preexisting" - assert ann.annotation_metadata["clingen_allele_id"] == "CA_PRIOR" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" + assert event.reason == "preexisting" + assert event.event_metadata["clingen_allele_id"] == "CA_PRIOR" async def test_submit_score_set_mappings_to_car_force_reregister_same_caid( self, @@ -843,13 +841,13 @@ async def test_submit_score_set_mappings_to_car_force_reregister_same_caid( assert 
result.data["registered_allele_count"] == 1 assert result.data["submitted_allele_count"] == 1 - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" - assert ann.annotation_metadata["registration_source"] == "reconfirmed" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" + assert event.reason == "reconfirmed" async def test_submit_score_set_mappings_to_car_force_reregister_caid_conflict( self, @@ -909,14 +907,15 @@ async def test_submit_score_set_mappings_to_car_force_reregister_caid_conflict( session.refresh(allele) assert allele.clingen_allele_id == "CA_STORED" - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "failed" - assert ann.annotation_metadata["clingen_allele_id"] == "CA_STORED" - assert ann.annotation_metadata["conflicting_caid"] == "CA_DIFFERENT" + assert len(events) == 1 + for event in events: + assert event.disposition == "failed" + assert event.reason == "caid_conflict" + assert event.event_metadata["clingen_allele_id"] == "CA_STORED" + assert event.event_metadata["conflicting_caid"] == "CA_DIFFERENT" @pytest.mark.integration @@ -979,12 +978,12 @@ async def test_submit_score_set_mappings_to_car_independent_ctx( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - 
select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1046,12 +1045,12 @@ async def test_submit_score_set_mappings_to_car_pipeline_ctx( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run_in_pipeline) @@ -1090,8 +1089,8 @@ async def test_submit_score_set_mappings_to_car_submission_disabled( assert len(alleles) == 0 # Verify no annotation statuses were created - annotation_statuses = session.scalars(select(VariantAnnotationStatus)).all() - assert len(annotation_statuses) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1129,8 +1128,8 @@ async def test_submit_score_set_mappings_to_car_no_submission_endpoint( assert len(alleles) == 0 # Verify no annotation statuses were created - annotation_statuses = 
session.scalars(select(VariantAnnotationStatus)).all() - assert len(annotation_statuses) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1160,8 +1159,8 @@ async def test_submit_score_set_mappings_to_car_no_mappings( assert len(alleles) == 0 # Verify no annotation statuses were created - annotation_statuses = session.scalars(select(VariantAnnotationStatus)).all() - assert len(annotation_statuses) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1214,10 +1213,10 @@ async def test_submit_score_set_mappings_to_car_no_registered_alleles( assert len(alleles) == 0 # Verify annotation statuses were rendered as failed — 4 variants, all failed - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 + assert len(events) == 1 # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1274,10 +1273,10 @@ async def test_submit_score_set_mappings_to_car_no_linked_alleles( assert len(alleles) == 0 # Verify annotation statuses were rendered as failed — 4 variants, all failed - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 + assert len(events) == 1 # Verify the job status is updated in the database 
session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1344,13 +1343,13 @@ async def test_submit_score_set_mappings_to_car_partial_failure( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # All 4 variant annotations succeeded - success_annotations = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == "clingen_allele_id", - VariantAnnotationStatus.status == "success", + present_events = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.annotation_type == "clingen_allele_id", + AnnotationEvent.disposition == "present", ) ).all() - assert len(success_annotations) == 4 + assert len(present_events) == 1 # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_car_sample_job_run) @@ -1409,18 +1408,19 @@ async def test_submit_score_set_mappings_to_car_car_error_details_stored_in_anno standalone_worker_context, submit_score_set_mappings_to_car_sample_job_run.id ) - # All 4 variant annotations should have EXTERNAL_SERVICE_REJECTED since the 1 shared allele was rejected - car_rejected_annotations = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.annotation_type == "clingen_allele_id", - VariantAnnotationStatus.failure_category == "external_service_rejected", + # The 1 shared allele was rejected by ClinGen → one failed event (reason service_rejected). + rejected_events = session.scalars( + select(AnnotationEvent).where( + AnnotationEvent.annotation_type == "clingen_allele_id", + AnnotationEvent.reason == "service_rejected", ) ).all() - assert len(car_rejected_annotations) == 4 - for rejected in car_rejected_annotations: - assert rejected.annotation_metadata["submitted_hgvs"] == allele_hgvs - assert rejected.annotation_metadata["car_error_type"] == "InvalidHGVS" - assert rejected.annotation_metadata["car_error_message"] == "The HGVS string is invalid." 
+ assert len(rejected_events) == 1 + for event in rejected_events: + assert event.disposition == "failed" + assert event.event_metadata["submitted_hgvs"] == allele_hgvs + assert event.event_metadata["car_error_type"] == "InvalidHGVS" + assert event.event_metadata["car_error_message"] == "The HGVS string is invalid." async def test_submit_score_set_mappings_to_car_propagates_exception_to_decorator( self, @@ -1536,12 +1536,12 @@ async def test_submit_score_set_mappings_to_car_with_arq_context_independent( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" async def test_submit_score_set_mappings_to_car_with_arq_context_pipeline( self, @@ -1608,12 +1608,12 @@ async def test_submit_score_set_mappings_to_car_with_arq_context_pipeline( assert alleles_with_caid[0].clingen_allele_id == f"CA{alleles[0].id}" # 4 per-variant annotations — all success - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 4 - for ann in annotation_statuses: - assert ann.status == "success" + assert len(events) == 1 + for event in events: + assert event.disposition == "present" async def test_submit_score_set_mappings_to_car_with_arq_context_exception_handling_independent( self, @@ -1668,10 +1668,10 @@ async def 
test_submit_score_set_mappings_to_car_with_arq_context_exception_handl assert len(alleles) == 0 # Verify no annotation statuses were created - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 0 + assert len(events) == 0 async def test_submit_score_set_mappings_to_car_with_arq_context_exception_handling_pipeline( self, @@ -1731,10 +1731,10 @@ async def test_submit_score_set_mappings_to_car_with_arq_context_exception_handl assert len(alleles) == 0 # Verify no annotation statuses were created - annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "clingen_allele_id") + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "clingen_allele_id") ).all() - assert len(annotation_statuses) == 0 + assert len(events) == 0 @pytest.mark.unit @@ -2084,11 +2084,11 @@ async def dummy_ldh_submission(*args, **kwargs): # Verify annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "success" + assert ann.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run) @@ -2155,11 +2155,11 @@ async def dummy_ldh_submission(*args, **kwargs): # Verify annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + 
select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "success" + assert ann.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run_in_pipeline) @@ -2262,11 +2262,11 @@ async def dummy_no_linked_alleles_submission(*args, **kwargs): # Verify annotation statuses were created with failures annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "failed" + assert ann.disposition == "failed" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run) @@ -2310,7 +2310,7 @@ async def test_submit_score_set_mappings_to_ldh_hgvs_not_found( # Verify no annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 0 @@ -2365,11 +2365,11 @@ async def dummy_submission_failure(*args, **kwargs): # Verify annotation statuses were created with failures annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "failed" + assert ann.disposition == "failed" # Verify the job status is updated in the database # TODO:XXX: Change status to 'failed' once decorator 
supports it @@ -2435,15 +2435,15 @@ async def dummy_partial_submission(*args, **kwargs): # Verify annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 success_count = 0 failure_count = 0 for ann in annotation_statuses: - if ann.status == "success": + if ann.disposition == "present": success_count += 1 - elif ann.status == "failed": + elif ann.disposition == "failed": failure_count += 1 assert success_count == 1 @@ -2513,11 +2513,11 @@ async def dummy_ldh_submission(*args, **kwargs): # Verify annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "success" + assert ann.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run) @@ -2590,11 +2590,11 @@ async def dummy_ldh_submission(*args, **kwargs): # Verify annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "success" + assert ann.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run) @@ -2662,11 +2662,11 @@ async def dummy_ldh_submission(*args, **kwargs): # Verify annotation statuses were created 
annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 4 for ann in annotation_statuses: - assert ann.status == "success" + assert ann.disposition == "present" # Verify the job status is updated in the database session.refresh(submit_score_set_mappings_to_ldh_sample_job_run_in_pipeline) @@ -2721,7 +2721,7 @@ async def test_submit_score_set_mappings_to_ldh_with_arq_context_exception_handl mock_send_slack_job_error.assert_called_once() # Verify no annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 0 @@ -2776,7 +2776,7 @@ async def test_submit_score_set_mappings_to_ldh_with_arq_context_exception_handl mock_send_slack_job_error.assert_called_once() # Verify no annotation statuses were created annotation_statuses = session.scalars( - select(VariantAnnotationStatus).where(VariantAnnotationStatus.annotation_type == "ldh_submission") + select(AnnotationEvent).where(AnnotationEvent.annotation_type == "ldh_submission") ).all() assert len(annotation_statuses) == 0 diff --git a/tests/worker/jobs/external_services/test_clinvar.py b/tests/worker/jobs/external_services/test_clinvar.py index 1ac616ff..24cfab47 100644 --- a/tests/worker/jobs/external_services/test_clinvar.py +++ b/tests/worker/jobs/external_services/test_clinvar.py @@ -3,19 +3,20 @@ import pytest import requests -from mavedb.models.clinical_control import ClinicalControl -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationStatus, FailureCategory, JobStatus, PipelineStatus -from 
mavedb.models.variant_annotation_status import VariantAnnotationStatus - pytest.importorskip("arq") from unittest.mock import patch +from sqlalchemy import select + from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.score_set import ScoreSet -from mavedb.models.variant import Variant +from mavedb.models.clinical_control import ClinvarControl +from mavedb.models.clinvar_allele_link import ClinvarAlleleLink +from mavedb.models.enums.annotation_type import AnnotationType +from mavedb.models.enums.disposition import Disposition +from mavedb.models.enums.event_reason import EventReason +from mavedb.models.enums.job_pipeline import FailureCategory, JobStatus +from mavedb.models.annotation_event import AnnotationEvent from mavedb.worker.jobs.external_services.clinvar import refresh_clinvar_controls from mavedb.worker.lib.managers.job_manager import JobManager @@ -26,6 +27,7 @@ "GeneSymbol": "TEST", "ClinicalSignificance": "benign", "ReviewStatus": "reviewed by expert panel", + "VariationID": "987654", }, } @@ -33,32 +35,29 @@ @pytest.mark.unit @pytest.mark.asyncio class TestRefreshClinvarControlsUnit: - """Tests for the refresh_clinvar_controls job function.""" + """Unit tests for the allele-model refresh_clinvar_controls job.""" @pytest.fixture(autouse=True) def _mock_clinvar_versions(self): - """Mock generate_clinvar_versions to return a single version for testing.""" + """Pin _generate_clinvar_versions to a single version so clinvar_version == '01_2026'.""" with patch( - "mavedb.worker.jobs.external_services.clinvar.generate_clinvar_versions", + "mavedb.worker.jobs.external_services.clinvar._generate_clinvar_versions", return_value=[(2026, 1)], ): yield - async def test_refresh_clinvar_controls_skips_version_on_fetch_failure( + async def test_no_alleles_with_caids( self, mock_worker_ctx, session, + with_populated_domain_data, with_refresh_clinvar_controls_job, 
sample_refresh_clinvar_controls_job_run, ): - """Test that a fetch failure for a version is logged and skipped, not propagated.""" - - async def awaitable_exception(*args, **kwargs): - raise Exception("Network error") - + """No current alleles carry a CAID -> the job succeeds with nothing to do.""" with patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - side_effect=awaitable_exception, + return_value=MOCK_CLINVAR_DATA, ): result = await refresh_clinvar_controls( mock_worker_ctx, @@ -68,61 +67,26 @@ async def awaitable_exception(*args, **kwargs): assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["versions_completed"] == 0 + assert session.scalars(select(ClinvarControl)).all() == [] + assert session.scalars(select(ClinvarAlleleLink)).all() == [] + assert session.scalars(select(AnnotationEvent)).all() == [] - async def test_refresh_clinvar_controls_no_mapped_variants( + async def test_fetch_failure_skips_version( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, + setup_sample_alleles_with_caid, ): - """Test that the job completes successfully when there are no mapped variants.""" - - with patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value={}, - ): - result = await refresh_clinvar_controls( - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run.id, - JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED + """A version whose TSV fetch fails is logged and skipped, not propagated.""" - async def test_refresh_clinvar_controls_no_variants_have_caids( - self, - mock_worker_ctx, - session, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - ): - """Test that the job completes successfully when no variants 
have CAIDs.""" - # Add a variant without a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:test-variant-no-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.2G>A", - hgvs_pro="NP_000000.1:p.Val2Ile", - data={"hgvs_c": "NM_000000.1:c.2G>A", "hgvs_p": "NP_000000.1:p.Val2Ile"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() + async def boom(*args, **kwargs): + raise Exception("Network error") with patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, + side_effect=boom, ): result = await refresh_clinvar_controls( mock_worker_ctx, @@ -130,29 +94,21 @@ async def test_refresh_clinvar_controls_no_variants_have_caids( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED + assert result.data["versions_completed"] == 0 + assert session.scalars(select(ClinvarAlleleLink)).all() == [] - # Verify an annotation status was created for the variant without a CAID - variant_no_caid = ( - session.query(VariantAnnotationStatus).filter(VariantAnnotationStatus.variant_id == variant.id).one() - ) - assert variant_no_caid.status == AnnotationStatus.SKIPPED - assert variant_no_caid.annotation_type == AnnotationType.CLINVAR_CONTROL - assert variant_no_caid.error_message == "Mapped variant does not have an associated ClinGen allele ID." 
- - async def test_refresh_clinvar_controls_variants_are_multivariants( + async def test_multi_variant_caid_skipped( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job completes successfully when all variants are multi-variant CAIDs.""" - # Update the mapped variant to have a multi-variant CAID - mapped_variant = session.query(MappedVariant).first() - mapped_variant.clingen_allele_id = "CA-MULTI-001,CA-MULTI-002" + """An allele whose CAID is a multi-variant identifier is skipped (ClinVar can't key on it).""" + _, allele = setup_sample_alleles_with_caid + allele.clingen_allele_id = "CA-MULTI-001,CA-MULTI-002" session.commit() with patch( @@ -165,37 +121,32 @@ async def test_refresh_clinvar_controls_variants_are_multivariants( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED + assert session.scalars(select(ClinvarAlleleLink)).all() == [] - # Verify an annotation status was created for the multi-variant CAID - variant_with_multicid = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_with_multicid.status == AnnotationStatus.SKIPPED - assert variant_with_multicid.annotation_type == AnnotationType.CLINVAR_CONTROL - assert ( - variant_with_multicid.error_message - == "Multi-variant ClinGen allele IDs cannot be associated with ClinVar data." - ) + # Allele-keyed event: a cis-block CAID structurally can't key ClinVar — a pipeline gap, not a + # statement about the source. 
+ event = session.scalars(select(AnnotationEvent)).one() + assert event.annotation_type == AnnotationType.CLINVAR_CONTROL + assert event.disposition == Disposition.NOT_APPLICABLE + assert event.reason == EventReason.MULTI_VARIANT_CAID + assert event.allele_id == allele.id and event.variant_id is None - async def test_refresh_clinvar_controls_clingen_api_failure( + async def test_no_associated_clinvar_allele_id_skipped( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job handles ClinGen API failures gracefully.""" + """ClinGen returns no ClinVar allele id for the CAID -> absent/no_record event, no link.""" + _, allele = setup_sample_alleles_with_caid - # Mock the get_associated_clinvar_allele_id function to raise an exception with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=requests.exceptions.RequestException("ClinGen API error"), + return_value=None, ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", @@ -208,40 +159,33 @@ async def test_refresh_clinvar_controls_clingen_api_failure( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.FAILED - assert result.failure_category == FailureCategory.DEPENDENCY_FAILURE - - # Verify an annotation status was created for the variant due to ClinGen API failure - mapped_variant = session.query(MappedVariant).first() - variant_with_api_failure = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_with_api_failure.status == AnnotationStatus.FAILED - assert variant_with_api_failure.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "Failed to retrieve ClinVar allele 
ID from ClinGen API" in variant_with_api_failure.error_message - - async def test_refresh_clinvar_controls_no_associated_clinvar_allele_id( + assert result.status == JobStatus.SUCCEEDED + assert session.scalars(select(ClinvarAlleleLink)).all() == [] + # ClinGen had no ClinVar AlleleID for this CAID — an informative negative about the source. + event = session.scalars(select(AnnotationEvent)).one() + assert event.disposition == Disposition.ABSENT + assert event.reason == EventReason.NO_RECORD + assert event.allele_id == allele.id and event.variant_id is None + + async def test_clinvar_data_not_found_skipped( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job handles no associated ClinVar Allele ID gracefully.""" + """The resolved ClinVar allele id is absent from the version's TSV -> absent/no_record, no link.""" + _, allele = setup_sample_alleles_with_caid - # Mock the get_associated_clinvar_allele_id function to return None with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value=None, + return_value="VCV000000123", ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, + return_value={"VCV000000999": {"GeneSymbol": "X", "ClinicalSignificance": "y", "ReviewStatus": "z"}}, ), ): result = await refresh_clinvar_controls( @@ -250,48 +194,34 @@ async def test_refresh_clinvar_controls_no_associated_clinvar_allele_id( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant due to no associated ClinVar Allele ID - mapped_variant = session.query(MappedVariant).first() - variant_no_clinvar_allele = ( - 
session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_no_clinvar_allele.status == AnnotationStatus.SKIPPED - assert variant_no_clinvar_allele.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "No ClinVar allele ID found for ClinGen allele ID" in variant_no_clinvar_allele.error_message - - async def test_refresh_clinvar_controls_no_clinvar_data_found( + assert session.scalars(select(ClinvarAlleleLink)).all() == [] + # The CAID resolved to a ClinVar AlleleID, but it's absent from this release's snapshot — a + # genuine, version-scoped negative. + event = session.scalars(select(AnnotationEvent)).one() + assert event.disposition == Disposition.ABSENT + assert event.reason == EventReason.NO_RECORD + assert event.allele_id == allele.id and event.variant_id is None + + async def test_clingen_api_failure_fails_when_total( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job handles no ClinVar data found for the associated ClinVar Allele ID.""" - - # TSV data with a different allele ID than the one being looked up - non_matching_clinvar_data = { - "VCV000000001": { - "GeneSymbol": "TEST", - "ClinicalSignificance": "benign", - "ReviewStatus": "reviewed by expert panel", - }, - } + """A ClinGen API error with no successful links returns FAILED/DEPENDENCY_FAILURE.""" + _, allele = setup_sample_alleles_with_caid - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", + side_effect=requests.exceptions.RequestException("ClinGen API error"), ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=non_matching_clinvar_data, + 
return_value=MOCK_CLINVAR_DATA, ), ): result = await refresh_clinvar_controls( @@ -300,31 +230,24 @@ async def test_refresh_clinvar_controls_no_clinvar_data_found( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant due to no ClinVar data found - mapped_variant = session.query(MappedVariant).first() - variant_no_clinvar_data = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_no_clinvar_data.status == AnnotationStatus.SKIPPED - assert variant_no_clinvar_data.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "No ClinVar data found for ClinVar allele ID" in variant_no_clinvar_data.error_message + assert result.status == JobStatus.FAILED + assert result.failure_category == FailureCategory.DEPENDENCY_FAILURE + event = session.scalars(select(AnnotationEvent)).one() + assert event.disposition == Disposition.FAILED + assert event.reason == EventReason.API_ERROR + assert event.allele_id == allele.id and event.variant_id is None - async def test_refresh_clinvar_controls_successful_annotation_existing_control( + async def test_successful_link_new_control( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job successfully annotates a variant with ClinVar control data.""" + """A resolved + present CAID creates a ClinvarControl, a live link, and a present/created event.""" + _, allele = setup_sample_alleles_with_caid - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", @@ -341,54 +264,53 @@ async def 
test_refresh_clinvar_controls_successful_annotation_existing_control( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant with successful annotation - mapped_variant = session.query(MappedVariant).first() - annotated_variant = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert annotated_variant.status == AnnotationStatus.SUCCESS - assert annotated_variant.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant.error_message is None - - async def test_refresh_clinvar_controls_successful_annotation_new_control( + assert result.data["created_link_count"] == 1 + + control = session.scalars(select(ClinvarControl)).one() + assert control.db_identifier == "VCV000000123" + assert control.db_version == "01_2026" + assert control.clinical_significance == "benign" + # AlleleID stays in db_identifier; the canonical VariationID is captured alongside it. 
+ assert control.clinvar_variation_id == "987654" + + links = session.scalars( + select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == allele.id, ClinvarAlleleLink.current) + ).all() + assert len(links) == 1 + assert links[0].clinvar_control_id == control.id + + event = session.scalars(select(AnnotationEvent)).one() + assert event.disposition == Disposition.PRESENT + assert event.reason == EventReason.CREATED + assert event.annotation_type == AnnotationType.CLINVAR_CONTROL + assert event.allele_id == allele.id and event.variant_id is None + + async def test_links_and_annotates_full_allele_set( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, + setup_rt_derived_allele_with_caid, ): - """Test that the job successfully annotates a variant with ClinVar control data when no prior status exists.""" - # Add a variant and mapped variant to the database with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:test-variant-with-caid-2", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.3C>T", - hgvs_pro="NP_000000.1:p.Ala3Val", - data={"hgvs_c": "NM_000000.1:c.3C>T", "hgvs_p": "NP_000000.1:p.Ala3Val"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA124", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() + """ClinVar linkage covers the full allele set, not just authoritative links: the RT-derived + allele carries the matching CAID and is linked. Events are now allele-keyed, so the RT-derived + allele's present status is recorded directly — the limitation the per-variant bandaid had + (dropping the RT-derived allele's status) is lifted. 
Each allele gets its own event: present for + the linked RT allele, absent for the unmatched authoritative one. + """ + _, authoritative_allele, rt_allele = setup_rt_derived_allele_with_caid + rt_caid = rt_allele.clingen_allele_id + + async def resolve(caid): + # Only the RT-derived allele's CAID resolves to a ClinVar id present in the TSV. + return "VCV000000123" if caid == rt_caid else None - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", + side_effect=resolve, ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", @@ -401,1105 +323,293 @@ async def test_refresh_clinvar_controls_successful_annotation_new_control( JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify an annotation status was created for the variant with successful annotation - annotated_variant = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert annotated_variant.status == AnnotationStatus.SUCCESS - assert annotated_variant.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant.error_message is None - - async def test_refresh_clinvar_controls_idempotent_run( + # The RT-derived allele IS linked — the core fix. + rt_links = session.scalars( + select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == rt_allele.id, ClinvarAlleleLink.current) + ).all() + assert len(rt_links) == 1 + # The authoritative allele's CAID had no ClinVar match, so it is not linked. 
+ assert ( + session.scalars( + select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == authoritative_allele.id) + ).all() + == [] + ) + + # One allele-keyed event per allele (never per-variant): the linked RT allele is present, the + # unmatched authoritative allele is absent. The variant resolves its derived allele's status + # through the live links — no variant_id on the events. + events = session.scalars(select(AnnotationEvent)).all() + assert all(e.annotation_type == AnnotationType.CLINVAR_CONTROL for e in events) + assert all(e.variant_id is None for e in events) + by_allele = {e.allele_id: e for e in events} + assert by_allele[rt_allele.id].disposition == Disposition.PRESENT + assert by_allele[rt_allele.id].reason == EventReason.CREATED + assert by_allele[authoritative_allele.id].disposition == Disposition.ABSENT + assert by_allele[authoritative_allele.id].reason == EventReason.NO_RECORD + + async def test_idempotent_rerun_skips_and_does_not_duplicate( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that running the job multiple times does not create duplicate annotation statuses.""" + """A second run finds the allele already linked at the version, skips the resolution, reports + preexisting, and creates neither a duplicate control nor a duplicate link.""" + _, allele = setup_sample_alleles_with_caid - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", return_value="VCV000000123", - ), + ) as resolve_spy, patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - side_effect=[MOCK_CLINVAR_DATA, MOCK_CLINVAR_DATA], + return_value=MOCK_CLINVAR_DATA, ), ): - # First run - result1 = await refresh_clinvar_controls( + first = await refresh_clinvar_controls( 
mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id, JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - session.commit() + assert resolve_spy.await_count == 1 - # Second run - result2 = await refresh_clinvar_controls( + second = await refresh_clinvar_controls( mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id, JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - - assert isinstance(result1, JobExecutionOutcome) - assert result1.status == JobStatus.SUCCEEDED - assert isinstance(result2, JobExecutionOutcome) - assert result2.status == JobStatus.SUCCEEDED - - # Verify only one clinical control annotation exists for the variant - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 1 - - # Verify two annotated variants exist but both reflect the same successful annotation, and only - # one is current - annotated_variants = session.query(VariantAnnotationStatus).all() - assert len(annotated_variants) == 2 - statuses = [av.status for av in annotated_variants] - assert statuses.count(AnnotationStatus.SUCCESS) == 2 - current_statuses = [av for av in annotated_variants if av.current] - assert len(current_statuses) == 1 - - async def test_refresh_clinvar_controls_partial_failure( + # Second run skipped the resolution entirely (version-keyed skip). + assert resolve_spy.await_count == 1 + + assert first.data["created_link_count"] == 1 + assert second.data["preexisting_link_count"] == 1 + assert second.data["created_link_count"] == 0 + + # One control, one live link — no churn. + assert len(session.scalars(select(ClinvarControl)).all()) == 1 + links = session.scalars(select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == allele.id)).all() + assert len(links) == 1 + assert links[0].valid_to is None + + # Per-version events: two allele-keyed events (one per run), no current flag. 
The first run + # created the link; the second found it preexisting. Latest event (by id) wins. + events = session.scalars( + select(AnnotationEvent).where(AnnotationEvent.allele_id == allele.id).order_by(AnnotationEvent.id) + ).all() + assert len(events) == 2 + assert events[0].reason == EventReason.CREATED + assert events[1].reason == EventReason.PREEXISTING + assert all(e.disposition == Disposition.PRESENT for e in events) + + async def test_release_reresolution_supersedes_newest_wins( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the job handles partial failures gracefully.""" - - variant1, mapped_variant1 = setup_sample_variants_with_caid - - # Add an additional mapped variant to the database with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant2 = Variant( - urn="urn:variant:test-variant-with-caid-2", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.4G>C", - hgvs_pro="NP_000000.1:p.Gly4Ala", - data={"hgvs_c": "NM_000000.1:c.4G>C", "hgvs_p": "NP_000000.1:p.Gly4Ala"}, - ) - session.add(variant2) - session.commit() - mapped_variant2 = MappedVariant( - variant_id=variant2.id, - clingen_allele_id="CA125", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant2) + """Defensive guard: if an allele re-resolves to a *different* control within the same release + (should never happen — archival data is immutable), the old live link is superseded + newest-wins, leaving exactly one live link per (allele, version).""" + variant, allele = setup_sample_alleles_with_caid + sample_refresh_clinvar_controls_job_run.job_params = { + **sample_refresh_clinvar_controls_job_run.job_params, + "force": True, + } session.commit() - # Mock the get_associated_clinvar_allele_id function to raise an 
exception for the first call - def side_effect_get_associated_clinvar_allele_id(clingen_allele_id): - if clingen_allele_id == "CA125": - raise requests.exceptions.RequestException("ClinGen API error") - return "VCV000000123" + tsv = { + "VCV000000123": { + "GeneSymbol": "A", + "ClinicalSignificance": "benign", + "ReviewStatus": "ok", + "VariationID": "111", + }, + "VCV000000999": { + "GeneSymbol": "B", + "ClinicalSignificance": "pathogenic", + "ReviewStatus": "ok", + "VariationID": "222", + }, + } with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=side_effect_get_associated_clinvar_allele_id, + side_effect=["VCV000000123", "VCV000000999"], ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, + return_value=tsv, ), ): - result = await refresh_clinvar_controls( + await refresh_clinvar_controls( mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id, JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify annotation statuses for both variants - variant_with_api_failure = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant2.variant_id) - .one() - ) - assert variant_with_api_failure.status == AnnotationStatus.FAILED - assert variant_with_api_failure.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "Failed to retrieve ClinVar allele ID from ClinGen API" in variant_with_api_failure.error_message - - annotated_variant2 = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant1.variant_id) - .one() - ) - assert annotated_variant2.status == AnnotationStatus.SUCCESS - assert annotated_variant2.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant2.error_message is None - 
- async def test_total_api_failure_returns_failed( - self, - mock_worker_ctx, - session, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Test that the job returns FAILED when all ClinVar lookups fail.""" - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=requests.exceptions.RequestException("ClinGen API error"), - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls( + session.commit() + await refresh_clinvar_controls( mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id, JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert result.status == JobStatus.FAILED - assert result.failure_category == FailureCategory.DEPENDENCY_FAILURE - - async def test_upsert_does_not_create_duplicate_control_when_row_already_exists( + control_a = session.scalars(select(ClinvarControl).where(ClinvarControl.db_identifier == "VCV000000123")).one() + control_b = session.scalars(select(ClinvarControl).where(ClinvarControl.db_identifier == "VCV000000999")).one() + + all_links = session.scalars(select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == allele.id)).all() + live_links = [link for link in all_links if link.valid_to is None] + + # Exactly one live link per (allele, version) — the new control — and the old one is retired. + assert len(live_links) == 1 + assert live_links[0].clinvar_control_id == control_b.id + retired = [link for link in all_links if link.clinvar_control_id == control_a.id] + assert len(retired) == 1 + assert retired[0].valid_to is not None + # Gap-free handoff: the retired link's valid_to equals the successor's valid_from. 
+ assert retired[0].valid_to == live_links[0].valid_from + # Stamped from the DB clock (func.now()), so the boundary is timezone-aware and comparable to + # every other func.now()-stamped valid-time row — a regression to a naive datetime.now() would + # land here as a tz-naive value. + assert live_links[0].valid_from.tzinfo is not None + assert retired[0].valid_to.tzinfo is not None + + async def test_force_reresolves_without_duplicating_link( self, mock_worker_ctx, session, with_refresh_clinvar_controls_job, sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): - """Test that the upsert handles a pre-existing ClinicalControl row without creating a duplicate. - - This covers the concurrent-job race condition where two refresh_clinvar_controls - jobs run simultaneously for different score sets and both try to insert a - ClinicalControl for the same (db_name, db_identifier, db_version). The second - job's upsert must hit ON CONFLICT DO UPDATE rather than inserting a second row. - """ - # Simulate the state left by a concurrent job: the ClinicalControl row - # for this identifier/version already exists in the DB with stale data. 
- pre_existing_control = ClinicalControl( - db_name="ClinVar", - db_identifier="VCV000000123", - db_version="01_2026", - gene_symbol="OLD_SYMBOL", - clinical_significance="likely pathogenic", - clinical_review_status="criteria provided, single submitter", - ) - session.add(pre_existing_control) + """force bypasses the version-keyed skip and re-resolves, but the get-or-create link write + does not duplicate an existing live link.""" + variant, allele = setup_sample_alleles_with_caid + sample_refresh_clinvar_controls_job_run.job_params = { + **sample_refresh_clinvar_controls_job_run.job_params, + "force": True, + } session.commit() with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", return_value="VCV000000123", - ), + ) as resolve_spy, patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", return_value=MOCK_CLINVAR_DATA, ), ): - result = await refresh_clinvar_controls( + await refresh_clinvar_controls( + mock_worker_ctx, + sample_refresh_clinvar_controls_job_run.id, + JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), + ) + session.commit() + second = await refresh_clinvar_controls( mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id, JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), ) - assert result.status == JobStatus.SUCCEEDED - - # Only one row should exist — the upsert must not have inserted a second one. - controls = session.query(ClinicalControl).filter_by(db_identifier="VCV000000123", db_version="01_2026").all() - assert len(controls) == 1 - - # The upsert should have updated the stale data from the concurrent job. - session.refresh(controls[0]) - assert controls[0].gene_symbol == "TEST" - assert controls[0].clinical_significance == "benign" - assert controls[0].clinical_review_status == "reviewed by expert panel" + # force re-resolved on the second run despite an existing link. 
+ assert resolve_spy.await_count == 2 + assert second.data["preexisting_link_count"] == 1 + assert second.data["created_link_count"] == 0 + links = session.scalars(select(ClinvarAlleleLink).where(ClinvarAlleleLink.allele_id == allele.id)).all() + assert len(links) == 1 + assert links[0].valid_to is None @pytest.mark.integration @pytest.mark.asyncio -class TestRefreshClinvarControlsIntegration: - """Integration tests for the refresh_clinvar_controls job function.""" +class TestRefreshClinvarControlsArqContext: + """End-to-end tests for refresh_clinvar_controls within an ARQ worker context.""" @pytest.fixture(autouse=True) def _mock_clinvar_versions(self): - """Mock generate_clinvar_versions to return a single version for testing.""" with patch( - "mavedb.worker.jobs.external_services.clinvar.generate_clinvar_versions", + "mavedb.worker.jobs.external_services.clinvar._generate_clinvar_versions", return_value=[(2026, 1)], ): yield - async def test_refresh_clinvar_controls_no_mapped_variants( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - ): - """Integration test: job completes successfully when there are no mapped variants.""" - - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify no controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert 
sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_no_variants_with_caid( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - ): - """Integration test: job completes successfully when no variants have CAIDs.""" - # Add a variant without a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-no-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.5T>A", - hgvs_pro="NP_000000.1:p.Leu5Gln", - data={"hgvs_c": "NM_000000.1:c.5T>A", "hgvs_p": "NP_000000.1:p.Leu5Gln"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant without a CAID - variant_no_caid = ( - session.query(VariantAnnotationStatus).filter(VariantAnnotationStatus.variant_id == variant.id).one() - ) - assert variant_no_caid.status == AnnotationStatus.SKIPPED - assert variant_no_caid.annotation_type == AnnotationType.CLINVAR_CONTROL - assert variant_no_caid.error_message == "Mapped variant does not have an associated ClinGen allele ID." 
- - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controlsvariants_are_multivariants( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - ): - """Integration test: job completes successfully when all variants are multi-variant CAIDs.""" - # Add a variant with a multi-variant CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-multicid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.6A>G", - hgvs_pro="NP_000000.1:p.Thr6Ala", - data={"hgvs_c": "NM_000000.1:c.6A>G", "hgvs_p": "NP_000000.1:p.Thr6Ala"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA-MULTI-003,CA-MULTI-004", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the multi-variant CAID - variant_with_multicid = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_with_multicid.status == AnnotationStatus.SKIPPED - assert 
variant_with_multicid.annotation_type == AnnotationType.CLINVAR_CONTROL - assert ( - variant_with_multicid.error_message - == "Multi-variant ClinGen allele IDs cannot be associated with ClinVar data." - ) - - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_no_associated_clinvar_allele_id( + async def test_arq_context_successful_link( self, + arq_redis, + arq_worker, session, with_populated_domain_data, with_refresh_clinvar_controls_job, - mock_worker_ctx, sample_refresh_clinvar_controls_job_run, + setup_sample_alleles_with_caid, ): - """Integration test: job handles no associated ClinVar Allele ID gracefully.""" - # Add a variant with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-with-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.7C>A", - hgvs_pro="NP_000000.1:p.Ser7Tyr", - data={"hgvs_c": "NM_000000.1:c.7C>A", "hgvs_p": "NP_000000.1:p.Ser7Tyr"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA126", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to return None + """The job links an allele and records a present event under an ARQ worker.""" with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value=None, + return_value="VCV000000123", ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", 
return_value=MOCK_CLINVAR_DATA, ), ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED + await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) + await arq_worker.async_run() + await arq_worker.run_check() - # Verify an annotation status was created for the variant due to no associated ClinVar Allele ID - variant_no_clinvar_allele = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_no_clinvar_allele.status == AnnotationStatus.SKIPPED - assert variant_no_clinvar_allele.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "No ClinVar allele ID found for ClinGen allele ID" in variant_no_clinvar_allele.error_message + assert len(session.scalars(select(ClinvarControl)).all()) >= 1 + assert len(session.scalars(select(ClinvarAlleleLink).where(ClinvarAlleleLink.current)).all()) == 1 - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == Disposition.PRESENT + assert events[0].annotation_type == AnnotationType.CLINVAR_CONTROL + assert events[0].allele_id is not None and events[0].variant_id is None - # Verify job run status is marked as completed session.refresh(sample_refresh_clinvar_controls_job_run) assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - async def test_refresh_clinvar_controls_no_clinvar_data( + async def test_arq_context_exception_handling( self, + arq_redis, + arq_worker, session, with_populated_domain_data, with_refresh_clinvar_controls_job, - mock_worker_ctx, sample_refresh_clinvar_controls_job_run, + setup_sample_alleles_with_caid, ): 
- """Integration test: job handles no ClinVar data found for the associated ClinVar Allele ID.""" - # Add a variant with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-with-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.8G>T", - hgvs_pro="NP_000000.1:p.Val8Phe", - data={"hgvs_c": "NM_000000.1:c.8G>T", "hgvs_p": "NP_000000.1:p.Val8Phe"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA127", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID + """An unexpected error during resolution is caught by the decorators and the job errors.""" with ( patch( "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000001", + side_effect=ValueError("Unexpected error"), ), patch( "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", return_value=MOCK_CLINVAR_DATA, ), + patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_slack, ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) + await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) + await arq_worker.async_run() + await arq_worker.run_check() - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED + mock_slack.assert_called_once() + assert session.scalars(select(ClinvarAlleleLink)).all() == [] + assert session.scalars(select(AnnotationEvent)).all() == [] - # Verify an annotation status was created for the variant due to no ClinVar data found - variant_no_clinvar_data = ( - 
session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert variant_no_clinvar_data.status == AnnotationStatus.SKIPPED - assert variant_no_clinvar_data.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "No ClinVar data found for ClinVar allele ID" in variant_no_clinvar_data.error_message - - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_successful_annotation_existing_control( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - ): - """Integration test: job successfully annotates a variant with ClinVar control data.""" - # Add a variant with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-with-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.9A>C", - hgvs_pro="NP_000000.1:p.Lys9Thr", - data={"hgvs_c": "NM_000000.1:c.9A>C", "hgvs_p": "NP_000000.1:p.Lys9Thr"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA128", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - clinical_control = ClinicalControl( - db_name="ClinVar", - db_identifier="VCV000000123", - clinical_significance="likely pathogenic", - gene_symbol="TEST", - clinical_review_status="criteria provided, single submitter", - db_version="01_2026", - ) - session.add(clinical_control) - session.commit() - - 
mapped_variant.clinical_controls.append(clinical_control) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant with successful annotation - annotated_variant = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert annotated_variant.status == AnnotationStatus.SUCCESS - assert annotated_variant.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant.error_message is None - - # Verify the clinical control was updated - session.refresh(clinical_control) - assert clinical_control.clinical_significance == "benign" - assert clinical_control.clinical_review_status == "reviewed by expert panel" - assert mapped_variant in clinical_control.mapped_variants - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_successful_annotation_new_control( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - ): - """Integration test: job successfully annotates a variant with ClinVar control data when no prior status exists.""" - # Add a variant with a CAID - score_set = session.get(ScoreSet, 
sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-with-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.10C>G", - hgvs_pro="NP_000000.1:p.Pro10Arg", - data={"hgvs_c": "NM_000000.1:c.10C>G", "hgvs_p": "NP_000000.1:p.Pro10Arg"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA129", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant with successful annotation - annotated_variant = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert annotated_variant.status == AnnotationStatus.SUCCESS - assert annotated_variant.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant.error_message is None - - # Verify the clinical control was added - clinical_control = ( - session.query(ClinicalControl).filter(ClinicalControl.mapped_variants.contains(mapped_variant)).one() - ) - assert clinical_control.db_identifier == "VCV000000123" - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED 
- - async def test_refresh_clinvar_controls_successful_annotation_pipeline_context( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_pipeline, - sample_refresh_clinvar_controls_job_in_pipeline, - ): - """Integration test: job successfully annotates a variant with ClinVar control data in a pipeline context.""" - # Add a variant with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_in_pipeline.job_params["score_set_id"]) - variant = Variant( - urn="urn:variant:integration-test-variant-with-caid", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.12G>A", - hgvs_pro="NP_000000.1:p.Met12Ile", - data={"hgvs_c": "NM_000000.1:c.12G>A", "hgvs_p": "NP_000000.1:p.Met12Ile"}, - ) - session.add(variant) - session.commit() - mapped_variant = MappedVariant( - variant_id=variant.id, - clingen_allele_id="CA130", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_in_pipeline.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify an annotation status was created for the variant with successful annotation - annotated_variant = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant.variant_id) - .one() - ) - assert annotated_variant.status == AnnotationStatus.SUCCESS - assert annotated_variant.annotation_type == 
AnnotationType.CLINVAR_CONTROL - assert annotated_variant.error_message is None - - # Verify the clinical control was added - clinical_control = ( - session.query(ClinicalControl).filter(ClinicalControl.mapped_variants.contains(mapped_variant)).one() - ) - assert clinical_control.db_identifier == "VCV000000123" - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_in_pipeline) - assert sample_refresh_clinvar_controls_job_in_pipeline.status == JobStatus.SUCCEEDED - - # Verify the pipeline is marked as completed - session.refresh(sample_refresh_clinvar_controls_pipeline) - assert sample_refresh_clinvar_controls_pipeline.status == PipelineStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_idempotent_run( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: running the job multiple times does not create duplicate annotation statuses.""" - - # Mock the get_associated_clinvar_allele_id function to return a ClinVar Allele ID - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - side_effect=[MOCK_CLINVAR_DATA, MOCK_CLINVAR_DATA], - ), - ): - # First run - result1 = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - session.commit() - # reset the job run status to pending for the second run - sample_refresh_clinvar_controls_job_run.status = JobStatus.PENDING - session.commit() - - # Second run - result2 = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result1, JobExecutionOutcome) - assert result1.status == JobStatus.SUCCEEDED - assert isinstance(result2, JobExecutionOutcome) - 
assert result2.status == JobStatus.SUCCEEDED - - # Verify only one clinical control annotation exists for the variant - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 1 - - # Verify two annotated variants exist but both reflect the same successful annotation, and only - # one is current - annotated_variants = session.query(VariantAnnotationStatus).all() - assert len(annotated_variants) == 2 - statuses = [av.status for av in annotated_variants] - assert statuses.count(AnnotationStatus.SUCCESS) == 2 - current_statuses = [av for av in annotated_variants if av.current] - assert len(current_statuses) == 1 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_partial_failure( - self, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: job handles partial failures gracefully.""" - - variant1, mapped_variant1 = setup_sample_variants_with_caid - # Add an additional mapped variant to the database with a CAID - score_set = session.get(ScoreSet, sample_refresh_clinvar_controls_job_run.job_params["score_set_id"]) - variant2 = Variant( - urn="urn:variant:integration-test-variant-with-caid-2", - score_set_id=score_set.id, - hgvs_nt="NM_000000.1:c.11G>C", - hgvs_pro="NP_000000.1:p.Gly11Ala", - data={"hgvs_c": "NM_000000.1:c.11G>C", "hgvs_p": "NP_000000.1:p.Gly11Ala"}, - ) - session.add(variant2) - session.commit() - mapped_variant2 = MappedVariant( - variant_id=variant2.id, - clingen_allele_id="CA130", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant2) - session.commit() - - # Mock the get_associated_clinvar_allele_id function to raise an 
exception for the first call - def side_effect_get_associated_clinvar_allele_id(clingen_allele_id): - if clingen_allele_id == "CA130": - raise requests.exceptions.RequestException("ClinGen API error") - return "VCV000000123" - - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=side_effect_get_associated_clinvar_allele_id, - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls(mock_worker_ctx, sample_refresh_clinvar_controls_job_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify annotation statuses for both variants - variant_with_api_failure = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant2.variant_id) - .one() - ) - assert variant_with_api_failure.status == AnnotationStatus.FAILED - assert variant_with_api_failure.annotation_type == AnnotationType.CLINVAR_CONTROL - assert "Failed to retrieve ClinVar allele ID from ClinGen API" in variant_with_api_failure.error_message - - annotated_variant2 = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == mapped_variant1.variant_id) - .one() - ) - assert annotated_variant2.status == AnnotationStatus.SUCCESS - assert annotated_variant2.annotation_type == AnnotationType.CLINVAR_CONTROL - assert annotated_variant2.error_message is None - - # Verify a clinical control was added for the successfully annotated variant and not the unsuccessful one - clinical_control1 = ( - session.query(ClinicalControl).filter(ClinicalControl.mapped_variants.contains(mapped_variant1)).one() - ) - assert clinical_control1.db_identifier == "VCV000000123" - - clinical_control2 = ( - session.query(ClinicalControl).filter(ClinicalControl.mapped_variants.contains(mapped_variant2)).all() - ) - assert 
len(clinical_control2) == 0 - - # Verify job run status is marked as completed - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_propagates_exceptions_to_decorator( - self, - mock_worker_ctx, - session, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Test that unexpected exceptions are propagated.""" - - # Mock the get_associated_clinvar_allele_id function to raise an unexpected exception - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=ValueError("Unexpected error"), - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - result = await refresh_clinvar_controls( - mock_worker_ctx, - sample_refresh_clinvar_controls_job_run.id, - JobManager(session, mock_worker_ctx["redis"], sample_refresh_clinvar_controls_job_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.ERRORED - - # Verify no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as errored (unhandled exception caught by decorator) - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.ERRORED - - -@pytest.mark.asyncio -@pytest.mark.integration -class TestRefreshClinvarControlsArqContext: - """Tests for running the refresh_clinvar_controls job function within an ARQ worker context.""" - - @pytest.fixture(autouse=True) - def _mock_clinvar_versions(self): - """Mock 
generate_clinvar_versions to return a single version for testing.""" - with patch( - "mavedb.worker.jobs.external_services.clinvar.generate_clinvar_versions", - return_value=[(2026, 1)], - ): - yield - - async def test_refresh_clinvar_controls_with_arq_context_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: job completes successfully within an ARQ worker context.""" - - # Patch external service calls - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) - await arq_worker.async_run() - await arq_worker.run_check() - - # Verify that clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) > 0 - - # Verify annotation status was created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == AnnotationStatus.SUCCESS - assert annotation_statuses[0].annotation_type == AnnotationType.CLINVAR_CONTROL - - # Verify that the job completed successfully - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - async def test_refresh_clinvar_controls_with_arq_context_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: job completes successfully within an ARQ worker context in a 
pipeline context.""" - - # Patch external service calls - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - return_value="VCV000000123", - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) - await arq_worker.async_run() - await arq_worker.run_check() - - # Verify that clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) > 0 - - # Verify annotation status was created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == AnnotationStatus.SUCCESS - assert annotation_statuses[0].annotation_type == AnnotationType.CLINVAR_CONTROL - - # Verify that the job completed successfully - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.SUCCEEDED - - # Verify the pipeline is marked as completed - pass - - async def test_refresh_clinvar_controls_with_arq_context_exception_handling_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: job handles exceptions properly within an ARQ worker context.""" - # Patch external service calls to raise an exception - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=ValueError("Unexpected error"), - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) - await 
arq_worker.async_run() - await arq_worker.run_check() - - # Verify no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as errored (unhandled exception caught by decorator) - session.refresh(sample_refresh_clinvar_controls_job_run) - assert sample_refresh_clinvar_controls_job_run.status == JobStatus.ERRORED - - async def test_refresh_clinvar_controls_with_arq_context_exception_handling_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_refresh_clinvar_controls_job, - sample_refresh_clinvar_controls_job_run, - setup_sample_variants_with_caid, - ): - """Integration test: job handles exceptions properly within an ARQ worker context in a pipeline context.""" - # Patch external service calls to raise an exception - with ( - patch( - "mavedb.worker.jobs.external_services.clinvar.get_associated_clinvar_allele_id", - side_effect=ValueError("Unexpected error"), - ), - patch( - "mavedb.worker.jobs.external_services.clinvar.fetch_clinvar_variant_data", - return_value=MOCK_CLINVAR_DATA, - ), - ): - await arq_redis.enqueue_job("refresh_clinvar_controls", sample_refresh_clinvar_controls_job_run.id) - await arq_worker.async_run() - await arq_worker.run_check() - - # Verify no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify no clinical controls were added - clinical_controls = session.query(ClinicalControl).all() - assert len(clinical_controls) == 0 - - # Verify job run status is marked as errored (unhandled exception caught by decorator) session.refresh(sample_refresh_clinvar_controls_job_run) assert sample_refresh_clinvar_controls_job_run.status == JobStatus.ERRORED - 
- # Verify the pipeline is marked as failed - pass diff --git a/tests/worker/jobs/external_services/test_gnomad.py b/tests/worker/jobs/external_services/test_gnomad.py index fc8e211c..0b14fda2 100644 --- a/tests/worker/jobs/external_services/test_gnomad.py +++ b/tests/worker/jobs/external_services/test_gnomad.py @@ -4,13 +4,17 @@ pytest.importorskip("arq") -from unittest.mock import MagicMock, patch +from unittest.mock import patch +from sqlalchemy import select + +from mavedb.lib import gnomad as gnomad_lib +from mavedb.lib.gnomad import GNOMAD_DATA_VERSION from mavedb.lib.types.workflow import JobExecutionOutcome from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus +from mavedb.models.gnomad_allele_link import GnomadAlleleLink from mavedb.models.gnomad_variant import GnomADVariant -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.annotation_event import AnnotationEvent from mavedb.worker.jobs.external_services.gnomad import link_gnomad_variants from mavedb.worker.lib.managers.job_manager import JobManager @@ -22,7 +26,7 @@ class TestLinkGnomadVariantsUnit: """Unit tests for the link_gnomad_variants job.""" - async def test_link_gnomad_variants_no_variants_with_caids( + async def test_link_gnomad_variants_no_alleles_with_caids( self, session, with_populated_domain_data, @@ -30,7 +34,7 @@ async def test_link_gnomad_variants_no_variants_with_caids( mock_worker_ctx, sample_link_gnomad_variants_run, ): - """Test linking gnomAD variants when no mapped variants have CAIDs.""" + """No authoritative alleles with CAIDs -> the job succeeds with nothing to do.""" result = await link_gnomad_variants( mock_worker_ctx, 1, @@ -47,7 +51,7 @@ async def test_link_gnomad_variants_no_gnomad_matches( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): 
"""Test linking gnomAD variants when no gnomAD variants match the CAIDs.""" @@ -55,7 +59,7 @@ async def test_link_gnomad_variants_no_gnomad_matches( with ( patch( "mavedb.worker.jobs.external_services.gnomad.gnomad_variant_data_for_caids", - return_value={}, + return_value=[], ), patch("mavedb.worker.jobs.external_services.gnomad.athena.engine", athena_engine), ): @@ -75,7 +79,7 @@ async def test_link_gnomad_variants_call_linking_method( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): """Test that the linking method is called when gnomAD variants match CAIDs.""" @@ -83,11 +87,11 @@ async def test_link_gnomad_variants_call_linking_method( with ( patch( "mavedb.worker.jobs.external_services.gnomad.gnomad_variant_data_for_caids", - return_value=[MagicMock()], + return_value=[object()], ), patch( - "mavedb.worker.jobs.external_services.gnomad.link_gnomad_variants_to_mapped_variants", - return_value=1, + "mavedb.worker.jobs.external_services.gnomad.link_gnomad_variants_to_alleles", + return_value={}, ) as mock_linking_method, patch("mavedb.worker.jobs.external_services.gnomad.athena.engine", athena_engine), ): @@ -108,7 +112,7 @@ async def test_link_gnomad_variants_propagates_exceptions( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): """Test that exceptions during the linking process are propagated.""" @@ -134,7 +138,7 @@ async def test_link_gnomad_variants_propagates_exceptions( class TestLinkGnomadVariantsIntegration: """Integration tests for the link_gnomad_variants job.""" - async def test_link_gnomad_variants_no_variants_with_caids( + async def test_link_gnomad_variants_no_alleles_with_caids( self, session, with_populated_domain_data, @@ -142,19 +146,18 @@ async def test_link_gnomad_variants_no_variants_with_caids( mock_worker_ctx, 
sample_link_gnomad_variants_run, ): - """Test the end-to-end functionality of the link_gnomad_variants job when no variants have CAIDs.""" + """Test the end-to-end functionality of the link_gnomad_variants job when no alleles have CAIDs.""" result = await link_gnomad_variants(mock_worker_ctx, sample_link_gnomad_variants_run.id) assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that no gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) == 0 + # Verify that no allele links were created + assert len(session.scalars(select(GnomadAlleleLink)).all()) == 0 - # Verify no annotations were rendered (since there were no variants with CAIDs) - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 + # Verify no annotations were rendered (since there were no alleles with CAIDs) + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify job status updates session.refresh(sample_link_gnomad_variants_run) @@ -167,13 +170,13 @@ async def test_link_gnomad_variants_no_matching_caids( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): """Test the end-to-end functionality of the link_gnomad_variants job when no matching CAIDs are found.""" - # Update the created mapped variant to have a CAID that won't match any gnomAD data - mapped_variant = session.query(MappedVariant).first() - mapped_variant.clingen_allele_id = "NON_MATCHING_CAID" + # Update the allele to have a CAID that won't match any seeded gnomAD data + _, allele = setup_sample_alleles_with_caid + allele.clingen_allele_id = "NON_MATCHING_CAID" session.commit() # Patch the athena engine to use the mock athena_engine fixture @@ -183,15 +186,16 @@ async def test_link_gnomad_variants_no_matching_caids( assert 
isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that no gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) == 0 + # Verify that no allele links were created + assert len(session.scalars(select(GnomadAlleleLink)).all()) == 0 - # Verify a skipped annotation status was rendered (since there were variants with CAIDs) - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "skipped" - assert annotation_statuses[0].annotation_type == "gnomad_allele_frequency" + # Verify an absent event was rendered: the allele's CAID was queried but gnomAD had no record. + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "absent" + assert events[0].reason == "no_record" + assert events[0].annotation_type == "gnomad_allele_frequency" + assert events[0].allele_id == allele.id and events[0].variant_id is None # Verify job status updates session.refresh(sample_link_gnomad_variants_run) @@ -204,10 +208,11 @@ async def test_link_gnomad_variants_successful_linking_independent( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): """Test the end-to-end functionality of the link_gnomad_variants job with successful linking.""" + _, allele = setup_sample_alleles_with_caid # Patch the athena engine to use the mock athena_engine fixture with patch("mavedb.worker.jobs.external_services.gnomad.athena.engine", athena_engine): @@ -216,20 +221,173 @@ async def test_link_gnomad_variants_successful_linking_independent( assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) 
> 0 + # Verify that a gnomAD variant was created and a live link to the allele established + assert len(session.scalars(select(GnomADVariant)).all()) > 0 + live_links = session.scalars( + select(GnomadAlleleLink).where( + GnomadAlleleLink.allele_id == allele.id, + GnomadAlleleLink.current, + ) + ).all() + assert len(live_links) == 1 # Verify annotation status was rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "gnomad_allele_frequency" + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "gnomad_allele_frequency" + assert events[0].allele_id is not None and events[0].variant_id is None # Verify job status updates session.refresh(sample_link_gnomad_variants_run) assert sample_link_gnomad_variants_run.status == JobStatus.SUCCEEDED + async def test_link_gnomad_variants_links_and_annotates_rt_derived_allele( + self, + session, + with_populated_domain_data, + with_gnomad_linking_job, + mock_worker_ctx, + sample_link_gnomad_variants_run, + setup_rt_derived_allele_with_caid, + athena_engine, + ): + """gnomAD linkage covers the full allele set, not just authoritative links: the RT-derived + allele carries the CAID gnomAD matches and is linked. Events are now allele-keyed, so the + RT-derived allele's PRESENT status is recorded directly — the limitation the per-variant + bandaid had (dropping the RT-derived allele's status) is lifted. 
Each allele in the score set + gets its own event: present for the linked RT allele, absent for the unmatched authoritative + one.""" + variant, authoritative_allele, rt_allele = setup_rt_derived_allele_with_caid + + with patch("mavedb.worker.jobs.external_services.gnomad.athena.engine", athena_engine): + result = await link_gnomad_variants(mock_worker_ctx, sample_link_gnomad_variants_run.id) + + assert result.status == JobStatus.SUCCEEDED + + # The RT-derived (non-authoritative) allele IS linked — the core fix. + rt_links = session.scalars( + select(GnomadAlleleLink).where( + GnomadAlleleLink.allele_id == rt_allele.id, + GnomadAlleleLink.current, + ) + ).all() + assert len(rt_links) == 1 + # The authoritative allele's CAID had no gnomAD match, so it gets no link. + assert ( + len( + session.scalars( + select(GnomadAlleleLink).where(GnomadAlleleLink.allele_id == authoritative_allele.id) + ).all() + ) + == 0 + ) + + # One allele-keyed event per allele (never per-variant): the linked RT allele is present, + # the unmatched authoritative allele is absent. The variant resolves its derived allele's + # status through the live links — no variant_id on the events. 
+ events = session.scalars(select(AnnotationEvent)).all() + assert all(e.annotation_type == "gnomad_allele_frequency" for e in events) + assert all(e.variant_id is None for e in events) + by_allele = {e.allele_id: e for e in events} + assert by_allele[rt_allele.id].disposition == "present" + assert by_allele[authoritative_allele.id].disposition == "absent" + assert by_allele[authoritative_allele.id].reason == "no_record" + + async def test_link_gnomad_variants_skips_allele_already_current( + self, + session, + with_populated_domain_data, + with_gnomad_linking_job, + mock_worker_ctx, + sample_link_gnomad_variants_run, + setup_sample_alleles_with_caid, + ): + """An allele already linked at the current gnomAD version is skipped: no Athena query, the + status reports SUCCESS/preexisting, and the existing link is not churned.""" + _, allele = setup_sample_alleles_with_caid + + # Simulate a prior run: a live link at the current gnomAD version. + gnomad_variant = GnomADVariant( + db_name="gnomAD", + db_identifier="1-12345-G-A", + db_version=GNOMAD_DATA_VERSION, + allele_count=1, + allele_number=2, + allele_frequency=0.5, + ) + session.add(gnomad_variant) + session.commit() + session.add(GnomadAlleleLink(allele_id=allele.id, gnomad_variant_id=gnomad_variant.id)) + session.commit() + + with patch("mavedb.worker.jobs.external_services.gnomad.gnomad_variant_data_for_caids") as fetch_spy: + result = await link_gnomad_variants(mock_worker_ctx, sample_link_gnomad_variants_run.id) + + fetch_spy.assert_not_called() # version-keyed skip avoided the external query entirely + assert result.data["preexisting_allele_count"] == 1 + assert result.data["created_allele_count"] == 0 + + # Link not churned: still exactly one, still live. 
+ links = session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.allele_id == allele.id)).all() + assert len(links) == 1 + assert links[0].valid_to is None + + # Event is PRESENT, marked preexisting (the link was already current; not created this run). + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "preexisting" + + async def test_link_gnomad_variants_force_refetches_without_churn( + self, + session, + with_populated_domain_data, + with_gnomad_linking_job, + mock_worker_ctx, + sample_link_gnomad_variants_run, + setup_sample_alleles_with_caid, + athena_engine, + ): + """force bypasses the skip and re-fetches, but the linker supersedes only on change, so a + forced re-run of unchanged data reports preexisting and does not churn the link.""" + _, allele = setup_sample_alleles_with_caid + + # Prior live link pointing at the same variant the Athena mock resolves to (1-12345-G-A). 
+ gnomad_variant = GnomADVariant( + db_name="gnomAD", + db_identifier="1-12345-G-A", + db_version=GNOMAD_DATA_VERSION, + allele_count=23, + allele_number=32432423, + allele_frequency=23 / 32432423, + ) + session.add(gnomad_variant) + session.commit() + session.add(GnomadAlleleLink(allele_id=allele.id, gnomad_variant_id=gnomad_variant.id)) + session.commit() + + sample_link_gnomad_variants_run.job_params = {**sample_link_gnomad_variants_run.job_params, "force": True} + session.commit() + + with ( + patch("mavedb.worker.jobs.external_services.gnomad.athena.engine", athena_engine), + patch( + "mavedb.worker.jobs.external_services.gnomad.gnomad_variant_data_for_caids", + side_effect=gnomad_lib.gnomad_variant_data_for_caids, + ) as fetch_spy, + ): + result = await link_gnomad_variants(mock_worker_ctx, sample_link_gnomad_variants_run.id) + + fetch_spy.assert_called_once() # force bypassed the version-keyed skip + assert result.data["preexisting_allele_count"] == 1 + assert result.data["created_allele_count"] == 0 + # Unchanged → no churn. 
+ links = session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.allele_id == allele.id)).all() + assert len(links) == 1 + assert links[0].valid_to is None + async def test_link_gnomad_variants_successful_linking_pipeline( self, session, @@ -237,7 +395,7 @@ async def test_link_gnomad_variants_successful_linking_pipeline( mock_worker_ctx, sample_link_gnomad_variants_run_pipeline, sample_link_gnomad_variants_pipeline, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): """Test the end-to-end functionality of the link_gnomad_variants job with successful linking in a pipeline.""" @@ -249,15 +407,16 @@ async def test_link_gnomad_variants_successful_linking_pipeline( assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - # Verify that gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) > 0 + # Verify that allele links were created + assert len(session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.current)).all()) > 0 # Verify annotation status was rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "gnomad_allele_frequency" + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "gnomad_allele_frequency" + assert events[0].allele_id is not None and events[0].variant_id is None # Verify job status updates session.refresh(sample_link_gnomad_variants_run_pipeline) @@ -274,7 +433,7 @@ async def test_link_gnomad_variants_exceptions_handled_by_decorators( with_gnomad_linking_job, mock_worker_ctx, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, athena_engine, ): 
"""Test that exceptions during the linking process are handled by decorators.""" @@ -317,7 +476,7 @@ async def test_link_gnomad_variants_with_arq_context_independent( with_gnomad_linking_job, athena_engine, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): """Test that the link_gnomad_variants job works with the ARQ context fixture.""" @@ -328,15 +487,16 @@ async def test_link_gnomad_variants_with_arq_context_independent( await arq_worker.async_run() await arq_worker.run_check() - # Verify that gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) > 0 + # Verify that allele links were created + assert len(session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.current)).all()) > 0 # Verify annotation status was rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "gnomad_allele_frequency" + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "gnomad_allele_frequency" + assert events[0].allele_id is not None and events[0].variant_id is None # Verify that the job completed successfully session.refresh(sample_link_gnomad_variants_run) @@ -351,7 +511,7 @@ async def test_link_gnomad_variants_with_arq_context_pipeline( athena_engine, sample_link_gnomad_variants_run_pipeline, sample_link_gnomad_variants_pipeline, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): """Test that the link_gnomad_variants job works with the ARQ context fixture in a pipeline.""" @@ -362,15 +522,16 @@ async def test_link_gnomad_variants_with_arq_context_pipeline( await arq_worker.async_run() await arq_worker.run_check() - # Verify that gnomAD 
variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) > 0 + # Verify that allele links were created + assert len(session.scalars(select(GnomadAlleleLink).where(GnomadAlleleLink.current)).all()) > 0 # Verify annotation status was rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "gnomad_allele_frequency" + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "gnomad_allele_frequency" + assert events[0].allele_id is not None and events[0].variant_id is None # Verify that the job completed successfully session.refresh(sample_link_gnomad_variants_run_pipeline) @@ -389,7 +550,7 @@ async def test_link_gnomad_variants_with_arq_context_exception_handling_independ with_gnomad_linking_job, athena_engine, sample_link_gnomad_variants_run, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): """Test that exceptions in the link_gnomad_variants job are handled with the ARQ context fixture.""" @@ -406,13 +567,12 @@ async def test_link_gnomad_variants_with_arq_context_exception_handling_independ await arq_worker.run_check() mock_send_slack_job_error.assert_called_once() - # Verify that no gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) == 0 + # Verify that no allele links were created + assert len(session.scalars(select(GnomadAlleleLink)).all()) == 0 # Verify no annotations were rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify that the job errored 
session.refresh(sample_link_gnomad_variants_run) @@ -427,7 +587,7 @@ async def test_link_gnomad_variants_with_arq_context_exception_handling_pipeline athena_engine, sample_link_gnomad_variants_pipeline, sample_link_gnomad_variants_run_pipeline, - setup_sample_variants_with_caid, + setup_sample_alleles_with_caid, ): """Test that exceptions in the link_gnomad_variants job are handled with the ARQ context fixture.""" @@ -444,13 +604,12 @@ async def test_link_gnomad_variants_with_arq_context_exception_handling_pipeline await arq_worker.run_check() mock_send_slack_job_error.assert_called_once() - # Verify that no gnomAD variants were linked - gnomad_variants = session.query(GnomADVariant).all() - assert len(gnomad_variants) == 0 + # Verify that no allele links were created + assert len(session.scalars(select(GnomadAlleleLink)).all()) == 0 # Verify no annotations were rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 0 # Verify that the job errored session.refresh(sample_link_gnomad_variants_run_pipeline) diff --git a/tests/worker/jobs/external_services/test_hgvs.py b/tests/worker/jobs/external_services/test_hgvs.py deleted file mode 100644 index 946724cc..00000000 --- a/tests/worker/jobs/external_services/test_hgvs.py +++ /dev/null @@ -1,544 +0,0 @@ -# ruff: noqa: E402 - -import pytest - -pytest.importorskip("arq") - -from unittest.mock import patch - -from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus -from mavedb.worker.jobs.external_services.hgvs import populate_hgvs_for_score_set -from mavedb.worker.lib.managers.job_manager import JobManager - -pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") - 
-SAMPLE_CA_ALLELE_DATA = { - "genomicAlleles": [ - { - "referenceGenome": "GRCh38", - "hgvs": ["NC_000001.11:g.12345A>G"], - } - ], - "transcriptAlleles": [ - { - "hgvs": ["NM_000000.1:c.1A>G"], - "proteinEffect": {"hgvs": "NP_000000.1:p.Met1Val"}, - "MANE": { - "nucleotide": {"RefSeq": {"hgvs": "NM_000000.1:c.1A>G"}}, - "protein": {"RefSeq": {"hgvs": "NP_000000.1:p.Met1Val"}}, - }, - } - ], -} - -SAMPLE_PA_ALLELE_DATA = { - "aminoAcidAlleles": [ - { - "hgvs": ["NP_000000.1:p.Met1Val"], - } - ], -} - - -@pytest.mark.asyncio -@pytest.mark.unit -class TestPopulateHgvsForScoreSetUnit: - """Unit tests for the populate_hgvs_for_score_set job.""" - - async def test_no_mapped_variants( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - ): - """Test populating HGVS when no mapped variants exist.""" - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - async def test_variant_without_caid_skipped( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that a variant without a CAID gets a skipped annotation.""" - _, mapped_variant = setup_sample_variants_with_caid_for_hgvs - mapped_variant.clingen_allele_id = None - session.commit() - - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["skipped_count"] == 1 - - async def test_variant_with_multi_caid_skipped( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - 
sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that a variant with a multi-variant CAID gets a skipped annotation.""" - _, mapped_variant = setup_sample_variants_with_caid_for_hgvs - mapped_variant.clingen_allele_id = "CA123,CA456" - session.commit() - - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["skipped_count"] == 1 - - async def test_successful_ca_allele_hgvs_population( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test successful HGVS population for a CA allele.""" - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=SAMPLE_CA_ALLELE_DATA, - ), - ): - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["populated_count"] == 1 - - _, mapped_variant = setup_sample_variants_with_caid_for_hgvs - session.refresh(mapped_variant) - assert mapped_variant.hgvs_g == "NC_000001.11:g.12345A>G" - - async def test_clingen_api_error_recorded_as_failed( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that ClinGen API errors are recorded as failed annotations.""" - import requests - - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=requests.exceptions.ConnectionError("Connection refused"), - ), - ): - result = await populate_hgvs_for_score_set( - 
mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["failed_count"] == 1 - - async def test_total_api_failure_sends_slack_alert( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that a Slack alert is sent when all variants fail HGVS population.""" - import requests - - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=requests.exceptions.ConnectionError("Connection refused"), - ), - patch("mavedb.worker.jobs.external_services.hgvs.log_and_send_slack_message") as mock_slack, - ): - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["failed_count"] == 1 - assert result.data["populated_count"] == 0 - mock_slack.assert_called_once() - - async def test_clingen_allele_not_found_skipped( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that a 404 from ClinGen results in a skipped annotation.""" - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=None, - ), - ): - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["skipped_count"] == 1 - - async def test_propagates_exceptions( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, 
- sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that unexpected exceptions are propagated.""" - with patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=Exception("Test exception"), - ): - with pytest.raises(Exception) as exc_info: - await populate_hgvs_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_hgvs_run.id), - ) - - assert str(exc_info.value) == "Test exception" - - -@pytest.mark.asyncio -@pytest.mark.integration -class TestPopulateHgvsForScoreSetIntegration: - """Integration tests for the populate_hgvs_for_score_set job.""" - - async def test_no_mapped_variants( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - ): - """Test end-to-end when no mapped variants exist.""" - result = await populate_hgvs_for_score_set(mock_worker_ctx, sample_populate_hgvs_run.id) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.SUCCEEDED - - async def test_successful_hgvs_population( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test end-to-end successful HGVS population.""" - with patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=SAMPLE_CA_ALLELE_DATA, - ): - result = await populate_hgvs_for_score_set(mock_worker_ctx, sample_populate_hgvs_run.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify mapped variant was updated with HGVS - mapped_variant = session.query(MappedVariant).first() - assert 
mapped_variant.hgvs_g == "NC_000001.11:g.12345A>G" - - # Verify annotation status was rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "mapped_hgvs" - - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.SUCCEEDED - - async def test_successful_hgvs_population_pipeline( - self, - session, - with_populated_domain_data, - mock_worker_ctx, - sample_populate_hgvs_run_pipeline, - sample_populate_hgvs_pipeline, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test end-to-end HGVS population in a pipeline.""" - with patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=SAMPLE_CA_ALLELE_DATA, - ): - result = await populate_hgvs_for_score_set(mock_worker_ctx, sample_populate_hgvs_run_pipeline.id) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - # Verify mapped variant was updated - mapped_variant = session.query(MappedVariant).first() - assert mapped_variant.hgvs_g == "NC_000001.11:g.12345A>G" - - # Verify annotation status - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "mapped_hgvs" - - # Verify job and pipeline status - session.refresh(sample_populate_hgvs_run_pipeline) - assert sample_populate_hgvs_run_pipeline.status == JobStatus.SUCCEEDED - - session.refresh(sample_populate_hgvs_pipeline) - assert sample_populate_hgvs_pipeline.status == PipelineStatus.SUCCEEDED - - async def test_variant_without_caid_creates_skipped_annotation( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - 
"""Test that variants without CAIDs get a skipped annotation status.""" - _, mapped_variant = setup_sample_variants_with_caid_for_hgvs - mapped_variant.clingen_allele_id = None - session.commit() - - result = await populate_hgvs_for_score_set(mock_worker_ctx, sample_populate_hgvs_run.id) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "skipped" - assert annotation_statuses[0].annotation_type == "mapped_hgvs" - - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.SUCCEEDED - - async def test_exceptions_handled_by_decorators( - self, - session, - with_populated_domain_data, - with_populate_hgvs_job, - mock_worker_ctx, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that unexpected exceptions are handled by decorators.""" - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - result = await populate_hgvs_for_score_set( - mock_worker_ctx, - sample_populate_hgvs_run.id, - ) - - mock_send_slack_job_error.assert_called_once() - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.ERRORED - assert isinstance(result.exception, Exception) - - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.ERRORED - - -@pytest.mark.asyncio -@pytest.mark.integration -class TestPopulateHgvsForScoreSetArqContext: - """Tests for populate_hgvs_for_score_set job using the ARQ context fixture.""" - - async def test_with_arq_context_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_populate_hgvs_job, - 
sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that the job works with the ARQ context fixture.""" - with patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=SAMPLE_CA_ALLELE_DATA, - ): - await arq_redis.enqueue_job("populate_hgvs_for_score_set", sample_populate_hgvs_run.id) - await arq_worker.async_run() - await arq_worker.run_check() - - # Verify mapped variant was updated - mapped_variant = session.query(MappedVariant).first() - assert mapped_variant.hgvs_g == "NC_000001.11:g.12345A>G" - - # Verify annotation status - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "mapped_hgvs" - - # Verify job completed - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.SUCCEEDED - - async def test_with_arq_context_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - sample_populate_hgvs_run_pipeline, - sample_populate_hgvs_pipeline, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that the job works with the ARQ context fixture in a pipeline.""" - with patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - return_value=SAMPLE_CA_ALLELE_DATA, - ): - await arq_redis.enqueue_job("populate_hgvs_for_score_set", sample_populate_hgvs_run_pipeline.id) - await arq_worker.async_run() - await arq_worker.run_check() - - # Verify mapped variant was updated - mapped_variant = session.query(MappedVariant).first() - assert mapped_variant.hgvs_g == "NC_000001.11:g.12345A>G" - - # Verify annotation status - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "mapped_hgvs" - - # Verify job 
and pipeline status - session.refresh(sample_populate_hgvs_run_pipeline) - assert sample_populate_hgvs_run_pipeline.status == JobStatus.SUCCEEDED - - session.refresh(sample_populate_hgvs_pipeline) - assert sample_populate_hgvs_pipeline.status == PipelineStatus.SUCCEEDED - - async def test_with_arq_context_exception_handling_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_populate_hgvs_job, - sample_populate_hgvs_run, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that exceptions are handled with the ARQ context fixture.""" - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - await arq_redis.enqueue_job("populate_hgvs_for_score_set", sample_populate_hgvs_run.id) - await arq_worker.async_run() - await arq_worker.run_check() - - mock_send_slack_job_error.assert_called_once() - - # Verify no annotations were rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify job errored - session.refresh(sample_populate_hgvs_run) - assert sample_populate_hgvs_run.status == JobStatus.ERRORED - - async def test_with_arq_context_exception_handling_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - sample_populate_hgvs_pipeline, - sample_populate_hgvs_run_pipeline, - setup_sample_variants_with_caid_for_hgvs, - ): - """Test that exceptions in pipeline context are handled.""" - with ( - patch( - "mavedb.worker.jobs.external_services.hgvs.get_clingen_allele_data", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - await arq_redis.enqueue_job("populate_hgvs_for_score_set", sample_populate_hgvs_run_pipeline.id) - await 
arq_worker.async_run() - await arq_worker.run_check() - - mock_send_slack_job_error.assert_called_once() - - # Verify no annotations were rendered - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - # Verify job errored - session.refresh(sample_populate_hgvs_run_pipeline) - assert sample_populate_hgvs_run_pipeline.status == JobStatus.ERRORED - - # Verify pipeline failed - session.refresh(sample_populate_hgvs_pipeline) - assert sample_populate_hgvs_pipeline.status == PipelineStatus.FAILED diff --git a/tests/worker/jobs/external_services/test_variant_translation.py b/tests/worker/jobs/external_services/test_variant_translation.py deleted file mode 100644 index 3e1f364e..00000000 --- a/tests/worker/jobs/external_services/test_variant_translation.py +++ /dev/null @@ -1,770 +0,0 @@ -# ruff: noqa: E402 - -import pytest - -pytest.importorskip("arq") - -from unittest.mock import patch - -from sqlalchemy import select - -from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.enums.job_pipeline import FailureCategory, JobStatus, PipelineStatus -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.variant import Variant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus -from mavedb.models.variant_translation import VariantTranslation -from mavedb.worker.jobs.external_services.variant_translation import populate_variant_translations_for_score_set -from mavedb.worker.lib.managers.job_manager import JobManager - -pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") - - -# --- Unit Tests --- - - -@pytest.mark.asyncio -@pytest.mark.unit -class TestPopulateVariantTranslationsUnit: - """Unit tests for the populate_variant_translations_for_score_set job.""" - - async def test_no_mapped_variants( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - 
sample_populate_variant_translations_run, - ): - """Test that the job succeeds with zero translations when no mapped variants exist.""" - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["translations_created"] == 0 - - async def test_variant_without_caid_no_translations( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a variant without a CAID results in no translations.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - mapped_variant.clingen_allele_id = None - session.commit() - - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["translations_created"] == 0 - - async def test_ca_allele_creates_translations( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a CA allele creates translations via PA lookup.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00001"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=["CA11111", "CA22222"], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, 
mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - # 1 for PA00001->CA9765210 (the original CA), 2 for PA00001->CA11111 and PA00001->CA22222 - assert result.data["translations_created"] == 3 - - translations = session.scalars(select(VariantTranslation)).all() - assert len(translations) == 3 - - annotation = session.scalars(select(VariantAnnotationStatus)).one() - assert annotation is not None - - async def test_pa_allele_creates_translations( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a PA allele creates translations via CA lookup.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - mapped_variant.clingen_allele_id = "PA99999" - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=["CA33333", "CA44444"], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["translations_created"] == 2 - - translations = session.scalars(select(VariantTranslation)).all() - assert len(translations) == 2 - aa_ids = {t.aa_clingen_id for t in translations} - assert aa_ids == {"PA99999"} - - async def test_multi_variant_caid_expanded( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that comma-separated CAIDs are expanded and each processed independently.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - 
mapped_variant.clingen_allele_id = "CA55555,CA66666" - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00002"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=[], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - # PA00002->CA55555 and PA00002->CA66666 - assert result.data["translations_created"] == 2 - - async def test_ca_allele_no_pa_ids_skipped( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a CA allele with no canonical PA IDs results in a skip.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=[], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["alleles_skipped"] == 1 - assert result.data["translations_created"] == 0 - - annotation = session.scalars(select(VariantAnnotationStatus)).one() - assert annotation.status == "skipped" - - async def test_pa_allele_no_ca_ids_skipped( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a PA allele with no registered CA IDs results in a skip.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - 
mapped_variant.clingen_allele_id = "PA88888" - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=[], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["alleles_skipped"] == 1 - assert result.data["translations_created"] == 0 - - async def test_ca_allele_api_failure_records_failed_annotation( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a ClinGen API failure for CA allele records a failed annotation.""" - import requests - - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=requests.exceptions.ConnectionError("Connection failed"), - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.FAILED - assert result.failure_category == FailureCategory.DEPENDENCY_FAILURE - assert result.data["alleles_failed"] == 1 - - annotation = session.scalars(select(VariantAnnotationStatus)).one() - assert annotation.status == "failed" - - async def test_unrecognized_allele_format_skipped( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that an unrecognized allele ID format is skipped.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - mapped_variant.clingen_allele_id = "XX12345" - 
session.commit() - - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["alleles_skipped"] == 1 - - async def test_duplicate_translations_not_created( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that duplicate translations are not created on re-run.""" - # Pre-populate a translation - session.add(VariantTranslation(aa_clingen_id="PA00003", nt_clingen_id="CA9765210")) - session.commit() - - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00003"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=[], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - assert result.data["translations_created"] == 0 - - translations = session.scalars( - select(VariantTranslation).where(VariantTranslation.aa_clingen_id == "PA00003") - ).all() - assert len(translations) == 1 - - async def test_propagates_exceptions( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that unexpected exceptions are propagated.""" - with patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=Exception("Test exception"), - ): - with pytest.raises(Exception) as exc_info: - await 
populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert str(exc_info.value) == "Test exception" - - async def test_multiple_alleles_sharing_pa_no_duplicate_error( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that two CA alleles mapping to the same PA don't cause a UniqueViolation. - - This is a regression test for a bug where the SELECT-then-INSERT upsert pattern - failed to detect in-session duplicates: both alleles' iterations called - upsert_variant_translations with overlapping (PA, CA) pairs, the SELECT found - no committed row, both staged db.add() for the same pair, and the subsequent - update_progress commit raised a UniqueViolation. - """ - # Add a second variant with a different CA allele under the same score set. - score_set_id = sample_populate_variant_translations_run.job_params["score_set_id"] - - variant2 = Variant( - urn="urn:variant:test-second-ca-allele", - score_set_id=score_set_id, - hgvs_nt="NM_000000.1:c.2T>G", - hgvs_pro="NP_000000.1:p.Val2Gly", - data={}, - ) - session.add(variant2) - session.commit() - mapped_variant2 = MappedVariant( - variant_id=variant2.id, - clingen_allele_id="CA_SECOND", - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - ) - session.add(mapped_variant2) - session.commit() - - # Both CA alleles resolve to the same PA. The PA then returns the same set of - # registered CAs for both iterations, producing fully overlapping translation pairs. 
- with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA_SHARED"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=["CA9765210", "CA_SECOND"], - ), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.SUCCEEDED - - translations = session.scalars(select(VariantTranslation)).all() - pairs = {(t.aa_clingen_id, t.nt_clingen_id) for t in translations} - # PA_SHARED paired with each CA: CA9765210 (original from allele 1), - # CA_SECOND (original from allele 2), plus both as registered CAs. - assert ("PA_SHARED", "CA9765210") in pairs - assert ("PA_SHARED", "CA_SECOND") in pairs - - async def test_total_api_failure_returns_failed( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that the job returns FAILED when all variant translation lookups fail.""" - import requests - - with patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=requests.exceptions.ConnectionError("Connection failed"), - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_variant_translations_run.id), - ) - - assert result.status == JobStatus.FAILED - assert result.failure_category == FailureCategory.DEPENDENCY_FAILURE - assert result.data["alleles_failed"] == 1 - assert result.data["translations_created"] == 0 - - -# --- Integration Tests --- - - -@pytest.mark.asyncio -@pytest.mark.integration -class TestPopulateVariantTranslationsIntegration: - """Integration tests that exercise 
the full decorator stack.""" - - async def test_no_mapped_variants( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - ): - """Test end-to-end when no mapped variants exist.""" - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, sample_populate_variant_translations_run.id - ) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.SUCCEEDED - - async def test_successful_job_updates_status( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a successful job run updates the job status to SUCCEEDED.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00004"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=["CA77777"], - ), - ): - await populate_variant_translations_for_score_set( - mock_worker_ctx, - sample_populate_variant_translations_run.id, - ) - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.SUCCEEDED - - translations = session.scalars(select(VariantTranslation)).all() - assert len(translations) == 2 # PA00004->CA9765210 and PA00004->CA77777 - - async def test_job_with_pipeline_updates_pipeline_status( - self, - session, - with_populated_domain_data, - mock_worker_ctx, - sample_populate_variant_translations_run_pipeline, - 
sample_populate_variant_translations_pipeline, - setup_sample_variants_with_caid_for_translation, - ): - """Test that a job in a pipeline updates the pipeline status on success.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00005"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=[], - ), - ): - await populate_variant_translations_for_score_set( - mock_worker_ctx, - sample_populate_variant_translations_run_pipeline.id, - ) - - session.refresh(sample_populate_variant_translations_run_pipeline) - session.refresh(sample_populate_variant_translations_pipeline) - assert sample_populate_variant_translations_run_pipeline.status == JobStatus.SUCCEEDED - assert sample_populate_variant_translations_pipeline.status == PipelineStatus.SUCCEEDED - - async def test_variant_without_caid_creates_skipped_annotation( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that variants without CAIDs produce no annotations (filtered before processing).""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - mapped_variant.clingen_allele_id = None - session.commit() - - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, sample_populate_variant_translations_run.id - ) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["translations_created"] == 0 - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.SUCCEEDED - - async def test_unrecognized_allele_creates_skipped_annotation( - self, 
- session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that unrecognized allele formats create skipped annotations through the full stack.""" - _, mapped_variant = setup_sample_variants_with_caid_for_translation - mapped_variant.clingen_allele_id = "XX12345" - session.commit() - - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, sample_populate_variant_translations_run.id - ) - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "skipped" - assert annotation_statuses[0].annotation_type == "variant_translation" - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.SUCCEEDED - - async def test_exceptions_handled_by_decorators( - self, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - mock_worker_ctx, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that unexpected exceptions are handled by decorators.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - result = await populate_variant_translations_for_score_set( - mock_worker_ctx, - sample_populate_variant_translations_run.id, - ) - - mock_send_slack_job_error.assert_called_once() - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.ERRORED - assert isinstance(result.exception, Exception) - - 
session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.ERRORED - - -# --- ARQ Context Tests --- - - -@pytest.mark.asyncio -@pytest.mark.integration -class TestPopulateVariantTranslationsArqContext: - """Tests for populate_variant_translations_for_score_set job using the ARQ context fixture.""" - - async def test_with_arq_context_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that the job works with the ARQ context fixture.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00006"], - ), - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=["CA88888"], - ), - ): - await arq_redis.enqueue_job( - "populate_variant_translations_for_score_set", - sample_populate_variant_translations_run.id, - ) - await arq_worker.async_run() - await arq_worker.run_check() - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.SUCCEEDED - - translations = session.scalars(select(VariantTranslation)).all() - assert len(translations) == 2 # PA00006->CA9765210 and PA00006->CA88888 - - async def test_with_arq_context_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - sample_populate_variant_translations_run_pipeline, - sample_populate_variant_translations_pipeline, - setup_sample_variants_with_caid_for_translation, - ): - """Test that the job works with the ARQ context fixture in a pipeline.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - return_value=["PA00007"], - ), - patch( - 
"mavedb.worker.jobs.external_services.variant_translation.get_matching_registered_ca_ids", - return_value=[], - ), - ): - await arq_redis.enqueue_job( - "populate_variant_translations_for_score_set", - sample_populate_variant_translations_run_pipeline.id, - ) - await arq_worker.async_run() - await arq_worker.run_check() - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 1 - assert annotation_statuses[0].status == "success" - assert annotation_statuses[0].annotation_type == "variant_translation" - - session.refresh(sample_populate_variant_translations_run_pipeline) - assert sample_populate_variant_translations_run_pipeline.status == JobStatus.SUCCEEDED - - session.refresh(sample_populate_variant_translations_pipeline) - assert sample_populate_variant_translations_pipeline.status == PipelineStatus.SUCCEEDED - - async def test_with_arq_context_exception_handling_independent( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - with_populate_variant_translations_job, - sample_populate_variant_translations_run, - setup_sample_variants_with_caid_for_translation, - ): - """Test that exceptions are handled with the ARQ context fixture.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - await arq_redis.enqueue_job( - "populate_variant_translations_for_score_set", - sample_populate_variant_translations_run.id, - ) - await arq_worker.async_run() - await arq_worker.run_check() - - mock_send_slack_job_error.assert_called_once() - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - session.refresh(sample_populate_variant_translations_run) - assert sample_populate_variant_translations_run.status == JobStatus.ERRORED - - async def 
test_with_arq_context_exception_handling_pipeline( - self, - arq_redis, - arq_worker, - session, - with_populated_domain_data, - sample_populate_variant_translations_pipeline, - sample_populate_variant_translations_run_pipeline, - setup_sample_variants_with_caid_for_translation, - ): - """Test that exceptions in pipeline context are handled.""" - with ( - patch( - "mavedb.worker.jobs.external_services.variant_translation.get_canonical_pa_ids", - side_effect=Exception("Test exception"), - ), - patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, - ): - await arq_redis.enqueue_job( - "populate_variant_translations_for_score_set", - sample_populate_variant_translations_run_pipeline.id, - ) - await arq_worker.async_run() - await arq_worker.run_check() - - mock_send_slack_job_error.assert_called_once() - - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == 0 - - session.refresh(sample_populate_variant_translations_run_pipeline) - assert sample_populate_variant_translations_run_pipeline.status == JobStatus.ERRORED - - session.refresh(sample_populate_variant_translations_pipeline) - assert sample_populate_variant_translations_pipeline.status == PipelineStatus.FAILED diff --git a/tests/worker/jobs/external_services/test_vep.py b/tests/worker/jobs/external_services/test_vep.py index 611c3389..19587f5c 100644 --- a/tests/worker/jobs/external_services/test_vep.py +++ b/tests/worker/jobs/external_services/test_vep.py @@ -4,29 +4,49 @@ pytest.importorskip("arq") +from datetime import date, timedelta from unittest.mock import patch from sqlalchemy import select from mavedb.lib.types.workflow import JobExecutionOutcome -from mavedb.models.enums.annotation_type import AnnotationType -from mavedb.models.enums.job_pipeline import AnnotationFailureCategory, AnnotationStatus, JobStatus -from mavedb.models.mapped_variant import MappedVariant -from mavedb.models.score_set import 
ScoreSet -from mavedb.models.variant import Variant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.enums.job_pipeline import JobStatus, PipelineStatus +from mavedb.models.annotation_event import AnnotationEvent +from mavedb.models.vep_allele_consequence import VepAlleleConsequence +from mavedb.lib.vep import VepResolution from mavedb.worker.jobs.external_services.vep import populate_vep_for_score_set from mavedb.worker.lib.managers.job_manager import JobManager pytestmark = pytest.mark.usefixtures("patch_db_session_ctxmgr") +_RESOLVE = "mavedb.worker.jobs.external_services.vep._resolve_consequences" +_RELEASE = "mavedb.worker.jobs.external_services.vep.get_ensembl_release" +_ENSEMBL_RELEASE = "116" + + +@pytest.fixture(autouse=True) +def mock_ensembl_release(): + """Stamp every job run with a fixed Ensembl release so tests version-key deterministically without + hitting /info/software. Tests exercising a release-fetch failure override this with an inner patch.""" + with patch(_RELEASE, return_value=_ENSEMBL_RELEASE): + yield + + +def _live_consequences_for(session, allele_id): + return session.scalars( + select(VepAlleleConsequence).where( + VepAlleleConsequence.allele_id == allele_id, + VepAlleleConsequence.current, + ) + ).all() + @pytest.mark.asyncio @pytest.mark.unit class TestPopulateVepForScoreSetUnit: - """Unit tests for populate_vep_for_score_set.""" + """Unit tests for the populate_vep_for_score_set job.""" - async def test_no_mapped_variants( + async def test_no_alleles_with_hgvs( self, session, with_populated_domain_data, @@ -34,7 +54,7 @@ async def test_no_mapped_variants( mock_worker_ctx, sample_populate_vep_run, ): - """Job succeeds with zero counts when no mapped variants exist.""" + """No alleles with HGVS -> the job succeeds with nothing to do.""" result = await populate_vep_for_score_set( mock_worker_ctx, 1, @@ -43,405 +63,451 @@ async def test_no_mapped_variants( assert isinstance(result, 
JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_processed"] == 0 - assert result.data["variants_with_consequences"] == 0 - assert result.data["variants_recoder_failed"] == 0 + assert result.data["created_allele_count"] == 0 + assert result.data["preexisting_allele_count"] == 0 + assert result.data["absent_allele_count"] == 0 + assert result.data["errored_allele_count"] == 0 - async def test_variant_without_hgvs_assay_level_skipped( + async def test_calls_resolver_when_alleles_present( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """A mapped variant with no hgvs_assay_level gets a SKIPPED annotation.""" - _, mapped_variant = setup_sample_variants_for_vep - mapped_variant.hgvs_assay_level = None - session.commit() - - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + """The VEP resolver is invoked once when the score set has HGVS-bearing alleles to query.""" + with patch(_RESOLVE, return_value=VepResolution({}, set())) as mock_resolve: + result = await populate_vep_for_score_set( + mock_worker_ctx, + 1, + JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + ) assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_processed"] == 0 - - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - ).one() - assert annotation.status == AnnotationStatus.SKIPPED - assert annotation.failure_category == AnnotationFailureCategory.MISSING_IDENTIFIER + mock_resolve.assert_called_once() - async def 
test_vep_api_success_sets_consequence_and_annotation( + async def test_propagates_exceptions( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """VEP returns a consequence: mapped variant and SUCCESS annotation are updated.""" - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level - - with patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={hgvs: "missense_variant"}, - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) - - assert isinstance(result, JobExecutionOutcome) - assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_processed"] == 1 - assert result.data["variants_with_consequences"] == 1 - assert result.data["variants_recoder_failed"] == 0 - - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence == "missense_variant" - assert mapped_variant.vep_access_date is not None - - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - ).one() - assert annotation.status == AnnotationStatus.SUCCESS - - async def test_vep_missing_triggers_variant_recoder_fallback( + """Exceptions raised while resolving consequences are propagated.""" + with patch(_RESOLVE, side_effect=Exception("Test exception")): + with pytest.raises(Exception) as exc_info: + await populate_vep_for_score_set( + mock_worker_ctx, + 1, + JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + ) + + assert str(exc_info.value) == "Test exception" + + async def test_aborts_when_release_unavailable( self, session, with_populated_domain_data, 
with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """When VEP misses a variant, Variant Recoder is called and its result fed back to VEP.""" - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level - genomic_hgvs = "NC_000017.11:g.43094692C>T" - + """If the Ensembl release cannot be fetched the job aborts rather than mis-versioning its writes + — the version is load-bearing for the skip, so a failure must propagate, not be swallowed.""" with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - side_effect=[ - {}, # initial VEP pass returns nothing - {genomic_hgvs: "missense_variant"}, # second VEP pass on recoded HGVS - ], - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - return_value={hgvs: [genomic_hgvs]}, - ), + patch(_RELEASE, side_effect=Exception("info/software unavailable")), + patch(_RESOLVE) as mock_resolve, ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + with pytest.raises(Exception) as exc_info: + await populate_vep_for_score_set( + mock_worker_ctx, + 1, + JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + ) + assert str(exc_info.value) == "info/software unavailable" + mock_resolve.assert_not_called() # never queried VEP without a version to stamp + + +@pytest.mark.asyncio +@pytest.mark.integration +class TestPopulateVepForScoreSetIntegration: + """Integration tests for the populate_vep_for_score_set job.""" + + async def test_no_alleles_with_hgvs( + self, + session, + with_populated_domain_data, + with_populate_vep_job, + mock_worker_ctx, + sample_populate_vep_run, + ): + """End-to-end: no alleles -> no consequence rows, no annotations, job succeeds.""" + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) 
assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_with_consequences"] == 1 - assert result.data["variants_recoder_failed"] == 0 - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence == "missense_variant" + assert len(session.scalars(select(VepAlleleConsequence)).all()) == 0 + assert len(session.scalars(select(AnnotationEvent)).all()) == 0 + + session.refresh(sample_populate_vep_run) + assert sample_populate_vep_run.status == JobStatus.SUCCEEDED - async def test_variant_recoder_failure_annotated_as_failed( + async def test_no_consequence_resolved( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """Variant Recoder returning no result for an HGVS produces a FAILED annotation.""" - _, mapped_variant = setup_sample_variants_for_vep + """A genuine empty (VEP queried successfully, found nothing) writes no consequence row and an + ABSENT/no_record event — a trustworthy negative, now that empty is split from a failed request.""" + _, allele = setup_sample_alleles_for_vep - with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={}, - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - return_value={}, # recoder has no result - ), - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + with patch(_RESOLVE, return_value=VepResolution({}, set())): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_without_consequences"] == 0 - assert result.data["variants_recoder_failed"] == 1 + assert 
result.data["absent_allele_count"] == 1 + assert len(_live_consequences_for(session, allele.id)) == 0 - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - ).one() - assert annotation.status == AnnotationStatus.FAILED - assert annotation.failure_category == AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "absent" + assert events[0].reason == "no_record" + assert events[0].annotation_type == "vep_functional_consequence" + assert events[0].allele_id == allele.id and events[0].variant_id is None - async def test_vep_failure_after_recoder_annotated_as_failed( + async def test_request_failure_recorded_as_errored( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """VEP returning no consequence even after Variant Recoder produces a FAILED annotation.""" - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level - genomic_hgvs = "NC_000017.11:g.43094692C>T" + """An allele whose VEP/Recoder request *failed* is FAILED/api_error — distinct from a genuine + empty — so a transient outage is never mistaken for a confirmed absence.""" + _, allele = setup_sample_alleles_for_vep - with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={}, # VEP returns nothing in both passes - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - return_value={hgvs: [genomic_hgvs]}, - ), - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + # Resolver reports the 
allele's HGVS as errored (request failed), not as a hit or an empty. + with patch(_RESOLVE, return_value=VepResolution({}, {allele.hgvs_c})): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - assert isinstance(result, JobExecutionOutcome) assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_without_consequences"] == 1 - assert result.data["variants_recoder_failed"] == 0 + assert result.data["errored_allele_count"] == 1 + assert result.data["absent_allele_count"] == 0 + assert len(_live_consequences_for(session, allele.id)) == 0 + + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "failed" + assert events[0].reason == "api_error" + assert events[0].annotation_type == "vep_functional_consequence" + assert events[0].allele_id == allele.id and events[0].variant_id is None + + async def test_successful_linking_independent( + self, + session, + with_populated_domain_data, + with_populate_vep_job, + mock_worker_ctx, + sample_populate_vep_run, + setup_sample_alleles_for_vep, + ): + """A resolved consequence creates a single live VepAlleleConsequence and a SUCCESS VAS row.""" + _, allele = setup_sample_alleles_for_vep - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - ).one() - assert annotation.status == AnnotationStatus.FAILED - assert annotation.failure_category == AnnotationFailureCategory.EXTERNAL_REFERENCE_NOT_FOUND + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - async def test_vep_none_consequence_routes_to_recoder( + assert result.status == JobStatus.SUCCEEDED + assert result.data["created_allele_count"] == 1 + + live = 
_live_consequences_for(session, allele.id) + assert len(live) == 1 + assert live[0].functional_consequence == "missense_variant" + assert live[0].source_version == _ENSEMBL_RELEASE + assert live[0].access_date == date.today() + + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "vep_functional_consequence" + assert events[0].allele_id == allele.id and events[0].variant_id is None + + async def test_links_and_annotates_rt_derived_allele( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_rt_derived_allele_for_vep, ): - """A None consequence from VEP Phase 1 is treated as a miss and routed through Recoder. + """VEP linkage covers the full allele set, not just authoritative links: the RT-derived allele's + genomic HGVS resolves and gets a consequence row. Events are allele-keyed, so each allele in the + score set gets its own event — present for the resolved RT allele, absent/no_record for the + authoritative allele VEP queried but found nothing (a genuine empty, distinct from a request + failure). The per-variant bandaid's limitation (dropping the RT-derived allele's status) is lifted.""" + variant, authoritative_allele, rt_allele = setup_rt_derived_allele_for_vep + + # VEP resolves only the RT-derived allele's genomic HGVS; the authoritative allele's coding + # HGVS is queried successfully but yields no consequence this run (a genuine empty, not errored). + with patch(_RESOLVE, return_value=VepResolution({rt_allele.hgvs_g: "missense_variant"}, set())): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - VEP can return an entry with most_severe_consequence=None when it recognises a variant - but cannot classify it. 
This should not silently fall into the UNKNOWN outcome branch — - instead it should be treated identically to an absent entry and sent to Recoder. - """ - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level - genomic_hgvs = "NC_000017.11:g.43094692C>T" + assert result.status == JobStatus.SUCCEEDED - with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - side_effect=[ - {hgvs: None}, # VEP knows the variant but returns no consequence - {genomic_hgvs: "missense_variant"}, # Phase 3 on recoded genomic string - ], - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - return_value={hgvs: [genomic_hgvs]}, - ), - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + # The RT-derived (non-authoritative) allele IS linked — the core fix. + rt_live = _live_consequences_for(session, rt_allele.id) + assert len(rt_live) == 1 + assert rt_live[0].functional_consequence == "missense_variant" + # The authoritative allele's HGVS had no consequence, so it gets no row. + assert len(_live_consequences_for(session, authoritative_allele.id)) == 0 + + # One allele-keyed event per allele (never per-variant): the resolved RT allele is present, the + # unclassifiable authoritative allele is absent/no_record (queried, genuinely empty). 
+ events = session.scalars(select(AnnotationEvent)).all() + assert all(e.annotation_type == "vep_functional_consequence" for e in events) + assert all(e.variant_id is None for e in events) + by_allele = {e.allele_id: e for e in events} + assert by_allele[rt_allele.id].disposition == "present" + assert by_allele[authoritative_allele.id].disposition == "absent" + assert by_allele[authoritative_allele.id].reason == "no_record" + + async def test_skips_allele_already_at_current_release( + self, + session, + with_populated_domain_data, + with_populate_vep_job, + mock_worker_ctx, + sample_populate_vep_run, + setup_sample_alleles_for_vep, + ): + """An allele with a live consequence already at the current Ensembl release is skipped: no VEP + query, the status reports SUCCESS/preexisting, and the existing row is not churned.""" + _, allele = setup_sample_alleles_for_vep + + # Simulate a prior run at the current release. + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="missense_variant", + source_version=_ENSEMBL_RELEASE, + access_date=date.today(), ) + ) + session.commit() - assert result.status == JobStatus.SUCCEEDED - assert result.data["variants_with_consequences"] == 1 + with patch(_RESOLVE) as mock_resolve: + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence == "missense_variant" + mock_resolve.assert_not_called() # version-keyed skip avoided the external query entirely + assert result.data["preexisting_allele_count"] == 1 + assert result.data["created_allele_count"] == 0 - async def test_most_severe_consequence_selected_from_multiple_genomic_hgvs( + # Row not churned: still exactly one, still live. 
+ rows = session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.allele_id == allele.id)).all() + assert len(rows) == 1 + assert rows[0].valid_to is None + + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "preexisting" + + async def test_new_release_same_consequence_bumps_in_place( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """When Recoder returns multiple genomic strings with different consequences, the most severe wins.""" - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level - genomic_less_severe = "NC_000017.11:g.43094692C>T" # → missense_variant - genomic_more_severe = "NC_000017.11:g.43094692C>A" # → stop_gained - - with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - side_effect=[ - {}, # Phase 1: VEP misses - { - genomic_less_severe: "missense_variant", - genomic_more_severe: "stop_gained", - }, # Phase 3: two different consequences - ], - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - return_value={hgvs: [genomic_less_severe, genomic_more_severe]}, - ), - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + """An allele live at an older release is re-queried; an unchanged consequence advances + source_version (and access_date) in place — no supersede, so a new release does not churn + history for a categorical value that did not change.""" + _, allele = setup_sample_alleles_for_vep + + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="missense_variant", + source_version="115", + access_date=date.today() - timedelta(days=90), ) + ) + session.commit() + + with 
patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())) as mock_resolve: + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - assert result.data["variants_with_consequences"] == 1 + mock_resolve.assert_called_once() # older release -> not skipped, re-queried + assert result.data["preexisting_allele_count"] == 1 + assert result.data["created_allele_count"] == 0 - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence == "stop_gained" + # Unchanged value -> no supersede: one row, still live, version + date advanced in place. + rows = session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.allele_id == allele.id)).all() + assert len(rows) == 1 + assert rows[0].valid_to is None + assert rows[0].source_version == _ENSEMBL_RELEASE + assert rows[0].access_date == date.today() - async def test_multiple_variants_sharing_hgvs_all_get_consequence( + async def test_force_requeries_unchanged_without_churn( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """All mapped variants that share an HGVS string each receive the consequence.""" - _, mapped_variant_1 = setup_sample_variants_for_vep - hgvs = mapped_variant_1.hgvs_assay_level - - score_set = session.get(ScoreSet, sample_populate_vep_run.job_params["score_set_id"]) - variant_2 = Variant( - urn="urn:variant:test-variant-for-vep-2", - score_set_id=score_set.id, - hgvs_nt=hgvs, - data={"hgvs_c": hgvs}, + """force bypasses the current-release skip and re-queries, but the linker only supersedes on a + value change: a forced re-run that resolves the same consequence reports preexisting and does + not churn the row (version/access_date advanced in place).""" + _, allele = setup_sample_alleles_for_vep + + # Prior live consequence already at the current release (would be skipped without force). 
+ session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="missense_variant", + source_version=_ENSEMBL_RELEASE, + access_date=date.today() - timedelta(days=1), + ) ) - session.add(variant_2) session.commit() - mapped_variant_2 = MappedVariant( - variant_id=variant_2.id, - current=True, - mapped_date="2024-01-01T00:00:00Z", - mapping_api_version="1.0.0", - post_mapped={"type": "Allele", "expressions": [{"value": hgvs, "syntax": "hgvs.c"}]}, - hgvs_assay_level=hgvs, - ) - session.add(mapped_variant_2) + + sample_populate_vep_run.job_params = {**sample_populate_vep_run.job_params, "force": True} session.commit() - with patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={hgvs: "missense_variant"}, - ): - result = await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())) as mock_resolve: + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) - assert result.data["variants_processed"] == 2 - assert result.data["variants_with_consequences"] == 2 + mock_resolve.assert_called_once() # force bypassed the current-release skip + assert result.data["preexisting_allele_count"] == 1 + assert result.data["created_allele_count"] == 0 - session.refresh(mapped_variant_1) - session.refresh(mapped_variant_2) - assert mapped_variant_1.vep_functional_consequence == "missense_variant" - assert mapped_variant_2.vep_functional_consequence == "missense_variant" + # Unchanged consequence -> no supersede: one row, still live, access_date touched in place. 
+ rows = session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.allele_id == allele.id)).all() + assert len(rows) == 1 + assert rows[0].valid_to is None + assert rows[0].access_date == date.today() - async def test_vep_batch_api_exception_raises( + async def test_new_release_changed_consequence_supersedes( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """An unexpected exception from the VEP API propagates to the job management decorator.""" - with ( - patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - side_effect=RuntimeError("VEP API unreachable"), - ), - pytest.raises(RuntimeError, match="VEP API unreachable"), - ): - await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), + """An allele live at an older release is re-queried; a changed result supersedes it — one live + row carrying the new consequence/release and one retired row preserving the old.""" + _, allele = setup_sample_alleles_for_vep + + # Prior live consequence at an older release -> not skipped, eligible for re-query. + session.add( + VepAlleleConsequence( + allele_id=allele.id, + functional_consequence="synonymous_variant", + source_version="115", + access_date=date.today() - timedelta(days=90), ) + ) + session.commit() - async def test_variant_recoder_api_exception_raises( + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) + + assert result.data["created_allele_count"] == 1 + + live = _live_consequences_for(session, allele.id) + assert len(live) == 1 + assert live[0].functional_consequence == "missense_variant" + assert live[0].source_version == _ENSEMBL_RELEASE + + # Old consequence retired, not deleted. 
+ all_rows = session.scalars( + select(VepAlleleConsequence).where(VepAlleleConsequence.allele_id == allele.id) + ).all() + assert len(all_rows) == 2 + assert len([r for r in all_rows if r.valid_to is not None]) == 1 + + async def test_successful_linking_pipeline( + self, + session, + with_populated_domain_data, + mock_worker_ctx, + sample_populate_vep_run_pipeline, + sample_populate_vep_pipeline, + setup_sample_alleles_for_vep, + ): + """End-to-end successful linking within a pipeline updates both job and pipeline status.""" + _, allele = setup_sample_alleles_for_vep + + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())): + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run_pipeline.id) + + assert result.status == JobStatus.SUCCEEDED + assert len(session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.current)).all()) == 1 + + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "vep_functional_consequence" + assert events[0].allele_id is not None and events[0].variant_id is None + + session.refresh(sample_populate_vep_run_pipeline) + assert sample_populate_vep_run_pipeline.status == JobStatus.SUCCEEDED + session.refresh(sample_populate_vep_pipeline) + assert sample_populate_vep_pipeline.status == PipelineStatus.SUCCEEDED + + async def test_exceptions_handled_by_decorators( self, session, with_populated_domain_data, with_populate_vep_job, mock_worker_ctx, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """An unexpected exception from the Variant Recoder API propagates to the job management decorator.""" + """Exceptions during resolution are handled by the job decorators (Slack alert, ERRORED).""" with ( - patch( - 
"mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={}, - ), - patch( - "mavedb.worker.jobs.external_services.vep.run_variant_recoder", - side_effect=RuntimeError("Recoder API unreachable"), - ), - pytest.raises(RuntimeError, match="Recoder API unreachable"), + patch(_RESOLVE, side_effect=Exception("Test exception")), + patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, ): - await populate_vep_for_score_set( - mock_worker_ctx, - 1, - JobManager(session, mock_worker_ctx["redis"], sample_populate_vep_run.id), - ) + result = await populate_vep_for_score_set(mock_worker_ctx, sample_populate_vep_run.id) + + mock_send_slack_job_error.assert_called_once() + assert isinstance(result, JobExecutionOutcome) + assert result.status == JobStatus.ERRORED + assert isinstance(result.exception, Exception) + + session.refresh(sample_populate_vep_run) + assert sample_populate_vep_run.status == JobStatus.ERRORED @pytest.mark.asyncio @pytest.mark.integration -class TestPopulateVepForScoreSetIntegration: - """Integration tests for populate_vep_for_score_set run through an ARQ worker context.""" +class TestPopulateVepForScoreSetArqContext: + """Tests for the populate_vep_for_score_set job using the ARQ context fixture.""" - async def test_populate_vep_with_arq_context( + async def test_populate_vep_with_arq_context_independent( self, arq_redis, arq_worker, @@ -449,35 +515,29 @@ async def test_populate_vep_with_arq_context( with_populated_domain_data, with_populate_vep_job, sample_populate_vep_run, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """Job completes successfully within an ARQ worker context.""" - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level + """The VEP job links a consequence and records a SUCCESS annotation through the ARQ worker.""" + _, allele = setup_sample_alleles_for_vep - with patch( - 
"mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={hgvs: "missense_variant"}, - ): + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())): await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run.id) await arq_worker.async_run() await arq_worker.run_check() - session.refresh(sample_populate_vep_run) - assert sample_populate_vep_run.status == JobStatus.SUCCEEDED + assert len(session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.current)).all()) == 1 - session.refresh(mapped_variant) - assert mapped_variant.vep_functional_consequence == "missense_variant" + events = session.scalars(select(AnnotationEvent)).all() + assert len(events) == 1 + assert events[0].disposition == "present" + assert events[0].reason == "created" + assert events[0].annotation_type == "vep_functional_consequence" + assert events[0].allele_id is not None and events[0].variant_id is None - annotation = session.scalars( - select(VariantAnnotationStatus).where( - VariantAnnotationStatus.variant_id == mapped_variant.variant_id, - VariantAnnotationStatus.annotation_type == AnnotationType.VEP_FUNCTIONAL_CONSEQUENCE, - ) - ).one() - assert annotation.status == AnnotationStatus.SUCCESS + session.refresh(sample_populate_vep_run) + assert sample_populate_vep_run.status == JobStatus.SUCCEEDED - async def test_populate_vep_in_pipeline_context( + async def test_populate_vep_with_arq_context_pipeline( self, arq_redis, arq_worker, @@ -485,24 +545,73 @@ async def test_populate_vep_in_pipeline_context( with_populated_domain_data, sample_populate_vep_run_pipeline, sample_populate_vep_pipeline, - setup_sample_variants_for_vep, + setup_sample_alleles_for_vep, ): - """Job completes and advances the pipeline when run in a pipeline context.""" - from mavedb.models.enums.job_pipeline import PipelineStatus - - _, mapped_variant = setup_sample_variants_for_vep - hgvs = mapped_variant.hgvs_assay_level + 
"""The VEP job completes and advances the pipeline through the ARQ worker.""" + _, allele = setup_sample_alleles_for_vep - with patch( - "mavedb.worker.jobs.external_services.vep.get_functional_consequence", - return_value={hgvs: "synonymous_variant"}, - ): + with patch(_RESOLVE, return_value=VepResolution({allele.hgvs_c: "missense_variant"}, set())): await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run_pipeline.id) await arq_worker.async_run() await arq_worker.run_check() + assert len(session.scalars(select(VepAlleleConsequence).where(VepAlleleConsequence.current)).all()) == 1 + session.refresh(sample_populate_vep_run_pipeline) assert sample_populate_vep_run_pipeline.status == JobStatus.SUCCEEDED - session.refresh(sample_populate_vep_pipeline) assert sample_populate_vep_pipeline.status == PipelineStatus.SUCCEEDED + + async def test_populate_vep_with_arq_context_exception_handling_independent( + self, + arq_redis, + arq_worker, + session, + with_populated_domain_data, + with_populate_vep_job, + sample_populate_vep_run, + setup_sample_alleles_for_vep, + ): + """Exceptions in the VEP job are handled with the ARQ context fixture.""" + with ( + patch(_RESOLVE, side_effect=Exception("Test exception")), + patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, + ): + await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run.id) + await arq_worker.async_run() + await arq_worker.run_check() + + mock_send_slack_job_error.assert_called_once() + assert len(session.scalars(select(VepAlleleConsequence)).all()) == 0 + assert len(session.scalars(select(AnnotationEvent)).all()) == 0 + + session.refresh(sample_populate_vep_run) + assert sample_populate_vep_run.status == JobStatus.ERRORED + + async def test_populate_vep_with_arq_context_exception_handling_pipeline( + self, + arq_redis, + arq_worker, + session, + with_populated_domain_data, + sample_populate_vep_pipeline, + 
sample_populate_vep_run_pipeline, + setup_sample_alleles_for_vep, + ): + """Exceptions in the VEP job fail the pipeline with the ARQ context fixture.""" + with ( + patch(_RESOLVE, side_effect=Exception("Test exception")), + patch("mavedb.worker.lib.decorators.job_management.send_slack_job_error") as mock_send_slack_job_error, + ): + await arq_redis.enqueue_job("populate_vep_for_score_set", sample_populate_vep_run_pipeline.id) + await arq_worker.async_run() + await arq_worker.run_check() + + mock_send_slack_job_error.assert_called_once() + assert len(session.scalars(select(VepAlleleConsequence)).all()) == 0 + assert len(session.scalars(select(AnnotationEvent)).all()) == 0 + + session.refresh(sample_populate_vep_run_pipeline) + assert sample_populate_vep_run_pipeline.status == JobStatus.ERRORED + session.refresh(sample_populate_vep_pipeline) + assert sample_populate_vep_pipeline.status == PipelineStatus.FAILED diff --git a/tests/worker/jobs/test_allele_translations.py b/tests/worker/jobs/test_allele_translations.py new file mode 100644 index 00000000..0e1c1b12 --- /dev/null +++ b/tests/worker/jobs/test_allele_translations.py @@ -0,0 +1,41 @@ +# ruff: noqa: E402 + +import pytest + +pytest.importorskip("arq") + +from datetime import datetime, timezone + +from mavedb.lib.alleles import get_allele_translations + + +@pytest.mark.unit +class TestGetAlleleTranslations: + """The cross-layer equivalence query traverses the MappingRecordAllele link graph. + + Uses the RT-derived fixture, which links an authoritative allele and a derived (cross-layer) + allele to one MappingRecord — exactly the co-membership the query resolves. 
+ """ + + def test_returns_full_equivalence_set_from_any_member(self, session, setup_rt_derived_allele_with_caid): + _variant, authoritative_allele, rt_allele = setup_rt_derived_allele_with_caid + + from_authoritative = {a.id for a in get_allele_translations(session, authoritative_allele.id)} + from_rt = {a.id for a in get_allele_translations(session, rt_allele.id)} + + # Reachable from either member, and includes the anchor itself — the full equivalence set. + assert from_authoritative == {authoritative_allele.id, rt_allele.id} + assert from_rt == {authoritative_allele.id, rt_allele.id} + + def test_as_of_before_links_existed_is_empty(self, session, setup_rt_derived_allele_with_caid): + _variant, authoritative_allele, _rt = setup_rt_derived_allele_with_caid + + past = datetime(2000, 1, 1, tzinfo=timezone.utc) + assert get_allele_translations(session, authoritative_allele.id, as_of=past) == [] + + def test_as_of_after_links_existed_returns_set(self, session, setup_rt_derived_allele_with_caid): + _variant, authoritative_allele, rt_allele = setup_rt_derived_allele_with_caid + + future = datetime(2999, 1, 1, tzinfo=timezone.utc) + ids = {a.id for a in get_allele_translations(session, authoritative_allele.id, as_of=future)} + assert ids == {authoritative_allele.id, rt_allele.id} diff --git a/tests/worker/jobs/variant_processing/test_mapping.py b/tests/worker/jobs/variant_processing/test_mapping.py index 3a0c6db2..96321d81 100644 --- a/tests/worker/jobs/variant_processing/test_mapping.py +++ b/tests/worker/jobs/variant_processing/test_mapping.py @@ -20,7 +20,7 @@ from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.target_gene_mapping import TargetGeneMapping from mavedb.models.variant import Variant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.annotation_event import AnnotationEvent from mavedb.worker.jobs.variant_processing.mapping import map_variants_for_score_set from 
mavedb.worker.lib.managers.job_manager import JobManager from tests.helpers.constants import TEST_CODING_LAYER, TEST_GENOMIC_LAYER, TEST_PROTEIN_LAYER @@ -90,8 +90,8 @@ async def test_map_variants_for_score_set_no_mapping_results( # Verify no annotations were created annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -137,8 +137,8 @@ async def test_map_variants_for_score_set_no_mapped_scores( # Verify no annotations were created annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -184,8 +184,8 @@ async def test_map_variants_for_score_set_no_reference_data( # Verify no annotations were created annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -239,8 +239,8 @@ async def test_map_variants_for_score_set_nonexistent_target_gene( # Verify no annotations were created annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -286,8 +286,8 @@ async def test_map_variants_for_score_set_returns_variants_not_in_score_set( # Verify no annotations were created annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, 
VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -354,14 +354,14 @@ async def dummy_mapping_job(): # Verify that annotation statuses were created and correct annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) assert len(annotation_statuses) == 1 assert annotation_statuses[0].annotation_type == "vrs_mapping" - assert annotation_statuses[0].status == "success" + assert annotation_statuses[0].disposition == "present" @pytest.mark.parametrize( "with_layers", @@ -471,14 +471,14 @@ async def dummy_mapping_job(): # Verify that annotation statuses were created and correct annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) assert len(annotation_statuses) == 1 assert annotation_statuses[0].annotation_type == "vrs_mapping" - assert annotation_statuses[0].status == "success" + assert annotation_statuses[0].disposition == "present" async def test_persists_cdna_target_gene_mapping_with_reference_accession_and_null_qc( self, @@ -584,14 +584,14 @@ async def dummy_mapping_job(): # Verify that annotation statuses were created and correct annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) assert len(annotation_statuses) == 1 assert 
annotation_statuses[0].annotation_type == "vrs_mapping" - assert annotation_statuses[0].status == "failed" + assert annotation_statuses[0].disposition == "failed" async def test_map_variants_for_score_set_incomplete_mapping( self, @@ -668,17 +668,17 @@ async def dummy_mapping_job(): # Verify that annotation statuses were created and correct annotation_status_success = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, VariantAnnotationStatus.status == "success") + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, AnnotationEvent.disposition == "present") .all() ) assert len(annotation_status_success) == 1 assert annotation_status_success[0].annotation_type == "vrs_mapping" annotation_status_failed = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) - .filter(Variant.score_set_id == sample_score_set.id, VariantAnnotationStatus.status == "failed") + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) + .filter(Variant.score_set_id == sample_score_set.id, AnnotationEvent.disposition == "failed") .all() ) assert len(annotation_status_failed) == 1 @@ -694,7 +694,8 @@ async def test_map_variants_for_score_set_benign_outcomes_are_not_failures( ): """A score set whose only unmapped variants are benign absences (intronic / no protein consequence) is ``complete``, not ``incomplete``/``failed``: benign - outcomes carry no allele but are skips, not failures.""" + outcomes carry no allele, so they are recorded as ``absent`` (an informative + biological negative), never ``failed``.""" async def dummy_mapping_job(): mapping_output = await construct_mock_mapping_output( @@ -761,17 +762,17 @@ async def dummy_mapping_job(): assert len(mapping_records) == 2 assert 
all(authoritative_allele_for(session, r) is None for r in mapping_records) - # Benign outcomes are SKIPPED (not FAILED), and the finer outcome is preserved in metadata. - annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + # Benign outcomes carry no allele → recorded as `absent` (not `failed`), with the finer + # outcome preserved as the event `reason`. + events = ( + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) - assert len(annotation_statuses) == 2 - assert all(s.status == "skipped" for s in annotation_statuses) - assert all(s.failure_category is None for s in annotation_statuses) - recorded_outcomes = {s.annotation_metadata["outcome"] for s in annotation_statuses} + assert len(events) == 2 + assert all(e.disposition == "absent" for e in events) + recorded_outcomes = {e.reason for e in events} assert recorded_outcomes == {"intronic", "no_protein_consequence"} async def test_map_variants_for_score_set_complete_mapping( @@ -850,15 +851,15 @@ async def dummy_mapping_job(): # Verify that annotation statuses were created and correct annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) assert len(annotation_statuses) == 2 for status in annotation_statuses: assert status.annotation_type == "vrs_mapping" - assert status.status == "success" + assert status.disposition == "present" async def test_map_variants_for_score_set_updates_existing_mapped_variants( self, @@ -909,10 +910,10 @@ async def dummy_mapping_job(): ) session.add(prior_link) session.commit() - variant_annotation_status = VariantAnnotationStatus( - variant_id=variant.id, current=True, 
annotation_type="vrs_mapping", status="success" + prior_event = AnnotationEvent( + variant_id=variant.id, annotation_type="vrs_mapping", disposition="present", reason="mapped" ) - session.add(variant_annotation_status) + session.add(prior_event) session.commit() with ( @@ -965,24 +966,14 @@ async def dummy_mapping_job(): assert new_mapping_record.mapped_date != date(2023, 1, 1) assert new_mapping_record.mapping_api_version != "v1.0.0" - # Verify the non-current annotation status still exists - old_annotation_status = ( - session.query(VariantAnnotationStatus) - .filter( - VariantAnnotationStatus.variant_id == non_current_mapping_record.variant_id, - VariantAnnotationStatus.current.is_(False), - ) - .one_or_none() - ) - assert old_annotation_status is not None - - # Verify that a new annotation status was created - new_annotation_status = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == variant.id, VariantAnnotationStatus.current.is_(True)) - .one_or_none() + # Append-only: the prior event is retained and the re-map appends a new one, so the variant + # now has two vrs_mapping events (there is no current flag to flip). 
+ events = ( + session.query(AnnotationEvent) + .filter(AnnotationEvent.variant_id == variant.id, AnnotationEvent.annotation_type == "vrs_mapping") + .all() ) - assert new_annotation_status is not None + assert len(events) == 2 @pytest.mark.integration @@ -1065,8 +1056,8 @@ async def dummy_mapping_job(): # Verify that each variant has an annotation status annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -1156,8 +1147,8 @@ async def dummy_mapping_job(): # Verify that each variant has an annotation status annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -1237,7 +1228,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. @@ -1314,7 +1305,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. 
@@ -1390,7 +1381,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. @@ -1434,10 +1425,10 @@ async def test_map_variants_for_score_set_updates_current_mapped_variants( mapped_date=date(2023, 1, 1), mapping_api_version="v1.0.0", ) - annotation_status = VariantAnnotationStatus( - variant_id=variant.id, current=True, annotation_type="vrs_mapping", status="success" + prior_event = AnnotationEvent( + variant_id=variant.id, annotation_type="vrs_mapping", disposition="present", reason="mapped" ) - session.add(annotation_status) + session.add(prior_event) session.add(mapping_record) session.commit() @@ -1496,23 +1487,16 @@ async def dummy_mapping_job(): assert new_mapping_record.mapped_date != date(2023, 1, 1) assert new_mapping_record.mapping_api_version != "v1.0.0" - # Verify that annotation statuses where marked as non-current and new entries created - annotation_statuses = session.query(VariantAnnotationStatus).all() - assert len(annotation_statuses) == len(variants) * 2 # Each variant has two annotation statuses now + # Append-only: each variant keeps its prior event and gains a new one from this run. 
+ annotation_events = session.query(AnnotationEvent).all() + assert len(annotation_events) == len(variants) * 2 for variant in variants: - old_annotation_status = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == variant.id, VariantAnnotationStatus.current.is_(False)) - .one_or_none() - ) - assert old_annotation_status is not None - - new_annotation_status = ( - session.query(VariantAnnotationStatus) - .filter(VariantAnnotationStatus.variant_id == variant.id, VariantAnnotationStatus.current.is_(True)) - .one_or_none() + variant_events = ( + session.query(AnnotationEvent) + .filter(AnnotationEvent.variant_id == variant.id, AnnotationEvent.annotation_type == "vrs_mapping") + .all() ) - assert new_annotation_status is not None + assert len(variant_events) == 2 # Verify that the job status was updated. processing_run = ( @@ -1573,7 +1557,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. @@ -1632,7 +1616,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. 
@@ -1716,8 +1700,8 @@ async def dummy_mapping_job(): # Verify that each variant has an annotation status annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -1804,8 +1788,8 @@ async def dummy_mapping_job(): # Verify that each variant has an annotation status annotation_statuses = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter(Variant.score_set_id == sample_score_set.id) .all() ) @@ -1874,7 +1858,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. @@ -1928,7 +1912,7 @@ async def dummy_mapping_job(): assert len(mapping_records) == 0 # Verify that no annotation statuses were created - annotation_statuses = session.query(VariantAnnotationStatus).all() + annotation_statuses = session.query(AnnotationEvent).all() assert len(annotation_statuses) == 0 # Verify that the job status was updated. 
diff --git a/tests/worker/jobs/variant_processing/test_reverse_translation.py b/tests/worker/jobs/variant_processing/test_reverse_translation.py index 970fa854..1a77e8e8 100644 --- a/tests/worker/jobs/variant_processing/test_reverse_translation.py +++ b/tests/worker/jobs/variant_processing/test_reverse_translation.py @@ -21,7 +21,7 @@ from mavedb.models.mapping_record_allele import MappingRecordAllele from mavedb.models.target_gene_mapping import TargetGeneMapping from mavedb.models.variant import Variant -from mavedb.models.variant_annotation_status import VariantAnnotationStatus +from mavedb.models.annotation_event import AnnotationEvent from mavedb.worker.jobs.variant_processing.mapping import map_variants_for_score_set from mavedb.worker.jobs.variant_processing.reverse_translation import ( _build_translation_config, @@ -143,17 +143,19 @@ async def _reverse_translate(session, mock_worker_ctx, rt_run): ) -def _cross_level_statuses(session, score_set_id, status=None): +def _cross_level_events(session, score_set_id, disposition=None, reason=None): query = ( - session.query(VariantAnnotationStatus) - .join(Variant, VariantAnnotationStatus.variant_id == Variant.id) + session.query(AnnotationEvent) + .join(Variant, AnnotationEvent.variant_id == Variant.id) .filter( Variant.score_set_id == score_set_id, - VariantAnnotationStatus.annotation_type == "cross_level_translation", + AnnotationEvent.annotation_type == "cross_level_translation", ) ) - if status is not None: - query = query.filter(VariantAnnotationStatus.status == status) + if disposition is not None: + query = query.filter(AnnotationEvent.disposition == disposition) + if reason is not None: + query = query.filter(AnnotationEvent.reason == reason) return query.all() @@ -226,7 +228,7 @@ async def test_no_mapping_records_is_a_noop( # No candidate alleles, links, or annotations were produced. 
assert session.query(Allele).count() == 0 assert _non_authoritative_links(session) == [] - assert _cross_level_statuses(session, sample_score_set.id) == [] + assert _cross_level_events(session, sample_score_set.id) == [] async def test_creates_genomic_and_coding_candidate_alleles( self, @@ -287,9 +289,9 @@ async def test_creates_genomic_and_coding_candidate_alleles( mapping_record = session.query(MappingRecord).filter(MappingRecord.variant_id == variant.id).one() assert {link.mapping_record_id for link in non_auth_links} == {mapping_record.id} - statuses = _cross_level_statuses(session, sample_score_set.id) - assert len(statuses) == 1 - assert statuses[0].status == "success" + events = _cross_level_events(session, sample_score_set.id) + assert len(events) == 1 + assert events[0].disposition == "present" async def test_transcript_is_queryable_via_derived_expression( self, @@ -604,10 +606,11 @@ async def test_independent_rerun_retires_prior_links_and_keeps_current_stable( # The two candidate alleles are reused across runs, not duplicated. assert session.query(Allele).filter(Allele.vrs_digest.in_(["ga4gh:VA.genomic", "ga4gh:VA.coding"])).count() == 2 - # The cross-level translation status is versioned the same way: prior retired, one current. - statuses = _cross_level_statuses(session, sample_score_set.id) - assert len(statuses) == 2 - assert len([s for s in statuses if s.current]) == 1 + # The cross-level translation log is append-only: the rerun adds a second event, and + # "current" is the latest by id (there is no current flag to desync). 
+ events = _cross_level_events(session, sample_score_set.id) + assert len(events) == 2 + assert max(events, key=lambda e: e.id).disposition == "present" async def test_all_failures_fail_the_job( self, @@ -644,11 +647,11 @@ async def test_all_failures_fail_the_job( assert result.data == {"translated": 0, "failed": 1, "skipped": 0, "alleles_created": 0} assert _non_authoritative_links(session) == [] - failed_statuses = _cross_level_statuses(session, sample_score_set.id, status="failed") - assert len(failed_statuses) == 1 - assert failed_statuses[0].error_message == "forward translation failed" + failed_events = _cross_level_events(session, sample_score_set.id, disposition="failed") + assert len(failed_events) == 1 + assert failed_events[0].event_metadata["error_message"] == "forward translation failed" # Library/input-level failures still carry the assay-level HGVS as metadata. - assert failed_statuses[0].annotation_metadata == {"hgvs_input": assay_hgvs} + assert failed_events[0].event_metadata["hgvs_input"] == assay_hgvs async def test_partial_success_succeeds_with_mixed_annotations( self, @@ -696,8 +699,8 @@ async def test_partial_success_succeeds_with_mixed_annotations( assert result.status == JobStatus.SUCCEEDED assert result.data == {"translated": 1, "failed": 1, "skipped": 0, "alleles_created": 1} - assert len(_cross_level_statuses(session, sample_score_set.id, status="success")) == 1 - assert len(_cross_level_statuses(session, sample_score_set.id, status="failed")) == 1 + assert len(_cross_level_events(session, sample_score_set.id, disposition="present")) == 1 + assert len(_cross_level_events(session, sample_score_set.id, disposition="failed")) == 1 assert len(_non_authoritative_links(session)) == 1 async def test_partial_candidate_translation_failure_keeps_success_with_metadata( @@ -742,9 +745,9 @@ async def test_partial_candidate_translation_failure_keeps_success_with_metadata assert result.data == {"translated": 1, "failed": 0, "skipped": 0, 
"alleles_created": 1} assert len(_non_authoritative_links(session)) == 1 - statuses = _cross_level_statuses(session, sample_score_set.id, status="success") - assert len(statuses) == 1 - failed_candidates = statuses[0].annotation_metadata["failed_candidates"] + events = _cross_level_events(session, sample_score_set.id, disposition="present") + assert len(events) == 1 + failed_candidates = events[0].event_metadata["failed_candidates"] assert failed_candidates == [{"hgvs": bad_candidate, "level": "genomic", "error": "untranslatable form"}] async def test_all_candidates_failing_translation_marks_variant_failed( @@ -785,16 +788,16 @@ async def test_all_candidates_failing_translation_marks_variant_failed( assert result.data == {"translated": 0, "failed": 1, "skipped": 0, "alleles_created": 0} assert _non_authoritative_links(session) == [] - failed_statuses = _cross_level_statuses(session, sample_score_set.id, status="failed") - assert len(failed_statuses) == 1 - assert failed_statuses[0].error_message == "All candidate HGVS failed VRS translation." - metadata = failed_statuses[0].annotation_metadata + failed_events = _cross_level_events(session, sample_score_set.id, disposition="failed") + assert len(failed_events) == 1 + assert failed_events[0].event_metadata["error_message"] == "All candidate HGVS failed VRS translation." 
+ metadata = failed_events[0].event_metadata assert metadata["hgvs_input"] == assay_hgvs assert metadata["failed_candidates"] == [ {"hgvs": bad_candidate, "level": "genomic", "error": "untranslatable form"} ] - async def test_no_coding_transcript_is_skipped_not_failed( + async def test_unresolved_transcript_is_tallied_as_skip_but_recorded_as_failed( self, session, with_independent_processing_runs, @@ -804,8 +807,11 @@ async def test_no_coding_transcript_is_skipped_not_failed( sample_independent_reverse_translation_run, sample_score_set, ): - """A target gene aligned only at the genomic level has no coding transcript, so its - variants are SKIPPED (no protein consequence) rather than attempted and failed.""" + """A protein-coding target aligned only at the genomic level has no resolvable coding + transcript (transcript_unresolved). The job tallies it in its `skipped` counter, but the + recoverable gap is recorded as a `failed` event — distinct from a hard translation + failure (reason `translation_failed`) and from a true biological absence + (no_coding_transcript → `absent`).""" variant = Variant( score_set_id=sample_score_set.id, urn="variant:1", @@ -831,8 +837,12 @@ async def test_no_coding_transcript_is_skipped_not_failed( - # No translation attempted: no candidate alleles or links, and the variant is - # recorded as SKIPPED rather than FAILED. + # No translation is attempted for this variant, so no candidate alleles or + # non-authoritative links are created. assert _non_authoritative_links(session) == [] - assert _cross_level_statuses(session, sample_score_set.id, status="failed") == [] - assert len(_cross_level_statuses(session, sample_score_set.id, status="skipped")) == 1 + # The job counts this as a skip, but a protein-coding target with no resolvable coding + # transcript is a *recoverable* gap (transcript_unresolved) → recorded as a `failed` event, + # distinct from a hard translation failure (which carries reason `translation_failed`). 
+ unresolved = _cross_level_events(session, sample_score_set.id, reason="transcript_unresolved") + assert len(unresolved) == 1 + assert unresolved[0].disposition == "failed" async def test_genomic_accession_coding_target_is_reverse_translated( self, @@ -875,7 +885,8 @@ async def test_genomic_accession_coding_target_is_reverse_translated( assert result.status == JobStatus.SUCCEEDED assert result.data == {"translated": 1, "failed": 0, "skipped": 0, "alleles_created": 1} - assert _cross_level_statuses(session, sample_score_set.id, status="skipped") == [] + events = _cross_level_events(session, sample_score_set.id) + assert events and all(e.disposition == "present" for e in events) def _run_mapped_date(self, session, target_gene_id): """The mapped_date the (mock) mapping run stamped on its TargetGeneMappings.""" @@ -999,9 +1010,9 @@ async def test_ignores_stale_cdna_row_from_a_different_run( result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} - skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + skipped = _cross_level_events(session, sample_score_set.id, reason="transcript_unresolved") assert len(skipped) == 1 - assert skipped[0].annotation_metadata["skip_category"] == "transcript_unresolved" + assert skipped[0].disposition == "failed" async def test_coding_target_skip_is_classified_recoverable( self, @@ -1037,9 +1048,9 @@ async def test_coding_target_skip_is_classified_recoverable( result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} - skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + skipped = _cross_level_events(session, sample_score_set.id, reason="transcript_unresolved") assert len(skipped) == 1 - assert 
skipped[0].annotation_metadata["skip_category"] == "transcript_unresolved" + assert skipped[0].disposition == "failed" async def test_regulatory_target_skip_is_classified_correct( self, @@ -1077,9 +1088,9 @@ async def test_regulatory_target_skip_is_classified_correct( result = await _reverse_translate(session, mock_worker_ctx, sample_independent_reverse_translation_run) assert result.data == {"translated": 0, "failed": 0, "skipped": 1, "alleles_created": 0} - skipped = _cross_level_statuses(session, sample_score_set.id, status="skipped") + skipped = _cross_level_events(session, sample_score_set.id, reason="no_coding_transcript") assert len(skipped) == 1 - assert skipped[0].annotation_metadata["skip_category"] == "no_coding_transcript" + assert skipped[0].disposition == "absent" async def test_translation_config_param_overrides_job_defaults( self,