diff --git a/.gitignore b/.gitignore index 4bf6245e0..73d3f2c7e 100644 --- a/.gitignore +++ b/.gitignore @@ -40,7 +40,7 @@ transfers/data/nma_csv_cache/* transfers/metrics/* transfers/logs/* run_bdd-local.sh - +.pre-commit-config.local.yaml # deployment files app.yaml \ No newline at end of file diff --git a/admin/config.py b/admin/config.py index e88dfdc37..1c43f15e6 100644 --- a/admin/config.py +++ b/admin/config.py @@ -36,13 +36,14 @@ GroupAdmin, NotesAdmin, SampleAdmin, + ChemistrySampleInfoAdmin, GeologicFormationAdmin, DataProvenanceAdmin, FieldEventAdmin, FieldActivityAdmin, - FieldEventParticipantAdmin, ParameterAdmin, ) + from db.engine import engine from db.location import Location from db.thing import Thing @@ -60,10 +61,10 @@ from db.group import Group from db.notes import Notes from db.sample import Sample +from db.nma_legacy import ChemistrySampleInfo from db.geologic_formation import GeologicFormation from db.data_provenance import DataProvenance -from db.field import FieldEvent, FieldActivity, FieldEventParticipant -from db.permission_history import PermissionHistory +from db.field import FieldEvent, FieldActivity from db.parameter import Parameter @@ -126,6 +127,7 @@ def create_admin(app): # Samples admin.add_view(SampleAdmin(Sample)) + admin.add_view(ChemistrySampleInfoAdmin(ChemistrySampleInfo)) # Field admin.add_view(FieldEventAdmin(FieldEvent)) diff --git a/admin/views/__init__.py b/admin/views/__init__.py index 74c2c141b..6b1de4c09 100644 --- a/admin/views/__init__.py +++ b/admin/views/__init__.py @@ -31,6 +31,7 @@ from admin.views.group import GroupAdmin from admin.views.notes import NotesAdmin from admin.views.sample import SampleAdmin +from admin.views.chemistry_sampleinfo import ChemistrySampleInfoAdmin from admin.views.geologic_formation import GeologicFormationAdmin from admin.views.data_provenance import DataProvenanceAdmin from admin.views.field import ( @@ -55,6 +56,7 @@ "GroupAdmin", "NotesAdmin", "SampleAdmin", + 
"ChemistrySampleInfoAdmin", "GeologicFormationAdmin", "DataProvenanceAdmin", "FieldEventAdmin", diff --git a/admin/views/chemistry_sampleinfo.py b/admin/views/chemistry_sampleinfo.py new file mode 100644 index 000000000..adeff45a7 --- /dev/null +++ b/admin/views/chemistry_sampleinfo.py @@ -0,0 +1,140 @@ +# =============================================================================== +# Copyright 2026 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +""" +ChemistrySampleInfoAdmin view for legacy Chemistry_SampleInfo. +""" +from admin.views.base import OcotilloModelView + + +class ChemistrySampleInfoAdmin(OcotilloModelView): + """ + Admin view for ChemistrySampleInfo model. 
+ """ + + # ========== Basic Configuration ========== + + name = "Chemistry Sample Info" + label = "Chemistry Sample Info" + icon = "fa fa-flask" + + # ========== List View ========== + + column_list = [ + "object_id", + "sample_point_id", + "sample_pt_id", + "wclab_id", + "collection_date", + "sample_type", + "data_source", + "data_quality", + "public_release", + ] + + column_sortable_list = [ + "object_id", + "sample_point_id", + "sample_pt_id", + "wclab_id", + "collection_date", + "sample_type", + "data_source", + "data_quality", + "public_release", + ] + + column_default_sort = ("collection_date", True) + + search_fields = [ + "sample_point_id", + "sample_pt_id", + "wclab_id", + "collected_by", + "analyses_agency", + "sample_notes", + ] + + column_filters = [ + "collection_date", + "sample_type", + "sample_material_not_h2o", + "water_type", + "study_sample", + "data_source", + "data_quality", + "public_release", + ] + + can_export = True + export_types = ["csv", "excel"] + + page_size = 50 + page_size_options = [25, 50, 100, 200] + + # ========== Form View ========== + + fields = [ + "object_id", + "sample_point_id", + "sample_pt_id", + "wclab_id", + "collection_date", + "collection_method", + "collected_by", + "analyses_agency", + "sample_type", + "sample_material_not_h2o", + "water_type", + "study_sample", + "data_source", + "data_quality", + "public_release", + "added_day_to_date", + "added_month_day_to_date", + "sample_notes", + ] + + exclude_fields_from_create = [ + "object_id", + ] + + exclude_fields_from_edit = [ + "object_id", + ] + + labels = { + "object_id": "OBJECTID", + "sample_point_id": "SamplePointID", + "sample_pt_id": "SamplePtID", + "wclab_id": "WCLab ID", + "collection_date": "Collection Date", + "collection_method": "Collection Method", + "collected_by": "Collected By", + "analyses_agency": "Analyses Agency", + "sample_type": "Sample Type", + "sample_material_not_h2o": "Sample Material Not H2O", + "water_type": "Water Type", + 
"study_sample": "Study Sample", + "data_source": "Data Source", + "data_quality": "Data Quality", + "public_release": "Public Release", + "added_day_to_date": "Added Day to Date", + "added_month_day_to_date": "Added Month Day to Date", + "sample_notes": "Sample Notes", + } + + +# ============= EOF ============================================= diff --git a/alembic/versions/b7d4c6a1b2c3_create_chemistry_sampleinfo.py b/alembic/versions/b7d4c6a1b2c3_create_chemistry_sampleinfo.py new file mode 100644 index 000000000..07aa1f758 --- /dev/null +++ b/alembic/versions/b7d4c6a1b2c3_create_chemistry_sampleinfo.py @@ -0,0 +1,54 @@ +"""Create legacy Chemistry_SampleInfo table. + +Revision ID: b7d4c6a1b2c3 +Revises: 5f4e2b0a6b8b +Create Date: 2026-02-10 02:00:00.000000 +""" + +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa +from sqlalchemy import inspect + +# revision identifiers, used by Alembic. +revision: str = "b7d4c6a1b2c3" +down_revision: Union[str, Sequence[str], None] = "5f4e2b0a6b8b" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Create the legacy chemistry sample info table used for backfill.""" + bind = op.get_bind() + inspector = inspect(bind) + if not inspector.has_table("NMA_Chemistry_SampleInfo"): + op.create_table( + "NMA_Chemistry_SampleInfo", + sa.Column("OBJECTID", sa.Integer(), primary_key=True), + sa.Column("SamplePointID", sa.String(length=50), nullable=True), + sa.Column("SamplePtID", sa.String(length=50), nullable=True), + sa.Column("WCLab_ID", sa.String(length=50), nullable=True), + sa.Column("CollectionDate", sa.Date(), nullable=True), + sa.Column("CollectionMethod", sa.String(length=100), nullable=True), + sa.Column("CollectedBy", sa.String(length=100), nullable=True), + sa.Column("AnalysesAgency", sa.String(length=100), nullable=True), + sa.Column("SampleType", sa.String(length=100), nullable=True), + 
sa.Column("SampleMaterialNotH2O", sa.Boolean(), nullable=True), + sa.Column("WaterType", sa.String(length=100), nullable=True), + sa.Column("StudySample", sa.Boolean(), nullable=True), + sa.Column("DataSource", sa.String(length=100), nullable=True), + sa.Column("DataQuality", sa.String(length=100), nullable=True), + sa.Column("PublicRelease", sa.Boolean(), nullable=True), + sa.Column("AddedDaytoDate", sa.String(length=10), nullable=True), + sa.Column("AddedMonthDaytoDate", sa.String(length=10), nullable=True), + sa.Column("SampleNotes", sa.Text(), nullable=True), + ) + + +def downgrade() -> None: + """Drop the legacy chemistry sample info table.""" + bind = op.get_bind() + inspector = inspect(bind) + if inspector.has_table("NMA_Chemistry_SampleInfo"): + op.drop_table("NMA_Chemistry_SampleInfo") diff --git a/db/nma_legacy.py b/db/nma_legacy.py index 8033dcc47..9d300d7e5 100644 --- a/db/nma_legacy.py +++ b/db/nma_legacy.py @@ -26,6 +26,7 @@ Float, Integer, String, + Text, ) from sqlalchemy.orm import Mapped, mapped_column @@ -148,4 +149,45 @@ class ViewNGWMNLithology(Base): ) +class ChemistrySampleInfo(Base): + """ + Legacy Chemistry SampleInfo table from AMPAPI. 
+ """ + + __tablename__ = "NMA_Chemistry_SampleInfo" + + object_id: Mapped[int] = mapped_column("OBJECTID", Integer, primary_key=True) + sample_point_id: Mapped[Optional[str]] = mapped_column("SamplePointID", String(50)) + sample_pt_id: Mapped[Optional[str]] = mapped_column("SamplePtID", String(50)) + wclab_id: Mapped[Optional[str]] = mapped_column("WCLab_ID", String(50)) + + collection_date: Mapped[Optional[date]] = mapped_column("CollectionDate", Date) + collection_method: Mapped[Optional[str]] = mapped_column( + "CollectionMethod", String(100) + ) + collected_by: Mapped[Optional[str]] = mapped_column("CollectedBy", String(100)) + analyses_agency: Mapped[Optional[str]] = mapped_column( + "AnalysesAgency", String(100) + ) + + sample_type: Mapped[Optional[str]] = mapped_column("SampleType", String(100)) + sample_material_not_h2o: Mapped[Optional[bool]] = mapped_column( + "SampleMaterialNotH2O", Boolean + ) + water_type: Mapped[Optional[str]] = mapped_column("WaterType", String(100)) + study_sample: Mapped[Optional[bool]] = mapped_column("StudySample", Boolean) + + data_source: Mapped[Optional[str]] = mapped_column("DataSource", String(100)) + data_quality: Mapped[Optional[str]] = mapped_column("DataQuality", String(100)) + public_release: Mapped[Optional[bool]] = mapped_column("PublicRelease", Boolean) + + added_day_to_date: Mapped[Optional[str]] = mapped_column( + "AddedDaytoDate", String(10) + ) + added_month_day_to_date: Mapped[Optional[str]] = mapped_column( + "AddedMonthDaytoDate", String(10) + ) + sample_notes: Mapped[Optional[str]] = mapped_column("SampleNotes", Text) + + # ============= EOF ============================================= diff --git a/run_backfill.sh b/run_backfill.sh index 42e6e7620..b81149165 100755 --- a/run_backfill.sh +++ b/run_backfill.sh @@ -16,5 +16,7 @@ set -a source "$ENV_FILE" set +a +uv run alembic upgrade head + # Forward any args (e.g., --batch-size 500) python -m transfers.backfill.staging "$@" diff --git 
a/transfers/backfill/chemistry_sampleinfo.py b/transfers/backfill/chemistry_sampleinfo.py new file mode 100644 index 000000000..711548052 --- /dev/null +++ b/transfers/backfill/chemistry_sampleinfo.py @@ -0,0 +1,162 @@ +# =============================================================================== +# Copyright 2026 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== + +from __future__ import annotations + +from typing import Any, Optional + +import pandas as pd +from sqlalchemy.dialects.postgresql import insert +from sqlalchemy.orm import Session + +from db import ChemistrySampleInfo +from transfers.logger import logger +from transfers.transferer import Transferer +from transfers.util import read_csv + + +class ChemistrySampleInfoBackfill(Transferer): + """ + Backfill for the legacy Chemistry_SampleInfo table. + + Loads the CSV and upserts into the legacy table for backfill workflows. 
+ """ + + source_table = "Chemistry_SampleInfo" + + def __init__(self, *args, batch_size: int = 1000, **kwargs): + super().__init__(*args, **kwargs) + self.batch_size = batch_size + + def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]: + input_df = read_csv(self.source_table, parse_dates=["CollectionDate"]) + return input_df, input_df + + def _transfer_hook(self, session: Session) -> None: + rows = self._dedupe_rows( + [self._row_dict(row) for row in self.cleaned_df.to_dict("records")], + key="OBJECTID", + ) + + insert_stmt = insert(ChemistrySampleInfo) + excluded = insert_stmt.excluded + + for i in range(0, len(rows), self.batch_size): + chunk = rows[i : i + self.batch_size] + logger.info( + f"Upserting batch {i}-{i+len(chunk)-1} ({len(chunk)} rows) into Chemistry_SampleInfo" + ) + stmt = insert_stmt.values(chunk).on_conflict_do_update( + index_elements=["OBJECTID"], + set_={ + "SamplePointID": excluded.SamplePointID, + "SamplePtID": excluded.SamplePtID, + "WCLab_ID": excluded.WCLab_ID, + "CollectionDate": excluded.CollectionDate, + "CollectionMethod": excluded.CollectionMethod, + "CollectedBy": excluded.CollectedBy, + "AnalysesAgency": excluded.AnalysesAgency, + "SampleType": excluded.SampleType, + "SampleMaterialNotH2O": excluded.SampleMaterialNotH2O, + "WaterType": excluded.WaterType, + "StudySample": excluded.StudySample, + "DataSource": excluded.DataSource, + "DataQuality": excluded.DataQuality, + "PublicRelease": excluded.PublicRelease, + "AddedDaytoDate": excluded.AddedDaytoDate, + "AddedMonthDaytoDate": excluded.AddedMonthDaytoDate, + "SampleNotes": excluded.SampleNotes, + }, + ) + session.execute(stmt) + session.commit() + session.expunge_all() + + def _row_dict(self, row: dict[str, Any]) -> dict[str, Any]: + def val(key: str) -> Optional[Any]: + v = row.get(key) + if pd.isna(v): + return None + return v + + def bool_val(key: str) -> Optional[bool]: + v = val(key) + if v is None: + return None + if isinstance(v, bool): + return v + if isinstance(v, 
(int, float)): + return bool(int(v)) + if isinstance(v, str): + normalized = v.strip().lower() + if normalized in {"y", "yes", "true", "t", "1"}: + return True + if normalized in {"n", "no", "false", "f", "0"}: + return False + return None + + collection_date = val("CollectionDate") + if hasattr(collection_date, "date"): + collection_date = collection_date.date() + + return { + "OBJECTID": val("OBJECTID"), + "SamplePointID": val("SamplePointID"), + "SamplePtID": val("SamplePtID"), + "WCLab_ID": val("WCLab_ID"), + "CollectionDate": collection_date, + "CollectionMethod": val("CollectionMethod"), + "CollectedBy": val("CollectedBy"), + "AnalysesAgency": val("AnalysesAgency"), + "SampleType": val("SampleType"), + "SampleMaterialNotH2O": bool_val("SampleMaterialNotH2O"), + "WaterType": val("WaterType"), + "StudySample": bool_val("StudySample"), + "DataSource": val("DataSource"), + "DataQuality": val("DataQuality"), + "PublicRelease": bool_val("PublicRelease"), + "AddedDaytoDate": val("AddedDaytoDate"), + "AddedMonthDaytoDate": val("AddedMonthDaytoDate"), + "SampleNotes": val("SampleNotes"), + } + + def _dedupe_rows( + self, rows: list[dict[str, Any]], key: str + ) -> list[dict[str, Any]]: + """ + Deduplicate all rows by the given key before batching so no ON CONFLICT upsert + repeats a key. Later rows win; rows whose key is missing are dropped. 
+ """ + deduped = {} + for row in rows: + oid = row.get(key) + if oid is None: + continue + deduped[oid] = row + return list(deduped.values()) + + +def run(batch_size: int = 1000) -> None: + """Entrypoint to execute the backfill.""" + transferer = ChemistrySampleInfoBackfill(batch_size=batch_size) + transferer.transfer() + + +if __name__ == "__main__": + # Allow running via `python -m transfers.backfill.chemistry_sampleinfo` + run() + +# ============= EOF ============================================= diff --git a/transfers/backfill/staging.py b/transfers/backfill/staging.py index 172b67371..94f679506 100644 --- a/transfers/backfill/staging.py +++ b/transfers/backfill/staging.py @@ -32,6 +32,7 @@ from transfers.backfill.waterlevelscontinuous_pressure_daily import ( run as run_pressure_daily, ) +from transfers.backfill.chemistry_sampleinfo import run as run_chemistry_sampleinfo from transfers.logger import logger @@ -41,6 +42,7 @@ def run(batch_size: int = 1000) -> None: """ steps = ( ("WaterLevelsContinuous_Pressure_Daily", run_pressure_daily), + ("Chemistry_SampleInfo", run_chemistry_sampleinfo), ("NGWMN views", run_ngwmn_views), )