From 99a84e5249e64d00abbdcf7330f9e36c2409e7dc Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 10:20:51 -0600 Subject: [PATCH 1/9] feat(db): nightly pg_cron refresh of pygeoapi materialized views Register a pg_cron job that refreshes the pygeoapi materialized views once a night in production, with the schedule traceable in version control via an alembic migration. - alembic migration x2y3z4a5b6c7 creates the pg_cron extension, a public.refresh_pygeoapi_materialized_views() helper, and a nightly cron job (0 9 * * *, server timezone). Idempotent: it unschedules any same-named job before re-registering. - pg_cron is production-only. The migration is a no-op unless ENABLE_PG_CRON is truthy, so alembic upgrade head still works on the dev/test/CI Postgres image (which does not preload pg_cron). - services/materialized_views.py is the single source of truth for the view list, shared by the CLI refresh command and the migration. - docker/db/Dockerfile (production image) installs pg_cron and preloads it with cron.database_name pointed at the app database. - CD_staging.yml and CD_production.yml set ENABLE_PG_CRON=1 on the migration step. - docs/pg_cron-nightly-refresh.md documents setup (self-hosted Docker and Cloud SQL), verification, and the non-concurrent REFRESH rationale. 
Co-Authored-By: Claude Fable 5 --- .env.example | 7 + .github/workflows/CD_production.yml | 5 + .github/workflows/CD_staging.yml | 5 + ...chedule_nightly_matview_refresh_pg_cron.py | 144 ++++++++++++++++++ cli/cli.py | 13 +- docker/db/Dockerfile | 28 ++++ docs/pg_cron-nightly-refresh.md | 100 ++++++++++++ services/materialized_views.py | 17 +++ 8 files changed, 308 insertions(+), 11 deletions(-) create mode 100644 alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py create mode 100644 docs/pg_cron-nightly-refresh.md create mode 100644 services/materialized_views.py diff --git a/.env.example b/.env.example index 27f624d4c..1645fa31c 100644 --- a/.env.example +++ b/.env.example @@ -48,6 +48,13 @@ GOOGLE_APPLICATION_CREDENTIALS=/path/to/gcs_credentials.json # set to development for lexicon and parameter to be populated and enable the enums to work MODE=development +# pg_cron nightly materialized-view refresh (PRODUCTION ONLY). +# Leave unset/0 in development, test, and CI: the dev Postgres image does not +# load pg_cron, and alembic migration x2y3z4a5b6c7 is a no-op when this is off. +# Set to 1 in production (DB server has shared_preload_libraries=pg_cron) to +# register the nightly refresh job. See docs/pg_cron-nightly-refresh.md. +# ENABLE_PG_CRON=0 + # disable authentication (for development only) AUTHENTIK_DISABLE_AUTHENTICATION=1 diff --git a/.github/workflows/CD_production.yml b/.github/workflows/CD_production.yml index 155bd1db1..1160c140c 100644 --- a/.github/workflows/CD_production.yml +++ b/.github/workflows/CD_production.yml @@ -90,6 +90,11 @@ jobs: CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}" CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true + # Register the nightly pg_cron materialized-view refresh job. + # Requires the Cloud SQL instance flag cloudsql.enable_pg_cron=on and + # cron.database_name set to CLOUD_SQL_DATABASE. See + # docs/pg_cron-nightly-refresh.md. 
+ ENABLE_PG_CRON: "1" run: | uv run --no-dev alembic upgrade head diff --git a/.github/workflows/CD_staging.yml b/.github/workflows/CD_staging.yml index 047237d9d..0ec5baed0 100644 --- a/.github/workflows/CD_staging.yml +++ b/.github/workflows/CD_staging.yml @@ -55,6 +55,11 @@ jobs: CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}" CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true + # Register the nightly pg_cron materialized-view refresh job. + # Requires the Cloud SQL instance flag cloudsql.enable_pg_cron=on and + # cron.database_name set to CLOUD_SQL_DATABASE. See + # docs/pg_cron-nightly-refresh.md. + ENABLE_PG_CRON: "1" run: | uv run --no-dev alembic upgrade head diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py new file mode 100644 index 000000000..e85428869 --- /dev/null +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -0,0 +1,144 @@ +"""schedule nightly materialized-view refresh via pg_cron + +Registers a pg_cron job that refreshes the pygeoapi materialized views +once a night. The job is created through a SQL helper function, +``public.refresh_pygeoapi_materialized_views()``, so the list of views and +the refresh logic live in the database and version control together. + +pg_cron is a *production-only* dependency. It requires the extension to be +loaded via ``shared_preload_libraries`` on the database server, which the +development docker-compose Postgres image does not do. To avoid breaking +``alembic upgrade head`` in development (and in test/CI), this migration is a +no-op unless ``ENABLE_PG_CRON`` is truthy in the environment. Production sets +``ENABLE_PG_CRON=1``; everywhere else the migration records itself as applied +without touching pg_cron. See ``docs/pg_cron-nightly-refresh.md``. 
+ +Revision ID: x2y3z4a5b6c7 +Revises: w1x2y3z4a5b6 +Create Date: 2026-06-17 00:00:00.000000 +""" + +import re +from typing import Sequence, Union + +from alembic import op +from sqlalchemy import text + +from services.env import get_bool_env +from services.materialized_views import PYGEOAPI_MATERIALIZED_VIEWS + +# revision identifiers, used by Alembic. +revision: str = "x2y3z4a5b6c7" +down_revision: Union[str, Sequence[str], None] = "w1x2y3z4a5b6" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + +# Name of the pg_cron job. Used to (re)register and to unschedule. +CRON_JOB_NAME = "refresh-pygeoapi-materialized-views" + +# Nightly schedule in standard cron syntax. pg_cron interprets this in the +# database server's timezone (UTC on Cloud SQL / the docker image), so 09:00 +# UTC is roughly 02:00-03:00 in US Mountain time -- comfortably off-peak. +CRON_SCHEDULE = "0 9 * * *" + + +def _build_refresh_function_sql() -> str: + """Build the helper function body from the shared view list. + + The view set is owned by ``services.materialized_views`` (the single source + of truth shared with the CLI). Plain (non-concurrent) REFRESH is used + deliberately: REFRESH ... CONCURRENTLY cannot run inside the implicit + transaction of a PL/pgSQL function, and the nightly window tolerates the + brief exclusive lock. Each view is guarded by an existence check so a + missing view never aborts the whole run. + """ + for name in PYGEOAPI_MATERIALIZED_VIEWS: + # These names are baked into a SQL literal array below; validate them + # rather than trust the constant blindly. 
+ if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", name): + raise ValueError(f"Invalid materialized view name: {name!r}") + + array_literal = ",\n ".join(f"'{name}'" for name in PYGEOAPI_MATERIALIZED_VIEWS) + return f""" +CREATE OR REPLACE FUNCTION public.refresh_pygeoapi_materialized_views() +RETURNS void +LANGUAGE plpgsql +AS $func$ +DECLARE + v text; + views text[] := ARRAY[ + {array_literal} + ]; +BEGIN + FOREACH v IN ARRAY views LOOP + IF EXISTS (SELECT 1 FROM pg_matviews WHERE matviewname = v) THEN + EXECUTE format('REFRESH MATERIALIZED VIEW %I', v); + END IF; + END LOOP; +END; +$func$; +""" + + +def _pg_cron_enabled() -> bool: + """pg_cron is only wired up where the server explicitly enables it.""" + return get_bool_env("ENABLE_PG_CRON", False) is True + + +def upgrade() -> None: + if not _pg_cron_enabled(): + print( + "ENABLE_PG_CRON is not set; skipping pg_cron job registration " + "(expected in development, test, and CI)." + ) + return + + bind = op.get_bind() + + # Requires shared_preload_libraries to include 'pg_cron' and the extension + # to be creatable in this database (cron.database_name = this DB). See docs. + op.execute(text("CREATE EXTENSION IF NOT EXISTS pg_cron")) + + # (Re)create the refresh helper. + op.execute(text(_build_refresh_function_sql())) + + # Drop any previously registered job with the same name so re-running this + # migration (or a re-deploy) does not accumulate duplicate schedules. + op.execute( + text( + "SELECT cron.unschedule(jobid) FROM cron.job " + "WHERE jobname = :name" + ).bindparams(name=CRON_JOB_NAME) + ) + + bind.execute( + text("SELECT cron.schedule(:name, :sched, :cmd)").bindparams( + name=CRON_JOB_NAME, + sched=CRON_SCHEDULE, + cmd="SELECT public.refresh_pygeoapi_materialized_views();", + ) + ) + + print( + f"Registered pg_cron job '{CRON_JOB_NAME}' " + f"(schedule '{CRON_SCHEDULE}', server timezone)." 
+ ) + + +def downgrade() -> None: + if not _pg_cron_enabled(): + print("ENABLE_PG_CRON is not set; nothing to unschedule.") + return + + op.execute( + text( + "SELECT cron.unschedule(jobid) FROM cron.job " + "WHERE jobname = :name" + ).bindparams(name=CRON_JOB_NAME) + ) + op.execute( + text("DROP FUNCTION IF EXISTS public.refresh_pygeoapi_materialized_views()") + ) + # The pg_cron extension itself is left installed: it is a server-level + # capability that other jobs may depend on, and dropping it is not the + # inverse of "schedule a job". diff --git a/cli/cli.py b/cli/cli.py index 30c9742f6..f68857a5a 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -24,6 +24,8 @@ import typer from dotenv import load_dotenv +from services.materialized_views import PYGEOAPI_MATERIALIZED_VIEWS + # CLI should load `.env` defaults without clobbering an explicitly prepared environment. load_dotenv(override=False) os.environ.setdefault("OCO_LOG_CONTEXT", "cli") @@ -50,17 +52,6 @@ class SmokePopulation(str, Enum): agreed = "agreed" -PYGEOAPI_MATERIALIZED_VIEWS = ( - "ogc_latest_depth_to_water_wells", - "ogc_water_elevation_wells", - "ogc_avg_tds_wells", - "ogc_depth_to_water_trend_wells", - "ogc_water_well_summary", - "ogc_major_chemistry_results", - "ogc_minor_chemistry_wells", -) - - def _resolve_theme(theme: ThemeMode) -> ThemeMode: if theme != ThemeMode.auto: return theme diff --git a/docker/db/Dockerfile b/docker/db/Dockerfile index 4a1fbd51b..e250b7e3c 100644 --- a/docker/db/Dockerfile +++ b/docker/db/Dockerfile @@ -1 +1,29 @@ +# Production database image: PostGIS + pg_cron. +# +# This image is intentionally NOT used by the development docker-compose +# service (which runs the stock postgis/postgis image). pg_cron is a +# production-only dependency required by the nightly materialized-view refresh +# job registered in alembic migration x2y3z4a5b6c7. 
+# +# pg_cron must be loaded via shared_preload_libraries, and its background +# worker schedules jobs in a single database (cron.database_name). Both are set +# below so the alembic-registered job runs against the application database. +# +# Build/run example: +# docker build -f docker/db/Dockerfile -t ocotillo-db-prod . +# docker run -e POSTGRES_DB=ocotilloapi ocotillo-db-prod +# +# On Google Cloud SQL, pg_cron is enabled via the cloudsql.enable_pg_cron flag +# instead of this image; see docs/pg_cron-nightly-refresh.md. FROM postgis/postgis:17-3.5 + +# Install the pg_cron extension for PostgreSQL 17. +RUN apt-get update \ + && apt-get install -y --no-install-recommends postgresql-17-cron \ + && rm -rf /var/lib/apt/lists/* + +# Load pg_cron at server start and point its scheduler at the application +# database. POSTGRES_DB defaults to ocotilloapi here but can be overridden at +# run time; keep cron.database_name aligned with the database the API uses. +ENV POSTGRES_DB=ocotilloapi +CMD ["postgres", "-c", "shared_preload_libraries=pg_cron", "-c", "cron.database_name=ocotilloapi"] diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md new file mode 100644 index 000000000..5a0015796 --- /dev/null +++ b/docs/pg_cron-nightly-refresh.md @@ -0,0 +1,100 @@ +# Nightly materialized-view refresh with pg_cron + +The pygeoapi materialized views (`ogc_latest_depth_to_water_wells`, +`ogc_water_elevation_wells`, `ogc_avg_tds_wells`, +`ogc_depth_to_water_trend_wells`, `ogc_water_well_summary`, +`ogc_major_chemistry_results`, `ogc_minor_chemistry_wells`) are refreshed once +a night in production by a [pg_cron](https://github.com/citusdata/pg_cron) job. 
+ +## What is registered, and where + +Alembic migration +[`x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py`](../alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py) +registers everything, so the schedule is traceable in version control: + +- A SQL helper, `public.refresh_pygeoapi_materialized_views()`, that runs + `REFRESH MATERIALIZED VIEW` for each view (plain, non-concurrent — see note). +- A pg_cron job named `refresh-pygeoapi-materialized-views` that runs + `SELECT public.refresh_pygeoapi_materialized_views();` on the schedule + `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL and the + production image — roughly 02:00–03:00 US Mountain). + +The view set comes from +[`services/materialized_views.py`](../services/materialized_views.py) +(`PYGEOAPI_MATERIALIZED_VIEWS`) — the single source of truth shared with the +`oco refresh-pygeoapi-materialized-views` CLI command. To change which views are +refreshed, edit that tuple. To change the schedule, edit the migration (or add a +new one). Do not edit the job in the database by hand, or it will drift from the +repo. + +## Why it is gated by `ENABLE_PG_CRON` + +pg_cron is a **production-only** dependency. It must be loaded through the +server's `shared_preload_libraries`, which the development docker-compose +Postgres image (`postgis/postgis:17-3.5`) does not do. Running +`CREATE EXTENSION pg_cron` without that preload fails. + +So the migration is a **no-op unless `ENABLE_PG_CRON` is truthy**: + +- Development, test, CI: `ENABLE_PG_CRON` unset → migration prints a skip + message and records itself as applied. `alembic upgrade head` works on the + stock dev image with nothing extra installed. +- Production: `ENABLE_PG_CRON=1` → migration creates the extension, the helper + function, and the cron job. 
+ +## Production setup + +### Self-hosted / Docker + +Use the production database image, which installs pg_cron and preloads it: + +- [`docker/db/Dockerfile`](../docker/db/Dockerfile) installs + `postgresql-17-cron` and starts Postgres with + `-c shared_preload_libraries=pg_cron -c cron.database_name=ocotilloapi`. + +`cron.database_name` must match the application database so the alembic +migration (which connects to that database) can `CREATE EXTENSION pg_cron` and +`cron.schedule(...)` locally. Then deploy with `ENABLE_PG_CRON=1` set for the +app container that runs migrations. + +### Google Cloud SQL + +Do not use the Docker image; enable pg_cron with the instance flag instead: + +1. Set the flags `cloudsql.enable_pg_cron=on` and + `cron.database_name=<CLOUD_SQL_DATABASE>`, then restart the instance. +2. Deploy the app with `ENABLE_PG_CRON=1` so the migration registers the job. + +## Verifying + +```sql +-- the registered job +SELECT jobid, jobname, schedule, command, active FROM cron.job + WHERE jobname = 'refresh-pygeoapi-materialized-views'; + +-- recent run history +SELECT status, start_time, end_time, return_message + FROM cron.job_run_details + WHERE jobid = (SELECT jobid FROM cron.job + WHERE jobname = 'refresh-pygeoapi-materialized-views') + ORDER BY start_time DESC LIMIT 5; +``` + +## Manual / ad-hoc refresh + +Independent of the cron job, the views can be refreshed on demand with the CLI +(also useful in development, where the cron job does not exist): + +```bash +oco refresh-pygeoapi-materialized-views # all views, plain +oco refresh-pygeoapi-materialized-views --concurrently # no read lock +``` + +### Note on non-concurrent REFRESH + +The cron helper uses plain `REFRESH MATERIALIZED VIEW`, not `CONCURRENTLY`, +because `REFRESH ... CONCURRENTLY` cannot run inside the implicit transaction of +a PL/pgSQL function. Plain refresh takes a brief exclusive lock on each view, +which is acceptable in the off-peak nightly window. 
The CLI still offers +`--concurrently` for daytime manual refreshes (every view has the required +unique index). diff --git a/services/materialized_views.py b/services/materialized_views.py new file mode 100644 index 000000000..ddc49322c --- /dev/null +++ b/services/materialized_views.py @@ -0,0 +1,17 @@ +"""Single source of truth for the pygeoapi materialized views. + +Both the ``oco refresh-pygeoapi-materialized-views`` CLI command and the +pg_cron nightly-refresh alembic migration import this tuple so the view set is +defined in exactly one place. Add or remove a view here and both stay in sync. +""" + +# Order is the order views are refreshed in. +PYGEOAPI_MATERIALIZED_VIEWS: tuple[str, ...] = ( + "ogc_latest_depth_to_water_wells", + "ogc_water_elevation_wells", + "ogc_avg_tds_wells", + "ogc_depth_to_water_trend_wells", + "ogc_water_well_summary", + "ogc_major_chemistry_results", + "ogc_minor_chemistry_wells", +) From 5ad3f25d7c28b132ac64e66b5c2ecf4578962931 Mon Sep 17 00:00:00 2001 From: jirhiker <2035568+jirhiker@users.noreply.github.com> Date: Wed, 17 Jun 2026 16:22:19 +0000 Subject: [PATCH 2/9] Formatting changes --- ...4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index e85428869..159a2c602 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -58,7 +58,9 @@ def _build_refresh_function_sql() -> str: if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", name): raise ValueError(f"Invalid materialized view name: {name!r}") - array_literal = ",\n ".join(f"'{name}'" for name in PYGEOAPI_MATERIALIZED_VIEWS) + array_literal = ",\n ".join( + f"'{name}'" for name in PYGEOAPI_MATERIALIZED_VIEWS + ) return f""" CREATE OR REPLACE 
FUNCTION public.refresh_pygeoapi_materialized_views() RETURNS void @@ -106,8 +108,7 @@ def upgrade() -> None: # migration (or a re-deploy) does not accumulate duplicate schedules. op.execute( text( - "SELECT cron.unschedule(jobid) FROM cron.job " - "WHERE jobname = :name" + "SELECT cron.unschedule(jobid) FROM cron.job " "WHERE jobname = :name" ).bindparams(name=CRON_JOB_NAME) ) @@ -132,8 +133,7 @@ def downgrade() -> None: op.execute( text( - "SELECT cron.unschedule(jobid) FROM cron.job " - "WHERE jobname = :name" + "SELECT cron.unschedule(jobid) FROM cron.job " "WHERE jobname = :name" ).bindparams(name=CRON_JOB_NAME) ) op.execute( From 3bb19baaf9bfbb79eab8ea72d0ec00c067909e59 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 10:55:00 -0600 Subject: [PATCH 3/9] fix(ci): keep nightly pg_cron job production-only Staging refreshes the materialized views on each deploy (the existing "Refresh materialized views" CD step), so it does not need the nightly pg_cron job. Drop ENABLE_PG_CRON from CD_staging.yml; only production registers the cron job. Co-Authored-By: Claude Fable 5 --- .github/workflows/CD_staging.yml | 5 ----- docs/pg_cron-nightly-refresh.md | 10 ++++++---- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/CD_staging.yml b/.github/workflows/CD_staging.yml index 0ec5baed0..047237d9d 100644 --- a/.github/workflows/CD_staging.yml +++ b/.github/workflows/CD_staging.yml @@ -55,11 +55,6 @@ jobs: CLOUD_SQL_DATABASE: "${{ vars.CLOUD_SQL_DATABASE }}" CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true - # Register the nightly pg_cron materialized-view refresh job. - # Requires the Cloud SQL instance flag cloudsql.enable_pg_cron=on and - # cron.database_name set to CLOUD_SQL_DATABASE. See - # docs/pg_cron-nightly-refresh.md. 
- ENABLE_PG_CRON: "1" run: | uv run --no-dev alembic upgrade head diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md index 5a0015796..838125abc 100644 --- a/docs/pg_cron-nightly-refresh.md +++ b/docs/pg_cron-nightly-refresh.md @@ -36,11 +36,13 @@ Postgres image (`postgis/postgis:17-3.5`) does not do. Running So the migration is a **no-op unless `ENABLE_PG_CRON` is truthy**: -- Development, test, CI: `ENABLE_PG_CRON` unset → migration prints a skip - message and records itself as applied. `alembic upgrade head` works on the - stock dev image with nothing extra installed. +- Development, test, CI, **and staging**: `ENABLE_PG_CRON` unset → migration + prints a skip message and records itself as applied. `alembic upgrade head` + works on the stock dev image with nothing extra installed. Staging refreshes + the views on each deploy instead (the "Refresh materialized views" CD step), + so it does not need the nightly job. - Production: `ENABLE_PG_CRON=1` → migration creates the extension, the helper - function, and the cron job. + function, and the cron job. Only `CD_production.yml` sets this. ## Production setup From a7025e5937e48c3957e354a77284428b85873edf Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 11:11:53 -0600 Subject: [PATCH 4/9] refactor(db): address pg_cron PR review - Migration helper now discovers ogc_* materialized views from the catalog at run time instead of importing the mutable view tuple. Keeps the versioned migration immutable and self-contained, and auto-includes views added by later migrations. Resolves the cross-environment drift concern from review. - Production DB image derives cron.database_name from POSTGRES_DB via a start-postgres.sh entrypoint wrapper, so pg_cron tracks the same database the migration connects to even when POSTGRES_DB is overridden. - services/materialized_views.py is now the CLI's curated default only; docs updated to match. 
Co-Authored-By: Claude Fable 5 --- ...chedule_nightly_matview_refresh_pg_cron.py | 59 ++++++++----------- docker/db/Dockerfile | 8 ++- docker/db/start-postgres.sh | 10 ++++ docs/pg_cron-nightly-refresh.md | 23 ++++---- services/materialized_views.py | 9 +-- 5 files changed, 58 insertions(+), 51 deletions(-) create mode 100644 docker/db/start-postgres.sh diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index 159a2c602..e9271ba3d 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -1,9 +1,11 @@ """schedule nightly materialized-view refresh via pg_cron Registers a pg_cron job that refreshes the pygeoapi materialized views -once a night. The job is created through a SQL helper function, -``public.refresh_pygeoapi_materialized_views()``, so the list of views and -the refresh logic live in the database and version control together. +once a night. The job calls a SQL helper function, +``public.refresh_pygeoapi_materialized_views()``, which discovers the +``ogc_*`` materialized views from the catalog at run time -- so this +migration stays immutable and self-contained, and views added by later +migrations are refreshed without any rescheduling. pg_cron is a *production-only* dependency. It requires the extension to be loaded via ``shared_preload_libraries`` on the database server, which the @@ -18,14 +20,12 @@ Create Date: 2026-06-17 00:00:00.000000 """ -import re from typing import Sequence, Union from alembic import op from sqlalchemy import text from services.env import get_bool_env -from services.materialized_views import PYGEOAPI_MATERIALIZED_VIEWS # revision identifiers, used by Alembic. 
revision: str = "x2y3z4a5b6c7" @@ -42,40 +42,31 @@ CRON_SCHEDULE = "0 9 * * *" -def _build_refresh_function_sql() -> str: - """Build the helper function body from the shared view list. - - The view set is owned by ``services.materialized_views`` (the single source - of truth shared with the CLI). Plain (non-concurrent) REFRESH is used - deliberately: REFRESH ... CONCURRENTLY cannot run inside the implicit - transaction of a PL/pgSQL function, and the nightly window tolerates the - brief exclusive lock. Each view is guarded by an existence check so a - missing view never aborts the whole run. - """ - for name in PYGEOAPI_MATERIALIZED_VIEWS: - # These names are baked into a SQL literal array below; validate them - # rather than trust the constant blindly. - if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", name): - raise ValueError(f"Invalid materialized view name: {name!r}") - - array_literal = ",\n ".join( - f"'{name}'" for name in PYGEOAPI_MATERIALIZED_VIEWS - ) - return f""" +# Helper function the cron job calls. It discovers the pygeoapi materialized +# views (the ``ogc_*`` views in the public schema) from the catalog at run time +# rather than from a baked-in list. This keeps the migration immutable and +# self-contained -- it does not depend on mutable application code, and views +# added by later migrations are picked up automatically without rescheduling. +# +# Plain (non-concurrent) REFRESH is used deliberately: REFRESH ... CONCURRENTLY +# cannot run inside the implicit transaction of a PL/pgSQL function, and the +# nightly window tolerates the brief exclusive lock. 
+_REFRESH_FUNCTION_SQL = r""" CREATE OR REPLACE FUNCTION public.refresh_pygeoapi_materialized_views() RETURNS void LANGUAGE plpgsql AS $func$ DECLARE - v text; - views text[] := ARRAY[ - {array_literal} - ]; + r record; BEGIN - FOREACH v IN ARRAY views LOOP - IF EXISTS (SELECT 1 FROM pg_matviews WHERE matviewname = v) THEN - EXECUTE format('REFRESH MATERIALIZED VIEW %I', v); - END IF; + FOR r IN + SELECT matviewname + FROM pg_matviews + WHERE schemaname = 'public' + AND matviewname LIKE 'ogc\_%' ESCAPE '\' + ORDER BY matviewname + LOOP + EXECUTE format('REFRESH MATERIALIZED VIEW %I', r.matviewname); END LOOP; END; $func$; @@ -102,7 +93,7 @@ def upgrade() -> None: op.execute(text("CREATE EXTENSION IF NOT EXISTS pg_cron")) # (Re)create the refresh helper. - op.execute(text(_build_refresh_function_sql())) + op.execute(text(_REFRESH_FUNCTION_SQL)) # Drop any previously registered job with the same name so re-running this # migration (or a re-deploy) does not accumulate duplicate schedules. diff --git a/docker/db/Dockerfile b/docker/db/Dockerfile index e250b7e3c..ffa2c8864 100644 --- a/docker/db/Dockerfile +++ b/docker/db/Dockerfile @@ -23,7 +23,9 @@ RUN apt-get update \ && rm -rf /var/lib/apt/lists/* # Load pg_cron at server start and point its scheduler at the application -# database. POSTGRES_DB defaults to ocotilloapi here but can be overridden at -# run time; keep cron.database_name aligned with the database the API uses. +# database. cron.database_name is derived from POSTGRES_DB at start time (see +# start-postgres.sh) so it stays aligned even when POSTGRES_DB is overridden. 
ENV POSTGRES_DB=ocotilloapi -CMD ["postgres", "-c", "shared_preload_libraries=pg_cron", "-c", "cron.database_name=ocotilloapi"] +COPY docker/db/start-postgres.sh /usr/local/bin/start-postgres.sh +RUN chmod +x /usr/local/bin/start-postgres.sh +ENTRYPOINT ["start-postgres.sh"] diff --git a/docker/db/start-postgres.sh b/docker/db/start-postgres.sh new file mode 100644 index 000000000..0582b271f --- /dev/null +++ b/docker/db/start-postgres.sh @@ -0,0 +1,10 @@ +#!/bin/sh +# Start Postgres with pg_cron preloaded and its scheduler pointed at the +# application database. cron.database_name is derived from POSTGRES_DB so that, +# when the image is run with POSTGRES_DB overridden, pg_cron watches the same +# database the app (and the alembic migration) connects to. +set -e + +exec docker-entrypoint.sh postgres \ + -c shared_preload_libraries=pg_cron \ + -c "cron.database_name=${POSTGRES_DB:-ocotilloapi}" diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md index 838125abc..685adc402 100644 --- a/docs/pg_cron-nightly-refresh.md +++ b/docs/pg_cron-nightly-refresh.md @@ -12,20 +12,21 @@ Alembic migration [`x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py`](../alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py) registers everything, so the schedule is traceable in version control: -- A SQL helper, `public.refresh_pygeoapi_materialized_views()`, that runs - `REFRESH MATERIALIZED VIEW` for each view (plain, non-concurrent — see note). +- A SQL helper, `public.refresh_pygeoapi_materialized_views()`, that discovers + the `ogc_*` materialized views from the catalog at run time and runs + `REFRESH MATERIALIZED VIEW` for each (plain, non-concurrent — see note). - A pg_cron job named `refresh-pygeoapi-materialized-views` that runs `SELECT public.refresh_pygeoapi_materialized_views();` on the schedule `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL and the production image — roughly 02:00–03:00 US Mountain). 
-The view set comes from -[`services/materialized_views.py`](../services/materialized_views.py) -(`PYGEOAPI_MATERIALIZED_VIEWS`) — the single source of truth shared with the -`oco refresh-pygeoapi-materialized-views` CLI command. To change which views are -refreshed, edit that tuple. To change the schedule, edit the migration (or add a -new one). Do not edit the job in the database by hand, or it will drift from the -repo. +The helper refreshes whatever `ogc_*` materialized views exist, so a view added +by a later migration is picked up automatically — there is nothing to keep in +sync and no need to reschedule. (The `oco refresh-pygeoapi-materialized-views` +CLI command, used for manual/on-deploy refreshes, keeps an explicit curated +list in [`services/materialized_views.py`](../services/materialized_views.py).) +To change the schedule, edit the migration (or add a new one). Do not edit the +job in the database by hand, or it will drift from the repo. ## Why it is gated by `ENABLE_PG_CRON` @@ -52,7 +53,9 @@ Use the production database image, which installs pg_cron and preloads it: - [`docker/db/Dockerfile`](../docker/db/Dockerfile) installs `postgresql-17-cron` and starts Postgres with - `-c shared_preload_libraries=pg_cron -c cron.database_name=ocotilloapi`. + `-c shared_preload_libraries=pg_cron -c cron.database_name=$POSTGRES_DB` + (via [`start-postgres.sh`](../docker/db/start-postgres.sh), so overriding + `POSTGRES_DB` keeps the scheduler pointed at the same database). `cron.database_name` must match the application database so the alembic migration (which connects to that database) can `CREATE EXTENSION pg_cron` and diff --git a/services/materialized_views.py b/services/materialized_views.py index ddc49322c..4ce701599 100644 --- a/services/materialized_views.py +++ b/services/materialized_views.py @@ -1,8 +1,9 @@ -"""Single source of truth for the pygeoapi materialized views. +"""Curated pygeoapi materialized-view list for the CLI refresh command. 
-Both the ``oco refresh-pygeoapi-materialized-views`` CLI command and the -pg_cron nightly-refresh alembic migration import this tuple so the view set is -defined in exactly one place. Add or remove a view here and both stay in sync. +``oco refresh-pygeoapi-materialized-views`` refreshes these views (in order) +by default. The nightly pg_cron job does NOT use this list -- its SQL helper +discovers the ``ogc_*`` materialized views from the catalog at run time (see +alembic migration ``x2y3z4a5b6c7``) to stay immutable and self-contained. """ # Order is the order views are refreshed in. From fc990afa14659bcd8fb3ed277d19d2ddb58a3014 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 11:18:03 -0600 Subject: [PATCH 5/9] feat(db): refresh all materialized views, not just pygeoapi Refresh every materialized view nightly, including transducer_daily_data. - Migration helper drops the ogc_* filter and refreshes all public-schema materialized views discovered from the catalog at run time. - Rename PYGEOAPI_MATERIALIZED_VIEWS -> MATERIALIZED_VIEWS and add transducer_daily_data to the CLI's default list. - Update CLI refresh test expectations (8 views) and docs. 
Co-Authored-By: Claude Fable 5 --- ...chedule_nightly_matview_refresh_pg_cron.py | 23 +++++++++---------- cli/cli.py | 4 ++-- docs/pg_cron-nightly-refresh.md | 22 ++++++++---------- services/materialized_views.py | 9 ++++---- tests/test_cli_commands.py | 3 ++- 5 files changed, 30 insertions(+), 31 deletions(-) diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index e9271ba3d..3a66b71a3 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -1,11 +1,11 @@ """schedule nightly materialized-view refresh via pg_cron -Registers a pg_cron job that refreshes the pygeoapi materialized views -once a night. The job calls a SQL helper function, -``public.refresh_pygeoapi_materialized_views()``, which discovers the -``ogc_*`` materialized views from the catalog at run time -- so this -migration stays immutable and self-contained, and views added by later -migrations are refreshed without any rescheduling. +Registers a pg_cron job that refreshes the materialized views once a +night. The job calls a SQL helper function, +``public.refresh_pygeoapi_materialized_views()``, which discovers every +materialized view in the public schema from the catalog at run time -- so +this migration stays immutable and self-contained, and views added by +later migrations are refreshed without any rescheduling. pg_cron is a *production-only* dependency. It requires the extension to be loaded via ``shared_preload_libraries`` on the database server, which the @@ -42,11 +42,11 @@ CRON_SCHEDULE = "0 9 * * *" -# Helper function the cron job calls. It discovers the pygeoapi materialized -# views (the ``ogc_*`` views in the public schema) from the catalog at run time -# rather than from a baked-in list. 
This keeps the migration immutable and -# self-contained -- it does not depend on mutable application code, and views -# added by later migrations are picked up automatically without rescheduling. +# Helper function the cron job calls. It discovers every materialized view in +# the public schema from the catalog at run time rather than from a baked-in +# list. This keeps the migration immutable and self-contained -- it does not +# depend on mutable application code, and views added by later migrations are +# picked up automatically without rescheduling. # # Plain (non-concurrent) REFRESH is used deliberately: REFRESH ... CONCURRENTLY # cannot run inside the implicit transaction of a PL/pgSQL function, and the @@ -63,7 +63,6 @@ SELECT matviewname FROM pg_matviews WHERE schemaname = 'public' - AND matviewname LIKE 'ogc\_%' ESCAPE '\' ORDER BY matviewname LOOP EXECUTE format('REFRESH MATERIALIZED VIEW %I', r.matviewname); diff --git a/cli/cli.py b/cli/cli.py index f68857a5a..44bab91ed 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -24,7 +24,7 @@ import typer from dotenv import load_dotenv -from services.materialized_views import PYGEOAPI_MATERIALIZED_VIEWS +from services.materialized_views import MATERIALIZED_VIEWS # CLI should load `.env` defaults without clobbering an explicitly prepared environment. load_dotenv(override=False) @@ -1106,7 +1106,7 @@ def refresh_pygeoapi_materialized_views( from db.engine import engine, session_ctx - target_views = tuple(view) if view else PYGEOAPI_MATERIALIZED_VIEWS + target_views = tuple(view) if view else MATERIALIZED_VIEWS # Validate all view names before opening any DB connections or sessions. 
safe_views = tuple(_validate_sql_identifier(v) for v in target_views) diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md index 685adc402..d6a05f236 100644 --- a/docs/pg_cron-nightly-refresh.md +++ b/docs/pg_cron-nightly-refresh.md @@ -1,10 +1,8 @@ # Nightly materialized-view refresh with pg_cron -The pygeoapi materialized views (`ogc_latest_depth_to_water_wells`, -`ogc_water_elevation_wells`, `ogc_avg_tds_wells`, -`ogc_depth_to_water_trend_wells`, `ogc_water_well_summary`, -`ogc_major_chemistry_results`, `ogc_minor_chemistry_wells`) are refreshed once -a night in production by a [pg_cron](https://github.com/citusdata/pg_cron) job. +Every materialized view in the database (the `ogc_*` pygeoapi views and +`transducer_daily_data`) is refreshed once a night in production by a +[pg_cron](https://github.com/citusdata/pg_cron) job. ## What is registered, and where @@ -13,18 +11,18 @@ Alembic migration registers everything, so the schedule is traceable in version control: - A SQL helper, `public.refresh_pygeoapi_materialized_views()`, that discovers - the `ogc_*` materialized views from the catalog at run time and runs - `REFRESH MATERIALIZED VIEW` for each (plain, non-concurrent — see note). + every materialized view in the public schema from the catalog at run time and + runs `REFRESH MATERIALIZED VIEW` for each (plain, non-concurrent — see note). - A pg_cron job named `refresh-pygeoapi-materialized-views` that runs `SELECT public.refresh_pygeoapi_materialized_views();` on the schedule `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL and the production image — roughly 02:00–03:00 US Mountain). -The helper refreshes whatever `ogc_*` materialized views exist, so a view added -by a later migration is picked up automatically — there is nothing to keep in -sync and no need to reschedule. 
(The `oco refresh-pygeoapi-materialized-views` -CLI command, used for manual/on-deploy refreshes, keeps an explicit curated -list in [`services/materialized_views.py`](../services/materialized_views.py).) +The helper refreshes whatever materialized views exist, so a view added by a +later migration is picked up automatically — there is nothing to keep in sync +and no need to reschedule. (The `oco refresh-pygeoapi-materialized-views` CLI +command, used for manual/on-deploy refreshes, keeps an explicit list in +[`services/materialized_views.py`](../services/materialized_views.py).) To change the schedule, edit the migration (or add a new one). Do not edit the job in the database by hand, or it will drift from the repo. diff --git a/services/materialized_views.py b/services/materialized_views.py index 4ce701599..e72b70d4e 100644 --- a/services/materialized_views.py +++ b/services/materialized_views.py @@ -1,13 +1,13 @@ -"""Curated pygeoapi materialized-view list for the CLI refresh command. +"""Curated materialized-view list for the CLI refresh command. ``oco refresh-pygeoapi-materialized-views`` refreshes these views (in order) by default. The nightly pg_cron job does NOT use this list -- its SQL helper -discovers the ``ogc_*`` materialized views from the catalog at run time (see -alembic migration ``x2y3z4a5b6c7``) to stay immutable and self-contained. +discovers every materialized view from the catalog at run time (see alembic +migration ``x2y3z4a5b6c7``) to stay immutable and self-contained. """ # Order is the order views are refreshed in. -PYGEOAPI_MATERIALIZED_VIEWS: tuple[str, ...] = ( +MATERIALIZED_VIEWS: tuple[str, ...] 
= ( "ogc_latest_depth_to_water_wells", "ogc_water_elevation_wells", "ogc_avg_tds_wells", @@ -15,4 +15,5 @@ "ogc_water_well_summary", "ogc_major_chemistry_results", "ogc_minor_chemistry_wells", + "transducer_daily_data", ) diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 5953c0f2e..7a7707f30 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -70,9 +70,10 @@ def __exit__(self, exc_type, exc, tb): "REFRESH MATERIALIZED VIEW ogc_water_well_summary", "REFRESH MATERIALIZED VIEW ogc_major_chemistry_results", "REFRESH MATERIALIZED VIEW ogc_minor_chemistry_wells", + "REFRESH MATERIALIZED VIEW transducer_daily_data", ] assert commit_called["value"] is True - assert "Refreshed 7 materialized view(s)." in result.output + assert "Refreshed 8 materialized view(s)." in result.output def test_refresh_pygeoapi_materialized_views_custom_and_concurrently( From ce34ca4a7ee58de0bf711cbef2bf439441c3a5d8 Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 11:24:51 -0600 Subject: [PATCH 6/9] refactor: drop pygeoapi-specific naming for matview refresh The refresh covers all materialized views, not just the ogc_* pygeoapi views, so rename the pygeoapi-specific identifiers to generic ones. - CLI command refresh-pygeoapi-materialized-views -> refresh-materialized-views (function refresh_pygeoapi_materialized_views -> refresh_materialized_views). - SQL helper public.refresh_pygeoapi_materialized_views() -> public.refresh_materialized_views(); rename it in d5e6f7a8b9c0 too, and have the pg_cron migration drop the legacy function on databases that already created it. - Cron job name refresh-pygeoapi-materialized-views -> refresh-materialized-views. - Update CD workflows (staging/testing/production), tests, and docs. 
Co-Authored-By: Claude Fable 5 --- .github/workflows/CD_production.yml | 2 +- .github/workflows/CD_staging.yml | 2 +- .github/workflows/CD_testing.yml | 2 +- ...a8b9c0_create_pygeoapi_supporting_views.py | 2 +- ...chedule_nightly_matview_refresh_pg_cron.py | 20 ++++++++++++++----- cli/cli.py | 6 +++--- docs/pg_cron-nightly-refresh.md | 18 ++++++++--------- services/materialized_views.py | 2 +- tests/test_cli_commands.py | 12 +++++------ 9 files changed, 38 insertions(+), 28 deletions(-) diff --git a/.github/workflows/CD_production.yml b/.github/workflows/CD_production.yml index 1160c140c..1ade7f251 100644 --- a/.github/workflows/CD_production.yml +++ b/.github/workflows/CD_production.yml @@ -106,7 +106,7 @@ jobs: CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true run: | - uv run --no-dev python -m cli.cli refresh-pygeoapi-materialized-views + uv run --no-dev python -m cli.cli refresh-materialized-views - name: Ensure envsubst is available run: | diff --git a/.github/workflows/CD_staging.yml b/.github/workflows/CD_staging.yml index 047237d9d..e55c6f2a4 100644 --- a/.github/workflows/CD_staging.yml +++ b/.github/workflows/CD_staging.yml @@ -66,7 +66,7 @@ jobs: CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true run: | - uv run --no-dev python -m cli.cli refresh-pygeoapi-materialized-views + uv run --no-dev python -m cli.cli refresh-materialized-views - name: Ensure envsubst is available run: | diff --git a/.github/workflows/CD_testing.yml b/.github/workflows/CD_testing.yml index 66c96a2ce..64e15443e 100644 --- a/.github/workflows/CD_testing.yml +++ b/.github/workflows/CD_testing.yml @@ -66,7 +66,7 @@ jobs: CLOUD_SQL_USER: "${{ secrets.CLOUD_SQL_USER }}" CLOUD_SQL_IAM_AUTH: true run: | - uv run --no-dev python -m cli.cli refresh-pygeoapi-materialized-views + uv run --no-dev python -m cli.cli refresh-materialized-views - name: Ensure envsubst is available run: | diff --git 
a/alembic/versions/d5e6f7a8b9c0_create_pygeoapi_supporting_views.py b/alembic/versions/d5e6f7a8b9c0_create_pygeoapi_supporting_views.py index 60d03fc04..d8e12b2bc 100644 --- a/alembic/versions/d5e6f7a8b9c0_create_pygeoapi_supporting_views.py +++ b/alembic/versions/d5e6f7a8b9c0_create_pygeoapi_supporting_views.py @@ -16,7 +16,7 @@ down_revision: Union[str, Sequence[str], None] = "c4d5e6f7a8b9" branch_labels: Union[str, Sequence[str], None] = None depends_on: Union[str, Sequence[str], None] = None -REFRESH_FUNCTION_NAME = "refresh_pygeoapi_materialized_views" +REFRESH_FUNCTION_NAME = "refresh_materialized_views" THING_COLLECTIONS = [ ("water_wells", "water well"), diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index 3a66b71a3..6ac1f66f0 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -2,11 +2,15 @@ Registers a pg_cron job that refreshes the materialized views once a night. The job calls a SQL helper function, -``public.refresh_pygeoapi_materialized_views()``, which discovers every +``public.refresh_materialized_views()``, which discovers every materialized view in the public schema from the catalog at run time -- so this migration stays immutable and self-contained, and views added by later migrations are refreshed without any rescheduling. +This also drops the legacy ``refresh_pygeoapi_materialized_views`` helper +(created by ``d5e6f7a8b9c0``) on databases that already ran that revision, +folding it into the generically named function. + pg_cron is a *production-only* dependency. It requires the extension to be loaded via ``shared_preload_libraries`` on the database server, which the development docker-compose Postgres image does not do. 
To avoid breaking @@ -34,7 +38,10 @@ depends_on: Union[str, Sequence[str], None] = None # Name of the pg_cron job. Used to (re)register and to unschedule. -CRON_JOB_NAME = "refresh-pygeoapi-materialized-views" +CRON_JOB_NAME = "refresh-materialized-views" + +# Legacy helper created by d5e6f7a8b9c0, superseded by refresh_materialized_views. +LEGACY_FUNCTION_NAME = "refresh_pygeoapi_materialized_views" # Nightly schedule in standard cron syntax. pg_cron interprets this in the # database server's timezone (UTC on Cloud SQL / the docker image), so 09:00 @@ -52,7 +59,7 @@ # cannot run inside the implicit transaction of a PL/pgSQL function, and the # nightly window tolerates the brief exclusive lock. _REFRESH_FUNCTION_SQL = r""" -CREATE OR REPLACE FUNCTION public.refresh_pygeoapi_materialized_views() +CREATE OR REPLACE FUNCTION public.refresh_materialized_views() RETURNS void LANGUAGE plpgsql AS $func$ @@ -94,6 +101,9 @@ def upgrade() -> None: # (Re)create the refresh helper. op.execute(text(_REFRESH_FUNCTION_SQL)) + # Remove the legacy helper on databases that already ran d5e6f7a8b9c0. + op.execute(text(f"DROP FUNCTION IF EXISTS public.{LEGACY_FUNCTION_NAME}()")) + # Drop any previously registered job with the same name so re-running this # migration (or a re-deploy) does not accumulate duplicate schedules. 
op.execute( @@ -106,7 +116,7 @@ def upgrade() -> None: text("SELECT cron.schedule(:name, :sched, :cmd)").bindparams( name=CRON_JOB_NAME, sched=CRON_SCHEDULE, - cmd="SELECT public.refresh_pygeoapi_materialized_views();", + cmd="SELECT public.refresh_materialized_views();", ) ) @@ -127,7 +137,7 @@ def downgrade() -> None: ).bindparams(name=CRON_JOB_NAME) ) op.execute( - text("DROP FUNCTION IF EXISTS public.refresh_pygeoapi_materialized_views()") + text("DROP FUNCTION IF EXISTS public.refresh_materialized_views()") ) # The pg_cron extension itself is left installed: it is a server-level # capability that other jobs may depend on, and dropping it is not the diff --git a/cli/cli.py b/cli/cli.py index 44bab91ed..14c8f9470 100644 --- a/cli/cli.py +++ b/cli/cli.py @@ -1086,14 +1086,14 @@ def alembic_upgrade_and_data( typer.echo(f"applied {len(ran)} migration(s)") -@cli.command("refresh-pygeoapi-materialized-views") -def refresh_pygeoapi_materialized_views( +@cli.command("refresh-materialized-views") +def refresh_materialized_views( view: list[str] = typer.Option( None, "--view", help=( "Materialized view name(s) to refresh. Repeat --view for multiple. " - "Defaults to all pygeoapi materialized views." + "Defaults to all materialized views." ), ), concurrently: bool = typer.Option( diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md index d6a05f236..15b81c634 100644 --- a/docs/pg_cron-nightly-refresh.md +++ b/docs/pg_cron-nightly-refresh.md @@ -1,6 +1,6 @@ # Nightly materialized-view refresh with pg_cron -Every materialized view in the database (the `ogc_*` pygeoapi views and +Every materialized view in the database (the `ogc_*` views and `transducer_daily_data`) is refreshed once a night in production by a [pg_cron](https://github.com/citusdata/pg_cron) job. 
@@ -10,17 +10,17 @@ Alembic migration [`x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py`](../alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py) registers everything, so the schedule is traceable in version control: -- A SQL helper, `public.refresh_pygeoapi_materialized_views()`, that discovers +- A SQL helper, `public.refresh_materialized_views()`, that discovers every materialized view in the public schema from the catalog at run time and runs `REFRESH MATERIALIZED VIEW` for each (plain, non-concurrent — see note). -- A pg_cron job named `refresh-pygeoapi-materialized-views` that runs - `SELECT public.refresh_pygeoapi_materialized_views();` on the schedule +- A pg_cron job named `refresh-materialized-views` that runs + `SELECT public.refresh_materialized_views();` on the schedule `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL and the production image — roughly 02:00–03:00 US Mountain). The helper refreshes whatever materialized views exist, so a view added by a later migration is picked up automatically — there is nothing to keep in sync -and no need to reschedule. (The `oco refresh-pygeoapi-materialized-views` CLI +and no need to reschedule. (The `oco refresh-materialized-views` CLI command, used for manual/on-deploy refreshes, keeps an explicit list in [`services/materialized_views.py`](../services/materialized_views.py).) To change the schedule, edit the migration (or add a new one). 
Do not edit the @@ -73,13 +73,13 @@ Do not use the Docker image; enable pg_cron with the instance flag instead: ```sql -- the registered job SELECT jobid, jobname, schedule, command, active FROM cron.job - WHERE jobname = 'refresh-pygeoapi-materialized-views'; + WHERE jobname = 'refresh-materialized-views'; -- recent run history SELECT status, start_time, end_time, return_message FROM cron.job_run_details WHERE jobid = (SELECT jobid FROM cron.job - WHERE jobname = 'refresh-pygeoapi-materialized-views') + WHERE jobname = 'refresh-materialized-views') ORDER BY start_time DESC LIMIT 5; ``` @@ -89,8 +89,8 @@ Independent of the cron job, the views can be refreshed on demand with the CLI (also useful in development, where the cron job does not exist): ```bash -oco refresh-pygeoapi-materialized-views # all views, plain -oco refresh-pygeoapi-materialized-views --concurrently # no read lock +oco refresh-materialized-views # all views, plain +oco refresh-materialized-views --concurrently # no read lock ``` ### Note on non-concurrent REFRESH diff --git a/services/materialized_views.py b/services/materialized_views.py index e72b70d4e..ec1ae7103 100644 --- a/services/materialized_views.py +++ b/services/materialized_views.py @@ -1,6 +1,6 @@ """Curated materialized-view list for the CLI refresh command. -``oco refresh-pygeoapi-materialized-views`` refreshes these views (in order) +``oco refresh-materialized-views`` refreshes these views (in order) by default. The nightly pg_cron job does NOT use this list -- its SQL helper discovers every materialized view from the catalog at run time (see alembic migration ``x2y3z4a5b6c7``) to stay immutable and self-contained. 
diff --git a/tests/test_cli_commands.py b/tests/test_cli_commands.py index 7a7707f30..f64a81306 100644 --- a/tests/test_cli_commands.py +++ b/tests/test_cli_commands.py @@ -38,7 +38,7 @@ from db.engine import session_ctx -def test_refresh_pygeoapi_materialized_views_defaults(monkeypatch): +def test_refresh_materialized_views_defaults(monkeypatch): executed_sql: list[str] = [] commit_called = {"value": False} @@ -59,7 +59,7 @@ def __exit__(self, exc_type, exc, tb): monkeypatch.setattr("db.engine.session_ctx", lambda: _FakeCtx()) runner = CliRunner() - result = runner.invoke(cli, ["refresh-pygeoapi-materialized-views"]) + result = runner.invoke(cli, ["refresh-materialized-views"]) assert result.exit_code == 0, result.output assert executed_sql == [ @@ -76,7 +76,7 @@ def __exit__(self, exc_type, exc, tb): assert "Refreshed 8 materialized view(s)." in result.output -def test_refresh_pygeoapi_materialized_views_custom_and_concurrently( +def test_refresh_materialized_views_custom_and_concurrently( monkeypatch, ): executed_sql: list[str] = [] @@ -106,7 +106,7 @@ def connect(self): result = runner.invoke( cli, [ - "refresh-pygeoapi-materialized-views", + "refresh-materialized-views", "--view", "ogc_avg_tds_wells", "--concurrently", @@ -120,12 +120,12 @@ def connect(self): ] -def test_refresh_pygeoapi_materialized_views_rejects_invalid_identifier(): +def test_refresh_materialized_views_rejects_invalid_identifier(): runner = CliRunner() result = runner.invoke( cli, [ - "refresh-pygeoapi-materialized-views", + "refresh-materialized-views", "--view", "ogc_avg_tds_wells;drop table thing", ], From 18e8b790bc4881278f2effb260caa0db30d6b6ea Mon Sep 17 00:00:00 2001 From: jirhiker <2035568+jirhiker@users.noreply.github.com> Date: Wed, 17 Jun 2026 17:25:20 +0000 Subject: [PATCH 7/9] Formatting changes --- .../x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git 
a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index 6ac1f66f0..e41523bd3 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -136,9 +136,7 @@ def downgrade() -> None: "SELECT cron.unschedule(jobid) FROM cron.job " "WHERE jobname = :name" ).bindparams(name=CRON_JOB_NAME) ) - op.execute( - text("DROP FUNCTION IF EXISTS public.refresh_materialized_views()") - ) + op.execute(text("DROP FUNCTION IF EXISTS public.refresh_materialized_views()")) # The pg_cron extension itself is left installed: it is a server-level # capability that other jobs may depend on, and dropping it is not the # inverse of "schedule a job". From 2526a8313b3b7b211237ced1397ceb5a902055ac Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 11:26:08 -0600 Subject: [PATCH 8/9] revert(docker): keep db image dev-only, drop pg_cron from it The docker/db image is used only for development, so it should not carry the production pg_cron setup. Revert it to the stock postgis image, remove the start-postgres.sh wrapper, and document pg_cron as Cloud SQL-only in production. 
Co-Authored-By: Claude Fable 5 --- ...chedule_nightly_matview_refresh_pg_cron.py | 4 +-- docker/db/Dockerfile | 30 ------------------ docker/db/start-postgres.sh | 10 ------ docs/pg_cron-nightly-refresh.md | 31 ++++++------------- 4 files changed, 12 insertions(+), 63 deletions(-) delete mode 100644 docker/db/start-postgres.sh diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index e41523bd3..1797b7936 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -44,8 +44,8 @@ LEGACY_FUNCTION_NAME = "refresh_pygeoapi_materialized_views" # Nightly schedule in standard cron syntax. pg_cron interprets this in the -# database server's timezone (UTC on Cloud SQL / the docker image), so 09:00 -# UTC is roughly 02:00-03:00 in US Mountain time -- comfortably off-peak. +# database server's timezone (UTC on Cloud SQL), so 09:00 UTC is roughly +# 02:00-03:00 in US Mountain time -- comfortably off-peak. CRON_SCHEDULE = "0 9 * * *" diff --git a/docker/db/Dockerfile b/docker/db/Dockerfile index ffa2c8864..4a1fbd51b 100644 --- a/docker/db/Dockerfile +++ b/docker/db/Dockerfile @@ -1,31 +1 @@ -# Production database image: PostGIS + pg_cron. -# -# This image is intentionally NOT used by the development docker-compose -# service (which runs the stock postgis/postgis image). pg_cron is a -# production-only dependency required by the nightly materialized-view refresh -# job registered in alembic migration x2y3z4a5b6c7. -# -# pg_cron must be loaded via shared_preload_libraries, and its background -# worker schedules jobs in a single database (cron.database_name). Both are set -# below so the alembic-registered job runs against the application database. -# -# Build/run example: -# docker build -f docker/db/Dockerfile -t ocotillo-db-prod . 
-# docker run -e POSTGRES_DB=ocotilloapi ocotillo-db-prod -# -# On Google Cloud SQL, pg_cron is enabled via the cloudsql.enable_pg_cron flag -# instead of this image; see docs/pg_cron-nightly-refresh.md. FROM postgis/postgis:17-3.5 - -# Install the pg_cron extension for PostgreSQL 17. -RUN apt-get update \ - && apt-get install -y --no-install-recommends postgresql-17-cron \ - && rm -rf /var/lib/apt/lists/* - -# Load pg_cron at server start and point its scheduler at the application -# database. cron.database_name is derived from POSTGRES_DB at start time (see -# start-postgres.sh) so it stays aligned even when POSTGRES_DB is overridden. -ENV POSTGRES_DB=ocotilloapi -COPY docker/db/start-postgres.sh /usr/local/bin/start-postgres.sh -RUN chmod +x /usr/local/bin/start-postgres.sh -ENTRYPOINT ["start-postgres.sh"] diff --git a/docker/db/start-postgres.sh b/docker/db/start-postgres.sh deleted file mode 100644 index 0582b271f..000000000 --- a/docker/db/start-postgres.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh -# Start Postgres with pg_cron preloaded and its scheduler pointed at the -# application database. cron.database_name is derived from POSTGRES_DB so that, -# when the image is run with POSTGRES_DB overridden, pg_cron watches the same -# database the app (and the alembic migration) connects to. -set -e - -exec docker-entrypoint.sh postgres \ - -c shared_preload_libraries=pg_cron \ - -c "cron.database_name=${POSTGRES_DB:-ocotilloapi}" diff --git a/docs/pg_cron-nightly-refresh.md b/docs/pg_cron-nightly-refresh.md index 15b81c634..8c82a9edd 100644 --- a/docs/pg_cron-nightly-refresh.md +++ b/docs/pg_cron-nightly-refresh.md @@ -15,8 +15,8 @@ registers everything, so the schedule is traceable in version control: runs `REFRESH MATERIALIZED VIEW` for each (plain, non-concurrent — see note). 
- A pg_cron job named `refresh-materialized-views` that runs `SELECT public.refresh_materialized_views();` on the schedule - `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL and the - production image — roughly 02:00–03:00 US Mountain). + `0 9 * * *` (09:00 in the **server timezone**, UTC on Cloud SQL — + roughly 02:00–03:00 US Mountain). The helper refreshes whatever materialized views exist, so a view added by a later migration is picked up automatically — there is nothing to keep in sync @@ -43,30 +43,19 @@ So the migration is a **no-op unless `ENABLE_PG_CRON` is truthy**: - Production: `ENABLE_PG_CRON=1` → migration creates the extension, the helper function, and the cron job. Only `CD_production.yml` sets this. -## Production setup +## Production setup (Google Cloud SQL) -### Self-hosted / Docker +Production runs on Cloud SQL, where pg_cron is enabled with an instance flag +(the `docker/db/Dockerfile` image is development-only and does not load pg_cron): -Use the production database image, which installs pg_cron and preloads it: - -- [`docker/db/Dockerfile`](../docker/db/Dockerfile) installs - `postgresql-17-cron` and starts Postgres with - `-c shared_preload_libraries=pg_cron -c cron.database_name=$POSTGRES_DB` - (via [`start-postgres.sh`](../docker/db/start-postgres.sh), so overriding - `POSTGRES_DB` keeps the scheduler pointed at the same database). +1. Set the flag `cloudsql.enable_pg_cron=on` and + `cron.database_name=<application-database-name>`, then restart the instance. +2. Deploy with `ENABLE_PG_CRON=1` (already set on the migration step in + `CD_production.yml`) so the migration registers the job. `cron.database_name` must match the application database so the alembic migration (which connects to that database) can `CREATE EXTENSION pg_cron` and -`cron.schedule(...)` locally. Then deploy with `ENABLE_PG_CRON=1` set for the -app container that runs migrations.
- -### Google Cloud SQL - -Do not use the Docker image; enable pg_cron with the instance flag instead: - -1. Set the flag `cloudsql.enable_pg_cron=on` and - `cron.database_name=`, then restart the instance. -2. Deploy the app with `ENABLE_PG_CRON=1` so the migration registers the job. +`cron.schedule(...)` locally. ## Verifying From ef4335c97a9d7065b4e5d1f32ba830abdb57171a Mon Sep 17 00:00:00 2001 From: jross Date: Wed, 17 Jun 2026 11:30:01 -0600 Subject: [PATCH 9/9] refactor: drop legacy-function cleanup from pg_cron migration The cron job was never deployed, so the migration does not need to drop a previously created refresh_pygeoapi_materialized_views helper. Remove the LEGACY_FUNCTION_NAME constant, the DROP in upgrade, and the related note. Co-Authored-By: Claude Fable 5 --- ...4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py index 1797b7936..0e50fdef9 100644 --- a/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py +++ b/alembic/versions/x2y3z4a5b6c7_schedule_nightly_matview_refresh_pg_cron.py @@ -7,10 +7,6 @@ this migration stays immutable and self-contained, and views added by later migrations are refreshed without any rescheduling. -This also drops the legacy ``refresh_pygeoapi_materialized_views`` helper -(created by ``d5e6f7a8b9c0``) on databases that already ran that revision, -folding it into the generically named function. - pg_cron is a *production-only* dependency. It requires the extension to be loaded via ``shared_preload_libraries`` on the database server, which the development docker-compose Postgres image does not do. To avoid breaking @@ -40,9 +36,6 @@ # Name of the pg_cron job. Used to (re)register and to unschedule. 
CRON_JOB_NAME = "refresh-materialized-views" -# Legacy helper created by d5e6f7a8b9c0, superseded by refresh_materialized_views. -LEGACY_FUNCTION_NAME = "refresh_pygeoapi_materialized_views" - # Nightly schedule in standard cron syntax. pg_cron interprets this in the # database server's timezone (UTC on Cloud SQL), so 09:00 UTC is roughly # 02:00-03:00 in US Mountain time -- comfortably off-peak. @@ -101,9 +94,6 @@ def upgrade() -> None: # (Re)create the refresh helper. op.execute(text(_REFRESH_FUNCTION_SQL)) - # Remove the legacy helper on databases that already ran d5e6f7a8b9c0. - op.execute(text(f"DROP FUNCTION IF EXISTS public.{LEGACY_FUNCTION_NAME}()")) - # Drop any previously registered job with the same name so re-running this # migration (or a re-deploy) does not accumulate duplicate schedules. op.execute(