diff --git a/.gitignore b/.gitignore index c1d8db1ee..4bf6245e0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,13 @@ dist/ wheels/ *.egg-info +# Test coverage reports +*.cover +.coverage +.coverage.* +htmlcov/ +coverage.xml + # Virtual environments .venv requirements.txt @@ -25,11 +32,11 @@ launcher.sh gcs_credentials.json transfers/data/assets* transfers/data/nma_csv_cache/* +transfers/data/*.csv transfers/transfer*.log transfer*.log transfers/data/nma_csv_cache/* !transfers/data/nma_csv_cache/.gitkeep -tests/features/*.feature transfers/metrics/* transfers/logs/* run_bdd-local.sh diff --git a/db/location.py b/db/location.py index 50b1aa0db..fda4611f9 100644 --- a/db/location.py +++ b/db/location.py @@ -23,6 +23,7 @@ String, ForeignKey, DateTime, + Date, func, Text, ) @@ -61,6 +62,18 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi nma_notes_location: Mapped[str] = mapped_column(Text, nullable=True) nma_coordinate_notes: Mapped[str] = mapped_column(Text, nullable=True) + # --- AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- + nma_date_created: Mapped[datetime.date] = mapped_column( + Date, + nullable=True, + comment="Original AMPAPI DateCreated (read-only, populated only during migration)", + ) + nma_site_date: Mapped[datetime.date] = mapped_column( + Date, + nullable=True, + comment="Original AMPAPI SiteDate (read-only, populated only during migration)", + ) + # --- Relationship Definitions --- thing_associations: Mapped[list["LocationThingAssociation"]] = relationship( back_populates="location", cascade="all, delete-orphan" diff --git a/schemas/location.py b/schemas/location.py index e911e3359..17414b5c4 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +from datetime import date from typing import List from geoalchemy2 import WKBElement @@ -106,6 +107,9 @@ class GeoJSONProperties(BaseModel): default_factory=GeoJSONUTMCoordinates ) notes: list[NoteResponse] = [] + # AMPAPI date fields (read-only, populated only during migration) + nma_date_created: date | None = None + nma_site_date: date | None = None model_config = ConfigDict( from_attributes=True, @@ -150,6 +154,9 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["notes"] = data_dict.get("notes") data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") + # populate AMPAPI date fields + data_dict["properties"]["nma_date_created"] = data_dict.get("nma_date_created") + data_dict["properties"]["nma_site_date"] = data_dict.get("nma_site_date") # populate UTM coordinates point_utm_zone_13n_wkt = transform_srid( @@ -181,6 +188,10 @@ class LocationResponse(BaseResponseModel): county: str | None quad_name: str | None + # AMPAPI date fields (read-only, populated only during migration, not in Create/Update schemas) + nma_date_created: date | None = None + nma_site_date: date | None = None + @field_validator("point", mode="before") def point_to_wkt(cls, value): if isinstance(value, WKBElement): diff --git a/schemas/thing.py b/schemas/thing.py index cf8c3ef2b..692b78459 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +from datetime import date from typing import List from pydantic import BaseModel, model_validator, Field, field_validator diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature new file mode 100644 index 000000000..13b2b347d --- /dev/null +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -0,0 +1,94 @@ +Feature: Post-Migration AMPAPI Date Field Retrieval + As a data manager + After migrating data from AMPAPI to NMSampleLocations + I want to verify that all AMPAPI temporal information is preserved and queryable + So that no historical context is lost + + Background: + Given a functioning api + And the AMPAPI data has been migrated to the database + + # Location AMPAPI Date Lookups (Read-Only Fields) + + Scenario: Retrieve location with both AMPAPI date fields via API + Given a location exists with: + | field | value | + | nma_date_created | 2014-04-03 | + | nma_site_date | 2002-12-10 | + When I retrieve that location via the API + Then the response should include nma_date_created as "2014-04-03" + And the response should include nma_site_date as "2002-12-10" + And the time gap should be approximately 11.3 years + + Scenario: Retrieve location with large time gap (54 years) + Given a location exists with: + | field | value | + | nma_date_created | 2008-05-28 | + | nma_site_date | 1954-05-01 | + When I retrieve that location via the API + Then the response should include nma_date_created as "2008-05-28" + And the response should include nma_site_date as "1954-05-01" + And the time gap should be approximately 54 years + + Scenario: List all locations includes AMPAPI date fields + Given 5 locations exist with various AMPAPI dates + When I GET /location to list all locations + Then each location should have a date created field + And each location should have a site date field + And some 
locations should have null site date + + Scenario: Filter locations by AMPAPI site date range + Given locations exist with nma_site_date ranging from 1950 to 2024 + When I filter locations where nma_site_date is between "2000-01-01" and "2010-12-31" + Then the response should only include locations with site date in that decade + And locations with site date before 2000 should not be included + And locations with site date after 2010 should not be included + + Scenario: Query location by nma_date_created + Given 3 locations exist with nma_date_created "2014-04-03" + And 2 locations exist with nma_date_created "2017-12-06" + When I query for locations with nma_date_created "2014-04-03" + Then the response should include exactly 3 locations + And all should have nma_date_created "2014-04-03" + + # Data Quality Validation + + Scenario: Verify migration preserved expected percentage of AMPAPI dates + Given 100 locations were migrated + And 9 of them had non-null SiteDate in AMPAPI + When I query the migrated locations + Then 9% should have non-null nma_site_date + And 100% should have non-null nma_date_created + + # Audit Trail Verification + + Scenario: AMPAPI dates preserved alongside audit timestamps + Given a location was migrated with AMPAPI dates + When I retrieve that location + Then it should have created_at (new system timestamp from migration) + And it should have nma_date_created (original AMPAPI DateCreated) + And it should have nma_site_date (original AMPAPI SiteDate) + And all three timestamps should be independently queryable + And created_at should be a recent timestamp + And nma_date_created should be an older date + + # Edge Cases + + Scenario: Location where SiteDate is later than DateCreated (data anomaly) + Given a location exists with: + | field | value | + | nma_date_created | 2010-01-15 | + | nma_site_date | 2015-06-20 | + When I retrieve that location + Then nma_date_created should be "2010-01-15" + And nma_site_date should be "2015-06-20" + 
And the system should accept this without error + + Scenario: Location with only nma_date_created (no nma_site_date) + Given a location exists with: + | field | value | + | nma_date_created | 2014-10-17 | + | nma_site_date | null | + When I retrieve that location + Then nma_date_created should be "2014-10-17" + And nma_site_date should be null diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py new file mode 100644 index 000000000..185b1a758 --- /dev/null +++ b/tests/features/steps/post_migration_legacy_data.py @@ -0,0 +1,453 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== +from datetime import date, datetime, timezone +from behave import given, when, then, register_type +from behave.runner import Context +import parse + +from db import Location, Thing, LocationThingAssociation +from db.engine import session_ctx + + +# Custom type parsers +@parse.with_pattern(r"\d+") +def parse_number(text): + return int(text) + + +register_type(Number=parse_number) + + +def create_test_location(nma_date_created=None, nma_site_date=None): + """Helper to create a test location with AMPAPI date fields (read-only post-migration).""" + with session_ctx() as session: + location = Location( + point="POINT(-106.607784 35.118924)", + elevation=1558.8, + release_status="public", + nma_date_created=nma_date_created, + nma_site_date=nma_site_date, + ) + session.add(location) + session.commit() + session.refresh(location) + return location + + +@given("the AMPAPI data has been migrated to the database") +def step_given_data_migrated(context: Context): + """Assumption that migration has occurred.""" + context.migrated = True + + +@given("a location exists with") +def step_given_location_with_table(context: Context): + """Create location with fields from table.""" + data = {row["field"]: row["value"] for row in context.table} + + nma_date_created = ( + date.fromisoformat(data["nma_date_created"]) + if data.get("nma_date_created") and data["nma_date_created"] != "null" + else None + ) + nma_site_date = ( + date.fromisoformat(data["nma_site_date"]) + if data.get("nma_site_date") and data["nma_site_date"] != "null" + else None + ) + + location = create_test_location( + nma_date_created=nma_date_created, nma_site_date=nma_site_date + ) + + context.test_location = location + context.test_location_id = location.id + + +@given("{count:Number} locations exist with various AMPAPI dates") +def step_given_multiple_locations(context: Context, count: int): + """Create multiple locations with 
various legacy dates.""" + context.test_locations = [] + + test_data = [ + ("2014-04-03", "2002-12-10"), + ("2014-04-03", "2003-01-07"), + ("2017-12-06", "2003-12-11"), + ("2008-05-28", "1954-05-01"), + ("2020-01-15", None), + ] + + for i in range(min(count, len(test_data))): + created_date, site_date = test_data[i] + location = create_test_location( + nma_date_created=date.fromisoformat(created_date), + nma_site_date=(date.fromisoformat(site_date) if site_date else None), + ) + context.test_locations.append(location) + + +@given( + "locations exist with nma_site_date ranging from {start_year:Number} to {end_year:Number}" +) +def step_given_locations_date_range(context: Context, start_year: int, end_year: int): + """Create locations with nma_site_date across a date range.""" + context.test_locations = [] + + years = [1954, 2002, 2003, 2010, 2015, 2020, 2024] + for year in years: + location = create_test_location( + nma_date_created=date(year + 5, 1, 1), # Always 5 years after site date + nma_site_date=date(year, 6, 15), + ) + context.test_locations.append(location) + + +@given('{count:Number} locations exist with nma_date_created "{target_date}"') +def step_given_locations_with_specific_date( + context: Context, count: int, target_date: str +): + """Create locations with specific nma_date_created.""" + if not hasattr(context, "test_locations"): + context.test_locations = [] + + target = date.fromisoformat(target_date) + + for i in range(count): + location = create_test_location( + nma_date_created=target, + nma_site_date=date(2000 + i, 1, 1), # Vary the site dates + ) + context.test_locations.append(location) + + +@given("{count:Number} locations were migrated") +def step_given_count_locations_migrated(context: Context, count: int): + """Create specified number of test locations.""" + context.test_locations = [] + + for i in range(count): + # 9% have nma_site_date + has_site_date = i < count * 0.09 + + location = create_test_location( + nma_date_created=date(2014, 
1, i % 28 + 1), + nma_site_date=date(2003, 1, i % 28 + 1) if has_site_date else None, + ) + context.test_locations.append(location) + + +@given("{count:Number} of them had non-null SiteDate in AMPAPI") +def step_given_sitedate_count(context: Context, count: int): + """Declarative - data created in previous step.""" + pass + + +@given("a location was migrated with AMPAPI dates") +def step_given_location_migrated_with_dates(context: Context): + """Create location with both AMPAPI dates.""" + location = create_test_location( + nma_date_created=date(2014, 4, 3), nma_site_date=date(2002, 12, 10) + ) + context.test_location = location + + +# WHEN steps + + +@when("I retrieve that location via the API") +def step_when_retrieve_location_api(context: Context): + """Retrieve location via GET API.""" + response = context.client.get(f"/location/{context.test_location_id}") + assert response.status_code == 200 + context.location_response = response.json() + + +@when("I GET /location to list all locations") +def step_when_get_all_locations(context: Context): + """Get all locations.""" + response = context.client.get("/location") + assert response.status_code == 200 + context.locations_response = response.json() + + +@when( + 'I filter locations where nma_site_date is between "{start_date}" and "{end_date}"' +) +def step_when_filter_locations(context: Context, start_date: str, end_date: str): + """Filter locations by date range.""" + # Since API may not support this yet, query database directly + with session_ctx() as session: + start = date.fromisoformat(start_date) + end = date.fromisoformat(end_date) + + locations = ( + session.query(Location) + .filter(Location.nma_site_date >= start, Location.nma_site_date <= end) + .all() + ) + + context.filtered_locations = locations + + +@when('I query for locations with nma_date_created "{target_date}"') +def step_when_query_by_ampapi_date(context: Context, target_date: str): + """Query locations by nma_date_created.""" + with 
session_ctx() as session: + target = date.fromisoformat(target_date) + locations = ( + session.query(Location).filter(Location.nma_date_created == target).all() + ) + context.queried_locations = locations + + +@when("I query the migrated locations") +def step_when_query_migrated_locations(context: Context): + """Query all test locations.""" + with session_ctx() as session: + # Query only our test locations + location_ids = [loc.id for loc in context.test_locations] + locations = session.query(Location).filter(Location.id.in_(location_ids)).all() + context.queried_locations = locations + + +@when("I retrieve that location") +def step_when_retrieve_location(context: Context): + """Retrieve location by ID.""" + with session_ctx() as session: + location = session.get(Location, context.test_location.id) + context.retrieved_location = location + + +# THEN steps + + +@then('the response should include nma_date_created as "{expected_date}"') +def step_then_nma_date_created(context: Context, expected_date: str): + """Assert nma_date_created matches.""" + actual = context.location_response.get("nma_date_created") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then('the response should include nma_site_date as "{expected_date}"') +def step_then_nma_site_date(context: Context, expected_date: str): + """Assert nma_site_date matches.""" + actual = context.location_response.get("nma_site_date") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then("the time gap should be approximately {years} years") +def step_then_time_gap_years(context: Context, years: str): + """Assert approximate year gap.""" + created_str = context.location_response.get("nma_date_created") + site_date_str = context.location_response.get("nma_site_date") + + if not created_str or not site_date_str: + raise AssertionError("Missing date fields for gap calculation") + + created_date = date.fromisoformat(created_str) + site_date = 
date.fromisoformat(site_date_str) + + gap_days = (created_date - site_date).days + gap_years = gap_days / 365.25 + + expected_years = float(years) + tolerance = 0.5 + assert ( + abs(gap_years - expected_years) < tolerance + ), f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" + + +@then("each location should have a date created field") +def step_then_all_have_date_created_field(context: Context): + """Assert all locations have the date created field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "nma_date_created" in item, f"Location missing nma_date_created" + + +@then("each location should have a site date field") +def step_then_all_have_site_date_field(context: Context): + """Assert all locations have the site date field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "nma_site_date" in item, f"Location missing nma_site_date" + + +@then("some locations should have null site date") +def step_then_some_null_site_date(context: Context): + """Assert some locations have null site date.""" + items = context.locations_response.get("items", []) + null_count = sum(1 for item in items if item.get("nma_site_date") is None) + assert null_count > 0, "Expected at least one location with null site date" + + +@then("the response should only include locations with site date in that decade") +def step_then_locations_in_decade(context: Context): + """Assert filtered locations are in range.""" + for loc in context.filtered_locations: + assert ( + 2000 <= loc.nma_site_date.year <= 2010 + ), f"Location not in 2000-2010: {loc.nma_site_date}" + + +@then("locations with site date before {year:Number} should not be included") +def step_then_locations_before_excluded(context: Context, year: int): + """Assert no locations before year.""" + for loc in context.filtered_locations: + assert ( + loc.nma_site_date.year >= year + ), f"Location from {loc.nma_site_date.year} should not be included" + + 
+@then("locations with site date after {year:Number} should not be included") +def step_then_locations_after_excluded(context: Context, year: int): + """Assert no locations after year.""" + for loc in context.filtered_locations: + assert ( + loc.nma_site_date.year <= year + ), f"Location from {loc.nma_site_date.year} should not be included" + + +@then("the response should include exactly {count:Number} locations") +def step_then_exact_count_locations(context: Context, count: int): + """Assert exact count.""" + actual = len(context.queried_locations) + assert actual == count, f"Expected {count} locations, got {actual}" + + +@then('all should have nma_date_created "{expected_date}"') +def step_then_all_have_date(context: Context, expected_date: str): + """Assert all have same date.""" + expected = date.fromisoformat(expected_date) + for loc in context.queried_locations: + assert ( + loc.nma_date_created == expected + ), f"Location has {loc.nma_date_created}, expected {expected}" + + +@then("{percentage:Number}% should have non-null nma_site_date") +def step_then_percentage_site_date(context: Context, percentage: int): + """Assert percentage with nma_site_date.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.nma_site_date) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("{percentage:Number}% should have non-null nma_date_created") +def step_then_percentage_legacy(context: Context, percentage: int): + """Assert percentage with nma_date_created.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.nma_date_created) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("it should have created_at (new system 
timestamp from migration)") +def step_then_has_created_at(context: Context): + """Assert created_at exists.""" + assert context.retrieved_location.created_at is not None + + +@then("it should have nma_date_created (original AMPAPI DateCreated)") +def step_then_has_ampapi_date_created(context: Context): + """Assert nma_date_created exists.""" + assert context.retrieved_location.nma_date_created is not None + + +@then("it should have nma_site_date (original AMPAPI SiteDate)") +def step_then_has_site_date(context: Context): + """Assert nma_site_date exists.""" + assert context.retrieved_location.nma_site_date is not None + + +@then("all three timestamps should be independently queryable") +def step_then_all_queryable(context: Context): + """Assert all fields are queryable.""" + assert hasattr(context.retrieved_location, "created_at") + assert hasattr(context.retrieved_location, "nma_date_created") + assert hasattr(context.retrieved_location, "nma_site_date") + + +@then("created_at should be a recent timestamp") +def step_then_created_at_recent(context: Context): + """Assert created_at is recent.""" + created_at = context.retrieved_location.created_at + now = datetime.now(timezone.utc) + + # created_at should always be timezone-aware (configured in AutoBaseMixin with DateTime(timezone=True)) + # If it's naive, this indicates a database/ORM configuration issue + if created_at.tzinfo is None: + raise AssertionError( + "created_at is a naive datetime (no timezone info). " + "Check ORM/database config for timezone-aware UTC datetimes (see AutoBaseMixin.created_at)." 
+ ) + + diff_seconds = abs((now - created_at).total_seconds()) + assert diff_seconds < 3600, "created_at should be within last hour" + + +@then("nma_date_created should be an older date") +def step_then_ampapi_date_older(context: Context): + """Assert nma_date_created is old.""" + ampapi_created_date = context.retrieved_location.nma_date_created + assert ampapi_created_date.year < 2024, "nma_date_created should be from the past" + + +@then('nma_date_created should be "{expected_date}"') +def step_then_ampapi_created_is(context: Context, expected_date: str): + """Assert nma_date_created value.""" + actual = context.retrieved_location.nma_date_created + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then('nma_site_date should be "{expected_date}"') +def step_then_site_date_is(context: Context, expected_date: str): + """Assert nma_site_date value.""" + actual = context.retrieved_location.nma_site_date + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then("the system should accept this without error") +def step_then_no_error(context: Context): + """Assert no errors.""" + # If we got here, no errors + pass + + +@then("nma_site_date should be null") +def step_then_site_date_null(context: Context): + """Assert nma_site_date is null.""" + assert context.retrieved_location.nma_site_date is None + + +@then("the well should still be valid") +def step_then_well_valid(context: Context): + """Assert well is valid.""" + assert context.retrieved_well.id is not None + + +# ============= EOF ============================================= diff --git a/tests/test_location.py b/tests/test_location.py index 4b6ec6faa..9dcb3d098 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -235,4 +235,83 @@ def test_delete_location_404_not_found(second_location): assert data["detail"] == f"Location with ID {bad_location_id} not found." 
+# ============= AMPAPI date field tests ======================================= + + +def test_new_location_has_null_ampapi_fields(): + """Test that newly created locations have null AMPAPI date fields (AMPAPI fields are migration-only)""" + payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + response = client.post("/location", json=payload) + + assert response.status_code == 201 + data = response.json() + assert "id" in data + # AMPAPI date fields should be present in response but null (not set during creation, read-only) + assert "nma_date_created" in data + assert "nma_site_date" in data + assert data["nma_date_created"] is None + assert data["nma_site_date"] is None + + # cleanup after test + cleanup_post_test(Location, data["id"]) + + +def test_ampapi_fields_present_in_location_response(): + """Test that AMPAPI date fields (read-only) are included in location GET response""" + # Create a new location (without AMPAPI date fields set - they're read-only) + payload = { + "point": "POINT (-106.607784 35.118924)", + "elevation": 1558.8, + "release_status": "draft", + } + create_response = client.post("/location", json=payload) + assert create_response.status_code == 201 + location_id = create_response.json()["id"] + + # Retrieve the location and verify AMPAPI date fields are in the schema + get_response = client.get(f"/location/{location_id}") + assert get_response.status_code == 200 + data = get_response.json() + + # Verify read-only fields exist in response (even if null) + assert "nma_date_created" in data + assert "nma_site_date" in data + assert data["nma_date_created"] is None + assert data["nma_site_date"] is None + + # cleanup after test + cleanup_post_test(Location, location_id) + + +def test_ampapi_fields_independent_of_created_at(): + """Test that created_at (system timestamp) is separate from AMPAPI date fields (read-only)""" + payload = { + "point": "POINT (-106.607784 35.118924)", + 
"elevation": 1558.8, + "release_status": "draft", + } + response = client.post("/location", json=payload) + + assert response.status_code == 201 + data = response.json() + + # created_at is automatically set by AutoBaseMixin + assert "created_at" in data + assert data["created_at"] is not None + + # nma_date_created is separate and null for new records (read-only, populated only during migration) + assert "nma_date_created" in data + assert data["nma_date_created"] is None + + # These are independent fields with different purposes + assert data["created_at"] != data["nma_date_created"] + + # cleanup after test + cleanup_post_test(Location, data["id"]) + + # ============= EOF ============================================= diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py new file mode 100644 index 000000000..985214fbb --- /dev/null +++ b/tests/test_transfer_legacy_dates.py @@ -0,0 +1,354 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +""" +Unit tests for AMPAPI date field population during AMPAPI → NMSampleLocations migration. + +These tests verify that: +1. Location.nma_date_created is populated from CSV DateCreated (read-only post-migration) +2. 
Location.nma_site_date is populated from CSV SiteDate if not null (read-only post-migration) +""" +import datetime +from unittest.mock import Mock, patch, MagicMock +import pandas as pd +import pytest + +from transfers.util import make_location + + +# ============================================================================ +# FIXTURES +# ============================================================================ + + +@pytest.fixture +def mock_lexicon_mapper(): + """Fixture to mock lexicon_mapper for all transfer tests""" + with patch("transfers.util.lexicon_mapper") as mock: + mock.map_value.return_value = "GPS" + yield mock + + +# ============================================================================ +# LOCATION AMPAPI DATE TESTS (Read-Only Post-Migration) +# ============================================================================ + + +def test_make_location_with_both_ampapi_dates(mock_lexicon_mapper): + """Test that make_location populates both nma_date_created and nma_site_date""" + + # Create a mock CSV row with both DateCreated and SiteDate + row = pd.Series( + { + "PointID": "TEST-001", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 1, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + + # Call make_location + location, elevation_method = make_location(row, elevations) + + # Verify nma_date_created is set from DateCreated + assert location.nma_date_created is not None + assert location.nma_date_created == datetime.date(2014, 4, 3) + + # Verify nma_site_date is set from SiteDate + assert location.nma_site_date is not None + assert location.nma_site_date == datetime.date(2002, 12, 10) + + # Verify created_at is NOT set during migration (it's auto-set by AutoBaseMixin on save) + assert 
location.created_at is None + + +def test_make_location_with_only_date_created(mock_lexicon_mapper): + """Test that make_location handles locations with only DateCreated (no SiteDate)""" + row = pd.Series( + { + "PointID": "TEST-002", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 2, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify nma_date_created is set + assert location.nma_date_created == datetime.date(2014, 4, 3) + + # Verify nma_site_date is null (91% of locations don't have SiteDate) + assert location.nma_site_date is None + + +def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mapper): + """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" + row = pd.Series( + { + "PointID": "TEST-003", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2010-01-15 00:00:00.000", + "SiteDate": "2015-06-20 00:00:00.000", # Later than DateCreated (anomaly) + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 3, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both dates should be preserved as-is, regardless of order + assert location.nma_date_created == datetime.date(2010, 1, 15) + assert location.nma_site_date == datetime.date(2015, 6, 20) + + +def test_make_location_with_very_old_site_date(mock_lexicon_mapper): + """Test that very old SiteDates (1950s) are preserved correctly""" + row = pd.Series( + { + "PointID": "SM-0227", # Real example from dataset + "Easting": 350000, + "Northing": 3880000, + 
"DateCreated": "2008-05-28 00:00:00.000", + "SiteDate": "1954-05-01 00:00:00.000", # 54 years earlier! + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 4, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify very old date is preserved + assert location.nma_site_date == datetime.date(1954, 5, 1) + assert location.nma_date_created == datetime.date(2008, 5, 28) + + # Verify 54-year time gap + time_gap = (location.nma_date_created - location.nma_site_date).days + assert time_gap == 19751 # Approximately 54 years + + +def test_make_location_ampapi_dates_are_date_not_datetime(mock_lexicon_mapper): + """Test that AMPAPI date fields are Date type (not DateTime)""" + row = pd.Series( + { + "PointID": "TEST-004", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 10:30:45.123", # Has time component + "SiteDate": "2002-12-10 14:22:33.456", # Has time component + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 5, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify they are date objects (not datetime) + assert isinstance(location.nma_date_created, datetime.date) + assert not isinstance(location.nma_date_created, datetime.datetime) + + assert isinstance(location.nma_site_date, datetime.date) + assert not isinstance(location.nma_site_date, datetime.datetime) + + # Verify time component is stripped + assert location.nma_date_created == datetime.date(2014, 4, 3) + assert location.nma_site_date == datetime.date(2002, 12, 10) + + +def test_make_location_ampapi_dates_independent_of_created_at(mock_lexicon_mapper): + """Test that AMPAPI dates don't affect created_at timestamp""" + 
row = pd.Series( + { + "PointID": "TEST-005", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 6, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # created_at should be None during transfer (auto-set by AutoBaseMixin on save) + assert location.created_at is None + + # AMPAPI date fields should be Date (no timezone) + assert isinstance(location.nma_date_created, datetime.date) + assert isinstance(location.nma_site_date, datetime.date) + + # AMPAPI date fields should be populated + assert location.nma_date_created is not None + assert location.nma_site_date is not None + + +# ============================================================================ +# DATA COVERAGE TESTS (Simulating Migration Statistics) +# ============================================================================ + + +def test_make_location_with_no_ampapi_dates(mock_lexicon_mapper): + """Test that make_location handles locations with no AMPAPI dates (both null)""" + row = pd.Series( + { + "PointID": "TEST-NODATES", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": None, # No DateCreated + "SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 999, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null + assert location.nma_date_created is None + assert location.nma_site_date is None + + +def test_make_location_with_empty_string_dates(mock_lexicon_mapper): + """Test that make_location handles empty string dates (CSV edge case)""" + 
row = pd.Series( + { + "PointID": "TEST-EMPTY", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "", # Empty string + "SiteDate": "", # Empty string + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 998, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null (empty strings are falsy) + assert location.nma_date_created is None + assert location.nma_site_date is None + + +def test_location_ampapi_date_coverage_statistics(mock_lexicon_mapper): + """Test that migration preserves expected percentages of AMPAPI dates""" + + def create_test_row(i, has_site_date): + """Helper to create test row with common fields""" + return pd.Series( + { + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000" if has_site_date else None, + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + # Simulate 100 location records from CSV (9% with SiteDate, 91% without) + locations_created = 0 + locations_with_site_date = 0 + elevations = {} + + for i in range(100): + row = create_test_row(i, has_site_date=(i < 9)) + location, _ = make_location(row, elevations) + + # Count coverage + if location.nma_date_created is not None: + locations_created += 1 + if location.nma_site_date is not None: + locations_with_site_date += 1 + + # Verify expected coverage + assert locations_created == 100 # 100% should have nma_date_created + assert locations_with_site_date == 9 # 9% should have nma_site_date + + +# ============================================================================ +# EOF +# 
============================================================================ diff --git a/transfers/util.py b/transfers/util.py index cbf0f2b17..876e142fc 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -59,20 +59,28 @@ def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: + # Try to read from local data directory first + local_file = Path(__file__).parent / "data" / f"{name}.csv" + + if local_file.exists(): + logger.info(f"Reading {name} from local file: {local_file}") + return pd.read_csv(local_file, dtype=dtype) + + # Check cache directory p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") if os.path.exists(p): + logger.info(f"Reading {name} from cache: {p}") return pd.read_csv(p, dtype=dtype) + # Fall back to GCS if neither the local file nor the cache exists + logger.info(f"Local file and cache not found, reading {name} from GCS") bucket = get_storage_bucket() blob = bucket.blob(f"nma_csv/{name}.csv") data = blob.download_as_bytes() with open(p, "wb") as f: f.write(data) - if dtype: - return pd.read_csv(io.BytesIO(data), dtype=dtype) - else: - return pd.read_csv(io.BytesIO(data)) + return pd.read_csv(io.BytesIO(data), dtype=dtype) def get_valid_point_ids(session, thing_type="water well"): @@ -200,33 +208,6 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: point, source_srid=SRID_UTM_ZONE_13N, target_srid=SRID_WGS84 ) - """ - Developer's notes - - AMP folks said that the earlier date between DateCreated and SiteDate is when - the site was inventoried, whereas the later is when the record was made in - the database. This was because they were used interchangeably. 
- """ - if row.DateCreated and row.SiteDate: - - date_created = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - - if date_created > site_date: - created_at = date_created - else: - created_at = site_date - elif row.DateCreated and not row.SiteDate: - created_at = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - elif not row.DateCreated and row.SiteDate: - created_at = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - else: - created_at = None - - # convert created_at from MST/MDT to UTC - if created_at is not None: - created_at = convert_mt_to_utc(created_at) - z = row.Altitude if z: elevation_from_epqs = False @@ -257,14 +238,26 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" ) + # Extract AMPAPI date fields (Date type, not DateTime) + nma_date_created = None + if row.DateCreated: + nma_date_created = datetime.strptime( + row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" + ).date() + + nma_site_date = None + if row.SiteDate: + nma_site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f").date() + location = Location( nma_pk_location=row.LocationId, point=transformed_point.wkt, elevation=z, release_status="public" if row.PublicRelease else "private", - created_at=created_at, nma_coordinate_notes=row.CoordinateNotes, nma_notes_location=row.LocationNotes, + nma_date_created=nma_date_created, + nma_site_date=nma_site_date, ) return location, elevation_method