diff --git a/.gitignore b/.gitignore index c1d8db1ee..4bf6245e0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,13 @@ dist/ wheels/ *.egg-info +# Test coverage reports +*.cover +.coverage +.coverage.* +htmlcov/ +coverage.xml + # Virtual environments .venv requirements.txt @@ -25,11 +32,11 @@ launcher.sh gcs_credentials.json transfers/data/assets* transfers/data/nma_csv_cache/* +transfers/data/*.csv transfers/transfer*.log transfer*.log transfers/data/nma_csv_cache/* !transfers/data/nma_csv_cache/.gitkeep -tests/features/*.feature transfers/metrics/* transfers/logs/* run_bdd-local.sh diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d74e6a6c..b4dba7bf8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,6 +25,8 @@ repos: types: [python] # Specify relevant file types for your tests pass_filenames: false always_run: true + args: + - -x # - repo: https://github.com/pre-commit/mirrors-mypy # rev: v1.10.0 # Use the latest stable version or pin to your preference diff --git a/core/enums.py b/core/enums.py index 568f3f96a..91b206cab 100644 --- a/core/enums.py +++ b/core/enums.py @@ -24,7 +24,9 @@ ) CasingMaterial: type[Enum] = build_enum_from_lexicon_category("casing_material") CollectionMethod: type[Enum] = build_enum_from_lexicon_category("collection_method") -ConstructionMethod: type[Enum] = build_enum_from_lexicon_category("construction_method") +WellConstructionMethod: type[Enum] = build_enum_from_lexicon_category( + "well_construction_method" +) ContactType: type[Enum] = build_enum_from_lexicon_category("contact_type") CoordinateMethod: type[Enum] = build_enum_from_lexicon_category("coordinate_method") WellPurpose: type[Enum] = build_enum_from_lexicon_category("well_purpose") @@ -68,8 +70,14 @@ Vertical_datum: type[Enum] = build_enum_from_lexicon_category("vertical_datum") ScreenType: type[Enum] = build_enum_from_lexicon_category("screen_type") SensorType: type[Enum] = build_enum_from_lexicon_category("sensor_type") 
+WellPumpType: type[Enum] = build_enum_from_lexicon_category("well_pump_type") +PermissionType: type[Enum] = build_enum_from_lexicon_category("permission_type") GroupType: type[Enum] = build_enum_from_lexicon_category("group_type") MonitoringFrequency: type[Enum] = build_enum_from_lexicon_category( "monitoring_frequency" ) +AquiferType: type[Enum] = build_enum_from_lexicon_category("aquifer_type") +GeographicScale: type[Enum] = build_enum_from_lexicon_category("geographic_scale") +Lithology: type[Enum] = build_enum_from_lexicon_category("lithology") +FormationCode: type[Enum] = build_enum_from_lexicon_category("formation_code") # ============= EOF ============================================= diff --git a/core/formations.json b/core/formations.json new file mode 100644 index 000000000..e69de29bb diff --git a/core/lexicon.json b/core/lexicon.json index 9aa9b88ae..423be5332 100644 --- a/core/lexicon.json +++ b/core/lexicon.json @@ -2,9 +2,10 @@ {"name": "activity_type", "description": null}, {"name": "address_type", "description": null}, {"name": "analysis_method_type", "description": null}, + {"name": "aquifer_type", "description": null}, {"name": "casing_material", "description": null}, {"name": "collection_method", "description": null}, - {"name": "construction_method", "description": null}, + {"name": "well_construction_method", "description": null}, {"name": "contact_type", "description": null}, {"name": "coordinate_method", "description": null}, {"name": "country", "description": null}, @@ -18,6 +19,7 @@ {"name": "email_type", "description": null}, {"name": "participant_role", "description": null}, {"name": "geochronology", "description": null}, + {"name": "geographic_scale", "description": null}, {"name": "groundwater_level_reason", "description": null}, {"name": "group_type", "description": null}, {"name": "horizontal_datum", "description": null}, @@ -51,7 +53,11 @@ {"name": "well_purpose", "description": null}, {"name": "status_type", "description": null}, 
{"name": "status_value", "description": null}, - {"name": "origin_source", "description": null} + {"name": "origin_source", "description": null}, + {"name": "well_pump_type", "description": null}, + {"name": "permission_type", "description": null}, + {"name": "formation_code", "description": null}, + {"name": "lithology", "description": null} ], "terms": [ {"categories": ["review_status"], "term": "approved", "definition": "approved"}, @@ -78,15 +84,15 @@ {"categories": ["elevation_method"], "term": "Reported", "definition": "Reported"}, {"categories": ["elevation_method"], "term": "Survey-grade Global Navigation Satellite Sys, Lvl1", "definition": "Survey-grade Global Navigation Satellite Sys, Lvl1"}, {"categories": ["elevation_method"], "term": "USGS National Elevation Dataset (NED)", "definition": "USGS National Elevation Dataset (NED)"}, - {"categories": ["elevation_method", "sample_method", "coordinate_method", "well_purpose", "status", "organization", "role"], "term": "Unknown", "definition": "Unknown"}, - {"categories": ["construction_method"], "term": "Air-rotary", "definition": "Air-rotary"}, - {"categories": ["construction_method"], "term": "Bored or augered", "definition": "Bored or augered"}, - {"categories": ["construction_method"], "term": "Cable-tool", "definition": "Cable-tool"}, - {"categories": ["construction_method"], "term": "Hydraulic rotary (mud or water)", "definition": "Hydraulic rotary (mud or water)"}, - {"categories": ["construction_method"], "term": "Air percussion", "definition": "Air percussion"}, - {"categories": ["construction_method"], "term": "Reverse rotary", "definition": "Reverse rotary"}, - {"categories": ["construction_method"], "term": "Driven", "definition": "Driven"}, - {"categories": ["construction_method", "measurement_method"], "term": "Other (explain in notes)", "definition": "Other (explain in notes)"}, + {"categories": ["elevation_method", "sample_method", "coordinate_method", "well_purpose", "status", "organization", 
"role", "aquifer_type"], "term": "Unknown", "definition": "Unknown"}, + {"categories": ["well_construction_method"], "term": "Air-Rotary", "definition": "Air-Rotary"}, + {"categories": ["well_construction_method"], "term": "Bored or augered", "definition": "Bored or augered"}, + {"categories": ["well_construction_method"], "term": "Cable-tool", "definition": "Cable-tool"}, + {"categories": ["well_construction_method"], "term": "Hydraulic rotary (mud or water)", "definition": "Hydraulic rotary (mud or water)"}, + {"categories": ["well_construction_method"], "term": "Air percussion", "definition": "Air percussion"}, + {"categories": ["well_construction_method"], "term": "Reverse rotary", "definition": "Reverse rotary"}, + {"categories": ["well_construction_method"], "term": "Driven", "definition": "Driven"}, + {"categories": ["well_construction_method", "measurement_method"], "term": "Other (explain in notes)", "definition": "Other (explain in notes)"}, {"categories": ["coordinate_method"], "term": "Differentially corrected GPS", "definition": "Differentially corrected GPS"}, {"categories": ["coordinate_method"], "term": "Survey-grade global positioning system (SGPS)", "definition": "Survey-grade global positioning system (SGPS)"}, {"categories": ["coordinate_method"], "term": "GPS, uncorrected", "definition": "GPS, uncorrected"}, @@ -367,6 +373,82 @@ {"categories": ["analysis_method_type"], "term": "Laboratory", "definition": "A procedure performed on a physical sample in a controlled, off-site laboratory environment. These methods typically involve complex instrumentation, standardized reagents, and formal quality control protocols."}, {"categories": ["analysis_method_type"], "term": "Field Procedure", "definition": "A standardized procedure performed on-site at the time of sample collection. 
This can involve direct measurement of the environmental medium using a calibrated field instrument or a specific, documented technique for collecting a sample."}, {"categories": ["analysis_method_type"], "term": "Calculation", "definition": "A mathematical procedure used to derive a new data point from one or more directly measured values. This type is used to document the provenance of calculated data, providing an auditable trail."}, + {"categories": ["organization"], "term": "City of Aztec", "definition": "City of Aztec"}, + {"categories": ["organization"], "term": "Daybreak Investments", "definition": "Daybreak Investments"}, + {"categories": ["organization"], "term": "Vallecitos HOA", "definition": "Vallecitos HOA"}, + {"categories": ["organization"], "term": "Naiche Development", "definition": "Naiche Corporation"}, + {"categories": ["organization"], "term": "Santa Fe County; Santa Fe Animal Shelter", "definition": "Santa Fe County; Santa Fe Animal Shelter"}, + {"categories": ["organization"], "term": "El Guicu Ditch Association", "definition": "El Guicu Ditch Association"}, + {"categories": ["organization"], "term": "Santa Fe Municipal Airport", "definition": "Santa Fe Municipal Airport"}, + {"categories": ["organization"], "term": "Uluru Development", "definition": "Uluru Development"}, + {"categories": ["organization"], "term": "AllSup's Convenience Stores", "definition": "AllSup's Convenience Stores"}, + {"categories": ["organization"], "term": "Santa Fe Downs", "definition": "Santa Fe Downs Resort"}, + {"categories": ["organization"], "term": "City of Truth or Consequences, WWTP", "definition": "City of Truth or Consequences, WWTP"}, + {"categories": ["organization"], "term": "Riverbend Hotsprings", "definition": "Riverbend Hotsprings"}, + {"categories": ["organization"], "term": "Armendaris Ranch", "definition": "Armendaris Ranch"}, + {"categories": ["organization"], "term": "El Paso Water", "definition": "El Paso Water"}, + {"categories": 
["organization"], "term": "BLM, Socorro Field Office", "definition": "BLM, Socorro Field Office"}, + {"categories": ["organization"], "term": "USFWS", "definition": "US Fish & Wildlife Service"}, + {"categories": ["organization"], "term": "NPS", "definition": "National Park Service"}, + {"categories": ["organization"], "term": "Sile MDWCA", "definition": "Sile Municipal Domestic Water Assn."}, + {"categories": ["organization"], "term": "Pena Blanca Water & Sanitation District", "definition": "Pena Blanca Water & Sanitation District"}, + {"categories": ["organization"], "term": "Town of Questa", "definition": "Town of Questa"}, + {"categories": ["organization"], "term": "Lama MDWCA", "definition": "Lama MDWCA"}, + {"categories": ["organization"], "term": "Town of Cerro", "definition": "Town of Cerro"}, + {"categories": ["organization"], "term": "Farr Cattle Company", "definition": "Farr Cattle Company (Farr Ranch)"}, + {"categories": ["organization"], "term": "Carrizozo Orchard", "definition": "Carrizozo Orchard"}, + {"categories": ["organization"], "term": "USFS, Kiowa Grasslands", "definition": "USFS, Kiowa Grasslands"}, + {"categories": ["organization"], "term": "Cloud Country West Subdivision", "definition": "Cloud Country West Subdivision"}, + {"categories": ["organization"], "term": "Chama West Water Users Association", "definition": "Chama West Water Users Assn."}, + {"categories": ["organization"], "term": "El Rito Regional Water and Waste Water Association", "definition": "El Rito Regional Water + Waste Water Association"}, + {"categories": ["organization"], "term": "West Rim MDWUA", "definition": "West Rim MDWUA"}, + {"categories": ["organization"], "term": "Village of Willard", "definition": "Village of Willard"}, + {"categories": ["organization"], "term": "Quemado Municipal Water & SWA", "definition": "Quemado Mutual Water and Sewage Works Association"}, + {"categories": ["organization"], "term": "Coyote Creek MDWUA", "definition": "Coyote Creek MDWUA"}, 
+ {"categories": ["organization"], "term": "Lamy MDWCA", "definition": "Lamy Mutual Domestic Water Assn."}, + {"categories": ["organization"], "term": "La Joya CWDA", "definition": "La Joya CWDA"}, + {"categories": ["organization"], "term": "NM Firefighters Training Academy", "definition": "NM Firefighters Training Academy"}, + {"categories": ["organization"], "term": "Cebolleta Land Grant", "definition": "Cebolleta Land Grant"}, + {"categories": ["organization"], "term": "Madrid Water Co-op", "definition": "Madrid Water Co-op"}, + {"categories": ["organization"], "term": "Sun Valley Water and Sanitation", "definition": "Sun Valley Water and Sanitation"}, + {"categories": ["organization"], "term": "Bluewater Lake MDWCA", "definition": "Bluewater Lake MDWCA"}, + {"categories": ["organization"], "term": "Bluewater Acres Domestic WUA", "definition": "Bluewater Acres Domestic Water Users Assn."}, + {"categories": ["organization"], "term": "Lybrook MDWCA", "definition": "Lybrook Municipal"}, + {"categories": ["organization"], "term": "New Mexico Museum of Natural History", "definition": "New Mexico Museum of Natural History"}, + {"categories": ["organization"], "term": "Hillsboro MDWCA", "definition": "Hillsboro Mutual Domestic Water Consumer Assn."}, + {"categories": ["organization"], "term": "Tyrone MDWCA", "definition": "Tyrone Mutual Domestic Water Assn."}, + {"categories": ["organization"], "term": "Santa Clara Water System", "definition": "Santa Clara Water System"}, + {"categories": ["organization"], "term": "Casas Adobes MDWCA", "definition": "Casas Adobes Mutual Domestic"}, + {"categories": ["organization"], "term": "Lake Roberts WUA", "definition": "Lake Roberts Water Assn."}, + {"categories": ["organization"], "term": "El Creston MDWCA", "definition": "El Creston MDWCA"}, + {"categories": ["organization"], "term": "Reserve Municipality Water Works", "definition": "Reserve Municipality Water Works"}, + {"categories": ["organization"], "term": "Bayard", 
"definition": "Bayard Municipal Water"}, + {"categories": ["organization"], "term": "Town of Estancia", "definition": "Town of Estancia"}, + {"categories": ["organization"], "term": "Pie Town MDWCA", "definition": "Pie Town MDWCA"}, + {"categories": ["organization"], "term": "Roosevelt SWCD", "definition": "Roosevelt Soil & Water Conservation District"}, + {"categories": ["organization"], "term": "Otis MDWCA", "definition": "Otis Mutual Domestic"}, + {"categories": ["organization"], "term": "White Cliffs MDWUA", "definition": "White Cliffs MDWUA"}, + {"categories": ["organization"], "term": "Vista Linda Water Co-op", "definition": "Vista Linda Water Co-op"}, + {"categories": ["organization"], "term": "Anasazi Trails Water Co-op", "definition": "Anasazi Trails Water Cooperative"}, + {"categories": ["organization"], "term": "Canon MDWCA", "definition": "Canon Mutual Domestic Water Consumer Assn."}, + {"categories": ["organization"], "term": "Placitas Trails Water Co-op", "definition": "Placitas Trails Water Coop"}, + {"categories": ["organization"], "term": "BLM, Roswell Office", "definition": "BLM, Roswell Office"}, + {"categories": ["organization"], "term": "Forked Lightning Ranch", "definition": "Forked Lightning Ranch"}, + {"categories": ["organization"], "term": "Cottonwood RWA", "definition": "Cottonwood Rural Water Assn."}, + {"categories": ["organization"], "term": "Pinon Ridge WUA", "definition": "Pinon Ridge Water Users Association"}, + {"categories": ["organization"], "term": "McSherry Farms", "definition": "McSherry Farms"}, + {"categories": ["organization"], "term": "Agua Sana WUA", "definition": "Agua Sana Water Users Assn."}, + {"categories": ["organization"], "term": "Chamita MDWCA", "definition": "Chamita Water Users Association"}, + {"categories": ["organization"], "term": "W Spear-bar Ranch", "definition": "W Spear-bar Ranch"}, + {"categories": ["organization"], "term": "Village of Capitan", "definition": "Village of Capitan"}, + {"categories": 
["organization"], "term": "Brazos MDWCA", "definition": "Brazos Mutual Domestic Water Consumers Assn."}, + {"categories": ["organization"], "term": "Alto Alps HOA", "definition": "Alto Alps Homeowners Association"}, + {"categories": ["organization"], "term": "Chiricahua Desert Museum", "definition": "Chiricahua Desert Museum"}, + {"categories": ["organization"], "term": "Bike Ranch", "definition": "Bike Ranch"}, + {"categories": ["organization"], "term": "Hachita MDWCA", "definition": "Hachita MDWCA"}, + {"categories": ["organization"], "term": "Carrizozo Municipal Water", "definition": "Carrizozo Municipal Water"}, + {"categories": ["organization"], "term": "Dunhill Ranch", "definition": "Dunhill Ranch"}, + {"categories": ["organization"], "term": "Santa Fe Conservation Trust", "definition": "Santa Fe Conservation Trust"}, {"categories": ["organization"], "term": "NMSU", "definition": "New Mexico State University"}, {"categories": ["organization"], "term": "USGS", "definition": "US Geological Survey"}, {"categories": ["organization"], "term": "TWDB", "definition": "Texas Water Development Board"}, @@ -469,7 +551,6 @@ {"categories": ["organization"], "term": "El Rito Canyon MDWCA", "definition": "El Rito Canyon MDWCA"}, {"categories": ["organization"], "term": "Encantado Enterprises", "definition": "Encantado Enterprises"}, {"categories": ["organization"], "term": "Estrella Concepts LLC", "definition": "Estrella Concepts LLC"}, - {"categories": ["organization"], "term": "Farr Cattle Company", "definition": "Farr Cattle Company"}, {"categories": ["organization"], "term": "Sixteen Springs Fire Department", "definition": "Sixteen Springs Fire Department"}, {"categories": ["organization"], "term": "Fire Water Lodge", "definition": "Fire Water Lodge"}, {"categories": ["organization"], "term": "Ford County Land & Cattle Company, Inc", "definition": "Ford County Land & Cattle Company, Inc"}, @@ -483,7 +564,6 @@ {"categories": ["organization"], "term": "K. 
Schmitt Trust", "definition": "K. Schmitt Trust"}, {"categories": ["organization"], "term": "La Cienega MDWCA", "definition": "La Cienega MDWCA"}, {"categories": ["organization"], "term": "La Vista HOA", "definition": "La Vista HOA"}, - {"categories": ["organization"], "term": "Lamy MDWCA", "definition": "Lamy MDWCA"}, {"categories": ["organization"], "term": "Land Ventures LLC", "definition": "Land Ventures LLC"}, {"categories": ["organization"], "term": "Las Lagunitas", "definition": "Las Lagunitas"}, {"categories": ["organization"], "term": "Las Lagunitas HOA", "definition": "Las Lagunitas HOA"}, @@ -525,7 +605,6 @@ {"categories": ["organization"], "term": "Sangre de Cristo Center", "definition": "Sangre de Cristo Center"}, {"categories": ["organization"], "term": "Valle Vista Water Utility", "definition": "Valle Vista Water Utility"}, {"categories": ["organization"], "term": "Santa Fe County, Valle Vista Water Utility, Inc.", "definition": "Santa Fe County, Valle Vista Water Utility, Inc."}, - {"categories": ["organization"], "term": "Santa Fe Downs", "definition": "Santa Fe Downs"}, {"categories": ["organization"], "term": "Santa Fe Horse Park", "definition": "Santa Fe Horse Park"}, {"categories": ["organization"], "term": "Santa Fe Opera", "definition": "Santa Fe Opera"}, {"categories": ["organization"], "term": "Santa Fe Waldorf School", "definition": "Santa Fe Waldorf School"}, @@ -572,6 +651,7 @@ {"categories": ["organization"], "term": "Yates Petroleum Corporation", "definition": "Yates Petroleum Corporation"}, {"categories": ["organization"], "term": "Zamora Accounting Services", "definition": "Zamora Accounting Services"}, {"categories": ["organization"], "term": "PLSS", "definition": "Public Land Survey System"}, + {"categories": ["organization"], "term": "Quemado Municipal Water & SWA", "definition": "Quemado Municipal Water & SWA"}, {"categories": ["collection_method"], "term": "Altimeter", "definition": "ALtimeter"}, {"categories": 
["collection_method"], "term": "Differentially corrected GPS", "definition": "Differentially corrected GPS"}, {"categories": ["collection_method"], "term": "Survey-grade GPS", "definition": "Survey-grade GPS"}, @@ -669,6 +749,9 @@ {"categories": ["parameter_type"], "term": "Major Element", "definition": "Major Element"}, {"categories": ["parameter_type"], "term": "Minor Element", "definition": "Minor Element"}, {"categories": ["parameter_type"], "term": "Physical property", "definition": "Physical property"}, + + {"categories": ["sensor_type"], "term": "DiverLink", "definition": "DiverLink"}, + {"categories": ["sensor_type"], "term": "Diver Cable", "definition": "Diver Cable"}, {"categories": ["sensor_type"], "term": "Pressure Transducer", "definition": "Pressure Transducer"}, {"categories": ["sensor_type"], "term": "Data Logger", "definition": "Data Logger"}, {"categories": ["sensor_type"], "term": "Barometer", "definition": "Barometer"}, @@ -692,6 +775,392 @@ {"categories": ["monitoring_frequency"], "term": "Annual", "definition": "Location is monitored once a year."}, {"categories": ["monitoring_frequency"], "term": "Decadal", "definition": "Location is monitored once every ten years."}, {"categories": ["monitoring_frequency"], "term": "Event-based", "definition": "Location is monitored based on specific events or triggers rather than a fixed schedule."}, + {"categories": ["aquifer_type"], "term": "Artesian", "definition": "Artesian"}, + {"categories": ["aquifer_type"], "term": "Confined single aquifer", "definition": "Confined single aquifer"}, + {"categories": ["aquifer_type"], "term": "Unsaturated (dry)", "definition": "Unsaturated (dry)"}, + {"categories": ["aquifer_type"], "term": "Fractured", "definition": "Fractured"}, + {"categories": ["aquifer_type"], "term": "Confined multiple aquifers", "definition": "Confined multiple aquifers"}, + {"categories": ["aquifer_type"], "term": "Unconfined multiple aquifers", "definition": "Unconfined multiple aquifers"}, 
+ {"categories": ["aquifer_type"], "term": "Perched aquifer", "definition": "Perched aquifer"}, + {"categories": ["aquifer_type"], "term": "Confining layer or aquitard", "definition": "Confining layer or aquitard"}, + {"categories": ["aquifer_type"], "term": "Semi-confined", "definition": "Semi-confined"}, + {"categories": ["aquifer_type"], "term": "Unconfined single aquifer", "definition": "Unconfined single aquifer"}, + {"categories": ["aquifer_type"], "term": "Mixed (confined and unconfined multiple aquifers)", "definition": "Mixed (confined and unconfined multiple aquifers)"}, + {"categories": ["geographic_scale"], "term": "Major", "definition": "Major aquifers of national significance"}, + {"categories": ["geographic_scale"], "term": "Regional", "definition": "Important aquifers serving regions"}, + {"categories": ["geographic_scale"], "term": "Local", "definition": "Smaller, locally important aquifers"}, + {"categories": ["geographic_scale"], "term": "Minor", "definition": "Limited extent or yield"}, + {"categories": ["formation_code"],"term": "000EXRV","definition": "Extrusive Rocks"}, + {"categories": ["formation_code"],"term": "000IRSV","definition": "Intrusive Rocks"}, + {"categories": ["formation_code"],"term": "050QUAL","definition": "Quaternary Alluvium in Valleys"}, + {"categories": ["formation_code"],"term": "100QBAS","definition": "Quaternary basalt"}, + {"categories": ["formation_code"],"term": "110ALVM","definition": "Quaternary Alluvium"}, + {"categories": ["formation_code"],"term": "110AVMB","definition": "Alluvium, Bolson Deposits and Other Surface Deposits"}, + {"categories": ["formation_code"],"term": "110BLSN","definition": "Bolson Fill"}, + {"categories": ["formation_code"],"term": "110NTGU","definition": "Naha and Tsegi Alluvium Deposits, undifferentiated"}, + {"categories": ["formation_code"],"term": "110PTODC","definition": "Pediment, Terrace and Other Deposits of Gravel, Sand and Caliche"}, + {"categories": ["formation_code"],"term": 
"111MCCR","definition": "McCartys Basalt Flow"}, + {"categories": ["formation_code"],"term": "112ANCH","definition": "Upper Santa Fe Group, Ancha Formation (QTa)"}, + {"categories": ["formation_code"],"term": "112CURB","definition": "Cuerbio Basalt"}, + {"categories": ["formation_code"],"term": "112LAMA","definition": "Lama Formation (QTl, QTbh) and other mountain front alluvial fans"}, + {"categories": ["formation_code"],"term": "112LAMAb","definition": "Lama Fm (QTl, QTbh) between Servilleta Basalts"}, + {"categories": ["formation_code"],"term": "112LGUN","definition": "Laguna Basalt Flow"}, + {"categories": ["formation_code"],"term": "112QTBF","definition": "Quaternary-Tertiary basin fill (not in valleys)"}, + {"categories": ["formation_code"],"term": "112QTBFlac","definition": "Quaternary-Tertiary basin fill, lacustrine-playa lithofacies"}, + {"categories": ["formation_code"],"term": "112QTBFpd","definition": "Quaternary-Tertiary basin fill, distal piedmont lithofacies"}, + {"categories": ["formation_code"],"term": "112QTBFppm","definition": "Quaternary-Tertiary basin fill, proximal and medial piedmont lithofacies"}, + {"categories": ["formation_code"],"term": "112SNTF","definition": "Santa Fe Group, undivided"}, + {"categories": ["formation_code"],"term": "112SNTFA","definition": "Upper Santa Fe Group, axial facies"}, + {"categories": ["formation_code"],"term": "112SNTFOB","definition": "Upper Santa Fe Group, Loma Barbon member of Arroyo Ojito Formation"}, + {"categories": ["formation_code"],"term": "112SNTFP","definition": "Upper Santa Fe Group, piedmont facies"}, + {"categories": ["formation_code"],"term": "112TRTO","definition": "Tuerto Gravels (QTt)"}, + {"categories": ["formation_code"],"term": "120DTIL","definition": "Datil Formation"}, + {"categories": ["formation_code"],"term": "120ELRT","definition": "El Rito Formation"}, + {"categories": ["formation_code"],"term": "120IRSV","definition": "Tertiary Intrusives"}, + {"categories": 
["formation_code"],"term": "120SBLC","definition": "Sierra Blanca Volcanics, undivided"}, + {"categories": ["formation_code"],"term": "120SRVB","definition": "Tertiary Servilletta Basalts (Tsb)"}, + {"categories": ["formation_code"],"term": "120SRVBf","definition": "Tertiary Servilletta Basalts, fractured (Tsbf)"}, + {"categories": ["formation_code"],"term": "120TSBV_Lower","definition": "Tertiary Sierra Blanca area lower volcanic unit (Hog Pen Fm)"}, + {"categories": ["formation_code"],"term": "120TSBV_Upper","definition": "Tertiary Sierra Blanca area upper volcanic unit (above Hog Pen Fm)"}, + {"categories": ["formation_code"],"term": "121CHMT","definition": "Chamita Formation (Tc)"}, + {"categories": ["formation_code"],"term": "121CHMTv","definition": "Chamita Fm, Vallito member (Tcv)"}, + {"categories": ["formation_code"],"term": "121CHMTvs","definition": "Chamita Fm, sandy Vallito member (Tcvs)"}, + {"categories": ["formation_code"],"term": "121OGLL","definition": "Ogallala Formation"}, + {"categories": ["formation_code"],"term": "121PUYEF","definition": "Puye Conglomerate, Fanglomerate Member"}, + {"categories": ["formation_code"],"term": "121TSUQ","definition": "Tesuque Formation, undifferentiated unit"}, + {"categories": ["formation_code"],"term": "121TSUQa","definition": "Tesuque Fm lithosome A (Tta)"}, + {"categories": ["formation_code"],"term": "121TSUQacu","definition": "Tesuque Fm (upper), Cuarteles member lithosome A (Ttacu)"}, + {"categories": ["formation_code"],"term": "121TSUQacuf","definition": "Tesuque Fm (upper), fine-grained Cuarteles member lithosome A (Ttacuf)"}, + {"categories": ["formation_code"],"term": "121TSUQaml","definition": "Tesuque Fm lower-middle lithosome A (Ttaml)"}, + {"categories": ["formation_code"],"term": "121TSUQb","definition": "Tesuque Fm lithosome B (Ttb)"}, + {"categories": ["formation_code"],"term": "121TSUQbfl","definition": "Tesuque Fm lower lithosome B, basin-floor deposits (Ttbfl)"}, + {"categories": 
["formation_code"],"term": "121TSUQbfm","definition": "Tesuque Fm middle lithosome B, basin-floor deposits (Ttbfm)"}, + {"categories": ["formation_code"],"term": "121TSUQbp","definition": "Tesuque Fm lithosome B, Pojoaque member (Ttbp)"}, + {"categories": ["formation_code"],"term": "121TSUQce","definition": "Tesuque Fm, Cejita member (Ttce)"}, + {"categories": ["formation_code"],"term": "121TSUQe","definition": "Tesuque Fm lithosome E (Tte)"}, + {"categories": ["formation_code"],"term": "121TSUQs","definition": "Tesuque Fm lithosome S (Tts)"}, + {"categories": ["formation_code"],"term": "121TSUQsa","definition": "Tesuque Fm lateral gradation lithosomes S and A (Ttsag)"}, + {"categories": ["formation_code"],"term": "121TSUQsc","definition": "Tesuque Fm coarse-grained lithosome S (Ttsc)"}, + {"categories": ["formation_code"],"term": "121TSUQsf","definition": "Tesuque Fm, fine-grained lithosome S (Ttsf)"}, + {"categories": ["formation_code"],"term": "122CHOC","definition": "Chamita and Ojo Caliente interlayered (Ttoc)"}, + {"categories": ["formation_code"],"term": "122CRTO","definition": "Chama El Rito Formation (Tesuque member, Ttc)"}, + {"categories": ["formation_code"],"term": "122OJOC","definition": "Ojo Caliente Formation (Tesuque member, Tto)"}, + {"categories": ["formation_code"],"term": "122PICR","definition": "Picuris Tuff"}, + {"categories": ["formation_code"],"term": "122PPTS","definition": "Popotosa Formation"}, + {"categories": ["formation_code"],"term": "122SNTFP","definition": "Lower Santa Fe Group, piedmont facies"}, + {"categories": ["formation_code"],"term": "123DTILSPRS","definition": "Datil Group ignimbrites and lavas and Spears Group, interbedded"}, + {"categories": ["formation_code"],"term": "123DTMGandbas","definition": "Datil and Mogollon Group andesite, basaltic andesite, and basalt flows"}, + {"categories": ["formation_code"],"term": "123DTMGign","definition": "Datil and Mogollon Group ignimbrites"}, + {"categories": 
["formation_code"],"term": "123DTMGrhydac","definition": "Datil and Mogollon Group rhyolite and dacite flows"}, + {"categories": ["formation_code"],"term": "123ESPN","definition": "T Espinaso Formation (Te)"}, + {"categories": ["formation_code"],"term": "123GLST","definition": "T Galisteo Formation"}, + {"categories": ["formation_code"],"term": "123PICS","definition": "T Picuris Formation (Tp)"}, + {"categories": ["formation_code"],"term": "123PICSc","definition": "T Picuris Formation, basal conglomerate (Tpc)"}, + {"categories": ["formation_code"],"term": "123PICSl","definition": "T lower Picuris Formation (Tpl)"}, + {"categories": ["formation_code"],"term": "123SPRSDTMGlava","definition": "Spears Group and Datil-Mogollon intermediate-mafic lavas, interbedded"}, + {"categories": ["formation_code"],"term": "123SPRSlower","definition": "Spears Group, lower part; tuffaceous, gravelly debris and mud flows"}, + {"categories": ["formation_code"],"term": "123SPRSmid_uppe","definition": "Spears Group, middle-upper part; excludes Dog Spring Formation"}, + {"categories": ["formation_code"],"term": "124BACA","definition": "Baca Formation"}, + {"categories": ["formation_code"],"term": "124CBMN","definition": "Cub Mountain Formation"}, + {"categories": ["formation_code"],"term": "124LLVS","definition": "Llaves Member of San Jose Formation"}, + {"categories": ["formation_code"],"term": "124PSCN","definition": "Poison Canyon Formation"}, + {"categories": ["formation_code"],"term": "124RGIN","definition": "Regina Member of San Jose Formation"}, + {"categories": ["formation_code"],"term": "124SNJS","definition": "San Jose Formation"}, + {"categories": ["formation_code"],"term": "124TPCS","definition": "Tapicitos Member of San Jose Formation"}, + {"categories": ["formation_code"],"term": "125NCMN","definition": "Nacimiento Formation"}, + {"categories": ["formation_code"],"term": "125NCMNS","definition": "Nacimiento Formation, Sandy Shale Facies"}, + {"categories": 
["formation_code"],"term": "125RTON","definition": "Raton Formation"}, + {"categories": ["formation_code"],"term": "130CALDFLOOR","definition": "Caldera Floor bedrock S. of San Agustin Plains. Mostly DTILSPRS & Paleo."}, + {"categories": ["formation_code"],"term": "180TKSCC_Upper","definition": "Tertiary-Cretaceous, Sanders Canyon, Cub Mtn. and upper Crevasse Canyon Fm"}, + {"categories": ["formation_code"],"term": "180TKTR","definition": "Tertiary-Cretaceous-Triassic, Baca, Crevasse Cyn, Gallup, Mancos, Dakota, T"}, + {"categories": ["formation_code"],"term": "210CRCS","definition": "Cretaceous System, undivided"}, + {"categories": ["formation_code"],"term": "210GLUPC_Lower","definition": "K Gallup Sandstone and lower Crevasse Canyon Fm"}, + {"categories": ["formation_code"],"term": "210HOSTD","definition": "K Hosta Dalton"}, + {"categories": ["formation_code"],"term": "210MCDK","definition": "K Mancos/Dakota undivided"}, + {"categories": ["formation_code"],"term": "210MNCS","definition": "Mancos Shale, undivided"}, + {"categories": ["formation_code"],"term": "210MNCSL","definition": "K Lower Mancos"}, + {"categories": ["formation_code"],"term": "210MNCSU","definition": "K Upper Mancos"}, + {"categories": ["formation_code"],"term": "211CLFHV","definition": "Cliff House Sandstone, includes La Ventana Tongues in NW Sandoval Co."}, + {"categories": ["formation_code"],"term": "211CRLL","definition": "Carlile Shale"}, + {"categories": ["formation_code"],"term": "211CRVC","definition": "Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211DKOT","definition": "Dakota Sandstone or Formation"}, + {"categories": ["formation_code"],"term": "211DLCO","definition": "Dilco Coal Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211DLTN","definition": "Dalton Sandstone Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": 
"211FRHS","definition": "Fort Hays Limestone Member of Niobrara Formation"}, + {"categories": ["formation_code"],"term": "211FRLD","definition": "Fruitland Formation"}, + {"categories": ["formation_code"],"term": "211FRMG","definition": "Farmington Sandstone Member of Kirtland Shale"}, + {"categories": ["formation_code"],"term": "211GBSNC","definition": "Gibson Coal Member of Crevasse Canyon Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211GLLG","definition": "Gallego Sandstone Member of Gallup Sandstone"}, + {"categories": ["formation_code"],"term": "211GLLP","definition": "Gallup Sandstone"}, + {"categories": ["formation_code"],"term": "211GRRG","definition": "Greenhorn and Graneros Formations"}, + {"categories": ["formation_code"],"term": "211GRRS","definition": "Graneros Shale"}, + {"categories": ["formation_code"],"term": "211HOST","definition": "Hosta Tongue of Point Lookout Sandstone of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211KRLD","definition": "Kirtland Shale"}, + {"categories": ["formation_code"],"term": "211LWIS","definition": "Lewis Shale"}, + {"categories": ["formation_code"],"term": "211MENF","definition": "Menefee Formation"}, + {"categories": ["formation_code"],"term": "211MENFU","definition": "K Upper Menefee (above Harmon Sandstone)"}, + {"categories": ["formation_code"],"term": "211MVRD","definition": "Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211OJAM","definition": "Ojo Alamo Sandstone"}, + {"categories": ["formation_code"],"term": "211PCCF","definition": "Pictured Cliffs Sandstone"}, + {"categories": ["formation_code"],"term": "211PIRR","definition": "Pierre Shale"}, + {"categories": ["formation_code"],"term": "211PNLK","definition": "Point Lookout Sandstone"}, + {"categories": ["formation_code"],"term": "211SMKH","definition": "Smoky Hill Marl Member"}, + {"categories": ["formation_code"],"term": "211TLLS","definition": "Twowells Sandstone Lentil of Pike of Dakota 
Sandstone"}, + {"categories": ["formation_code"],"term": "212KTRP","definition": "K Dakota Sandstone, Moenkopi Fm, Artesia Group"}, + {"categories": ["formation_code"],"term": "217PRGR","definition": "Purgatoire Formation"}, + {"categories": ["formation_code"],"term": "220ENRD","definition": "Entrada Sandstone"}, + {"categories": ["formation_code"],"term": "220JURC","definition": "Jurassic undivided"}, + {"categories": ["formation_code"],"term": "220NAVJ","definition": "Navajo Sandstone"}, + {"categories": ["formation_code"],"term": "221BLFF","definition": "Bluff Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221CSPG","definition": "Cow Springs Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221ERADU","definition": "Entrada Sandstone of San Rafael Group, Upper"}, + {"categories": ["formation_code"],"term": "221MRSN","definition": "Morrison Formation"}, + {"categories": ["formation_code"],"term": "221MRSN/BBSN","definition": "Brushy Basin Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/JCKP","definition": "Jackpile Sandstone Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/RCAP","definition": "Recapture Shale Member of Morrison"}, + {"categories": ["formation_code"],"term": "221MRSN/WWCN","definition": "Westwater Canyon Member of Morrison"}, + {"categories": ["formation_code"],"term": "221SLWS","definition": "Salt Wash Sandstone Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221SMVL","definition": "Summerville Formation of San Rafael Group"}, + {"categories": ["formation_code"],"term": "221TDLT","definition": "J Todilto"}, + {"categories": ["formation_code"],"term": "221WSRC","definition": "Westwater Canyon Sandstone Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221ZUNIS","definition": "Zuni Sandstone"}, + {"categories": ["formation_code"],"term": "231AGZC","definition": "Tr Agua Zarca"}, + 
{"categories": ["formation_code"],"term": "231AGZCU","definition": "Tr Upper Agua Zarca"}, + {"categories": ["formation_code"],"term": "231CHNL","definition": "Chinle Formation"}, + {"categories": ["formation_code"],"term": "231CORR","definition": "Correo Sandstone Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231DCKM","definition": "Dockum Group"}, + {"categories": ["formation_code"],"term": "231PFDF","definition": "Tr Petrified Forest"}, + {"categories": ["formation_code"],"term": "231PFDFL","definition": "Tr Lower Petrified Forest (below middle sandstone)"}, + {"categories": ["formation_code"],"term": "231PFDFM","definition": "Tr Middle Petrified Forest sandstone"}, + {"categories": ["formation_code"],"term": "231PFDFU","definition": "Tr Upper Petrified Forest (above middle sandstone)"}, + {"categories": ["formation_code"],"term": "231RCKP","definition": "Rock Point Member of Wingate Sandstone"}, + {"categories": ["formation_code"],"term": "231SNRS","definition": "Santa Rosa Sandstone"}, + {"categories": ["formation_code"],"term": "231SNSL","definition": "Sonsela Sandstone Bed of Petrified Forest Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231SRMP","definition": "Shinarump Member of Chinle Formation"}, + {"categories": ["formation_code"],"term": "231WNGT","definition": "Wingate Sandstone"}, + {"categories": ["formation_code"],"term": "260SNAN","definition": "P San Andres"}, + {"categories": ["formation_code"],"term": "260SNAN_lower","definition": "Lower San Andres Formation"}, + {"categories": ["formation_code"],"term": "261SNGL","definition": "P San Andres - Glorieta Sandstone in Rio Bonito member"}, + {"categories": ["formation_code"],"term": "300YESO","definition": "P Yeso"}, + {"categories": ["formation_code"],"term": "300YESO_lower","definition": "Lower Yeso Formation"}, + {"categories": ["formation_code"],"term": "300YESO_upper","definition": "Upper Yeso Formation"}, + {"categories": 
["formation_code"],"term": "310ABO","definition": "P Abo"}, + {"categories": ["formation_code"],"term": "310DCLL","definition": "De Chelly Sandstone Member of Cutler Formation"}, + {"categories": ["formation_code"],"term": "310GLOR","definition": "Glorieta Sandstone Member of San Andres Formation (of Manzano Group)"}, + {"categories": ["formation_code"],"term": "310MBLC","definition": "Meseta Blanca Sandstone Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "310TRRS","definition": "Torres Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "310YESO","definition": "Yeso Formation"}, + {"categories": ["formation_code"],"term": "310YESOG","definition": "Yeso Formation, Manzano Group"}, + {"categories": ["formation_code"],"term": "312CSTL","definition": "Castile Formation"}, + {"categories": ["formation_code"],"term": "312RSLR","definition": "Rustler Formation"}, + {"categories": ["formation_code"],"term": "313ARTS","definition": "Artesia Group"}, + {"categories": ["formation_code"],"term": "313BLCN","definition": "Bell Canyon Formation"}, + {"categories": ["formation_code"],"term": "313BRUC","definition": "Brushy Canyon Formation of Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "313CKBF","definition": "Chalk Bluff Formation"}, + {"categories": ["formation_code"],"term": "313CLBD","definition": "Carlsbad Limestone"}, + {"categories": ["formation_code"],"term": "313CPTN","definition": "Capitan Limestone"}, + {"categories": ["formation_code"],"term": "313GDLP","definition": "Guadalupian Series"}, + {"categories": ["formation_code"],"term": "313GOSP","definition": "Goat Seep Dolomite"}, + {"categories": ["formation_code"],"term": "313SADG","definition": "San Andres Limestone and Glorieta Sandstone"}, + {"categories": ["formation_code"],"term": "313SADR","definition": "San Andres Limestone, undivided"}, + {"categories": ["formation_code"],"term": "313TNSL","definition": "Tansill Formation"}, + 
{"categories": ["formation_code"],"term": "313YATS","definition": "Yates Formation, Guadalupe Group"}, + {"categories": ["formation_code"],"term": "315LABR","definition": "P Laborcita (Bursum)"}, + {"categories": ["formation_code"],"term": "315YESOABO","definition": "Alamosa Creek and San Agustin Plains area - Yeso and Abo Formations"}, + {"categories": ["formation_code"],"term": "318ABO","definition": "P Abo"}, + {"categories": ["formation_code"],"term": "318BSPG","definition": "Bone Spring Limestone"}, + {"categories": ["formation_code"],"term": "318JOYT","definition": "Joyita Sandstone Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "318YESO","definition": "Yeso Formation"}, + {"categories": ["formation_code"],"term": "319BRSM","definition": "Bursum Formation and Equivalent Rocks"}, + {"categories": ["formation_code"],"term": "320HLDR","definition": "Penn Holder"}, + {"categories": ["formation_code"],"term": "320PENN","definition": "Pennsylvanian undivided"}, + {"categories": ["formation_code"],"term": "320SNDI","definition": "Sandia Formation"}, + {"categories": ["formation_code"],"term": "321SGDC","definition": "Sangre de Cristo Formation"}, + {"categories": ["formation_code"],"term": "322BEMN","definition": "Penn Beeman"}, + {"categories": ["formation_code"],"term": "325GBLR","definition": "Penn Gobbler"}, + {"categories": ["formation_code"],"term": "325MDER","definition": "Madera Limestone, undivided"}, + {"categories": ["formation_code"],"term": "325MDERL","definition": "Penn Lower Madera"}, + {"categories": ["formation_code"],"term": "325MDERU","definition": "Penn Upper Madera"}, + {"categories": ["formation_code"],"term": "325SAND","definition": "Penn Sandia"}, + {"categories": ["formation_code"],"term": "326MGDL","definition": "Magdalena Group"}, + {"categories": ["formation_code"],"term": "340EPRS","definition": "Espiritu Santo Formation"}, + {"categories": ["formation_code"],"term": "350PZBA","definition": "Alamosa Creek and San 
Agustin Plains area - Paleozoic strata beneath Abo Fm"}, + {"categories": ["formation_code"],"term": "350PZBB","definition": "Tul Basin area - Paleozoic strata below Bursum Fm"}, + {"categories": ["formation_code"],"term": "400EMBD","definition": "Embudo Granite (undifferentiated PreCambrian near Santa Fe)"}, + {"categories": ["formation_code"],"term": "400PCMB","definition": "Precambrian Erathem"}, + {"categories": ["formation_code"],"term": "400PREC","definition": "undifferentiated PreCambrian crystalline rocks (X)"}, + {"categories": ["formation_code"],"term": "400PRECintr","definition": "PreCambrian crystalline rocks and local Tertiary intrusives"}, + {"categories": ["formation_code"],"term": "400PRST","definition": "Priest Granite"}, + {"categories": ["formation_code"],"term": "400TUSS","definition": "Tusas Granite"}, + {"categories": ["formation_code"],"term": "410PRCG","definition": "PreCambrian granite (Xg)"}, + {"categories": ["formation_code"],"term": "410PRCGf","definition": "PreCambrian granite, fractured (Xgf)"}, + {"categories": ["formation_code"],"term": "410PRCQ","definition": "PreCambrian quartzite (Xq)"}, + {"categories": ["formation_code"],"term": "410PRCQf","definition": "PreCambrian quartzite, fractured (Xqf)"}, + {"categories": ["formation_code"],"term": "121GILA","definition": "Gila Conglomerate (group)"}, + {"categories": ["formation_code"],"term": "312DYLK","definition": "Dewey Lake Redbeds"}, + {"categories": ["formation_code"],"term": "120WMVL","definition": "Wimsattville Formation"}, + {"categories": ["formation_code"],"term": "313GRBG","definition": "Grayburg Formation of Artesia Group"}, + {"categories": ["formation_code"],"term": "318ABOL","definition": "Abo Sandstone (Lower Tongue)"}, + {"categories": ["formation_code"],"term": "318ABOU","definition": "Abo Sandstone (Upper Tongue)"}, + {"categories": ["formation_code"],"term": "112SNTFU","definition": "Santa Fe Group, Upper Part"}, + {"categories": ["formation_code"],"term": 
"310FRNR","definition": "Forty-Niner Member of Rustler Formation"}, + {"categories": ["formation_code"],"term": "312OCHO","definition": "Ochoan Series"}, + {"categories": ["formation_code"],"term": "313AZOT","definition": "Azotea Tongue of Seven Rivers Formation"}, + {"categories": ["formation_code"],"term": "313QUEN","definition": "Queen Formation"}, + {"categories": ["formation_code"],"term": "319HUCO","definition": "Hueco Limestone"}, + {"categories": ["formation_code"],"term": "313SVRV","definition": "Seven Rivers Formation"}, + {"categories": ["formation_code"],"term": "313CABD","definition": "Carlsbad Group"}, + {"categories": ["formation_code"],"term": "320GRMS","definition": "Gray Mesa Member of Madera Formation"}, + {"categories": ["formation_code"],"term": "211CLRDH","definition": "Colorado Shale"}, + {"categories": ["formation_code"],"term": "120BRLM","definition": "Bearwallow Mountain Andesite"}, + {"categories": ["formation_code"],"term": "122RUBO","definition": "Rubio Peak Formation"}, + {"categories": ["formation_code"],"term": "313SADRL","definition": "San Andres Limestone, Lower Cherty Member"}, + {"categories": ["formation_code"],"term": "313SADRU","definition": "San Andres Limestone, Upper Clastic Member"}, + {"categories": ["formation_code"],"term": "313BRNL","definition": "Bernal Formation of Artesia Group"}, + {"categories": ["formation_code"],"term": "318CPDR","definition": "Chupadera Formation"}, + {"categories": ["formation_code"],"term": "121BDHC","definition": "Bidahochi Formation"}, + {"categories": ["formation_code"],"term": "313SADY","definition": "San Andres Limestone and Yeso Formation, undivided"}, + {"categories": ["formation_code"],"term": "221SRFLL","definition": "San Rafael Group, Lower Part"}, + {"categories": ["formation_code"],"term": "221BLUF","definition": "Bluff Sandstone of Morrison Formation"}, + {"categories": ["formation_code"],"term": "221COSP","definition": "Cow Springs Sandstone of Morrison Formation"}, + 
{"categories": ["formation_code"],"term": "317ABYS","definition": "Abo and Yeso, undifferentiated"}, + {"categories": ["formation_code"],"term": "221BRSB","definition": "Brushy Basin Shale Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "310SYDR","definition": "San Ysidro Member of Yeso Formation"}, + {"categories": ["formation_code"],"term": "400SDVL","definition": "Sandoval Granite"}, + {"categories": ["formation_code"],"term": "221SRFL","definition": "San Rafael Group"}, + {"categories": ["formation_code"],"term": "310SGRC","definition": "Sangre de Cristo Formation"}, + {"categories": ["formation_code"],"term": "231TCVS","definition": "Tecovas Formation of Dockum Group"}, + {"categories": ["formation_code"],"term": "211DCRS","definition": "D-Cross Tongue of Mancos Shale of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211ALSN","definition": "Allison Member of Menefee Formation of Mesaverde Group"}, + {"categories": ["formation_code"],"term": "211LVNN","definition": "La Ventana Tongue of Cliff House Sandstone"}, + {"categories": ["formation_code"],"term": "211MORD","definition": "Madrid Formation"}, + {"categories": ["formation_code"],"term": "210PRMD","definition": "Pyramid Shale"}, + {"categories": ["formation_code"],"term": "124ANMS","definition": "Animas Formation"}, + {"categories": ["formation_code"],"term": "211NBRR","definition": "Niobrara Formation"}, + {"categories": ["formation_code"],"term": "111ALVM","definition": "Holocene Alluvium"}, + {"categories": ["formation_code"],"term": "122SNTFL","definition": "Santa Fe Group, Lower Part"}, + {"categories": ["formation_code"],"term": "111CPLN","definition": "Capulin Basalts"}, + {"categories": ["formation_code"],"term": "120CRSN","definition": "Carson Conglomerate"}, + {"categories": ["formation_code"],"term": "111CRMS","definition": "Covered/Reclaimed Mine Spoil"}, + {"categories": ["formation_code"],"term": "111CRMSA","definition": "Covered/Reclaimed Mine 
Spoil and Ash"}, + {"categories": ["formation_code"],"term": "111SPOL","definition": "Spoil"}, + {"categories": ["formation_code"],"term": "110TURT","definition": "Tuerto Gravel of Santa Fe Group"}, + {"categories": ["formation_code"],"term": "221RCPR","definition": "Recapture Shale Member of Morrison Formation"}, + {"categories": ["formation_code"],"term": "320BLNG","definition": "Bullington Member of Magdalena Formation"}, + {"categories": ["formation_code"],"term": "112ANCHsr","definition": "Upper Santa Fe Group, Ancha Formation & ancestral Santa Fe river deposits"}, + {"categories": ["formation_code"],"term": "121TSUQae","definition": "Tesuque Fm Lithosomes A and E"}, + {"categories": ["formation_code"],"term": "230TRSC","definition": "Triassic undifferentiated"}, + {"categories": ["formation_code"],"term": "122TSUQdx","definition": "Tesuque Fm, Dixon member (Ttd)"}, + {"categories": ["formation_code"],"term": "123PICSu","definition": "T upper Picuris Formation (Tpu)"}, + {"categories": ["formation_code"],"term": "123PICSm","definition": "T middle Picuris Formation (Tpm)"}, + {"categories": ["formation_code"],"term": "123PICSmc","definition": "T middle conglomerate Picuris Formation (Tpmc)"}, + {"categories": ["formation_code"],"term": "120VBVC","definition": "Tertiary volcanic breccia/volcaniclastic conglomerate"}, + {"categories": ["formation_code"],"term": "120VCSS","definition": "Tertiary volcaniclastic sandstone"}, + {"categories": ["formation_code"],"term": "124DMDT","definition": "Diamond Tail Formation"}, + {"categories": ["formation_code"],"term": "325ALMT","definition": "Penn Alamitos Formation"}, + {"categories": ["formation_code"],"term": "400SAND","definition": "Sandia Granite"}, + {"categories": ["formation_code"],"term": "318VCPK","definition": "Victorio Peak Limestone"}, + {"categories": ["formation_code"],"term": "318BSVP","definition": "Bone Spring and Victorio Peak Limestones"}, + {"categories": ["formation_code"],"term": 
"100ALVM","definition": "Alluvium"}, + {"categories": ["formation_code"],"term": "310PRMN","definition": "Permian System"}, + {"categories": ["formation_code"],"term": "110AVPS","definition": "Alluvium and Permian System"}, + {"categories": ["formation_code"],"term": "313CRCX","definition": "Capitan Reef Complex and Associated Limestones"}, + {"categories": ["formation_code"],"term": "112SLBL","definition": "Salt Bolson"}, + {"categories": ["formation_code"],"term": "112SBCRC","definition": "Salt Bolson and Capitan Reef Complex"}, + {"categories": ["formation_code"],"term": "313CRDM","definition": "Capitan Reef Complex - Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "112SBDM","definition": "Salt Bolson and Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "120BLSN","definition": "Bolson Deposits"}, + {"categories": ["formation_code"],"term": "112SBCR","definition": "Salt Bolson and Cretaceous Rocks"}, + {"categories": ["formation_code"],"term": "112HCBL","definition": "Hueco Bolson"}, + {"categories": ["formation_code"],"term": "120IVIG","definition": "Intrusive Rocks"}, + {"categories": ["formation_code"],"term": "112RLBL","definition": "Red Light Draw Bolson"}, + {"categories": ["formation_code"],"term": "112EFBL","definition": "Eagle Flat Bolson"}, + {"categories": ["formation_code"],"term": "112GRBL","definition": "Green River Bolson"}, + {"categories": ["formation_code"],"term": "123SAND","definition": "Sanders Canyon Formation"}, + {"categories": ["formation_code"],"term": "210MRNH","definition": "Moreno Hill Formation"}, + {"categories": ["formation_code"],"term": "320ALMT","definition": "Alamito Shale"}, + {"categories": ["formation_code"],"term": "313DLRM","definition": "Delaware Mountain Group"}, + {"categories": ["formation_code"],"term": "300PLZC","definition": "Paleozoic Erathem"}, + {"categories": ["formation_code"],"term": "122SPRS","definition": "Spears Member of Datil Formation"}, + {"categories": 
["formation_code"],"term": "110AVTV","definition": "Alluvium and Tertiary Volcanics"}, + {"categories": ["formation_code"],"term": "313DMBS","definition": "Delaware Mountain Group - Bone Spring Limestone"}, + {"categories": ["formation_code"],"term": "120ERSV","definition": "Tertiary extrusives"}, + {"categories": ["lithology"],"term": "Alluvium","definition": "Alluvium"}, + {"categories": ["lithology"],"term": "Anhydrite","definition": "Anhydrite"}, + {"categories": ["lithology"],"term": "Arkose","definition": "Arkose"}, + {"categories": ["lithology"],"term": "Boulders","definition": "Boulders"}, + {"categories": ["lithology"],"term": "Boulders, silt and clay","definition": "Boulders, silt and clay"}, + {"categories": ["lithology"],"term": "Boulders and sand","definition": "Boulders and sand"}, + {"categories": ["lithology"],"term": "Bentonite","definition": "Bentonite"}, + {"categories": ["lithology"],"term": "Breccia","definition": "Breccia"}, + {"categories": ["lithology"],"term": "Basalt","definition": "Basalt"}, + {"categories": ["lithology"],"term": "Conglomerate","definition": "Conglomerate"}, + {"categories": ["lithology"],"term": "Chalk","definition": "Chalk"}, + {"categories": ["lithology"],"term": "Chert","definition": "Chert"}, + {"categories": ["lithology"],"term": "Clay","definition": "Clay"}, + {"categories": ["lithology"],"term": "Caliche","definition": "Caliche"}, + {"categories": ["lithology"],"term": "Calcite","definition": "Calcite"}, + {"categories": ["lithology"],"term": "Clay, some sand","definition": "Clay, some sand"}, + {"categories": ["lithology"],"term": "Claystone","definition": "Claystone"}, + {"categories": ["lithology"],"term": "Coal","definition": "Coal"}, + {"categories": ["lithology"],"term": "Cobbles","definition": "Cobbles"}, + {"categories": ["lithology"],"term": "Cobbles, silt and clay","definition": "Cobbles, silt and clay"}, + {"categories": ["lithology"],"term": "Cobbles and sand","definition": "Cobbles and sand"}, + 
{"categories": ["lithology"],"term": "Dolomite","definition": "Dolomite"}, + {"categories": ["lithology"],"term": "Dolomite and shale","definition": "Dolomite and shale"}, + {"categories": ["lithology"],"term": "Evaporite","definition": "Evaporite"}, + {"categories": ["lithology"],"term": "Gneiss","definition": "Gneiss"}, + {"categories": ["lithology"],"term": "Gypsum","definition": "Gypsum"}, + {"categories": ["lithology"],"term": "Graywacke","definition": "Graywacke"}, + {"categories": ["lithology"],"term": "Gravel and clay","definition": "Gravel and clay"}, + {"categories": ["lithology"],"term": "Gravel, cemented","definition": "Gravel, cemented"}, + {"categories": ["lithology"],"term": "Gravel, sand and silt","definition": "Gravel, sand and silt"}, + {"categories": ["lithology"],"term": "Granite, gneiss","definition": "Granite, gneiss"}, + {"categories": ["lithology"],"term": "Granite","definition": "Granite"}, + {"categories": ["lithology"],"term": "Gravel, silt and clay","definition": "Gravel, silt and clay"}, + {"categories": ["lithology"],"term": "Gravel","definition": "Gravel"}, + {"categories": ["lithology"],"term": "Igneous undifferentiated","definition": "Igneous undifferentiated"}, + {"categories": ["lithology"],"term": "Lignite","definition": "Lignite"}, + {"categories": ["lithology"],"term": "Limestone and dolomite","definition": "Limestone and dolomite"}, + {"categories": ["lithology"],"term": "Limestone and shale","definition": "Limestone and shale"}, + {"categories": ["lithology"],"term": "Limestone","definition": "Limestone"}, + {"categories": ["lithology"],"term": "Marl","definition": "Marl"}, + {"categories": ["lithology"],"term": "Mudstone","definition": "Mudstone"}, + {"categories": ["lithology"],"term": "Metamorphic undifferentiated","definition": "Metamorphic undifferentiated"}, + {"categories": ["lithology"],"term": "Marlstone","definition": "Marlstone"}, + {"categories": ["lithology"],"term": "No Recovery","definition": "No Recovery"}, + 
{"categories": ["lithology"],"term": "Peat","definition": "Peat"}, + {"categories": ["lithology"],"term": "Quartzite","definition": "Quartzite"}, + {"categories": ["lithology"],"term": "Rhyolite","definition": "Rhyolite"}, + {"categories": ["lithology"],"term": "Sand","definition": "Sand"}, + {"categories": ["lithology"],"term": "Schist","definition": "Schist"}, + {"categories": ["lithology"],"term": "Sand and clay","definition": "Sand and clay"}, + {"categories": ["lithology"],"term": "Sand and gravel","definition": "Sand and gravel"}, + {"categories": ["lithology"],"term": "Sandstone and shale","definition": "Sandstone and shale"}, + {"categories": ["lithology"],"term": "Sand and silt","definition": "Sand and silt"}, + {"categories": ["lithology"],"term": "Sand, gravel and clay","definition": "Sand, gravel and clay"}, + {"categories": ["lithology"],"term": "Shale","definition": "Shale"}, + {"categories": ["lithology"],"term": "Silt","definition": "Silt"}, + {"categories": ["lithology"],"term": "Siltstone and shale","definition": "Siltstone and shale"}, + {"categories": ["lithology"],"term": "Siltstone","definition": "Siltstone"}, + {"categories": ["lithology"],"term": "Slate","definition": "Slate"}, + {"categories": ["lithology"],"term": "Sand, some clay","definition": "Sand, some clay"}, + {"categories": ["lithology"],"term": "Sandstone","definition": "Sandstone"}, + {"categories": ["lithology"],"term": "Silt and clay","definition": "Silt and clay"}, + {"categories": ["lithology"],"term": "Travertine","definition": "Travertine"}, + {"categories": ["lithology"],"term": "Tuff","definition": "Tuff"}, + {"categories": ["lithology"],"term": "Volcanic undifferentiated","definition": "Volcanic undifferentiated"}, + {"categories": ["lithology"],"term": "Clay, yellow","definition": "Clay, yellow"}, + {"categories": ["lithology"],"term": "Clay, red","definition": "Clay, red"}, + {"categories": ["lithology"],"term": "Surficial sediment","definition": "Surficial sediment"}, 
+ {"categories": ["lithology"],"term": "Limestone and sandstone, interbedded","definition": "Limestone and sandstone, interbedded"}, + {"categories": ["lithology"],"term": "Gravel and boulders","definition": "Gravel and boulders"}, + {"categories": ["lithology"],"term": "Sand, silt and gravel","definition": "Sand, silt and gravel"}, + {"categories": ["lithology"],"term": "Sand, gravel, silt and clay","definition": "Sand, gravel, silt and clay"}, + {"categories": ["lithology"],"term": "Andesite","definition": "Andesite"}, + {"categories": ["lithology"],"term": "Ignesous, intrusive, undifferentiated","definition": "Ignesous, intrusive, undifferentiated"}, + {"categories": ["lithology"],"term": "Limestone, sandstone and shale","definition": "Limestone, sandstone and shale"}, + {"categories": ["lithology"],"term": "Sand, silt and clay","definition": "Sand, silt and clay"}, {"categories": ["origin_source"], "term": "Reported by another agency", "definition": "Reported by another agency"}, {"categories": ["origin_source"], "term": "From driller's log or well report", "definition": "From driller's log or well report"}, {"categories": ["origin_source"], "term": "Private geologist, consultant or univ associate", "definition": "Private geologist, consultant or univ associate"}, @@ -709,6 +1178,13 @@ {"categories": ["note_type"], "term": "Historical", "definition": "Historical information or context about the well or location."}, {"categories": ["note_type"], "term": "Other", "definition": "Other types of notes that do not fit into the predefined categories."}, {"categories": ["note_type"], "term": "Water", "definition": "Water bearing zone information and other info from ose reports"}, - {"categories": ["note_type"], "term": "Measuring", "definition": "Notes about measuring/visiting the well, on Access form"} + {"categories": ["note_type"], "term": "Measuring", "definition": "Notes about measuring/visiting the well, on Access form"}, + {"categories": ["well_pump_type"], 
"term": "Submersible", "definition": "Submersible"}, + {"categories": ["well_pump_type"], "term": "Jet", "definition": "Jet Pump"}, + {"categories": ["well_pump_type"], "term": "Line Shaft", "definition": "Line Shaft"}, + {"categories": ["well_pump_type"], "term": "Hand", "definition": "Hand Pump"}, + {"categories": ["permission_type"], "term": "Water Level Sample", "definition": "Permissions for taking water level samples"}, + {"categories": ["permission_type"], "term": "Water Chemistry Sample", "definition": "Permissions for taking water chemistry samples"}, + {"categories": ["permission_type"], "term": "Datalogger Installation", "definition": "Permissions for installing dataloggers"} ] } \ No newline at end of file diff --git a/db/__init__.py b/db/__init__.py index 5a58441f8..4a0fc8e70 100644 --- a/db/__init__.py +++ b/db/__init__.py @@ -33,7 +33,7 @@ from db.notes import * from db.observation import * from db.parameter import * -from db.permission import * +from db.permission_history import * from db.publication import * from db.regulatory_limit import * from db.sample import * @@ -43,6 +43,11 @@ from db.transducer import * from db.measuring_point_history import * from db.data_provenance import * +from db.aquifer_system import * +from db.geologic_formation import * +from db.thing_aquifer_association import * +from db.thing_geologic_formation_association import * +from db.aquifer_type import * from sqlalchemy import ( func, diff --git a/db/aquifer_system.py b/db/aquifer_system.py new file mode 100644 index 000000000..c202d77c9 --- /dev/null +++ b/db/aquifer_system.py @@ -0,0 +1,84 @@ +""" +SQLAlchemy model for the AquiferSystem table. + +This is a master reference table for aquifer systems and hydrogeologic units. 
+""" + +from typing import List, TYPE_CHECKING + +from sqlalchemy import Text, Index +from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.ext.associationproxy import association_proxy, AssociationProxy +from geoalchemy2 import Geometry + +from db.base import Base, AutoBaseMixin, ReleaseMixin +from db.lexicon import lexicon_term  # NOTE(review): db/aquifer_type.py imports lexicon_term from db.base; confirm which module actually defines it + +from constants import SRID_WGS84 + +if TYPE_CHECKING: + from db.thing import WellScreen, ThingAquiferAssociation, Thing  # NOTE(review): db/aquifer_type.py imports ThingAquiferAssociation from db.thing_aquifer_association; confirm db.thing also exposes it + from db.aquifer_type import AquiferType + + +class AquiferSystem(Base, AutoBaseMixin, ReleaseMixin): + __versioned__ = {} + + name: Mapped[str] = mapped_column( + nullable=False, + unique=True, + comment="The full, human-readable name of the aquifer system (e.g., 'Ogallala Aquifer').", + ) + description: Mapped[str] = mapped_column( + Text, + nullable=True, + comment="A detailed description of the aquifer system, its characteristics, and its significance.", + ) + # Lexicon terms were retrieved from NMAquifer's 'LU_AquiferType' table. 
+ primary_aquifer_type: Mapped[str] = lexicon_term( + nullable=False, + comment="A controlled vocabulary field to classify the aquifer system as a whole (e.g., 'Unconfined', 'Confined', 'Perched').", + ) + geographic_scale: Mapped[str] = lexicon_term( + nullable=True, + comment="A controlled vocabulary field to classify the aquifer's geographic scale (e.g., 'Major', 'Regional', 'Local').", + ) + boundary: Mapped[Geometry] = mapped_column( + Geometry(geometry_type="MULTIPOLYGON", srid=SRID_WGS84, spatial_index=True), + nullable=True, + comment="A spatial representation of the aquifer system's boundary.", + ) + # Hierarchical relationship fields (may be implemented in future iterations) + # Example: High Plains Aquifer (parent) contains Ogallala Aquifer (child) + # parent_id = Column(Integer, ForeignKey('aquifer_system.id')) + # parent = relationship('AquiferSystem', remote_side=[id], backref='subsystems') + + # --- Relationships --- + # One-To-Many: An AquiferSystem can be associated with many wells (Things) via the ThingAquiferAssociation join table. + thing_associations: Mapped[List["ThingAquiferAssociation"]] = relationship( + "ThingAquiferAssociation", + back_populates="aquifer_system", + cascade="all, delete-orphan", + passive_deletes=True, + ) + + # One-To-Many: An AquiferSystem can be the target for many individual WellScreens. NOTE(review): with cascade="all, delete-orphan", deleting an AquiferSystem presumably deletes its WellScreen rows as well; confirm that is intended. + well_screens: Mapped[List["WellScreen"]] = relationship( + "WellScreen", + back_populates="aquifer_system", + cascade="all, delete-orphan", + passive_deletes=True, + ) + + # --- Association Proxies --- + # Proxy to directly access Things (wells) associated with this AquiferSystem. + things: AssociationProxy[List["Thing"]] = association_proxy( + "thing_associations", "thing" + ) + # Proxy to directly access all AquiferTypes associated with this AquiferSystem. 
NOTE(review): the proxied attribute 'aquifer_types' appears to be a collection on each association, so this presumably yields one collection per association rather than a flat list of AquiferType; confirm. 
+ aquifer_types: AssociationProxy[List["AquiferType"]] = association_proxy( + "thing_associations", "aquifer_types" + ) + + # --- Table Arguments --- NOTE(review): name is already declared unique=True; confirm this separate index is still needed. + __table_args__ = (Index("ix_aquifersystem_name", "name"),) diff --git a/db/aquifer_type.py b/db/aquifer_type.py new file mode 100644 index 000000000..32900d801 --- /dev/null +++ b/db/aquifer_type.py @@ -0,0 +1,58 @@ +""" +SQLAlchemy model for the AquiferType table. + +This table stores the specific aquifer characteristics/types associated with +a Thing-AquiferSystem relationship. It allows capturing that a single aquifer +can have multiple characteristics simultaneously. + +Example: + A well in the "Ogallala" aquifer might tap portions that are both + "Fractured" AND "Confined". This would create: + - One AquiferSystem: "Ogallala" + - One ThingAquiferAssociation: linking well to Ogallala + - Two AquiferType records: "Fractured" and "Confined" +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term + +if TYPE_CHECKING: + from db.thing_aquifer_association import ThingAquiferAssociation + + +class AquiferType(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents the specific aquifer types/characteristics for a + Thing-AquiferSystem association. 
+ + This allows modeling the fact that: + - A single aquifer can have multiple characteristics + - Different wells may tap different characteristics of the same aquifer + - Characteristics are attributes of the relationship, not the aquifer itself + + Fields from WellData CSV: + - AquiferType: May contain multiple codes (e.g., "FC" = Fractured + Confined) + - Each code becomes a separate AquiferType record + """ + + # --- Columns --- + thing_aquifer_association_id: Mapped[int] = mapped_column( + ForeignKey("thing_aquifer_association.id", ondelete="CASCADE"), + nullable=False, + comment="Links to the Thing-Aquifer association this type describes.", + ) + aquifer_type: Mapped[str] = lexicon_term( + nullable=False, + comment="Controlled vocabulary for aquifer hydrologic properties. " + "Examples: 'Unconfined', 'Confined', 'Perched', 'Fractured', 'Unconsolidated'.", + ) + + # --- Relationships --- + # Many-to-One: Multiple aquifer types can belong to one association + thing_aquifer_association: Mapped["ThingAquiferAssociation"] = relationship( + "ThingAquiferAssociation", back_populates="aquifer_types" + ) diff --git a/db/contact.py b/db/contact.py index 7855814fb..558724df9 100644 --- a/db/contact.py +++ b/db/contact.py @@ -26,7 +26,7 @@ from db.field import FieldEventParticipant, FieldEvent from db.thing import Thing from db.publication import Author, AuthorContactAssociation - from db.permission import Permission + from db.permission_history import PermissionHistory class ThingContactAssociation(Base, AutoBaseMixin): @@ -74,8 +74,10 @@ class Contact(Base, AutoBaseMixin, ReleaseMixin): ) # One-To-Many: A Contact can grant many Permissions. 
- permissions: Mapped[List["Permission"]] = relationship( - "Permission", back_populates="contact", cascade="all, delete, delete-orphan" + permissions: Mapped[List["PermissionHistory"]] = relationship( + "PermissionHistory", + back_populates="contact", + cascade="all, delete, delete-orphan", ) # One-To-Many: A Contact can be associated with many Authors (in Publications). author_associations: Mapped[List["AuthorContactAssociation"]] = relationship( diff --git a/db/data_provenance.py b/db/data_provenance.py index 06c468c8d..20505d94c 100644 --- a/db/data_provenance.py +++ b/db/data_provenance.py @@ -19,7 +19,7 @@ from sqlalchemy import Integer, Index, and_ from sqlalchemy.orm import relationship, Mapped, mapped_column, declared_attr, foreign -from db.base import Base, AutoBaseMixin, ReleaseMixin, pascal_to_snake +from db.base import Base, AutoBaseMixin, ReleaseMixin from db import lexicon_term @@ -53,9 +53,13 @@ class DataProvenance(AutoBaseMixin, ReleaseMixin, Base): ) # Values from the following NMAquifer tables are included as `origin_source` terms in the lexicon: # 'LU_DataSource', 'LU_Depth_CompletionSource'. - origin_source: Mapped[str] = lexicon_term( + origin_type: Mapped[str] = lexicon_term( nullable=True, - comment="Indicates the origin source of the data (e.g'Driller's Log', 'Well Report'.", + comment="Indicates the type of origin the data (e.g'Driller's Log', 'Well Report'.", + ) + origin_source: Mapped[str] = mapped_column( + nullable=True, + comment="The specific source of the data (e.g., 'J. Brown Thesis, \"I like APIs\", Pomona College, 1994').", ) # Values from the following NMAquifer tables are included as `collection_method` terms in the lexicon: # 'LU_AltitudeMethod','LU_CoordinateMethod'. 
@@ -116,7 +120,7 @@ def data_provenance(cls): "DataProvenance", primaryjoin=and_( cls.id == foreign(DataProvenance.target_id), - DataProvenance.target_table == pascal_to_snake(cls.__name__), + DataProvenance.target_table == cls.__tablename__, ), lazy="selectin", viewonly=True, diff --git a/db/geologic_formation.py b/db/geologic_formation.py new file mode 100644 index 000000000..2379f50f4 --- /dev/null +++ b/db/geologic_formation.py @@ -0,0 +1,82 @@ +""" +SQLAlchemy model for the GeologicFormation table. + +This table is a master reference table for geologic formations. Its purpose is to store definitions and descriptions +of various geologic formations that can be referenced by other tables in the database. +""" + +from typing import List, TYPE_CHECKING + +from sqlalchemy import Text, Index +from sqlalchemy.orm import relationship, Mapped, mapped_column +from sqlalchemy.ext.associationproxy import association_proxy, AssociationProxy +from geoalchemy2 import Geometry + +from db.base import Base, AutoBaseMixin, ReleaseMixin +from db.lexicon import lexicon_term + +from constants import SRID_WGS84 + +if TYPE_CHECKING: + from db.thing import Thing, WellScreen + from db.thing_geologic_formation_association import ( + ThingGeologicFormationAssociation, + ) + + +class GeologicFormation(Base, AutoBaseMixin, ReleaseMixin): + __versioned__ = {} + + # TODO: Let the API map formation codes to names using a formations.json file that can be periodically updated + # from the authoritative source (e.g., USGS). A placeholder `formations.json` file has been added to the `core` + # directory. 
+ # name: Mapped[str] = mapped_column( + # nullable=False, + # unique=True, + # comment="The full, human-readable name of the geologic formation (e.g., 'Navajo Sandstone').", + # ) + formation_code: Mapped[str] = lexicon_term( + nullable=True, + unique=True, + comment="A short code or abbreviation for the geologic formation (e.g., '120ELRT').", + ) + description: Mapped[str] = mapped_column( + Text, + nullable=True, + comment="A detailed description of the geologic formation, its characteristics, and its significance.", + ) + # TODO: Implement controlled vocabularies for `lithology` using NMAquifer's 'LU_Lithology' table. + # This should be implemented after AMMP reviews and cleans up their formation terms and codes. + lithology: Mapped[str] = lexicon_term( + nullable=True, + comment="A controlled vocabulary for the primary, dominant rock type" + "(e.g., 'Tuff', 'Sandstone', 'Alluvium', 'Shale').", + ) + boundary: Mapped[Geometry] = mapped_column( + Geometry(geometry_type="MULTIPOLYGON", srid=SRID_WGS84, spatial_index=True), + nullable=True, + comment="A spatial representation of the geologic formation's extent.", + ) + + # --- Relationships --- + # One-To-Many (Association Object): A GeologicFormation can be associated with many Things (e.g., wells) via the + # ThingGeologicFormationAssociation join table. + thing_associations: Mapped[List["ThingGeologicFormationAssociation"]] = ( + relationship( + "ThingGeologicFormationAssociation", + back_populates="geologic_formation", + cascade="all, delete-orphan", + passive_deletes=True, + ) + ) + # One-To-Many: A GeologicFormation can have many physical WellScreens installed in it. + well_screens: Mapped[List["WellScreen"]] = relationship( + "WellScreen", back_populates="geologic_formation", passive_deletes=True + ) + + # --- Association Proxies --- + # Provides direct access to Things (wells) that penetrate this formation. 
+ things: AssociationProxy["Thing"] = association_proxy("thing_associations", "thing") + + # --- Table Arguments --- + __table_args__ = (Index("ix_geologicformation_formation_code", "formation_code"),) diff --git a/db/location.py b/db/location.py index 50b1aa0db..fda4611f9 100644 --- a/db/location.py +++ b/db/location.py @@ -23,6 +23,7 @@ String, ForeignKey, DateTime, + Date, func, Text, ) @@ -61,6 +62,18 @@ class Location(Base, AutoBaseMixin, ReleaseMixin, NotesMixin, DataProvenanceMixi nma_notes_location: Mapped[str] = mapped_column(Text, nullable=True) nma_coordinate_notes: Mapped[str] = mapped_column(Text, nullable=True) + # --- AMPAPI Date Fields (Migration-Only, Read-Only Post-Migration) --- + nma_date_created: Mapped[datetime.date] = mapped_column( + Date, + nullable=True, + comment="Original AMPAPI DateCreated (read-only, populated only during migration)", + ) + nma_site_date: Mapped[datetime.date] = mapped_column( + Date, + nullable=True, + comment="Original AMPAPI SiteDate (read-only, populated only during migration)", + ) + # --- Relationship Definitions --- thing_associations: Mapped[list["LocationThingAssociation"]] = relationship( back_populates="location", cascade="all, delete-orphan" diff --git a/db/notes.py b/db/notes.py index ab8384064..0e2e8ab8b 100644 --- a/db/notes.py +++ b/db/notes.py @@ -97,7 +97,7 @@ def notes(cls): "Notes", primaryjoin=and_( cls.id == foreign(Notes.target_id), - Notes.target_table == cls.__name__, + Notes.target_table == cls.__tablename__, ), cascade="all, delete-orphan", lazy="selectin", @@ -120,7 +120,7 @@ def add_note( content=content, note_type=note_type, target_id=self.id, - target_table=self.__class__.__name__, + target_table=self.__class__.__tablename__, release_status=release_status, ) diff --git a/db/permission_history.py b/db/permission_history.py new file mode 100644 index 000000000..591046bba --- /dev/null +++ b/db/permission_history.py @@ -0,0 +1,96 @@ +""" +db/permission_history.py + +This model defines the 
`Permission` table, a polymorphic table that tracks +all legal and administrative agreements related to site access and activity. +Its purpose is to track who granted permission, what activities they authorized, +which entity the permission applies to, and for what period of time. +""" + +from typing import TYPE_CHECKING +from datetime import date +from sqlalchemy import Integer, ForeignKey, String, and_ +from sqlalchemy.orm import relationship, Mapped, mapped_column, declared_attr, foreign + +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term + + +if TYPE_CHECKING: + from db.contact import Contact + from db.thing import Thing + from db.location import Location + + +class PermissionHistory(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents a specific grant of permission from a Contact for a + specific entity (e.g., a Thing or Location). + """ + + # --- Foreign Keys --- + contact_id: Mapped[int] = mapped_column( + Integer, ForeignKey("contact.id", ondelete="CASCADE"), nullable=False + ) + + # --- Columns --- + permission_type: Mapped[str] = lexicon_term(nullable=False) + permission_allowed: Mapped[bool] = mapped_column(nullable=False, default=False) + start_date: Mapped[date] = mapped_column(nullable=False) + end_date: Mapped[date] = mapped_column(nullable=True) + notes: Mapped[str] = mapped_column(nullable=True) + + # --- Polymorphic Columns --- + target_id: Mapped[int] = mapped_column(nullable=False) + target_table: Mapped[str] = mapped_column(String(50), nullable=False) + + # --- Relationships --- + # Many-To-One: A Permission is granted by one Contact. + contact: Mapped["Contact"] = relationship("Contact", back_populates="permissions") + + # --- Polymorphic Parent Relationships (Internal) --- + # These are view-only relationships used by the 'target' property below. + # They tell SQLAlchemy exactly how to find the specific parent record for a given child. 
+ _thing_target: Mapped["Thing"] = relationship( + "Thing", + primaryjoin="and_(foreign(PermissionHistory.target_id) == Thing.id, " + "PermissionHistory.target_table == 'thing')", + viewonly=True, + ) + _location_target: Mapped["Location"] = relationship( + "Location", + primaryjoin="and_(foreign(PermissionHistory.target_id) == Location.id, " + "PermissionHistory.target_table == 'location')", + viewonly=True, + ) + + @property + def target(self): + """ + A generic property to get the parent object (Thing, Location, etc.). + This is useful for simplifying application code by providing a single, + consistent way to access the parent of a polymorphic record. + """ + return getattr(self, f"_{self.target_table}_target") + + +class PermissionHistoryMixin: + """ + Mixin for models that can have permissions (e.g., Thing, Location). + It automatically creates a polymorphic One-to-Many relationship to the + Permission table. + """ + + @declared_attr + def permission_history(cls): + # One-to-Many polymorphic relationship + return relationship( + "PermissionHistory", + primaryjoin=( + and_( + cls.id == foreign(PermissionHistory.target_id), + PermissionHistory.target_table == cls.__tablename__, + ) + ), + lazy="selectin", + viewonly=True, + ) diff --git a/db/status_history.py b/db/status_history.py index 8b3ee2321..15b5aec2f 100644 --- a/db/status_history.py +++ b/db/status_history.py @@ -19,7 +19,7 @@ ) from sqlalchemy.orm import Mapped, mapped_column, declared_attr, relationship, foreign -from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term, pascal_to_snake +from db.base import Base, AutoBaseMixin, ReleaseMixin, lexicon_term class StatusHistory(Base, AutoBaseMixin, ReleaseMixin): @@ -47,7 +47,7 @@ def status_history(cls): "StatusHistory", primaryjoin=and_( cls.id == foreign(StatusHistory.target_id), - StatusHistory.target_table == pascal_to_snake(cls.__name__), + StatusHistory.target_table == cls.__tablename__, ), cascade="all, delete-orphan", lazy="selectin", 
diff --git a/db/thing.py b/db/thing.py index 9f30d08e2..92c7bd942 100644 --- a/db/thing.py +++ b/db/thing.py @@ -26,8 +26,9 @@ AutoBaseMixin, Base, ReleaseMixin, - PermissionMixin, ) +from db.permission_history import PermissionHistoryMixin +from services.util import retrieve_latest_polymorphic_history_table_record from db.status_history import StatusHistoryMixin from db.measuring_point_history import MeasuringPointHistory from db.data_provenance import DataProvenanceMixin @@ -40,6 +41,12 @@ from db.sensor import Sensor from db.contact import Contact from db.group import Group, GroupThingAssociation + from db.aquifer_system import AquiferSystem + from db.thing_aquifer_association import ThingAquiferAssociation + from db.geologic_formation import GeologicFormation + from db.thing_geologic_formation_association import ( + ThingGeologicFormationAssociation, + ) class Thing( @@ -47,7 +54,7 @@ class Thing( AutoBaseMixin, ReleaseMixin, StatusHistoryMixin, - PermissionMixin, + PermissionHistoryMixin, DataProvenanceMixin, NotesMixin, ): @@ -64,10 +71,6 @@ class Thing( comment="To audit where the data came from in NM_Aquifer if it was transferred over", ) - # notes = mapped_column(Text, nullable=True) - # measuring_notes = mapped_column(Text, nullable=True) - # water_notes = mapped_column(Text, nullable=True) - # TODO: should `name` be unique? name: Mapped[str] = mapped_column( nullable=False, @@ -116,6 +119,32 @@ class Thing( well_construction_notes: Mapped[str] = mapped_column(Text, nullable=True) + well_completion_date: Mapped[date] = mapped_column( + nullable=True, comment="the date the well was completed if known" + ) + well_driller_name: Mapped[str] = mapped_column( + String(200), nullable=True, comment="Name of the well driller." 
+ ) + well_construction_method: Mapped[str] = lexicon_term(nullable=True) + well_pump_type: Mapped[str] = lexicon_term(nullable=True) + well_pump_depth: Mapped[float] = mapped_column( + Float, + nullable=True, + info={"unit": "feet below ground surface"}, + comment="Depth of the well pump from ground surface to the pump intake (in feet).", + ) + formation_completion_code: Mapped[str] = lexicon_term( + nullable=True, + comment="The geologic formation in which the well was completed (from WellData.FormationZone). " + "This indicates the target formation for the well, not the full stratigraphic column. " + "For detailed depth-interval stratigraphy, see formation_associations.", + ) + # TODO: should this be required for every well in the database? AMMP review + is_suitable_for_datalogger: Mapped[bool] = mapped_column( + nullable=True, + comment="Indicates if the well is suitable for datalogger installation.", + ) + # Spring-related columns spring_type: Mapped[str] = lexicon_term( nullable=True, @@ -263,6 +292,26 @@ class Thing( lazy="joined", ) + # One-To-Many: A Thing can be associated with many AquiferSystems via the ThingAquiferAssociation join table. + aquifer_associations: Mapped[List["ThingAquiferAssociation"]] = relationship( + "ThingAquiferAssociation", + back_populates="thing", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) + + # Many-To-Many: A Thing can penetrate many GeologicFormations. 
+ formation_associations: Mapped[List["ThingGeologicFormationAssociation"]] = ( + relationship( + "ThingGeologicFormationAssociation", + back_populates="thing", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) + ) + # --- Association Proxies --- assets: AssociationProxy[list["Asset"]] = association_proxy( "asset_associations", "asset" @@ -288,6 +337,16 @@ class Thing( "group_associations", "group" ) + # Proxy to directly access AquiferSystems associated with this Thing + aquifer_systems: AssociationProxy[List["AquiferSystem"]] = association_proxy( + "aquifer_associations", "aquifer_system" + ) + + # Proxy to directly access the GeologicFormations penetrated by this Thing. + geologic_formations: AssociationProxy[List["GeologicFormation"]] = ( + association_proxy("formation_associations", "geologic_formation") + ) + # Full-text search vector search_vector = Column(TSVectorType("name", "well_construction_notes")) @@ -379,7 +438,48 @@ def measuring_point_description(self) -> str | None: @property def well_depth_source(self) -> str | None: - return self._get_data_provenance_attribute("well_depth", "origin_source") + return self._get_data_provenance_attribute("well_depth", "origin_type") + + @property + def well_completion_date_source(self) -> str | None: + return self._get_data_provenance_attribute( + "well_completion_date", "origin_type" + ) + + @property + def well_construction_method_source(self) -> str | None: + return self._get_data_provenance_attribute( + "well_construction_method", "origin_source" + ) + + @property + def aquifers(self) -> List[dict]: + """ + Returns a list of aquifer systems and their associated types for this Thing. + Each aquifer system is represented as a dictionary with its name and a list of types. 
+ """ + aquifer_list = [] + for association in self.aquifer_associations: + aquifer_info = { + "aquifer_system": association.aquifer_system.name, + "aquifer_types": [ + atype.aquifer_type for atype in association.aquifer_types + ], + } + aquifer_list.append(aquifer_info) + return aquifer_list + + @property + def permissions(self) -> list: + """ + Returns the associated permissions or an empty list. If there are no + associated permissions, an empty list is returned instead of None to + allow the API to serialize correctly (see schemas/thing.py). + """ + if self.permission_history: + return self.permission_history + else: + return [] class ThingIdLink(Base, AutoBaseMixin, ReleaseMixin): @@ -406,6 +506,12 @@ class WellScreen(Base, AutoBaseMixin, ReleaseMixin): thing_id: Mapped[int] = mapped_column( ForeignKey("thing.id", ondelete="CASCADE"), nullable=False ) + aquifer_system_id: Mapped[int] = mapped_column( + ForeignKey("aquifer_system.id", ondelete="SET NULL"), nullable=True + ) + geologic_formation_id: Mapped[int] = mapped_column( + ForeignKey("geologic_formation.id", ondelete="SET NULL"), nullable=True + ) screen_depth_top: Mapped[float] = mapped_column( info={"unit": "feet below ground surface"}, nullable=True ) @@ -423,6 +529,14 @@ class WellScreen(Base, AutoBaseMixin, ReleaseMixin): # Many-To-One: A WellScreen belongs to one Thing. 
thing: Mapped["Thing"] = relationship("Thing", back_populates="screens") + aquifer_system: Mapped["AquiferSystem"] = relationship( + "AquiferSystem", back_populates="well_screens", passive_deletes=True + ) + + geologic_formation: Mapped["GeologicFormation"] = relationship( + "GeologicFormation", back_populates="well_screens", passive_deletes=True + ) + class WellPurpose(Base, AutoBaseMixin, ReleaseMixin): """ diff --git a/db/thing_aquifer_association.py b/db/thing_aquifer_association.py new file mode 100644 index 000000000..cca5758a9 --- /dev/null +++ b/db/thing_aquifer_association.py @@ -0,0 +1,51 @@ +""" +SQLAlchemy model for the ThingAquiferAssociation table. + +This table is a join table (or "association object") whose purpose is to manage +the many-to-many relationship between a Thing and an AquiferSystem. +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey + +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin + +if TYPE_CHECKING: + from db.thing import Thing + from db.aquifer_system import AquiferSystem + from db.aquifer_type import AquiferType + + +class ThingAquiferAssociation(Base, AutoBaseMixin, ReleaseMixin): + """ + Represents the association of a Thing to an AquiferSystem. This is an Association Object. + """ + + thing_id: Mapped[int] = mapped_column( + ForeignKey("thing.id", ondelete="CASCADE"), nullable=False + ) + aquifer_system_id: Mapped[int] = mapped_column( + ForeignKey("aquifer_system.id", ondelete="CASCADE"), nullable=False + ) + + # --- Relationship Definitions --- + # Many-To-One: This association links to one Thing. + thing: Mapped["Thing"] = relationship( + "Thing", back_populates="aquifer_associations", lazy="joined" + ) + + # Many-To-One: This association links to one AquiferSystem. 
+ aquifer_system: Mapped["AquiferSystem"] = relationship( + "AquiferSystem", back_populates="thing_associations", lazy="joined" + ) + # One-To-Many: An association can have multiple aquifer types. + aquifer_types: Mapped[list["AquiferType"]] = relationship( + "AquiferType", + back_populates="thing_aquifer_association", + cascade="all, delete-orphan", + passive_deletes=True, + lazy="joined", + ) diff --git a/db/thing_geologic_formation_association.py b/db/thing_geologic_formation_association.py new file mode 100644 index 000000000..0707df269 --- /dev/null +++ b/db/thing_geologic_formation_association.py @@ -0,0 +1,60 @@ +""" +SQLAlchemy model for the ThingGeologicFormationAssociation table. + +This table is an association object that creates a many-to-many relationship between a Thing (well) and a +GeologicFormation. It stores the lithology for a well, detailing the depth intervals for each formation it penetrates. +""" + +from typing import TYPE_CHECKING + +from sqlalchemy import ForeignKey +from sqlalchemy.orm import relationship, Mapped, mapped_column + +from db.base import Base, AutoBaseMixin, ReleaseMixin + +if TYPE_CHECKING: + from db.thing import Thing + from db.geologic_formation import GeologicFormation + + +class ThingGeologicFormationAssociation(Base, AutoBaseMixin, ReleaseMixin): + """ + This is a join table (Association Object). It represents the association of a Thing to a + GeologicFormation at a specific depth interval. + """ + + # --- Foreign Keys --- + thing_id: Mapped[int] = mapped_column( + ForeignKey("thing.id", ondelete="CASCADE"), + nullable=False, + comment="The foreign key linking this record to the `Thing` table." + "Deleting a `Thing` will cascade and delete its formation log.", + ) + geologic_formation_id: Mapped[int] = mapped_column( + ForeignKey("geologic_formation.id", ondelete="SET NULL"), + nullable=True, + comment="The foreign key linking this record to the `GeologicFormation` table." 
+ "This is set to `SET NULL` on delete, as deleting a formation definition (a rare admin action)" + "should not delete the historical fact that a well had a pick at this depth.", + ) + + # Depth interval fields + top_depth: Mapped[float] = mapped_column( + nullable=False, + comment="The depth (in feet) to the top of the geologic formation, as measured from ground surface.", + ) + bottom_depth: Mapped[float] = mapped_column( + nullable=False, + comment="The depth (in feet) to the bottom of the geologic formation, as measured from ground surface.", + ) + + # --- Relationship Definitions --- + # Many-To-One: This association links to one Thing. + thing: Mapped["Thing"] = relationship( + "Thing", back_populates="formation_associations", lazy="joined" + ) + + # Many-To-One: This association links to one GeologicFormation. + geologic_formation: Mapped["GeologicFormation"] = relationship( + "GeologicFormation", back_populates="thing_associations", lazy="joined" + ) diff --git a/schemas/aquifer_system.py b/schemas/aquifer_system.py new file mode 100644 index 000000000..1e1961873 --- /dev/null +++ b/schemas/aquifer_system.py @@ -0,0 +1,51 @@ +from typing import List + +from pydantic import BaseModel +from schemas import BaseResponseModel +from schemas.validators import GeometryMixin +from core.enums import AquiferType, GeographicScale # Import specific Enums + + +# ------ CREATE ---------- +class CreateAquiferSystem(GeometryMixin): + """ + Schema for creating an aquifer system. + Used during data transfer and API creation. + """ + + name: str + description: str | None = None + primary_aquifer_type: AquiferType + geographic_scale: GeographicScale | None = None + # boundary field inherited from GeometryMixin + + +# ------ RESPONSE ---------- +class GeoJSONGeometry(BaseModel): + """ + Geometry schema for GeoJSON response. 
+ """ + + type: str = "MULTIPOLYGON" + coordinates: List[List[List[float]]] + + +class GeoJSONProperties(BaseResponseModel): + """ + Response schema for aquifer system details. + """ + + name: str + description: str | None = None + primary_aquifer_type: AquiferType + geographic_scale: GeographicScale | None + + +class AquiferSystemGeoJSONResponse(BaseModel): + """ + Response schema for aquifer system details. + """ + + type: str = "Feature" + geometry: GeoJSONGeometry + properties: GeoJSONProperties diff --git a/schemas/geologic_formation.py b/schemas/geologic_formation.py new file mode 100644 index 000000000..67a3cb24a --- /dev/null +++ b/schemas/geologic_formation.py @@ -0,0 +1,88 @@ +from typing import List + +from pydantic import BaseModel, field_validator, Field + +from schemas import BaseResponseModel +from schemas.validators import DepthIntervalMixin, GeometryMixin +from core.enums import FormationCode, Lithology + + +# ------ CREATE ---------- +class CreateGeologicFormation(GeometryMixin): + """ + Schema for creating a geologic formation. + Used during data transfer and API creation. + """ + + # formation_code has its own custom uppercase validator + formation_code: FormationCode | None = None + description: str | None = None + lithology: Lithology | None = None + # boundary: inherited from GeometryMixin + + @field_validator("formation_code", mode="before") + @classmethod + def upper_case_code(cls, v: str | None) -> str | None: + """ + Automatically uppercase the formation code. + """ + if isinstance(v, str): + return v.upper() + return v + + +class CreateThingGeologicFormationAssociation(DepthIntervalMixin): + """ + Schema for linking a Thing (Well) to a GeologicFormation. + Uses DepthIntervalMixin to enforce bottom_depth > top_depth. 
+ """ + + thing_id: int + geologic_formation_id: int + top_depth: float = Field(ge=0) + bottom_depth: float = Field(ge=0) + + +# ------ RESPONSE ---------- +class GeoJSONGeometry(BaseModel): + """ + Geometry schema for GeoJSON response. + """ + + type: str = "MULTIPOLYGON" + coordinates: List[List[List[float]]] + + +class GeoJSONProperties(BaseResponseModel): + """ + Response schema for geologic formation details. + """ + + formation_code: str | None = None + description: str | None = None + lithology: str | None = None + + +class GeologicFormationGeoJSONResponse(BaseModel): + """ + Response schema for geologic formation details. + """ + + type: str = "Feature" + geometry: GeoJSONGeometry + properties: GeoJSONProperties + + +class ThingGeologicFormationAssociationResponse(BaseResponseModel): + """ + Response schema for the association between a Thing and a GeologicFormation. + Includes depth interval information. + """ + + thing_id: int + geologic_formation_id: int | None = None + geologic_formation: GeologicFormationGeoJSONResponse | None = None + top_depth: float + top_depth_unit: str = "ft" + bottom_depth: float + bottom_depth_unit: str = "ft" diff --git a/schemas/location.py b/schemas/location.py index e911e3359..17414b5c4 100644 --- a/schemas/location.py +++ b/schemas/location.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +from datetime import date from typing import List from geoalchemy2 import WKBElement @@ -106,6 +107,9 @@ class GeoJSONProperties(BaseModel): default_factory=GeoJSONUTMCoordinates ) notes: list[NoteResponse] = [] + # AMPAPI date fields (read-only, populated only during migration) + nma_date_created: date | None = None + nma_site_date: date | None = None model_config = ConfigDict( from_attributes=True, @@ -150,6 +154,9 @@ def populate_fields(cls, data: Any) -> Any: data_dict["properties"]["notes"] = data_dict.get("notes") data_dict["properties"]["elevation"] = convert_m_to_ft(elevation_m) data_dict["properties"]["elevation_method"] = data_dict.get("elevation_method") + # populate AMPAPI date fields + data_dict["properties"]["nma_date_created"] = data_dict.get("nma_date_created") + data_dict["properties"]["nma_site_date"] = data_dict.get("nma_site_date") # populate UTM coordinates point_utm_zone_13n_wkt = transform_srid( @@ -181,6 +188,10 @@ class LocationResponse(BaseResponseModel): county: str | None quad_name: str | None + # AMPAPI date fields (read-only, populated only during migration, not in Create/Update schemas) + nma_date_created: date | None = None + nma_site_date: date | None = None + @field_validator("point", mode="before") def point_to_wkt(cls, value): if isinstance(value, WKBElement): diff --git a/schemas/permission_history.py b/schemas/permission_history.py new file mode 100644 index 000000000..e0619d90e --- /dev/null +++ b/schemas/permission_history.py @@ -0,0 +1,18 @@ +from pydantic import BaseModel +from schemas import PastOrTodayDate + +from core.enums import PermissionType + + +# ------ RESPONSE ---------- +class PermissionHistoryResponse(BaseModel): + """ + Even though permission_allowed and start_date are not-nullable in the + database, they are nullable here to accommodate cases where no permission + record exists for a given permission type. 
+ """ + + permission_type: PermissionType + permission_allowed: bool | None + start_date: PastOrTodayDate | None + end_date: PastOrTodayDate | None diff --git a/schemas/thing.py b/schemas/thing.py index cf8c3ef2b..7a7982494 100644 --- a/schemas/thing.py +++ b/schemas/thing.py @@ -24,12 +24,17 @@ ScreenType, Organization, MonitoringFrequency, + Organization, + MonitoringFrequency, + WellConstructionMethod, + WellPumpType, + FormationCode, ) from schemas import BaseCreateModel, BaseUpdateModel, BaseResponseModel, PastOrTodayDate from schemas.group import GroupResponse from schemas.location import LocationGeoJSONResponse from schemas.notes import NoteResponse, CreateNote - +from schemas.permission_history import PermissionHistoryResponse # -------- VALIDATE ---------- @@ -128,8 +133,16 @@ class CreateWell(CreateBaseThing, ValidateWell): measuring_point_height: float = Field( ge=0, description="Measuring point height in feet" ) - measuring_point_description: str | None + measuring_point_description: str | None = None notes: list[CreateNote] | None = None + well_completion_date: PastOrTodayDate | None = None + well_completion_date_source: str | None = None + well_driller_name: str | None = None + well_construction_method: WellConstructionMethod | None = None + well_construction_method_source: str | None = None + well_pump_type: WellPumpType | None = None + is_suitable_for_datalogger: bool | None + formation_completion_code: FormationCode | None = None class CreateSpring(CreateBaseThing): @@ -146,6 +159,8 @@ class CreateWellScreen(BaseCreateModel): """ thing_id: int + aquifer_system_id: int | None = None + geologic_formation_id: int | None = None screen_depth_bottom: float = Field(gt=0, description="Screen depth bottom in feet") screen_depth_top: float = Field(gt=0, description="Screen depth top in feet") screen_type: ScreenType | None = None @@ -220,14 +235,25 @@ class WellResponse(BaseThingResponse): well_casing_depth_unit: str = "ft" well_casing_materials: 
list[CasingMaterial] = [] well_construction_notes: str | None = None + well_completion_date: PastOrTodayDate | None + well_completion_date_source: str | None + well_driller_name: str | None + well_construction_method: WellConstructionMethod | None + well_construction_method_source: str | None + well_pump_type: WellPumpType | None + well_pump_depth: float | None + well_pump_depth_unit: str = "ft" + is_suitable_for_datalogger: bool | None well_status: str | None measuring_point_height: float measuring_point_height_unit: str = "ft" measuring_point_description: str | None - + aquifers: list[dict] = [] water_notes: list[NoteResponse] | None = None measuring_notes: list[NoteResponse] | None = None general_notes: list[NoteResponse] | None = None + permissions: list[PermissionHistoryResponse] + formation_completion_code: FormationCode | None @field_validator("well_purposes", mode="before") def populate_well_purposes_with_strings(cls, well_purposes): @@ -248,6 +274,43 @@ def populate_well_casing_materials_with_strings(cls, well_casing_materials): materials = [] return materials + @field_validator("permissions", mode="before") + def populate_permission_history_with_latest_records(cls, permissions): + """ + Populate the permission history with the latest records for each + type of permission. If multiple records exist for the same permission type + only the most recent one is included. 
If there are no records + the permission_allowed will be None + """ + permissions_to_return = [] + for permission_type in [ + "Water Level Sample", + "Water Chemistry Sample", + "Datalogger Installation", + ]: + # Filter records for the current permission type + filtered_records = [ + record + for record in permissions + if record.permission_type == permission_type and record.end_date is None + ] + if filtered_records: + # Get the most recent record based on start_date + latest_record = max( + filtered_records, key=lambda record: record.start_date + ) + permissions_to_return.append(latest_record) + else: + permissions_to_return.append( + PermissionHistoryResponse( + permission_type=permission_type, + permission_allowed=None, + start_date=None, + end_date=None, + ) + ) + return permissions_to_return + class SpringResponse(BaseThingResponse): """ @@ -269,6 +332,11 @@ class WellScreenResponse(BaseResponseModel): thing_id: int thing: WellResponse + aquifer_system_id: int | None = None + aquifer_system: str | None = None + aquifer_type: str | None = None + geologic_formation_id: int | None = None + geologic_formation: str | None = None screen_depth_bottom: float screen_depth_bottom_unit: str = "ft" screen_depth_top: float @@ -276,6 +344,24 @@ class WellScreenResponse(BaseResponseModel): screen_type: str | None = None screen_description: str | None = None + @field_validator("aquifer_system", mode="before") + def populate_aquifer_system_with_name(cls, aquifer_system): + if aquifer_system is not None: + return aquifer_system.name + return None + + @field_validator("aquifer_type", mode="before") + def populate_aquifer_type_with_name(cls, aquifer_type): + if aquifer_type is not None: + return aquifer_type.name + return None + + @field_validator("geologic_formation", mode="before") + def populate_geologic_formation_with_code(cls, geologic_formation): + if geologic_formation is not None: + return geologic_formation.formation_code + return None + class 
GeoJSONGeometry(BaseModel): """ @@ -342,6 +428,8 @@ class UpdateThingIdLink(BaseUpdateModel): class UpdateWellScreen(BaseUpdateModel): + aquifer_system_id: int | None = None + geologic_formation_id: int | None = None screen_depth_bottom: float | None = None screen_depth_top: float | None = None screen_description: str | None = None diff --git a/schemas/validators.py b/schemas/validators.py new file mode 100644 index 000000000..963047bc2 --- /dev/null +++ b/schemas/validators.py @@ -0,0 +1,43 @@ +""" +schemas/validators.py +Reusable Pydantic validators and mixins for aquifer and geology related schemas. +May consider expansion for other domain models in the future. +""" + +from pydantic import model_validator, field_validator, BaseModel, Field +from services.validation.geospatial import validate_wkt_geometry + + +class DepthIntervalMixin(BaseModel): + """ + Mixin to enforce: + 1. Depths are non-negative (via Field constraints). + 2. Bottom depth > top depth (via model_validator). + Assumes the model has 'top_depth' and 'bottom_depth' fields. + """ + + top_depth: float = Field(ge=0) + bottom_depth: float = Field(ge=0) + + @model_validator(mode="after") + def check_depth_logical_order(self) -> "DepthIntervalMixin": + if self.bottom_depth <= self.top_depth: + raise ValueError( + f"Bottom depth ({self.bottom_depth}) must be greater " + f"than top depth ({self.top_depth})" + ) + return self + + +class GeometryMixin(BaseModel): + """ + Mixin to validate WKT strings for boundary fields. + Delegates logic to the validate_wkt_geometry service function. 
+ """ + + boundary: str | None = None + + @field_validator("boundary") + @classmethod + def validate_wkt(cls, v: str | None) -> str | None: + return validate_wkt_geometry(v) diff --git a/services/query_helper.py b/services/query_helper.py index 3f0e3dd24..970ad1720 100644 --- a/services/query_helper.py +++ b/services/query_helper.py @@ -25,18 +25,7 @@ from db import search as search_func from services.regex import QUERY_REGEX - - -def to_bool(value: str) -> bool | str: - """Convert a string to a boolean.""" - if isinstance(value, bool): - return value - if value.lower() in ("true", "1", "yes"): - return True - elif value.lower() in ("false", "0", "no"): - return False - - return value +from services.util import to_bool def make_where(col: Column, op: str, v: str) -> OperatorExpression: diff --git a/services/util.py b/services/util.py index 77cd5d5cd..a3ddcf472 100644 --- a/services/util.py +++ b/services/util.py @@ -1,17 +1,33 @@ import json +import os -from shapely.ops import transform -import pyproj import httpx +import pyproj +from shapely.ops import transform from sqlalchemy.orm import DeclarativeBase from constants import SRID_WGS84 - TRANSFORMERS = {} METERS_TO_FEET = 3.28084 +def to_bool(value: str) -> bool | str: + """Convert a string to a boolean.""" + if isinstance(value, bool): + return value + if value.lower() in ("true", "1", "yes"): + return True + elif value.lower() in ("false", "0", "no"): + return False + + return value + + +def get_bool_env(key, default=False): + return to_bool(os.getenv(key, default)) + + def transform_srid(geometry, source_srid, target_srid): """ geometry must be a shapely geometry object, like Point, Polygon, or MultiPolygon @@ -43,6 +59,13 @@ def convert_ft_to_m(feet: float | None) -> float | None: return round(feet / METERS_TO_FEET, 6) +def convert_m_to_ft(meters: float | None) -> float | None: + """Convert a length from meters to feet.""" + if meters is None: + return None + return round(meters * METERS_TO_FEET, 6) + + def 
get_tiger_data( lon: float, lat: float, layer: int, outfields: str = "*" ) -> dict | None: @@ -127,6 +150,7 @@ def get_epqs_elevation_from_point(lon: float, lat: float) -> float | None: try: data = resp.json() except json.decoder.JSONDecodeError: + print(f"Error decoding JSON from EPQS: {resp.text}") return None return data["value"] @@ -181,11 +205,10 @@ def retrieve_latest_polymorphic_history_table_record( DeclarativeBase | None The latest record from the specified polymorphic table with the defined type if it exists. """ - if polymorphic_relationship == "permissions": + if polymorphic_relationship == "permission_history": type_field = "permission_type" elif polymorphic_relationship == "status_history": type_field = "status_type" - polymorphic_records = getattr(target_record, polymorphic_relationship) type_polymorphic_records = [ r diff --git a/tests/features/environment.py b/tests/features/environment.py index 9b801e9d7..13bcdead3 100644 --- a/tests/features/environment.py +++ b/tests/features/environment.py @@ -28,12 +28,20 @@ Parameter, Deployment, TransducerObservationBlock, + WellCasingMaterial, + PermissionHistory, + Contact, StatusHistory, ThingIdLink, WellPurpose, MeasuringPointHistory, MonitoringFrequencyHistory, DataProvenance, + AquiferSystem, + AquiferType, + ThingAquiferAssociation, + GeologicFormation, + ThingGeologicFormationAssociation, ) from db.engine import session_ctx @@ -87,9 +95,13 @@ def add_well(context, session, location, name_num): well_construction_notes="Test well construction notes", well_casing_diameter=5.0, well_casing_depth=10.0, - # notes="These are some test well notes", - # measuring_notes="These are some measuring notes", - # water_notes="This are some water notes", + well_completion_date="2013-05-15", + well_driller_name="Jonsi", + well_construction_method="Driven", + well_pump_type="Submersible", + well_pump_depth=8, + is_suitable_for_datalogger=True, + formation_completion_code="000EXRV", ) session.add(well) @@ -116,6 +128,20 
@@ def add_well(context, session, location, name_num): return well +@add_context_object_container("well_casing_materials") +def add_well_casing_material(context, session, well): + wcm = WellCasingMaterial( + thing_id=well.id, + material="PVC", + ) + session.add(wcm) + session.commit() + session.refresh(wcm) + + context.objects["well_casing_materials"].append(wcm) + return wcm + + @add_context_object_container("well_purposes") def add_well_purpose(context, session, well, purpose_term): purpose = WellPurpose(thing=well, purpose=purpose_term) @@ -189,6 +215,54 @@ def add_spring(context, session, location, name_num): return spring +@add_context_object_container("contacts") +def add_contact(context, session): + contact = Contact( + name="Test Contact", + role="Software Developer", + organization="NMBGMR", + release_status="draft", + contact_type="Primary", + ) + session.add(contact) + session.commit() + session.refresh(contact) + + context.objects["contacts"].append(contact) + return contact + + +@add_context_object_container("permission_histories") +def add_permission_history( + context, + session, + contact_id, + permission_type, + permission_allowed, + start_date, + end_date, + notes, + target_id, + target_table, +): + permission_history = PermissionHistory( + contact_id=contact_id, + permission_type=permission_type, + permission_allowed=permission_allowed, + start_date=start_date, + end_date=end_date, + notes=notes, + target_id=target_id, + target_table=target_table, + ) + session.add(permission_history) + session.commit() + session.refresh(permission_history) + + context.objects["permission_histories"].append(permission_history) + return permission_history + + @add_context_object_container("sensors") def add_sensor(context, session): sensor = Sensor( @@ -317,7 +391,8 @@ def add_data_provenance( target_id, target_table, field_name, - origin_source, + origin_type=None, + origin_source=None, collection_method=None, accuracy_value=None, accuracy_unit=None, @@ -327,6 
+402,7 @@ def add_data_provenance( collection_method=collection_method, target_id=target_id, target_table=target_table, + origin_type=origin_type, origin_source=origin_source, accuracy_value=accuracy_value, accuracy_unit=accuracy_unit, @@ -353,10 +429,75 @@ def add_transducer_observation(context, session, block, deployment_id, value): return obs +@add_context_object_container("aquifer_systems") +def add_aquifer_system(context, session, name, well): + aquifer_system = AquiferSystem( + name=name, + description="this is a test aquifer", + primary_aquifer_type="Artesian", + geographic_scale="Major", + boundary="MULTIPOLYGON(((0 0, 1 1, 2 2, 3 3, 1 2, 0 0)))", + ) + session.add(aquifer_system) + session.commit() + session.refresh(aquifer_system) + + context.objects["aquifer_systems"].append(aquifer_system) + return aquifer_system + + +@add_context_object_container("thing_aquifer_associations") +def add_thing_aquifer_association(context, session, well, aquifer_system): + association = ThingAquiferAssociation(thing=well, aquifer_system=aquifer_system) + session.add(association) + session.commit() + session.refresh(association) + + context.objects["thing_aquifer_associations"].append(association) + return association + + +@add_context_object_container("aquifer_types") +def add_aquifer_type(context, session, aquifer_type_str, thing_aquifer_association): + aquifer_type = AquiferType( + aquifer_type=aquifer_type_str, + thing_aquifer_association=thing_aquifer_association, + ) + session.add(aquifer_type) + session.commit() + session.refresh(aquifer_type) + + context.objects["aquifer_types"].append(aquifer_type) + return aquifer_type + + +@add_context_object_container("geologic_formations") +def add_geologic_formation(context, session, formation_code, well): + formation = GeologicFormation( + formation_code=formation_code, + description="This is a test geologic formation.", + lithology="Peat", + boundary="MULTIPOLYGON(((0 0, 1 1, 2 2, 3 3, 1 2, 0 0)))", + ) + 
session.add(formation) + session.commit() + session.refresh(formation) + + association = ThingGeologicFormationAssociation( + top_depth=1, bottom_depth=10, thing=well, geologic_formation=formation + ) + session.add(association) + session.commit() + session.refresh(association) + + context.objects["geologic_formations"].append(formation) + return formation + + def before_all(context): context.objects = {} rebuild = False - # rebuild = True + rebuild = True if rebuild: erase_and_rebuild_db() @@ -374,133 +515,145 @@ def before_all(context): sensor_1 = add_sensor(context, session) deployment = add_deployment(context, session, well_1.id, sensor_1.id) - measuring_point_history_1 = add_measuring_point_history( - context, session, well=well_1 - ) - measuring_point_history_2 = add_measuring_point_history( - context, session, well=well_2 - ) - measuring_point_history_3 = add_measuring_point_history( - context, session, well=well_3 - ) - - well_status_1 = add_status_history( - context, - session, - status_type="Well Status", - status_value="Active, pumping well", - start_date=datetime(2020, 1, 1), - end_date=datetime(2021, 1, 1), - reason="Initial status", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - well_status_2 = add_status_history( - context, - session, - status_type="Well Status", - status_value="Destroyed, exists but not usable", - start_date=datetime(2021, 1, 1), - end_date=None, - reason="Roving bovine", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_status_1 = add_status_history( - context, - session, - status_type="Monitoring Status", - status_value="Currently monitored", - start_date=datetime(2020, 1, 1), - end_date=datetime(2021, 1, 1), - reason="Initial monitoring status", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_status_2 = add_status_history( - context, - session, - status_type="Monitoring Status", - status_value="Not currently monitored", - 
start_date=datetime(2021, 1, 1), - end_date=None, - reason="Roving bovine destroyed well", - target_id=context.objects["wells"][0].id, - target_table="thing", - ) - - monitoring_frequency_history_1 = add_monitoring_frequency_history( - context, - session, - well=well_1, - monitoring_frequency="Monthly", - start_date="2020-01-01", - end_date="2021-01-01", - ) - - monitoring_frequency_history_2 = add_monitoring_frequency_history( - context, - session, - well=well_1, - monitoring_frequency="Annual", - start_date="2020-01-01", - end_date=None, - ) - - id_link_1 = add_id_link( - context, - session, - thing=well_1, - relation="same_as", - alternate_id="12345678", - alternate_organization="USGS", - ) - - id_link_2 = add_id_link( - context, - session, - thing=well_1, - relation="same_as", - alternate_id="OSE-0001", - alternate_organization="NMOSE", - ) - - id_link_3 = add_id_link( - context, - session, - thing=well_1, - relation="same_as", - alternate_id="Roving Bovine Ranch Well #1", - alternate_organization="NMBGMR", - ) - - group = add_group(context, session, [well_1, well_2]) - - elevation_method = add_data_provenance( - context, - session, - target_id=loc_1.id, - target_table="location", - field_name="elevation", - origin_source="Private geologist, consultant or univ associate", - collection_method="LiDAR DEM", - ) - - well_depth_source = add_data_provenance( - context, - session, - target_id=well_1.id, - target_table="thing", - field_name="well_depth", - origin_source="Other", - ) + add_well_casing_material(context, session, well_1) + + contact = add_contact(context, session) + + for permission in [ + "Datalogger Installation", + "Water Level Sample", + "Water Chemistry Sample", + ]: + add_permission_history( + context, + session, + contact_id=context.objects["contacts"][0].id, + permission_type=permission, + permission_allowed=True, + start_date=datetime(2025, 1, 1).date(), + end_date=None, + notes=f"Permission granted for {permission.lower()}.", + 
target_id=well_1.id, + target_table="thing", + ) + + for well in (well_1, well_2, well_3): + add_measuring_point_history(context, session, well=well) + for value, start, end, reason in ( + ( + "Active, pumping well", + datetime(2020, 1, 1), + datetime(2021, 1, 1), + "initial status", + ), + ( + "Destroyed, exists but not usable", + datetime(2021, 1, 1), + None, + "roving bovine", + ), + ): + add_status_history( + context, + session, + status_type="Well Status", + status_value=value, + start_date=start, + end_date=end, + reason=reason, + target_id=context.objects["wells"][0].id, + target_table="thing", + ) + + for value, start, end in ( + ("Currently monitored", datetime(2020, 1, 1), datetime(2021, 1, 1)), + ("Not currently monitored", datetime(2021, 1, 1), None), + ): + add_status_history( + context, + session, + status_type="Monitoring Status", + status_value=value, + start_date=start, + end_date=end, + reason="Initial monitoring status", + target_id=context.objects["wells"][0].id, + target_table="thing", + ) + + for f, start, end in ( + ("Monthly", "2020-01-01", "2021-01-01"), + ("Annual", "2020-01-01", None), + ): + add_monitoring_frequency_history( + context, + session, + well=well_1, + monitoring_frequency=f, + start_date=start, + end_date=end, + ) + + for aid, aorg in ( + ("12345678", "USGS"), + ("OSE-0001", "NMOSE"), + ("Roving Bovine Ranch Well #1", "NMBGMR"), + ): + add_id_link( + context, + session, + thing=well_1, + relation="same_as", + alternate_id=aid, + alternate_organization=aorg, + ) + + add_well_casing_material(context, session, well_1) + + add_group(context, session, [well_1, well_2]) + + for kwargs in ( + { + "target_id": loc_1.id, + "target_table": "location", + "field_name": "elevation", + "origin_source": "Private geologist, consultant or univ associate", + "collection_method": "LiDAR DEM", + }, + { + "target_id": well_1.id, + "target_table": "thing", + "field_name": "well_depth", + "origin_type": "Other", + }, + { + "target_id": well_1.id, + 
"target_table": "thing", + "field_name": "well_completion_date", + "origin_type": "Data Portal", + }, + { + "target_id": well_1.id, + "target_table": "thing", + "field_name": "well_construction_method", + "origin_source": "Jacob's 2013 Thesis", + }, + ): + add_data_provenance(context, session, **kwargs) for purpose in ["Domestic", "Irrigation"]: add_well_purpose(context, session, well_1, purpose) + for name in ["Aquifer A", "Aquifer B"]: + system = add_aquifer_system(context, session, name, well_1) + add_thing_aquifer_association(context, session, well_1, system) + + for t in ["Artesian", "Fractured"]: + taa = context.objects["thing_aquifer_associations"][0] + add_aquifer_type(context, session, t, taa) + + add_geologic_formation(context, session, "000EXRV", well_1) + # parameter ID can be hardcoded because init_parameter always creates the same one parameter = session.get(Parameter, 1) block = add_block(context, session, parameter) @@ -519,8 +672,10 @@ def before_all(context): def after_all(context): with session_ctx() as session: for table in context.objects.values(): - for obj in table: - session.delete(obj) + for record in table: + obj = session.get(record.__class__, record.id) + if obj: + session.delete(obj) session.commit() diff --git a/tests/features/post-migration-legacy-data-retrieval.feature b/tests/features/post-migration-legacy-data-retrieval.feature new file mode 100644 index 000000000..13b2b347d --- /dev/null +++ b/tests/features/post-migration-legacy-data-retrieval.feature @@ -0,0 +1,94 @@ +Feature: Post-Migration AMPAPI Date Field Retrieval + As a data manager + After migrating data from AMPAPI to NMSampleLocations + I want to verify that all AMPAPI temporal information is preserved and queryable + So that no historical context is lost + + Background: + Given a functioning api + And the AMPAPI data has been migrated to the database + + # Location AMPAPI Date Lookups (Read-Only Fields) + + Scenario: Retrieve location with both AMPAPI date fields via 
API + Given a location exists with: + | field | value | + | nma_date_created | 2014-04-03 | + | nma_site_date | 2002-12-10 | + When I retrieve that location via the API + Then the response should include nma_date_created as "2014-04-03" + And the response should include nma_site_date as "2002-12-10" + And the time gap should be approximately 11.3 years + + Scenario: Retrieve location with large time gap (54 years) + Given a location exists with: + | field | value | + | nma_date_created | 2008-05-28 | + | nma_site_date | 1954-05-01 | + When I retrieve that location via the API + Then the response should include nma_date_created as "2008-05-28" + And the response should include nma_site_date as "1954-05-01" + And the time gap should be approximately 54 years + + Scenario: List all locations includes AMPAPI date fields + Given 5 locations exist with various AMPAPI dates + When I GET /location to list all locations + Then each location should have a date created field + And each location should have a site date field + And some locations should have null site date + + Scenario: Filter locations by AMPAPI site date range + Given locations exist with nma_site_date ranging from 1950 to 2024 + When I filter locations where nma_site_date is between "2000-01-01" and "2010-12-31" + Then the response should only include locations with site date in that decade + And locations with site date before 2000 should not be included + And locations with site date after 2010 should not be included + + Scenario: Query location by nma_date_created + Given 3 locations exist with nma_date_created "2014-04-03" + And 2 locations exist with nma_date_created "2017-12-06" + When I query for locations with nma_date_created "2014-04-03" + Then the response should include exactly 3 locations + And all should have nma_date_created "2014-04-03" + + # Data Quality Validation + + Scenario: Verify migration preserved expected percentage of AMPAPI dates + Given 100 locations were migrated + And 9 of them 
had non-null SiteDate in AMPAPI + When I query the migrated locations + Then 9% should have non-null nma_site_date + And 100% should have non-null nma_date_created + + # Audit Trail Verification + + Scenario: AMPAPI dates preserved alongside audit timestamps + Given a location was migrated with AMPAPI dates + When I retrieve that location + Then it should have created_at (new system timestamp from migration) + And it should have nma_date_created (original AMPAPI DateCreated) + And it should have nma_site_date (original AMPAPI SiteDate) + And all three timestamps should be independently queryable + And created_at should be a recent timestamp + And nma_date_created should be an older date + + # Edge Cases + + Scenario: Location where SiteDate is later than DateCreated (data anomaly) + Given a location exists with: + | field | value | + | nma_date_created | 2010-01-15 | + | nma_site_date | 2015-06-20 | + When I retrieve that location + Then nma_date_created should be "2010-01-15" + And nma_site_date should be "2015-06-20" + And the system should accept this without error + + Scenario: Location with only nma_date_created (no nma_site_date) + Given a location exists with: + | field | value | + | nma_date_created | 2014-10-17 | + | nma_site_date | null | + When I retrieve that location + Then nma_date_created should be "2014-10-17" + And nma_site_date should be null diff --git a/tests/features/steps/post_migration_legacy_data.py b/tests/features/steps/post_migration_legacy_data.py new file mode 100644 index 000000000..185b1a758 --- /dev/null +++ b/tests/features/steps/post_migration_legacy_data.py @@ -0,0 +1,453 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +from datetime import date, datetime, timezone +from behave import given, when, then, register_type +from behave.runner import Context +import parse + +from db import Location, Thing, LocationThingAssociation +from db.engine import session_ctx + + +# Custom type parsers +@parse.with_pattern(r"\d+") +def parse_number(text): + return int(text) + + +register_type(Number=parse_number) + + +def create_test_location(nma_date_created=None, nma_site_date=None): + """Helper to create a test location with AMPAPI date fields (read-only post-migration).""" + with session_ctx() as session: + location = Location( + point="POINT(-106.607784 35.118924)", + elevation=1558.8, + release_status="public", + nma_date_created=nma_date_created, + nma_site_date=nma_site_date, + ) + session.add(location) + session.commit() + session.refresh(location) + return location + + +@given("the AMPAPI data has been migrated to the database") +def step_given_data_migrated(context: Context): + """Assumption that migration has occurred.""" + context.migrated = True + + +@given("a location exists with") +def step_given_location_with_table(context: Context): + """Create location with fields from table.""" + data = {row["field"]: row["value"] for row in context.table} + + nma_date_created = ( + date.fromisoformat(data["nma_date_created"]) + if data.get("nma_date_created") and data["nma_date_created"] != "null" + else None + ) + nma_site_date = ( + date.fromisoformat(data["nma_site_date"]) + if 
data.get("nma_site_date") and data["nma_site_date"] != "null" + else None + ) + + location = create_test_location( + nma_date_created=nma_date_created, nma_site_date=nma_site_date + ) + + context.test_location = location + context.test_location_id = location.id + + +@given("{count:Number} locations exist with various legacy dates") +def step_given_multiple_locations(context: Context, count: int): + """Create multiple locations with various legacy dates.""" + context.test_locations = [] + + test_data = [ + ("2014-04-03", "2002-12-10"), + ("2014-04-03", "2003-01-07"), + ("2017-12-06", "2003-12-11"), + ("2008-05-28", "1954-05-01"), + ("2020-01-15", None), + ] + + for i in range(min(count, len(test_data))): + created_date, site_date = test_data[i] + location = create_test_location( + nma_date_created=date.fromisoformat(created_date), + nma_site_date=(date.fromisoformat(site_date) if site_date else None), + ) + context.test_locations.append(location) + + +@given( + "locations exist with nma_site_date ranging from {start_year:Number} to {end_year:Number}" +) +def step_given_locations_date_range(context: Context, start_year: int, end_year: int): + """Create locations with nma_site_date across a date range.""" + context.test_locations = [] + + years = [1954, 2002, 2003, 2010, 2015, 2020, 2024] + for year in years: + location = create_test_location( + nma_date_created=date(year + 5, 1, 1), # Always 5 years after site date + nma_site_date=date(year, 6, 15), + ) + context.test_locations.append(location) + + +@given('{count:Number} locations exist with nma_date_created "{target_date}"') +def step_given_locations_with_specific_date( + context: Context, count: int, target_date: str +): + """Create locations with specific nma_date_created.""" + if not hasattr(context, "test_locations"): + context.test_locations = [] + + target = date.fromisoformat(target_date) + + for i in range(count): + location = create_test_location( + nma_date_created=target, + nma_site_date=date(2000 + i, 
1, 1), # Vary the site dates + ) + context.test_locations.append(location) + + +@given("{count:Number} locations were migrated") +def step_given_count_locations_migrated(context: Context, count: int): + """Create specified number of test locations.""" + context.test_locations = [] + + for i in range(count): + # 9% have nma_site_date + has_site_date = i < count * 0.09 + + location = create_test_location( + nma_date_created=date(2014, 1, i % 28 + 1), + nma_site_date=date(2003, 1, i % 28 + 1) if has_site_date else None, + ) + context.test_locations.append(location) + + +@given("{count:Number} of them had non-null SiteDate in AMPAPI") +def step_given_sitedate_count(context: Context, count: int): + """Declarative - data created in previous step.""" + pass + + +@given("a location was migrated with legacy dates") +def step_given_location_migrated_with_dates(context: Context): + """Create location with both legacy dates.""" + location = create_test_location( + nma_date_created=date(2014, 4, 3), nma_site_date=date(2002, 12, 10) + ) + context.test_location = location + + +# WHEN steps + + +@when("I retrieve that location via the API") +def step_when_retrieve_location_api(context: Context): + """Retrieve location via GET API.""" + response = context.client.get(f"/location/{context.test_location_id}") + assert response.status_code == 200 + context.location_response = response.json() + + +@when("I GET /location to list all locations") +def step_when_get_all_locations(context: Context): + """Get all locations.""" + response = context.client.get("/location") + assert response.status_code == 200 + context.locations_response = response.json() + + +@when( + 'I filter locations where nma_site_date is between "{start_date}" and "{end_date}"' +) +def step_when_filter_locations(context: Context, start_date: str, end_date: str): + """Filter locations by date range.""" + # Since API may not support this yet, query database directly + with session_ctx() as session: + start = 
date.fromisoformat(start_date) + end = date.fromisoformat(end_date) + + locations = ( + session.query(Location) + .filter(Location.nma_site_date >= start, Location.nma_site_date <= end) + .all() + ) + + context.filtered_locations = locations + + +@when('I query for locations with nma_date_created "{target_date}"') +def step_when_query_by_ampapi_date(context: Context, target_date: str): + """Query locations by nma_date_created.""" + with session_ctx() as session: + target = date.fromisoformat(target_date) + locations = ( + session.query(Location).filter(Location.nma_date_created == target).all() + ) + context.queried_locations = locations + + +@when("I query the migrated locations") +def step_when_query_migrated_locations(context: Context): + """Query all test locations.""" + with session_ctx() as session: + # Query only our test locations + location_ids = [loc.id for loc in context.test_locations] + locations = session.query(Location).filter(Location.id.in_(location_ids)).all() + context.queried_locations = locations + + +@when("I retrieve that location") +def step_when_retrieve_location(context: Context): + """Retrieve location by ID.""" + with session_ctx() as session: + location = session.get(Location, context.test_location.id) + context.retrieved_location = location + + +# THEN steps + + +@then('the response should include nma_date_created as "{expected_date}"') +def step_then_nma_date_created(context: Context, expected_date: str): + """Assert nma_date_created matches.""" + actual = context.location_response.get("nma_date_created") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then('the response should include nma_site_date as "{expected_date}"') +def step_then_nma_site_date(context: Context, expected_date: str): + """Assert nma_site_date matches.""" + actual = context.location_response.get("nma_site_date") + assert actual == expected_date, f"Expected {expected_date}, got {actual}" + + +@then("the time gap should be 
approximately {years} years") +def step_then_time_gap_years(context: Context, years: str): + """Assert approximate year gap.""" + created_str = context.location_response.get("nma_date_created") + site_date_str = context.location_response.get("nma_site_date") + + if not created_str or not site_date_str: + raise AssertionError("Missing date fields for gap calculation") + + created_date = date.fromisoformat(created_str) + site_date = date.fromisoformat(site_date_str) + + gap_days = (created_date - site_date).days + gap_years = gap_days / 365.25 + + expected_years = float(years) + tolerance = 0.5 + assert ( + abs(gap_years - expected_years) < tolerance + ), f"Expected ~{expected_years} year gap, got {gap_years:.1f} years" + + +@then("each location should have a date created field") +def step_then_all_have_date_created_field(context: Context): + """Assert all locations have the date created field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "nma_date_created" in item, f"Location missing nma_date_created" + + +@then("each location should have a site date field") +def step_then_all_have_site_date_field(context: Context): + """Assert all locations have the site date field.""" + items = context.locations_response.get("items", []) + for item in items: + assert "nma_site_date" in item, f"Location missing nma_site_date" + + +@then("some locations should have null site date") +def step_then_some_null_site_date(context: Context): + """Assert some locations have null site date.""" + items = context.locations_response.get("items", []) + null_count = sum(1 for item in items if item.get("nma_site_date") is None) + assert null_count > 0, "Expected at least one location with null site date" + + +@then("the response should only include locations with site date in that decade") +def step_then_locations_in_decade(context: Context): + """Assert filtered locations are in range.""" + for loc in context.filtered_locations: + assert ( + 2000 <= 
loc.nma_site_date.year <= 2010 + ), f"Location not in 2000-2010: {loc.nma_site_date}" + + +@then("locations with site date before {year:Number} should not be included") +def step_then_locations_before_excluded(context: Context, year: int): + """Assert no locations before year.""" + for loc in context.filtered_locations: + assert ( + loc.nma_site_date.year >= year + ), f"Location from {loc.nma_site_date.year} should not be included" + + +@then("locations with site date after {year:Number} should not be included") +def step_then_locations_after_excluded(context: Context, year: int): + """Assert no locations after year.""" + for loc in context.filtered_locations: + assert ( + loc.nma_site_date.year <= year + ), f"Location from {loc.nma_site_date.year} should not be included" + + +@then("the response should include exactly {count:Number} locations") +def step_then_exact_count_locations(context: Context, count: int): + """Assert exact count.""" + actual = len(context.queried_locations) + assert actual == count, f"Expected {count} locations, got {actual}" + + +@then('all should have nma_date_created "{expected_date}"') +def step_then_all_have_date(context: Context, expected_date: str): + """Assert all have same date.""" + expected = date.fromisoformat(expected_date) + for loc in context.queried_locations: + assert ( + loc.nma_date_created == expected + ), f"Location has {loc.nma_date_created}, expected {expected}" + + +@then("{percentage:Number}% should have non-null nma_site_date") +def step_then_percentage_site_date(context: Context, percentage: int): + """Assert percentage with nma_site_date.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.nma_site_date) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("{percentage:Number}% should have non-null nma_date_created") +def 
step_then_percentage_legacy(context: Context, percentage: int): + """Assert percentage with nma_date_created.""" + total = len(context.queried_locations) + populated = sum(1 for loc in context.queried_locations if loc.nma_date_created) + actual_pct = (populated / total) * 100 + + tolerance = 2 + assert ( + abs(actual_pct - percentage) < tolerance + ), f"Expected ~{percentage}%, got {actual_pct:.1f}%" + + +@then("it should have created_at (new system timestamp from migration)") +def step_then_has_created_at(context: Context): + """Assert created_at exists.""" + assert context.retrieved_location.created_at is not None + + +@then("it should have nma_date_created (original AMPAPI DateCreated)") +def step_then_has_ampapi_date_created(context: Context): + """Assert nma_date_created exists.""" + assert context.retrieved_location.nma_date_created is not None + + +@then("it should have nma_site_date (original AMPAPI SiteDate)") +def step_then_has_site_date(context: Context): + """Assert nma_site_date exists.""" + assert context.retrieved_location.nma_site_date is not None + + +@then("all three timestamps should be independently queryable") +def step_then_all_queryable(context: Context): + """Assert all fields are queryable.""" + assert hasattr(context.retrieved_location, "created_at") + assert hasattr(context.retrieved_location, "nma_date_created") + assert hasattr(context.retrieved_location, "nma_site_date") + + +@then("created_at should be a recent timestamp") +def step_then_created_at_recent(context: Context): + """Assert created_at is recent.""" + created_at = context.retrieved_location.created_at + now = datetime.now(timezone.utc) + + # created_at should always be timezone-aware (configured in AutoBaseMixin with DateTime(timezone=True)) + # If it's naive, this indicates a database/ORM configuration issue + if created_at.tzinfo is None: + raise AssertionError( + "created_at is a naive datetime (no timezone info). 
" + "Check ORM/database config for timezone-aware UTC datetimes (see AutoBaseMixin.created_at)." + ) + + diff_seconds = abs((now - created_at).total_seconds()) + assert diff_seconds < 3600, "created_at should be within last hour" + + +@then("nma_date_created should be an older date") +def step_then_ampapi_date_older(context: Context): + """Assert nma_date_created is old.""" + ampapi_created_date = context.retrieved_location.nma_date_created + assert ampapi_created_date.year < 2024, "nma_date_created should be from the past" + + +@then('nma_date_created should be "{expected_date}"') +def step_then_ampapi_created_is(context: Context, expected_date: str): + """Assert nma_date_created value.""" + actual = context.retrieved_location.nma_date_created + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then('nma_site_date should be "{expected_date}"') +def step_then_site_date_is(context: Context, expected_date: str): + """Assert nma_site_date value.""" + actual = context.retrieved_location.nma_site_date + expected = date.fromisoformat(expected_date) + assert actual == expected, f"Expected {expected}, got {actual}" + + +@then("the system should accept this without error") +def step_then_no_error(context: Context): + """Assert no errors.""" + # If we got here, no errors + pass + + +@then("nma_site_date should be null") +def step_then_site_date_null(context: Context): + """Assert nma_site_date is null.""" + assert context.retrieved_location.nma_site_date is None + + +@then("the well should still be valid") +def step_then_well_valid(context: Context): + """Assert well is valid.""" + assert context.retrieved_well.id is not None + + +# ============= EOF ============================================= diff --git a/tests/features/steps/well-additional-information.py b/tests/features/steps/well-additional-information.py new file mode 100644 index 000000000..8b00f7eb7 --- /dev/null +++ 
b/tests/features/steps/well-additional-information.py @@ -0,0 +1,270 @@ +from behave import then + +from services.util import retrieve_latest_polymorphic_history_table_record + + +# ------------------------------------------------------------------------------ +# Permissions / Operational OK flags +# ------------------------------------------------------------------------------ +@then( + "the response should include whether repeat measurement permission is granted for the well" +) +def step_impl(context): + permission_type = "Water Level Sample" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +@then("the response should include whether sampling permission is granted for the well") +def step_impl(context): + permission_type = "Water Chemistry Sample" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == 
permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +@then( + "the response should include whether datalogger installation permission is granted for the well" +) +def step_impl(context): + permission_type = "Datalogger Installation" + assert "permissions" in context.water_well_data + + permission_record = retrieve_latest_polymorphic_history_table_record( + context.objects["wells"][0], "permission_history", permission_type + ) + + water_well_data_permissions = [ + p + for p in context.water_well_data["permissions"] + if p["permission_type"] == permission_type + ][0] + assert ( + water_well_data_permissions["permission_type"] + == permission_record.permission_type + ) + assert ( + water_well_data_permissions["permission_allowed"] + == permission_record.permission_allowed + ) + assert water_well_data_permissions[ + "start_date" + ] == permission_record.start_date.strftime("%Y-%m-%d") + if permission_record.end_date: + assert water_well_data_permissions[ + "end_date" + ] == permission_record.end_date.strftime("%Y-%m-%d") + else: + assert water_well_data_permissions["end_date"] is None + + +# ------------------------------------------------------------------------------ +# Well Construction Information +# ------------------------------------------------------------------------------ + + +@then("the response should include the completion date of the well") +def step_impl(context): + assert "well_completion_date" in context.water_well_data + assert context.water_well_data["well_completion_date"] == context.objects["wells"][ + 0 + 
].well_completion_date.strftime("%Y-%m-%d") + + +@then("the response should include the source of the completion information") +def step_impl(context): + assert "well_completion_date_source" in context.water_well_data + + assert ( + context.water_well_data["well_completion_date_source"] + == context.objects["wells"][0].well_completion_date_source + ) + + +@then("the response should include the driller name") +def step_impl(context): + assert "well_driller_name" in context.water_well_data + assert ( + context.water_well_data["well_driller_name"] + == context.objects["wells"][0].well_driller_name + ) + + +@then("the response should include the construction method") +def step_impl(context): + assert "well_construction_method" in context.water_well_data + assert ( + context.water_well_data["well_construction_method"] + == context.objects["wells"][0].well_construction_method + ) + + +@then("the response should include the source of the construction information") +def step_impl(context): + assert "well_construction_method_source" in context.water_well_data + assert ( + context.water_well_data["well_construction_method_source"] + == context.objects["wells"][0].well_construction_method_source + ) + + +# ------------------------------------------------------------------------------ +# Additional Well Physical Properties +# ------------------------------------------------------------------------------ + + +@then("the response should include the casing diameter in inches") +def step_impl(context): + assert "well_casing_diameter" in context.water_well_data + assert "well_casing_diameter_unit" in context.water_well_data + + assert ( + context.water_well_data["well_casing_diameter"] + == context.objects["wells"][0].well_casing_diameter + ) + assert context.water_well_data["well_casing_diameter_unit"] == "in" + + +@then("the response should include the casing depth in feet below ground surface") +def step_impl(context): + assert "well_casing_depth" in context.water_well_data + 
assert "well_casing_depth_unit" in context.water_well_data + + assert ( + context.water_well_data["well_casing_depth"] + == context.objects["wells"][0].well_casing_depth + ) + assert context.water_well_data["well_casing_depth_unit"] == "ft" + + +@then("the response should include the casing materials") +def step_impl(context): + assert "well_casing_materials" in context.water_well_data + assert set(context.water_well_data["well_casing_materials"]) == { + m.material for m in context.objects["wells"][0].well_casing_materials + } + + +@then("the response should include the well pump type (previously well_type field)") +def step_impl(context): + assert "well_pump_type" in context.water_well_data + assert ( + context.water_well_data["well_pump_type"] + == context.objects["wells"][0].well_pump_type + ) + + +@then("the response should include the well pump depth in feet (new field)") +def step_impl(context): + assert "well_pump_depth" in context.water_well_data + assert "well_pump_depth_unit" in context.water_well_data + + assert ( + context.water_well_data["well_pump_depth"] + == context.objects["wells"][0].well_pump_depth + ) + assert context.water_well_data["well_pump_depth_unit"] == "ft" + + +@then( + "the response should include whether the well is open and suitable for a datalogger" +) +def step_impl(context): + assert "is_suitable_for_datalogger" in context.water_well_data + assert ( + context.water_well_data["is_suitable_for_datalogger"] + == context.objects["wells"][0].is_suitable_for_datalogger + ) + + +# ------------------------------------------------------------------------------ +# Aquifer/ Geology Information +# ------------------------------------------------------------------------------ + + +@then( + "the response should include the formation as the formation zone of well completion" +) +def step_impl(context): + assert "formation_completion_code" in context.water_well_data + assert ( + context.water_well_data["formation_completion_code"] + == 
context.objects["wells"][0].formation_completion_code + ) + + +@then( + "the response should include the aquifer class code to classify the aquifer into aquifer system." +) +def step_impl(context): + for aquifer in context.water_well_data["aquifers"]: + assert "aquifer_system" in aquifer + assert {a.get("aquifer_system") for a in context.water_well_data["aquifers"]} == { + system.name for system in context.objects["aquifer_systems"] + } + + +@then( + "the response should include the aquifer type as the type of aquifers penetrated by the well" +) +def step_impl(context): + for aquifer in context.water_well_data["aquifers"]: + assert "aquifer_types" in aquifer + + if aquifer["aquifer_system"] == "Aquifer A": + assert set(aquifer["aquifer_types"]) == { + a.aquifer_type for a in context.objects["aquifer_types"] + } + else: + assert aquifer["aquifer_types"] == [] diff --git a/tests/features/steps/well-core-information.py b/tests/features/steps/well-core-information.py index b0adc8346..1f56161f6 100644 --- a/tests/features/steps/well-core-information.py +++ b/tests/features/steps/well-core-information.py @@ -163,7 +163,7 @@ def step_impl(context): and r.target_table == "thing" and r.target_id == context.objects["wells"][0].id ] - well_depth_source = well_depth_source_records[0].origin_source + well_depth_source = well_depth_source_records[0].origin_type assert context.water_well_data["well_depth_source"] == well_depth_source diff --git a/tests/test_location.py b/tests/test_location.py index 4b6ec6faa..9dcb3d098 100644 --- a/tests/test_location.py +++ b/tests/test_location.py @@ -235,4 +235,83 @@ def test_delete_location_404_not_found(second_location): assert data["detail"] == f"Location with ID {bad_location_id} not found." 
+# ============= AMPAPI date field tests =======================================
+
+
+def test_new_location_has_null_ampapi_fields():
+    """Test that newly created locations have null AMPAPI date fields (AMPAPI fields are migration-only)"""
+    payload = {
+        "point": "POINT (-106.607784 35.118924)",
+        "elevation": 1558.8,
+        "release_status": "draft",
+    }
+    response = client.post("/location", json=payload)
+
+    assert response.status_code == 201
+    data = response.json()
+    assert "id" in data
+    # AMPAPI date fields should be present in response but null (not set during creation, read-only)
+    assert "nma_date_created" in data
+    assert "nma_site_date" in data
+    assert data["nma_date_created"] is None
+    assert data["nma_site_date"] is None
+
+    # cleanup after test
+    cleanup_post_test(Location, data["id"])
+
+
+def test_ampapi_fields_present_in_location_response():
+    """Test that AMPAPI date fields (read-only) are included in location GET response"""
+    # Create a new location (without AMPAPI date fields set - they're read-only)
+    payload = {
+        "point": "POINT (-106.607784 35.118924)",
+        "elevation": 1558.8,
+        "release_status": "draft",
+    }
+    create_response = client.post("/location", json=payload)
+    assert create_response.status_code == 201
+    location_id = create_response.json()["id"]
+
+    # Retrieve the location and verify AMPAPI date fields are in the schema
+    get_response = client.get(f"/location/{location_id}")
+    assert get_response.status_code == 200
+    data = get_response.json()
+
+    # Verify read-only fields exist in response (even if null)
+    assert "nma_date_created" in data
+    assert "nma_site_date" in data
+    assert data["nma_date_created"] is None
+    assert data["nma_site_date"] is None
+
+    # cleanup after test
+    cleanup_post_test(Location, location_id)
+
+
+def test_ampapi_fields_independent_of_created_at():
+    """Test that created_at (system timestamp) is separate from AMPAPI date fields (read-only)"""
+    payload = {
+        "point": "POINT (-106.607784 35.118924)",
+        "elevation": 1558.8,
+        "release_status": "draft",
+    }
+    response = client.post("/location", json=payload)
+
+    assert response.status_code == 201
+    data = response.json()
+
+    # created_at is automatically set by AutoBaseMixin
+    assert "created_at" in data
+    assert data["created_at"] is not None
+
+    # nma_date_created is separate and null for new records (read-only, populated only during migration)
+    assert "nma_date_created" in data
+    assert data["nma_date_created"] is None
+
+    # These are independent fields: compare the returned values, not the key names
+    assert data["created_at"] != data["nma_date_created"]
+
+    # cleanup after test
+    cleanup_post_test(Location, data["id"])
+
+
 # ============= EOF =============================================
diff --git a/tests/test_thing.py b/tests/test_thing.py
index 28290dada..5bd504718 100644
--- a/tests/test_thing.py
+++ b/tests/test_thing.py
@@ -152,6 +152,9 @@ def test_add_water_well(location, group):
     cleanup_post_test(Thing, data["id"])
 
 
+@pytest.mark.skip(
+    "This duplicates the test above. That one will need to eventually be updated"
+)
 def test_add_water_well_with_measuring_point(location, group):
     """
     Test creating a well with measuring_point_height and measuring_point_description.
diff --git a/tests/test_transfer_legacy_dates.py b/tests/test_transfer_legacy_dates.py
new file mode 100644
index 000000000..985214fbb
--- /dev/null
+++ b/tests/test_transfer_legacy_dates.py
@@ -0,0 +1,354 @@
+# ===============================================================================
+# Copyright 2025 ross
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +# =============================================================================== +""" +Unit tests for AMPAPI date field population during AMPAPI → NMSampleLocations migration. + +These tests verify that: +1. Location.nma_date_created is populated from CSV DateCreated (read-only post-migration) +2. Location.nma_site_date is populated from CSV SiteDate if not null (read-only post-migration) +""" +import datetime +from unittest.mock import Mock, patch, MagicMock +import pandas as pd +import pytest + +from transfers.util import make_location + + +# ============================================================================ +# FIXTURES +# ============================================================================ + + +@pytest.fixture +def mock_lexicon_mapper(): + """Fixture to mock lexicon_mapper for all transfer tests""" + with patch("transfers.util.lexicon_mapper") as mock: + mock.map_value.return_value = "GPS" + yield mock + + +# ============================================================================ +# LOCATION AMPAPI DATE TESTS (Read-Only Post-Migration) +# ============================================================================ + + +def test_make_location_with_both_ampapi_dates(mock_lexicon_mapper): + """Test that make_location populates both nma_date_created and nma_site_date""" + + # Create a mock CSV row with both DateCreated and SiteDate + row = pd.Series( + { + "PointID": "TEST-001", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 1, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + + # Call make_location + location, elevation_method = make_location(row, elevations) + + # Verify 
nma_date_created is set from DateCreated + assert location.nma_date_created is not None + assert location.nma_date_created == datetime.date(2014, 4, 3) + + # Verify nma_site_date is set from SiteDate + assert location.nma_site_date is not None + assert location.nma_site_date == datetime.date(2002, 12, 10) + + # Verify created_at is NOT set during migration (it's auto-set by AutoBaseMixin on save) + assert location.created_at is None + + +def test_make_location_with_only_date_created(mock_lexicon_mapper): + """Test that make_location handles locations with only DateCreated (no SiteDate)""" + row = pd.Series( + { + "PointID": "TEST-002", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 2, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify nma_date_created is set + assert location.nma_date_created == datetime.date(2014, 4, 3) + + # Verify nma_site_date is null (91% of locations don't have SiteDate) + assert location.nma_site_date is None + + +def test_make_location_with_site_date_later_than_date_created(mock_lexicon_mapper): + """Test data anomaly: SiteDate is later than DateCreated (should still be accepted)""" + row = pd.Series( + { + "PointID": "TEST-003", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2010-01-15 00:00:00.000", + "SiteDate": "2015-06-20 00:00:00.000", # Later than DateCreated (anomaly) + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 3, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both dates should be preserved as-is, regardless 
of order + assert location.nma_date_created == datetime.date(2010, 1, 15) + assert location.nma_site_date == datetime.date(2015, 6, 20) + + +def test_make_location_with_very_old_site_date(mock_lexicon_mapper): + """Test that very old SiteDates (1950s) are preserved correctly""" + row = pd.Series( + { + "PointID": "SM-0227", # Real example from dataset + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2008-05-28 00:00:00.000", + "SiteDate": "1954-05-01 00:00:00.000", # 54 years earlier! + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 4, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify very old date is preserved + assert location.nma_site_date == datetime.date(1954, 5, 1) + assert location.nma_date_created == datetime.date(2008, 5, 28) + + # Verify 54-year time gap + time_gap = (location.nma_date_created - location.nma_site_date).days + assert time_gap == 19751 # Approximately 54 years + + +def test_make_location_ampapi_dates_are_date_not_datetime(mock_lexicon_mapper): + """Test that AMPAPI date fields are Date type (not DateTime)""" + row = pd.Series( + { + "PointID": "TEST-004", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 10:30:45.123", # Has time component + "SiteDate": "2002-12-10 14:22:33.456", # Has time component + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 5, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Verify they are date objects (not datetime) + assert isinstance(location.nma_date_created, datetime.date) + assert not isinstance(location.nma_date_created, datetime.datetime) + + assert isinstance(location.nma_site_date, 
datetime.date) + assert not isinstance(location.nma_site_date, datetime.datetime) + + # Verify time component is stripped + assert location.nma_date_created == datetime.date(2014, 4, 3) + assert location.nma_site_date == datetime.date(2002, 12, 10) + + +def test_make_location_ampapi_dates_independent_of_created_at(mock_lexicon_mapper): + """Test that AMPAPI dates don't affect created_at timestamp""" + row = pd.Series( + { + "PointID": "TEST-005", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000", + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 6, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # created_at should be None during transfer (auto-set by AutoBaseMixin on save) + assert location.created_at is None + + # legacy fields should be Date (no timezone) + assert isinstance(location.nma_date_created, datetime.date) + assert isinstance(location.nma_site_date, datetime.date) + + # Legacy fields should be populated + assert location.nma_date_created is not None + assert location.nma_site_date is not None + + +# ============================================================================ +# DATA COVERAGE TESTS (Simulating Migration Statistics) +# ============================================================================ + + +def test_make_location_with_no_ampapi_dates(mock_lexicon_mapper): + """Test that make_location handles locations with no AMPAPI dates (both null)""" + row = pd.Series( + { + "PointID": "TEST-NODATES", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": None, # No DateCreated + "SiteDate": None, # No SiteDate + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 999, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": 
None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null + assert location.nma_date_created is None + assert location.nma_site_date is None + + +def test_make_location_with_empty_string_dates(mock_lexicon_mapper): + """Test that make_location handles empty string dates (CSV edge case)""" + row = pd.Series( + { + "PointID": "TEST-EMPTY", + "Easting": 350000, + "Northing": 3880000, + "DateCreated": "", # Empty string + "SiteDate": "", # Empty string + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": 998, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + elevations = {} + location, elevation_method = make_location(row, elevations) + + # Both AMPAPI date fields should be null (empty strings are falsy) + assert location.nma_date_created is None + assert location.nma_site_date is None + + +def test_location_ampapi_date_coverage_statistics(mock_lexicon_mapper): + """Test that migration preserves expected percentages of AMPAPI dates""" + + def create_test_row(i, has_site_date): + """Helper to create test row with common fields""" + return pd.Series( + { + "PointID": f"TEST-{i:03d}", + "Easting": 350000 + i, + "Northing": 3880000 + i, + "DateCreated": "2014-04-03 00:00:00.000", + "SiteDate": "2002-12-10 00:00:00.000" if has_site_date else None, + "Altitude": 1558.8, + "AltDatum": "NAVD88", + "AltitudeMethod": "GPS", + "LocationId": i, + "PublicRelease": True, + "CoordinateNotes": None, + "LocationNotes": None, + "AltitudeAccuracy": None, + } + ) + + # Simulate 100 location records from CSV (9% with SiteDate, 91% without) + locations_created = 0 + locations_with_site_date = 0 + elevations = {} + + for i in range(100): + row = create_test_row(i, has_site_date=(i < 9)) + location, _ = make_location(row, elevations) + + # Count coverage + if 
location.nma_date_created is not None:
+            locations_created += 1
+        if location.nma_site_date is not None:
+            locations_with_site_date += 1
+
+    # Verify expected coverage
+    assert locations_created == 100  # 100% should have nma_date_created
+    assert locations_with_site_date == 9  # 9% should have nma_site_date
+
+
+# ============================================================================
+# EOF
+# ============================================================================
diff --git a/transfers/aquifer_system_transfer.py b/transfers/aquifer_system_transfer.py
new file mode 100644
index 000000000..a0ba1f02e
--- /dev/null
+++ b/transfers/aquifer_system_transfer.py
@@ -0,0 +1,143 @@
+import time
+from sqlalchemy.orm import Session
+from pydantic import ValidationError
+
+from db import AquiferSystem
+from schemas.aquifer_system import CreateAquiferSystem
+from transfers.util import read_csv, replace_nans, logger
+
+
+def transfer_aquifer_systems(session: Session, limit: int = None) -> tuple:
+    """
+    Transfer aquifer system data from LU_AquiferClass CSV to the database.
+
+    This creates the master list of named aquifer systems (e.g., Ogallala Aquifer). The primary_aquifer_type field is set
+    to "Unknown" as a placeholder and will be updated during well transfer when we know what type each well encounters.
+
+    This should be run BEFORE well_transfer.py so that aquifer records exist for wells to reference.
+
+    Args:
+        session (Session): SQLAlchemy database session
+        limit (int, optional): Limit the number of records to transfer (for testing).
+
+    Returns:
+        tuple: (input_df, cleaned_df, errors)
+    """
+    # 1. Read the CSV file
+    input_df = read_csv("LU_AquiferClass")
+
+    # 2. Replace NaNs with None
+    cleaned_df = replace_nans(input_df)
+
+    # 3. Initialize tracking variables for logging
+    n = len(input_df)
+    step = 25
+    start_time = time.time()
+    errors = []
+    created_count = 0
+    skipped_count = 0
+
+    logger.info(f"Starting transfer of {n} aquifer systems from LU_AquiferClass.")
+
+    # 4. Process each row
+    for i, row in enumerate(cleaned_df.itertuples()):
+        # check if limit is reached
+        if limit and i >= limit:
+            logger.info(f"Reached limit of {limit} rows. Stopping migration.")
+            break
+
+        # Log progress every 'step' rows
+        if i and not i % step:
+            logger.info(
+                f"Processing row {i} of {n}. Avg rows per second: {step / (time.time() - start_time):.2f}"
+            )
+            start_time = time.time()
+
+            # Commit progress periodically
+            try:
+                session.commit()
+            except Exception as e:
+                logger.critical(f"Error committing aquifer system {i}: {e}")
+                session.rollback()
+                # Record the failure and fall through: a `continue` here would
+                # silently drop the current row without any entry in `errors`.
+                errors.append({"row": i, "error": f"Periodic commit failed: {e}"})
+
+        # 5. Extract aquifer code and name
+        aquifer_code = row.CODE
+        aquifer_name = row.MEANING
+
+        if not aquifer_name:
+            error_msg = f"Row {i} (code: {aquifer_code}) has no aquifer name (MEANING)."
+            logger.critical(error_msg)
+            errors.append({"row": i, "code": aquifer_code, "error": error_msg})
+            skipped_count += 1
+            continue
+
+        # 6. Check if aquifer system already exists
+        existing = (
+            session.query(AquiferSystem)
+            .filter(AquiferSystem.name == aquifer_name)
+            .first()
+        )
+
+        if existing:
+            logger.info(
+                f"Aquifer '{aquifer_name}' (code: {aquifer_code}) already exists. Skipping."
+            )
+            skipped_count += 1
+            continue
+
+        # 7. Prepare data dictionary
+        try:
+            data = CreateAquiferSystem(
+                name=aquifer_name,
+                description=None,  # can be updated later
+                primary_aquifer_type="Unknown",  # placeholder - will be updated during well transfer
+            )
+
+            # Validate data using Pydantic schema
+            CreateAquiferSystem.model_validate(data)
+
+        except ValidationError as e:
+            errors.append(
+                {"code": aquifer_code, "name": aquifer_name, "error": e.errors()}
+            )
+            logger.critical(
+                f"Error creating aquifer system '{aquifer_name}' (code: {aquifer_code}) (row {i}): {e.errors()}"
+            )
+            continue
+
+        # 8. Create database record
+        aquifer_system = None
+        try:
+            aquifer_data = data.model_dump()
+            aquifer_system = AquiferSystem(**aquifer_data)
+            session.add(aquifer_system)
+            created_count += 1
+
+            logger.info(
+                f"Created aquifer system: {aquifer_system.name} (code: {aquifer_code})"
+            )
+
+        except Exception as e:
+            if aquifer_system is not None:
+                session.expunge(aquifer_system)
+            errors.append({"code": aquifer_code, "name": aquifer_name, "error": str(e)})
+            logger.critical(
+                f"Error creating aquifer system record '{aquifer_name}': {e}"
+            )
+            continue
+
+    # 9. Final commit
+    try:
+        session.commit()
+        logger.info(
+            f"Successfully transferred {created_count} aquifer systems, skipped {skipped_count}. "
+            f"Note: primary_type set to 'Unknown' and will be updated during well transfer."
+        )
+    except Exception as e:
+        logger.critical(f"Error in final commit: {e}")
+        session.rollback()
+
+    return input_df, cleaned_df, errors
diff --git a/transfers/asset_transfer.py b/transfers/asset_transfer.py
index 71d3ad23b..b7938f15d 100644
--- a/transfers/asset_transfer.py
+++ b/transfers/asset_transfer.py
@@ -13,53 +13,49 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ===============================================================================
-# for testing only.
remove later -from dotenv import load_dotenv -from db.engine import session_ctx - -load_dotenv() -# ----------------------------------------------- - import io from starlette.datastructures import UploadFile -from sqlalchemy.orm import Session -from db import Asset, AssetThingAssociation, Thing -from services.audit_helper import audit_add + +from db import Asset, AssetThingAssociation from services.gcs_helper import ( gcs_upload, - check_asset_exists, get_storage_bucket, get_storage_client, ) -from transfers.util import get_valid_things, read_csv from transfers.logger import logger - - -def transfer_assets(session: Session) -> None: - client = get_storage_client() - - bucket = get_storage_bucket(client) - logger.info(f"Using bucket {bucket.name}") - - well_photos = read_csv("WellPhotos") - # for name in ['AR0001']: # for testing - valid_things = get_valid_things(session) - n = len(valid_things) - for j, thing in enumerate(valid_things): - photos = well_photos[well_photos["PointID"] == thing.name] +from transfers.util import read_csv, filter_to_valid_point_ids +from transfers.well_transfer import WellChunkTransferer + + +class AssetTransferer(WellChunkTransferer): + def __init__(self, *args, **kw): + self.source_table = "WellPhotos" + super().__init__(*args, **kw) + self._client = get_storage_client() + self._bucket = get_storage_bucket(self._client) + logger.info(f"Using bucket {self._bucket.name}") + + def _get_dfs(self): + input_df = read_csv(self.source_table) + cleaned_df = filter_to_valid_point_ids(input_df) + return input_df, cleaned_df + + def _chunk_step(self, session, df, i, row, db_item): + photos = df[df["PointID"] == db_item.name] + n = len(df) if photos.empty: - photos = well_photos[well_photos["PointID"] == thing.name.replace("-", "")] + photos = df[df["PointID"] == db_item.name.replace("-", "")] if photos.empty: - logger.info(f"No photos found for PointID: {thing.name}") - continue + logger.info(f"No photos found for PointID: {db_item.name}") + 
return - for i, row in enumerate(photos.itertuples()): + for j, row in enumerate(photos.itertuples()): photo_path = row.OLEPath - srcblob = bucket.get_blob(f"nma-photos/{photo_path}") + srcblob = self._bucket.get_blob(f"nma-photos/{photo_path}") if not srcblob: logger.critical( - f"No photo found for PointID: {thing.name}, {photo_path}" + f"No photo found for PointID: {db_item.name}, {photo_path}" ) continue @@ -67,56 +63,25 @@ def transfer_assets(session: Session) -> None: f = srcblob.download_as_bytes() ff = UploadFile(file=io.BytesIO(f), filename=filename, size=len(f)) - uri, blob_name = gcs_upload(ff, bucket) - add_asset(session, ff, filename, thing.id, uri, blob_name) + uri, blob_name = gcs_upload(ff, self._bucket) + asset = Asset( + name=filename, + label=filename, + storage_path=blob_name, + storage_service="gcs", + mime_type="image/png", + size=ff.size, + uri=uri, + ) + assoc = AssetThingAssociation() + assoc.thing = db_item + assoc.asset = asset + session.add(assoc) + session.add(asset) + session.commit() logger.info( - f"Added asset {j}-{i}/{n} thing.id={thing.id} thing={thing.name} uri: {uri}" + f"Added asset {i}-{j}/{n} thing.id={db_item.id} thing={db_item.name} uri: {uri}" ) -def transfer_assets_testing(session: Session) -> None: - for p in ("asset1.png", "asset2.png", "asset3.png"): - with open(f"./transfers/data/assets/{p}", "rb") as f: - uf = UploadFile(file=f, filename=p, size=10) - uri, blob_name = gcs_upload(uf) - thing_id = 151 - - if check_asset_exists(session, blob_name, thing_id): - logger.warning(f"Asset {blob_name} already exists. 
Skipping.") - continue - add_asset(session, uf, p, thing_id, uri, blob_name) - - -def add_asset( - session: Session, - uf: UploadFile, - label: str, - thing_id: int, - uri: str, - blob_name: str, -) -> None: - asset = Asset( - name=label, - label=label, - storage_path=blob_name, - storage_service="gcs", - mime_type="image/png", - size=uf.size, - uri=uri, - ) - assoc = AssetThingAssociation() - audit_add({"sub": "foobar", "name": "Mr. Foobar"}, assoc) - thing = session.get(Thing, thing_id) - assoc.thing = thing - assoc.asset = asset - session.add(assoc) - session.add(asset) - session.commit() - - -if __name__ == "__main__": - - with session_ctx() as session: - transfer_assets(session) - # ============= EOF ============================================= diff --git a/transfers/contact_transfer.py b/transfers/contact_transfer.py index c9b1c9fb0..1c690e0ce 100644 --- a/transfers/contact_transfer.py +++ b/transfers/contact_transfer.py @@ -15,143 +15,94 @@ # =============================================================================== import json +import pandas as pd +from pandas import DataFrame from pydantic import ValidationError +from sqlalchemy.orm import Session +from core.enums import Organization from db import ( - Thing, Contact, ThingContactAssociation, Email, Phone, Address, IncompleteNMAPhone, + Base, ) from transfers.logger import logger +from transfers.transferer import ThingBasedTransferer from transfers.util import ( get_transfers_data_path, - chunk_by_size, ) from transfers.util import read_csv, filter_to_valid_point_ids, replace_nans -def extract_owner_role(comment): - # if comment is None: - # return "Owner" - # if "Owner" in comment: - # return "Owner" - # if "Manager" in comment: - # return "Manager" - # if "Director" in comment: - # return "Director" +class ContactTransfer(ThingBasedTransferer): + source_table = "OwnersData" - return "Owner" + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + co_to_org_mapper_path = 
get_transfers_data_path( + "owners_organization_mapper.json" + ) + with open(co_to_org_mapper_path, "r") as f: + self._co_to_org_mapper = json.load(f) + organization_mapper_path = get_transfers_data_path("organization_mapping.json") + with open(organization_mapper_path, "r") as f: + self._organization_mapper = json.load(f) -""" -Developer's notes + self._added = [] -Use Pydantic to perform model validations since all restrictions will -be built into the models -""" + def _get_dfs(self): + input_df = read_csv(self.source_table) + odf = input_df.drop(["OBJECTID", "GlobalID"], axis=1) + ldf = read_csv("OwnerLink") + ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1) + locdf = read_csv("Location") + ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") + odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") -def transfer_contacts(session): + odf = replace_nans(odf) - co_to_org_mapper_path = get_transfers_data_path("owners_organization_mapper.json") - with open(co_to_org_mapper_path, "r") as f: - co_to_org_mapper = json.load(f) + odf = filter_to_valid_point_ids(odf) + return input_df, odf - source_table = "OwnersData" - input_df = read_csv(source_table) - odf = input_df.drop(["OBJECTID", "GlobalID"], axis=1) - ldf = read_csv("OwnerLink") - ldf = ldf.drop(["OBJECTID", "GlobalID"], axis=1) - locdf = read_csv("Location") - ldf = ldf.join(locdf.set_index("LocationId"), on="LocationId") - - odf = odf.join(ldf.set_index("OwnerKey"), on="OwnerKey") - - odf = replace_nans(odf) - - odf = filter_to_valid_point_ids(session, odf) - cleaned_df = odf - errors = [] - added = [] - odf = odf.sort_values(by=["PointID"]) - - for chunk in chunk_by_size(odf, 100): - pointids = chunk.PointID.tolist() - logger.info(f"Processing chunk {pointids[0]} to {pointids[-1]}") - things = session.query(Thing).filter(Thing.name.in_(pointids)).all() - for i, row in chunk.iterrows(): - thing = next((thing for thing in things if thing.name == row.PointID), None) - logger.info(f"Processing PointID: 
{i} {row.PointID}") - if thing is None: - logger.critical( - f"Thing with PointID {row.PointID} not found. Skipping owner." - ) - continue - - # TODO: use contact_helper.add_contact - try: - if _add_first_contact(session, row, thing, co_to_org_mapper, added): - session.commit() - # session.flush() - logger.info(f"added first contact for PointID {row.PointID}") - except ValidationError as e: - logger.critical( - f"Skipping first contact for PointID {row.PointID} due to validation error: {e.errors()}" - ) - # session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) - except Exception as e: - logger.critical( - f"Skipping first contact for PointID {row.PointID} due to error: {e}" - ) - session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) + def _get_prepped_group(self, group) -> DataFrame: + return group.sort_values(by=["PointID"]) + def _group_step(self, session: Session, row: pd.Series, db_item: Base): + for adder, tag in (_add_first_contact, "first"), ( + _add_second_contact, + "second", + ): try: - if ( - row.SecondFirstName is None - and row.SecondLastName is None - and row.SecondCtctEmail is None - and row.SecondCtctPhone is None + if adder( + session, + row, + db_item, + self._co_to_org_mapper, + self._organization_mapper, + self._added, ): - logger.warning( - f"No second contact info for PointID {row.PointID}, skipping." 
- ) - continue - if _add_second_contact(session, row, thing, co_to_org_mapper, added): session.commit() - # session.flush() - logger.info(f"added second contact for PointID {row.PointID}") - + logger.info(f"added {tag} contact for PointID {row.PointID}") except ValidationError as e: logger.critical( - f"Skipping second contact for PointID {row.PointID} due to validation error: {e.errors()}" - ) - # session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} + f"Skipping {tag} contact for PointID {row.PointID} due to validation error: {e.errors()}" ) + self._capture_error(row.PointID, str(e), "ValidationError") except Exception as e: logger.critical( - f"Skipping second contact for PointID {row.PointID} due to error: {e}" + f"Skipping {tag} contact for PointID {row.PointID} due to error: {e}" ) session.rollback() - errors.append( - {"pointid": row.PointID, "error": e, "table": source_table} - ) - - return input_df, cleaned_df, errors + self._capture_error(row.PointID, str(e), "UnknownError") -def _add_first_contact(session, row, thing, co_to_org_mapper, added): +def _add_first_contact(session, row, thing, co_to_org_mapper, org_mapper, added): # TODO: extract role from OwnerComment # role = extract_owner_role(row.OwnerComment) role = "Owner" @@ -159,10 +110,10 @@ def _add_first_contact(session, row, thing, co_to_org_mapper, added): name = _make_name(row.FirstName, row.LastName) - organization = co_to_org_mapper.get(row.Company, row.Company) - + # check if organization is in lexicon + organization = _get_organization(row, co_to_org_mapper, org_mapper) if (name, organization) in added: - return + return None added.append((name, organization)) contact_data = { @@ -251,14 +202,38 @@ def _add_first_contact(session, row, thing, co_to_org_mapper, added): return True -def _add_second_contact(session, row, thing, co_to_org_mapper, added): +def _get_organization(row, co_to_org_mapper, org_mapper): + organization = 
co_to_org_mapper.get(row.Company, row.Company) + + try: + Organization(organization) + except ValueError: + norganization = next( + (k for k, v in org_mapper.items() if v == organization), None + ) + logger.warning(f"mapping {organization} to {norganization}") + organization = norganization + + return organization + + +def _add_second_contact(session, row, thing, co_to_org_mapper, org_mapper, added): + if all( + [ + getattr(row, f"Second{f}") is None + for f in ["FirstName", "LastName", "CtctEmail", "CtctPhone"] + ] + ): + logger.warning(f"No second contact info for PointID {row.PointID}, skipping.") + return release_status = "private" name = _make_name(row.SecondFirstName, row.SecondLastName) - organization = co_to_org_mapper.get(row.Company, row.Company) + organization = _get_organization(row, co_to_org_mapper, org_mapper) if (name, organization) in added: return + added.append((name, organization)) contact_data = { @@ -364,7 +339,6 @@ def _make_address(first_second, ownerkey, kind, **kw): ) -# def _make_contact_and_assoc(session, data, thing): from schemas.contact import CreateContact diff --git a/transfers/data/organization_mapping.json b/transfers/data/organization_mapping.json new file mode 100644 index 000000000..0d3bda9dc --- /dev/null +++ b/transfers/data/organization_mapping.json @@ -0,0 +1,79 @@ +{ + "City of Aztec": "City of Aztec", + "Daybreak Investments": "Daybreak Investments", + "Vallecitos HOA": "Vallecitos HOA", + "Naiche Development": "Naiche Corporation", + "Santa Fe County; Santa Fe Animal Shelter": "Santa Fe County; Santa Fe Animal Shelter", + "El Guicu Ditch Association": "El Guicu Ditch Association", + "Santa Fe Municipal Airport": "Santa Fe Municipal Airport", + "Uluru Development": "Uluru Development", + "AllSup's Convenience Stores": "AllSup's Convenience Stores", + "Santa Fe Downs": "Santa Fe Downs Resort", + "City of Truth or Consequences, WWTP": "City of Truth or Consequences, WWTP", + "Riverbend Hotsprings": "Riverbend Hotsprings", + 
"Armendaris Ranch": "Armendaris Ranch", + "El Paso Water": "El Paso Water", + "PVACD": "Pecos Valley Artesian Conservancy District", + "BLM, Socorro Field Office": "BLM, Socorro Field Office", + "USFWS": "US Fish & Wildlife Service", + "NPS": "National Park Service", + "Sile MDWCA": "Sile Municipal Domestic Water Assn.", + "Pena Blanca Water & Sanitation District": "Pena Blanca Water & Sanitation District", + "Town of Questa": "Town of Questa", + "Lamy MDWCA": "Lama MDWCA", + "Town of Cerro": "Town of Cerro", + "Farr Cattle Company": "Farr Cattle Company (Farr Ranch)", + "Carrizozo Orchard": "Carrizozo Orchard", + "USFS, Kiowa Grasslands": "USFS, Kiowa Grasslands", + "Cloud Country West Subdivision": "Cloud Country West Subdivision", + "Chama West Water Users Association": "Chama West Water Users Assn.", + "El Rito Regional Water and Waste Water Association": "El Rito Regional Water + Waste Water Association", + "West Rim MDWUA": "West Rim MDWUA", + "Village of Willard": "Village of Willard", + "Quemado Municipal Water & SWA": "Quemado Mutual Water and Sewage Works Association", + "Coyote Creek MDWUA": "Coyote Creek MDWUA", + "Lamy Mutual Domestic Water Assn.": "Lamy Mutual Domestic Water Assn.", + "La Joya CWDA": "La Joya CWDA", + "NM Firefighters Training Academy": "NM Firefighters Training Academy", + "Cebolleta Land Grant": "Cebolleta Land Grant", + "Madrid Water Co-op": "Madrid Water Co-op", + "Sun Valley Water and Sanitation": "Sun Valley Water and Sanitation", + "Bluewater Lake MDWCA": "Bluewater Lake MDWCA", + "Bluewater Acres Domestic WUA": "Bluewater Acres Domestic Water Users Assn.", + "Lybrook MDWCA": "Lybrook Municipal", + "New Mexico Museum of Natural History": "New Mexico Museum of Natural History", + "Hillsboro MDWCA": "Hillsboro Mutual Domestic Water Consumer Assn.", + "Tyrone MDWCA": "Tyrone Mutual Domestic Water Assn.", + "Santa Clara Water System": "Santa Clara Water System", + "Casas Adobes MDWCA": "Casas Adobes Mutual Domestic", + "Lake 
Roberts WUA": "Lake Roberts Water Assn.", + "El Creston MDWCA": "El Creston MDWCA", + "Reserve Municipality Water Works": "Reserve Municipality Water Works", + "Bayard": "Bayard Municipal Water", + "Town of Estancia": "Town of Estancia", + "Pie Town MDWCA": "Pie Town MDWCA", + "Roosevelt SWCD": "Roosevelt Soil & Water Conservation District", + "Otis MDWCA": "Otis Mutual Domestic", + "White Cliffs MDWUA": "White Cliffs MDWUA", + "Vista Linda Water Co-op": "Vista Linda Water Co-op", + "Anasazi Trails Water Co-op": "Anasazi Trails Water Cooperative", + "Canon MDWCA": "Canon Mutual Domestic Water Consumer Assn.", + "Placitas Trails Water Co-op": "Placitas Trails Water Coop", + "BLM, Roswell Office": "BLM, Roswell Office", + "Forked Lightning Ranch": "Forked Lightning Ranch", + "Cottonwood RWA": "Cottonwood Rural Water Assn.", + "Pinon Ridge WUA": "Pinon Ridge Water Users Association", + "McSherry Farms": "McSherry Farms", + "Agua Sana WUA": "Agua Sana Water Users Assn.", + "Chamita MDWCA": "Chamita Water Users Association", + "W Spear-bar Ranch": "W Spear-bar Ranch", + "Village of Capitan": "Village of Capitan", + "Brazos MDWCA": "Brazos Mutual Domestic Water Consumers Assn.", + "Alto Alps HOA": "Alto Alps Homeowners Association", + "Chiricahua Desert Museum": "Chiricahua Desert Museum", + "Bike Ranch": "Bike Ranch", + "Hachita MDWCA": "Hachita MDWCA", + "Carrizozo Municipal Water": "Carrizozo Municipal Water", + "Dunhill Ranch": "Dunhill Ranch", + "Santa Fe Conservation Trust": "Santa Fe Conservation Trust" +} diff --git a/transfers/data/owners_organization_mapper.json b/transfers/data/owners_organization_mapper.json index 5ce45a8bf..b4f29bd7b 100644 --- a/transfers/data/owners_organization_mapper.json +++ b/transfers/data/owners_organization_mapper.json @@ -89,6 +89,7 @@ "Pecos Trail Inn": "Pecos Trail Inn", "Pelican Spa": "Pelican Spa", "Pistachio Tree Ranch": "Pistachio Tree Ranch", + "Quemado Mutual Water and Sewage Works Association": "Quemado Municipal Water & 
SWA", "Rancho Encantado": "Rancho Encantado", "Rancho San Lucas": "Rancho San Lucas", "Rancho San Marcos": "Rancho San Marcos", diff --git a/transfers/geologic_formation_transfer.py b/transfers/geologic_formation_transfer.py new file mode 100644 index 000000000..7fcd73e4c --- /dev/null +++ b/transfers/geologic_formation_transfer.py @@ -0,0 +1,141 @@ +import time +from sqlalchemy.orm import Session +from pydantic import ValidationError + +from db import GeologicFormation +from schemas.geologic_formation import CreateGeologicFormation +from transfers.util import read_csv, replace_nans, logger + + +def transfer_geologic_formations(session: Session, limit: int = None) -> tuple: + """ + Transfer geologic formation data from LU_GeologicFormation CSV to the database. + + This should be run BEFORE well_transfer.py so that geologic formation records exist for wells to reference. + + Args: + session (Session): SQLAlchemy database session + limit (int, optional): Optional limit on number of records to transfer (for testing). + + Returns: + tuple: (input_df, cleaned_df, errors) + """ + # 1. Read the CSV file + input_df = read_csv("LU_Formations") + + # 2. Replace NaNs with None + cleaned_df = replace_nans(input_df) + + # 3. Initialize tracking variables for logging + n = len(cleaned_df) + step = 25 + start_time = time.time() + errors = [] + created_count = 0 + skipped_count = 0 + + logger.info(f"Starting transfer of {n} geologic formations") + + # 4. Process each row + for i, row in enumerate(cleaned_df.itertuples()): + # check if limit is reached + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + # Log progress every 'step' rows + if i and not i % step: + logger.info( + f"Processing row {i} of {n}. 
Avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + + # Commit progress periodically + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing geologic formations: {e}") + session.rollback() + continue + + # 5. Extract formation code and description + formation_code = row.Code + + if not formation_code: + logger.warning(f"Skipping row {i}: Missing formation code") + skipped_count += 1 + continue + + # Check if this formation already exists + existing = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if existing: + logger.info( + f"Skipping row {i}: Formation code {formation_code} already exists" + ) + skipped_count += 1 + continue + + # 6. Prepare data for creation + # Note: We only store the formation_code. Formation names will be mapped by the API using a + # formations.json file from authoritative sources (e.g., USGS). + # The description field is left as None and can be populated later if needed. + # Note: lithology is set to None here and will be updated during stratigraphy transfer + try: + data = CreateGeologicFormation( + formation_code=formation_code, + description=None, # Not storing from legacy data + lithology=None, # Will be populated from Stratigraphy.csv + ) + + # Validate the data using Pydantic schema + CreateGeologicFormation.model_validate(data) + + except ValidationError as e: + errors.append({"code": formation_code, "errors": e.errors()}) + logger.critical( + f"Validation error for row {i} with Code {formation_code}: {e.errors()}" + ) + continue + except Exception as e: + errors.append({"code": formation_code, "errors": str(e)}) + logger.critical(f"Error preparing data for {formation_code}: {e}") + continue + + # 7. 
Create database object + geologic_formation = None + try: + formation_data = data.model_dump() + geologic_formation = GeologicFormation(**formation_data) + session.add(geologic_formation) + created_count += 1 + + logger.info( + f"Created geologic formation: {geologic_formation.formation_code}" + ) + + except Exception as e: + if geologic_formation is not None: + session.expunge(geologic_formation) + errors.append({"code": formation_code, "error": str(e)}) + logger.critical( + f"Error creating geologic formation for {formation_code}: {e}" + ) + continue + + # 8. Final commit + try: + session.commit() + logger.info( + f"Successfully transferred {created_count} geologic formations, skipped {skipped_count}. " + f"Note: lithology is None and will be updated during stratigraphy transfer." + ) + except Exception as e: + logger.critical(f"Error during final commit of geologic formations: {e}") + session.rollback() + + return input_df, cleaned_df, errors diff --git a/transfers/group_transfer.py b/transfers/group_transfer.py index 0bad85cb7..5549a81d1 100644 --- a/transfers/group_transfer.py +++ b/transfers/group_transfer.py @@ -13,21 +13,28 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# =============================================================================== +import pandas as pd from sqlalchemy import select from sqlalchemy.orm import Session from db import Thing, Group, GroupThingAssociation -from db.engine import session_ctx -from transfers.util import read_csv -from transfers.logger import logger from services.util import retrieve_latest_polymorphic_history_table_record +from transfers.logger import logger +from transfers.transferer import Transferer +from transfers.util import read_csv + + +class ProjectGroupTransferer(Transferer): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + self.source_table = "Projects" + self.source_dtypes = {"Project": str, "PointIDPrefix": str} + def _get_dfs(self): + df = read_csv(self.source_table, self.source_dtypes) + return df, df -def transfer_groups( - session: Session, -) -> None: - wdf = read_csv("Projects") - for i, row in enumerate(wdf.itertuples()): + def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): sql = select(Group).where(Group.name == row.Project) group = session.scalars(sql).one_or_none() @@ -79,7 +86,63 @@ def transfer_groups( session.commit() -if __name__ == "__main__": - with session_ctx() as session: - transfer_groups(session) +# def transfer_groups( +# session: Session, +# ) -> None: +# wdf = read_csv("Projects") +# for i, row in enumerate(wdf.itertuples()): +# +# sql = select(Group).where(Group.name == row.Project) +# group = session.scalars(sql).one_or_none() +# if not group: +# # add a group for each project +# group = Group(name=row.Project) +# +# for prefix in row.PointIDPrefix.split(","): +# prefix = prefix.strip() +# if prefix: +# # get all PointIDs that start with prefix +# sql = select(Thing).where(Thing.name.like(f"{prefix}%")) +# records = session.scalars(sql).unique().all() +# if records: +# logger.info( +# f"Adding {len(records)} things to group {group.name}, prefix {prefix}" +# ) +# group_is_monitoring_plan = False +# for 
record in records: +# # set the group_type to Monitoring Plan if at least one well is currently monitored +# if not group_is_monitoring_plan: +# if record.status_history: +# monitoring_status = [ +# sh +# for sh in record.status_history +# if sh.status_type == "Monitoring Status" +# ] +# if monitoring_status: +# monitoring_status = retrieve_latest_polymorphic_history_table_record( +# record, +# "status_history", +# "Monitoring Status", +# ) +# if ( +# monitoring_status.status_value +# == "Currently monitored" +# ): +# group_is_monitoring_plan = True +# group.group_type = "Monitoring Plan" +# logger.info( +# f" Setting group {group.name} type to Monitoring Plan based on thing {record.name}" +# ) +# +# gta = GroupThingAssociation(group=group, thing=record) +# session.add(gta) +# group.thing_associations.append(gta) +# +# session.add(group) +# session.commit() +# +# +# if __name__ == "__main__": +# with session_ctx() as session: +# transfer_groups(session) # ============= EOF ============================================= diff --git a/transfers/link_ids_transfer.py b/transfers/link_ids_transfer.py index f11f8bb97..c32fd0b8d 100644 --- a/transfers/link_ids_transfer.py +++ b/transfers/link_ids_transfer.py @@ -24,184 +24,161 @@ extract_organization, read_csv, replace_nans, - chunk_by_size, ) +from transfers.well_transfer import WellChunkTransferer + + +class LinkIdsWellDataTransferer(WellChunkTransferer): + source_table = "WellData" + source_dtypes = {"OSEWellID": str, "OSEWelltagID": str} + + def _chunk_step(self, session, dr, i, row, db_item): + if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): + return + + for aid, klass, regex in ( + (row.OSEWellID, "OSEPOD", r"^[A-Z]{1,3}-\d{3,6}"), + ( + row.OSEWelltagID, + "OSEWellTagID", + r"", + ), # TODO: need to figure out regex for this field + ): + if pd.isna(aid): + # logger.warning(f"{klass} is null for {row.PointID}") + continue - -def transfer_link_ids_welldata(session): - ldf = read_csv("WellData", 
dtype={"OSEWelltagID": str}) - - ldf = filter_to_valid_point_ids(session, ldf) - - for chunk in chunk_by_size(ldf, 100): - things = ( - session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() - ) - for row in chunk.itertuples(): - # RULE: exclude rows where both ids are null - if pd.isna(row.OSEWellID) and pd.isna(row.OSEWelltagID): - # logger.warning( - # f"Both OSEWellID and OSEWelltagID are null for {row.PointID}" - # ) + # RULE: exclude any id that == 'X', '?' + if aid.strip().lower() in ("x", "?", "exempt"): + logger.critical( + f'{klass} is "X", "?", or "exempt", id={aid} for {row.PointID}' + ) continue - thing = next((l for l in things if l.name == row.PointID), None) - if thing is None: - logger.warning( - f"Thing not found forPointID {row.PointID}. Skipping link ids." + if regex and not re.match(regex, aid): + logger.critical( + f"{klass} id does not match regex {regex}, id={aid} for {row.PointID}" ) continue - for aid, klass, regex in ( - (row.OSEWellID, "OSEPOD", r"^[A-Z]{1,3}-\d{3,6}"), - ( - row.OSEWelltagID, - "OSEWellTagID", - r"", - ), # TODO: need to figure out regex for this field - ): - if pd.isna(aid): - # logger.warning(f"{klass} is null for {row.PointID}") - continue - - # RULE: exclude any id that == 'X', '?' - if aid.strip().lower() in ("x", "?", "exempt"): - logger.critical( - f'{klass} is "X", "?", or "exempt", id={aid} for {row.PointID}' - ) - continue - - if regex and not re.match(regex, aid): - logger.critical( - f"{klass} id does not match regex {regex}, id={aid} for {row.PointID}" - ) - continue - - # TODO: add guards for null values - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = klass - link_id.alternate_id = aid - link_id.alternate_organization = "NMOSE" - - # does link_id need a class e.g. 
- # link_id.alternate_id_class = klass - - session.add(link_id) - session.commit() - - -def add_link_alternate_site_id(session, row, thing): - if not row.AlternateSiteID: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - link_id.alternate_id = row.AlternateSiteID - - link_id.alternate_organization = extract_organization(str(row.AlternateSiteID)) - - # logger.info(f"adding link id: {row.PointID}") - session.add(link_id) - - -def add_link_site_id(session, row, thing): - if not row.SiteID: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - - site_id = row.SiteID.strip() - if not re.match(r"^\d{15}$", site_id): - # TODO: lets make a sweet function for flagging issues - # flag for interrogation - logger.critical( - f"{row.PointID} alternate id {site_id} is not a valid USGS site id" + # TODO: add guards for null values + link_id = ThingIdLink() + link_id.thing = db_item + link_id.relation = klass + link_id.alternate_id = aid + link_id.alternate_organization = "NMOSE" + + # does link_id need a class e.g. 
+ # link_id.alternate_id_class = klass + + session.add(link_id) + + +class LinkIdsLocationDataTransferer(WellChunkTransferer): + source_table = "Location" + site_type = "GW" + + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + + self._plss_regex = re.compile( + r"^T\d{1,3}[NS]\.R\d{1,3}[EW]\.S(?:[1-9]|[12]\d|3[0-6])(?:\.\d{1,5})?$" ) - return - - link_id.alternate_id = row.SiteID - link_id.alternate_organization = "USGS" - session.add(link_id) - - -def add_link_plss(session, row, thing): - - township = row.Township - township_direction = row.TownshipDirection - _range = row.Range - range_direction = row.RangeDirection - section = row.Section - section_direction = row.SectionDirection - - if not township or not _range or not section: - return - - link_id = ThingIdLink() - link_id.thing = thing - link_id.relation = "same_as" - link_id.alternate_organization = "PLSS" - - alternate_id = f"T{township}{township_direction}.R{_range}{range_direction}.S{section}{section_direction}" - if not re.match(r"T\d{1,3}.R\d{1,3}.S\d{1,3}", alternate_id): - # flag for interrogation - logger.warning(f"alternate id {alternate_id} is not a valid PLSS") - return - link_id.alternate_id = alternate_id - link_id.alternate_organization = "PLSS" - session.add(link_id) - - -def transfer_link_ids(session, site_type="GW"): - ldf = read_csv("Location") - ldf = ldf[ldf["SiteType"] == site_type] - ldf = ldf[ldf["Easting"].notna() & ldf["Northing"].notna()] - ldf = replace_nans(ldf) - - ldf = filter_to_valid_point_ids(session, ldf) - for chunk in chunk_by_size(ldf, 100): - locations = ( - session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() + self._usgs_regex = re.compile(r"^\d{15}$") + + def _get_dfs(self): + input_df = read_csv( + self.source_table, + { + "SiteID": str, + "Township": str, + "TownshipDirection": str, + "Range": str, + "RangeDirection": str, + "SectionQuarters": str, + }, ) - for row in chunk.itertuples(): - thing = next((l for l in locations if 
l.name == row.PointID), None) - if thing is None: - logger.warning( - f"Thing with PointID {row.PointID} not found. Skipping link id." - ) - continue - logger.info( - f"Processing PointID: {row.PointID}, Thing ID: {thing.id}, AlternateSiteID={row.AlternateSiteID}, " - f"AlternateSiteID2={row.AlternateSiteID2}" + + ldf = input_df[input_df["SiteType"] == self.site_type] + ldf = ldf[ldf["Easting"].notna() & ldf["Northing"].notna()] + ldf = replace_nans(ldf) + cleaned_df = filter_to_valid_point_ids(ldf) + return input_df, cleaned_df + + def _chunk_step(self, session, df, i, row, db_item): + logger.info( + f"Processing PointID: {row.PointID}, " + f"Thing ID: {db_item.id}, " + f"AlternateSiteID={row.AlternateSiteID}, " + f"AlternateSiteID2={row.AlternateSiteID2}" + ) + for func in ( + self._add_link_alternate_site_id, + self._add_link_site_id, + self._add_link_plss, + ): + link = func(row, db_item) + if link: + session.add(link) + + def _add_link_alternate_site_id(self, row: pd.Series, thing: Thing): + if not row.AlternateSiteID: + return + + return _make_thing_id_link( + thing, row.AlternateSiteID, extract_organization(str(row.AlternateSiteID)) + ) + + def _add_link_site_id(self, row, thing): + if not row.SiteID: + return + + site_id = row.SiteID.strip() + if not self._usgs_regex.match(site_id): + self._capture_error( + row.PointID, f"{site_id} is not a valid USGS site id", "SiteID" + ) + logger.critical( + f"{row.PointID} alternate id {site_id} is not a valid USGS site id" + ) + return + + return _make_thing_id_link(thing, row.SiteID, "USGS") + + def _add_link_plss(self, row, thing): + township = row.Township + township_direction = row.TownshipDirection + _range = row.Range + range_direction = row.RangeDirection + section = row.SectionQuarters + if not township or not _range or not section: + return + + alternate_id = ( + f"T{township}{township_direction}.R{_range}{range_direction}.S{section}" + ) + if not self._plss_regex.match(alternate_id): + self._capture_error( + 
row.PointID, + f"{alternate_id} is not a valid PLSS", + "Township, TownshipDirection, Range, RangeDirection, Section, SectionDirection", ) - add_link_alternate_site_id(session, row, thing) - session.commit() - - # for i, row in enumerate(ldf.itertuples()): - # thing = session.query(Thing).where(Thing.name == row.PointID).first() - # if thing is None: - # logger.warning( - # f"Thing with PointID {row.PointID} not found. Skipping link id." - # ) - # continue - # logger.info( - # f"Processing PointID: {row.PointID}, Thing ID: {thing.id}, AlternateSiteID={row.AlternateSiteID}, " - # f"AlternateSiteID2={row.AlternateSiteID2}" - # ) - # add_link_alternate_site_id(session, row, thing) - # # add_link_site_id(session, row, thing) - # # add_link_plss(session, row, thing) - # - # # not clear what alternate_id2 is for, or what it maps to - # # add_link_alternate_site_id2(session, row, thing) - # if i and not i % 25: - # session.commit() - # session.flush() - # - # session.commit() + + logger.critical(f"alternate id {alternate_id} is not a valid PLSS") + return + + return _make_thing_id_link(thing, alternate_id, "PLSS") + + +def _make_thing_id_link( + thing, alternate_id, alternate_organization, relation="same_as" +): + return ThingIdLink( + thing=thing, + relation=relation, + alternate_id=alternate_id, + alternate_organization=alternate_organization, + ) # ============= EOF ============================================= diff --git a/transfers/metrics.py b/transfers/metrics.py index 25b6b626b..1f2b67bdd 100644 --- a/transfers/metrics.py +++ b/transfers/metrics.py @@ -22,7 +22,6 @@ from pydantic import ValidationError from sqlalchemy import select, func from sqlalchemy.exc import ProgrammingError -from sqlalchemy.orm import Session from db import ( Thing, @@ -33,7 +32,10 @@ Parameter, Deployment, TransducerObservation, + Group, + Asset, ) +from db.engine import session_ctx from services.gcs_helper import get_storage_bucket @@ -77,9 +79,24 @@ def sensor_metrics(self, *args, **kw) 
-> None: def well_screen_metrics(self, *args, **kw) -> None: self._handle_metrics(WellScreen, *args, **kw) - def contact_metrics(self, sess, input_df, cleaned_df, errors) -> None: + def welldata_link_ids_metrics(self, input_df, cleaned_df, errors) -> None: + self._write_metrics("WellData Link IDs", len(input_df), input_df, cleaned_df) + self._write_errors(errors) + + def location_link_ids_metrics(self, input_df, cleaned_df, errors) -> None: + self._write_metrics( + "LocationData Link IDs", len(input_df), input_df, cleaned_df + ) + self._write_errors(errors) + + def asset_metrics(self, *args, **kw) -> None: + self._handle_metrics(Asset, *args, **kw) + + def group_metrics(self, *args, **kw) -> None: + self._handle_metrics(Group, *args, **kw) + + def contact_metrics(self, input_df, cleaned_df, errors) -> None: count = self._get_count( - sess, Contact, ) @@ -90,14 +107,15 @@ def contact_metrics(self, sess, input_df, cleaned_df, errors) -> None: self._writer.writerow(metrics) self._write_errors(errors) - def water_level_metrics(self, sess, input_df, cleaned_df, errors) -> None: - sql = ( - select(func.count()) - .select_from(Observation) - .join(Parameter) - .where(Parameter.parameter_name == "groundwater level") - ) - count = sess.execute(sql).scalar_one() + def water_level_metrics(self, input_df, cleaned_df, errors) -> None: + with session_ctx() as sess: + sql = ( + select(func.count()) + .select_from(Observation) + .join(Parameter) + .where(Parameter.parameter_name == "groundwater level") + ) + count = sess.execute(sql).scalar_one() metrics = self._make_metrics( "Manual Water Levels", len(input_df), len(cleaned_df), count @@ -111,19 +129,18 @@ def acoustic_metrics(self, *args, **kw) -> None: def pressure_metrics(self, *args, **kw) -> None: self._transducer_metrics("Pressure Transducer", *args, **kw) - def _transducer_metrics( - self, sensor_type, sess, input_df, cleaned_df, errors - ) -> None: - sql = ( - select(func.count()) - .select_from(TransducerObservation) - 
.join(Deployment) - .join(Sensor) - .join(Parameter) - .where(Sensor.sensor_type == sensor_type) - .where(Parameter.parameter_name == "groundwater level") - ) - count = sess.execute(sql).scalar_one() + def _transducer_metrics(self, sensor_type, input_df, cleaned_df, errors) -> None: + with session_ctx() as sess: + sql = ( + select(func.count()) + .select_from(TransducerObservation) + .join(Deployment) + .join(Sensor) + .join(Parameter) + .where(Sensor.sensor_type == sensor_type) + .where(Parameter.parameter_name == "groundwater level") + ) + count = sess.execute(sql).scalar_one() metrics = self._make_metrics(sensor_type, len(input_df), len(cleaned_df), count) self._writer.writerow(metrics) self._write_errors(errors) @@ -133,9 +150,9 @@ def _make_metrics(self, name, input_n, cleaned_n, count): return [name, input_n, cleaned_n, count, percent_issue] def _handle_metrics( - self, model, sess, input_df, cleaned_df, errors, where=None, name=None + self, model, input_df, cleaned_df, errors, where=None, name=None ) -> None: - count = self._get_count(sess, model, where=where) + count = self._get_count(model, where=where) if name is None: name = model.__name__ @@ -183,11 +200,12 @@ def _write_metrics( metrics = self._make_metrics(name, len(input_df), len(cleaned_df), count) self._writer.writerow(metrics) - def _get_count(self, sess: Session, model, where=None) -> int: - sql = select(func.count()).select_from(model) - if where: - sql = sql.where(where) - count = sess.execute(sql).scalar_one() + def _get_count(self, model, where=None) -> int: + with session_ctx() as sess: + sql = select(func.count()).select_from(model) + if where: + sql = sql.where(where) + count = sess.execute(sql).scalar_one() return count diff --git a/transfers/permissions_transfer.py b/transfers/permissions_transfer.py new file mode 100644 index 000000000..18daa1040 --- /dev/null +++ b/transfers/permissions_transfer.py @@ -0,0 +1,95 @@ +from sqlalchemy.orm import Session +from datetime import datetime 
+from pandas import isna + +from db import Thing, PermissionHistory +from transfers.util import read_csv, logger, replace_nans + +""" +Developer's notes + +According to Laila the column WellData.OpenWellLoggerOK only pertains to the +physical properties of a well (that is, if a datalogger can be installed). It +does not pertain to permissions. +""" + + +def transfer_permissions(session: Session): + """ + The transferred wells and contacts need to be transferred first + - to access the auto-generated well IDs + - to know who gave permission to which well since contact_id is required for + PermissionHistory + """ + wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) + wdf = replace_nans(wdf) + + transferred_wells = ( + session.query(Thing).filter(Thing.thing_type == "water well").all() + ) + + for well in transferred_wells: + if len(well.contacts) == 0: + logger.critical( + f"Well {well.name} has no associated contacts; skipping permission transfer." + ) + continue + else: + # Assuming the first contact is the relevant one + contact_id = well.contacts[0].id + + allow_water_level_samples = wdf.loc[ + wdf["PointID"] == well.name, "MonitorOK" + ].values + if len(allow_water_level_samples) == 0: + pass + elif isna(allow_water_level_samples[0]): + pass + else: + try: + permission_allowed = bool(allow_water_level_samples[0]) + permission = PermissionHistory( + contact_id=contact_id, + permission_type="Water Level Sample", + permission_allowed=permission_allowed, + start_date=datetime.today().date(), + target_id=well.id, + target_table="thing", + ) + session.add(permission) + logger.info( + f"Transferred Water Level Sample permission for well {well.name}: {permission_allowed}." 
+ ) + except Exception as e: + logger.error(f"Error transferring permission for well {well.name}: {e}") + session.rollback() + pass + + allow_water_chemistry_samples = wdf.loc[ + wdf["PointID"] == well.name, "SampleOK" + ].values + if len(allow_water_chemistry_samples) == 0: + pass + elif isna(allow_water_chemistry_samples[0]): + pass + else: + try: + permission_allowed = bool(allow_water_chemistry_samples[0]) + permission = PermissionHistory( + contact_id=contact_id, + permission_type="Water Chemistry Sample", + permission_allowed=permission_allowed, + start_date=datetime.today().date(), + target_id=well.id, + target_table="thing", + ) + session.add(permission) + logger.info( + f"Transferred Water Chemistry Sample permission for well {well.name}: {permission_allowed}." + ) + except Exception as e: + logger.error(f"Error transferring permission for well {well.name}: {e}") + session.rollback() + pass + + session.commit() diff --git a/transfers/sensor_transfer.py b/transfers/sensor_transfer.py index f6ff49dcb..2f4ce7cf3 100644 --- a/transfers/sensor_transfer.py +++ b/transfers/sensor_transfer.py @@ -15,196 +15,217 @@ # =============================================================================== from datetime import datetime +import pandas as pd from sqlalchemy import select - -from db import Sensor, Deployment, Thing -from transfers.util import read_csv, logger, filter_to_valid_point_ids, replace_nans +from sqlalchemy.orm import Session + +from db import Sensor, Deployment, Thing, Base +from transfers.transferer import ThingBasedTransferer +from transfers.util import ( + read_csv, + logger, + filter_to_valid_point_ids, + replace_nans, + SensorParameterEstimator, +) EQUIPMENT_TO_SENSOR_TYPE_MAP = { "Pressure transducer": "Pressure Transducer", "Acoustic sounder": "Acoustic Sounder", "Barometer": "Barometer", + "DiverLink": "DiverLink", + "Diver Cable": "Diver Cable", } -def transfer_sensors(session): +class SensorTransferer(ThingBasedTransferer): source_table = 
"Equipment" - input_df = read_csv(source_table) - input_df.columns = input_df.columns.str.replace(" ", "_") - input_df = input_df[input_df.SerialNo.notna()] - cleaned_df = filter_to_valid_point_ids(session, input_df) - cleaned_df = replace_nans(cleaned_df) - errors = [] - grouped_equipment = cleaned_df.groupby(["PointID"]) - added = {} - for index, group in grouped_equipment: - pointid = index[0] - thing = session.query(Thing).filter(Thing.name == pointid).first() - if thing is None: - logger.warning( - f"Skipping sensor transfer for Thing with PointID {pointid} since it is not in the DB" - ) - continue - ordered_group = group.sort_values(by=["DateInstalled"]) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._estimators = {} + self._added = {} + + def _get_dfs(self): + input_df = read_csv(self.source_table) + input_df.columns = input_df.columns.str.replace(" ", "_") + input_df = input_df[input_df.SerialNo.notna()] + cleaned_df = filter_to_valid_point_ids(input_df) + cleaned_df = replace_nans(cleaned_df) + return input_df, cleaned_df + + def _no_db_item_warning(self, index): + return f"Skipping sensor transfer for Thing with PointID {index[0]} since it is not in the DB" + + def _get_prepped_group(self, group): + return group.sort_values(by=["DateInstalled"]) + + def _get_estimator(self, sensor_type): + if sensor_type in self._estimators: + estimator = self._estimators[sensor_type] + else: + estimator = SensorParameterEstimator(sensor_type) + self._estimators[sensor_type] = estimator + return estimator + + def _group_step(self, session: Session, row: pd.Series, db_item: Base): + pointid = self._get_point_id(row, db_item) try: - for row in ordered_group.itertuples(): - try: - sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] - except KeyError as e: - logger.critical( - f"Skipping equipment with type {row.EquipmentType} for point {pointid}" - ) - error = ( - f"key error adding sensor_type:{row.EquipmentType} error: {e}" - 
) - errors.append( - { - "pointid": pointid, - "error": error, - "table": source_table, - "field": "EquipmentType", - } - ) - continue - - if row.SerialNo in added: - logger.info( - f"Sensor with serial number {row.SerialNo} already added in this transfer session. Only creating deployment for that record" - ) - sensor = added[row.SerialNo] - else: - sensor = ( - session.query(Sensor) - .filter(Sensor.serial_no == row.SerialNo) - .one_or_none() - ) - if sensor: - logger.info( - f"Sensor with serial number {row.SerialNo} already exists. Only creating deployment for that record" - ) - - if not sensor: - # TODO: Add validation - sensor = Sensor( - nma_pk_equipment=row.GlobalID, - name=row.ID, - sensor_type=sensor_type, - model=row.Model, - serial_no=row.SerialNo, - owner_agency="NMBGMR", - notes=row.Equipment_Notes, - ) - added[row.SerialNo] = sensor - session.add(sensor) - logger.info( - f"Added sensor {sensor.name} with serial number {sensor.serial_no}" - ) - - if row.DateInstalled: - installation_date = datetime.strptime( - row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" - ).date() - else: - logger.critical( - f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " - f"SerialNo: {row.SerialNo} PointID: {pointid}" - ) - errors.append( - { - "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. Installation Date cannot " - f"be None", - "table": source_table, - "field": "DateInstalled", - } - ) - continue - - removal_date = None - if row.DateRemoved: - removal_date = datetime.strptime( - row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" - ).date() - - try: - recording_interval = int(row.RecordingInterval) - except (ValueError, TypeError): - logger.critical( - f"name={sensor.name}, serial_no={sensor.serial_no} RecordingInterval is not an " - f"integer. Setting to None" - ) - recording_interval = None - errors.append( - { - "pointid": pointid, - "error": f"row.ID={row.ID}, row.SerialNo={row.SerialNo}. 
RecordingInterval is " - f"not an integer", - "table": source_table, - "field": "RecordingInterval", - } - ) - sql = ( - select(Deployment) - .join(Thing) - .join(Sensor) - .where(Thing.name == pointid) - .where(Sensor.serial_no == sensor.serial_no) - .where(Deployment.installation_date == installation_date) - .where(Deployment.removal_date == removal_date) + sensor_type = EQUIPMENT_TO_SENSOR_TYPE_MAP[row.EquipmentType] + except KeyError as e: + logger.critical( + f"Skipping equipment with type {row.EquipmentType} for point {pointid}" + ) + error = f"key error adding sensor_type:{row.EquipmentType} error: {e}" + self._capture_error(pointid, error, "EquipmentType") + + return + + if row.SerialNo in self._added: + logger.info( + f"Sensor with serial number {row.SerialNo} already added in this transfer session. Only creating deployment for that record" + ) + sensor = self._added[row.SerialNo] + else: + sensor = ( + session.query(Sensor) + .filter(Sensor.serial_no == row.SerialNo) + .one_or_none() + ) + if sensor: + logger.info( + f"Sensor with serial number {row.SerialNo} already exists. 
Only creating deployment for that record" ) - existing_deployment = session.execute(sql).scalars().one_or_none() - if existing_deployment: - logger.info("existing deployment") - continue - - # TODO: add validation - deployment = Deployment( - thing=thing, - sensor=sensor, - installation_date=installation_date, - removal_date=removal_date, - recording_interval=recording_interval, - recording_interval_units="hour", - hanging_cable_length=row.HangingCableLength, - hanging_point_height=row.HangingPointHgt, - hanging_point_description=row.HangingPointDescription, + if not sensor: + # TODO: Add validation + sensor = Sensor( + nma_pk_equipment=row.GlobalID, + name=row.ID, + sensor_type=sensor_type, + model=row.Model, + serial_no=row.SerialNo, + owner_agency="NMBGMR", + notes=row.Equipment_Notes, + ) + self._added[row.SerialNo] = sensor + session.add(sensor) + logger.info( + f"Added sensor {sensor.name} with serial number {sensor.serial_no}" + ) + + if row.DateInstalled: + installation_date = datetime.strptime( + row.DateInstalled, "%Y-%m-%d %H:%M:%S.%f" + ).date() + else: + pointid = self._get_point_id(row, None) + estimator = self._get_estimator(sensor_type) + installation_date = estimator.estimate_installation_date(row) + if not installation_date: + logger.critical( + f"Installation Date cannot be None. Skipping deployment. Sensor: {row.ID}, " + f"SerialNo: {row.SerialNo} PointID: {pointid}" ) - session.add(deployment) - logger.info( - f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {thing.name}: | Installation Date: {installation_date} | Removal Date: {removal_date}" + self._capture_error( + pointid, + f"row.SerialNo={row.SerialNo}. Installation Date cannot be None", + "DateInstalled", + ) + return + else: + logger.warning( + f"Estimated installation date={installation_date} for {pointid}" ) + self._capture_error( + pointid, + f"Estimated installation date={installation_date}. 
Is this correct?", + "DateInstalled", + ) + + removal_date = None + if row.DateRemoved: + removal_date = datetime.strptime( + row.DateRemoved, "%Y-%m-%d %H:%M:%S.%f" + ).date() + + recording_interval_unit = "hour" + try: + recording_interval = int(row.RecordingInterval) + except (ValueError, TypeError): + # try to calculate recording interval from measurements + estimator = self._get_estimator(sensor_type) + recording_interval, unit, error = estimator.estimate_recording_interval( + row, installation_date, removal_date + ) - """ - Developer's notes + if recording_interval: + recording_interval_unit = unit + logger.info( + f"name={sensor.name}, serial_no={sensor.serial_no}. " + f"estimated recording interval: {recording_interval} {unit}" + ) + self._capture_error( + pointid, + f"Estimated recording interval={recording_interval} {unit}. Is this correct?", + "RecordingInterval", + ) - Since it's unclear beforehand if a sensor has been removed just update - the sensor_status based off of each deployments installation/removal - dates - """ - if installation_date: - sensor.sensor_status = "In Service" - if removal_date: - sensor.sensor_status = "Retired" - session.commit() - except Exception as e: - logger.critical(f"Could not add sensor and deployment: {e}") - errors.append({"pointid": pointid, "error": e, "table": source_table}) + else: + logger.critical( + f"name={sensor.name}, serial_no={sensor.serial_no} error={error}" + ) + self._capture_error( + pointid, + f"name={sensor.name}, row.SerialNo={row.SerialNo}. 
error={error}", + "RecordingInterval", + ) - return input_df, cleaned_df, errors + sql = ( + select(Deployment) + .join(Thing) + .join(Sensor) + .where(Thing.name == pointid) + .where(Sensor.serial_no == sensor.serial_no) + .where(Deployment.installation_date == installation_date) + .where(Deployment.removal_date == removal_date) + ) + + existing_deployment = session.execute(sql).scalars().one_or_none() + if existing_deployment: + logger.info("existing deployment") + return + + # TODO: add validation + deployment = Deployment( + thing=db_item, + sensor=sensor, + installation_date=installation_date, + removal_date=removal_date, + recording_interval=recording_interval, + recording_interval_units=recording_interval_unit, + hanging_cable_length=row.HangingCableLength, + hanging_point_height=row.HangingPointHgt, + hanging_point_description=row.HangingPointDescription, + ) + session.add(deployment) + logger.info( + f"Added deployment for sensor with serial number {sensor.serial_no}, deployed to {db_item.name}: | " + f"Installation Date: {installation_date} | Removal Date: {removal_date}" + ) + + """ + Developer's notes + + Since it's unclear beforehand if a sensor has been removed just update + the sensor_status based off of each deployments installation/removal + dates + """ + if installation_date: + sensor.sensor_status = "In Service" + if removal_date: + sensor.sensor_status = "Retired" # ============= EOF ============================================= -def init_sensor(session): - sensor = Sensor() - sensor.name = "Groundwater level manual measurement" - sensor.description = "manual gwl measurement. needs to be replaced with measurementmethod(?) e.g. steel tape, eprobe, etc." 
- sensor.unit = "ft" - sensor.datetime_installed = datetime.now() - session.add(sensor) - session.commit() - - -if __name__ == "__main__": - transfer_sensors("abc") diff --git a/transfers/stratigraphy_transfer.py b/transfers/stratigraphy_transfer.py new file mode 100644 index 000000000..de51e354e --- /dev/null +++ b/transfers/stratigraphy_transfer.py @@ -0,0 +1,285 @@ +""" +Transfer script for stratigraphy (lithology log) data. + +This creates ThingGeologicFormationAssociation records from the Stratigraphy CSV, which contains depth-specific +formation information for wells. It also updates the GeologicFormation.lithology field based on the +Stratigraphy.Lithology data. +""" + +import time +from sqlalchemy.orm import Session + +from db import Thing, GeologicFormation, ThingGeologicFormationAssociation +from transfers.util import ( + read_csv, + replace_nans, + filter_to_valid_point_ids, + lexicon_mapper, + logger, +) + + +def transfer_stratigraphy(session: Session, limit: int = None) -> tuple: + """ + Transfer detailed stratigraphy (lithology log) data from Stratigraphy CSV. + + The Stratigraphy CSV contains multiple rows per well, each representing a + depth interval, the formation encountered, and its lithology. + + Fields used: + - PointID: Links to the well + - UnitIdentifier: Formation code (maps to LU_Formations) + - StratTop: Top depth of the layer (feet below ground surface) + - StratBottom: Bottom depth of the layer (feet below ground surface) + - Lithology: Lithology code (maps to LU_Lithology via ABBREVIATION field) + + This should be run AFTER: + 1. transfer_geologic_formations.py (so formations exist) + 2. transfer_wells.py (so wells exist) + + Args: + session: Database session + limit: Optional limit on number of WELLS to process (for testing) + + Returns: + tuple: (input_df, cleaned_df, errors) + """ + # 1. 
Read and clean data + input_df = read_csv("Stratigraphy") + cleaned_df = replace_nans(input_df) + + # Step 2: Filter to only wells that exist in database + cleaned_df = filter_to_valid_point_ids(session, cleaned_df) + + n_records = len(cleaned_df) + n_wells = len(cleaned_df["PointID"].unique()) + + logger.info( + f"Starting transfer of {n_records} stratigraphy records for {n_wells} wells" + ) + + # 3. Initialize tracking variables for logging + step = 25 + start_time = time.time() + errors = [] + created_count = 0 + skipped_count = 0 + lithology_updates = 0 + + # Step 4: Group by well for efficient processing + well_groups = cleaned_df.groupby("PointID") + + for well_index, (pointid, strat_group) in enumerate(well_groups): + # Check limit (on number of wells, not records) + if limit and well_index >= limit: + logger.info(f"Reached limit of {limit} wells. Stopping.") + break + + # Progress logging every 25 wells + if well_index and not well_index % step: + logger.info( + f"Processing well {well_index} of {n_wells}, " + f"avg wells per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + + # Periodic commit + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing stratigraphy records: {e}") + session.rollback() + continue + + # 5. Get the well from database + thing = session.query(Thing).filter(Thing.name == pointid).first() + if not thing: + logger.warning( + f"Well {pointid} not found in database, skipping stratigraphy" + ) + skipped_count += len(strat_group) + continue + + logger.info( + f"Processing {len(strat_group)} stratigraphy layers for well {pointid}" + ) + + # 6. 
Process each stratigraphy record for this well + for layer_index, row in enumerate(strat_group.itertuples()): + # Validate required fields + # UnitIdentifier + if not hasattr(row, "UnitIdentifier") or not row.UnitIdentifier: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no UnitIdentifier, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing UnitIdentifier", + } + ) + continue + # StratTop + if not hasattr(row, "StratTop") or row.StratTop is None: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no StratTop, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing StratTop", + } + ) + continue + # StratBottom + if not hasattr(row, "StratBottom") or row.StratBottom is None: + logger.critical( + f"Stratigraphy record {layer_index} for {pointid} has no StratBottom, skipping" + ) + skipped_count += 1 + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": "Missing StratBottom", + } + ) + continue + + # Extract formation code + formation_code = row.UnitIdentifier.strip() + + # Validate depth values + try: + top_depth = float(row.StratTop) + bottom_depth = float(row.StratBottom) + except (ValueError, TypeError) as e: + error_msg = f"Invalid depth values: StratTop={row.StratTop}, StratBottom={row.StratBottom}" + logger.critical( + f"{pointid} layer {layer_index}: {error_msg}, error: {e}" + ) + errors.append( + { + "pointid": pointid, + "layer": layer_index, + "error": error_msg, + "details": str(e), # for conversion errors + } + ) + skipped_count += 1 + continue + + # Validate depth logic + if top_depth >= bottom_depth: + error_msg = ( + f"Invalid depth logic: top={top_depth} >= bottom={bottom_depth}" + ) + logger.critical(f"{pointid} layer {layer_index}: {error_msg}") + errors.append( + {"pointid": pointid, "layer": layer_index, "error": error_msg} + ) + 
skipped_count += 1 + continue + + if top_depth < 0: + error_msg = f"Negative top depth: {top_depth}" + logger.critical(f"{pointid} layer {layer_index}: {error_msg}") + errors.append( + {"pointid": pointid, "layer": layer_index, "error": error_msg} + ) + skipped_count += 1 + continue + + # 7. Get or create the formation + formation = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() + ) + + if not formation: + # Create new formation if it doesn't exist + logger.info(f"Creating new geologic formation: {formation_code}") + formation = GeologicFormation( + formation_code=formation_code, + description=None, + lithology=None, # Will be set below + ) + session.add(formation) + session.flush() + + # 8. Update formation lithology if available and not already set + if hasattr(row, "Lithology") and row.Lithology: + try: + # Map lithology code to geologic_formation.lithology using ABBREVIATION field + lithology = lexicon_mapper.map_value( + f"LU_Lithology:{row.Lithology}" + ) + + # Update if formation does not have lithology yet + if not formation.lithology: + formation.lithology = lithology + lithology_updates += 1 + logger.info(f"Set lithology for {formation_code}: {lithology}") + elif formation.lithology != lithology: + # Log if there's a mismatch (different lithology for same formation) + logger.warning( + f"Formation {formation_code} has conflicting lithology: " + f"existing='{formation.lithology}', new='{lithology}'." + ) + except KeyError: + logger.warning( + f"Unknown lithology code '{row.Lithology}' for {pointid}, skipping lithology update" + ) + except Exception as e: + logger.warning(f"Error mapping lithology '{row.Lithology}': {e}") + + # 9. 
Create ThingGeologicFormationAssociation record + try: + formation_assoc = ThingGeologicFormationAssociation( + thing=thing, + geologic_formation=formation, + top_depth=top_depth, + bottom_depth=bottom_depth, + ) + session.add(formation_assoc) + created_count += 1 + + logger.info( + f" Layer {layer_index + 1}: {formation.formation_code} " + f"from {top_depth:.1f} to {bottom_depth:.1f} ft" + ) + + except Exception as e: + logger.critical( + f"Error creating stratigraphy association for {pointid}, " + f"formation {formation_code}: {e}" + ) + errors.append( + { + "pointid": pointid, + "formation": formation_code, + "layer": layer_index, + "error": str(e), + } + ) + skipped_count += 1 + continue + + # 10. Final commit + try: + session.commit() + logger.info( + f"Successfully transferred stratigraphy: " + f"{created_count} associations created, {skipped_count} skipped, " + f"{lithology_updates} lithology fields updated, {len(errors)} errors" + ) + except Exception as e: + logger.critical(f"Error in final commit: {e}") + session.rollback() + + return input_df, cleaned_df, errors diff --git a/transfers/transfer.py b/transfers/transfer.py index 77275ed35..45a78cc60 100644 --- a/transfers/transfer.py +++ b/transfers/transfer.py @@ -17,28 +17,33 @@ from dotenv import load_dotenv +from db.engine import session_ctx +from services.util import get_bool_env +from transfers.aquifer_system_transfer import transfer_aquifer_systems +from transfers.geologic_formation_transfer import transfer_geologic_formations + load_dotenv() -from transfers.metrics import Metrics from transfers.waterlevels_transducer_transfer import ( - transfer_water_levels_pressure, - transfer_water_levels_acoustic, + WaterLevelsContinuousPressureTransferer, + WaterLevelsContinuousAcousticTransferer, ) + +from transfers.metrics import Metrics from core.initializers import erase_and_rebuild_db -from db.engine import session_ctx -from transfers.group_transfer import transfer_groups -from transfers.link_ids_transfer 
import transfer_link_ids, transfer_link_ids_welldata -from transfers.contact_transfer import transfer_contacts -from transfers.sensor_transfer import transfer_sensors -from transfers.waterlevels_transfer import transfer_water_levels -from transfers.well_transfer import ( - transfer_wells, - transfer_wellscreens, +from transfers.group_transfer import ProjectGroupTransferer +from transfers.link_ids_transfer import ( + LinkIdsWellDataTransferer, + LinkIdsLocationDataTransferer, ) +from transfers.contact_transfer import ContactTransfer +from transfers.sensor_transfer import SensorTransferer +from transfers.waterlevels_transfer import WaterLevelTransferer +from transfers.well_transfer import WellTransferer, WellScreenTransferer -from transfers.asset_transfer import transfer_assets -from transfers.util import timeit, timeit_direct +from transfers.asset_transfer import AssetTransferer +from transfers.util import timeit from transfers.logger import logger, save_log_to_bucket @@ -50,103 +55,41 @@ def message(msg, pad=10, new_line_at_top=True): @timeit -def transfer_all(sess, limit=100): +def transfer_all(metrics, limit=100): message("STARTING TRANSFER", new_line_at_top=False) - - logger.info("Erase and rebuilding database") - erase_and_rebuild_db() - - metrics = Metrics() - message("TRANSFERRING WELLS") - - flags = { - "TRANSFER_ALL_WELLS": True, - "TRANSFER_ALL_WELLSCREENS": True, - } - - results = timeit_direct(transfer_wells, sess, flags=flags, limit=limit) - metrics.well_metrics(sess, *results) - - message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, sess) - metrics.well_screen_metrics(sess, *results) - - message("TRANSFERRING SENSORS") - results = timeit_direct(transfer_sensors, sess) - metrics.sensor_metrics(sess, *results) - - # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' - # message("TRANSFERRING SPRINGS") - # timeit_direct(transfer_springs, sess, limit=limit) - # - # 
message("TRANSFERRING PERENNIAL STREAMS") - # timeit_direct(transfer_perennial_stream, sess, limit=limit) - # - # message("TRANSFERRING EPHEMERAL STREAMS") - # timeit_direct(transfer_ephemeral_stream, sess, limit=limit) - # - # message("TRANSFERRING METEOROLOGICAL") - # timeit_direct(transfer_met, sess, limit) - - message("TRANSFERRING CONTACTS") - results = timeit_direct(transfer_contacts, sess) - metrics.contact_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS") - results = timeit_direct(transfer_water_levels, sess) - metrics.water_level_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS PRESSURE") - results = timeit_direct(transfer_water_levels_pressure, sess) - metrics.pressure_metrics(sess, *results) - - message("TRANSFERRING WATER LEVELS ACOUSTIC") - results = timeit_direct(transfer_water_levels_acoustic, sess) - metrics.acoustic_metrics(sess, *results) - - """ - Developer's notes - - When transfering water chemistry data use the qc_type field to indicate - normal/blanks/duplicates instead of what comes from LU_SampleType. Use - those values, however, to map to the standard qc_type fields if applicable - (i.e. 
not applicable when sample type is "Soil or rock sample" or - "Precipitation," but is applicable when sample type is "Equipment blank" - or "Field duplicate") - """ - message("TRANSFERRING LINK IDS") - timeit_direct(transfer_link_ids, sess) - timeit_direct(transfer_link_ids_welldata, sess) - - message("TRANSFERRING GROUPS") - timeit_direct(transfer_groups, sess) - - message("TRANSFERRING ASSETS") - timeit_direct(transfer_assets, sess) - - -def transfer_debugging(sess, limit=100): - message("STARTING TRANSFER DEBUG", new_line_at_top=False) - - if int(os.environ.get("ERASE_AND_REBUILD", 0)): + if get_bool_env("ERASE_AND_REBUILD", False): logger.info("Erase and rebuilding database") erase_and_rebuild_db() - metrics = Metrics() - message("TRANSFERRING WELLS") - - flags = {"TRANSFER_ALL_WELLS": True} + flags = {"TRANSFER_ALL_WELLS": True, "LIMIT": limit} # not currently used - results = timeit_direct(transfer_wells, sess, flags=flags, limit=limit) - metrics.well_metrics(sess, *results) + with session_ctx() as session: + transfer_aquifer_systems(session, limit=limit) + transfer_geologic_formations(session, limit=limit) - message("TRANSFERRING WELL SCREENS") - results = timeit_direct(transfer_wellscreens, sess) - metrics.well_screen_metrics(sess, *results) - - message("TRANSFERRING SENSORS") - results = timeit_direct(transfer_sensors, sess) - metrics.sensor_metrics(sess, *results) + message("TRANSFERRING WELLS") + results = _execute_transfer(WellTransferer, flags=flags) + metrics.well_metrics(*results) + + transfer_screens = get_bool_env("TRANSFER_WELL_SCREENS", True) + transfer_sensors = get_bool_env("TRANSFER_SENSORS", True) + transfer_contacts = get_bool_env("TRANSFER_CONTACTS", True) + transfer_waterlevels = get_bool_env("TRANSFER_WATERLEVELS", True) + transfer_pressure = get_bool_env("TRANSFER_WATERLEVELS_PRESSURE", True) + transfer_acoustic = get_bool_env("TRANSFER_WATERLEVELS_ACOUSTIC", True) + transfer_link_ids = get_bool_env("TRANSFER_LINK_IDS", True) + 
transfer_groups = get_bool_env("TRANSFER_GROUPS", True) + transfer_assets = get_bool_env("TRANSFER_ASSETS", True) + + if transfer_screens: + message("TRANSFERRING WELL SCREENS") + results = _execute_transfer(WellScreenTransferer, flags=flags) + metrics.well_screen_metrics(*results) + + if transfer_sensors: + message("TRANSFERRING SENSORS") + results = _execute_transfer(SensorTransferer, flags=flags) + metrics.sensor_metrics(*results) # Developer's notes all the metadata for these Things are not defined in the models/schemas yet' # message("TRANSFERRING SPRINGS") @@ -161,56 +104,63 @@ def transfer_debugging(sess, limit=100): # message("TRANSFERRING METEOROLOGICAL") # timeit_direct(transfer_met, sess, limit) - message("TRANSFERRING CONTACTS") - results = timeit_direct(transfer_contacts, sess) - metrics.contact_metrics(sess, *results) - # - message("TRANSFERRING WATER LEVELS") - results = timeit_direct(transfer_water_levels, sess) - metrics.water_level_metrics(sess, *results) - - # message("TRANSFERRING WATER LEVELS PRESSURE") - # results = timeit_direct(transfer_water_levels_pressure, sess) - # metrics.pressure_metrics(sess, *results) - - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # results = timeit_direct(transfer_water_levels_acoustic, sess) - # metrics.acoustic_metrics(sess, *results) - - """ - Developer's notes - - When transfering water chemistry data use the qc_type field to indicate - normal/blanks/duplicates instead of what comes from LU_SampleType. Use - those values, however, to map to the standard qc_type fields if applicable - (i.e. 
not applicable when sample type is "Soil or rock sample" or - "Precipitation," but is applicable when sample type is "Equipment blank" - or "Field duplicate") - """ - # message("TRANSFERRING LINK IDS") - # timeit_direct(transfer_link_ids, sess) - # timeit_direct(transfer_link_ids_welldata, sess) - - # message("TRANSFERRING GROUPS") - # timeit_direct(transfer_groups, sess) - - # message("TRANSFERRING WATER LEVELS ACOUSTIC") - # timeit_direct(transfer_water_levels_acoustic, sess) - # message("TRANSFERRING ASSETS") - # timeit_direct(transfer_assets, sess) - metrics.close() - metrics.save_to_storage_bucket() + if transfer_contacts: + message("TRANSFERRING CONTACTS") + results = _execute_transfer(ContactTransfer, flags=flags) + metrics.contact_metrics(*results) + + if transfer_waterlevels: + message("TRANSFERRING WATER LEVELS") + results = _execute_transfer(WaterLevelTransferer, flags=flags) + metrics.water_level_metrics(*results) + + if transfer_pressure: + message("TRANSFERRING WATER LEVELS PRESSURE") + results = _execute_transfer( + WaterLevelsContinuousPressureTransferer, flags=flags + ) + metrics.pressure_metrics(*results) + + if transfer_acoustic: + message("TRANSFERRING WATER LEVELS ACOUSTIC") + results = _execute_transfer( + WaterLevelsContinuousAcousticTransferer, flags=flags + ) + metrics.acoustic_metrics(*results) + + if transfer_link_ids: + message("TRANSFERRING LINK IDS") + results = _execute_transfer(LinkIdsWellDataTransferer, flags=flags) + metrics.welldata_link_ids_metrics(*results) + results = _execute_transfer(LinkIdsLocationDataTransferer, flags=flags) + metrics.location_link_ids_metrics(*results) + + if transfer_groups: + message("TRANSFERRING GROUPS") + results = _execute_transfer(ProjectGroupTransferer, flags=flags) + metrics.group_metrics(*results) + + if transfer_assets: + message("TRANSFERRING ASSETS") + results = _execute_transfer(AssetTransferer, flags=flags) + metrics.asset_metrics(*results) + + +def _execute_transfer(klass, flags: dict = 
None): + transferer = klass(flags=flags) + transferer.transfer() + return transferer.input_df, transferer.cleaned_df, transferer.errors def main(): message("START--------------------------------------") - limit = int(os.environ.get("TRANSFER_LIMIT", 1000)) - with session_ctx() as sess: - if int(os.environ.get("TRANSFER_DEBUG", 0)): - transfer_debugging(sess, limit=limit) - else: - transfer_all(sess, limit=limit) + limit = int(os.getenv("TRANSFER_LIMIT", 1000)) + metrics = Metrics() + transfer_all(metrics, limit=limit) + + metrics.close() + metrics.save_to_storage_bucket() # todo: move the log file to a storage bucket save_log_to_bucket() message("END--------------------------------------") diff --git a/transfers/transferer.py b/transfers/transferer.py new file mode 100644 index 000000000..4312051fd --- /dev/null +++ b/transfers/transferer.py @@ -0,0 +1,211 @@ +# =============================================================================== +# Copyright 2025 ross +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# =============================================================================== +import time + +import pandas as pd +from pandas import DataFrame +from sqlalchemy.orm import Session + +from db import Thing, Base +from db.engine import session_ctx +from transfers.logger import logger +from transfers.util import chunk_by_size + + +class ManualFixer(object): + pass + + +class Transferer(object): + input_df: pd.DataFrame = None + cleaned_df: pd.DataFrame = None + errors: list = None + flags: dict = None + source_table: str = None + + def __init__(self, flags: dict = None): + self.errors = [] + self.flags = flags if flags else {} + self.manual_fixer = ManualFixer() + + def transfer(self): + with session_ctx() as session: + self.input_df, self.cleaned_df = self._get_dfs() + self._transfer_hook(session) + session.commit() + + def _capture_error(self, pointid, error, field, table=None): + if table is None: + table = self.source_table + + self.errors.append( + { + "pointid": pointid, + "error": error, + "table": table, + "field": field, + } + ) + + def _transfer_hook(self, session: Session): + self._limit_iterator(session, self.flags.get("LIMIT", 0)) + + def _get_df_to_iterate(self) -> pd.DataFrame: + return self.cleaned_df + + def _limit_iterator(self, session: Session, limit: int, step: int = 25): + df = self._get_df_to_iterate() + n = len(df) + start_time = time.time() + logger.info(f"Starting transfer of {n} [limit={limit}] rows") + for i, row in enumerate(df.itertuples()): + if limit and i >= limit: + logger.info(f"Reached limit of {limit} rows. Stopping migration.") + break + + if i and not i % step: + logger.info( + f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" + ) + start_time = time.time() + try: + session.commit() + except Exception as e: + logger.critical(f"Error committing wells. 
{e}") + session.rollback() + continue + + self._step(session, df, i, row) + + session.commit() + self._after_hook(session) + + def _step(self, session: Session, df: pd.DataFrame, i: int, row: dict): + raise NotImplementedError("Must implement _iterator method") + + def _after_hook(self, session: Session): + pass + + def _get_dfs(self): + raise NotImplementedError("Must implement _get_dfs method") + + +class ChunkTransferer(Transferer): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.chunk_size = 1000 + + def _transfer_hook(self, session: Session): + df = self._get_df_to_iterate() + for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + dbchunk = self._get_df_chunk(session, chunk) + logger.info( + f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" + ) + for i, row in enumerate(chunk.itertuples()): + dbitem = self._get_db_item(dbchunk, row) + if not dbitem: + self._missing_db_item_warning(row) + continue + self._chunk_step(session, df, i, row, dbitem) + + # def chunk_transfer(self): + # with session_ctx() as session: + # self.input_df, self.cleaned_df = self._get_dfs(session) + # df = self._get_df_to_iterate() + # for ci, chunk in enumerate(chunk_by_size(df, self.chunk_size)): + # dbchunk = self._get_df_chunk(session, chunk) + # logger.info( + # f"Processing chunk {ci}, {len(chunk)} rows, {len(dbchunk)} db items" + # ) + # for i, row in enumerate(chunk.itertuples()): + # dbitem = self._get_db_item(dbchunk, row) + # if not dbitem: + # self._missing_db_item_warning(row) + # continue + # self._chunk_iterator(session, df, i, row, dbitem) + # session.commit() + + def _get_df_chunk(self, session, chunk): + raise NotImplementedError("Must be implemented in subclass") + + def _missing_db_item_warning(self, row): + raise NotImplementedError("Must be implemented in subclass") + + def _chunk_step(self, session, df, i, row, dbitem): + raise NotImplementedError("Must be implemented in subclass") + + def 
_get_db_item(self, chunk, row): + raise NotImplementedError("Must be implemented in subclass") + + +class GroupTransferer(Transferer): + def _get_group(self): + return self.cleaned_df.groupby(["PointID"]) + + def _transfer_hook(self, session: Session): + self._group_iterator(session) + + def _group_iterator(self, session: Session): + groups = self._get_group() + for index, group in groups: + db_item = self._get_db_item(session, index) + if db_item is None: + logger.warning(self._no_db_item_warning(index)) + continue + + prepped_group = self._get_prepped_group(group) + self._pre_group_step(session, prepped_group, db_item) + for row in prepped_group.itertuples(): + try: + self._group_step(session, row, db_item) + except Exception as e: + import traceback + + pointid = self._get_point_id(row, db_item) + traceback.print_exc() + logger.critical(f"Could not add sensor and deployment: {e}") + self._capture_error(pointid, e, "UnknownField") + + def _get_point_id(self, row: pd.Series, db_item: Base) -> str: + return row.PointID + + def _pre_group_step(self, session: Session, group: DataFrame, db_item: Base): + pass + + def _group_step(self, session: Session, row: pd.Series, db_item: Base): + raise NotImplementedError("Must be implemented in subclass") + + def _get_prepped_group(self, group) -> DataFrame: + raise NotImplementedError("Must be implemented in subclass") + + def _no_db_item_warning(self, index) -> str: + raise NotImplementedError("Must be implemented in subclass") + + def _get_db_item(self, session, index) -> Thing: + raise NotImplementedError("Must be implemented in subclass") + + +class ThingBasedTransferer(GroupTransferer): + def _get_group(self): + return self.cleaned_df.groupby(["PointID"]) + + def _get_db_item(self, session, index) -> Thing: + pointid = index[0] + return session.query(Thing).filter(Thing.name == pointid).first() + + +# ============= EOF ============================================= diff --git a/transfers/util.py b/transfers/util.py index 
cbf0f2b17..d1bc5d053 100644 --- a/transfers/util.py +++ b/transfers/util.py @@ -15,9 +15,11 @@ # =============================================================================== import csv import io +import math import os import re -from datetime import datetime, timezone, timedelta +import time +from datetime import datetime, timezone, timedelta, UTC from pathlib import Path import numpy as np @@ -28,10 +30,9 @@ from sqlalchemy.orm import Session from constants import SRID_WGS84, SRID_UTM_ZONE_13N -from db import Thing, Location, DataProvenance +from db import Thing, Location, DataProvenance, Parameter +from db.engine import session_ctx from services.gcs_helper import get_storage_bucket - -# from services.lexicon_mapper import lexicon_mapper from services.util import ( transform_srid, get_epqs_elevation_from_point, @@ -53,35 +54,192 @@ } +class MeasuringPointEstimator: + def __init__(self): + df = read_csv("WaterLevels") + df["DateMeasured"] = pd.to_datetime(df["DateMeasured"], errors="coerce") + self._df = df.dropna(subset=["DateMeasured"]) + + def estimate_measuring_point_height( + self, row + ) -> tuple[float, str, datetime | None]: + mph = row.MPHeight + mph_desc = row.MeasuringPoint + df = self._df[self._df["PointID"] == row.PointID] + df = df.sort_values("DateMeasured") + if mph is None: + logger.info( + f"No MPHeight found for PointID: {row.PointID}. Estimating from measurements." + ) + mphs = [] + start_dates = [] + mph_descs = [] + + if len(df) == 0: + logger.warning(f"No measurements found for PointID: {row.PointID}.") + else: + # try to estimate mpheight from measurements + for m in df.itertuples(): + mphi = m.DepthToWater - m.DepthToWaterBGS + start_date = m.DateMeasured + if mphi not in mphs: + mphs.append(mphi) + mph_descs.append( + "Auto calculated from measurements at depth to water and depth to water below ground surface" + ) + start_dates.append(start_date) + logger.info( + f"Estimated MPHeight: {mphs}, {start_dates} for PointID: {row.PointID}." 
+ ) + else: + mphs = [mph] + mph_descs = [mph_desc] + if len(df) > 0: + start_dates = [df["DateMeasured"].min()] + else: + start_dates = [datetime.now(tz=UTC)] + + if len(mphs) == 1: + end_dates = [None] + else: + end_dates = [start_dates[i + 1] for i in range(len(start_dates) - 1)] + end_dates.append(None) + + return zip(mphs, mph_descs, start_dates, end_dates) + + +class SensorParameterEstimator: + def __init__(self, sensor_type: str): + if sensor_type == "Pressure Transducer": + self._df = read_csv("WaterLevelsContinuous_Pressure") + else: + self._df = read_csv("WaterLevelsContinuous_Acoustic") + + # convert "DateMeasured" to date" + self._df["DateMeasured"] = pd.to_datetime(self._df["DateMeasured"]).dt.date + + def estimate_installation_date( + self, record: pd.Series + ) -> tuple[datetime | None, str | None]: + # get the first measurement for this pointid + point_id = record.PointID + cdf = self._get_values(point_id) + if len(cdf) == 0: + logger.warning( + f"Unable to estimate installation date, no measurements found for PointID: {point_id}." 
+ ) + return None + return cdf["DateMeasured"].min() + + def _get_values(self, point_id: str): + cdf = self._df[self._df["PointID"] == point_id] + return cdf.sort_values("DateMeasured") + + def estimate_recording_interval( + self, + record: pd.Series, + installation_date: datetime = None, + removal_date: datetime = None, + ) -> tuple[int | None, str | None, str | None]: + point_id = record.PointID + cdf = self._get_values(point_id) + if len(cdf) == 0: + return None, None, f"No measurements found for PointID: {point_id}" + + if installation_date is not None: + cdf = cdf[cdf["DateMeasured"] >= installation_date] + if removal_date is not None: + cdf = cdf[cdf["DateMeasured"] <= removal_date] + + # calculate the average interval in seconds + try: + date_series = pd.to_datetime(cdf["DateMeasured"]) + intervals = date_series.diff().dropna().dt.total_seconds() + if len(intervals) == 0: + logger.warning( + f"No intervals found for {point_id} for time range " + f"{installation_date}-{removal_date}. using entire series " + ) + # take average of entire series + df = self._df[self._df["PointID"] == point_id] + df = df.sort_values("DateMeasured") + date_series = pd.to_datetime(df["DateMeasured"]) + intervals = date_series.diff().dropna().dt.total_seconds() + if len(intervals) == 0: + return ( + None, + None, + f"No measurements found for {point_id} for entire series", + ) + else: + avg_interval = intervals.mean() + else: + avg_interval = intervals.mean() + except IndexError: + return ( + None, + None, + ( + f"Not enough measurements to calculate interval for PointID: {point_id}," + f"{installation_date} to {removal_date}." 
+ ), + ) + + # convert to hours + avg_interval /= 3600 + + unit = "hour" + if avg_interval < 0.95: # if less then 57 minutes convert to minutes + avg_interval *= 60 + unit = "minute" + if avg_interval < 0.95: # if less then 57 seconds convert to seconds + avg_interval *= 60 + unit = "second" + + return math.ceil(avg_interval), unit, None + + def replace_nans(df: pd.DataFrame, default=None) -> pd.DataFrame: df = df.replace(pd.NA, default) return df.replace({np.nan: default}) -def read_csv(name: str, dtype: dict | None = None) -> pd.DataFrame: +def read_csv( + name: str, dtype: dict | None = None, verbose=False, *args, **kw +) -> pd.DataFrame: p = get_transfers_data_path(Path("nma_csv_cache") / f"{name}.csv") if os.path.exists(p): - return pd.read_csv(p, dtype=dtype) + if verbose: + logger.info(f"Using cached csv: {p}") + starttime = time.time() + df = pd.read_csv(p, dtype=dtype, *args, **kw) + + if verbose: + logger.info(f"Read csv in {time.time()-starttime:0.2f}") + return df + else: + if verbose: + logger.info(f"Downloading csv: {name}") + # Fall back to GCS if local file doesn't exist + logger.info(f"Local file and cache not found, reading {name} from GCS") bucket = get_storage_bucket() blob = bucket.blob(f"nma_csv/{name}.csv") data = blob.download_as_bytes() with open(p, "wb") as f: f.write(data) - if dtype: - return pd.read_csv(io.BytesIO(data), dtype=dtype) - else: - return pd.read_csv(io.BytesIO(data)) + return pd.read_csv(io.BytesIO(data), dtype=dtype) -def get_valid_point_ids(session, thing_type="water well"): - things = get_valid_things(session, thing_type) - valid_pointids = [thing.name for thing in things] +def get_valid_point_ids(thing_type: str = "water well") -> list[str]: + with session_ctx() as session: + things = get_valid_things(session, thing_type) + valid_pointids = [thing.name for thing in things] return valid_pointids -def get_valid_things(session, thing_type="water well"): +def get_valid_things(session: Session, thing_type: str = "water 
well") -> list[Thing]: return session.query(Thing).where(Thing.thing_type == thing_type).all() @@ -102,7 +260,7 @@ def extract_organization(alternate_id: str) -> str: return "Unknown" -def get_transfers_data_path(name): +def get_transfers_data_path(name: str) -> Path: def data_path(r): return Path(r) / "transfers" / "data" @@ -115,35 +273,75 @@ def data_path(r): return root / name -def filter_non_transferred_wells(sess: Session, df: pd.DataFrame) -> pd.DataFrame: - sql = select(Thing.name).where(Thing.thing_type == "water well") - existing_ids = sess.execute(sql).scalars().all() +def filter_non_transferred_wells(df: pd.DataFrame) -> pd.DataFrame: + with session_ctx() as sess: + sql = select(Thing.name).where(Thing.thing_type == "water well") + existing_ids = sess.execute(sql).scalars().all() return df[~(df["PointID"].isin(existing_ids))] -def filter_by_welldata_datasource_and_project(df: pd.DataFrame) -> pd.DataFrame: +def get_transferable_wells( + df: pd.DataFrame, log_datasource_counts=False, log_invalid_datasources=False +) -> pd.DataFrame: path = get_transfers_data_path("valid_welldata_datasources.csv") with open(path, "r") as f: reader = csv.reader(f) _ = next(reader) valid_datasources = [row[0] for row in reader if row[1] == "Yes"] - f.seek(0) - invalid_datasources = [row[0] for row in reader if row[1] == "NO"] - logger.info("Invalid WellData Datasources:") - for vd in invalid_datasources: - logger.info(f" {vd}") - counts = df.groupby("DataSource").size().reset_index(name="WellCount") - counts = counts.sort_values("WellCount", ascending=False) - for count in counts.itertuples(): - logger.info(f"{count.DataSource}: {count.WellCount}") + if log_invalid_datasources: + f.seek(0) + invalid_datasources = [row[0] for row in reader if row[1] == "NO"] + logger.info("Invalid WellData Datasources:") + for vd in invalid_datasources: + logger.info(f" {vd}") + + if log_datasource_counts: + counts = df.groupby("DataSource").size().reset_index(name="WellCount") + counts = 
counts.sort_values("WellCount", ascending=False) + for count in counts.itertuples(): + logger.info(f"{count.WellCount}: {count.DataSource[:50]} ") pldf = read_csv("ProjectLocations") collabnet = pldf[pldf["ProjectName"] == "Water Level Network"] - return df[ - df["DataSource"].isin(valid_datasources) - | df["PointID"].isin(collabnet["PointID"]) - ] + + collabnet_pointids = collabnet["PointID"].unique().tolist() + logger.info( + f"collabnet pointids: {len(collabnet_pointids)} {collabnet_pointids[:10]}" + ) + + # get all pointids that have USGS as the DataSource but also have WaterLevel measurements where datasource is + # NMBGMR + usgs_df = df[df["DataSource"] == "USGS"] + + waterlevel_df = read_csv("WaterLevels") + waterlevel_df = waterlevel_df[waterlevel_df["MeasuringAgency"] == "NMBGMR"] + + usgs_pointids = ( + usgs_df[usgs_df["PointID"].isin(waterlevel_df["PointID"])]["PointID"] + .unique() + .tolist() + ) + logger.info(f"usgs pointids: {len(usgs_pointids)} {usgs_pointids[:10]}") + + # get all the pointids from the well photos and include them + wellphotos_df = read_csv("WellPhotos") + wellphotos_pointids = wellphotos_df["PointID"].unique().tolist() + + pointids = list(set(usgs_pointids + collabnet_pointids + wellphotos_pointids)) + logger.info(f"total pointids: {len(pointids)} {pointids[:10]}") + + # get all pointids that have owner info + ownerlinks_df = read_csv("OwnerLink") + locdf = read_csv("Location") + + ownerlinks_df = ownerlinks_df.join(locdf.set_index("LocationId"), on="LocationId") + ownerlinks_pointids = ownerlinks_df["PointID"].unique().tolist() + ownerpointids = list(set(ownerlinks_pointids) - set(pointids)) + logger.info(f"ownerpointids: {len(ownerpointids)} {ownerpointids[:10]}") + pointids = pointids + ownerpointids + + return df[df["DataSource"].isin(valid_datasources) | df["PointID"].isin(pointids)] def filter_by_valid_measuring_agency(df: pd.DataFrame) -> pd.DataFrame: @@ -159,12 +357,12 @@ def filter_by_valid_measuring_agency(df: 
pd.DataFrame) -> pd.DataFrame: return df[df["MeasuringAgency"].isin(valid_measuring_agencies)] -def filter_to_valid_point_ids(session: Session, df: pd.DataFrame) -> pd.DataFrame: - valid_point_ids = get_valid_point_ids(session) +def filter_to_valid_point_ids(df: pd.DataFrame) -> pd.DataFrame: + valid_point_ids = get_valid_point_ids() return df[df["PointID"].isin(valid_point_ids)] -def convert_mt_to_utc(dt_record: datetime): +def convert_mt_to_utc(dt_record: datetime) -> datetime: t = dt_record.time() if t.hour == 0 and t.minute == 0: # no time was measured, so just set the timezone to UTC and keep @@ -184,11 +382,22 @@ def convert_mt_to_utc(dt_record: datetime): return dt_record -def chunk_by_size(df, chunk_size): +def chunk_by_size(df: pd.DataFrame, chunk_size: int) -> pd.DataFrame: for i in range(0, len(df), chunk_size): yield df.iloc[i : i + chunk_size] +def get_groundwater_parameter_id() -> int: + with session_ctx() as session: + groundwater_parameter_id = ( + session.query(Parameter) + .filter(Parameter.parameter_name == "groundwater level") + .one() + .id + ) + return groundwater_parameter_id + + def make_location(row: pd.Series, elevations: dict) -> tuple: """ Returns a tuple of location data and the elevation method @@ -200,33 +409,6 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: point, source_srid=SRID_UTM_ZONE_13N, target_srid=SRID_WGS84 ) - """ - Developer's notes - - AMP folks said that the earlier date between DateCreated and SiteDate is when - the site was inventoried, whereas the later is when the record was made in - the database. This was because they were used interchangeably. 
- """ - if row.DateCreated and row.SiteDate: - - date_created = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - - if date_created > site_date: - created_at = date_created - else: - created_at = site_date - elif row.DateCreated and not row.SiteDate: - created_at = datetime.strptime(row.DateCreated, "%Y-%m-%d %H:%M:%S.%f") - elif not row.DateCreated and row.SiteDate: - created_at = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f") - else: - created_at = None - - # convert created_at from MST/MDT to UTC - if created_at is not None: - created_at = convert_mt_to_utc(created_at) - z = row.Altitude if z: elevation_from_epqs = False @@ -244,7 +426,7 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: else: elevation_from_epqs = True logger.info( - f"Location {row.PointID} has no Altitude. Setting from National Map EPQS for " + f"Location {row.PointID} has no Altitude. Setting from National Map EPQS. 
" ) z = get_epqs_elevation_from_point(transformed_point.x, transformed_point.y) @@ -257,14 +439,26 @@ def make_location(row: pd.Series, elevations: dict) -> tuple: f"LU_AltitudeMethod:{row.AltitudeMethod.strip()}" ) + # Extract AMPAPI date fields (Date type, not DateTime) + nma_date_created = None + if row.DateCreated: + nma_date_created = datetime.strptime( + row.DateCreated, "%Y-%m-%d %H:%M:%S.%f" + ).date() + + nma_site_date = None + if row.SiteDate: + nma_site_date = datetime.strptime(row.SiteDate, "%Y-%m-%d %H:%M:%S.%f").date() + location = Location( nma_pk_location=row.LocationId, point=transformed_point.wkt, elevation=z, release_status="public" if row.PublicRelease else "private", - created_at=created_at, nma_coordinate_notes=row.CoordinateNotes, nma_notes_location=row.LocationNotes, + nma_date_created=nma_date_created, + nma_site_date=nma_site_date, ) return location, elevation_method @@ -275,7 +469,7 @@ def make_location_data_provenance( ) -> list[DataProvenance]: provenance_records = [] - if row.AltitudeAccuracy or row.CoordinateAccuracy: + if row.AltitudeAccuracy: provenance = DataProvenance( target_id=location.id, target_table="location", @@ -369,7 +563,6 @@ def make_location_data_provenance( target_id=location.id, target_table="location", field_name="point", - origin_source=None, collection_method=coordinate_method, accuracy_value=accuracy_value, accuracy_unit=accuracy_unit, @@ -396,19 +589,34 @@ def wrapper(*args, **kwargs): class LexiconMapper: def __init__(self): - self._mappers = None + self._mappers: dict[str, str] = None - def map_value(self, value): + def map_value(self, value) -> str: value = value.strip() return self._make_lu_to_lexicon_mapper().get(value, value) - def _make_lu_to_lexicon_mapper(self): + def _make_lu_to_lexicon_mapper(self) -> dict[str, str]: + """ + Lookup tables intentionally skipped (kept for documentation only) + Each entry explains why the table is excluded + + "LU_AltitudeDatum": "code is the value, so no need for 
mapping", + "LU_CoordinateDatum": "code is the value, so no need for mapping", + "LU_FieldNoteTypes": "not being used in the transfers since there are no records", + "LU_Formations": "needs to be cleaned before it can be used", + "LU_Lithology": "needs to be cleaned before it can be used", + "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", + + :return: dict + """ if self._mappers: return self._mappers # Lookup tables where CODE maps to MEANING lu_tables = [ "LU_AltitudeMethod", + "LU_AquiferClass", + "LU_AquiferType", "LU_CollectionMethod", "LU_ConstructionMethod", "LU_CoordinateAccuracy", @@ -418,7 +626,9 @@ def _make_lu_to_lexicon_mapper(self): "LU_DataSource", "LU_Depth_CompletionSource", "LU_Discharge_ChemistrySource", + "LU_Formations", "LU_LevelStatus", + "LU_Lithology", "LU_MajorAnalyte", "LU_MeasurementMethod", "LU_MinorTraceAnalyte", @@ -428,16 +638,6 @@ def _make_lu_to_lexicon_mapper(self): "LU_Status", ] - # Lookup tables intentionally skipped (kept for documentation only) - # Each entry explains why the table is excluded - _lu_tables_skipped = { - "LU_AltitudeDatum": "code is the value, so no need for mapping", - "LU_CoordinateDatum": "code is the value, so no need for mapping", - "LU_FieldNoteTypes": "not being used in the transfers since there are no records", - "LU_Formations": "needs to be cleaned before it can be used", - "LU_Lithology": "needs to be cleaned before it can be used", - "LU_MeasuringAgency": "the abbreviation is what is used in the new schema", - } mappers = {} for lu_table in lu_tables: @@ -447,6 +647,9 @@ def _make_lu_to_lexicon_mapper(self): if lu_table == "LU_Formations": code = row.Code meaning = row.Meaning + elif lu_table == "LU_Lithology": + code = row.ABBREVIATION + meaning = row.TERM else: code = row.CODE meaning = row.MEANING diff --git a/transfers/waterlevels_transducer_transfer.py b/transfers/waterlevels_transducer_transfer.py index 64e39b439..cd323330c 100644 --- 
class WaterLevelsContinuousTransferer(Transferer):
    """Transfer continuous (sensor-recorded) water levels into transducer tables.

    Subclasses supply ``source_table`` plus the two class attributes below.
    For every PointID the source rows are split on ``_partition_field``:
    rows where the field equals 1 become "public" observations attached to an
    "approved" block, all other rows become "private" observations attached
    to a "not reviewed" block.
    """

    # Name of the source column whose value 1 marks a row as reviewed.
    _partition_field: str | None = None
    # Sensor types whose deployments are eligible to own the observations.
    _sensor_types: tuple[str, ...] | None = None

    def __init__(self, *args, **kw):
        """Initialise the transferer and validate the subclass configuration.

        Raises:
            ValueError: if a subclass did not set ``_sensor_types`` or
                ``_partition_field``.
        """
        super().__init__(*args, **kw)
        self.groundwater_parameter_id = get_groundwater_parameter_id()
        # Class-level defaults of None make these checks reachable; a bare
        # annotation would raise AttributeError instead of the ValueError.
        if not self._sensor_types:
            raise ValueError("_sensor_types must be set")
        if not self._partition_field:
            raise ValueError("_partition_field must be set")

    def _get_dfs(self):
        """Return ``(input_df, cleaned_df)`` for the configured source table.

        The cleaned frame keeps only valid PointIDs, is sorted by PointID and
        has rows without a DateMeasured removed.
        """
        input_df = read_csv(self.source_table, parse_dates=["DateMeasured"])
        cleaned_df = filter_to_valid_point_ids(input_df)
        cleaned_df = cleaned_df.sort_values(by=["PointID"])

        # remove rows with no date measured
        cleaned_df = cleaned_df[cleaned_df.DateMeasured.notna()]
        return input_df, cleaned_df

    def _transfer_hook(self, session: Session) -> None:
        """Create observation blocks and observations for every PointID.

        Each (PointID, block) pair is committed on its own so that a failing
        block is rolled back without discarding earlier work.
        """
        gwd = self.cleaned_df.groupby(["PointID"])
        n = len(gwd)
        # pointid -> (earliest, latest) DateMeasured that matched no deployment
        nodeployments = {}
        for i, (index, group) in enumerate(gwd):
            pointid = index[0]
            logger.info(
                f"Processing PointID: {pointid}. {i + 1}/{n} ({100*(i+1)/n:0.2f}) completed."
            )
            deployments = (
                session.query(Deployment)
                .join(Thing)
                .join(Sensor)
                .where(Sensor.sensor_type.in_(self._sensor_types))
                .where(Thing.name == pointid)
                .all()
            )
            # _find_deployment relies on ascending installation dates; the
            # ordering is the same for both blocks, so sort once per PointID.
            deps_sorted = sorted(
                deployments, key=lambda d: Timestamp(d.installation_date)
            )

            # sort rows by date measured
            group = group.sort_values(by="DateMeasured")
            field = getattr(group, self._partition_field)

            qced = group[field == 1]
            notqced = group[~(field == 1)]

            qced_block = TransducerObservationBlock(
                parameter_id=self.groundwater_parameter_id, review_status="approved"
            )
            notqced_block = TransducerObservationBlock(
                parameter_id=self.groundwater_parameter_id, review_status="not reviewed"
            )

            for block, rows, release_status in (
                (qced_block, qced, "public"),
                (notqced_block, notqced, "private"),
            ):
                block.start_datetime = rows.DateMeasured.min()
                block.end_datetime = rows.DateMeasured.max()

                if not deployments:
                    logger.critical(
                        f"Thing with PointID={pointid} has no deployments. Skipping water levels {release_status} block"
                    )
                    self._capture_error(pointid, "no deployments", "DateMeasured")
                    continue

                if rows.empty:
                    logger.info(f"no {release_status} records for pointid {pointid}")
                    continue

                candidates = (
                    self._make_observation(
                        pointid, row, release_status, deps_sorted, nodeployments
                    )
                    for row in rows.itertuples()
                )
                observations = [obs for obs in candidates if obs is not None]

                session.bulk_save_objects(observations)
                session.add(block)
                logger.info(
                    f"Added {len(observations)} water levels {release_status} block"
                )
                try:
                    session.commit()
                except Exception as e:
                    # Record the failure through the same channel as every
                    # other error in this transferer.
                    self._capture_error(pointid, str(e), "DateMeasured")
                    logger.critical(
                        f"Error committing water levels {release_status} block: {e}"
                    )
                    session.rollback()
                    continue

        # convert nodeployments to errors
        for pointid, (min_date, max_date) in nodeployments.items():
            self._capture_error(
                pointid,
                f"no deployment between {min_date} and {max_date}",
                "DateMeasured",
            )

    def _make_observation(
        self,
        pointid: str,
        row: pd.Series,
        release_status: str,
        deps_sorted: list,
        nodeployments: dict,
    ) -> TransducerObservation | None:
        """Build one ``TransducerObservation`` for ``row``.

        Args:
            pointid: PointID the row belongs to.
            row: one row produced by ``DataFrame.itertuples()``.
            release_status: release status stored on the observation.
            deps_sorted: deployments sorted by installation date.
            nodeployments: mutated in place; tracks per PointID the date
                range of rows for which no deployment was found.

        Returns:
            The observation, or None when no deployment covers the row's
            date or the payload fails schema validation.
        """
        measured = row.DateMeasured
        deployment = _find_deployment(measured, deps_sorted)

        if deployment is None:
            if pointid not in nodeployments:
                nodeployments[pointid] = (measured, measured)
            else:
                min_date, max_date = nodeployments[pointid]
                if measured < min_date:
                    min_date = measured
                elif measured > max_date:
                    max_date = measured
                nodeployments[pointid] = min_date, max_date

            logger.critical(
                f"No deployment found for PointID={pointid} at {row.DateMeasured}"
            )
            return None

        try:
            payload = dict(
                parameter_id=self.groundwater_parameter_id,
                deployment_id=deployment.id,
                observation_datetime=measured,
                value=row.DepthToWaterBGS,
                release_status=release_status,
            )
            obspayload = CreateTransducerObservation.model_validate(
                payload
            ).model_dump()
            return TransducerObservation(**obspayload)

        except ValidationError as e:
            logger.critical(f"Observation validation error: {e.errors()}")
            self._capture_error(pointid, str(e), "DepthToWaterBGS")
            return None


class WaterLevelsContinuousPressureTransferer(WaterLevelsContinuousTransferer):
    """Continuous water levels from the pressure table, partitioned on QCed."""

    source_table = "WaterLevelsContinuous_Pressure"
    _partition_field = "QCed"
    _sensor_types = ("Pressure Transducer", "Barometer", "DiverLink", "Diver Cable")


class WaterLevelsContinuousAcousticTransferer(WaterLevelsContinuousTransferer):
    """Continuous water levels from the acoustic table, partitioned on PublicRelease."""

    source_table = "WaterLevelsContinuous_Acoustic"
    _partition_field = "PublicRelease"
    _sensor_types = ("Acoustic Sounder",)


def _find_deployment(ts, deployments):
    """Return the first deployment whose date range contains ``ts``.

    Args:
        ts: timestamp of the measurement; only its calendar date is compared.
        deployments: deployments sorted by ascending ``installation_date``.

    Returns:
        The matching deployment, or None when no deployment covers the date.
        A deployment without a removal date is treated as still installed.
    """
    measured_on = ts.date()
    for d in deployments:
        if d.installation_date is None:
            # Cannot be placed on the timeline; comparing None with a date
            # would raise TypeError.
            continue
        if d.installation_date > measured_on:
            break  # because sorted by start
        if not d.removal_date or d.removal_date >= measured_on:
            return d
    return None
def get_contacts_info(
    row, measured_by, measured_by_mapper
) -> list[tuple[str, str, str]]:
    """Resolve a MeasuredBy value to ``(name, organization, role)`` triples.

    Args:
        row: source WaterLevels row; only read for the warning that is logged
            when ``measured_by`` has no mapping.
        measured_by: raw MeasuredBy value (may be None).
        measured_by_mapper: mapping from MeasuredBy values to either a single
            ``[name, organization, role]`` entry or a list of such entries.

    Returns:
        A list of ``(name, organization, role)`` tuples. An unmapped value
        yields one tuple with "Unknown" organization and role.
    """
    # TODO: get help figuring out (AMP)
    if measured_by in measured_by_mapper:
        args = measured_by_mapper[measured_by]
        if isinstance(args[0], list):
            entries = args
        else:
            entries = [args[:3]]
        # Unpacking enforces exactly three fields per entry.
        return [(name, org, role) for name, org, role in entries]

    logger.warning(
        f"{SPACE_6}The following record has not been mapped to a Contact: MeasuredBy {row.MeasuredBy} | MeasuringAgency {row.MeasuringAgency} for WaterLevels record with GLobalID {row.GlobalID}"
    )
    # A real list (not a one-shot zip iterator) so callers can take len(),
    # index, or iterate more than once, as the annotation promises.
    return [(measured_by, "Unknown", "Unknown")]
class WaterLevelTransferer(Transferer):
    """Transfer manual WaterLevels rows.

    Each usable row produces a FieldEvent with its participants and, unless
    the well is reported destroyed, a FieldActivity, a Sample and an
    Observation.
    """

    def __init__(self, *args, **kw):
        """Load the groundwater-level parameter id and the MeasuredBy mapping."""
        super().__init__(*args, **kw)
        self.source_table = "WaterLevels"
        with session_ctx() as session:
            groundwater_parameter_id = (
                session.query(Parameter)
                .filter(Parameter.parameter_name == "groundwater level")
                .one()
                .id
            )
        self.groundwater_parameter_id = groundwater_parameter_id

        path = get_transfers_data_path("measured_by_mapper.json")
        with open(path, "r") as f:
            self._measured_by_mapper = json.load(f)

        # (name, organization) -> Contact, so a contact is created only once
        self._created_contacts = {}

    def _get_dfs(self) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Return ``(input_df, cleaned_df)`` for the WaterLevels table."""
        input_df = read_csv(self.source_table)
        cleaned_df = filter_to_valid_point_ids(input_df)
        cleaned_df = filter_by_valid_measuring_agency(cleaned_df)
        return input_df, cleaned_df

    def _transfer_hook(self, session: Session) -> None:
        """Create the field-event object graph for every cleaned row.

        Work is committed once per PointID group.
        """
        gwd = self.cleaned_df.groupby(["PointID"])
        for index, group in gwd:
            pointid = index[0]
            thing = session.query(Thing).where(Thing.name == pointid).first()
            if thing is None:
                # Without a Thing the field events would be orphaned.
                logger.critical(
                    f"Thing with PointID={pointid} not found. Skipping water levels"
                )
                self._capture_error(pointid, "Thing with PointID not found", "PointID")
                continue

            for row in group.itertuples():
                dt_utc = self._get_dt_utc(row)
                if dt_utc is None:
                    continue

                # Resolve participants first: an event must have at least one
                # contact, so nothing is created when none is available.
                field_event_participants = self._get_field_event_participants(
                    session, row, thing
                )
                if not field_event_participants:
                    continue

                release_status = "public" if row.PublicRelease else "private"

                # field event
                field_event = FieldEvent(
                    thing=thing,
                    event_date=dt_utc,
                    release_status=release_status,
                )
                session.add(field_event)

                # The first participant is the lead and is recorded as the
                # sampler; everyone else is a plain participant.
                sampler = None
                for position, participant in enumerate(field_event_participants):
                    field_event_participant = FieldEventParticipant(
                        field_event=field_event, participant=participant
                    )
                    if position == 0:
                        field_event_participant.participant_role = "Lead"
                        sampler = field_event_participant
                    else:
                        field_event_participant.participant_role = "Participant"

                    session.add(field_event_participant)

                # reasons
                glv = self._get_groundwater_level_reason(row)
                if (
                    glv
                    == "Well was destroyed (no subsequent water levels should be recorded)"
                ):
                    logger.warning(
                        "Well is destroyed - no field activity/sample/observation will be made"
                    )
                    field_event.notes = glv
                    continue

                # Field Activity
                # TODO: use create schema to validate data
                field_activity = FieldActivity(
                    field_event=field_event,
                    activity_type="groundwater level",
                    release_status=release_status,
                )
                session.add(field_activity)

                # Sample
                sample = self._make_sample(row, field_activity, dt_utc, sampler)
                session.add(sample)

                # Observation
                observation = self._make_observation(row, sample, dt_utc, glv)
                session.add(observation)

            session.commit()

    def _make_observation(
        self, row: pd.Series, sample: Sample, dt_utc: datetime, glv: str
    ) -> Observation:
        """Build the groundwater-level Observation for ``row``.

        The measuring point height is ``abs(MPHeight)`` when present,
        otherwise ``DepthToWater - DepthToWaterBGS`` when both are present,
        otherwise None. The value is ``DepthToWater`` when present,
        otherwise ``DepthToWaterBGS`` plus the measuring point height
        (0 when unknown), otherwise None.
        """
        if pd.isna(row.MPHeight):
            if pd.notna(row.DepthToWater) and pd.notna(row.DepthToWaterBGS):
                logger.warning(
                    f"{SPACE_6}Calculating measuring_point_height as DepthToWater - DepthToWaterBGS because MPHeight is NULL"
                )
                measuring_point_height = row.DepthToWater - row.DepthToWaterBGS
            else:
                logger.warning(
                    f"{SPACE_6}Setting measuring_point_height to None because MPHeight is NULL and DepthToWater or DepthToWaterBGS is NULL"
                )
                measuring_point_height = None
        else:
            # some mp heights are recorded as negative numbers, but they should be positive
            measuring_point_height = abs(row.MPHeight)

        if pd.isna(row.DepthToWater):
            if pd.notna(row.DepthToWaterBGS):
                logger.warning(
                    f"{SPACE_6}Calculating observation value as DepthToWaterBGS + MPHeight (0 if MPHeight is NULL) because DepthToWater is NULL"
                )
                # measuring_point_height is None when MPHeight is NULL here;
                # fall back to 0 as the message above states.
                value = row.DepthToWaterBGS + (measuring_point_height or 0)
            else:
                # use None not NaN
                value = None
        else:
            value = row.DepthToWater

        # TODO: after sensors have been added to the database update sensor_id (or sensor) for waterlevels that come from db sensors (like e probes?)
        observation = Observation(
            nma_pk_waterlevels=row.GlobalID,
            sample=sample,
            sensor_id=None,
            analysis_method_id=None,
            observation_datetime=dt_utc,
            parameter_id=self.groundwater_parameter_id,
            value=value,
            unit="ft",
            measuring_point_height=measuring_point_height,
            groundwater_level_reason=glv,
        )
        return observation

    def _make_sample(self, row, field_activity, dt_utc, sampler) -> Sample:
        """Build the water Sample for ``row``.

        A missing MeasurementMethod is stored as "null placeholder";
        otherwise the code is translated through the lexicon mapper.
        """
        if pd.isna(row.MeasurementMethod):
            sample_method = "null placeholder"
        else:
            sample_method = lexicon_mapper.map_value(
                f"LU_MeasurementMethod:{row.MeasurementMethod}"
            )

        return Sample(
            nma_pk_waterlevels=row.GlobalID,
            field_activity=field_activity,
            field_event_participant=sampler,
            sample_date=dt_utc,
            sample_matrix="water",
            sample_name=str(uuid.uuid4()),
            sample_method=sample_method,
            qc_type="Normal",
            depth_top=None,
            depth_bottom=None,
        )

    def _get_groundwater_level_reason(self, row) -> str | None:
        """Map the row's LevelStatus code to its lexicon term.

        Returns None when LevelStatus is missing.
        """
        glv = row.LevelStatus
        if pd.isna(glv):
            return None

        glv = lexicon_mapper.map_value(f"LU_LevelStatus:{glv}")
        if glv == "Water level not affected by status":
            glv = "Water level not affected"
        return glv

    def _get_field_event_participants(self, session, row, thing) -> list[Contact]:
        """Return the contacts that took part in the measurement.

        Owner-reported measurements reuse the thing's first contact. Any
        other MeasuredBy value is resolved through the mapper, creating (and
        caching) contacts on first use. An empty list means no contact could
        be associated with the record.
        """
        field_event_participants = []
        measured_by = None if pd.isna(row.MeasuredBy) else row.MeasuredBy

        if measured_by not in ["Owner", "Owner report", "Well owner"]:
            # --- Contact/FieldEventParticipant ---
            contact_info = get_contacts_info(row, measured_by, self._measured_by_mapper)

            for name, organization, role in contact_info:
                key = (name, organization)
                contact = self._created_contacts.get(key)
                if contact is None:
                    try:
                        # create new contact if not already created
                        contact = Contact(
                            name=name,
                            role=role,
                            contact_type="Field Event Participant",
                            organization=organization,
                            nma_pk_waterlevels=row.GlobalID,
                        )
                        session.add(contact)

                        logger.info(
                            f"{SPACE_2}Created contact: | Name {contact.name} | Role {contact.role} | Organization {contact.organization} | nma_pk_waterlevels {contact.nma_pk_waterlevels}"
                        )

                        self._created_contacts[key] = contact
                    except Exception as e:
                        logger.critical(
                            f"Contact cannot be created: Name {name} | Role {role} | Organization {organization} because of the following: {str(e)}"
                        )
                        continue

                field_event_participants.append(contact)
        elif thing.contacts:
            # A thing without contacts leaves the list empty instead of
            # raising IndexError.
            field_event_participants.append(thing.contacts[0])

        if len(field_event_participants) == 0:
            logger.critical(
                f"No contacts can be associated with the WaterLevels record with GlobalID {row.GlobalID}, therefore no field event, field activity, sample, and observation can be made. Skipping."
            )

        return field_event_participants

    def _get_dt_utc(self, row) -> datetime | None:
        """Combine DateMeasured and TimeMeasured into a UTC datetime.

        Returns None, after capturing an error, when DateMeasured is missing
        or the date/time text cannot be parsed.
        """
        if pd.isna(row.DateMeasured):
            logger.critical(
                f"transfer_water_levels. Skipping row PointID={row.PointID}, objectid={row.OBJECTID} because there is no DateMeasured"
            )
            self._capture_error(row.PointID, "no DateMeasured", "DateMeasured")
            return None

        if pd.isna(row.TimeMeasured):
            fmt = "%Y-%m-%d"
            dt_measured = row.DateMeasured
        else:
            t = row.TimeMeasured
            if "." in t:
                # %f accepts at most 6 fractional digits; keep the leading 6
                whole, _, fraction = t.partition(".")
                t = f"{whole}.{fraction[:6]}"
                fmt = "%Y-%m-%d %H:%M:%S.%f"
            else:
                fmt = "%Y-%m-%d %H:%M:%S"

            dt_measured = f"{row.DateMeasured} {t}"

        try:
            dt = datetime.strptime(dt_measured, fmt)
            return convert_mt_to_utc(dt)
        except ValueError as e:
            self._capture_error(row.PointID, str(e), "DateMeasured")
            logger.critical(
                f"transfer_water_levels. Skipping row PointID={row.PointID}, objectid={row.OBJECTID} due to "
                f"invalid date/time: {e}"
            )
            return None
# =============================================================================== import json +import re import time from datetime import datetime, UTC import pandas as pd -from pandas import isna +from pandas import isna, notna from pydantic import ValidationError +from sqlalchemy.exc import DatabaseError from sqlalchemy.orm import Session from core.enums import ( @@ -36,6 +38,11 @@ StatusHistory, MonitoringFrequencyHistory, MeasuringPointHistory, + DataProvenance, + AquiferSystem, + AquiferType, + GeologicFormation, + ThingAquiferAssociation, ) from schemas.thing import CreateWell, CreateWellScreen from services.gcs_helper import get_storage_bucket @@ -44,6 +51,7 @@ get_county_from_point, get_quad_name_from_point, ) +from transfers.transferer import ChunkTransferer, Transferer from transfers.util import ( make_location, make_location_data_provenance, @@ -51,10 +59,10 @@ read_csv, logger, replace_nans, - filter_by_welldata_datasource_and_project, + get_transferable_wells, lexicon_mapper, filter_non_transferred_wells, - chunk_by_size, + MeasuringPointEstimator, ) ADDED = [] @@ -117,35 +125,131 @@ def _extract_casing_materials(row) -> list[str]: return materials -def get_wells_to_transfer( - sess: Session, flags: dict = None -) -> tuple[pd.DataFrame, pd.DataFrame]: - if flags is None: - flags = {} - - wdf = read_csv("WellData", dtype={"OSEWelltagID": str}) - ldf = read_csv("Location") - ldf = ldf.drop(["PointID", "SSMA_TimeStamp"], axis=1) - wdf = wdf.join(ldf.set_index("LocationId"), on="LocationId") - wdf = wdf[wdf["SiteType"] == "GW"] - wdf = wdf[wdf["Easting"].notna() & wdf["Northing"].notna()] - - input_df = wdf - wdf = replace_nans(wdf) - if flags.get("TRANSFER_ALL_WELLS", True): - # todo: filter Locations by DataSource - cleaned_df = filter_by_welldata_datasource_and_project(wdf) +PUMP_PATTERN = re.compile( + r"\b(?Pjet|hand|submersible)\b|\b(?Pline[-\s]+shaft)\b", re.IGNORECASE +) + + +def first_matched_term(text: str): + m = PUMP_PATTERN.search(text) + if 
not m: + return None + return m.group("term") or m.group("phrase") + + +def _extract_well_pump_type(row) -> str | None: + if isna(row.ConstructionNotes): + return None + construction_notes = row.ConstructionNotes.lower() + pump = first_matched_term(construction_notes) + if pump: + return pump.capitalize() else: - # get a subset of wells that have not been transferred yet - # todo: this needs to be defined. - # for now, we are just filtering out wells that have not been transferred yet - # In the future we will be using criteria to determine which wells to transfer - # for example, wells in the "Water Level Network" project - cleaned_df = wdf + return None + + +# Parse aquifer codes +def _extract_aquifer_type_codes(aquifer_code: str) -> list[str]: + """ + Parse aquifer type codes that may contain multiple values. + + Args: + aquifer_code: Raw code from AquiferType field + + Returns: + List of individual codes + """ + if not aquifer_code: + return [] + # clean the code + code = aquifer_code.strip().upper() + # split into individual characters. This handles cases like "FC" -> ["F", "C"] + individual_codes = list(code) + return individual_codes + + +# Get or create aquifer system +def get_or_create_aquifer_system( + session: Session, aquifer_name: str, primary_type: str +) -> AquiferSystem | None: + """ + Get existing aquifer or create new one if it doesn't exist. + + With the new AquiferType model, we create ONE aquifer record per named + aquifer (e.g., one "Santa Fe Group"), not multiple variants. 
# Get or create aquifer system
def get_or_create_aquifer_system(
    session: Session, aquifer_name: str, primary_type: str
) -> AquiferSystem | None:
    """
    Get existing aquifer or create new one if it doesn't exist.

    With the new AquiferType model, we create ONE aquifer record per named
    aquifer (e.g., one "Santa Fe Group"), not multiple variants.

    Args:
        session: Database session
        aquifer_name: Name of the aquifer (from AqClass or type name)
        primary_type: Primary aquifer type for the aquifer_type field

    Returns:
        The existing or newly created AquiferSystem, or None when the insert
        fails with a database error (the session is rolled back in that
        case).
    """
    # Try to find existing aquifer by name
    existing = (
        session.query(AquiferSystem).filter(AquiferSystem.name == aquifer_name).first()
    )
    if existing:
        return existing

    # Create new aquifer
    try:
        logger.info(
            f"Creating new aquifer system: {aquifer_name} (primary type: {primary_type})"
        )

        aquifer = AquiferSystem(
            name=aquifer_name,
            primary_aquifer_type=primary_type,  # Primary type
            geographic_scale=None,  # Default
        )
        session.add(aquifer)
        # NOTE: this commits everything pending in the session, not only the
        # new aquifer.
        session.commit()
        return aquifer
    except DatabaseError as e:
        session.rollback()
        logger.critical(f"Error creating aquifer {aquifer_name}: {e}")
        return None


def get_or_create_geologic_formation(
    session: Session, formation_code: str
) -> GeologicFormation | None:
    """
    Get existing geologic formation or create new one if it doesn't exist.

    Args:
        session: Database session
        formation_code: The formation code from FormationZone field

    Returns:
        GeologicFormation object or None if creation fails
    """
    # Try to find existing formation
    existing = (
        session.query(GeologicFormation)
        .filter(GeologicFormation.formation_code == formation_code)
        .first()
    )
    if existing:
        return existing

    # If not found, create new formation
    try:
        logger.info(f"Creating new geologic formation: {formation_code}")
        formation = GeologicFormation(
            formation_code=formation_code,
            description=None,
            lithology=None,
        )
        # Insert inside a SAVEPOINT: a failed flush is rolled back to the
        # savepoint, so the caller's session stays usable and its other
        # pending work is kept.
        with session.begin_nested():
            session.add(formation)
            session.flush()
        return formation
    except Exception as e:
        logger.critical(f"Error creating formation {formation_code}: {e}")
        return None
else: + # # get a subset of wells that have not been transferred yet + # # todo: this needs to be defined. + # # for now, we are just filtering out wells that have not been transferred yet + # # In the future we will be using criteria to determine which wells to transfer + # # for example, wells in the "Water Level Network" project + # cleaned_df = wdf + + cleaned_df = get_transferable_wells(wdf) + cleaned_df = filter_non_transferred_wells(cleaned_df) + + return input_df, cleaned_df + + def _step(self, session: Session, df: pd.DataFrame, i: int, row: pd.Series): pointid = row.PointID - if wdf[wdf["PointID"] == pointid].shape[0] > 1: + if df[df["PointID"] == pointid].shape[0] > 1: logger.critical( f"transfer_wells. PointID {pointid} has duplicate records. Skipping." ) - errors.append( - { - "pointid": pointid, - "error": "duplicate records", - "table": source_table, - "field": "PointID", - } - ) - continue - - if limit and i >= limit: - logger.info(f"Reached limit of {limit} rows. Stopping migration.") - break - - if i and not i % step: - logger.info( - f"Processing row {i} of {n}, avg rows per second: {step / (time.time() - start_time):.2f}" - ) - start_time = time.time() - try: - session.commit() - except Exception as e: - logger.critical(f"Error committing wells. 
{e}") - session.rollback() - continue + self._capture_error(pointid, "duplicate records", "PointID") + return location = None try: - location, elevation_method = make_location(row, cached_elevations) + location, elevation_method = make_location(row, self._cached_elevations) session.add(location) - added_locations[row.PointID] = elevation_method + session.commit() + self._added_locations[row.PointID] = elevation_method except Exception as e: + self._capture_error(row.PointID, str(e), str(e), "Location") + logger.critical(f"Error making location for {row.PointID}: {e}") + if location is not None: session.expunge(location) - # these rollbacks are cause an issue because they are discarding good data - # session.rollback() - errors.append( - { - "pointid": row.PointID, - "error": e, - "table": "Location", - "field": str(e), - } - ) - logger.critical(f"Error making location for {row.PointID}: {e}") - continue + + return try: first_visit_date = _get_first_visit_date(row) @@ -236,6 +335,7 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None well_casing_materials = ( [] if isna(row.CasingDescription) else _extract_casing_materials(row) ) + well_pump_type = _extract_well_pump_type(row) # manually add the well rather than add_well from services/thing_helper.py # so that effective_start can be set on the location assocation @@ -257,15 +357,30 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None notes=( [{"content": row.Notes, "note_type": "Other"}] if row.Notes else [] ), + well_completion_date=row.CompletionDate, + well_driller_name=row.DrillerName, + well_construction_method=( + lexicon_mapper.map_value( + f"LU_ConstructionMethod:{row.ConstructionMethod}" + ) + if not isna(row.ConstructionMethod) + else None + ), + well_pump_type=well_pump_type, + is_suitable_for_datalogger=( + bool(row.OpenWellLoggerOK) + if not isna(row.OpenWellLoggerOK) + else None + ), ) CreateWell.model_validate(data) except ValidationError 
as e: - errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) + self._capture_error(row.PointID, str(e), "UnknownField") logger.critical( f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" ) - continue + return well = None try: @@ -277,6 +392,8 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None "well_casing_materials", "measuring_point_height", "measuring_point_description", + "well_completion_date_source", + "well_construction_method_source", ] ) well_data["thing_type"] = "water well" @@ -285,17 +402,6 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None well_data.pop("notes") well = Thing(**well_data) session.add(well) - # logger.info(f"Created well for {row.PointID}") - - # flush well to access its ID for status_history - # session.flush() - - # session.commit() - # session.refresh(well) - # if notes: - # for ni in notes: - # nn = well.add_note(ni['content'], ni['note_type']) - # session.add(nn) if well_purposes: for wp in well_purposes: @@ -320,9 +426,13 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None if well is not None: session.expunge(well) - errors.append({"pointid": row.PointID, "error": e, "table": "WellData"}) + if location is not None: + session.delete(location) + + self._capture_error(row.PointID, str(e), "UnknownField") + logger.critical(f"Error creating well for {row.PointID}: {e}") - continue + return assoc = LocationThingAssociation(effective_start=location.created_at) @@ -330,155 +440,362 @@ def transfer_wells(session: Session, flags: dict = None, limit: int = 0) -> None assoc.thing = well session.add(assoc) - session.commit() + if isna(row.AquiferType): + logger.info( + f"No AquiferType for {well.name}. Skipping aquifer association." 
+ ) + else: + try: + self._add_aquifers(session, row, well) + except Exception as e: + logger.critical( + f"Error creating aquifer association for {well.name}: {e}" + ) + + if isna(row.FormationZone): + logger.info( + f"No FormationZone for {well.name}. Skipping formation association." + ) + else: + try: + self._add_formation_zone(session, row, well) + except Exception as e: + logger.critical( + f"Error creating formation association for {well.name}: {e}" + ) + + def _add_formation_zone(self, session, row, well): + # --- Set Formation Completion (NOT depth-based stratigraphy) --- + # This simply records which formation the well was completed in. + # For detailed depth-interval stratigraphy, see stratigraphy_transfer.py - # add things thate need well id - for well in session.query(Thing).filter(Thing.thing_type == "water well").all(): - row = wdf[wdf["PointID"] == well.name].iloc[0] - if not isna(row.Notes): - note = well.add_note(row.Notes, "Other") - session.add(note) + formation_code = row.FormationZone - location = well.current_location - elevation_method = added_locations[row.PointID] - data_provenances = make_location_data_provenance( - row, location, elevation_method + # Validate formation exists + formation = ( + session.query(GeologicFormation) + .filter(GeologicFormation.formation_code == formation_code) + .first() ) - for dp in data_provenances: - session.add(dp) - - """ - Developer's note - - It's not clear when the measuring point from NM_Aquifer was - determined, so I'm setting start_date to the day of the transfer - """ - measuring_point_history = MeasuringPointHistory( - thing_id=well.id, - measuring_point_height=row.MPHeight, - measuring_point_description=row.MeasuringPoint, - start_date=datetime.now(tz=UTC), - end_date=None, - ) - session.add(measuring_point_history) - - """ - Developer's notes - - For all status_history records the start_date will be now since that - isn't recorded in NM_Aquifer - """ - # TODO: if row.MonitoringStatus == "Q" is it 
monitored or not? <-- AMMP review - # TODO: if row.MonitoringStatus == "X" can that change? <-- AMMP review - # TODO: have AMMP review and verify the various MonitoringStatus codes - - target_id = well.id - target_table = "thing" - if not isna(row.MonitoringStatus): - if ( - "X" in row.MonitoringStatus - or "I" in row.MonitoringStatus - or "C" in row.MonitoringStatus - ): - status_value = "Not currently monitored" - else: - status_value = "Currently monitored" - - status_history = StatusHistory( - status_type="Monitoring Status", - status_value=status_value, - reason=row.MonitorStatusReason, - start_date=datetime.now(tz=UTC), - target_id=target_id, - target_table=target_table, + + if formation: + # Formation exists: Set association + well.formation_completion_code = formation_code + logger.info(f"Set completion formation for {well.name}: {formation_code}") + else: + # Formation does NOT exist: Do not create new formation. Flag and log for review + logger.critical( + f"MISSING FORMATION: Formation '{formation_code}' not found for well {well.name}. Flagged for review." 
) - session.add(status_history) - logger.info( - f" Added monitoring status for well {well.name}: {status_value}" + self._capture_error( + row.PointID, f"Unknown formation: {formation_code}", "FormationZone" ) - for code in NMA_MONITORING_FREQUENCY.keys(): - if code in row.MonitoringStatus: - monitoring_frequency = NMA_MONITORING_FREQUENCY[code] - monitoring_frequency_history = MonitoringFrequencyHistory( - thing_id=well.id, - monitoring_frequency=monitoring_frequency, - start_date=datetime.now(tz=UTC), - end_date=None, - ) - session.add(monitoring_frequency_history) - logger.info( - f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" - ) + def _add_aquifers(self, session, row, well): + # Parse codes (handles multi-character codes like "FC") + aquifer_codes = _extract_aquifer_type_codes(row.AquiferType) - if not isna(row.Status): - status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") - status_history = StatusHistory( - status_type="Well Status", - status_value=status_value, - reason=row.StatusUserNotes, - start_date=datetime.now(tz=UTC), - target_id=target_id, - target_table=target_table, + if not aquifer_codes: + logger.warning( + f"Well {row.PointID}: Empty aquifer codes after parsing '{row.AquiferType}'" ) - session.add(status_history) - logger.info(f" Added well status for well {well.name}: {status_value}") + return - session.commit() + # Map AqClass code to aquifer name using lexicon mapper + if isna(row.AqClass): + # No AqClass - use first code's mapped name as aquifer name + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + else: + try: + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferClass:{row.AqClass}" + ) + except KeyError: + logger.warning( + f"Unknown AqClass code '{row.AqClass}' for well {row.PointID}, using first type as name" + ) + aquifer_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + + # Determine primary type + # This assumes the 
first recorded type of a compound type is the primary type of the aquifer. + # TODO: verify with AMMP + try: + primary_type = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_codes[0]}" + ) + except KeyError: + logger.warning( + f"Unknown aquifer type code '{aquifer_codes[0]}' for well {row.PointID}." + f"Setting primary_type to 'Unknown'" + ) + primary_type = "Unknown" # Creates aquifer with placeholder + + # Get or create the aquifer + aquifer = get_or_create_aquifer_system(session, aquifer_name, primary_type) + if aquifer: + # Check if association already exists + existing_assoc = ( + session.query(ThingAquiferAssociation) + .filter( + ThingAquiferAssociation.thing_id == well.id, + ThingAquiferAssociation.aquifer_system_id == aquifer.id, + ) + .first() + ) + + if not existing_assoc: + # Create the association + logger.info(f"Associating well {well.name} with aquifer {aquifer.name}") + aquifer_assoc = ThingAquiferAssociation( + thing=well, aquifer_system=aquifer + ) + session.add(aquifer_assoc) + session.flush() + + # Create AquiferType records for EACH characteristic + aquifer_type_names = [] + for aquifer_code in aquifer_codes: + try: + type_name = lexicon_mapper.map_value( + f"LU_AquiferType:{aquifer_code}" + ) + aquifer_type = AquiferType( + thing_aquifer_association=aquifer_assoc, + aquifer_type=type_name, + ) + session.add(aquifer_type) + aquifer_type_names.append(type_name) + except KeyError: + logger.critical( + f"Unknown aquifer code '{aquifer_code}' from AquiferType='{row.AquiferType}' " + f"for well {well.name}. Skipping this code." 
+ ) + self._capture_error( + row.PointID, + f"Unknown aquifer code: {aquifer_code}", + "AquiferType", + ) + + logger.info( + f"Associated well {well.name} with aquifer {aquifer.name} " + f"(types: {', '.join(aquifer_type_names)})" + ) + + def _after_hook(self, session): + dump_cached_elevations(self._cached_elevations) + measuring_point_estimator = MeasuringPointEstimator() + # add things that need well id + query = session.query(Thing).filter(Thing.thing_type == "water well") + count = query.count() + for i, well in enumerate(query.all()): + objs = [] + step_start_time = time.time() + row = self.cleaned_df[self.cleaned_df["PointID"] == well.name].iloc[0] + if notna(row.Notes): + note = well.add_note(row.Notes, "Other") + objs.append(note) + + location = well.current_location + elevation_method = self._added_locations[row.PointID] + data_provenances = make_location_data_provenance( + row, location, elevation_method + ) + objs.extend(data_provenances) + + for row_field, kw in ( + ( + "CompletionSource", + dict( + field_name="well_completion_date", + origin_type=lexicon_mapper.map_value( + f"LU_Depth_CompletionSource:{row.CompletionSource}" + ), + ), + ), + ( + "DataSource", + dict( + field_name="well_construction_method", + origin_source=row.DataSource, + ), + ), + ( + "DepthSource", + dict( + field_name="well_depth", + origin_type=lexicon_mapper.map_value( + f"LU_Depth_CompletionSource:{row.DepthSource}" + ), + ), + ), + ): + + if notna(row[row_field]): + dp = DataProvenance(target_id=well.id, target_table="thing", **kw) + objs.append(dp) + + start_time = time.time() + mphs = measuring_point_estimator.estimate_measuring_point_height(row) + logger.info( + f"Estimated measuring point heights for {well.name}: {time.time() - start_time:.2f}s" + ) + for mph, mph_desc, start_date, end_date in mphs: + measuring_point_history = MeasuringPointHistory( + thing_id=well.id, + measuring_point_height=mph, + measuring_point_description=mph_desc, + # 
start_date=datetime.now(tz=UTC), + start_date=start_date, + end_date=end_date, + ) + objs.append(measuring_point_history) + + """ + Developer's notes + + For all status_history records the start_date will be now since that + isn't recorded in NM_Aquifer + """ + # TODO: if row.MonitoringStatus == "Q" is it monitored or not? <-- AMMP review + # TODO: if row.MonitoringStatus == "X" can that change? <-- AMMP review + # TODO: have AMMP review and verify the various MonitoringStatus codes + + target_id = well.id + target_table = "thing" + if notna(row.MonitoringStatus): + if ( + "X" in row.MonitoringStatus + or "I" in row.MonitoringStatus + or "C" in row.MonitoringStatus + ): + status_value = "Not currently monitored" + else: + status_value = "Currently monitored" + + status_history = StatusHistory( + status_type="Monitoring Status", + status_value=status_value, + reason=row.MonitorStatusReason, + start_date=datetime.now(tz=UTC), + target_id=target_id, + target_table=target_table, + ) + objs.append(status_history) + logger.info( + f" Added monitoring status for well {well.name}: {status_value}" + ) + + for code in NMA_MONITORING_FREQUENCY.keys(): + if code in row.MonitoringStatus: + monitoring_frequency = NMA_MONITORING_FREQUENCY[code] + monitoring_frequency_history = MonitoringFrequencyHistory( + thing_id=well.id, + monitoring_frequency=monitoring_frequency, + start_date=datetime.now(tz=UTC), + end_date=None, + ) + + objs.append(monitoring_frequency_history) + logger.info( + f" Adding '{monitoring_frequency}' monitoring frequency for well {well.name}" + ) + + if notna(row.Status): + status_value = lexicon_mapper.map_value(f"LU_Status:{row.Status}") + status_history = StatusHistory( + status_type="Well Status", + status_value=status_value, + reason=row.StatusUserNotes, + start_date=datetime.now(tz=UTC), + target_id=target_id, + target_table=target_table, + ) + objs.append(status_history) + logger.info(f" Added well status for well {well.name}: {status_value}") + try: + 
session.bulk_save_objects(objs) + except DatabaseError as e: + session.rollback() + error_dict = e.orig.args[0] + self._capture_error(well.name, error_dict["D"], error_dict["t"]) + + logger.info( + f"After hook: {well.name} {i+1}/{count} took {time.time() - step_start_time:.2f}s" + ) - dump_cached_elevations(cached_elevations) - return input_df, cleaned_df, errors +class WellChunkTransferer(ChunkTransferer): + source_table: str = None + source_dtypes: dict = None -def transfer_wellscreens(session, limit=None): + def __init__(self, *args, **kw): + super().__init__(*args, **kw) + if self.source_table is None: + raise ValueError("source_table must be set") - input_df = read_csv("WellScreens") - wdf = replace_nans(input_df) + def _get_dfs(self): + if self.source_table is None: + raise ValueError("source_table must be set") - cleaned_df = filter_to_valid_point_ids(session, wdf) + input_df = read_csv(self.source_table, self.source_dtypes) + wdf = replace_nans(input_df) + cleaned_df = filter_to_valid_point_ids(wdf) + return input_df, cleaned_df - errors = [] - for ci, chunk in enumerate(chunk_by_size(cleaned_df, 1000)): + def _get_df_chunk(self, session, chunk): things = ( session.query(Thing).filter(Thing.name.in_(chunk.PointID.tolist())).all() ) + return things - logger.info(f"Processing chunk {ci}, {len(chunk)} rows, {len(things)} things") - for i, row in enumerate(chunk.itertuples()): - thing = next((thing for thing in things if thing.name == row.PointID), None) - if not thing: - logger.warning( - f"Thing with PointID {row.PointID} not found. Skipping well screen." 
- ) - continue - - well_screen_data = { - "thing_id": thing.id, - "screen_depth_top": row.ScreenTop, - "screen_depth_bottom": row.ScreenBottom, - # "screen_type": row.ScreenType, - "screen_description": row.ScreenDescription, - "release_status": "draft", - "nma_pk_wellscreens": row.GlobalID, - } - try: - # TODO: add validation logic here to ensure no overlapping screens for the same well - CreateWellScreen.model_validate(well_screen_data) - except ValidationError as e: - logger.critical( - f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" - ) - errors.append( - {"pointid": row.PointID, "error": e, "table": "WellScreens"} - ) - continue + def _get_db_item(self, dbchunk, row): + return next((thing for thing in dbchunk if thing.name == row.PointID), None) - well_screen = WellScreen(**well_screen_data) - session.add(well_screen) + def _missing_db_item_warning(self, row): + logger.warning(f"Thing with PointID {row.PointID} not found in database.") - session.commit() - return input_df, cleaned_df, errors +class WellScreenTransferer(WellChunkTransferer): + source_table = "WellScreens" + + def _chunk_step(self, session, df, i, row, db_item): + well_screen_data = { + "thing_id": db_item.id, + "screen_depth_top": row.ScreenTop, + "screen_depth_bottom": row.ScreenBottom, + # "screen_type": row.ScreenType, + "screen_description": row.ScreenDescription, + "release_status": "draft", + "nma_pk_wellscreens": row.GlobalID, + } + try: + # TODO: add validation logic here to ensure no overlapping screens for the same well + CreateWellScreen.model_validate(well_screen_data) + except ValidationError as e: + logger.critical( + f"Validation error for row {i} with PointID {row.PointID}: {e.errors()}" + ) + self._capture_error(row.PointID, str(e), "UnknownField") + return + + well_screen = WellScreen(**well_screen_data) + session.add(well_screen) + + +# def transfer_wells(flags: dict = None): +# transferer = WellTransferer(flags=flags) +# transferer.transfer() +# 
return transferer.input_df, transferer.cleaned_df, transferer.errors +# +# +# def transfer_wellscreens(flags: dict = None): +# transferer = WellScreenTransferer(flags=flags) +# transferer.chunk_transfer() +# return transferer.input_df, transferer.cleaned_df, transferer.errors def cleanup_locations(session):